[Qemu-devel] [QEMU v2 7/9] bitmap: Add a new bitmap_move function
Sometimes, it is necessary to move a portion of a bitmap to another place in a large bitmap; if the regions overlap, bitmap_copy can't work correctly, so we need a new function to do this work. Signed-off-by: Liang Li <liang.z...@intel.com> --- include/qemu/bitmap.h | 13 + 1 file changed, 13 insertions(+) diff --git a/include/qemu/bitmap.h b/include/qemu/bitmap.h index ec5146f..6ac89ca 100644 --- a/include/qemu/bitmap.h +++ b/include/qemu/bitmap.h @@ -37,6 +37,7 @@ * bitmap_set(dst, pos, nbits) Set specified bit area * bitmap_set_atomic(dst, pos, nbits) Set specified bit area with atomic ops * bitmap_clear(dst, pos, nbits) Clear specified bit area + * bitmap_move(dst, src, nbits) Move *src to *dst * bitmap_test_and_clear_atomic(dst, pos, nbits)Test and clear area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area */ @@ -136,6 +137,18 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, } } +static inline void bitmap_move(unsigned long *dst, const unsigned long *src, + long nbits) +{ +if (small_nbits(nbits)) { +unsigned long tmp = *src; +*dst = tmp; +} else { +long len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); +memmove(dst, src, len); +} +} + static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, long nbits) { -- 1.9.1
[Qemu-devel] [QEMU v2 9/9] migration: skip free pages during live migration
After sending out the request for free pages, live migration process will start without waiting for the free page bitmap is ready. If the free page bitmap is not ready when doing the 1st migration_bitmap_sync() after ram_save_setup(), the free page bitmap will be ignored, this means the free pages will not be filtered out in this case. The current implementation can not work with post copy, if post copy is enabled, we simply ignore the free pages. Will make it work later. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 86 + 1 file changed, 86 insertions(+) diff --git a/migration/ram.c b/migration/ram.c index 815bc0e..223d243 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -43,6 +43,8 @@ #include "trace.h" #include "exec/ram_addr.h" #include "qemu/rcu_queue.h" +#include "sysemu/balloon.h" +#include "sysemu/kvm.h" #ifdef DEBUG_MIGRATION_RAM #define DPRINTF(fmt, ...) \ @@ -228,6 +230,8 @@ static QemuMutex migration_bitmap_mutex; static uint64_t migration_dirty_pages; static uint32_t last_version; static bool ram_bulk_stage; +static bool ignore_freepage_rsp; +static uint64_t free_page_req_id; /* used by the search for pages to send */ struct PageSearchStatus { @@ -244,6 +248,7 @@ static struct BitmapRcu { struct rcu_head rcu; /* Main migration bitmap */ unsigned long *bmap; +unsigned long *free_page_bmap; /* bitmap of pages that haven't been sent even once * only maintained and used in postcopy at the moment * where it's used to send the dirtymap at the start @@ -636,6 +641,7 @@ static void migration_bitmap_sync(void) rcu_read_unlock(); qemu_mutex_unlock(_bitmap_mutex); +ignore_freepage_rsp = true; trace_migration_bitmap_sync_end(migration_dirty_pages - num_dirty_pages_init); num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init; @@ -1409,6 +1415,7 @@ static void migration_bitmap_free(struct BitmapRcu *bmap) { g_free(bmap->bmap); g_free(bmap->unsentmap); +g_free(bmap->free_page_bmap); g_free(bmap); } @@ -1479,6 +1486,77 @@ 
void migration_bitmap_extend(ram_addr_t old, ram_addr_t new) } } +static void filter_out_guest_free_page(unsigned long *free_page_bmap, + long nbits) +{ +long i, page_count = 0, len; +unsigned long *bitmap; + +tighten_guest_free_page_bmap(free_page_bmap); +qemu_mutex_lock(_bitmap_mutex); +bitmap = atomic_rcu_read(_bitmap_rcu)->bmap; +slow_bitmap_complement(bitmap, free_page_bmap, nbits); + +len = (last_ram_offset() >> TARGET_PAGE_BITS) / BITS_PER_LONG; +for (i = 0; i < len; i++) { +page_count += hweight_long(bitmap[i]); +} + +migration_dirty_pages = page_count; +qemu_mutex_unlock(_bitmap_mutex); +} + +static void ram_request_free_page(unsigned long *bmap, unsigned long max_pfn) +{ +BalloonReqStatus status; + +free_page_req_id++; +status = balloon_get_free_pages(bmap, max_pfn / BITS_PER_BYTE, +free_page_req_id); +if (status == REQ_START) { +ignore_freepage_rsp = false; +} +} + +static void ram_handle_free_page(void) +{ +unsigned long nbits, req_id = 0; +RAMBlock *pc_ram_block; +BalloonReqStatus status; + +status = balloon_free_page_ready(_id); +switch (status) { +case REQ_DONE: +if (req_id != free_page_req_id) { +return; +} +rcu_read_lock(); +pc_ram_block = QLIST_FIRST_RCU(_list.blocks); +nbits = pc_ram_block->used_length >> TARGET_PAGE_BITS; +filter_out_guest_free_page(migration_bitmap_rcu->free_page_bmap, nbits); +rcu_read_unlock(); + +qemu_mutex_lock_iothread(); +migration_bitmap_sync(); +qemu_mutex_unlock_iothread(); +/* + * bulk stage assumes in (migration_bitmap_find_and_reset_dirty) that + * every page is dirty, that's no longer ture at this point. + */ +ram_bulk_stage = false; +last_seen_block = NULL; +last_sent_block = NULL; +last_offset = 0; +break; +case REQ_ERROR: +ignore_freepage_rsp = true; +error_report("failed to get free page"); +break; +default: +break; +} +} + /* * 'expected' is the value you expect the bitmap mostly to be full * of; it won't bother printing lines that are all this value. 
@@ -1944,6 +2022,11 @@ static int ram_save_setup(QEMUFile *f, void *opaque) qemu_mutex_unlock_ramlist(); qemu_mutex_unlock_iothread(); +if (balloon_free_pages_support() && !migrate_postcopy_ram()) { +unsigned long max_pfn = get_guest_max_pfn(); +migration_bitmap_rcu->free_page_bmap = bitmap_new(max_pfn); +ram_
[Qemu-devel] [QEMU v2 6/9] balloon: migrate vq elem to destination
After live migration, 'guest-stats' can't get the expected memory status in the guest. This issue is caused by commit 4eae2a657d. The value of 's->stats_vq_elem' will be NULL after live migration, and the check in the function 'balloon_stats_poll_cb()' will prevent the 'virtio_notify()' from executing. So guest will not update the memory status. Commit 4eae2a657d is doing the right thing, but 's->stats_vq_elem' should be treated as part of balloon device state and migrated to destination if it's not NULL to make everything works well. For the same reason, 's->misc_vq_elem' should be migrated to destination too. Michael has other idea to solve this issue, but he is busy at the moment, this patch can be used for test before his patch is ready. Signed-off-by: Liang Li <liang.z...@intel.com> --- hw/virtio/virtio-balloon.c | 36 ++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index b0c09a7..f9bf26d 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -31,6 +31,7 @@ #include "hw/virtio/virtio-access.h" #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) +#define BALLOON_VERSION 2 static void balloon_page(void *addr, int deflate) { @@ -610,15 +611,33 @@ static void virtio_balloon_save(QEMUFile *f, void *opaque) static void virtio_balloon_save_device(VirtIODevice *vdev, QEMUFile *f) { VirtIOBalloon *s = VIRTIO_BALLOON(vdev); +uint16_t elem_num = 0; qemu_put_be32(f, s->num_pages); qemu_put_be32(f, s->actual); +if (s->stats_vq_elem != NULL) { +elem_num = 1; +} +qemu_put_be16(f, elem_num); +if (elem_num) { +qemu_put_virtqueue_element(f, s->stats_vq_elem); +} + +elem_num = 0; +if (s->misc_vq_elem != NULL) { +elem_num = 1; +} +qemu_put_be16(f, elem_num); +if (elem_num) { +qemu_put_virtqueue_element(f, s->misc_vq_elem); +} } static int virtio_balloon_load(QEMUFile *f, void *opaque, int version_id) { -if (version_id != 1) +if (version_id < 1 || version_id > BALLOON_VERSION) { 
return -EINVAL; +} return virtio_load(VIRTIO_DEVICE(opaque), f, version_id); } @@ -627,9 +646,22 @@ static int virtio_balloon_load_device(VirtIODevice *vdev, QEMUFile *f, int version_id) { VirtIOBalloon *s = VIRTIO_BALLOON(vdev); +uint16_t elem_num = 0; s->num_pages = qemu_get_be32(f); s->actual = qemu_get_be32(f); +if (version_id == BALLOON_VERSION) { +elem_num = qemu_get_be16(f); +if (elem_num == 1) { +s->stats_vq_elem = +qemu_get_virtqueue_element(f, sizeof(VirtQueueElement)); +} +elem_num = qemu_get_be16(f); +if (elem_num == 1) { +s->misc_vq_elem = +qemu_get_virtqueue_element(f, sizeof(VirtQueueElement)); +} +} if (balloon_stats_enabled(s)) { balloon_stats_change_timer(s, s->stats_poll_interval); @@ -665,7 +697,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) reset_stats(s); s->req_status = REQ_INIT; -register_savevm(dev, "virtio-balloon", -1, 1, +register_savevm(dev, "virtio-balloon", -1, BALLOON_VERSION, virtio_balloon_save, virtio_balloon_load, s); } -- 1.9.1
[Qemu-devel] [QEMU v2 4/9] virtio-balloon: update linux head file for new feature
Update the new feature bit definition for the new virt queue and the request header struct to keep consistent with kernel side. Signed-off-by: Liang Li <liang.z...@intel.com> --- include/standard-headers/linux/virtio_balloon.h | 22 ++ 1 file changed, 22 insertions(+) diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h index d577359..797a868 100644 --- a/include/standard-headers/linux/virtio_balloon.h +++ b/include/standard-headers/linux/virtio_balloon.h @@ -35,6 +35,7 @@ #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM2 /* Deflate balloon on OOM */ #define VIRTIO_BALLOON_F_PAGE_BITMAP 3 /* Send page info with bitmap */ +#define VIRTIO_BALLOON_F_MISC_VQ 4 /* Misc info virtqueue */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 @@ -101,4 +102,25 @@ struct balloon_bmap_hdr { __virtio64 bmap_len; }; +enum balloon_req_id { + /* Get free pages information */ + BALLOON_GET_FREE_PAGES, +}; + +enum balloon_flag { + /* Have more data for a request */ + BALLOON_FLAG_CONT, + /* No more data for a request */ + BALLOON_FLAG_DONE, +}; + +struct balloon_req_hdr { + /* Used to distinguish different request */ + __virtio16 cmd; + /* Reserved */ + __virtio16 reserved[3]; + /* Request parameter */ + __virtio64 param; +}; + #endif /* _LINUX_VIRTIO_BALLOON_H */ -- 1.9.1
[Qemu-devel] [QEMU v2 2/9] virtio-balloon: update linux head file
Update the new feature bit definition and the page bitmap header struct to keep consistent with kernel side. Signed-off-by: Liang Li <liang.z...@intel.com> --- include/standard-headers/linux/virtio_balloon.h | 19 +++ 1 file changed, 19 insertions(+) diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h index 9d06ccd..d577359 100644 --- a/include/standard-headers/linux/virtio_balloon.h +++ b/include/standard-headers/linux/virtio_balloon.h @@ -34,6 +34,7 @@ #define VIRTIO_BALLOON_F_MUST_TELL_HOST0 /* Tell before reclaiming pages */ #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM2 /* Deflate balloon on OOM */ +#define VIRTIO_BALLOON_F_PAGE_BITMAP 3 /* Send page info with bitmap */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 @@ -82,4 +83,22 @@ struct virtio_balloon_stat { __virtio64 val; } QEMU_PACKED; +/* Page bitmap header structure */ +struct balloon_bmap_hdr { + /* Used to distinguish different request */ + __virtio16 cmd; + /* Shift width of page in the bitmap */ + __virtio16 page_shift; + /* flag used to identify different status */ + __virtio16 flag; + /* Reserved */ + __virtio16 reserved; + /* ID of the request */ + __virtio64 req_id; + /* The pfn of 0 bit in the bitmap */ + __virtio64 start_pfn; + /* The length of the bitmap, in bytes */ + __virtio64 bmap_len; +}; + #endif /* _LINUX_VIRTIO_BALLOON_H */ -- 1.9.1
[Qemu-devel] [QEMU v2 1/9] virtio-balloon: Remove needless precompiled directive
Since there is a wrapper around madvise(), the virtio-balloon code is able to work without the precompiled directive, the directive can be removed. Signed-off-by: Liang Li <liang.z...@intel.com> Suggested-by: Thomas Huth <th...@redhat.com> --- hw/virtio/virtio-balloon.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 1a22e6d..62931b3 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -34,13 +34,11 @@ static void balloon_page(void *addr, int deflate) { -#if defined(__linux__) if (!qemu_balloon_is_inhibited() && (!kvm_enabled() || kvm_has_sync_mmu())) { qemu_madvise(addr, BALLOON_PAGE_SIZE, deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED); } -#endif } static const char *balloon_stat_names[] = { -- 1.9.1
[Qemu-devel] [QEMU v2 8/9] kvm: Add two new arch specific functions
Add a new function to get the vm's max pfn and a new function to filter out the holes in the undressed free page bitmap to get a tight free page bitmap. They are implemented on X86 and should be implemented on other arches for live migration optimization. Signed-off-by: Liang Li <liang.z...@intel.com> --- include/sysemu/kvm.h | 18 ++ target-arm/kvm.c | 14 ++ target-i386/kvm.c| 37 + target-mips/kvm.c| 14 ++ target-ppc/kvm.c | 14 ++ target-s390x/kvm.c | 14 ++ 6 files changed, 111 insertions(+) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index ad6f837..fd0956f 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -230,6 +230,24 @@ int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, target_ulong len, int type); void kvm_remove_all_breakpoints(CPUState *cpu); int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap); + +/** + * tighten_guest_free_page_bmap - process the free page bitmap from + * guest to get a tight page bitmap which does not contain + * holes. + * @bmap: undressed guest free page bitmap + * Returns: a tight guest free page bitmap, the n th bit in the + * returned bitmap and the n th bit in the migration bitmap + * should correspond to the same guest RAM page. 
+ */ +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap); + +/** + * get_guest_max_pfn - get the max pfn of guest + * Returns: the max pfn of guest + */ +unsigned long get_guest_max_pfn(void); + #ifndef _WIN32 int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset); #endif diff --git a/target-arm/kvm.c b/target-arm/kvm.c index 5c2bd7a..1cfccb3 100644 --- a/target-arm/kvm.c +++ b/target-arm/kvm.c @@ -626,3 +626,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { return (data - 32) & 0x; } + +unsigned long get_guest_max_pfn(void) +{ +/* To be done */ + +return 0; +} + +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +/* To be done */ + +return bmap; +} diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 9327523..f0503dd 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -3378,3 +3378,40 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { abort(); } + +#define _4G (1ULL << 32) + +unsigned long get_guest_max_pfn(void) +{ +PCMachineState *pcms = PC_MACHINE(current_machine); +ram_addr_t above_4g_mem = pcms->above_4g_mem_size; +unsigned long max_pfn; + +if (above_4g_mem) { +max_pfn = (_4G + above_4g_mem) >> TARGET_PAGE_BITS; +} else { +max_pfn = pcms->below_4g_mem_size >> TARGET_PAGE_BITS; +} + +return max_pfn; +} + +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +PCMachineState *pcms = PC_MACHINE(current_machine); +ram_addr_t above_4g_mem = pcms->above_4g_mem_size; + +if (above_4g_mem) { +unsigned long *src, *dst, len, pos; +ram_addr_t below_4g_mem = pcms->below_4g_mem_size; +src = bmap + (_4G >> TARGET_PAGE_BITS) / BITS_PER_LONG; +dst = bmap + (below_4g_mem >> TARGET_PAGE_BITS) / BITS_PER_LONG; +bitmap_move(dst, src, above_4g_mem >> TARGET_PAGE_BITS); + +pos = (above_4g_mem + below_4g_mem) >> TARGET_PAGE_BITS; +len = (_4G - below_4g_mem) >> TARGET_PAGE_BITS; +bitmap_clear(bmap, pos, len); +} + +return bmap; +} diff --git a/target-mips/kvm.c b/target-mips/kvm.c index f3f832d..ba39827 100644 --- 
a/target-mips/kvm.c +++ b/target-mips/kvm.c @@ -1047,3 +1047,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { abort(); } + +unsigned long get_guest_max_pfn(void) +{ +/* To be done */ + +return 0; +} + +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +/* To be done */ + +return bmap; +} diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index 884d564..ff67b3e 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -2630,3 +2630,17 @@ int kvmppc_enable_hwrng(void) return kvmppc_enable_hcall(kvm_state, H_RANDOM); } + +unsigned long get_guest_max_pfn(void) +{ +/* To be done */ + +return 0; +} + +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +/* To be done */ + +return bmap; +} diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c index 2991bff..2e5c763 100644 --- a/target-s390x/kvm.c +++ b/target-s390x/kvm.c @@ -2271,3 +2271,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { abort(); } + +unsigned long get_guest_max_pfn(void) +{ +/* To be done */ + +return 0; +} + +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +/* To be done */ + +return bmap; +} -- 1.9.1
[Qemu-devel] [QEMU v2 3/9] virtio-balloon: speed up inflating & deflating process
The implementation of the current virtio-balloon is not very efficient, the time spent on different stages of inflating the balloon to 7GB of a 8GB idle guest: a. allocating pages (6.5%) b. sending PFNs to host (68.3%) c. address translation (6.1%) d. madvise (19%) It takes about 4126ms for the inflating process to complete. Debugging shows that the bottlenecks are stage b and stage d. If using a bitmap to send the page info instead of the PFNs, we can reduce the overhead in stage b quite a lot. Furthermore, we can do the address translation and call madvise() with a bulk of RAM pages, instead of the current page per page way, the overhead of stage c and stage d can also be reduced a lot. This patch is the QEMU side implementation which is intended to speed up the inflating & deflating process by adding a new feature to the virtio-balloon device. With this new feature, inflating the balloon to 7GB of a 8GB idle guest only takes 590ms, the performance improvement is about 85%. TODO: optimize stage a by allocating/freeing a chunk of pages instead of a single page at a time. Signed-off-by: Liang Li <liang.z...@intel.com> Suggested-by: Michael S. 
Tsirkin <m...@redhat.com> --- hw/virtio/virtio-balloon.c | 144 ++--- 1 file changed, 123 insertions(+), 21 deletions(-) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 62931b3..a7152c8 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -52,6 +52,77 @@ static const char *balloon_stat_names[] = { [VIRTIO_BALLOON_S_NR] = NULL }; +static void do_balloon_bulk_pages(ram_addr_t base_pfn, uint16_t page_shift, + unsigned long len, bool deflate) +{ +ram_addr_t size, processed, chunk, base; +MemoryRegionSection section = {.mr = NULL}; + +size = len << page_shift; +base = base_pfn << page_shift; + +for (processed = 0; processed < size; processed += chunk) { +chunk = size - processed; +while (chunk >= TARGET_PAGE_SIZE) { +section = memory_region_find(get_system_memory(), + base + processed, chunk); +if (!section.mr) { +chunk = QEMU_ALIGN_DOWN(chunk / 2, TARGET_PAGE_SIZE); +} else { +break; +} +} + +if (section.mr && +(int128_nz(section.size) && memory_region_is_ram(section.mr))) { +void *addr = section.offset_within_region + + memory_region_get_ram_ptr(section.mr); +qemu_madvise(addr, chunk, + deflate ? 
QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED); +} else { +qemu_log_mask(LOG_GUEST_ERROR, + "Invalid guest RAM range [0x%lx, 0x%lx]\n", + base + processed, chunk); +chunk = TARGET_PAGE_SIZE; +} +} +} + +static void balloon_bulk_pages(struct balloon_bmap_hdr *hdr, + unsigned long *bitmap, bool deflate) +{ +ram_addr_t base_pfn = hdr->start_pfn; +uint16_t page_shift = hdr->page_shift; +unsigned long len = hdr->bmap_len; +unsigned long current = 0, end = len * BITS_PER_BYTE; + +if (!qemu_balloon_is_inhibited() && (!kvm_enabled() || + kvm_has_sync_mmu())) { +while (current < end) { +unsigned long one = find_next_bit(bitmap, end, current); + +if (one < end) { +unsigned long pages, zero; + +zero = find_next_zero_bit(bitmap, end, one + 1); +if (zero >= end) { +pages = end - one; +} else { +pages = zero - one; +} + +if (pages) { +do_balloon_bulk_pages(base_pfn + one, page_shift, + pages, deflate); +} +current = one + pages; +} else { +current = one; +} +} +} +} + /* * reset_stats - Mark all items in the stats array as unset * @@ -72,6 +143,13 @@ static bool balloon_stats_supported(const VirtIOBalloon *s) return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ); } +static bool balloon_page_bitmap_supported(const VirtIOBalloon *s) +{ +VirtIODevice *vdev = VIRTIO_DEVICE(s); + +return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP); +} + static bool balloon_stats_enabled(const VirtIOBalloon *s) { return s->stats_poll_interval > 0; @@ -213,32 +291,54 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) for (;;) { size_t offset = 0; uint32_t pfn; + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); if (!elem) {
[Qemu-devel] [QEMU v2 0/9] Fast (de)inflating & fast live migration
This patch set intends to do two optimizations, one is to speed up the (de)inflating process of virtio balloon, and the other is to speed up the live migration process. We put them together because both of them are required to change the virtio balloon spec. The main idea of speeding up the (de)inflating process is to use bitmap to send the page information to host instead of the PFNs, to reduce the overhead of virtio data transmission, address translation and madvise(). This can help to improve the performance by about 85%. The idea of speeding up live migration is to skip processing guest's free pages in the first round of data copy, to reduce needless data processing, this can help to save quite a lot of CPU cycles and network bandwidth. We get guest's free page information through the virt queue of virtio-balloon, and filter out these free pages during live migration. For an idle 8GB guest, this can help to shorten the total live migration time from 2Sec to about 500ms in the 10Gbps network environment. Changes from v1 to v2: * Abandon the patch for dropping page cache. * Get a struct from vq instead of separate variables. * Use two separate APIs to request free pages and query the status. * Changed the virtio balloon interface. * Addressed some of the comments of v1. 
Liang Li (9): virtio-balloon: Remove needless precompiled directive virtio-balloon: update linux head file virtio-balloon: speed up inflating & deflating process virtio-balloon: update linux head file for new feature balloon: get free page info from guest balloon: migrate vq elem to destination bitmap: Add a new bitmap_move function kvm: Add two new arch specific functions migration: skip free pages during live migration balloon.c | 47 +++- hw/virtio/virtio-balloon.c | 304 ++-- include/hw/virtio/virtio-balloon.h | 18 +- include/qemu/bitmap.h | 13 + include/standard-headers/linux/virtio_balloon.h | 41 include/sysemu/balloon.h| 18 +- include/sysemu/kvm.h| 18 ++ migration/ram.c | 86 +++ target-arm/kvm.c| 14 ++ target-i386/kvm.c | 37 +++ target-mips/kvm.c | 14 ++ target-ppc/kvm.c| 14 ++ target-s390x/kvm.c | 14 ++ 13 files changed, 608 insertions(+), 30 deletions(-) -- 1.9.1
[Qemu-devel] [PATCH v2] balloon: Fix failure of updating guest memory status
After live migration, 'guest-stats' can't get the expected memory status in the guest. This issue is caused by commit 4eae2a657d. The value of 's->stats_vq_elem' will be NULL after live migration, and the check in the function 'balloon_stats_poll_cb()' will prevent the 'virtio_notify()' from executing. So guest will not update the memory status. Commit 4eae2a657d is doing the right thing, but 's->stats_vq_elem' should be treated as part of balloon device state and migrated to destination if it's not NULL to make everything works well. Signed-off-by: Liang Li <liang.z...@intel.com> Suggested-by: Paolo Bonzini <pbonz...@redhat.com> Cc: Michael S. Tsirkin <m...@redhat.com> Cc: Ladi Prosek <lpro...@redhat.com> Cc: Paolo Bonzini <pbonz...@redhat.com> --- hw/virtio/virtio-balloon.c | 22 -- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 557d3f9..64e80c6 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -31,6 +31,7 @@ #include "hw/virtio/virtio-access.h" #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) +#define BALLOON_VERSION 2 static void balloon_page(void *addr, int deflate) { @@ -404,15 +405,24 @@ static void virtio_balloon_save(QEMUFile *f, void *opaque) static void virtio_balloon_save_device(VirtIODevice *vdev, QEMUFile *f) { VirtIOBalloon *s = VIRTIO_BALLOON(vdev); +uint16_t elem_num = 0; qemu_put_be32(f, s->num_pages); qemu_put_be32(f, s->actual); +if (s->stats_vq_elem != NULL) { +elem_num = 1; +} +qemu_put_be16(f, elem_num); +if (elem_num) { +qemu_put_virtqueue_element(f, s->stats_vq_elem); +} } static int virtio_balloon_load(QEMUFile *f, void *opaque, int version_id) { -if (version_id != 1) +if (version_id < 1 || version_id > BALLOON_VERSION) { return -EINVAL; +} return virtio_load(VIRTIO_DEVICE(opaque), f, version_id); } @@ -421,9 +431,17 @@ static int virtio_balloon_load_device(VirtIODevice *vdev, QEMUFile *f, int version_id) { VirtIOBalloon *s = 
VIRTIO_BALLOON(vdev); +uint16_t elem_num = 0; s->num_pages = qemu_get_be32(f); s->actual = qemu_get_be32(f); +if (version_id == BALLOON_VERSION) { +elem_num = qemu_get_be16(f); +if (elem_num == 1) { +s->stats_vq_elem = +qemu_get_virtqueue_element(f, sizeof(VirtQueueElement)); +} +} if (balloon_stats_enabled(s)) { balloon_stats_change_timer(s, s->stats_poll_interval); @@ -455,7 +473,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp) reset_stats(s); -register_savevm(dev, "virtio-balloon", -1, 1, +register_savevm(dev, "virtio-balloon", -1, BALLOON_VERSION, virtio_balloon_save, virtio_balloon_load, s); } -- 1.8.3.1
[Qemu-devel] [PATCH] balloon: Fix failure of updating guest memory status
After live migration, 'guest-stats' can't get the expected memory status in the guest. This issue is caused by commit 4eae2a657d. The value of 's->stats_vq_elem' will be NULL after live migration, and the check in the function 'balloon_stats_poll_cb()' will prevent the 'virtio_notify()' from executing. So guest will not update the memory status. Signed-off-by: Liang Li <liang.z...@intel.com> Cc: Michael S. Tsirkin <m...@redhat.com> Cc: Ladi Prosek <lpro...@redhat.com> Cc: Paolo Bonzini <pbonz...@redhat.com> --- hw/virtio/virtio-balloon.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 557d3f9..cc6947f 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -98,13 +98,19 @@ static void balloon_stats_poll_cb(void *opaque) { VirtIOBalloon *s = opaque; VirtIODevice *vdev = VIRTIO_DEVICE(s); +VirtQueueElement elem = {0}; -if (s->stats_vq_elem == NULL || !balloon_stats_supported(s)) { +if (!balloon_stats_supported(s)) { /* re-schedule */ balloon_stats_change_timer(s, s->stats_poll_interval); return; } +if (s->stats_vq_elem == NULL) { +virtqueue_push(s->svq, , 0); +virtio_notify(vdev, s->svq); +return; +} virtqueue_push(s->svq, s->stats_vq_elem, s->stats_vq_offset); virtio_notify(vdev, s->svq); g_free(s->stats_vq_elem); -- 1.9.1
[Qemu-devel] [PATCH v2 kernel 7/7] virtio-balloon: tell host vm's free page info
Support the request for vm's free page information, response with a page bitmap. QEMU can make use of this free page bitmap to speed up live migration process by skipping process the free pages. Signed-off-by: Liang Li <liang.z...@intel.com> Cc: Michael S. Tsirkin <m...@redhat.com> Cc: Paolo Bonzini <pbonz...@redhat.com> Cc: Cornelia Huck <cornelia.h...@de.ibm.com> Cc: Amit Shah <amit.s...@redhat.com> --- drivers/virtio/virtio_balloon.c | 104 +--- 1 file changed, 98 insertions(+), 6 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 2d18ff6..5ca4ad3 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -62,10 +62,13 @@ module_param(oom_pages, int, S_IRUSR | S_IWUSR); MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); extern unsigned long get_max_pfn(void); +extern int get_free_pages(unsigned long start_pfn, unsigned long end_pfn, + unsigned long *bitmap, unsigned long len); + struct virtio_balloon { struct virtio_device *vdev; - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *misc_vq; /* The balloon servicing is delegated to a freezable workqueue. */ struct work_struct update_balloon_stats_work; @@ -89,6 +92,8 @@ struct virtio_balloon { unsigned long pfn_limit; /* Used to record the processed pfn range */ unsigned long min_pfn, max_pfn, start_pfn, end_pfn; + /* Request header */ + struct balloon_req_hdr req_hdr; /* * The pages we've told the Host we're not using are enqueued * at vb_dev_info->pages list. 
@@ -373,6 +378,49 @@ static void update_balloon_stats(struct virtio_balloon *vb) pages_to_bytes(available)); } +static void update_free_pages_stats(struct virtio_balloon *vb, + unsigned long req_id) +{ + struct scatterlist sg_in, sg_out; + unsigned long pfn = 0, bmap_len, max_pfn; + struct virtqueue *vq = vb->misc_vq; + struct balloon_bmap_hdr *hdr = vb->bmap_hdr; + int ret = 1; + + max_pfn = get_max_pfn(); + mutex_lock(>balloon_lock); + while (pfn < max_pfn) { + memset(vb->page_bitmap, 0, vb->bmap_len); + ret = get_free_pages(pfn, pfn + vb->pfn_limit, + vb->page_bitmap, vb->bmap_len * BITS_PER_BYTE); + hdr->cmd = cpu_to_virtio16(vb->vdev, BALLOON_GET_FREE_PAGES); + hdr->page_shift = cpu_to_virtio16(vb->vdev, PAGE_SHIFT); + hdr->req_id = cpu_to_virtio64(vb->vdev, req_id); + hdr->start_pfn = cpu_to_virtio64(vb->vdev, pfn); + bmap_len = vb->pfn_limit / BITS_PER_BYTE; + if (!ret) { + hdr->flag = cpu_to_virtio16(vb->vdev, + BALLOON_FLAG_DONE); + if (pfn + vb->pfn_limit > max_pfn) + bmap_len = (max_pfn - pfn) / BITS_PER_BYTE; + } else + hdr->flag = cpu_to_virtio16(vb->vdev, + BALLOON_FLAG_CONT); + hdr->bmap_len = cpu_to_virtio64(vb->vdev, bmap_len); + sg_init_one(_out, hdr, +sizeof(struct balloon_bmap_hdr) + bmap_len); + + virtqueue_add_outbuf(vq, _out, 1, vb, GFP_KERNEL); + virtqueue_kick(vq); + pfn += vb->pfn_limit; + } + + sg_init_one(_in, >req_hdr, sizeof(vb->req_hdr)); + virtqueue_add_inbuf(vq, _in, 1, >req_hdr, GFP_KERNEL); + virtqueue_kick(vq); + mutex_unlock(>balloon_lock); +} + /* * While most virtqueues communicate guest-initiated requests to the hypervisor, * the stats queue operates in reverse. 
The driver initializes the virtqueue @@ -511,18 +559,49 @@ static void update_balloon_size_func(struct work_struct *work) queue_work(system_freezable_wq, work); } +static void misc_handle_rq(struct virtio_balloon *vb) +{ + struct balloon_req_hdr *ptr_hdr; + unsigned int len; + + ptr_hdr = virtqueue_get_buf(vb->misc_vq, ); + if (!ptr_hdr || len != sizeof(vb->req_hdr)) + return; + + switch (ptr_hdr->cmd) { + case BALLOON_GET_FREE_PAGES: + update_free_pages_stats(vb, ptr_hdr->param); + break; + default: + break; + } +} + +static void misc_request(struct virtqueue *vq) +{ + struct virtio_balloon *vb = vq->vdev->priv; + + misc_handle_rq(vb); +} + static int init_vqs(struct virtio_balloon *vb) { - struct virtqueue *vqs[3]; -
[Qemu-devel] [PATCH v2 kernel 1/7] virtio-balloon: rework deflate to add page to a list
will allow faster notifications using a bitmap down the road. balloon_pfn_to_page() can be removed because it's useless. Signed-off-by: Liang Li <liang.z...@intel.com> Signed-off-by: Michael S. Tsirkin <m...@redhat.com> Cc: Paolo Bonzini <pbonz...@redhat.com> Cc: Cornelia Huck <cornelia.h...@de.ibm.com> Cc: Amit Shah <amit.s...@redhat.com> --- drivers/virtio/virtio_balloon.c | 22 -- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 476c0e3..8d649a2 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -98,12 +98,6 @@ static u32 page_to_balloon_pfn(struct page *page) return pfn * VIRTIO_BALLOON_PAGES_PER_PAGE; } -static struct page *balloon_pfn_to_page(u32 pfn) -{ - BUG_ON(pfn % VIRTIO_BALLOON_PAGES_PER_PAGE); - return pfn_to_page(pfn / VIRTIO_BALLOON_PAGES_PER_PAGE); -} - static void balloon_ack(struct virtqueue *vq) { struct virtio_balloon *vb = vq->vdev->priv; @@ -176,18 +170,16 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) return num_allocated_pages; } -static void release_pages_balloon(struct virtio_balloon *vb) +static void release_pages_balloon(struct virtio_balloon *vb, +struct list_head *pages) { - unsigned int i; - struct page *page; + struct page *page, *next; - /* Find pfns pointing at start of each page, get pages and free them. 
*/ - for (i = 0; i < vb->num_pfns; i += VIRTIO_BALLOON_PAGES_PER_PAGE) { - page = balloon_pfn_to_page(virtio32_to_cpu(vb->vdev, - vb->pfns[i])); + list_for_each_entry_safe(page, next, pages, lru) { if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) adjust_managed_page_count(page, 1); + list_del(>lru); put_page(page); /* balloon reference */ } } @@ -197,6 +189,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) unsigned num_freed_pages; struct page *page; struct balloon_dev_info *vb_dev_info = >vb_dev_info; + LIST_HEAD(pages); /* We can only do one array worth at a time. */ num = min(num, ARRAY_SIZE(vb->pfns)); @@ -208,6 +201,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) if (!page) break; set_page_pfns(vb, vb->pfns + vb->num_pfns, page); + list_add(>lru, ); vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; } @@ -219,7 +213,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num) */ if (vb->num_pfns != 0) tell_host(vb, vb->deflate_vq); - release_pages_balloon(vb); + release_pages_balloon(vb, ); mutex_unlock(>balloon_lock); return num_freed_pages; } -- 1.8.3.1
[Qemu-devel] [PATCH v2 kernel 3/7] mm: add a function to get the max pfn
Expose the function to get the max pfn, so it can be used in the virtio-balloon device driver. Signed-off-by: Liang Li <liang.z...@intel.com> Cc: Andrew Morton <a...@linux-foundation.org> Cc: Mel Gorman <mgor...@techsingularity.net> Cc: Michael S. Tsirkin <m...@redhat.com> Cc: Paolo Bonzini <pbonz...@redhat.com> Cc: Cornelia Huck <cornelia.h...@de.ibm.com> Cc: Amit Shah <amit.s...@redhat.com> --- mm/page_alloc.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6903b69..2083b40 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4515,6 +4515,12 @@ void show_free_areas(unsigned int filter) show_swap_cache_info(); } +unsigned long get_max_pfn(void) +{ + return max_pfn; +} +EXPORT_SYMBOL(get_max_pfn); + static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) { zoneref->zone = zone; -- 1.8.3.1
[Qemu-devel] [PATCH v2 kernel 6/7] mm: add the related functions to get free page info
Save the free page info into a page bitmap, will be used in virtio balloon device driver. Signed-off-by: Liang Li <liang.z...@intel.com> Cc: Andrew Morton <a...@linux-foundation.org> Cc: Mel Gorman <mgor...@techsingularity.net> Cc: Michael S. Tsirkin <m...@redhat.com> Cc: Paolo Bonzini <pbonz...@redhat.com> Cc: Cornelia Huck <cornelia.h...@de.ibm.com> Cc: Amit Shah <amit.s...@redhat.com> --- mm/page_alloc.c | 46 ++ 1 file changed, 46 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2083b40..c2a6669 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4521,6 +4521,52 @@ unsigned long get_max_pfn(void) } EXPORT_SYMBOL(get_max_pfn); +static void mark_free_pages_bitmap(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn, unsigned long *bitmap, unsigned long len) +{ + unsigned long pfn, flags, page_num; + unsigned int order, t; + struct list_head *curr; + + if (zone_is_empty(zone)) + return; + end_pfn = min(start_pfn + len, end_pfn); + spin_lock_irqsave(>lock, flags); + + for_each_migratetype_order(order, t) { + list_for_each(curr, >free_area[order].free_list[t]) { + pfn = page_to_pfn(list_entry(curr, struct page, lru)); + if (pfn >= start_pfn && pfn <= end_pfn) { + page_num = 1UL << order; + if (pfn + page_num > end_pfn) + page_num = end_pfn - pfn; + bitmap_set(bitmap, pfn - start_pfn, page_num); + } + } + } + + spin_unlock_irqrestore(>lock, flags); +} + +int get_free_pages(unsigned long start_pfn, unsigned long end_pfn, + unsigned long *bitmap, unsigned long len) +{ + struct zone *zone; + int ret = 0; + + if (bitmap == NULL || start_pfn > end_pfn || start_pfn >= max_pfn) + return 0; + if (end_pfn < max_pfn) + ret = 1; + if (end_pfn >= max_pfn) + ret = 0; + + for_each_populated_zone(zone) + mark_free_pages_bitmap(zone, start_pfn, end_pfn, bitmap, len); + return ret; +} +EXPORT_SYMBOL(get_free_pages); + static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) { zoneref->zone = zone; -- 1.8.3.1
[Qemu-devel] [PATCH v2 kernel 4/7] virtio-balloon: speed up inflate/deflate process
The implementation of the current virtio-balloon is not very efficient, the time spends on different stages of inflating the balloon to 7GB of a 8GB idle guest: a. allocating pages (6.5%) b. sending PFNs to host (68.3%) c. address translation (6.1%) d. madvise (19%) It takes about 4126ms for the inflating process to complete. Debugging shows that the bottle neck are the stage b and stage d. If using a bitmap to send the page info instead of the PFNs, we can reduce the overhead in stage b quite a lot. Furthermore, we can do the address translation and call madvise() with a bulk of RAM pages, instead of the current page per page way, the overhead of stage c and stage d can also be reduced a lot. This patch is the kernel side implementation which is intended to speed up the inflating & deflating process by adding a new feature to the virtio-balloon device. With this new feature, inflating the balloon to 7GB of a 8GB idle guest only takes 590ms, the performance improvement is about 85%. TODO: optimize stage a by allocating/freeing a chunk of pages instead of a single page at a time. Signed-off-by: Liang Li <liang.z...@intel.com> Suggested-by: Michael S. Tsirkin <m...@redhat.com> Cc: Michael S. Tsirkin <m...@redhat.com> Cc: Paolo Bonzini <pbonz...@redhat.com> Cc: Cornelia Huck <cornelia.h...@de.ibm.com> Cc: Amit Shah <amit.s...@redhat.com> --- drivers/virtio/virtio_balloon.c | 184 +++- 1 file changed, 162 insertions(+), 22 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 8d649a2..2d18ff6 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -41,10 +41,28 @@ #define OOM_VBALLOON_DEFAULT_PAGES 256 #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80 +/* + * VIRTIO_BALLOON_PFNS_LIMIT is used to limit the size of page bitmap + * to prevent a very large page bitmap, there are two reasons for this: + * 1) to save memory. + * 2) allocate a large bitmap may fail. 
+ * + * The actual limit of pfn is determined by: + * pfn_limit = min(max_pfn, VIRTIO_BALLOON_PFNS_LIMIT); + * + * If system has more pages than VIRTIO_BALLOON_PFNS_LIMIT, we will scan + * the page list and send the PFNs with several times. To reduce the + * overhead of scanning the page list. VIRTIO_BALLOON_PFNS_LIMIT should + * be set with a value which can cover most cases. + */ +#define VIRTIO_BALLOON_PFNS_LIMIT ((32 * (1ULL << 30)) >> PAGE_SHIFT) /* 32GB */ + static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES; module_param(oom_pages, int, S_IRUSR | S_IWUSR); MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); +extern unsigned long get_max_pfn(void); + struct virtio_balloon { struct virtio_device *vdev; struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; @@ -62,6 +80,15 @@ struct virtio_balloon { /* Number of balloon pages we've told the Host we're not using. */ unsigned int num_pages; + /* Pointer of the bitmap header. */ + void *bmap_hdr; + /* Bitmap and length used to tell the host the pages */ + unsigned long *page_bitmap; + unsigned long bmap_len; + /* Pfn limit */ + unsigned long pfn_limit; + /* Used to record the processed pfn range */ + unsigned long min_pfn, max_pfn, start_pfn, end_pfn; /* * The pages we've told the Host we're not using are enqueued * at vb_dev_info->pages list. 
@@ -105,12 +132,45 @@ static void balloon_ack(struct virtqueue *vq) wake_up(>acked); } +static inline void init_pfn_range(struct virtio_balloon *vb) +{ + vb->min_pfn = ULONG_MAX; + vb->max_pfn = 0; +} + +static inline void update_pfn_range(struct virtio_balloon *vb, +struct page *page) +{ + unsigned long balloon_pfn = page_to_balloon_pfn(page); + + if (balloon_pfn < vb->min_pfn) + vb->min_pfn = balloon_pfn; + if (balloon_pfn > vb->max_pfn) + vb->max_pfn = balloon_pfn; +} + static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) { struct scatterlist sg; unsigned int len; - sg_init_one(, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP)) { + struct balloon_bmap_hdr *hdr = vb->bmap_hdr; + unsigned long bmap_len; + + /* cmd and req_id are not used here, set them to 0 */ + hdr->cmd = cpu_to_virtio16(vb->vdev, 0); + hdr->page_shift = cpu_to_virtio16(vb->vdev, PAGE_SHIFT); + hdr->reserved = cpu_to_virtio16(vb->vdev, 0); + hdr->req_id = cpu_to_virtio64(vb->vdev, 0); + hdr->start_pfn = cpu_to_virtio64(vb->vdev, vb->start_pfn); + bmap_len = min(vb->bmap_
[Qemu-devel] [PATCH v2 kernel 5/7] virtio-balloon: define feature bit and head for misc virt queue
Define a new feature bit which supports a new virtual queue. This new virtual queue is for information exchange between hypervisor and guest. The VMM hypervisor can make use of this virtual queue to request the guest do some operations, e.g. drop page cache, synchronize file system, etc. And the VMM hypervisor can get some of guest's runtime information through this virtual queue, e.g. the guest's free page information, which can be used for live migration optimization. Signed-off-by: Liang Li <liang.z...@intel.com> Cc: Michael S. Tsirkin <m...@redhat.com> Cc: Paolo Bonzini <pbonz...@redhat.com> Cc: Cornelia Huck <cornelia.h...@de.ibm.com> Cc: Amit Shah <amit.s...@redhat.com> --- include/uapi/linux/virtio_balloon.h | 22 ++ 1 file changed, 22 insertions(+) diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index d3b182a..be4880f 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -35,6 +35,7 @@ #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM2 /* Deflate balloon on OOM */ #define VIRTIO_BALLOON_F_PAGE_BITMAP 3 /* Send page info with bitmap */ +#define VIRTIO_BALLOON_F_MISC_VQ 4 /* Misc info virtqueue */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 @@ -101,4 +102,25 @@ struct balloon_bmap_hdr { __virtio64 bmap_len; }; +enum balloon_req_id { + /* Get free pages information */ + BALLOON_GET_FREE_PAGES, +}; + +enum balloon_flag { + /* Have more data for a request */ + BALLOON_FLAG_CONT, + /* No more data for a request */ + BALLOON_FLAG_DONE, +}; + +struct balloon_req_hdr { + /* Used to distinguish different request */ + __virtio16 cmd; + /* Reserved */ + __virtio16 reserved[3]; + /* Request parameter */ + __virtio64 param; +}; + #endif /* _LINUX_VIRTIO_BALLOON_H */ -- 1.8.3.1
[Qemu-devel] [PATCH v2 kernel 2/7] virtio-balloon: define new feature bit and page bitmap head
Add a new feature which supports sending the page information with a bitmap. The current implementation uses a PFNs array, which is not very efficient. Using a bitmap can improve the performance of inflating/deflating significantly. The page bitmap header will be used to tell the host some information about the page bitmap, e.g. the page size, page bitmap length and start pfn. Signed-off-by: Liang Li <liang.z...@intel.com> Cc: Michael S. Tsirkin <m...@redhat.com> Cc: Paolo Bonzini <pbonz...@redhat.com> Cc: Cornelia Huck <cornelia.h...@de.ibm.com> Cc: Amit Shah <amit.s...@redhat.com> --- include/uapi/linux/virtio_balloon.h | 19 +++ 1 file changed, 19 insertions(+) diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 343d7dd..d3b182a 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -34,6 +34,7 @@ #define VIRTIO_BALLOON_F_MUST_TELL_HOST0 /* Tell before reclaiming pages */ #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM2 /* Deflate balloon on OOM */ +#define VIRTIO_BALLOON_F_PAGE_BITMAP 3 /* Send page info with bitmap */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 @@ -82,4 +83,22 @@ struct virtio_balloon_stat { __virtio64 val; } __attribute__((packed)); +/* Page bitmap header structure */ +struct balloon_bmap_hdr { + /* Used to distinguish different request */ + __virtio16 cmd; + /* Shift width of page in the bitmap */ + __virtio16 page_shift; + /* flag used to identify different status */ + __virtio16 flag; + /* Reserved */ + __virtio16 reserved; + /* ID of the request */ + __virtio64 req_id; + /* The pfn of 0 bit in the bitmap */ + __virtio64 start_pfn; + /* The length of the bitmap, in bytes */ + __virtio64 bmap_len; +}; + #endif /* _LINUX_VIRTIO_BALLOON_H */ -- 1.8.3.1
[Qemu-devel] [PATCH v2 kernel 0/7] Extend virtio-balloon for fast (de)inflating & fast live migration
This patch set contains two parts of changes to the virtio-balloon. One is the change for speeding up the inflating & deflating process, the main idea of this optimization is to use bitmap to send the page information to host instead of the PFNs, to reduce the overhead of virtio data transmission, address translation and madvise(). This can help to improve the performance by about 85%. Another change is for speeding up live migration. By skipping process guest's free pages in the first round of data copy, to reduce needless data processing, this can help to save quite a lot of CPU cycles and network bandwidth. We put guest's free page information in bitmap and send it to host with the virt queue of virtio-balloon. For an idle 8GB guest, this can help to shorten the total live migration time from 2Sec to about 500ms in the 10Gbps network environment. Changes from v1 to v2: * Abandon the patch for dropping page cache. * Put some structures to uapi head file. * Use a new way to determine the page bitmap size. * Use a unified way to send the free page information with the bitmap * Address the issues referred in MST's comments Liang Li (7): virtio-balloon: rework deflate to add page to a list virtio-balloon: define new feature bit and page bitmap head mm: add a function to get the max pfn virtio-balloon: speed up inflate/deflate process virtio-balloon: define feature bit and head for misc virt queue mm: add the related functions to get free page info virtio-balloon: tell host vm's free page info drivers/virtio/virtio_balloon.c | 306 +++- include/uapi/linux/virtio_balloon.h | 41 + mm/page_alloc.c | 52 ++ 3 files changed, 359 insertions(+), 40 deletions(-) -- 1.8.3.1
[Qemu-devel] [QEMU 7/7] migration: skip free pages during live migration
After sending out the request for free pages, live migration process will start without waiting for the free page bitmap is ready. If the free page bitmap is not ready when doing the 1st migration_bitmap_sync() after ram_save_setup(), the free page bitmap will be ignored, this means the free pages will not be filtered out in this case. The current implementation can not work with post copy, if post copy is enabled, we simply ignore the free pages. Will make it work later. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 93 + 1 file changed, 93 insertions(+) diff --git a/migration/ram.c b/migration/ram.c index 844ea46..5f1c3ff 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -43,6 +43,8 @@ #include "trace.h" #include "exec/ram_addr.h" #include "qemu/rcu_queue.h" +#include "sysemu/balloon.h" +#include "sysemu/kvm.h" #ifdef DEBUG_MIGRATION_RAM #define DPRINTF(fmt, ...) \ @@ -228,6 +230,7 @@ static QemuMutex migration_bitmap_mutex; static uint64_t migration_dirty_pages; static uint32_t last_version; static bool ram_bulk_stage; +static bool ignore_freepage_rsp; /* used by the search for pages to send */ struct PageSearchStatus { @@ -244,6 +247,7 @@ static struct BitmapRcu { struct rcu_head rcu; /* Main migration bitmap */ unsigned long *bmap; +unsigned long *free_page_bmap; /* bitmap of pages that haven't been sent even once * only maintained and used in postcopy at the moment * where it's used to send the dirtymap at the start @@ -639,6 +643,7 @@ static void migration_bitmap_sync(void) rcu_read_unlock(); qemu_mutex_unlock(_bitmap_mutex); +ignore_freepage_rsp = true; trace_migration_bitmap_sync_end(migration_dirty_pages - num_dirty_pages_init); num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init; @@ -1417,6 +1422,7 @@ static void migration_bitmap_free(struct BitmapRcu *bmap) { g_free(bmap->bmap); g_free(bmap->unsentmap); +g_free(bmap->free_page_bmap); g_free(bmap); } @@ -1487,6 +1493,85 @@ void 
migration_bitmap_extend(ram_addr_t old, ram_addr_t new) } } +static void filter_out_guest_free_page(unsigned long *free_page_bmap, + long nbits) +{ +long i, page_count = 0, len; +unsigned long *bitmap; + +tighten_guest_free_page_bmap(free_page_bmap); +qemu_mutex_lock(_bitmap_mutex); +bitmap = atomic_rcu_read(_bitmap_rcu)->bmap; +slow_bitmap_complement(bitmap, free_page_bmap, nbits); + +len = (last_ram_offset() >> TARGET_PAGE_BITS) / BITS_PER_LONG; +for (i = 0; i < len; i++) { +page_count += hweight_long(bitmap[i]); +} + +migration_dirty_pages = page_count; +qemu_mutex_unlock(_bitmap_mutex); +} + +static void ram_request_free_page(unsigned long *bmap, unsigned long max_pfn) +{ +BalloonReqStatus status; + +status = balloon_get_free_pages(bmap, max_pfn); +switch (status) { +case REQ_DONE: +ignore_freepage_rsp = false; +break; +case REQ_ERROR: +error_report("Errro happend when request free page"); +break; +default: +error_report("unexpected response status: %d", status); +break; +} +} + +static void ram_handle_free_page(void) +{ +unsigned long nbits; +RAMBlock *pc_ram_block; +BalloonReqStatus status; + +status = balloon_get_free_pages(migration_bitmap_rcu->free_page_bmap, +get_guest_max_pfn()); +switch (status) { +case REQ_DONE: +rcu_read_lock(); +pc_ram_block = QLIST_FIRST_RCU(_list.blocks); +nbits = pc_ram_block->used_length >> TARGET_PAGE_BITS; +filter_out_guest_free_page(migration_bitmap_rcu->free_page_bmap, nbits); +rcu_read_unlock(); + +qemu_mutex_lock_iothread(); +migration_bitmap_sync(); +qemu_mutex_unlock_iothread(); +/* + * bulk stage assumes in (migration_bitmap_find_and_reset_dirty) that + * every page is dirty, that's no longer ture at this point. 
+ */ +ram_bulk_stage = false; +last_seen_block = NULL; +last_sent_block = NULL; +last_offset = 0; +break; +case REQ_ERROR: +ignore_freepage_rsp = true; +error_report("failed to get free page"); +break; +case REQ_INVALID_PARAM: +ignore_freepage_rsp = true; +error_report("buffer overflow"); +break; +default: +break; +} +} + /* * 'expected' is the value you expect the bitmap mostly to be full * of; it won't bother printing lines that are all this value. @@ -1950,6 +2035,11 @@ static int ram_save_setup(QEMUFile *f, void *opaque) qemu_mutex_unlock_ramlist();
[Qemu-devel] [QEMU 5/7] bitmap: Add a new bitmap_move function
Sometimes, it is needed to move a portion of a bitmap to another place in a large bitmap; if overlap happens, bitmap_copy cannot work correctly, so we need a new function to do this work. Signed-off-by: Liang Li <liang.z...@intel.com> --- include/qemu/bitmap.h | 13 + 1 file changed, 13 insertions(+) diff --git a/include/qemu/bitmap.h b/include/qemu/bitmap.h index ec5146f..6ac89ca 100644 --- a/include/qemu/bitmap.h +++ b/include/qemu/bitmap.h @@ -37,6 +37,7 @@ * bitmap_set(dst, pos, nbits) Set specified bit area * bitmap_set_atomic(dst, pos, nbits) Set specified bit area with atomic ops * bitmap_clear(dst, pos, nbits) Clear specified bit area + * bitmap_move(dst, src, nbits) Move *src to *dst * bitmap_test_and_clear_atomic(dst, pos, nbits)Test and clear area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area */ @@ -136,6 +137,18 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, } } +static inline void bitmap_move(unsigned long *dst, const unsigned long *src, + long nbits) +{ +if (small_nbits(nbits)) { +unsigned long tmp = *src; +*dst = tmp; +} else { +long len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); +memmove(dst, src, len); +} +} + static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, long nbits) { -- 1.9.1
[Qemu-devel] [QEMU 6/7] kvm: Add two new arch specific functions
Add a new function to get the vm's max pfn and a new function to filter out the holes to get a tight free page bitmap. They are implemented on X86, and all the arches should implement them for live migration optimization. Signed-off-by: Liang Li <liang.z...@intel.com> --- include/sysemu/kvm.h | 2 ++ target-arm/kvm.c | 14 ++ target-i386/kvm.c| 35 +++ target-mips/kvm.c| 14 ++ target-ppc/kvm.c | 14 ++ target-s390x/kvm.c | 14 ++ 6 files changed, 93 insertions(+) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index ad6f837..50915f9 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -230,6 +230,8 @@ int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, target_ulong len, int type); void kvm_remove_all_breakpoints(CPUState *cpu); int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap); +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap); +unsigned long get_guest_max_pfn(void); #ifndef _WIN32 int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset); #endif diff --git a/target-arm/kvm.c b/target-arm/kvm.c index 83da447..6464542 100644 --- a/target-arm/kvm.c +++ b/target-arm/kvm.c @@ -627,3 +627,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { return (data - 32) & 0x; } + +unsigned long get_guest_max_pfn(void) +{ +/* To be done */ + +return 0; +} + +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +/* To be done */ + +return bmap; +} diff --git a/target-i386/kvm.c b/target-i386/kvm.c index abf50e6..0b394cb 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -3327,3 +3327,38 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { abort(); } + +unsigned long get_guest_max_pfn(void) +{ +PCMachineState *pcms = PC_MACHINE(current_machine); +ram_addr_t above_4g_mem = pcms->above_4g_mem_size; +unsigned long max_pfn; + +if (above_4g_mem) { +max_pfn = ((1ULL << 32) + above_4g_mem) >> TARGET_PAGE_BITS; +} else { +max_pfn = pcms->below_4g_mem_size >> TARGET_PAGE_BITS; +} + +return max_pfn; +} + 
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +PCMachineState *pcms = PC_MACHINE(current_machine); +ram_addr_t above_4g_mem = pcms->above_4g_mem_size; + +if (above_4g_mem) { +unsigned long *src, *dst, len, pos; +ram_addr_t below_4g_mem = pcms->below_4g_mem_size; +src = bmap + ((1ULL << 32) >> TARGET_PAGE_BITS) / BITS_PER_LONG; +dst = bmap + (below_4g_mem >> TARGET_PAGE_BITS) / BITS_PER_LONG; +bitmap_move(dst, src, above_4g_mem >> TARGET_PAGE_BITS); + +pos = (above_4g_mem + below_4g_mem) >> TARGET_PAGE_BITS; +len = ((1ULL << 32) - below_4g_mem) >> TARGET_PAGE_BITS; +bitmap_clear(bmap, pos, len); +} + +return bmap; +} diff --git a/target-mips/kvm.c b/target-mips/kvm.c index a854e4d..89a54e5 100644 --- a/target-mips/kvm.c +++ b/target-mips/kvm.c @@ -1048,3 +1048,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { abort(); } + +unsigned long get_guest_max_pfn(void) +{ +/* To be done */ + +return 0; +} + +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +/* To be done */ + +return bmap; +} diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index 24d6032..e222b31 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -2579,3 +2579,17 @@ int kvmppc_enable_hwrng(void) return kvmppc_enable_hcall(kvm_state, H_RANDOM); } + +unsigned long get_guest_max_pfn(void) +{ +/* To be done */ + +return 0; +} + +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +/* To be done */ + +return bmap; +} diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c index 8f46fd0..893755b 100644 --- a/target-s390x/kvm.c +++ b/target-s390x/kvm.c @@ -2271,3 +2271,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { abort(); } + +unsigned long get_guest_max_pfn(void) +{ +/* To be done */ + +return 0; +} + +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +/* To be done */ + +return bmap; +} -- 1.9.1
[Qemu-devel] [QEMU 4/7] balloon: get free page info from guest
Add a new feature to get the free page information from guest, the free page information is saved in a bitmap. Please note that 'free page' only means these pages are free before the request, some of the pages will become no free during the process of sending the free page bitmap to QEMU. Signed-off-by: Liang Li <liang.z...@intel.com> --- balloon.c | 24 +++- hw/virtio/virtio-balloon.c | 75 +- include/hw/virtio/virtio-balloon.h | 4 ++ include/sysemu/balloon.h | 8 4 files changed, 108 insertions(+), 3 deletions(-) diff --git a/balloon.c b/balloon.c index 3d96111..c74c472 100644 --- a/balloon.c +++ b/balloon.c @@ -37,6 +37,7 @@ static QEMUBalloonEvent *balloon_event_fn; static QEMUBalloonStatus *balloon_stat_fn; static QEMUBalloonDropCache *balloon_drop_cache_fn; +static QEMUBalloonGetFreePage *balloon_get_free_page_fn; static void *balloon_opaque; static bool balloon_inhibited; @@ -68,10 +69,11 @@ static bool have_balloon(Error **errp) int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, QEMUBalloonStatus *stat_func, QEMUBalloonDropCache *drop_cache_func, + QEMUBalloonGetFreePage *get_free_page_func, void *opaque) { if (balloon_event_fn || balloon_stat_fn || balloon_drop_cache_fn -|| balloon_opaque) { +|| balloon_get_free_page_fn || balloon_opaque) { /* We're already registered one balloon handler. How many can * a guest really have? 
*/ @@ -80,6 +82,7 @@ int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, balloon_event_fn = event_func; balloon_stat_fn = stat_func; balloon_drop_cache_fn = drop_cache_func; +balloon_get_free_page_fn = get_free_page_func; balloon_opaque = opaque; return 0; } @@ -92,6 +95,7 @@ void qemu_remove_balloon_handler(void *opaque) balloon_event_fn = NULL; balloon_stat_fn = NULL; balloon_drop_cache_fn = NULL; +balloon_get_free_page_fn = NULL; balloon_opaque = NULL; } @@ -141,3 +145,21 @@ void qmp_balloon_drop_cache(DropCacheType type, Error **errp) balloon_drop_cache_fn(balloon_opaque, type); } + +bool balloon_free_pages_support(void) +{ +return balloon_get_free_page_fn ? true : false; +} + +BalloonReqStatus balloon_get_free_pages(unsigned long *bitmap, unsigned long len) +{ +if (!balloon_get_free_page_fn) { +return REQ_UNSUPPORT; +} + +if (!bitmap) { +return REQ_INVALID_PARAM; +} + +return balloon_get_free_page_fn(balloon_opaque, bitmap, len); +} diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 4757ba5..30ba074 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -38,6 +38,7 @@ enum balloon_req_id { BALLOON_DROP_CACHE, + BALLOON_GET_FREE_PAGES, }; static void balloon_page(void *addr, int deflate) @@ -435,7 +436,8 @@ static void virtio_balloon_handle_resp(VirtIODevice *vdev, VirtQueue *vq) VirtIOBalloon *s = VIRTIO_BALLOON(vdev); VirtQueueElement *elem; size_t offset = 0; -uint32_t tmp32, id = 0; +uint32_t tmp32, id = 0, page_shift; +uint64_t base_pfn, tmp64, bmap_len; elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); if (!elem) { @@ -457,6 +459,32 @@ static void virtio_balloon_handle_resp(VirtIODevice *vdev, VirtQueue *vq) case BALLOON_DROP_CACHE: s->req_status = REQ_DONE; break; +case BALLOON_GET_FREE_PAGES: +iov_to_buf(elem->out_sg, elem->out_num, offset, + , sizeof(uint32_t)); +page_shift = virtio_ldl_p(vdev, ); +offset += sizeof(uint32_t); +s->page_shift = page_shift; + +iov_to_buf(elem->out_sg, elem->out_num, 
offset, + , sizeof(uint64_t)); +base_pfn = virtio_ldq_p(vdev, ); +offset += sizeof(uint64_t); +s->base_pfn = base_pfn; + +iov_to_buf(elem->out_sg, elem->out_num, offset, + , sizeof(uint64_t)); +bmap_len = virtio_ldq_p(vdev, ); +offset += sizeof(uint64_t); +if (s->bmap_len < bmap_len) { + s->req_status = REQ_INVALID_PARAM; + return; +} + +iov_to_buf(elem->out_sg, elem->out_num, offset, + s->free_page_bmap, bmap_len); +s->req_status = REQ_DONE; + break; default: break; } @@ -574,6 +602,48 @@ static int virtio_balloon_drop_cache(void *opaque, unsigned long type) return REQ_DONE; } +static BalloonReqStatus virtio_balloon_free_pages(void *opaque, + unsigned long *bitmap, + unsigned long bmap_len) +{ +VirtIOBalloon *s = opaque; +VirtIODevice *vd
[Qemu-devel] [QEMU 3/7] Add the hmp and qmp interface for dropping cache
Add the hmp and qmp interface to drop vm's page cache, users can control the type of cache they want vm to drop. Signed-off-by: Liang Li <liang.z...@intel.com> --- balloon.c| 19 +++ hmp-commands.hx | 15 +++ hmp.c| 22 ++ hmp.h| 3 +++ monitor.c| 18 ++ qapi-schema.json | 35 +++ qmp-commands.hx | 23 +++ 7 files changed, 135 insertions(+) diff --git a/balloon.c b/balloon.c index 0fb34bf..3d96111 100644 --- a/balloon.c +++ b/balloon.c @@ -122,3 +122,22 @@ void qmp_balloon(int64_t target, Error **errp) trace_balloon_event(balloon_opaque, target); balloon_event_fn(balloon_opaque, target); } + +void qmp_balloon_drop_cache(DropCacheType type, Error **errp) +{ +if (!have_balloon(errp)) { +return; +} + +if (!balloon_drop_cache_fn) { +error_setg(errp, QERR_UNSUPPORTED); +return; +} +if (type < 0 || type >= DROP_CACHE_TYPE__MAX) { +error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "type", + "a value in range[0, 3]"); +return; +} + +balloon_drop_cache_fn(balloon_opaque, type); +} diff --git a/hmp-commands.hx b/hmp-commands.hx index 98b4b1a..c73572c 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1378,6 +1378,21 @@ Request VM to change its memory allocation to @var{value} (in MB). ETEXI { +.name = "balloon_drop_cache", +.args_type = "type:s", +.params = "type", +.help = "request VM to drop its page caches", +.mhandler.cmd = hmp_balloon_drop_cache, +.command_completion = balloon_drop_cache_completion +}, + +STEXI +@item balloon_drop_cache @var{type} +@findex balloon_drop_cache +Request VM to drop its page caches. 
+ETEXI + +{ .name = "set_link", .args_type = "name:s,up:b", .params = "name on|off", diff --git a/hmp.c b/hmp.c index a4b1d3d..3aa1062 100644 --- a/hmp.c +++ b/hmp.c @@ -1061,6 +1061,28 @@ void hmp_balloon(Monitor *mon, const QDict *qdict) } } +void hmp_balloon_drop_cache(Monitor *mon, const QDict *qdict) +{ +const char *type = qdict_get_str(qdict, "type"); +Error *err = NULL; +int i; + +for (i = 0; i < DROP_CACHE_TYPE__MAX; i++) { +if (strcmp(type, DropCacheType_lookup[i]) == 0) { +qmp_balloon_drop_cache(1 + i, ); +break; +} +} + +if (i == DROP_CACHE_TYPE__MAX) { +error_setg(, QERR_INVALID_PARAMETER, type); +} + +if (err) { +error_report_err(err); +} +} + void hmp_block_resize(Monitor *mon, const QDict *qdict) { const char *device = qdict_get_str(qdict, "device"); diff --git a/hmp.h b/hmp.h index 093d65f..6bb6499 100644 --- a/hmp.h +++ b/hmp.h @@ -55,6 +55,7 @@ void hmp_nmi(Monitor *mon, const QDict *qdict); void hmp_set_link(Monitor *mon, const QDict *qdict); void hmp_block_passwd(Monitor *mon, const QDict *qdict); void hmp_balloon(Monitor *mon, const QDict *qdict); +void hmp_balloon_drop_cache(Monitor *mon, const QDict *qdict); void hmp_block_resize(Monitor *mon, const QDict *qdict); void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict); void hmp_snapshot_blkdev_internal(Monitor *mon, const QDict *qdict); @@ -120,6 +121,8 @@ void watchdog_action_completion(ReadLineState *rs, int nb_args, const char *str); void migrate_set_capability_completion(ReadLineState *rs, int nb_args, const char *str); +void balloon_drop_cache_completion(ReadLineState *rs, int nb_args, + const char *str); void migrate_set_parameter_completion(ReadLineState *rs, int nb_args, const char *str); void host_net_add_completion(ReadLineState *rs, int nb_args, const char *str); diff --git a/monitor.c b/monitor.c index a27e115..eefdf3d 100644 --- a/monitor.c +++ b/monitor.c @@ -3367,6 +3367,24 @@ void migrate_set_parameter_completion(ReadLineState *rs, int nb_args, } } +void 
balloon_drop_cache_completion(ReadLineState *rs, int nb_args, + const char *str) +{ +size_t len; + +len = strlen(str); +readline_set_completion_index(rs, len); +if (nb_args == 2) { +int i; +for (i = 0; i < DROP_CACHE_TYPE__MAX; i++) { +const char *name = DropCacheType_lookup[i]; +if (!strncmp(str, name, len)) { +readline_add_completion(rs, name); +} +} +} +} + void host_net_add_completion(ReadLineState *rs, int nb_args, const char *str) { int i; diff --git a/qapi-schema.json b/qapi-schema.json index 8483bdf..117f70a 100644 --- a/qapi-schema.json +++
[Qemu-devel] [QEMU 2/7] virtio-balloon: add drop cache support
virtio-balloon can make use of the amount of free memory to determine the amount of memory to be filled in the balloon, but the amount of free memory will be affected by the page cache, which can be reclaimed. Dropping the cache before getting the amount of free memory will be very helpful to reflect the exact amount of memory that can be reclaimed. This patch adds a new feature to the balloon device to support this operation, hypervisor can request the VM to drop its cache, so as to reclaim more memory. Signed-off-by: Liang Li <liang.z...@intel.com> --- balloon.c | 10 ++- hw/virtio/virtio-balloon.c | 85 - include/hw/virtio/virtio-balloon.h | 19 +- include/standard-headers/linux/virtio_balloon.h | 1 + include/sysemu/balloon.h| 5 +- 5 files changed, 115 insertions(+), 5 deletions(-) diff --git a/balloon.c b/balloon.c index f2ef50c..0fb34bf 100644 --- a/balloon.c +++ b/balloon.c @@ -36,6 +36,7 @@ static QEMUBalloonEvent *balloon_event_fn; static QEMUBalloonStatus *balloon_stat_fn; +static QEMUBalloonDropCache *balloon_drop_cache_fn; static void *balloon_opaque; static bool balloon_inhibited; @@ -65,9 +66,12 @@ static bool have_balloon(Error **errp) } int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, - QEMUBalloonStatus *stat_func, void *opaque) + QEMUBalloonStatus *stat_func, + QEMUBalloonDropCache *drop_cache_func, + void *opaque) { -if (balloon_event_fn || balloon_stat_fn || balloon_opaque) { +if (balloon_event_fn || balloon_stat_fn || balloon_drop_cache_fn +|| balloon_opaque) { /* We're already registered one balloon handler. How many can * a guest really have? 
*/ @@ -75,6 +79,7 @@ int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, } balloon_event_fn = event_func; balloon_stat_fn = stat_func; +balloon_drop_cache_fn = drop_cache_func; balloon_opaque = opaque; return 0; } @@ -86,6 +91,7 @@ void qemu_remove_balloon_handler(void *opaque) } balloon_event_fn = NULL; balloon_stat_fn = NULL; +balloon_drop_cache_fn = NULL; balloon_opaque = NULL; } diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 8cf74c2..4757ba5 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -36,6 +36,10 @@ #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) +enum balloon_req_id { + BALLOON_DROP_CACHE, +}; + static void balloon_page(void *addr, int deflate) { #if defined(__linux__) @@ -154,6 +158,12 @@ static bool balloon_page_bitmap_supported(const VirtIOBalloon *s) return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP); } +static bool balloon_misc_supported(const VirtIOBalloon *s) +{ +VirtIODevice *vdev = VIRTIO_DEVICE(s); +return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_MISC); +} + static bool balloon_stats_enabled(const VirtIOBalloon *s) { return s->stats_poll_interval > 0; @@ -420,6 +430,39 @@ out: } } +static void virtio_balloon_handle_resp(VirtIODevice *vdev, VirtQueue *vq) +{ +VirtIOBalloon *s = VIRTIO_BALLOON(vdev); +VirtQueueElement *elem; +size_t offset = 0; +uint32_t tmp32, id = 0; + +elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); +if (!elem) { +s->req_status = REQ_ERROR; +return; +} + +s->misc_vq_elem = elem; + +if (!elem->out_num) { +return; +} + +iov_to_buf(elem->out_sg, elem->out_num, offset, + , sizeof(uint32_t)); +id = virtio_ldl_p(vdev, ); +offset += sizeof(uint32_t); +switch (id) { +case BALLOON_DROP_CACHE: +s->req_status = REQ_DONE; +break; +default: +break; +} + +} + static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) { VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); @@ -490,6 +533,7 @@ static uint64_t 
virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, f |= dev->host_features; virtio_add_feature(, VIRTIO_BALLOON_F_STATS_VQ); virtio_add_feature(, VIRTIO_BALLOON_F_PAGE_BITMAP); +virtio_add_feature(, VIRTIO_BALLOON_F_MISC); return f; } @@ -500,6 +544,36 @@ static void virtio_balloon_stat(void *opaque, BalloonInfo *info) VIRTIO_BALLOON_PFN_SHIFT); } +static int virtio_balloon_drop_cache(void *opaque, unsigned long type) +{ +VirtIOBalloon *s = opaque; +VirtIODevice *vdev = VIRTIO_DEVICE(s); +VirtQueueElement *elem = s->misc_vq_elem; +int len; + +if (!balloon_misc_supported(s)) { +return REQ_UNSUPPORT; +} + +if (elem == NULL || !elem->in_num) { +elem = virtqueue_pop(s->mvq, sizeof(VirtQueueElem
[Qemu-devel] [QEMU 0/7] Fast balloon and fast live migration
This patch set is intended to speed up the inflating/deflating process of virtio-balloon and speed up live migration by skipping the processing of guest's free pages. The virtio-balloon is extended to support some new features, so as to make things faster. Liang Li (7): balloon: speed up inflating & deflating process virtio-balloon: add drop cache support Add the hmp and qmp interface for dropping cache balloon: get free page info from guest bitmap: Add a new bitmap_move function kvm: Add two new arch specific functions migration: skip free pages during live migration balloon.c | 51 +++- hmp-commands.hx | 15 ++ hmp.c | 22 ++ hmp.h | 3 + hw/virtio/virtio-balloon.c | 315 ++-- include/hw/virtio/virtio-balloon.h | 23 +- include/qemu/bitmap.h | 13 + include/standard-headers/linux/virtio_balloon.h | 2 + include/sysemu/balloon.h| 13 +- include/sysemu/kvm.h| 2 + migration/ram.c | 93 +++ monitor.c | 18 ++ qapi-schema.json| 35 +++ qmp-commands.hx | 23 ++ target-arm/kvm.c| 14 ++ target-i386/kvm.c | 35 +++ target-mips/kvm.c | 14 ++ target-ppc/kvm.c| 14 ++ target-s390x/kvm.c | 14 ++ 19 files changed, 693 insertions(+), 26 deletions(-) -- 1.9.1
[Qemu-devel] [QEMU 1/7] balloon: speed up inflating & deflating process
The implementation of the current virtio-balloon is not very efficient, Bellow is test result of time spends on inflating the balloon to 3GB of a 4GB idle guest: a. allocating pages (6.5%, 103ms) b. sending PFNs to host (68.3%, 787ms) c. address translation (6.1%, 96ms) d. madvise (19%, 300ms) It takes about 1577ms for the whole inflating process to complete. The test shows that the bottle neck is the stage b and stage d. If using a bitmap to send the page info instead of the PFNs, we can reduce the overhead spends on stage b quite a lot. Furthermore, it's possible to do the address translation and do the madvise with a bulk of pages, instead of the current page per page way, so the overhead of stage c and stage d can also be reduced a lot. This patch is the QEMU side implementation which is intended to speed up the inflating & deflating process by adding a new feature to the virtio-balloon device. And now, inflating the balloon to 3GB of a 4GB idle guest only takes 210ms, it's about 8 times as fast as before. TODO: optimize stage a by allocating/freeing a chunk of pages instead of a single page at a time. 
Signed-off-by: Liang Li <liang.z...@intel.com> --- hw/virtio/virtio-balloon.c | 159 include/standard-headers/linux/virtio_balloon.h | 1 + 2 files changed, 139 insertions(+), 21 deletions(-) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 8c15e09..8cf74c2 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -47,6 +47,76 @@ static void balloon_page(void *addr, int deflate) #endif } +static void do_balloon_bulk_pages(ram_addr_t base_pfn, int page_shift, + unsigned long len, bool deflate) +{ +ram_addr_t size, processed, chunk, base; +void *addr; +MemoryRegionSection section = {.mr = NULL}; + +size = (len << page_shift); +base = (base_pfn << page_shift); + +for (processed = 0; processed < size; processed += chunk) { +chunk = size - processed; +while (chunk >= TARGET_PAGE_SIZE) { +section = memory_region_find(get_system_memory(), + base + processed, chunk); +if (!section.mr) { +chunk = QEMU_ALIGN_DOWN(chunk / 2, TARGET_PAGE_SIZE); +} else { +break; +} +} + +if (section.mr && +(int128_nz(section.size) && memory_region_is_ram(section.mr))) { +addr = section.offset_within_region + + memory_region_get_ram_ptr(section.mr); +qemu_madvise(addr, chunk, + deflate ? 
QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED); +} else { +fprintf(stderr, "can't find the chunk, skip\n"); +chunk = TARGET_PAGE_SIZE; +} +} +} + +static void balloon_bulk_pages(ram_addr_t base_pfn, unsigned long *bitmap, + unsigned long len, int page_shift, bool deflate) +{ +#if defined(__linux__) +unsigned long end = len * 8; +unsigned long current = 0; + +if (!qemu_balloon_is_inhibited() && (!kvm_enabled() || + kvm_has_sync_mmu())) { +while (current < end) { +unsigned long one = find_next_bit(bitmap, end, current); + +if (one < end) { +unsigned long zero = find_next_zero_bit(bitmap, end, one + 1); +unsigned long page_length; + +if (zero >= end) { +page_length = end - one; +} else { +page_length = zero - one; +} + +if (page_length) { +do_balloon_bulk_pages(base_pfn + one, page_shift, + page_length, deflate); +} +current = one + page_length; +} else { +current = one; +} +} +} +#endif +} + static const char *balloon_stat_names[] = { [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in", [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out", @@ -78,6 +148,12 @@ static bool balloon_stats_supported(const VirtIOBalloon *s) return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ); } +static bool balloon_page_bitmap_supported(const VirtIOBalloon *s) +{ +VirtIODevice *vdev = VIRTIO_DEVICE(s); +return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP); +} + static bool balloon_stats_enabled(const VirtIOBalloon *s) { return s->stats_poll_interval > 0; @@ -224,27 +300,66 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) return; } -while (iov_to_buf(elem->out_sg, elem->out_num, offset, , 4) == 4) { -ram_addr_t pa; -ram_addr_t addr; -
[Qemu-devel] [PATCH RFC v2 QEMU] balloon: speed up inflating & deflating process
The implementation of the current virtio-balloon is not very efficient, Bellow is test result of time spends on inflating the balloon to 3GB of a 4GB idle guest: a. allocating pages (6.5%, 103ms) b. sending PFNs to host (68.3%, 787ms) c. address translation (6.1%, 96ms) d. madvise (19%, 300ms) It takes about 1577ms for the whole inflating process to complete. The test shows that the bottle neck is the stage b and stage d. If using a bitmap to send the page info instead of the PFNs, we can reduce the overhead spends on stage b quite a lot. Furthermore, it's possible to do the address translation and do the madvise with a bulk of pages, instead of the current page per page way, so the overhead of stage c and stage d can also be reduced a lot. This patch is the QEMU side implementation which is intended to speed up the inflating & deflating process by adding a new feature to the virtio-balloon device. And now, inflating the balloon to 3GB of a 4GB idle guest only takes 210ms, it's about 8 times as fast as before. TODO: optimize stage a by allocating/freeing a chunk of pages instead of a single page at a time. 
v2 changes: change the interface Signed-off-by: Liang Li <liang.z...@intel.com> --- hw/virtio/virtio-balloon.c | 159 include/standard-headers/linux/virtio_balloon.h | 1 + 2 files changed, 139 insertions(+), 21 deletions(-) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 8c15e09..d6f423c 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -47,6 +47,76 @@ static void balloon_page(void *addr, int deflate) #endif } +static void do_balloon_bulk_pages(ram_addr_t base_pfn, int page_shift, + unsigned long len, bool deflate) +{ +ram_addr_t size, processed, chunk, base; +void *addr; +MemoryRegionSection section = {.mr = NULL}; + +size = (len << page_shift); +base = (base_pfn << page_shift); + +for (processed = 0; processed < size; processed += chunk) { +chunk = size - processed; +while (chunk >= TARGET_PAGE_SIZE) { +section = memory_region_find(get_system_memory(), + base + processed, chunk); +if (!section.mr) { +chunk = QEMU_ALIGN_DOWN(chunk / 2, TARGET_PAGE_SIZE); +} else { +break; +} +} + +if (section.mr && +(int128_nz(section.size) && memory_region_is_ram(section.mr))) { +addr = section.offset_within_region + + memory_region_get_ram_ptr(section.mr); +qemu_madvise(addr, chunk, + deflate ? 
QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED); +} else { +fprintf(stderr, "can't find the chunk, skip\n"); +chunk = TARGET_PAGE_SIZE; +} +} +} + +static void balloon_bulk_pages(ram_addr_t base_pfn, unsigned long *bitmap, + unsigned long len, int page_shift, bool deflate) +{ +#if defined(__linux__) +unsigned long end = len * 8; +unsigned long current = 0; + +if (!qemu_balloon_is_inhibited() && (!kvm_enabled() || + kvm_has_sync_mmu())) { +while (current < end) { +unsigned long one = find_next_bit(bitmap, end, current); + +if (one < end) { +unsigned long zero = find_next_zero_bit(bitmap, end, one + 1); +unsigned long page_length; + +if (zero >= end) { +page_length = end - one; +} else { +page_length = zero - one; +} + +if (page_length) { +do_balloon_bulk_pages(base_pfn + one, page_shift, + page_length, deflate); +} +current = one + page_length; +} else { +current = one; +} +} +} +#endif +} + static const char *balloon_stat_names[] = { [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in", [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out", @@ -78,6 +148,12 @@ static bool balloon_stats_supported(const VirtIOBalloon *s) return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ); } +static bool balloon_page_bitmap_supported(const VirtIOBalloon *s) +{ +VirtIODevice *vdev = VIRTIO_DEVICE(s); +return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP); +} + static bool balloon_stats_enabled(const VirtIOBalloon *s) { return s->stats_poll_interval > 0; @@ -224,27 +300,66 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) return; } -while (iov_to_buf(elem->out_sg, elem->out_num, offset, , 4) == 4) { -ram_addr_t pa; -
[Qemu-devel] [PATCH RFC v2 kernel] balloon: speed up inflating/deflating process
The implementation of the current virtio-balloon is not very efficient, Bellow is test result of time spends on inflating the balloon to 3GB of a 4GB idle guest: a. allocating pages (6.5%, 103ms) b. sending PFNs to host (68.3%, 787ms) c. address translation (6.1%, 96ms) d. madvise (19%, 300ms) It takes about 1577ms for the whole inflating process to complete. The test shows that the bottle neck is the stage b and stage d. If using a bitmap to send the page info instead of the PFNs, we can reduce the overhead spends on stage b quite a lot. Furthermore, it's possible to do the address translation and do the madvise with a bulk of pages, instead of the current page per page way, so the overhead of stage c and stage d can also be reduced a lot. This patch is the kernel side implementation which is intended to speed up the inflating & deflating process by adding a new feature to the virtio-balloon device. And now, inflating the balloon to 3GB of a 4GB idle guest only takes 200ms, it's about 8 times as fast as before. TODO: optimize stage a by allocating/freeing a chunk of pages instead of a single page at a time. v2 change: 1. Use a small page bitmap instead of a large one. 2. Address some of comments of v1. Signed-off-by: Liang Li <liang.z...@intel.com> Suggested-by: Michael S. Tsirkin <m...@redhat.com> Cc: Michael S. 
Tsirkin <m...@redhat.com> Cc: Paolo Bonzini <pbonz...@redhat.com> Cc: Cornelia Huck <cornelia.h...@de.ibm.com> Cc: Amit Shah <amit.s...@redhat.com> --- drivers/virtio/virtio_balloon.c | 207 ++-- include/uapi/linux/virtio_balloon.h | 1 + 2 files changed, 200 insertions(+), 8 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 476c0e3..823b4e4 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -40,11 +40,19 @@ #define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256 #define OOM_VBALLOON_DEFAULT_PAGES 256 #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80 +#define VIRTIO_BALLOON_PFNS_LIMIT ((2 * (1ULL << 30)) >> PAGE_SHIFT) /* 2GB */ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES; module_param(oom_pages, int, S_IRUSR | S_IWUSR); MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); +struct balloon_bmap_hdr { + __virtio32 type; + __virtio32 page_shift; + __virtio64 start_pfn; + __virtio64 bmap_len; +}; + struct virtio_balloon { struct virtio_device *vdev; struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; @@ -62,6 +70,13 @@ struct virtio_balloon { /* Number of balloon pages we've told the Host we're not using. */ unsigned int num_pages; + /* Bitmap and length used to tell the host the pages */ + unsigned long *page_bitmap; + unsigned long bmap_len; + /* Used to record the processed pfn range */ + unsigned long min_pfn, max_pfn, start_pfn, end_pfn; + /* Used for sending page bitmap and header */ + struct scatterlist sg[2]; /* * The pages we've told the Host we're not using are enqueued * at vb_dev_info->pages list. 
@@ -111,15 +126,39 @@ static void balloon_ack(struct virtqueue *vq) wake_up(>acked); } +static inline void init_pfn_range(struct virtio_balloon *vb) +{ + vb->min_pfn = (1UL << 48); + vb->max_pfn = 0; +} + static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) { - struct scatterlist sg; unsigned int len; - sg_init_one(, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP)) { + struct balloon_bmap_hdr hdr; + unsigned long bmap_len; + + hdr.type = cpu_to_virtio32(vb->vdev, 0); + hdr.page_shift = cpu_to_virtio32(vb->vdev, PAGE_SHIFT); + hdr.start_pfn = cpu_to_virtio64(vb->vdev, vb->start_pfn); + bmap_len = min(vb->bmap_len, + (vb->end_pfn - vb->start_pfn) / BITS_PER_BYTE); + hdr.bmap_len = cpu_to_virtio64(vb-vdev, bmap_len); + sg_set_buf(>sg[0], , sizeof(hdr)); + sg_set_buf(>sg[1], vb->page_bitmap, bmap_len); + virtqueue_add_outbuf(vq, vb->sg, 2, vb, GFP_KERNEL); + } else { + struct scatterlist sg; + + sg_init_one(, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); + /* We should always be able to add one buffer to an + * empty queue. + */ + virtqueue_add_outbuf(vq, , 1, vb, GFP_KERNEL); + } - /* We should always be able to add one buffer to an empty queue. */ - virtqueue_add_outbuf(vq, , 1, vb, GFP_KERNEL); virtqueue_kick(vq); /* When host has read buffer, this completes via balloon_ack */ @@ -13
[Qemu-devel] [PATCH RFC kernel] balloon: speed up inflating/deflating process
The implementation of the current virtio-balloon is not very efficient, Bellow is test result of time spends on inflating the balloon to 3GB of a 4GB idle guest: a. allocating pages (6.5%, 103ms) b. sending PFNs to host (68.3%, 787ms) c. address translation (6.1%, 96ms) d. madvise (19%, 300ms) It takes about 1577ms for the whole inflating process to complete. The test shows that the bottle neck is the stage b and stage d. If using a bitmap to send the page info instead of the PFNs, we can reduce the overhead spends on stage b quite a lot. Furthermore, it's possible to do the address translation and do the madvise with a bulk of pages, instead of the current page per page way, so the overhead of stage c and stage d can also be reduced a lot. This patch is the kernel side implementation which is intended to speed up the inflating & deflating process by adding a new feature to the virtio-balloon device. And now, inflating the balloon to 3GB of a 4GB idle guest only takes 175ms, it's about 9 times as fast as before. TODO: optimize stage a by allocating/freeing a chunk of pages instead of a single page at a time. Signed-off-by: Liang Li <liang.z...@intel.com> --- drivers/virtio/virtio_balloon.c | 199 ++-- include/uapi/linux/virtio_balloon.h | 1 + mm/page_alloc.c | 6 ++ 3 files changed, 198 insertions(+), 8 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 7b6d74f..5330b6f 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -45,6 +45,8 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES; module_param(oom_pages, int, S_IRUSR | S_IWUSR); MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); +extern unsigned long get_max_pfn(void); + struct virtio_balloon { struct virtio_device *vdev; struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; @@ -62,6 +64,9 @@ struct virtio_balloon { /* Number of balloon pages we've told the Host we're not using. 
*/ unsigned int num_pages; + unsigned long *page_bitmap; + unsigned long start_pfn, end_pfn; + unsigned long bmap_len; /* * The pages we've told the Host we're not using are enqueued * at vb_dev_info->pages list. @@ -111,15 +116,66 @@ static void balloon_ack(struct virtqueue *vq) wake_up(>acked); } +static int balloon_page_bitmap_init(struct virtio_balloon *vb) +{ + unsigned long max_pfn, bmap_bytes; + + max_pfn = get_max_pfn(); + bmap_bytes = ALIGN(max_pfn, BITS_PER_LONG) / BITS_PER_BYTE; + if (!vb->page_bitmap) + vb->page_bitmap = kzalloc(bmap_bytes, GFP_KERNEL); + else { + if (bmap_bytes <= vb->bmap_len) + memset(vb->page_bitmap, 0, bmap_bytes); + else { + kfree(vb->page_bitmap); + vb->page_bitmap = kzalloc(bmap_bytes, GFP_KERNEL); + } + } + if (!vb->page_bitmap) { + dev_err(>vdev->dev, "%s failure: allocate page bitmap\n", +__func__); + return -ENOMEM; + } + vb->bmap_len = bmap_bytes; + vb->start_pfn = max_pfn; + vb->end_pfn = 0; + + return 0; +} + static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) { - struct scatterlist sg; unsigned int len; - sg_init_one(, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP)) { + u32 page_shift = PAGE_SHIFT; + unsigned long start_pfn, end_pfn, flags = 0, bmap_len; + struct scatterlist sg[5]; + + start_pfn = rounddown(vb->start_pfn, BITS_PER_LONG); + end_pfn = roundup(vb->end_pfn, BITS_PER_LONG); + bmap_len = (end_pfn - start_pfn) / BITS_PER_LONG * sizeof(long); + + sg_init_table(sg, 5); + sg_set_buf([0], , sizeof(flags)); + sg_set_buf([1], _pfn, sizeof(start_pfn)); + sg_set_buf([2], _shift, sizeof(page_shift)); + sg_set_buf([3], _len, sizeof(bmap_len)); + sg_set_buf([4], vb->page_bitmap + +(start_pfn / BITS_PER_LONG), bmap_len); + virtqueue_add_outbuf(vq, sg, 5, vb, GFP_KERNEL); + + } else { + struct scatterlist sg; + + sg_init_one(, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); + /* We should always be able to add one buffer to an + * empty 
queue. + */ + virtqueue_add_outbuf(vq, , 1, vb, GFP_KERNEL); + } - /* We should always be able to add one buffer to an empty queue. */ - virtqueue_add_outbuf(vq, , 1, vb, GFP_KE
[Qemu-devel] [PATCH RFC QEMU] balloon: speed up inflating & deflating process
The implementation of the current virtio-balloon is not very efficient, Bellow is test result of time spends on inflating the balloon to 3GB of a 4GB idle guest: a. allocating pages (6.5%, 103ms) b. sending PFNs to host (68.3%, 787ms) c. address translation (6.1%, 96ms) d. madvise (19%, 300ms) It takes about 1577ms for the whole inflating process to complete. The test shows that the bottle neck is the stage b and stage d. If using a bitmap to send the page info instead of the PFNs, we can reduce the overhead spends on stage b quite a lot. Furthermore, it's possible to do the address translation and do the madvise with a bulk of pages, instead of the current page per page way, so the overhead of stage c and stage d can also be reduced a lot. This patch is the QEMU side implementation which is intended to speed up the inflating & deflating process by adding a new feature to the virtio-balloon device. And now, inflating the balloon to 3GB of a 4GB idle guest only takes 175ms, it's about 9 times as fast as before. TODO: optimize stage a by allocating/freeing a chunk of pages instead of a single page at a time. 
Signed-off-by: Liang Li <liang.z...@intel.com> --- hw/virtio/virtio-balloon.c | 159 include/standard-headers/linux/virtio_balloon.h | 1 + 2 files changed, 139 insertions(+), 21 deletions(-) diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 9dbe681..ce67465 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -47,6 +47,76 @@ static void balloon_page(void *addr, int deflate) #endif } +static void do_balloon_bulk_pages(ram_addr_t base_pfn, int page_shift, + unsigned long len, bool deflate) +{ +ram_addr_t size, processed, chunk, base; +void *addr; +MemoryRegionSection section = {.mr = NULL}; + +size = (len << page_shift); +base = (base_pfn << page_shift); + +for (processed = 0; processed < size; processed += chunk) { +chunk = size - processed; +while (chunk >= TARGET_PAGE_SIZE) { +section = memory_region_find(get_system_memory(), + base + processed, chunk); +if (!section.mr) { +chunk = QEMU_ALIGN_DOWN(chunk / 2, TARGET_PAGE_SIZE); +} else { +break; +} +} + +if (section.mr && +(int128_nz(section.size) && memory_region_is_ram(section.mr))) { +addr = section.offset_within_region + + memory_region_get_ram_ptr(section.mr); +qemu_madvise(addr, chunk, + deflate ? 
QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED); +} else { +fprintf(stderr, "can't find the chunk, skip\n"); +chunk = TARGET_PAGE_SIZE; +} +} +} + +static void balloon_bulk_pages(ram_addr_t base_pfn, unsigned long *bitmap, + unsigned long len, int page_shift, bool deflate) +{ +#if defined(__linux__) +unsigned long end = len * 8; +unsigned long current = 0; + +if (!qemu_balloon_is_inhibited() && (!kvm_enabled() || + kvm_has_sync_mmu())) { +while (current < end) { +unsigned long one = find_next_bit(bitmap, end, current); + +if (one < end) { +unsigned long zero = find_next_zero_bit(bitmap, end, one + 1); +unsigned long page_length; + +if (zero >= end) { +page_length = end - one; +} else { +page_length = zero - one; +} + +if (page_length) { +do_balloon_bulk_pages(base_pfn + one, page_shift, + page_length, deflate); +} +current = one + page_length; +} else { +current = one; +} +} +} +#endif +} + static const char *balloon_stat_names[] = { [VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in", [VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out", @@ -78,6 +148,12 @@ static bool balloon_stats_supported(const VirtIOBalloon *s) return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ); } +static bool balloon_page_bitmap_supported(const VirtIOBalloon *s) +{ +VirtIODevice *vdev = VIRTIO_DEVICE(s); +return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP); +} + static bool balloon_stats_enabled(const VirtIOBalloon *s) { return s->stats_poll_interval > 0; @@ -223,27 +299,66 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) return; } -while (iov_to_buf(elem->out_sg, elem->out_num, offset, , 4) == 4) { -ram_addr_t pa; -ram_addr_t addr; -
[Qemu-devel] [PATCH v2 9/9] migration: code clean up
Use 'QemuMutex comp_done_lock' and 'QemuCond comp_done_cond' instead of 'QemuMutex *comp_done_lock' and 'QemuCond comp_done_cond'. To keep consistent with 'QemuMutex decomp_done_lock' and 'QemuCond comp_done_cond'. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 36 +++- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 51de958..5076862 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -278,8 +278,8 @@ static QemuThread *compress_threads; * one of the compression threads has finished the compression. * comp_done_lock is used to co-work with comp_done_cond. */ -static QemuMutex *comp_done_lock; -static QemuCond *comp_done_cond; +static QemuMutex comp_done_lock; +static QemuCond comp_done_cond; /* The empty QEMUFileOps will be used by file in CompressParam */ static const QEMUFileOps empty_ops = { }; @@ -308,10 +308,10 @@ static void *do_data_compress(void *opaque) do_compress_ram_page(param->file, block, offset); -qemu_mutex_lock(comp_done_lock); +qemu_mutex_lock(_done_lock); param->done = true; -qemu_cond_signal(comp_done_cond); -qemu_mutex_unlock(comp_done_lock); +qemu_cond_signal(_done_cond); +qemu_mutex_unlock(_done_lock); qemu_mutex_lock(>mutex); } else { @@ -351,16 +351,12 @@ void migrate_compress_threads_join(void) qemu_mutex_destroy(_param[i].mutex); qemu_cond_destroy(_param[i].cond); } -qemu_mutex_destroy(comp_done_lock); -qemu_cond_destroy(comp_done_cond); +qemu_mutex_destroy(_done_lock); +qemu_cond_destroy(_done_cond); g_free(compress_threads); g_free(comp_param); -g_free(comp_done_cond); -g_free(comp_done_lock); compress_threads = NULL; comp_param = NULL; -comp_done_cond = NULL; -comp_done_lock = NULL; } void migrate_compress_threads_create(void) @@ -374,10 +370,8 @@ void migrate_compress_threads_create(void) thread_count = migrate_compress_threads(); compress_threads = g_new0(QemuThread, thread_count); comp_param = g_new0(CompressParam, thread_count); -comp_done_cond = 
g_new0(QemuCond, 1); -comp_done_lock = g_new0(QemuMutex, 1); -qemu_cond_init(comp_done_cond); -qemu_mutex_init(comp_done_lock); +qemu_cond_init(_done_cond); +qemu_mutex_init(_done_lock); for (i = 0; i < thread_count; i++) { /* com_param[i].file is just used as a dummy buffer to save data, set * it's ops to empty. @@ -840,13 +834,13 @@ static void flush_compressed_data(QEMUFile *f) } thread_count = migrate_compress_threads(); -qemu_mutex_lock(comp_done_lock); +qemu_mutex_lock(_done_lock); for (idx = 0; idx < thread_count; idx++) { while (!comp_param[idx].done) { -qemu_cond_wait(comp_done_cond, comp_done_lock); +qemu_cond_wait(_done_cond, _done_lock); } } -qemu_mutex_unlock(comp_done_lock); +qemu_mutex_unlock(_done_lock); for (idx = 0; idx < thread_count; idx++) { qemu_mutex_lock(_param[idx].mutex); @@ -872,7 +866,7 @@ static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block, int idx, thread_count, bytes_xmit = -1, pages = -1; thread_count = migrate_compress_threads(); -qemu_mutex_lock(comp_done_lock); +qemu_mutex_lock(_done_lock); while (true) { for (idx = 0; idx < thread_count; idx++) { if (comp_param[idx].done) { @@ -891,10 +885,10 @@ static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block, if (pages > 0) { break; } else { -qemu_cond_wait(comp_done_cond, comp_done_lock); +qemu_cond_wait(_done_cond, _done_lock); } } -qemu_mutex_unlock(comp_done_lock); +qemu_mutex_unlock(_done_lock); return pages; } -- 1.9.1
[Qemu-devel] [PATCH v2 5/9] migration: refine ram_save_compressed_page
Use qemu_put_compression_data to do the compression directly instead of using do_compress_ram_page, avoid some data copy. very small improvement, at the same time, add code to check if the compression is successful. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 27 +-- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 8a21c2c..1a45227 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -929,24 +929,20 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, uint64_t *bytes_transferred) { int pages = -1; -uint64_t bytes_xmit; +uint64_t bytes_xmit = 0; uint8_t *p; -int ret; +int ret, blen; RAMBlock *block = pss->block; ram_addr_t offset = pss->offset; p = block->host + offset; -bytes_xmit = 0; ret = ram_control_save_page(f, block->offset, offset, TARGET_PAGE_SIZE, _xmit); if (bytes_xmit) { *bytes_transferred += bytes_xmit; pages = 1; } -if (block == last_sent_block) { -offset |= RAM_SAVE_FLAG_CONTINUE; -} if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_xmit > 0) { @@ -966,19 +962,22 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, flush_compressed_data(f); pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { -set_compress_params(_param[0], block, offset); -/* Use the qemu thread to compress the data to make sure the - * first page is sent out before other pages - */ -bytes_xmit = do_compress_ram_page(_param[0]); -if (bytes_xmit > 0) { +/* Make sure the first page is sent out before other pages */ +bytes_xmit = save_page_header(f, block, offset | + RAM_SAVE_FLAG_COMPRESS_PAGE); +blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, + migrate_compress_level()); +if (blen > 0) { +*bytes_transferred += bytes_xmit + blen; acct_info.norm_pages++; -qemu_put_qemu_file(f, comp_param[0].file); -*bytes_transferred += bytes_xmit; pages = 1; +} else { +qemu_file_set_error(f, blen); 
+error_report("compressed data failed!"); } } } else { +offset |= RAM_SAVE_FLAG_CONTINUE; pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { pages = compress_page_with_multi_thread(f, block, offset, -- 1.9.1
[Qemu-devel] [PATCH v2 4/9] qemu-file: Fix qemu_put_compression_data flaw
Current qemu_put_compression_data can only work with no writable QEMUFile, and can't work with the writable QEMUFile. But it does not provide any measure to prevent users from using it with a writable QEMUFile. We should fix this flaw to make it works with writable QEMUFile. Signed-off-by: Liang Li <liang.z...@intel.com> Suggested-by: Juan Quintela <quint...@redhat.com> --- migration/qemu-file.c | 23 +-- migration/ram.c | 18 +- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 6f4a129..b0ef1f3 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -607,8 +607,14 @@ uint64_t qemu_get_be64(QEMUFile *f) return v; } -/* compress size bytes of data start at p with specific compression +/* Compress size bytes of data start at p with specific compression * level and store the compressed data to the buffer of f. + * + * When f is not writable, return -1 if f has no space to save the + * compressed data. + * When f is wirtable and it has no space to save the compressed data, + * do fflush first, if f still has no space to save the compressed + * data, return -1. 
*/ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, @@ -617,7 +623,14 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t); if (blen < compressBound(size)) { -return 0; +if (!qemu_file_is_writable(f)) { +return -1; +} +qemu_fflush(f); +blen = IO_BUF_SIZE - sizeof(int32_t); +if (blen < compressBound(size)) { +return -1; +} } if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *), (Bytef *)p, size, level) != Z_OK) { @@ -625,7 +638,13 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, return 0; } qemu_put_be32(f, blen); +if (f->ops->writev_buffer) { +add_to_iovec(f, f->buf + f->buf_index, blen); +} f->buf_index += blen; +if (f->buf_index == IO_BUF_SIZE) { +qemu_fflush(f); +} return blen + sizeof(int32_t); } diff --git a/migration/ram.c b/migration/ram.c index 5d544c0..8a21c2c 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -821,7 +821,13 @@ static int do_compress_ram_page(CompressParam *param) RAM_SAVE_FLAG_COMPRESS_PAGE); blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE, migrate_compress_level()); -bytes_sent += blen; +if (blen < 0) { +bytes_sent = 0; +qemu_file_set_error(migrate_get_current()->to_dst_file, blen); +error_report("compressed data failed!"); +} else { +bytes_sent += blen; +} return bytes_sent; } @@ -965,10 +971,12 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, * first page is sent out before other pages */ bytes_xmit = do_compress_ram_page(_param[0]); -acct_info.norm_pages++; -qemu_put_qemu_file(f, comp_param[0].file); -*bytes_transferred += bytes_xmit; -pages = 1; +if (bytes_xmit > 0) { +acct_info.norm_pages++; +qemu_put_qemu_file(f, comp_param[0].file); +*bytes_transferred += bytes_xmit; +pages = 1; +} } } else { pages = save_zero_page(f, block, offset, p, bytes_transferred); -- 1.9.1
[Qemu-devel] [PATCH v2 6/9] migration: protect the quit flag by lock
quit_comp_thread and quit_decomp_thread are accessed by several thread, it's better to protect them with locks. We use a per thread flag to replace the global one, and the new flag is protected by a lock. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 32 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 1a45227..50eae7c 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -253,6 +253,7 @@ static struct BitmapRcu { struct CompressParam { bool start; bool done; +bool quit; QEMUFile *file; QemuMutex mutex; QemuCond cond; @@ -264,6 +265,7 @@ typedef struct CompressParam CompressParam; struct DecompressParam { bool start; bool done; +bool quit; QemuMutex mutex; QemuCond cond; void *des; @@ -284,8 +286,6 @@ static QemuCond *comp_done_cond; static const QEMUFileOps empty_ops = { }; static bool compression_switch; -static bool quit_comp_thread; -static bool quit_decomp_thread; static DecompressParam *decomp_param; static QemuThread *decompress_threads; static QemuMutex decomp_done_lock; @@ -297,18 +297,18 @@ static void *do_data_compress(void *opaque) { CompressParam *param = opaque; -while (!quit_comp_thread) { +while (!param->quit) { qemu_mutex_lock(>mutex); -/* Re-check the quit_comp_thread in case of +/* Re-check the quit flag in case of * terminate_compression_threads is called just before * qemu_mutex_lock(>mutex) and after - * while(!quit_comp_thread), re-check it here can make + * while(!param->quit), re-check it here can make * sure the compression thread terminate as expected. 
*/ -while (!param->start && !quit_comp_thread) { +while (!param->start && !param->quit) { qemu_cond_wait(>cond, >mutex); } -if (!quit_comp_thread) { +if (!param->quit) { do_compress_ram_page(param); } param->start = false; @@ -328,9 +328,9 @@ static inline void terminate_compression_threads(void) int idx, thread_count; thread_count = migrate_compress_threads(); -quit_comp_thread = true; for (idx = 0; idx < thread_count; idx++) { qemu_mutex_lock(_param[idx].mutex); +comp_param[idx].quit = true; qemu_cond_signal(_param[idx].cond); qemu_mutex_unlock(_param[idx].mutex); } @@ -370,7 +370,6 @@ void migrate_compress_threads_create(void) if (!migrate_use_compression()) { return; } -quit_comp_thread = false; compression_switch = true; thread_count = migrate_compress_threads(); compress_threads = g_new0(QemuThread, thread_count); @@ -385,6 +384,7 @@ void migrate_compress_threads_create(void) */ comp_param[i].file = qemu_fopen_ops(NULL, _ops); comp_param[i].done = true; +comp_param[i].quit = false; qemu_mutex_init(_param[i].mutex); qemu_cond_init(_param[i].cond); qemu_thread_create(compress_threads + i, "compress", @@ -863,12 +863,12 @@ static void flush_compressed_data(QEMUFile *f) for (idx = 0; idx < thread_count; idx++) { if (!comp_param[idx].done) { qemu_mutex_lock(comp_done_lock); -while (!comp_param[idx].done && !quit_comp_thread) { +while (!comp_param[idx].done && !comp_param[idx].quit) { qemu_cond_wait(comp_done_cond, comp_done_lock); } qemu_mutex_unlock(comp_done_lock); } -if (!quit_comp_thread) { +if (!comp_param[idx].quit) { len = qemu_put_qemu_file(f, comp_param[idx].file); bytes_transferred += len; } @@ -2200,12 +2200,12 @@ static void *do_data_decompress(void *opaque) DecompressParam *param = opaque; unsigned long pagesize; -while (!quit_decomp_thread) { +while (!param->quit) { qemu_mutex_lock(>mutex); -while (!param->start && !quit_decomp_thread) { +while (!param->start && !param->quit) { qemu_cond_wait(>cond, >mutex); } -if (!quit_decomp_thread) { +if 
(!param->quit) { pagesize = TARGET_PAGE_SIZE; /* uncompress() will return failed in some case, especially * when the page is dirted when doing the compression, it's @@ -2252,7 +2252,6 @@ void migrate_decompress_threads_create(void) thread_count = migrate_decompress_threads(); decompress_threads = g_new0(QemuThread, thread_count); decomp_param = g_new0(DecompressParam, thread_count); -quit_decomp_thread = false; qemu_mutex_init(_done_lock); qemu_cond_init(_done_cond); for (i = 0; i < thread_count; i++) { @@ -2260,6 +2259,7 @@
[Qemu-devel] [PATCH v2 3/9] migration: remove useless code
page_buffer is set twice in a row; remove the first, redundant assignment. Signed-off-by: Liang Li <liang.z...@intel.com> Reviewed-by: Dr. David Alan Gilbert <dgilb...@redhat.com> Reviewed-by: Juan Quintela <quint...@redhat.com> --- migration/ram.c | 1 - 1 file changed, 1 deletion(-) diff --git a/migration/ram.c b/migration/ram.c index 8a59a08..5d544c0 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2356,7 +2356,6 @@ static int ram_load_postcopy(QEMUFile *f) ret = -EINVAL; break; } -page_buffer = host; /* * Postcopy requires that we place whole host pages atomically. * To make it atomic, the data is read into a temporary page -- 1.9.1
[Qemu-devel] [PATCH v2 7/9] migration: refine the compression code
The current code for multi-thread compression is not clear, especially in the aspect of using lock. Refine the code to make it clear. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 84 +++-- 1 file changed, 40 insertions(+), 44 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 50eae7c..f44f833 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -251,7 +251,6 @@ static struct BitmapRcu { } *migration_bitmap_rcu; struct CompressParam { -bool start; bool done; bool quit; QEMUFile *file; @@ -291,34 +290,36 @@ static QemuThread *decompress_threads; static QemuMutex decomp_done_lock; static QemuCond decomp_done_cond; -static int do_compress_ram_page(CompressParam *param); +static int do_compress_ram_page(QEMUFile *f, RAMBlock *block, +ram_addr_t offset); static void *do_data_compress(void *opaque) { CompressParam *param = opaque; +RAMBlock *block; +ram_addr_t offset; +qemu_mutex_lock(>mutex); while (!param->quit) { -qemu_mutex_lock(>mutex); -/* Re-check the quit flag in case of - * terminate_compression_threads is called just before - * qemu_mutex_lock(>mutex) and after - * while(!param->quit), re-check it here can make - * sure the compression thread terminate as expected. 
- */ -while (!param->start && !param->quit) { +if (param->block) { +block = param->block; +offset = param->offset; +param->block = NULL; +qemu_mutex_unlock(>mutex); + +do_compress_ram_page(param->file, block, offset); + +qemu_mutex_lock(comp_done_lock); +param->done = true; +qemu_cond_signal(comp_done_cond); +qemu_mutex_unlock(comp_done_lock); + +qemu_mutex_lock(>mutex); +} else { qemu_cond_wait(>cond, >mutex); } -if (!param->quit) { -do_compress_ram_page(param); -} -param->start = false; -qemu_mutex_unlock(>mutex); - -qemu_mutex_lock(comp_done_lock); -param->done = true; -qemu_cond_signal(comp_done_cond); -qemu_mutex_unlock(comp_done_lock); } +qemu_mutex_unlock(>mutex); return NULL; } @@ -808,18 +809,15 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus *pss, return pages; } -static int do_compress_ram_page(CompressParam *param) +static int do_compress_ram_page(QEMUFile *f, RAMBlock *block, +ram_addr_t offset) { int bytes_sent, blen; -uint8_t *p; -RAMBlock *block = param->block; -ram_addr_t offset = param->offset; +uint8_t *p = block->host + (offset & TARGET_PAGE_MASK); -p = block->host + (offset & TARGET_PAGE_MASK); - -bytes_sent = save_page_header(param->file, block, offset | +bytes_sent = save_page_header(f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE); -blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE, +blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, migrate_compress_level()); if (blen < 0) { bytes_sent = 0; @@ -832,15 +830,6 @@ static int do_compress_ram_page(CompressParam *param) return bytes_sent; } -static inline void start_compression(CompressParam *param) -{ -param->done = false; -qemu_mutex_lock(>mutex); -param->start = true; -qemu_cond_signal(>cond); -qemu_mutex_unlock(>mutex); -} - static inline void start_decompression(DecompressParam *param) { param->done = false; @@ -860,18 +849,22 @@ static void flush_compressed_data(QEMUFile *f) return; } thread_count = migrate_compress_threads(); + 
+qemu_mutex_lock(comp_done_lock); for (idx = 0; idx < thread_count; idx++) { -if (!comp_param[idx].done) { -qemu_mutex_lock(comp_done_lock); -while (!comp_param[idx].done && !comp_param[idx].quit) { -qemu_cond_wait(comp_done_cond, comp_done_lock); -} -qemu_mutex_unlock(comp_done_lock); +while (!comp_param[idx].done) { +qemu_cond_wait(comp_done_cond, comp_done_lock); } +} +qemu_mutex_unlock(comp_done_lock); + +for (idx = 0; idx < thread_count; idx++) { +qemu_mutex_lock(_param[idx].mutex); if (!comp_param[idx].quit) { len = qemu_put_qemu_file(f, comp_param[idx].file); bytes_transferred += len; } +qemu_mutex_unlock(_param[idx].mutex); } } @@ -893,9 +886,12 @@ static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block, while (true) { for (idx = 0; idx
[Qemu-devel] [PATCH v2 8/9] migration: refine the decompression code
The current code for multi-thread decompression is not clear, especially in the aspect of using lock. Refine the code to make it clear. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 50 +- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index f44f833..51de958 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -262,7 +262,6 @@ struct CompressParam { typedef struct CompressParam CompressParam; struct DecompressParam { -bool start; bool done; bool quit; QemuMutex mutex; @@ -830,15 +829,6 @@ static int do_compress_ram_page(QEMUFile *f, RAMBlock *block, return bytes_sent; } -static inline void start_decompression(DecompressParam *param) -{ -param->done = false; -qemu_mutex_lock(>mutex); -param->start = true; -qemu_cond_signal(>cond); -qemu_mutex_unlock(>mutex); -} - static uint64_t bytes_transferred; static void flush_compressed_data(QEMUFile *f) @@ -2195,30 +2185,37 @@ static void *do_data_decompress(void *opaque) { DecompressParam *param = opaque; unsigned long pagesize; +uint8_t *des; +int len; +qemu_mutex_lock(>mutex); while (!param->quit) { -qemu_mutex_lock(>mutex); -while (!param->start && !param->quit) { -qemu_cond_wait(>cond, >mutex); -} -if (!param->quit) { +if (param->des) { +des = param->des; +len = param->len; +param->des = 0; +qemu_mutex_unlock(>mutex); + pagesize = TARGET_PAGE_SIZE; /* uncompress() will return failed in some case, especially * when the page is dirted when doing the compression, it's * not a problem because the dirty page will be retransferred * and uncompress() won't break the data in other pages. 
*/ -uncompress((Bytef *)param->des, , - (const Bytef *)param->compbuf, param->len); -} -param->start = false; -qemu_mutex_unlock(>mutex); +uncompress((Bytef *)des, , + (const Bytef *)param->compbuf, len); -qemu_mutex_lock(_done_lock); -param->done = true; -qemu_cond_signal(_done_cond); -qemu_mutex_unlock(_done_lock); +qemu_mutex_lock(_done_lock); +param->done = true; +qemu_cond_signal(_done_cond); +qemu_mutex_unlock(_done_lock); + +qemu_mutex_lock(>mutex); +} else { +qemu_cond_wait(>cond, >mutex); +} } +qemu_mutex_unlock(>mutex); return NULL; } @@ -2295,10 +2292,13 @@ static void decompress_data_with_multi_threads(QEMUFile *f, while (true) { for (idx = 0; idx < thread_count; idx++) { if (decomp_param[idx].done) { +decomp_param[idx].done = false; +qemu_mutex_lock(_param[idx].mutex); qemu_get_buffer(f, decomp_param[idx].compbuf, len); decomp_param[idx].des = host; decomp_param[idx].len = len; -start_decompression(_param[idx]); +qemu_cond_signal(_param[idx].cond); +qemu_mutex_unlock(_param[idx].mutex); break; } } -- 1.9.1
[Qemu-devel] [PATCH v2 2/9] migration: Fix a potential issue
At the end of live migration and before vm_start() on the destination side, we should make sure all the decompression tasks are finished, if this can not be guaranteed, the VM may get the incorrect memory data, or the updated memory may be overwritten by the decompression thread. Add the code to fix this potential issue. Suggested-by: David Alan Gilbert <dgilb...@redhat.com> Suggested-by: Juan Quintela <quint...@redhat.com> Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 19 +++ 1 file changed, 19 insertions(+) diff --git a/migration/ram.c b/migration/ram.c index 7ab6ab5..8a59a08 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2220,6 +2220,24 @@ static void *do_data_decompress(void *opaque) return NULL; } +static void wait_for_decompress_done(void) +{ +int idx, thread_count; + +if (!migrate_use_compression()) { +return; +} + +thread_count = migrate_decompress_threads(); +qemu_mutex_lock(_done_lock); +for (idx = 0; idx < thread_count; idx++) { +while (!decomp_param[idx].done) { +qemu_cond_wait(_done_cond, _done_lock); +} +} +qemu_mutex_unlock(_done_lock); +} + void migrate_decompress_threads_create(void) { int i, thread_count; @@ -2554,6 +2572,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) } } +wait_for_decompress_done(); rcu_read_unlock(); DPRINTF("Completed load of VM with exit code %d seq iteration " "%" PRIu64 "\n", ret, seq_iter); -- 1.9.1
[Qemu-devel] [PATCH v2 1/9] migration: Fix multi-thread compression bug
Recently, a bug related to multiple thread compression feature for live migration is reported. The destination side will be blocked during live migration if there are heavy workload in host and memory intensive workload in guest, this is most likely to happen when there is one decompression thread. Some parts of the decompression code are incorrect: 1. The main thread receives data from source side will enter a busy loop to wait for a free decompression thread. 2. A lock is needed to protect the decomp_param[idx]->start, because it is checked in the main thread and is updated in the decompression thread. Fix these two issues by following the code pattern for compression. Signed-off-by: Liang Li <liang.z...@intel.com> Reported-by: Daniel P. Berrange <berra...@redhat.com> Reviewed-by: Daniel P. Berrange <berra...@redhat.com> Reviewed-by: Dr. David Alan Gilbert <dgilb...@redhat.com> Reviewed-by: Juan Quintela <quint...@redhat.com> Tested-by: Daniel P. Berrange <berra...@redhat.com> Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 38 +++--- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 3f05738..7ab6ab5 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -263,6 +263,7 @@ typedef struct CompressParam CompressParam; struct DecompressParam { bool start; +bool done; QemuMutex mutex; QemuCond cond; void *des; @@ -287,6 +288,8 @@ static bool quit_comp_thread; static bool quit_decomp_thread; static DecompressParam *decomp_param; static QemuThread *decompress_threads; +static QemuMutex decomp_done_lock; +static QemuCond decomp_done_cond; static int do_compress_ram_page(CompressParam *param); @@ -834,6 +837,7 @@ static inline void start_compression(CompressParam *param) static inline void start_decompression(DecompressParam *param) { +param->done = false; qemu_mutex_lock(>mutex); param->start = true; qemu_cond_signal(>cond); @@ -2193,19 +2197,24 @@ static void *do_data_decompress(void *opaque) 
qemu_mutex_lock(>mutex); while (!param->start && !quit_decomp_thread) { qemu_cond_wait(>cond, >mutex); +} +if (!quit_decomp_thread) { pagesize = TARGET_PAGE_SIZE; -if (!quit_decomp_thread) { -/* uncompress() will return failed in some case, especially - * when the page is dirted when doing the compression, it's - * not a problem because the dirty page will be retransferred - * and uncompress() won't break the data in other pages. - */ -uncompress((Bytef *)param->des, , - (const Bytef *)param->compbuf, param->len); -} -param->start = false; +/* uncompress() will return failed in some case, especially + * when the page is dirted when doing the compression, it's + * not a problem because the dirty page will be retransferred + * and uncompress() won't break the data in other pages. + */ +uncompress((Bytef *)param->des, , + (const Bytef *)param->compbuf, param->len); } +param->start = false; qemu_mutex_unlock(>mutex); + +qemu_mutex_lock(_done_lock); +param->done = true; +qemu_cond_signal(_done_cond); +qemu_mutex_unlock(_done_lock); } return NULL; @@ -2219,10 +2228,13 @@ void migrate_decompress_threads_create(void) decompress_threads = g_new0(QemuThread, thread_count); decomp_param = g_new0(DecompressParam, thread_count); quit_decomp_thread = false; +qemu_mutex_init(_done_lock); +qemu_cond_init(_done_cond); for (i = 0; i < thread_count; i++) { qemu_mutex_init(_param[i].mutex); qemu_cond_init(_param[i].cond); decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); +decomp_param[i].done = true; qemu_thread_create(decompress_threads + i, "decompress", do_data_decompress, decomp_param + i, QEMU_THREAD_JOINABLE); @@ -2258,9 +2270,10 @@ static void decompress_data_with_multi_threads(QEMUFile *f, int idx, thread_count; thread_count = migrate_decompress_threads(); +qemu_mutex_lock(_done_lock); while (true) { for (idx = 0; idx < thread_count; idx++) { -if (!decomp_param[idx].start) { +if (decomp_param[idx].done) { qemu_get_buffer(f, decomp_param[idx].compbuf, len); 
decomp_param[idx].des = host; decomp_param[idx].len = len; @@ -2270,8 +2283,11 @@ static void decompress_data_with_multi_threads(QEMUFile *f, }
[Qemu-devel] [PATCH v2 0/9] live migration bug fix and refine
This patch set fixes a bug that can block live migration, and another potential issue when using multi-thread (de)compression. The last patches refine the code and make the use of locks clearer. Some of the code snippets are from Juan's multiple-fd patches, with very small changes. Thanks for Juan's work. Liang Li (9): migration: Fix multi-thread compression bug migration: Fix a potential issue migration: remove useless code qemu-file: Fix qemu_put_compression_data flaw migration: refine ram_save_compressed_page migration: protect the quit flag by lock migration: refine the compression code migration: refine the decompression code migration: code clean up migration/qemu-file.c | 23 - migration/ram.c | 251 -- 2 files changed, 162 insertions(+), 112 deletions(-) -- 1.9.1
[Qemu-devel] [PATCH 2/5] migration: Fix a potential issue
At the end of live migration and before vm_start() on the destination side, we should make sure all the decompression tasks are finished, if this can not be guaranteed, the VM may get the incorrect memory data, or the updated memory may be overwritten by the decompression thread. Add the code to fix this potential issue. Suggested-by: David Alan Gilbert <dgilb...@redhat.com> Signed-off-by: Liang Li <liang.z...@intel.com> --- include/migration/migration.h | 1 + migration/migration.c | 2 +- migration/ram.c | 20 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/include/migration/migration.h b/include/migration/migration.h index ac2c12c..1c9051e 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -223,6 +223,7 @@ void migrate_compress_threads_create(void); void migrate_compress_threads_join(void); void migrate_decompress_threads_create(void); void migrate_decompress_threads_join(void); +void wait_for_decompress_done(void); uint64_t ram_bytes_remaining(void); uint64_t ram_bytes_transferred(void); uint64_t ram_bytes_total(void); diff --git a/migration/migration.c b/migration/migration.c index 991313a..5228c28 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -347,7 +347,7 @@ static void process_incoming_migration_bh(void *opaque) /* If global state section was not received or we are in running state, we need to obey autostart. Any other state is set with runstate_set. 
*/ - +wait_for_decompress_done(); if (!global_state_received() || global_state_get_runstate() == RUN_STATE_RUNNING) { if (autostart) { diff --git a/migration/ram.c b/migration/ram.c index 7ab6ab5..4459b38 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2220,6 +2220,26 @@ static void *do_data_decompress(void *opaque) return NULL; } +void wait_for_decompress_done(void) +{ +int idx, thread_count; + +if (!migrate_use_compression()) { +return; +} +thread_count = migrate_decompress_threads(); +for (idx = 0; idx < thread_count; idx++) { +if (!decomp_param[idx].done) { +qemu_mutex_lock(_done_lock); +while (!decomp_param[idx].done) { +qemu_cond_wait(_done_cond, _done_lock); +} +qemu_mutex_unlock(_done_lock); +} +} + +} + void migrate_decompress_threads_create(void) { int i, thread_count; -- 1.9.1
[Qemu-devel] [PATCH 5/5] migration: refine ram_save_compressed_page
Use qemu_put_compression_data to do the compression directly instead of using do_compress_ram_page, avoid some data copy. very small improvement, at the same time, add code to check if the compression is successful. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 28 +--- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 7e62d8d..ec2c0bf 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -927,24 +927,20 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, uint64_t *bytes_transferred) { int pages = -1; -uint64_t bytes_xmit; +uint64_t bytes_xmit = 0; uint8_t *p; -int ret; +int ret, blen; RAMBlock *block = pss->block; ram_addr_t offset = pss->offset; p = block->host + offset; -bytes_xmit = 0; ret = ram_control_save_page(f, block->offset, offset, TARGET_PAGE_SIZE, _xmit); if (bytes_xmit) { *bytes_transferred += bytes_xmit; pages = 1; } -if (block == last_sent_block) { -offset |= RAM_SAVE_FLAG_CONTINUE; -} if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_xmit > 0) { @@ -964,17 +960,19 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, flush_compressed_data(f); pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { -set_compress_params(_param[0], block, offset); -/* Use the qemu thread to compress the data to make sure the - * first page is sent out before other pages - */ -bytes_xmit = do_compress_ram_page(_param[0]); -acct_info.norm_pages++; -qemu_put_qemu_file(f, comp_param[0].file); -*bytes_transferred += bytes_xmit; -pages = 1; +/* Make sure the first page is sent out before other pages */ +bytes_xmit = save_page_header(f, block, offset | + RAM_SAVE_FLAG_COMPRESS_PAGE); +blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, + migrate_compress_level()); +if (blen > 0) { +*bytes_transferred += bytes_xmit + blen; +acct_info.norm_pages++; +pages = 1; +} } } else { +offset |= 
RAM_SAVE_FLAG_CONTINUE; pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { pages = compress_page_with_multi_thread(f, block, offset, -- 1.9.1
[Qemu-devel] [PATCH 3/5] migration: remove useless code
page_buffer is set twice in a row; remove the first, redundant assignment. Signed-off-by: Liang Li <liang.z...@intel.com> Reviewed-by: Dr. David Alan Gilbert <dgilb...@redhat.com> --- migration/ram.c | 1 - 1 file changed, 1 deletion(-) diff --git a/migration/ram.c b/migration/ram.c index 4459b38..bc34bc5 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2358,7 +2358,6 @@ static int ram_load_postcopy(QEMUFile *f) ret = -EINVAL; break; } -page_buffer = host; /* * Postcopy requires that we place whole host pages atomically. * To make it atomic, the data is read into a temporary page -- 1.9.1
[Qemu-devel] [PATCH 4/5] qemu-file: Fix qemu_put_compression_data flaw
Current qemu_put_compression_data can only work with no writable QEMUFile, and can't work with the writable QEMUFile. But it does not provide any measure to prevent users from using it with a writable QEMUFile. We should fix this flaw to make it works with writable QEMUFile. Suggested-by: Juan Quintela <quint...@redhat.com> Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/qemu-file.c | 23 +-- migration/ram.c | 6 +- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 6f4a129..b0ef1f3 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -607,8 +607,14 @@ uint64_t qemu_get_be64(QEMUFile *f) return v; } -/* compress size bytes of data start at p with specific compression +/* Compress size bytes of data start at p with specific compression * level and store the compressed data to the buffer of f. + * + * When f is not writable, return -1 if f has no space to save the + * compressed data. + * When f is wirtable and it has no space to save the compressed data, + * do fflush first, if f still has no space to save the compressed + * data, return -1. 
*/ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, @@ -617,7 +623,14 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t); if (blen < compressBound(size)) { -return 0; +if (!qemu_file_is_writable(f)) { +return -1; +} +qemu_fflush(f); +blen = IO_BUF_SIZE - sizeof(int32_t); +if (blen < compressBound(size)) { +return -1; +} } if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *), (Bytef *)p, size, level) != Z_OK) { @@ -625,7 +638,13 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, return 0; } qemu_put_be32(f, blen); +if (f->ops->writev_buffer) { +add_to_iovec(f, f->buf + f->buf_index, blen); +} f->buf_index += blen; +if (f->buf_index == IO_BUF_SIZE) { +qemu_fflush(f); +} return blen + sizeof(int32_t); } diff --git a/migration/ram.c b/migration/ram.c index bc34bc5..7e62d8d 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -821,7 +821,11 @@ static int do_compress_ram_page(CompressParam *param) RAM_SAVE_FLAG_COMPRESS_PAGE); blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE, migrate_compress_level()); -bytes_sent += blen; +if (blen < 0) { +error_report("Insufficient buffer for compressed data!"); +} else { +bytes_sent += blen; +} return bytes_sent; } -- 1.9.1
[Qemu-devel] [PATCH 1/5] migration: Fix multi-thread compression bug
Recently, a bug related to multiple thread compression feature for live migration is reported. The destination side will be blocked during live migration if there are heavy workload in host and memory intensive workload in guest, this is most likely to happen when there is one decompression thread. Some parts of the decompression code are incorrect: 1. The main thread receives data from source side will enter a busy loop to wait for a free decompression thread. 2. A lock is needed to protect the decomp_param[idx]->start, because it is checked in the main thread and is updated in the decompression thread. Fix these two issues by following the code pattern for compression. Signed-off-by: Liang Li <liang.z...@intel.com> Reported-by: Daniel P. Berrange <berra...@redhat.com> Reviewed-by: Daniel P. Berrange <berra...@redhat.com> Reviewed-by: Dr. David Alan Gilbert <dgilb...@redhat.com> Tested-by: Daniel P. Berrange <berra...@redhat.com> --- migration/ram.c | 38 +++--- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 3f05738..7ab6ab5 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -263,6 +263,7 @@ typedef struct CompressParam CompressParam; struct DecompressParam { bool start; +bool done; QemuMutex mutex; QemuCond cond; void *des; @@ -287,6 +288,8 @@ static bool quit_comp_thread; static bool quit_decomp_thread; static DecompressParam *decomp_param; static QemuThread *decompress_threads; +static QemuMutex decomp_done_lock; +static QemuCond decomp_done_cond; static int do_compress_ram_page(CompressParam *param); @@ -834,6 +837,7 @@ static inline void start_compression(CompressParam *param) static inline void start_decompression(DecompressParam *param) { +param->done = false; qemu_mutex_lock(>mutex); param->start = true; qemu_cond_signal(>cond); @@ -2193,19 +2197,24 @@ static void *do_data_decompress(void *opaque) qemu_mutex_lock(>mutex); while (!param->start && !quit_decomp_thread) { qemu_cond_wait(>cond, >mutex); 
+} +if (!quit_decomp_thread) { pagesize = TARGET_PAGE_SIZE; -if (!quit_decomp_thread) { -/* uncompress() will return failed in some case, especially - * when the page is dirted when doing the compression, it's - * not a problem because the dirty page will be retransferred - * and uncompress() won't break the data in other pages. - */ -uncompress((Bytef *)param->des, , - (const Bytef *)param->compbuf, param->len); -} -param->start = false; +/* uncompress() will return failed in some case, especially + * when the page is dirted when doing the compression, it's + * not a problem because the dirty page will be retransferred + * and uncompress() won't break the data in other pages. + */ +uncompress((Bytef *)param->des, , + (const Bytef *)param->compbuf, param->len); } +param->start = false; qemu_mutex_unlock(>mutex); + +qemu_mutex_lock(_done_lock); +param->done = true; +qemu_cond_signal(_done_cond); +qemu_mutex_unlock(_done_lock); } return NULL; @@ -2219,10 +2228,13 @@ void migrate_decompress_threads_create(void) decompress_threads = g_new0(QemuThread, thread_count); decomp_param = g_new0(DecompressParam, thread_count); quit_decomp_thread = false; +qemu_mutex_init(_done_lock); +qemu_cond_init(_done_cond); for (i = 0; i < thread_count; i++) { qemu_mutex_init(_param[i].mutex); qemu_cond_init(_param[i].cond); decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); +decomp_param[i].done = true; qemu_thread_create(decompress_threads + i, "decompress", do_data_decompress, decomp_param + i, QEMU_THREAD_JOINABLE); @@ -2258,9 +2270,10 @@ static void decompress_data_with_multi_threads(QEMUFile *f, int idx, thread_count; thread_count = migrate_decompress_threads(); +qemu_mutex_lock(_done_lock); while (true) { for (idx = 0; idx < thread_count; idx++) { -if (!decomp_param[idx].start) { +if (decomp_param[idx].done) { qemu_get_buffer(f, decomp_param[idx].compbuf, len); decomp_param[idx].des = host; decomp_param[idx].len = len; @@ -2270,8 +2283,11 @@ static void 
decompress_data_with_multi_threads(QEMUFile *f, } if (idx < thread_count) { break; +} else { +qemu_cond_wait(_done_cond, _done_lock); } } +qemu_mutex_unlock(_done_lock); } /* -- 1.9.1
[Qemu-devel] [PATCH 0/5] live migration bug fix and refine
This patch set fixes a bug which can block live migration, and another potential issue when using multi-thread (de)compression. The last three patches were submitted before; they are grouped here together. Liang Li (5): migration: Fix multi-thread compression bug migration: Fix a potential issue migration: remove useless code qemu-file: Fix qemu_put_compression_data flaw migration: refine ram_save_compressed_page include/migration/migration.h | 1 + migration/migration.c | 2 +- migration/qemu-file.c | 23 ++- migration/ram.c | 93 ++- 4 files changed, 88 insertions(+), 31 deletions(-) -- 1.9.1
[Qemu-devel] [PATCH] migration: Fix multi-thread compression bug
Recently, a bug related to multiple thread compression feature for live migration is reported. The destination side will be blocked during live migration if there are heavy workload in host and memory intensive workload in guest, this is most likely to happen when there is one decompression thread. Some parts of the decompression code are incorrect: 1. The main thread receives data from source side will enter a busy loop to wait for a free decompression thread. 2. A lock is needed to protect the decomp_param[idx]->start, because it is checked in the main thread and is updated in the decompression thread. Fix these two issues by following the code pattern for compression. Reported-by: Daniel P. Berrange <berra...@redhat.com> Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 38 +++--- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 3f05738..7ab6ab5 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -263,6 +263,7 @@ typedef struct CompressParam CompressParam; struct DecompressParam { bool start; +bool done; QemuMutex mutex; QemuCond cond; void *des; @@ -287,6 +288,8 @@ static bool quit_comp_thread; static bool quit_decomp_thread; static DecompressParam *decomp_param; static QemuThread *decompress_threads; +static QemuMutex decomp_done_lock; +static QemuCond decomp_done_cond; static int do_compress_ram_page(CompressParam *param); @@ -834,6 +837,7 @@ static inline void start_compression(CompressParam *param) static inline void start_decompression(DecompressParam *param) { +param->done = false; qemu_mutex_lock(>mutex); param->start = true; qemu_cond_signal(>cond); @@ -2193,19 +2197,24 @@ static void *do_data_decompress(void *opaque) qemu_mutex_lock(>mutex); while (!param->start && !quit_decomp_thread) { qemu_cond_wait(>cond, >mutex); +} +if (!quit_decomp_thread) { pagesize = TARGET_PAGE_SIZE; -if (!quit_decomp_thread) { -/* uncompress() will return failed in some case, especially - * when the 
page is dirted when doing the compression, it's - * not a problem because the dirty page will be retransferred - * and uncompress() won't break the data in other pages. - */ -uncompress((Bytef *)param->des, , - (const Bytef *)param->compbuf, param->len); -} -param->start = false; +/* uncompress() will return failed in some case, especially + * when the page is dirted when doing the compression, it's + * not a problem because the dirty page will be retransferred + * and uncompress() won't break the data in other pages. + */ +uncompress((Bytef *)param->des, , + (const Bytef *)param->compbuf, param->len); } +param->start = false; qemu_mutex_unlock(>mutex); + +qemu_mutex_lock(_done_lock); +param->done = true; +qemu_cond_signal(_done_cond); +qemu_mutex_unlock(_done_lock); } return NULL; @@ -2219,10 +2228,13 @@ void migrate_decompress_threads_create(void) decompress_threads = g_new0(QemuThread, thread_count); decomp_param = g_new0(DecompressParam, thread_count); quit_decomp_thread = false; +qemu_mutex_init(_done_lock); +qemu_cond_init(_done_cond); for (i = 0; i < thread_count; i++) { qemu_mutex_init(_param[i].mutex); qemu_cond_init(_param[i].cond); decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); +decomp_param[i].done = true; qemu_thread_create(decompress_threads + i, "decompress", do_data_decompress, decomp_param + i, QEMU_THREAD_JOINABLE); @@ -2258,9 +2270,10 @@ static void decompress_data_with_multi_threads(QEMUFile *f, int idx, thread_count; thread_count = migrate_decompress_threads(); +qemu_mutex_lock(_done_lock); while (true) { for (idx = 0; idx < thread_count; idx++) { -if (!decomp_param[idx].start) { +if (decomp_param[idx].done) { qemu_get_buffer(f, decomp_param[idx].compbuf, len); decomp_param[idx].des = host; decomp_param[idx].len = len; @@ -2270,8 +2283,11 @@ static void decompress_data_with_multi_threads(QEMUFile *f, } if (idx < thread_count) { break; +} else { +qemu_cond_wait(_done_cond, _done_lock); } } +qemu_mutex_unlock(_done_lock); } 
/* -- 1.9.1
[Qemu-devel] [PATCH kernel 1/2] mm: add the related functions to build the free page bitmap
The free page bitmap will be sent to QEMU through virtio interface and used for live migration optimization. Drop the cache before building the free page bitmap can get more free pages. Whether dropping the cache is decided by user. Signed-off-by: Liang Li <liang.z...@intel.com> --- fs/drop_caches.c | 22 ++ include/linux/fs.h | 1 + mm/page_alloc.c| 46 ++ 3 files changed, 61 insertions(+), 8 deletions(-) diff --git a/fs/drop_caches.c b/fs/drop_caches.c index d72d52b..f488086 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -50,14 +50,8 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, if (write) { static int stfu; - if (sysctl_drop_caches & 1) { - iterate_supers(drop_pagecache_sb, NULL); - count_vm_event(DROP_PAGECACHE); - } - if (sysctl_drop_caches & 2) { - drop_slab(); - count_vm_event(DROP_SLAB); - } + drop_cache(sysctl_drop_caches); + if (!stfu) { pr_info("%s (%d): drop_caches: %d\n", current->comm, task_pid_nr(current), @@ -67,3 +61,15 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, } return 0; } + +void drop_cache(int drop_ctl) +{ + if (drop_ctl & 1) { + iterate_supers(drop_pagecache_sb, NULL); + count_vm_event(DROP_PAGECACHE); + } + if (drop_ctl & 2) { + drop_slab(); + count_vm_event(DROP_SLAB); + } +} diff --git a/include/linux/fs.h b/include/linux/fs.h index 70e61b5..b8a0bc0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2864,6 +2864,7 @@ extern void drop_super(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); +extern void drop_cache(int drop_ctl); extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 59de90d..4799983 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -63,6 +63,7 @@ #include #include #include +#include 
#include #include @@ -4029,6 +4030,51 @@ void show_free_areas(unsigned int filter) show_swap_cache_info(); } +static void mark_free_pages_bitmap(struct zone *zone, + unsigned long *bitmap, unsigned long len) +{ + unsigned long pfn, flags, i, limit; + unsigned int order, t; + struct list_head *curr; + + if (zone_is_empty(zone)) + return; + + spin_lock_irqsave(>lock, flags); + + limit = min(len, max_pfn); + for_each_migratetype_order(order, t) { + list_for_each(curr, >free_area[order].free_list[t]) { + pfn = page_to_pfn(list_entry(curr, struct page, lru)); + for (i = 0; i < (1UL << order); i++) { + if ((pfn + i) < limit) + set_bit_le(pfn + i, bitmap); + else + break; + } + } + } + + spin_unlock_irqrestore(>lock, flags); +} + +unsigned long get_max_pfn(void) +{ + return max_pfn; +} +EXPORT_SYMBOL(get_max_pfn); + +void get_free_pages(unsigned long *bitmap, unsigned long len, int drop) +{ + struct zone *zone; + + drop_cache(drop); + + for_each_populated_zone(zone) + mark_free_pages_bitmap(zone, bitmap, len); +} +EXPORT_SYMBOL(get_free_pages); + static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) { zoneref->zone = zone; -- 1.8.3.1
[Qemu-devel] [PATCH kernel 2/2] virtio-balloon: extend balloon driver to support the new feature
Extend the virtio balloon to support the new feature VIRTIO_BALLOON_F_GET_FREE_PAGES, so that we can use it to send the free page bitmap from guest to QEMU, the free page bitmap will be used for live migration optimization. Signed-off-by: Liang Li <liang.z...@intel.com> --- drivers/virtio/virtio_balloon.c | 100 ++-- include/uapi/linux/virtio_balloon.h | 1 + 2 files changed, 96 insertions(+), 5 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 7b6d74f..cf17694 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -45,9 +45,17 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES; module_param(oom_pages, int, S_IRUSR | S_IWUSR); MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); +extern void get_free_pages(unsigned long *free_page_bitmap, + unsigned long len, int drop); +extern unsigned long get_max_pfn(void); + +struct cache_drop_ctrl { + u64 ctrl; +}; + struct virtio_balloon { struct virtio_device *vdev; - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_pages_vq; /* The balloon servicing is delegated to a freezable workqueue. 
*/ struct work_struct update_balloon_stats_work; @@ -77,6 +85,10 @@ struct virtio_balloon { unsigned int num_pfns; u32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX]; + unsigned long *free_pages; + unsigned long bmap_len; + struct cache_drop_ctrl cache_drop; + /* Memory statistics */ struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; @@ -256,6 +268,64 @@ static void update_balloon_stats(struct virtio_balloon *vb) pages_to_bytes(available)); } +static void update_free_pages_stats(struct virtio_balloon *vb) +{ + unsigned long bitmap_bytes, max_pfn; + + max_pfn = get_max_pfn(); + bitmap_bytes = ALIGN(max_pfn, BITS_PER_LONG) / 8; + + if (!vb->free_pages) + vb->free_pages = kzalloc(bitmap_bytes, GFP_KERNEL); + else { + if (bitmap_bytes < vb->bmap_len) + memset(vb->free_pages, 0, bitmap_bytes); + else { + kfree(vb->free_pages); + vb->free_pages = kzalloc(bitmap_bytes, GFP_KERNEL); + } + } + if (!vb->free_pages) { + vb->bmap_len = 0; + return; + } + + vb->bmap_len = bitmap_bytes; + get_free_pages(vb->free_pages, max_pfn, vb->cache_drop.ctrl); +} + +static void free_pages_handle_rq(struct virtio_balloon *vb) +{ + struct virtqueue *vq; + struct scatterlist sg[2]; + unsigned int len; + struct cache_drop_ctl *ptr_cache_drop; + struct scatterlist sg_in; + + vq = vb->free_pages_vq; + ptr_cache_drop = virtqueue_get_buf(vq, ); + + if (!ptr_cache_drop || len != sizeof(vb->cache_drop)) + return; + update_free_pages_stats(vb); + sg_init_table(sg, 2); + sg_set_buf([0], &(vb->bmap_len), sizeof(vb->bmap_len)); + sg_set_buf([1], vb->free_pages, vb->bmap_len); + + sg_init_one(_in, >cache_drop, sizeof(vb->cache_drop)); + + virtqueue_add_outbuf(vq, [0], 2, vb, GFP_KERNEL); + virtqueue_add_inbuf(vq, _in, 1, >cache_drop, GFP_KERNEL); + virtqueue_kick(vq); +} + +static void free_pages_rq(struct virtqueue *vq) +{ + struct virtio_balloon *vb = vq->vdev->priv; + + free_pages_handle_rq(vb); +} + /* * While most virtqueues communicate guest-initiated requests to the hypervisor, * the stats queue operates 
in reverse. The driver initializes the virtqueue @@ -392,16 +462,22 @@ static void update_balloon_size_func(struct work_struct *work) static int init_vqs(struct virtio_balloon *vb) { - struct virtqueue *vqs[3]; - vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request }; - static const char * const names[] = { "inflate", "deflate", "stats" }; + struct virtqueue *vqs[4]; + vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, +stats_request, free_pages_rq }; + const char *names[] = { "inflate", "deflate", "stats", "free_pages" }; int err, nvqs; /* * We expect two virtqueues: inflate and deflate, and * optionally stat. */ - nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2; + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_GET_FREE_PAGES)) + nvqs = 4; + else + nvqs = virtio_has_feature(vb->vdev, + VIRTIO_BALLOON_F_STATS_VQ)
[Qemu-devel] [PATCH kernel 0/2] speed up live migration by skipping free pages
Current QEMU live migration implementation mark all guest's RAM pages as dirtied in the ram bulk stage, all these pages will be processed and it consumes quite a lot of CPU cycles and network bandwidth. >From guest's point of view, it doesn't care about the content in free page. We can make use of this fact and skip processing the free pages, this can save a lot CPU cycles and reduce the network traffic significantly while speed up the live migration process obviously. This patch set is the kernel side implementation. The virtio-balloon driver is extended to send the free page bitmap from guest to QEMU. After getting the free page bitmap, QEMU can use it to filter out guest's free pages. This make the live migration process much more efficient. In order to skip more free pages, we add an interface to let the user decide whether dropping the cache in guest during live migration. Liang Li (2): mm: add the related functions to build the free page bitmap virtio-balloon: extend balloon driver to support the new feature drivers/virtio/virtio_balloon.c | 100 ++-- fs/drop_caches.c| 22 +--- include/linux/fs.h | 1 + include/uapi/linux/virtio_balloon.h | 1 + mm/page_alloc.c | 46 + 5 files changed, 157 insertions(+), 13 deletions(-) -- 1.8.3.1
[Qemu-devel] [PATCH QEMU 2/5] kvm: Add two new arch specific functions
Add a new function to get the vm's max pfn and a new function to filter out the holes to get a tight free page bitmap. They are implemented on X86, and all the arches should implement them for live migration optimization. Signed-off-by: Liang Li <liang.z...@intel.com> --- include/sysemu/kvm.h | 2 ++ target-arm/kvm.c | 14 ++ target-i386/kvm.c| 35 +++ target-mips/kvm.c| 14 ++ target-ppc/kvm.c | 14 ++ target-s390x/kvm.c | 14 ++ 6 files changed, 93 insertions(+) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 0e18f15..5263304 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -228,6 +228,8 @@ int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, target_ulong len, int type); void kvm_remove_all_breakpoints(CPUState *cpu); int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap); +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap); +unsigned long get_guest_max_pfn(void); #ifndef _WIN32 int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset); #endif diff --git a/target-arm/kvm.c b/target-arm/kvm.c index 3671032..59e9417 100644 --- a/target-arm/kvm.c +++ b/target-arm/kvm.c @@ -626,3 +626,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { return (data - 32) & 0x; } + +unsigned long get_guest_max_pfn(void) +{ +/* To be done */ + +return 0; +} + +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +/* To be done */ + +return bmap; +} diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 799fdfa..76a33bd 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -3334,3 +3334,38 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { abort(); } + +unsigned long get_guest_max_pfn(void) +{ +PCMachineState *pcms = PC_MACHINE(current_machine); +ram_addr_t above_4g_mem = pcms->above_4g_mem_size; +unsigned long max_pfn; + +if (above_4g_mem) { +max_pfn = ((1ULL << 32) + above_4g_mem) >> TARGET_PAGE_BITS; +} else { +max_pfn = pcms->below_4g_mem_size >> TARGET_PAGE_BITS; +} + +return max_pfn; +} + 
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +PCMachineState *pcms = PC_MACHINE(current_machine); +ram_addr_t above_4g_mem = pcms->above_4g_mem_size; + +if (above_4g_mem) { +unsigned long *src, *dst, len, pos; +ram_addr_t below_4g_mem = pcms->below_4g_mem_size; +src = bmap + ((1ULL << 32) >> TARGET_PAGE_BITS) / BITS_PER_LONG; +dst = bmap + (below_4g_mem >> TARGET_PAGE_BITS) / BITS_PER_LONG; +bitmap_move(dst, src, above_4g_mem >> TARGET_PAGE_BITS); + +pos = (above_4g_mem + below_4g_mem) >> TARGET_PAGE_BITS; +len = ((1ULL << 32) - below_4g_mem) >> TARGET_PAGE_BITS; +bitmap_clear(bmap, pos, len); +} + +return bmap; +} diff --git a/target-mips/kvm.c b/target-mips/kvm.c index 950bc05..23fdc50 100644 --- a/target-mips/kvm.c +++ b/target-mips/kvm.c @@ -1048,3 +1048,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { abort(); } + +unsigned long get_guest_max_pfn(void) +{ +/* To be done */ + +return 0; +} + +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +/* To be done */ + +return bmap; +} diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index c4c8146..bf4c60f 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -2579,3 +2579,17 @@ int kvmppc_enable_hwrng(void) return kvmppc_enable_hcall(kvm_state, H_RANDOM); } + +unsigned long get_guest_max_pfn(void) +{ +/* To be done */ + +return 0; +} + +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +/* To be done */ + +return bmap; +} diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c index e1859ca..7f5e1b8 100644 --- a/target-s390x/kvm.c +++ b/target-s390x/kvm.c @@ -2250,3 +2250,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { abort(); } + +unsigned long get_guest_max_pfn(void) +{ +/* To be done */ + +return 0; +} + +unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap) +{ +/* To be done */ + +return bmap; +} -- 1.8.3.1
[Qemu-devel] [PATCH QEMU 4/5] migration: filter out free pages during live migration
After sending out the request for free pages, live migration process will start without waiting for the free page bitmap is ready. If the free page bitmap is not ready when doing the 1st migration_bitmap_sync() after ram_save_setup(), the free page bitmap will be ignored, this means the free pages will not be filtered out in this case. The current implementation can not work with post copy, if post copy is enabled, we simply ignore the free pages. Will make it work later. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 101 1 file changed, 101 insertions(+) diff --git a/migration/ram.c b/migration/ram.c index 3f05738..3944426 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -41,6 +41,8 @@ #include "trace.h" #include "exec/ram_addr.h" #include "qemu/rcu_queue.h" +#include "sysemu/balloon.h" +#include "sysemu/kvm.h" #ifdef DEBUG_MIGRATION_RAM #define DPRINTF(fmt, ...) \ @@ -226,6 +228,8 @@ static QemuMutex migration_bitmap_mutex; static uint64_t migration_dirty_pages; static uint32_t last_version; static bool ram_bulk_stage; +static bool ignore_freepage_rsp; +static bool drop_page_cache; /* used by the search for pages to send */ struct PageSearchStatus { @@ -242,6 +246,7 @@ static struct BitmapRcu { struct rcu_head rcu; /* Main migration bitmap */ unsigned long *bmap; +unsigned long *free_page_bmap; /* bitmap of pages that haven't been sent even once * only maintained and used in postcopy at the moment * where it's used to send the dirtymap at the start @@ -639,6 +644,7 @@ static void migration_bitmap_sync(void) rcu_read_unlock(); qemu_mutex_unlock(_bitmap_mutex); +ignore_freepage_rsp = true; trace_migration_bitmap_sync_end(migration_dirty_pages - num_dirty_pages_init); num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init; @@ -1417,6 +1423,9 @@ static void migration_bitmap_free(struct BitmapRcu *bmap) { g_free(bmap->bmap); g_free(bmap->unsentmap); +if (balloon_free_pages_support() && !migrate_postcopy_ram()) { 
+g_free(bmap->free_page_bmap); +} g_free(bmap); } @@ -1487,6 +1496,90 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new) } } +static void filter_out_guest_free_page(unsigned long *free_page_bmap, + long nbits) +{ +long i, page_count = 0, len; +unsigned long *bitmap; + +tighten_guest_free_page_bmap(free_page_bmap); +qemu_mutex_lock(_bitmap_mutex); +bitmap = atomic_rcu_read(_bitmap_rcu)->bmap; +slow_bitmap_complement(bitmap, free_page_bmap, nbits); + +len = (last_ram_offset() >> TARGET_PAGE_BITS) / BITS_PER_LONG; +for (i = 0; i < len; i++) { +page_count += hweight_long(bitmap[i]); +} + +migration_dirty_pages = page_count; +qemu_mutex_unlock(_bitmap_mutex); +} + +static void ram_request_free_page(unsigned long *bmap, unsigned long max_pfn) +{ +FreePageStatus status; + +/* drop_page_cache should be set by user, the related code will be + * added later, set it to ture temporarily. + */ +drop_page_cache = true; + +status = balloon_get_free_pages(bmap, max_pfn, drop_page_cache); +switch (status) { +case FREE_PAGE_REQ: +ignore_freepage_rsp = false; +break; +case FREE_PAGE_ERROR: +error_report("Errro happend when request free page"); +break; +default: +error_report("unexpected response status: %d", status); +break; +} +} + +static void ram_handle_free_page(void) +{ +unsigned long nbits; +RAMBlock *pc_ram_block; +FreePageStatus status; + +status = balloon_get_free_pages(migration_bitmap_rcu->free_page_bmap, +get_guest_max_pfn(), drop_page_cache); +switch (status) { +case FREE_PAGE_READY: +rcu_read_lock(); +pc_ram_block = QLIST_FIRST_RCU(_list.blocks); +nbits = pc_ram_block->used_length >> TARGET_PAGE_BITS; +filter_out_guest_free_page(migration_bitmap_rcu->free_page_bmap, nbits); +rcu_read_unlock(); + +qemu_mutex_lock_iothread(); +migration_bitmap_sync(); +qemu_mutex_unlock_iothread(); +/* + * bulk stage assumes in (migration_bitmap_find_and_reset_dirty) that + * every page is dirty, that's no longer ture at this point. 
+ */ +ram_bulk_stage = false; +last_seen_block = NULL; +last_sent_block = NULL; +last_offset = 0; +break; +case FREE_PAGE_ERROR: +ignore_freepage_rsp = true; +error_report("failed to get free page"); +break; +case FREE_PAGE_INVALID_PARAM: +ignore_freepage_rsp = true; +err
[Qemu-devel] [PATCH QEMU 5/5] migration: Add the interface for cache drop control
Whether drop the cache and drop what kind of cache depend on the user, add the related qmp and hmp interface to query and set the cache control value. Signed-off-by: Liang Li <liang.z...@intel.com> --- hmp.c | 8 include/migration/migration.h | 1 + migration/migration.c | 31 ++- migration/ram.c | 10 ++ qapi-schema.json | 25 ++--- 5 files changed, 63 insertions(+), 12 deletions(-) diff --git a/hmp.c b/hmp.c index d510236..17f418e 100644 --- a/hmp.c +++ b/hmp.c @@ -286,6 +286,9 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, " %s: %" PRId64, MigrationParameter_lookup[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT], params->x_cpu_throttle_increment); +monitor_printf(mon, " %s: %" PRId64, +MigrationParameter_lookup[MIGRATION_PARAMETER_X_DROP_CACHE], +params->x_drop_cache); monitor_printf(mon, "\n"); } @@ -1242,6 +1245,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) bool has_decompress_threads = false; bool has_x_cpu_throttle_initial = false; bool has_x_cpu_throttle_increment = false; +bool has_x_drop_cache = false; int i; for (i = 0; i < MIGRATION_PARAMETER__MAX; i++) { @@ -1262,12 +1266,16 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) case MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT: has_x_cpu_throttle_increment = true; break; +case MIGRATION_PARAMETER_X_DROP_CACHE: +has_x_drop_cache = true; +break; } qmp_migrate_set_parameters(has_compress_level, value, has_compress_threads, value, has_decompress_threads, value, has_x_cpu_throttle_initial, value, has_x_cpu_throttle_increment, value, + has_x_drop_cache, value, ); break; } diff --git a/include/migration/migration.h b/include/migration/migration.h index ac2c12c..873e3bc 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -283,6 +283,7 @@ bool migrate_use_compression(void); int migrate_compress_level(void); int migrate_compress_threads(void); int migrate_decompress_threads(void); +int 
migrate_drop_cache(void); bool migrate_use_events(void); /* Sending on the return path - generic and then for each message type */ diff --git a/migration/migration.c b/migration/migration.c index 991313a..ecd07b8 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -52,6 +52,13 @@ /* Define default autoconverge cpu throttle migration parameters */ #define DEFAULT_MIGRATE_X_CPU_THROTTLE_INITIAL 20 #define DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT 10 +/* Default cache drop control + * 0: no drop + * 1: drop clean page cache + * 2: drop slab cache + * 3: drop both clean and slab cache + */ +#define DEFAULT_MIGRATE_X_DROP_CACHE 0 /* Migration XBZRLE default cache size */ #define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024) @@ -91,6 +98,8 @@ MigrationState *migrate_get_current(void) DEFAULT_MIGRATE_X_CPU_THROTTLE_INITIAL, .parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT] = DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT, +.parameters[MIGRATION_PARAMETER_X_DROP_CACHE] = +DEFAULT_MIGRATE_X_DROP_CACHE, }; if (!once) { @@ -525,6 +534,7 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL]; params->x_cpu_throttle_increment = s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT]; +params->x_drop_cache = s->parameters[MIGRATION_PARAMETER_X_DROP_CACHE]; return params; } @@ -721,7 +731,9 @@ void qmp_migrate_set_parameters(bool has_compress_level, bool has_x_cpu_throttle_initial, int64_t x_cpu_throttle_initial, bool has_x_cpu_throttle_increment, -int64_t x_cpu_throttle_increment, Error **errp) +int64_t x_cpu_throttle_increment, +bool has_x_drop_cache, +int64_t x_drop_cache, Error **errp) { MigrationState *s = migrate_get_current(); @@ -756,6 +768,11 @@ void qmp_migrate_set_parameters(bool has_compress_level, "x_cpu_throttle_increment", "an integer in the range of 1 to 99"); } +if (h
[Qemu-devel] [PATCH QEMU 3/5] virtio-balloon: Add a new feature to balloon device
Extend the virtio balloon device to support a new feature, this new feature can help to get guest's free pages information, which can be used for live migration optimzation. Signed-off-by: Liang Li <liang.z...@intel.com> --- balloon.c | 29 +++- hw/virtio/virtio-balloon.c | 92 - include/hw/virtio/virtio-balloon.h | 30 +++- include/standard-headers/linux/virtio_balloon.h | 1 + include/sysemu/balloon.h| 12 +++- 5 files changed, 159 insertions(+), 5 deletions(-) diff --git a/balloon.c b/balloon.c index f2ef50c..346e215 100644 --- a/balloon.c +++ b/balloon.c @@ -36,6 +36,7 @@ static QEMUBalloonEvent *balloon_event_fn; static QEMUBalloonStatus *balloon_stat_fn; +static QEMUBalloonGetFreePage *balloon_get_free_page_fn; static void *balloon_opaque; static bool balloon_inhibited; @@ -65,9 +66,12 @@ static bool have_balloon(Error **errp) } int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, - QEMUBalloonStatus *stat_func, void *opaque) + QEMUBalloonStatus *stat_func, + QEMUBalloonGetFreePage *get_free_page_func, + void *opaque) { -if (balloon_event_fn || balloon_stat_fn || balloon_opaque) { +if (balloon_event_fn || balloon_stat_fn || balloon_get_free_page_fn +|| balloon_opaque) { /* We're already registered one balloon handler. How many can * a guest really have? */ @@ -75,6 +79,7 @@ int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, } balloon_event_fn = event_func; balloon_stat_fn = stat_func; +balloon_get_free_page_fn = get_free_page_func; balloon_opaque = opaque; return 0; } @@ -86,6 +91,7 @@ void qemu_remove_balloon_handler(void *opaque) } balloon_event_fn = NULL; balloon_stat_fn = NULL; +balloon_get_free_page_fn = NULL; balloon_opaque = NULL; } @@ -116,3 +122,22 @@ void qmp_balloon(int64_t target, Error **errp) trace_balloon_event(balloon_opaque, target); balloon_event_fn(balloon_opaque, target); } + +bool balloon_free_pages_support(void) +{ +return balloon_get_free_page_fn ? 
true : false; +} + +FreePageStatus balloon_get_free_pages(unsigned long *bitmap, + unsigned long len, int drop_cache) +{ +if (!balloon_get_free_page_fn) { +return FREE_PAGE_UNSUPPORT; +} + +if (!bitmap) { +return FREE_PAGE_INVALID_PARAM; +} + +return balloon_get_free_page_fn(balloon_opaque, bitmap, len, drop_cache); +} diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index 9dbe681..0abf375 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -78,6 +78,12 @@ static bool balloon_stats_supported(const VirtIOBalloon *s) return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ); } +static bool balloon_free_pages_supported(const VirtIOBalloon *s) +{ +VirtIODevice *vdev = VIRTIO_DEVICE(s); +return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_GET_FREE_PAGE); +} + static bool balloon_stats_enabled(const VirtIOBalloon *s) { return s->stats_poll_interval > 0; @@ -304,6 +310,38 @@ out: } } +static void virtio_balloon_get_free_pages(VirtIODevice *vdev, VirtQueue *vq) +{ +VirtIOBalloon *s = VIRTIO_BALLOON(vdev); +VirtQueueElement *elem; +size_t offset = 0; +uint64_t bitmap_bytes = 0; + +elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); +if (!elem) { +s->req_status = REQ_ERROR; +return; +} + +s->free_page_vq_elem = elem; + +if (!elem->out_num) { +return; +} + +iov_to_buf(elem->out_sg, elem->out_num, offset, + _bytes, sizeof(uint64_t)); + +if (s->bmap_len < bitmap_bytes) { +s->req_status = REQ_INVALID_PARAM; +return; +} +offset += sizeof(uint64_t); +iov_to_buf(elem->out_sg, elem->out_num, offset, + s->free_page_bmap, bitmap_bytes); +s->req_status = REQ_DONE; +} + static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) { VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); @@ -373,6 +411,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); f |= dev->host_features; virtio_add_feature(, VIRTIO_BALLOON_F_STATS_VQ); +virtio_add_feature(, 
VIRTIO_BALLOON_F_GET_FREE_PAGE); return f; } @@ -383,6 +422,48 @@ static void virtio_balloon_stat(void *opaque, BalloonInfo *info) VIRTIO_BALLOON_PFN_SHIFT); } +static FreePageStatus virtio_balloon_free_pages(void *opaque, +unsigned long *bitmap, +
[Qemu-devel] [PATCH QEMU 1/5] bitmap: Add a new bitmap_move function
Sometimes, it is need to move a portion of bitmap to another place in a large bitmap, if overlap happens, the bitmap_copy can't not work correctly, we need a new function to do this work. Signed-off-by: Liang Li <liang.z...@intel.com> --- include/qemu/bitmap.h | 13 + 1 file changed, 13 insertions(+) diff --git a/include/qemu/bitmap.h b/include/qemu/bitmap.h index 0e33fa5..ce07444 100644 --- a/include/qemu/bitmap.h +++ b/include/qemu/bitmap.h @@ -38,6 +38,7 @@ * bitmap_set(dst, pos, nbits) Set specified bit area * bitmap_set_atomic(dst, pos, nbits) Set specified bit area with atomic ops * bitmap_clear(dst, pos, nbits) Clear specified bit area + * bitmap_move(dst, src, nbits) Move *src to *dst * bitmap_test_and_clear_atomic(dst, pos, nbits)Test and clear area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area */ @@ -137,6 +138,18 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, } } +static inline void bitmap_move(unsigned long *dst, const unsigned long *src, + long nbits) +{ +if (small_nbits(nbits)) { +unsigned long tmp = *src; +*dst = tmp; +} else { +long len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); +memmove(dst, src, len); +} +} + static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, long nbits) { -- 1.8.3.1
[Qemu-devel] [PATCH QEMU 0/5] speed up live migration by skipping free pages
Current QEMU live migration implementation marks all guest's RAM pages as dirtied in the ram bulk stage, all these pages will be processed and it consumes quite a lot of CPU cycles and network bandwidth. >From guest's point of view, it doesn't care about the content in free page. We can make use of this fact and skip processing the free pages, it can save a lot CPU cycles and reduce the network traffic significantly while speed up the live migration process obviously. This patch set is the QEMU side implementation. The virtio-balloon is extended so that QEMU can get the free pages information from the guest. After getting the free page bitmap, QEMU can use it to filter out guest's free pages. This make the live migration process much more efficient. In order to skip more free pages, we add an interface to let the user decide whether dropping the cache in guest during live migration. Performance data Test environment: CPU: Intel (R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz Host RAM: 64GB Host Linux Kernel: 4.4.0 Host OS: CentOS 7.1 Guest Linux Kernel: 4.5.rc6Guest OS: CentOS 6.6 Network: Intel X710 with 10 Gigabit connection Guest RAM: 8GB Case 1: Idle guest just boots: == | original | skip free page -- total time(ms) |1505 | 573 -- transferred ram(KB) | 399792 | 379057 == Case 2: The guest has ever run some memory consuming workload, the workload is terminated before live migration. == | original | skip free page -- total time(ms) | 10641| 597 -- transferred ram(KB) | 8350829 | 389900 == Case 3: The guest has ever built the linux kernel, the building is completed before live migration. The page cache is dropped during live migration in this case. 
== | original | skip free page -- total time(ms) | 2636| 914 -- transferred ram(KB) | 1316747 | 421980 == Liang Li (5): bitmap: Add a new bitmap_move function kvm: Add two new arch specific functions virtio-balloon: Add a new feature to balloon device migration: filter out free pages during live migration migration: Add the interface for cache drop control balloon.c | 29 +++- hmp.c | 8 +++ hw/virtio/virtio-balloon.c | 92 +++- include/hw/virtio/virtio-balloon.h | 30 +++- include/migration/migration.h | 1 + include/qemu/bitmap.h | 13 include/standard-headers/linux/virtio_balloon.h | 1 + include/sysemu/balloon.h| 12 +++- include/sysemu/kvm.h| 2 + migration/migration.c | 31 +++- migration/ram.c | 95 + qapi-schema.json| 25 ++- target-arm/kvm.c| 14 target-i386/kvm.c | 35 + target-mips/kvm.c | 14 target-ppc/kvm.c| 14 target-s390x/kvm.c | 14 17 files changed, 421 insertions(+), 9 deletions(-) -- 1.8.3.1
[Qemu-devel] [PATCH] migration: remove useless code
page_buffer is set twice repeatedly, remove the previous set. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 1 - 1 file changed, 1 deletion(-) diff --git a/migration/ram.c b/migration/ram.c index 3f05738..31d40f4 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2322,7 +2322,6 @@ static int ram_load_postcopy(QEMUFile *f) ret = -EINVAL; break; } -page_buffer = host; /* * Postcopy requires that we place whole host pages atomically. * To make it atomic, the data is read into a temporary page -- 1.9.1
[Qemu-devel] [RFC Design Doc]Speed up live migration by skipping free pages
I have sent the RFC version patch set for live migration optimization by skipping processing the free pages in the ram bulk stage and received a lot of comments. The related threads can be found at: https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00715.html https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00714.html https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00717.html https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00716.html https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00718.html https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00719.html https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00720.html https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00721.html To make things easier, I wrote this doc about the possible designs and my choices. Comments are welcome! Content === 1. Background 2. Why not use virtio-balloon 3. Virtio interface 4. Constructing free page bitmap 5. Tighten free page bitmap 6. Handling page cache in the guest 7. APIs for live migration 8. Pseudo code?? Details === 1. Background As we know, in the ram bulk stage of live migration, current QEMU live migration implementation mark the all guest's RAM pages as dirtied in the ram bulk stage, all these pages will be checked for zero page first, and the page content will be sent to the destination depends on the checking result, that process consumes quite a lot of CPU cycles and network bandwidth. >From guest's point of view, there are some pages currently not used by the guest, guest doesn't care about the content in these pages. Free pages are this kind of pages which are not used by guest. We can make use of this fact and skip processing the free pages in the ram bulk stage, it can save a lot CPU cycles and reduce the network traffic while speed up the live migration process obviously. Usually, only the guest has the information of free pages. 
But it's possible to let the guest tell QEMU its free page information by some mechanism. E.g. Through the virtio interface. Once QEMU gets the free page information, it can skip processing these free pages in the ram bulk stage by clearing the corresponding bit of the migration bitmap. 2. Why not use virtio-balloon Actually, the virtio-balloon can do the similar thing by inflating the balloon before live migration, but its performance is no good, for an 8GB idle guest just boots, it takes about 5.7 Sec to inflate the balloon to 7GB, but it only takes 25ms to get a valid free page bitmap from the guest. There are some reasons for the bad performance of virtio-balloon: a. allocating pages (5%, 304ms) b. sending PFNs to host (71%, 4194ms) c. address translation and madvise() operation (24%, 1423ms) Debugging shows the time spent on these operations is listed in the brackets above. By changing the VIRTIO_BALLOON_ARRAY_PFNS_MAX to a large value, such as 16384, the time spent on sending the PFNs can be reduced to about 400ms, but it's still too long. Obviously, the virtio-balloon mechanism has a bigger performance impact to the guest than the way we are trying to implement. 3. Virtio interface There are three different ways of using the virtio interface to send the free page information. a. Extend the current virtio device The virtio spec has already defined some virtio devices, and we can extend one of these devices so as to use it to transport the free page information. It requires modifying the virtio spec. b. Implement a new virtio device Implementing a brand new virtio device to exchange information between host and guest is another choice. It requires modifying the virtio spec too. c. Make use of virtio-serial (Amit's suggestion, my choice) It's possible to make use of the virtio-serial for communication between host and guest, the benefit of this solution is no need to modify the virtio spec. 4.
Construct free page bitmap To minimize the space for saving free page information, it's better to use a bitmap to describe the free pages. There are two ways to construct the free page bitmap. a. Construct free page bitmap on demand (My choice) Guest can allocate memory for the free page bitmap only when it receives the request from QEMU, and set the free page bitmap by traversing the free page list. The advantage of this way is that it's quite simple and easy to implement. The disadvantage is that the traversing operation may consume quite a long time when there are a lot of free pages. (About 20ms for 7GB free pages) b. Update free page bitmap when allocating/freeing pages Another choice is to allocate the memory for the free page bitmap when guest boots, and then update the free page bitmap when allocating/freeing pages. It needs more modification to the code related to memory management in guest. The advantage of this way is that guest can respond to QEMU's request for a free page bitmap very quickly, no matter how many free pages in the guest. Do the kernel guys like this? 5.
[Qemu-devel] [PATCH v6 2/2] cutils: add avx2 instruction optimization
buffer_find_nonzero_offset() is a hot function during live migration. Now it use SSE2 instructions for optimization. For platform supports AVX2 instructions, use AVX2 instructions for optimization can help to improve the performance of buffer_find_nonzero_offset() about 30% comparing to SSE2. Live migration can be faster with this optimization, the test result shows that for an 8GiB RAM idle guest just boots, this patch can help to shorten the total live migration time about 6%. This patch use the ifunc mechanism to select the proper function when running, for platform supports AVX2, execute the AVX2 instructions, else, execute the original instructions. Signed-off-by: Liang Li <liang.z...@intel.com> Suggested-by: Paolo Bonzini <pbonz...@redhat.com> Suggested-by: Richard Henderson <r...@twiddle.net> Reviewed-by: Paolo Bonzini <pbonz...@redhat.com> --- include/qemu-common.h | 8 +--- util/cutils.c | 124 -- 2 files changed, 121 insertions(+), 11 deletions(-) diff --git a/include/qemu-common.h b/include/qemu-common.h index ced2994..887ca71 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -476,13 +476,7 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size); #endif #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 -static inline bool -can_use_buffer_find_nonzero_offset(const void *buf, size_t len) -{ -return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR - * sizeof(VECTYPE)) == 0 -&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0); -} +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len); size_t buffer_find_nonzero_offset(const void *buf, size_t len); /* diff --git a/util/cutils.c b/util/cutils.c index 59e1f70..c3dd534 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -160,6 +160,14 @@ int qemu_fdatasync(int fd) #endif } +static bool +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len) +{ +return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR + * sizeof(VECTYPE)) == 0 +&& ((uintptr_t) buf) % 
sizeof(VECTYPE) == 0); +} + /* * Searches for an area with non-zero content in a buffer * @@ -168,8 +176,8 @@ int qemu_fdatasync(int fd) * and addr must be a multiple of sizeof(VECTYPE) due to * restriction of optimizations in this function. * - * can_use_buffer_find_nonzero_offset() can be used to check - * these requirements. + * can_use_buffer_find_nonzero_offset_inner() can be used to + * check these requirements. * * The return value is the offset of the non-zero area rounded * down to a multiple of sizeof(VECTYPE) for the first @@ -180,13 +188,13 @@ int qemu_fdatasync(int fd) * If the buffer is all zero the return value is equal to len. */ -size_t buffer_find_nonzero_offset(const void *buf, size_t len) +static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len) { const VECTYPE *p = buf; const VECTYPE zero = (VECTYPE){0}; size_t i; -assert(can_use_buffer_find_nonzero_offset(buf, len)); +assert(can_use_buffer_find_nonzero_offset_inner(buf, len)); if (!len) { return 0; @@ -216,6 +224,114 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len) } /* + * GCC before version 4.9 has a bug which will cause the target + * attribute work incorrectly and failed to compile in some case, + * restrict the gcc version to 4.9+ to prevent the failure. 
+ */ + +#if defined CONFIG_AVX2_OPT && QEMU_GNUC_PREREQ(4, 9) +#pragma GCC push_options +#pragma GCC target("avx2") +#include +#include + +#define AVX2_VECTYPE__m256i +#define AVX2_SPLAT(p) _mm256_set1_epi8(*(p)) +#define AVX2_ALL_EQ(v1, v2) \ +(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0x) +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2)) + +static bool +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ +return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR + * sizeof(AVX2_VECTYPE)) == 0 +&& ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); +} + +static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ +const AVX2_VECTYPE *p = buf; +const AVX2_VECTYPE zero = (AVX2_VECTYPE){0}; +size_t i; + +assert(can_use_buffer_find_nonzero_offset_avx2(buf, len)); + +if (!len) { +return 0; +} + +for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) { +if (!AVX2_ALL_EQ(p[i], zero)) { +return i * sizeof(AVX2_VECTYPE); +} +} + +for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; + i < len / sizeof(AVX2_VECTYPE); + i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { +AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]); +AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]); +AVX2_VE
[Qemu-devel] [PATCH v6 0/2] add avx2 instruction optimization
buffer_find_nonzero_offset() is a hot function during live migration. Now it use SSE2 instructions for optimization. For platform supports AVX2 instructions, use the AVX2 instructions for optimization can help to improve the performance of zero page checking about 30% comparing to SSE2. Live migration can be faster with this optimization, the test result shows that for an 8GB RAM idle guest, this patch can help to shorten the total live migration time about 6%. This patch use the ifunc mechanism to select the proper function when running, for platform supports AVX2, execute the AVX2 instructions, else, execute the original instructions. With this patch, the QEMU binary can run on both platforms support AVX2 or not. Compiler which doesn't support the AVX2 and ifunc attribute can also build the source code successfully. v5 -> v6 changes: * Restrict the optimization to GCC 4.9+ to prevent compiling failure in some case (Paolo's suggestion) v4 -> v5 changes: * Enhance the ifunc attribute detection (Paolo's suggestion) v3 -> v4 changes: * Use the GCC #pragma to make things simple (Paolo's suggestion) * Put avx2 related code in cutils.c (Richard's suggestion) * Change the configure, detect ifunc and avx2 attributes together v2 -> v3 changes: * Detect the ifunc attribute support (Paolo's suggestion) * Use the ifunc attribute instead of the inline asm (Richard's suggestion) * Change the configure (Juan's suggestion) Liang Li (2): configure: detect ifunc and avx2 attribute cutils: add avx2 instruction optimization configure | 21 + include/qemu-common.h | 8 +--- util/cutils.c | 124 -- 3 files changed, 142 insertions(+), 11 deletions(-) -- 1.9.1
[Qemu-devel] [PATCH v6 1/2] configure: detect ifunc and avx2 attribute
Detect if the compiler can support the ifun and avx2, if so, set CONFIG_AVX2_OPT which will be used to turn on the avx2 instruction optimization. Suggested-by: Paolo Bonzini <pbonz...@redhat.com> Suggested-by: Peter Maydell <peter.mayd...@linaro.org> Signed-off-by: Liang Li <liang.z...@intel.com> --- configure | 21 + 1 file changed, 21 insertions(+) diff --git a/configure b/configure index 0c0472a..2b32876 100755 --- a/configure +++ b/configure @@ -280,6 +280,7 @@ libusb="" usb_redir="" opengl="" opengl_dmabuf="no" +avx2_opt="no" zlib="yes" lzo="" snappy="" @@ -1773,6 +1774,21 @@ EOF fi ## +# avx2 optimization requirement check + +cat > $TMPC << EOF +static void bar(void) {} +static void *bar_ifunc(void) {return (void*) bar;} +static void foo(void) __attribute__((ifunc("bar_ifunc"))); +int main(void) { foo(); return 0; } +EOF +if compile_prog "-mavx2" "" ; then +if readelf --syms $TMPE |grep "IFUNC.*foo" >/dev/null 2>&1; then +avx2_opt="yes" +fi +fi + +# # zlib check if test "$zlib" != "no" ; then @@ -4790,6 +4806,7 @@ echo "bzip2 support $bzip2" echo "NUMA host support $numa" echo "tcmalloc support $tcmalloc" echo "jemalloc support $jemalloc" +echo "avx2 optimization $avx2_opt" if test "$sdl_too_old" = "yes"; then echo "-> Your SDL version is too old - please upgrade to have SDL support" @@ -5178,6 +5195,10 @@ if test "$opengl" = "yes" ; then fi fi +if test "$avx2_opt" = "yes" ; then + echo "CONFIG_AVX2_OPT=y" >> $config_host_mak +fi + if test "$lzo" = "yes" ; then echo "CONFIG_LZO=y" >> $config_host_mak fi -- 1.9.1
[Qemu-devel] [RFC kernel 2/2] virtio-balloon: extend balloon driver to support a new feature
Extend the virio balloon to support the new feature VIRTIO_BALLOON_F_GET_FREE_PAGES, so that we can use it to send the free pages information from guest to QEMU, and then optimize the live migration process. Signed-off-by: Liang Li <liang.z...@intel.com> --- drivers/virtio/virtio_balloon.c | 106 ++-- include/uapi/linux/virtio_balloon.h | 1 + 2 files changed, 102 insertions(+), 5 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 0c3691f..7461d3e 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -45,9 +45,18 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES; module_param(oom_pages, int, S_IRUSR | S_IWUSR); MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); +extern void get_free_pages(unsigned long *free_page_bitmap, + unsigned long *free_pages_num, + unsigned long lowmem); +extern unsigned long get_total_pages_count(unsigned long lowmem); + +struct mem_layout { + unsigned long low_mem; +}; + struct virtio_balloon { struct virtio_device *vdev; - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_pages_vq; /* Where the ballooning thread waits for config to change. 
*/ wait_queue_head_t config_change; @@ -75,6 +84,11 @@ struct virtio_balloon { unsigned int num_pfns; u32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX]; + unsigned long *free_pages; + unsigned long free_pages_len; + unsigned long free_pages_num; + struct mem_layout mem_config; + /* Memory statistics */ int need_stats_update; struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; @@ -245,6 +259,34 @@ static void update_balloon_stats(struct virtio_balloon *vb) pages_to_bytes(i.totalram)); } +static void update_free_pages_stats(struct virtio_balloon *vb) +{ + unsigned long total_page_count, bitmap_bytes; + + total_page_count = get_total_pages_count(vb->mem_config.low_mem); + bitmap_bytes = ALIGN(total_page_count, BITS_PER_LONG) / 8; + + if (!vb->free_pages) + vb->free_pages = kzalloc(bitmap_bytes, GFP_KERNEL); + else { + if (bitmap_bytes < vb->free_pages_len) + memset(vb->free_pages, 0, bitmap_bytes); + else { + kfree(vb->free_pages); + vb->free_pages = kzalloc(bitmap_bytes, GFP_KERNEL); + } + } + if (!vb->free_pages) { + vb->free_pages_len = 0; + vb->free_pages_num = 0; + return; + } + + vb->free_pages_len = bitmap_bytes; + get_free_pages(vb->free_pages, >free_pages_num, + vb->mem_config.low_mem); +} + /* * While most virtqueues communicate guest-initiated requests to the hypervisor, * the stats queue operates in reverse. 
The driver initializes the virtqueue @@ -278,6 +320,39 @@ static void stats_handle_request(struct virtio_balloon *vb) virtqueue_kick(vq); } +static void free_pages_handle_rq(struct virtio_balloon *vb) +{ + struct virtqueue *vq; + struct scatterlist sg[3]; + unsigned int len; + struct mem_layout *ptr_mem_layout; + struct scatterlist sg_in; + + vq = vb->free_pages_vq; + ptr_mem_layout = virtqueue_get_buf(vq, ); + + if (!ptr_mem_layout) + return; + update_free_pages_stats(vb); + sg_init_table(sg, 3); + sg_set_buf([0], &(vb->free_pages_num), sizeof(vb->free_pages_num)); + sg_set_buf([1], &(vb->free_pages_len), sizeof(vb->free_pages_len)); + sg_set_buf([2], vb->free_pages, vb->free_pages_len); + + sg_init_one(_in, >mem_config, sizeof(vb->mem_config)); + + virtqueue_add_outbuf(vq, [0], 3, vb, GFP_KERNEL); + virtqueue_add_inbuf(vq, _in, 1, >mem_config, GFP_KERNEL); + virtqueue_kick(vq); +} + +static void free_pages_rq(struct virtqueue *vq) +{ + struct virtio_balloon *vb = vq->vdev->priv; + + free_pages_handle_rq(vb); +} + static void virtballoon_changed(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; @@ -386,16 +461,22 @@ static int balloon(void *_vballoon) static int init_vqs(struct virtio_balloon *vb) { - struct virtqueue *vqs[3]; - vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request }; - static const char * const names[] = { "inflate", "deflate", "stats" }; + struct virtqueue *vqs[4]; + vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, +stats_request, free_pages_rq }; + const char *names[] = { &
[Qemu-devel] [RFC kernel 1/2] mm: Add the functions used to get free pages information
get_total_pages_count() tries to get the page count of the system RAM. get_free_pages() is intend to construct a free pages bitmap by traversing the free_list. The free pages information will be sent to QEMU through virtio and used for live migration optimization. Signed-off-by: Liang Li <liang.z...@intel.com> --- mm/page_alloc.c | 57 + 1 file changed, 57 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 838ca8bb..81922e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3860,6 +3860,63 @@ void show_free_areas(unsigned int filter) show_swap_cache_info(); } +#define PFN_4G (0x1 >> PAGE_SHIFT) + +unsigned long get_total_pages_count(unsigned long low_mem) +{ + if (max_pfn >= PFN_4G) { + unsigned long pfn_gap = PFN_4G - (low_mem >> PAGE_SHIFT); + + return max_pfn - pfn_gap; + } else + return max_pfn; +} +EXPORT_SYMBOL(get_total_pages_count); + +static void mark_free_pages_bitmap(struct zone *zone, +unsigned long *free_page_bitmap, unsigned long pfn_gap) +{ + unsigned long pfn, flags, i; + unsigned int order, t; + struct list_head *curr; + + if (zone_is_empty(zone)) + return; + + spin_lock_irqsave(>lock, flags); + + for_each_migratetype_order(order, t) { + list_for_each(curr, >free_area[order].free_list[t]) { + + pfn = page_to_pfn(list_entry(curr, struct page, lru)); + for (i = 0; i < (1UL << order); i++) { + if ((pfn + i) >= PFN_4G) + set_bit_le(pfn + i - pfn_gap, + free_page_bitmap); + else + set_bit_le(pfn + i, free_page_bitmap); + } + } + } + + spin_unlock_irqrestore(>lock, flags); +} + +void get_free_pages(unsigned long *free_page_bitmap, + unsigned long *free_pages_count, + unsigned long low_mem) +{ + struct zone *zone; + unsigned long pfn_gap; + + pfn_gap = PFN_4G - (low_mem >> PAGE_SHIFT); + for_each_populated_zone(zone) + mark_free_pages_bitmap(zone, free_page_bitmap, pfn_gap); + + *free_pages_count = global_page_state(NR_FREE_PAGES); +} +EXPORT_SYMBOL(get_free_pages); + static void zoneref_set_zone(struct zone *zone, struct zoneref 
*zoneref) { zoneref->zone = zone; -- 1.8.3.1
[Qemu-devel] [RFC qemu 4/4] migration: filter out guest's free pages in ram bulk stage
Get the free pages information through virtio and filter out the free pages in the ram bulk stage. This can significantly reduce the total live migration time as well as network traffic. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 52 ++-- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index ee2547d..819553b 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -40,6 +40,7 @@ #include "trace.h" #include "exec/ram_addr.h" #include "qemu/rcu_queue.h" +#include "sysemu/balloon.h" #ifdef DEBUG_MIGRATION_RAM #define DPRINTF(fmt, ...) \ @@ -241,6 +242,7 @@ static struct BitmapRcu { struct rcu_head rcu; /* Main migration bitmap */ unsigned long *bmap; +unsigned long *free_pages_bmap; /* bitmap of pages that haven't been sent even once * only maintained and used in postcopy at the moment * where it's used to send the dirtymap at the start @@ -561,12 +563,7 @@ ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb, unsigned long next; bitmap = atomic_rcu_read(_bitmap_rcu)->bmap; -if (ram_bulk_stage && nr > base) { -next = nr + 1; -} else { -next = find_next_bit(bitmap, size, nr); -} - +next = find_next_bit(bitmap, size, nr); *ram_addr_abs = next << TARGET_PAGE_BITS; return (next - base) << TARGET_PAGE_BITS; } @@ -1415,6 +1412,9 @@ void free_xbzrle_decoded_buf(void) static void migration_bitmap_free(struct BitmapRcu *bmap) { g_free(bmap->bmap); +if (balloon_free_pages_support()) { +g_free(bmap->free_pages_bmap); +} g_free(bmap->unsentmap); g_free(bmap); } @@ -1873,6 +1873,28 @@ err: return ret; } +static void filter_out_guest_free_pages(unsigned long *free_pages_bmap) +{ +RAMBlock *block; +DirtyMemoryBlocks *blocks; +unsigned long end, page; + +blocks = atomic_rcu_read(_list.dirty_memory[DIRTY_MEMORY_MIGRATION]); +block = QLIST_FIRST_RCU(_list.blocks); +end = TARGET_PAGE_ALIGN(block->offset + +block->used_length) >> TARGET_PAGE_BITS; +page = block->offset >> TARGET_PAGE_BITS; + +while (page < end) { 
+unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE; +unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE; +unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset); +unsigned long *p = free_pages_bmap + BIT_WORD(page); + +slow_bitmap_complement(blocks->blocks[idx], p, num); +page += num; +} +} /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has * long-running RCU critical section. When rcu-reclaims in the code @@ -1884,6 +1906,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) { RAMBlock *block; int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */ +uint64_t free_pages_count = 0; dirty_rate_high_cnt = 0; bitmap_sync_count = 0; @@ -1931,6 +1954,9 @@ static int ram_save_setup(QEMUFile *f, void *opaque) ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS; migration_bitmap_rcu = g_new0(struct BitmapRcu, 1); migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages); +if (balloon_free_pages_support()) { +migration_bitmap_rcu->free_pages_bmap = bitmap_new(ram_bitmap_pages); +} if (migrate_postcopy_ram()) { migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages); @@ -1945,6 +1971,20 @@ static int ram_save_setup(QEMUFile *f, void *opaque) DIRTY_MEMORY_MIGRATION); } memory_global_dirty_log_start(); + +if (balloon_free_pages_support() && +balloon_get_free_pages(migration_bitmap_rcu->free_pages_bmap, + _pages_count) == 0) { +qemu_mutex_unlock_iothread(); +while (balloon_get_free_pages(migration_bitmap_rcu->free_pages_bmap, + _pages_count) == 0) { +usleep(1000); +} +qemu_mutex_lock_iothread(); + +filter_out_guest_free_pages(migration_bitmap_rcu->free_pages_bmap); +} + migration_bitmap_sync(); qemu_mutex_unlock_ramlist(); qemu_mutex_unlock_iothread(); -- 1.8.3.1
[Qemu-devel] [RFC kernel 0/2] A PV solution for KVM live migration optimization
The current QEMU live migration implementation mark the all the guest's RAM pages as dirtied in the ram bulk stage, all these pages will be processed and that takes quit a lot of CPU cycles. >From guest's point of view, it doesn't care about the content in free pages. We can make use of this fact and skip processing the free pages in the ram bulk stage, it can save a lot CPU cycles and reduce the network traffic significantly while speed up the live migration process obviously. This patch set is the kernel side implementation. It get the free pages information by traversing zone->free_area[order].free_list, and construct a free pages bitmap. The virtio-balloon driver is extended so as to send the free pages bitmap to QEMU for live migration optimization. Performance data Test environment: CPU: Intel (R) Xeon(R) CPU ES-2699 v3 @ 2.30GHz Host RAM: 64GB Host Linux Kernel: 4.2.0 Host OS: CentOS 7.1 Guest Linux Kernel: 4.5.rc6Guest OS: CentOS 6.6 Network: X540-AT2 with 10 Gigabit connection Guest RAM: 8GB Case 1: Idle guest just boots: | original |pv --- total time(ms) |1894 | 421 transferred ram(KB) | 398017 | 353242 Case 2: The guest has ever run some memory consuming workload, the workload is terminated just before live migration. | original |pv --- total time(ms) | 7436| 552 transferred ram(KB) | 8146291 | 361375 Liang Li (2): mm: Add the functions used to get free pages information virtio-balloon: extend balloon driver to support a new feature drivers/virtio/virtio_balloon.c | 108 ++-- include/uapi/linux/virtio_balloon.h | 1 + mm/page_alloc.c | 58 +++ 3 files changed, 162 insertions(+), 5 deletions(-) -- 1.8.3.1
[Qemu-devel] [RFC qemu 2/4] virtio-balloon: Add a new feature to balloon device
Extend the virtio balloon device to support a new feature, this new feature can help to get guest's free pages information, which can be used for live migration optimzation. Signed-off-by: Liang Li <liang.z...@intel.com> --- balloon.c | 30 - hw/virtio/virtio-balloon.c | 81 - include/hw/virtio/virtio-balloon.h | 17 +- include/standard-headers/linux/virtio_balloon.h | 1 + include/sysemu/balloon.h| 10 ++- 5 files changed, 134 insertions(+), 5 deletions(-) diff --git a/balloon.c b/balloon.c index f2ef50c..a37717e 100644 --- a/balloon.c +++ b/balloon.c @@ -36,6 +36,7 @@ static QEMUBalloonEvent *balloon_event_fn; static QEMUBalloonStatus *balloon_stat_fn; +static QEMUBalloonFreePages *balloon_free_pages_fn; static void *balloon_opaque; static bool balloon_inhibited; @@ -65,9 +66,12 @@ static bool have_balloon(Error **errp) } int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, - QEMUBalloonStatus *stat_func, void *opaque) + QEMUBalloonStatus *stat_func, + QEMUBalloonFreePages *free_pages_func, + void *opaque) { -if (balloon_event_fn || balloon_stat_fn || balloon_opaque) { +if (balloon_event_fn || balloon_stat_fn || balloon_free_pages_fn +|| balloon_opaque) { /* We're already registered one balloon handler. How many can * a guest really have? */ @@ -75,6 +79,7 @@ int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, } balloon_event_fn = event_func; balloon_stat_fn = stat_func; +balloon_free_pages_fn = free_pages_func; balloon_opaque = opaque; return 0; } @@ -86,6 +91,7 @@ void qemu_remove_balloon_handler(void *opaque) } balloon_event_fn = NULL; balloon_stat_fn = NULL; +balloon_free_pages_fn = NULL; balloon_opaque = NULL; } @@ -116,3 +122,23 @@ void qmp_balloon(int64_t target, Error **errp) trace_balloon_event(balloon_opaque, target); balloon_event_fn(balloon_opaque, target); } + +bool balloon_free_pages_support(void) +{ +return balloon_free_pages_fn ? 
true : false; +} + +int balloon_get_free_pages(unsigned long *free_pages_bitmap, + unsigned long *free_pages_count) +{ +if (!balloon_free_pages_fn) { +return -1; +} + +if (!free_pages_bitmap || !free_pages_count) { +return -1; +} + +return balloon_free_pages_fn(balloon_opaque, + free_pages_bitmap, free_pages_count); + } diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index e9c30e9..a5b9d08 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -76,6 +76,12 @@ static bool balloon_stats_supported(const VirtIOBalloon *s) return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ); } +static bool balloon_free_pages_supported(const VirtIOBalloon *s) +{ +VirtIODevice *vdev = VIRTIO_DEVICE(s); +return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_GET_FREE_PAGES); +} + static bool balloon_stats_enabled(const VirtIOBalloon *s) { return s->stats_poll_interval > 0; @@ -293,6 +299,37 @@ out: } } +static void virtio_balloon_get_free_pages(VirtIODevice *vdev, VirtQueue *vq) +{ +VirtIOBalloon *s = VIRTIO_BALLOON(vdev); +VirtQueueElement *elem; +size_t offset = 0; +uint64_t bitmap_bytes = 0, free_pages_count = 0; + +elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); +if (!elem) { +return; +} +s->free_pages_vq_elem = elem; + +if (!elem->out_num) { +return; +} + +iov_to_buf(elem->out_sg, elem->out_num, offset, + _pages_count, sizeof(uint64_t)); + +offset += sizeof(uint64_t); +iov_to_buf(elem->out_sg, elem->out_num, offset, + _bytes, sizeof(uint64_t)); + +offset += sizeof(uint64_t); +iov_to_buf(elem->out_sg, elem->out_num, offset, + s->free_pages_bitmap, bitmap_bytes); +s->req_status = DONE; +s->free_pages_count = free_pages_count; +} + static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) { VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); @@ -362,6 +399,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f, VirtIOBalloon *dev = VIRTIO_BALLOON(vdev); f |= dev->host_features; 
virtio_add_feature(, VIRTIO_BALLOON_F_STATS_VQ); +virtio_add_feature(, VIRTIO_BALLOON_F_GET_FREE_PAGES); return f; } @@ -372,6 +410,45 @@ static void virtio_balloon_stat(void *opaque, BalloonInfo *info) VIRTIO_BALLOON_PFN_SHIFT); } +static int virtio_balloon_free_pages(void *opaque, + unsigned long *free_pages_bitmap, +
[Qemu-devel] [RFC qemu 1/4] pc: Add code to get the lowmem from PCMachineState
The lowmem will be used by the following patch to get a correct free pages bitmap. Signed-off-by: Liang Li <liang.z...@intel.com> --- hw/i386/pc.c | 5 + hw/i386/pc_piix.c| 1 + hw/i386/pc_q35.c | 1 + include/hw/i386/pc.h | 3 ++- 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 0aeefd2..f794a84 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1115,6 +1115,11 @@ void pc_hot_add_cpu(const int64_t id, Error **errp) object_unref(OBJECT(cpu)); } +ram_addr_t pc_get_lowmem(PCMachineState *pcms) +{ + return pcms->lowmem; +} + void pc_cpus_init(PCMachineState *pcms) { int i; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 6f8c2cd..268a08c 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -113,6 +113,7 @@ static void pc_init1(MachineState *machine, } } +pcms->lowmem = lowmem; if (machine->ram_size >= lowmem) { pcms->above_4g_mem_size = machine->ram_size - lowmem; pcms->below_4g_mem_size = lowmem; diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 46522c9..8d9bd39 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -101,6 +101,7 @@ static void pc_q35_init(MachineState *machine) } } +pcms->lowmem = lowmem; if (machine->ram_size >= lowmem) { pcms->above_4g_mem_size = machine->ram_size - lowmem; pcms->below_4g_mem_size = lowmem; diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 8b3546e..3694c91 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -60,7 +60,7 @@ struct PCMachineState { bool nvdimm; /* RAM information (sizes, addresses, configuration): */ -ram_addr_t below_4g_mem_size, above_4g_mem_size; +ram_addr_t below_4g_mem_size, above_4g_mem_size, lowmem; /* CPU and apic information: */ bool apic_xrupt_override; @@ -229,6 +229,7 @@ void pc_hot_add_cpu(const int64_t id, Error **errp); void pc_acpi_init(const char *default_dsdt); void pc_guest_info_init(PCMachineState *pcms); +ram_addr_t pc_get_lowmem(PCMachineState *pcms); #define PCI_HOST_PROP_PCI_HOLE_START "pci-hole-start" #define 
PCI_HOST_PROP_PCI_HOLE_END "pci-hole-end" -- 1.8.3.1
[Qemu-devel] [RFC qemu 0/4] A PV solution for live migration optimization
The current QEMU live migration implementation marks all the guest's RAM pages as dirtied in the ram bulk stage; all these pages will be processed and that takes quite a lot of CPU cycles. From the guest's point of view, it doesn't care about the content in free pages. We can make use of this fact and skip processing the free pages in the ram bulk stage; it can save a lot of CPU cycles and reduce the network traffic significantly while speeding up the live migration process obviously. This patch set is the QEMU side implementation. The virtio-balloon is extended so that QEMU can get the free pages information from the guest through virtio. After getting the free pages information (a bitmap), QEMU can use it to filter out the guest's free pages in the ram bulk stage. This makes the live migration process much more efficient. This RFC version doesn't take post-copy and RDMA into consideration; maybe both of them can benefit from this PV solution with some extra modifications. Performance data Test environment: CPU: Intel (R) Xeon(R) CPU ES-2699 v3 @ 2.30GHz Host RAM: 64GB Host Linux Kernel: 4.2.0 Host OS: CentOS 7.1 Guest Linux Kernel: 4.5.rc6 Guest OS: CentOS 6.6 Network: X540-AT2 with 10 Gigabit connection Guest RAM: 8GB Case 1: Idle guest just boots: | original |pv --- total time(ms) |1894 | 421 transferred ram(KB) | 398017 | 353242 Case 2: The guest has run some memory-consuming workload; the workload is terminated just before live migration. 
| original |pv --- total time(ms) | 7436| 552 transferred ram(KB) | 8146291 | 361375 ==== Liang Li (4): pc: Add code to get the lowmem form PCMachineState virtio-balloon: Add a new feature to balloon device migration: not set migration bitmap in setup stage migration: filter out guest's free pages in ram bulk stage balloon.c | 30 - hw/i386/pc.c| 5 ++ hw/i386/pc_piix.c | 1 + hw/i386/pc_q35.c| 1 + hw/virtio/virtio-balloon.c | 81 - include/hw/i386/pc.h| 3 +- include/hw/virtio/virtio-balloon.h | 17 +- include/standard-headers/linux/virtio_balloon.h | 1 + include/sysemu/balloon.h| 10 ++- migration/ram.c | 64 +++ 10 files changed, 195 insertions(+), 18 deletions(-) -- 1.8.3.1
[Qemu-devel] [RFC qemu 3/4] migration: do not set migration bitmap in setup stage
Set ram_list.dirty_memory instead of migration bitmap, the migration bitmap will be update when doing migration_bitmap_sync(). Set migration_dirty_pages to 0 and it will be updated by migration_dirty_pages() too. The following patch is based on this change. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 704f6a9..ee2547d 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1931,19 +1931,19 @@ static int ram_save_setup(QEMUFile *f, void *opaque) ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS; migration_bitmap_rcu = g_new0(struct BitmapRcu, 1); migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages); -bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages); if (migrate_postcopy_ram()) { migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages); bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages); } -/* - * Count the total number of pages used by ram blocks not including any - * gaps due to alignment or unplugs. - */ -migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; +migration_dirty_pages = 0; +QLIST_FOREACH_RCU(block, _list.blocks, next) { +cpu_physical_memory_set_dirty_range(block->offset, +block->used_length, +DIRTY_MEMORY_MIGRATION); +} memory_global_dirty_log_start(); migration_bitmap_sync(); qemu_mutex_unlock_ramlist(); -- 1.8.3.1
[Qemu-devel] [PATCH v3 1/2] qemu-file: Fix qemu_put_compression_data flaw
Current qemu_put_compression_data can only work with a non-writable QEMUFile, and can't work with a writable QEMUFile. But it does not provide any measure to prevent users from using it with a writable QEMUFile. We should fix this flaw to make it work with a writable QEMUFile. Suggested-by: Juan Quintela <quint...@redhat.com> Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/qemu-file.c | 23 +-- migration/ram.c |6 +- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 6f4a129..b0ef1f3 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -607,8 +607,14 @@ uint64_t qemu_get_be64(QEMUFile *f) return v; } -/* compress size bytes of data start at p with specific compression +/* Compress size bytes of data start at p with specific compression * level and store the compressed data to the buffer of f. + * + * When f is not writable, return -1 if f has no space to save the + * compressed data. + * When f is writable and it has no space to save the compressed + * data, do fflush first; if f still has no space to save the + * compressed data, return -1. 
*/ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, @@ -617,7 +623,14 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t); if (blen < compressBound(size)) { -return 0; +if (!qemu_file_is_writable(f)) { +return -1; +} +qemu_fflush(f); +blen = IO_BUF_SIZE - sizeof(int32_t); +if (blen < compressBound(size)) { +return -1; +} } if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *), (Bytef *)p, size, level) != Z_OK) { @@ -625,7 +638,13 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, return 0; } qemu_put_be32(f, blen); +if (f->ops->writev_buffer) { +add_to_iovec(f, f->buf + f->buf_index, blen); +} f->buf_index += blen; +if (f->buf_index == IO_BUF_SIZE) { +qemu_fflush(f); +} return blen + sizeof(int32_t); } diff --git a/migration/ram.c b/migration/ram.c index 704f6a9..4de13c2 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -817,7 +817,11 @@ static int do_compress_ram_page(CompressParam *param) RAM_SAVE_FLAG_COMPRESS_PAGE); blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE, migrate_compress_level()); -bytes_sent += blen; +if (blen < 0) { +error_report("Insufficient buffer for compressed data!"); +} else { +bytes_sent += blen; +} return bytes_sent; } -- 1.7.1
[Qemu-devel] [PATCH v3 2/2] migration: refine ram_save_compressed_page
Use qemu_put_compression_data to do the compression directly instead of using do_compress_ram_page, avoid some data copy. very small improvement, at the same time, add code to check if the compression is successful. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 28 +--- 1 files changed, 13 insertions(+), 15 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 4de13c2..9be00eb 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -922,24 +922,20 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, uint64_t *bytes_transferred) { int pages = -1; -uint64_t bytes_xmit; +uint64_t bytes_xmit = 0; uint8_t *p; -int ret; +int ret, blen; RAMBlock *block = pss->block; ram_addr_t offset = pss->offset; p = block->host + offset; -bytes_xmit = 0; ret = ram_control_save_page(f, block->offset, offset, TARGET_PAGE_SIZE, _xmit); if (bytes_xmit) { *bytes_transferred += bytes_xmit; pages = 1; } -if (block == last_sent_block) { -offset |= RAM_SAVE_FLAG_CONTINUE; -} if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_xmit > 0) { @@ -959,17 +955,19 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, flush_compressed_data(f); pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { -set_compress_params(_param[0], block, offset); -/* Use the qemu thread to compress the data to make sure the - * first page is sent out before other pages - */ -bytes_xmit = do_compress_ram_page(_param[0]); -acct_info.norm_pages++; -qemu_put_qemu_file(f, comp_param[0].file); -*bytes_transferred += bytes_xmit; -pages = 1; +/* Make sure the first page is sent out before other pages */ +bytes_xmit = save_page_header(f, block, offset | + RAM_SAVE_FLAG_COMPRESS_PAGE); +blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, + migrate_compress_level()); +if (blen > 0) { +*bytes_transferred += bytes_xmit + blen; +acct_info.norm_pages++; +pages = 1; +} } } else { +offset |= 
RAM_SAVE_FLAG_CONTINUE; pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { pages = compress_page_with_multi_thread(f, block, offset, -- 1.7.1
[Qemu-devel] [PATCH v3 0/2] Fix flaw of qemu_put_compression_data
The implementation of qemu_put_compression_data only considers the case where the QEMUFile is not writable; it can't work with a writable QEMUFile and does not provide any measure to prevent users from using it with a writable QEMUFile. For safety, it should be improved to avoid some issues. ram_save_compressed_page can be refined based on the change of qemu_put_compression_data, very small improvement, but code looks better. Liang Li (2): qemu-file: Fix qemu_put_compression_data flaw migration: refine ram_save_compressed_page migration/qemu-file.c | 23 +-- migration/ram.c | 34 ++ 2 files changed, 39 insertions(+), 18 deletions(-)
[Qemu-devel] [PATCH v5 1/2] configure: detect ifunc and avx2 attribute
Detect if the compiler can support the ifun and avx2, if so, set CONFIG_AVX2_OPT which will be used to turn on the avx2 instruction optimization. Suggested-by: Paolo Bonzini <pbonz...@redhat.com> Suggested-by: Peter Maydell <peter.mayd...@linaro.org> Signed-off-by: Liang Li <liang.z...@intel.com> --- configure | 21 + 1 file changed, 21 insertions(+) diff --git a/configure b/configure index 3506e44..a50dcf5 100755 --- a/configure +++ b/configure @@ -311,6 +311,7 @@ smartcard="" libusb="" usb_redir="" opengl="" +avx2_opt="no" zlib="yes" lzo="" snappy="" @@ -1832,6 +1833,21 @@ EOF fi ## +# avx2 optimization requirement check + +cat > $TMPC << EOF +static void bar(void) {} +static void *bar_ifunc(void) {return (void*) bar;} +static void foo(void) __attribute__((ifunc("bar_ifunc"))); +int main(void) { foo(); return 0; } +EOF +if compile_prog "-mavx2" "" ; then +if readelf --syms $TMPE |grep "IFUNC.*foo" >/dev/null 2>&1; then +avx2_opt="yes" +fi +fi + +# # zlib check if test "$zlib" != "no" ; then @@ -4922,6 +4938,7 @@ echo "bzip2 support $bzip2" echo "NUMA host support $numa" echo "tcmalloc support $tcmalloc" echo "jemalloc support $jemalloc" +echo "avx2 optimization $avx2_opt" if test "$sdl_too_old" = "yes"; then echo "-> Your SDL version is too old - please upgrade to have SDL support" @@ -5306,6 +5323,10 @@ if test "$opengl" = "yes" ; then echo "OPENGL_LIBS=$opengl_libs" >> $config_host_mak fi +if test "$avx2_opt" = "yes" ; then + echo "CONFIG_AVX2_OPT=y" >> $config_host_mak +fi + if test "$lzo" = "yes" ; then echo "CONFIG_LZO=y" >> $config_host_mak fi -- 1.9.1
[Qemu-devel] [PATCH v5 2/2] cutils: add avx2 instruction optimization
buffer_find_nonzero_offset() is a hot function during live migration. Now it use SSE2 instructions for optimization. For platform supports AVX2 instructions, use AVX2 instructions for optimization can help to improve the performance of buffer_find_nonzero_offset() about 30% comparing to SSE2. Live migration can be faster with this optimization, the test result shows that for an 8GiB RAM idle guest just boots, this patch can help to shorten the total live migration time about 6%. This patch use the ifunc mechanism to select the proper function when running, for platform supports AVX2, execute the AVX2 instructions, else, execute the original instructions. Signed-off-by: Liang Li <liang.z...@intel.com> Suggested-by: Paolo Bonzini <pbonz...@redhat.com> Suggested-by: Richard Henderson <r...@twiddle.net> Reviewed-by: Paolo Bonzini <pbonz...@redhat.com> --- include/qemu-common.h | 8 +--- util/cutils.c | 118 -- 2 files changed, 115 insertions(+), 11 deletions(-) diff --git a/include/qemu-common.h b/include/qemu-common.h index 22b010c..f4c8c24 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -483,13 +483,7 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size); #endif #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 -static inline bool -can_use_buffer_find_nonzero_offset(const void *buf, size_t len) -{ -return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR - * sizeof(VECTYPE)) == 0 -&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0); -} +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len); size_t buffer_find_nonzero_offset(const void *buf, size_t len); /* diff --git a/util/cutils.c b/util/cutils.c index cfeb848..5c8ee5c 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -161,6 +161,14 @@ int qemu_fdatasync(int fd) #endif } +static bool +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len) +{ +return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR + * sizeof(VECTYPE)) == 0 +&& ((uintptr_t) buf) % 
sizeof(VECTYPE) == 0); +} + /* * Searches for an area with non-zero content in a buffer * @@ -169,8 +177,8 @@ int qemu_fdatasync(int fd) * and addr must be a multiple of sizeof(VECTYPE) due to * restriction of optimizations in this function. * - * can_use_buffer_find_nonzero_offset() can be used to check - * these requirements. + * can_use_buffer_find_nonzero_offset_inner() can be used to + * check these requirements. * * The return value is the offset of the non-zero area rounded * down to a multiple of sizeof(VECTYPE) for the first @@ -181,13 +189,13 @@ int qemu_fdatasync(int fd) * If the buffer is all zero the return value is equal to len. */ -size_t buffer_find_nonzero_offset(const void *buf, size_t len) +static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len) { const VECTYPE *p = buf; const VECTYPE zero = (VECTYPE){0}; size_t i; -assert(can_use_buffer_find_nonzero_offset(buf, len)); +assert(can_use_buffer_find_nonzero_offset_inner(buf, len)); if (!len) { return 0; @@ -216,6 +224,108 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len) return i * sizeof(VECTYPE); } +#ifdef CONFIG_AVX2_OPT +#pragma GCC push_options +#pragma GCC target("avx2") +#include +#include + +#define AVX2_VECTYPE__m256i +#define AVX2_SPLAT(p) _mm256_set1_epi8(*(p)) +#define AVX2_ALL_EQ(v1, v2) \ +(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0x) +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2)) + +static bool +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ +return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR + * sizeof(AVX2_VECTYPE)) == 0 +&& ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); +} + +static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ +const AVX2_VECTYPE *p = buf; +const AVX2_VECTYPE zero = (AVX2_VECTYPE){0}; +size_t i; + +assert(can_use_buffer_find_nonzero_offset_avx2(buf, len)); + +if (!len) { +return 0; +} + +for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) { +if 
(!AVX2_ALL_EQ(p[i], zero)) { +return i * sizeof(AVX2_VECTYPE); +} +} + +for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; + i < len / sizeof(AVX2_VECTYPE); + i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { +AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]); +AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]); +AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]); +AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]); +AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1); +AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2,
[Qemu-devel] [PATCH v5 0/2] add avx2 instruction optimization
buffer_find_nonzero_offset() is a hot function during live migration. Now it use SSE2 instructions for optimization. For platform supports AVX2 instructions, use the AVX2 instructions for optimization can help to improve the performance of zero page checking about 30% comparing to SSE2. Live migration can be faster with this optimization, the test result shows that for an 8GB RAM idle guest, this patch can help to shorten the total live migration time about 6%. This patch use the ifunc mechanism to select the proper function when running, for platform supports AVX2, execute the AVX2 instructions, else, execute the original instructions. With this patch, the QEMU binary can run on both platforms support AVX2 or not. Compiler which doesn't support the AVX2 and ifunc attribute can also build the source code successfully. v5 -> v4 changes: * Enhance the ifunc attribute detection (Paolo's suggestion) v3 -> v4 changes: * Use the GCC #pragma to make things simple (Paolo's suggestion) * Put avx2 related code in cutils.c (Richard's suggestion) * Change the configure, detect ifunc and avx2 attributes together v2 -> v3 changes: * Detect the ifunc attribute support (Paolo's suggestion) * Use the ifunc attribute instead of the inline asm (Richard's suggestion) * Change the configure (Juan's suggestion) Liang Li (2): configure: detect ifunc and avx2 attribute cutils: add avx2 instruction optimization configure | 21 + include/qemu-common.h | 8 +--- util/cutils.c | 118 -- 3 files changed, 136 insertions(+), 11 deletions(-) -- 1.9.1
[Qemu-devel] [PATCH] migration: remove useless code.
Since 's->state' will be set in migrate_init(), there is no need to set it before calling migrate_init(). The code and the related comments can be removed. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/migration.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index aaca451..ae38242 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1006,12 +1006,6 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, return; } -/* We are starting a new migration, so we want to start in a clean - state. This change is only needed if previous migration - failed/was cancelled. We don't use migrate_set_state() because - we are setting the initial state, not changing it. */ -s->state = MIGRATION_STATUS_NONE; - s = migrate_init(); if (strstart(uri, "tcp:", )) { -- 1.9.1
[Qemu-devel] [PATCH v4 0/3] add avx2 instruction optimization
buffer_find_nonzero_offset() is a hot function during live migration. Now it use SSE2 instructions for optimization. For platform supports AVX2 instructions, use the AVX2 instructions for optimization can help to improve the performance about 30% comparing to SSE2. Zero page check can be faster with this optimization, the test result shows that for an 8GB RAM idle guest, this patch can help to shorten the total live migration time about 6%. This patch use the ifunc mechanism to select the proper function when running, for platform supports AVX2, execute the AVX2 instructions, else, execute the original instructions. With this patch, the QEMU binary can run on both platforms support AVX2 or not. Compiler which doesn't support the AVX2 and ifunc attribute can also build the source code successfully. v3 -> v4 changes: * Use the GCC #pragma to make things simple (Paolo's suggestion) * Put avx2 related code in cutils.c (Richard's suggestion) * Change the configure, detect ifunc and avx2 attributes together v2 -> v3 changes: * Detect the ifunc attribute support (Paolo's suggestion) * Use the ifunc attribute instead of the inline asm (Richard's suggestion) * Change the configure (Juan's suggestion) Liang Li (2): configure: detect ifunc and avx2 attribute cutils: add avx2 instruction optimization configure | 20 + include/qemu-common.h | 8 +--- util/cutils.c | 118 -- 3 files changed, 135 insertions(+), 11 deletions(-) -- 1.9.1
[Qemu-devel] [PATCH v4 2/2] cutils: add avx2 instruction optimization
buffer_find_nonzero_offset() is a hot function during live migration. Now it use SSE2 instructions for optimization. For platform supports AVX2 instructions, use AVX2 instructions for optimization can help to improve the performance about 30% comparing to SSE2. Zero page check can be faster with this optimization, the test result shows that for an 8GiB RAM idle guest just boots, this patch can help to shorten the total live migration time about 6%. This patch use the ifunc mechanism to select the proper function when running, for platform supports AVX2, execute the AVX2 instructions, else, execute the original instructions. Signed-off-by: Liang Li <liang.z...@intel.com> --- include/qemu-common.h | 8 +--- util/cutils.c | 118 -- 2 files changed, 115 insertions(+), 11 deletions(-) diff --git a/include/qemu-common.h b/include/qemu-common.h index 22b010c..f4c8c24 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -483,13 +483,7 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size); #endif #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 -static inline bool -can_use_buffer_find_nonzero_offset(const void *buf, size_t len) -{ -return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR - * sizeof(VECTYPE)) == 0 -&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0); -} +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len); size_t buffer_find_nonzero_offset(const void *buf, size_t len); /* diff --git a/util/cutils.c b/util/cutils.c index cfeb848..5c8ee5c 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -161,6 +161,14 @@ int qemu_fdatasync(int fd) #endif } +static bool +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len) +{ +return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR + * sizeof(VECTYPE)) == 0 +&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0); +} + /* * Searches for an area with non-zero content in a buffer * @@ -169,8 +177,8 @@ int qemu_fdatasync(int fd) * and addr must be a multiple of 
sizeof(VECTYPE) due to * restriction of optimizations in this function. * - * can_use_buffer_find_nonzero_offset() can be used to check - * these requirements. + * can_use_buffer_find_nonzero_offset_inner() can be used to + * check these requirements. * * The return value is the offset of the non-zero area rounded * down to a multiple of sizeof(VECTYPE) for the first @@ -181,13 +189,13 @@ int qemu_fdatasync(int fd) * If the buffer is all zero the return value is equal to len. */ -size_t buffer_find_nonzero_offset(const void *buf, size_t len) +static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len) { const VECTYPE *p = buf; const VECTYPE zero = (VECTYPE){0}; size_t i; -assert(can_use_buffer_find_nonzero_offset(buf, len)); +assert(can_use_buffer_find_nonzero_offset_inner(buf, len)); if (!len) { return 0; @@ -216,6 +224,108 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len) return i * sizeof(VECTYPE); } +#ifdef CONFIG_AVX2_OPT +#pragma GCC push_options +#pragma GCC target("avx2") +#include +#include + +#define AVX2_VECTYPE__m256i +#define AVX2_SPLAT(p) _mm256_set1_epi8(*(p)) +#define AVX2_ALL_EQ(v1, v2) \ +(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0x) +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2)) + +static bool +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ +return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR + * sizeof(AVX2_VECTYPE)) == 0 +&& ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); +} + +static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ +const AVX2_VECTYPE *p = buf; +const AVX2_VECTYPE zero = (AVX2_VECTYPE){0}; +size_t i; + +assert(can_use_buffer_find_nonzero_offset_avx2(buf, len)); + +if (!len) { +return 0; +} + +for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) { +if (!AVX2_ALL_EQ(p[i], zero)) { +return i * sizeof(AVX2_VECTYPE); +} +} + +for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; + i < len / sizeof(AVX2_VECTYPE); + i += 
BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { +AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]); +AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]); +AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]); +AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]); +AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1); +AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3); +if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) { +break; +} +} + +return i * sizeof(AVX2_VECTYPE); +} + +static bool avx2_support(void) +{ +int a, b,
[Qemu-devel] [PATCH v4 1/2] configure: detect ifunc and avx2 attribute
Detect if the compiler can support the ifun and avx2, if so, set CONFIG_AVX2_OPT which will be used to turn on the avx2 instruction optimization. Signed-off-by: Liang Li <liang.z...@intel.com> --- configure | 20 1 file changed, 20 insertions(+) diff --git a/configure b/configure index 44ac9ab..b7f4661 100755 --- a/configure +++ b/configure @@ -310,6 +310,7 @@ smartcard="" libusb="" usb_redir="" opengl="" +avx2_opt="" zlib="yes" lzo="" snappy="" @@ -1827,6 +1828,20 @@ EOF fi ## +# avx2 optimization requirement check + +cat > $TMPC << EOF +static void bar(void) {} +static void foo(void) __attribute__((ifunc("bar"))); +int main(void) { foo(); return 0; } +EOF +if compile_prog "" "-mavx2" ; then +avx2_opt="yes" +else +avx2_opt="no" +fi + +# # zlib check if test "$zlib" != "no" ; then @@ -4855,6 +4870,7 @@ echo "bzip2 support $bzip2" echo "NUMA host support $numa" echo "tcmalloc support $tcmalloc" echo "jemalloc support $jemalloc" +echo "avx2 optimization $avx2_opt" if test "$sdl_too_old" = "yes"; then echo "-> Your SDL version is too old - please upgrade to have SDL support" @@ -5236,6 +5252,10 @@ if test "$opengl" = "yes" ; then echo "OPENGL_LIBS=$opengl_libs" >> $config_host_mak fi +if test "$avx2_opt" = "yes" ; then + echo "CONFIG_AVX2_OPT=y" >> $config_host_mak +fi + if test "$lzo" = "yes" ; then echo "CONFIG_LZO=y" >> $config_host_mak fi -- 1.9.1
[Qemu-devel] [PATCH v4 0/2] add avx2 instruction optimization
buffer_find_nonzero_offset() is a hot function during live migration. Now it use SSE2 instructions for optimization. For platform supports AVX2 instructions, use the AVX2 instructions for optimization can help to improve the performance about 30% comparing to SSE2. Zero page check can be faster with this optimization, the test result shows that for an 8GB RAM idle guest, this patch can help to shorten the total live migration time about 6%. This patch use the ifunc mechanism to select the proper function when running, for platform supports AVX2, execute the AVX2 instructions, else, execute the original instructions. With this patch, the QEMU binary can run on both platforms support AVX2 or not. Compiler which doesn't support the AVX2 and ifunc attribute can also build the source code successfully. v3 -> v4 changes: * Use the GCC #pragma to make things simple (Paolo's suggestion) * Put avx2 related code in cutils.c (Richard's suggestion) * Change the configure, detect ifunc and avx2 attributes together v2 -> v3 changes: * Detect the ifunc attribute support (Paolo's suggestion) * Use the ifunc attribute instead of the inline asm (Richard's suggestion) * Change the configure (Juan's suggestion) Liang Li (2): configure: detect ifunc and avx2 attribute cutils: add avx2 instruction optimization configure | 20 + include/qemu-common.h | 8 +--- util/cutils.c | 118 -- 3 files changed, 135 insertions(+), 11 deletions(-) -- 1.9.1
[Qemu-devel] [PATCH RESEND v2 2/2] migration: refine ram_save_compressed_page
Use qemu_put_compression_data to do the compression directly instead of using do_compress_ram_page, avoid some data copy. very small improvement, but the code looks better. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 20 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 4e606ab..48ebef0 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -917,22 +917,18 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block, uint64_t *bytes_transferred) { int pages = -1; -uint64_t bytes_xmit; +uint64_t bytes_xmit = 0; uint8_t *p; int ret; p = block->host + offset; -bytes_xmit = 0; ret = ram_control_save_page(f, block->offset, offset, TARGET_PAGE_SIZE, _xmit); if (bytes_xmit) { *bytes_transferred += bytes_xmit; pages = 1; } -if (block == last_sent_block) { -offset |= RAM_SAVE_FLAG_CONTINUE; -} if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_xmit > 0) { @@ -952,17 +948,17 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block, flush_compressed_data(f); pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { -set_compress_params(_param[0], block, offset); -/* Use the qemu thread to compress the data to make sure the - * first page is sent out before other pages - */ -bytes_xmit = do_compress_ram_page(_param[0]); -acct_info.norm_pages++; -qemu_put_qemu_file(f, comp_param[0].file); +/* Make sure the first page is sent out before other pages */ +bytes_xmit = save_page_header(f, block, offset | + RAM_SAVE_FLAG_COMPRESS_PAGE); +bytes_xmit += qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, +migrate_compress_level()); *bytes_transferred += bytes_xmit; +acct_info.norm_pages++; pages = 1; } } else { +offset |= RAM_SAVE_FLAG_CONTINUE; pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { pages = compress_page_with_multi_thread(f, block, offset, -- 1.9.1
[Qemu-devel] (no subject)
Subject: [PATCH RESEND v2 0/2] Fix flaw of qemu_put_compression_data The implementation of qemu_put_compression_data only considers the case where the QEMUFile is not writable; it can't work with a writable QEMUFile and does not provide any measure to prevent users from using it with a writable QEMUFile. For safety, it should be improved to avoid some issues. ram_save_compressed_page can be refined based on the change of qemu_put_compression_data, very small improvement, but code looks better. Liang Li (2): qemu-file: Fix qemu_put_compression_data flaw migration: refine ram_save_compressed_page migration/qemu-file.c | 23 +-- migration/ram.c | 20 2 files changed, 29 insertions(+), 14 deletions(-) -- 1.9.1
[Qemu-devel] [PATCH RESEND v2 1/2] qemu-file: Fix qemu_put_compression_data flaw
Current qemu_put_compression_data can only work with no writable QEMUFile, and can't work with the writable QEMUFile. But it does not provide any measure to prevent users from using it with a writable QEMUFile. We should fix this flaw to make it works with writable QEMUFile. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/qemu-file.c | 23 +-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 0bbd257..b956ab6 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -606,8 +606,14 @@ uint64_t qemu_get_be64(QEMUFile *f) return v; } -/* compress size bytes of data start at p with specific compression +/* Compress size bytes of data start at p with specific compression * level and store the compressed data to the buffer of f. + * + * When f is not writable, return 0 if f has no space to save the + * compressed data. + * When f is wirtable and it has no space to save the compressed data, + * do fflush first, if f still has no space to save the compressed + * data, return 0. */ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, @@ -616,7 +622,14 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t); if (blen < compressBound(size)) { -return 0; +if (!qemu_file_is_writable(f)) { +return 0; +} +qemu_fflush(f); +blen = IO_BUF_SIZE - sizeof(int32_t); +if (blen < compressBound(size)) { +return 0; +} } if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *), (Bytef *)p, size, level) != Z_OK) { @@ -624,7 +637,13 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, return 0; } qemu_put_be32(f, blen); +if (f->ops->writev_buffer) { +add_to_iovec(f, f->buf + f->buf_index, blen); +} f->buf_index += blen; +if (f->buf_index == IO_BUF_SIZE) { +qemu_fflush(f); +} return blen + sizeof(int32_t); } -- 1.9.1
[Qemu-devel] [PATCH RESEND v2 0/2] Fix flaw of qemu_put_compression_data
The implementation of qemu_put_compression_data only considers the case where the QEMUFile is not writable; it can't work with a writable QEMUFile and does not provide any measure to prevent users from using it with a writable QEMUFile. For safety, it should be improved to avoid some issues. ram_save_compressed_page can be refined based on the change of qemu_put_compression_data, very small improvement, but code looks better. Liang Li (2): qemu-file: Fix qemu_put_compression_data flaw migration: refine ram_save_compressed_page migration/qemu-file.c | 23 +-- migration/ram.c | 20 2 files changed, 29 insertions(+), 14 deletions(-) -- 1.9.1
[Qemu-devel] [PATCH] migration: not send zero page header in ram bulk stage
Now that the VM's RAM pages are initialized to zero (the VM's RAM is allocated with mmap() and the MAP_ANONYMOUS option, or mmap() without MAP_SHARED if hugetlbfs is used), there is no need to send the zero page header to the destination. For a guest that just uses a small portion of RAM, this change can avoid allocating all the guest's RAM pages in the destination node after live migration. Another benefit is that the destination QEMU can save lots of CPU cycles for zero page checking. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 4e606ab..c4821d1 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -705,10 +705,12 @@ static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset, if (is_zero_range(p, TARGET_PAGE_SIZE)) { acct_info.dup_pages++; -*bytes_transferred += save_page_header(f, block, - offset | RAM_SAVE_FLAG_COMPRESS); -qemu_put_byte(f, 0); -*bytes_transferred += 1; +if (!ram_bulk_stage) { +*bytes_transferred += save_page_header(f, block, offset | + RAM_SAVE_FLAG_COMPRESS); +qemu_put_byte(f, 0); +*bytes_transferred += 1; +} pages = 1; } -- 1.9.1
[Qemu-devel] [v3 1/3] cutils: add avx2 instruction optimization
buffer_find_nonzero_offset() is a hot function during live migration. Now it use SSE2 intructions for optimization. For platform supports AVX2 instructions, use the AVX2 instructions for optimization can help to improve the performance about 30% comparing to SSE2. Zero page check can be faster with this optimization, the test result shows that for an 8GB RAM idle guest, this patch can help to shorten the total live migration time about 6%. This patch use the ifunc mechanism to select the proper function when running, for platform supports AVX2, excute the AVX2 instructions, else, excute the original code. Signed-off-by: Liang Li <liang.z...@intel.com> --- include/qemu-common.h | 13 +- util/Makefile.objs | 2 ++ util/buffer-zero-avx2.c | 54 util/cutils.c | 65 +++-- 4 files changed, 125 insertions(+), 9 deletions(-) create mode 100644 util/buffer-zero-avx2.c diff --git a/include/qemu-common.h b/include/qemu-common.h index 405364f..be8ba79 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -484,15 +484,14 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size); #endif #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 -static inline bool -can_use_buffer_find_nonzero_offset(const void *buf, size_t len) -{ -return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR - * sizeof(VECTYPE)) == 0 -&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0); -} +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len); size_t buffer_find_nonzero_offset(const void *buf, size_t len); +#if defined CONFIG_IFUNC && defined CONFIG_AVX2 +bool can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len); +size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len); +#endif + /* * helper to parse debug environment variables */ diff --git a/util/Makefile.objs b/util/Makefile.objs index 89dd80e..a130b35 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -1,4 +1,5 @@ util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o 
+util-obj-$(CONFIG_AVX2) += buffer-zero-avx2.o util-obj-$(CONFIG_POSIX) += compatfd.o util-obj-$(CONFIG_POSIX) += event_notifier-posix.o util-obj-$(CONFIG_POSIX) += mmap-alloc.o @@ -30,3 +31,4 @@ util-obj-y += qemu-coroutine-sleep.o util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o util-obj-y += buffer.o util-obj-y += timed-average.o +buffer-zero-avx2.o-cflags := $(AVX2_CFLAGS) diff --git a/util/buffer-zero-avx2.c b/util/buffer-zero-avx2.c new file mode 100644 index 000..b9da0e3 --- /dev/null +++ b/util/buffer-zero-avx2.c @@ -0,0 +1,54 @@ +#include "qemu-common.h" + +#if defined CONFIG_IFUNC && defined CONFIG_AVX2 +#include +#define AVX2_VECTYPE__m256i +#define AVX2_SPLAT(p) _mm256_set1_epi8(*(p)) +#define AVX2_ALL_EQ(v1, v2) \ +(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0x) +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2)) + +inline bool +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ +return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR + * sizeof(AVX2_VECTYPE)) == 0 +&& ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); +} + +size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ +const AVX2_VECTYPE *p = buf; +const AVX2_VECTYPE zero = (AVX2_VECTYPE){0}; +size_t i; + +assert(can_use_buffer_find_nonzero_offset_avx2(buf, len)); + +if (!len) { +return 0; +} + +for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) { +if (!AVX2_ALL_EQ(p[i], zero)) { +return i * sizeof(AVX2_VECTYPE); +} +} + +for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; + i < len / sizeof(AVX2_VECTYPE); + i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { +AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]); +AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]); +AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]); +AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]); +AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1); +AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3); +if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) { +break; +} +} + 
+return i * sizeof(AVX2_VECTYPE); +} + +#endif diff --git a/util/cutils.c b/util/cutils.c index cfeb848..3631c02 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "qemu/sockets.h" #include "qemu/iov.h" @@ -161,6 +162,14 @@ int qemu_fdatasync(int fd) #endif } +static inline bool +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len) +{ +return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR + * size
[Qemu-devel] [v3 3/3] configure: add options to config avx2
Add the '--enable-avx2' & '--disable-avx2' option so as to config the AVX2 instruction optimization. If '--disable-avx2' is not set, configure will detect if the compiler can support AVX2 option, if yes, AVX2 optimization is eabled, else disabled. Signed-off-by: Liang Li <liang.z...@intel.com> --- configure | 30 ++ 1 file changed, 30 insertions(+) diff --git a/configure b/configure index 394db3b..94e45fa 100755 --- a/configure +++ b/configure @@ -311,6 +311,7 @@ libusb="" usb_redir="" opengl="" ifunc="" +avx2="" zlib="yes" lzo="" snappy="" @@ -1063,6 +1064,10 @@ for opt do ;; --enable-usb-redir) usb_redir="yes" ;; + --disable-avx2) avx2="no" + ;; + --enable-avx2) avx2="yes" + ;; --disable-zlib-test) zlib="no" ;; --disable-lzo) lzo="no" @@ -1378,6 +1383,7 @@ disabled with --disable-FEATURE, default is enabled if available: smartcard smartcard support (libcacard) libusb libusb (for usb passthrough) usb-redir usb network redirection support + avx2support of avx2 instruction lzo support of lzo compression library snappy support of snappy compression library bzip2 support of bzip2 compression library @@ -1841,6 +1847,23 @@ else ifunc="no" fi + +# avx2 check + +if test "$avx2" != "no" ; then +cat > $TMPC << EOF +int main(void) { return 0; } +EOF +if compile_prog "" "-mavx2" ; then +avx2="yes" +else +if test "$avx2" = "yes" ; then +feature_not_found "avx2" "Your compiler don't support avx2" +fi +avx2="no" +fi +fi + # # zlib check @@ -4853,6 +4876,7 @@ echo "TPM passthrough $tpm_passthrough" echo "QOM debugging $qom_cast_debug" echo "vhdx $vhdx" echo "ifunc support $ifunc" +echo "avx2 support $avx2" echo "lzo support $lzo" echo "snappy support$snappy" echo "bzip2 support $bzip2" @@ -5241,6 +5265,12 @@ if test "$ifunc" = "yes" ; then echo "CONFIG_IFUNC=y" >> $config_host_mak fi +if test "$avx2" = "yes" ; then + avx2_cflags=" -mavx2" + echo "AVX2_CFLAGS=$avx2_cflags" >> $config_host_mak + echo "CONFIG_AVX2=y" >> $config_host_mak +fi + if test "$lzo" = "yes" ; then echo 
"CONFIG_LZO=y" >> $config_host_mak fi -- 1.9.1
[Qemu-devel] [v3 2/3] configure: detect ifunc attribute
Detect if the compiler can support the ifunc attribute, the avx2 optimization depends on ifunc attribute. Signed-off-by: Liang Li <liang.z...@intel.com> --- configure | 20 1 file changed, 20 insertions(+) diff --git a/configure b/configure index b9552fd..394db3b 100755 --- a/configure +++ b/configure @@ -310,6 +310,7 @@ smartcard="" libusb="" usb_redir="" opengl="" +ifunc="" zlib="yes" lzo="" snappy="" @@ -1827,6 +1828,20 @@ EOF fi ## +# ifunc check + +cat > $TMPC << EOF +static void bar(void) {} +static void foo(void) __attribute__((ifunc("bar"))); +int main(void) { foo(); return 0; } +EOF +if compile_prog "" "" ; then +ifunc="yes" +else +ifunc="no" +fi + +# # zlib check if test "$zlib" != "no" ; then @@ -4837,6 +4852,7 @@ echo "libssh2 support $libssh2" echo "TPM passthrough $tpm_passthrough" echo "QOM debugging $qom_cast_debug" echo "vhdx $vhdx" +echo "ifunc support $ifunc" echo "lzo support $lzo" echo "snappy support$snappy" echo "bzip2 support $bzip2" @@ -5221,6 +5237,10 @@ if test "$opengl" = "yes" ; then echo "OPENGL_LIBS=$opengl_libs" >> $config_host_mak fi +if test "$ifunc" = "yes" ; then + echo "CONFIG_IFUNC=y" >> $config_host_mak +fi + if test "$lzo" = "yes" ; then echo "CONFIG_LZO=y" >> $config_host_mak fi -- 1.9.1
[Qemu-devel] [v3 0/3] add avx2 instruction optimization
buffer_find_nonzero_offset() is a hot function during live migration. Now it uses SSE2 instructions for optimization. For platforms that support AVX2 instructions, using the AVX2 instructions for optimization can help to improve the performance by about 30% compared to SSE2. Zero page checking can be faster with this optimization; the test result shows that for an 8GB RAM idle guest, this patch can help to shorten the total live migration time by about 6%. This patch uses the ifunc mechanism to select the proper function at run time: for platforms that support AVX2, execute the AVX2 instructions; else, execute the original code. With this patch, the QEMU binary can run on platforms both with and without AVX2 support. A compiler which doesn't support the AVX2 or ifunc attribute can still build the source code successfully. v2 -> v3 changes: * Detect the ifunc attribute support (Paolo's suggestion) * Use the ifunc attribute instead of the inline asm (Richard's suggestion) * Change the configure (Juan's suggestion) Liang Li (3): cutils: add avx2 instruction optimization configure: detect ifunc attribute configure: add options to config avx2 configure | 50 + include/qemu-common.h | 13 +- util/Makefile.objs | 2 ++ util/buffer-zero-avx2.c | 54 util/cutils.c | 65 +++-- 5 files changed, 175 insertions(+), 9 deletions(-) create mode 100644 util/buffer-zero-avx2.c -- 1.9.1
[Qemu-devel] [v2 1/2] qemu-file: Fix qemu_put_compression_data flaw
Current qemu_put_compression_data can only work with no writable QEMUFile, and can't work with the writable QEMUFile. But it does not provide any measure to prevent users from using it with a writable QEMUFile. We should fix this flaw to make it works with writable QEMUFile. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/qemu-file.c | 23 +-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 0bbd257..b956ab6 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -606,8 +606,14 @@ uint64_t qemu_get_be64(QEMUFile *f) return v; } -/* compress size bytes of data start at p with specific compression +/* Compress size bytes of data start at p with specific compression * level and store the compressed data to the buffer of f. + * + * When f is not writable, return 0 if f has no space to save the + * compressed data. + * When f is wirtable and it has no space to save the compressed data, + * do fflush first, if f still has no space to save the compressed + * data, return 0. */ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, @@ -616,7 +622,14 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t); if (blen < compressBound(size)) { -return 0; +if (!qemu_file_is_writable(f)) { +return 0; +} +qemu_fflush(f); +blen = IO_BUF_SIZE - sizeof(int32_t); +if (blen < compressBound(size)) { +return 0; +} } if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *), (Bytef *)p, size, level) != Z_OK) { @@ -624,7 +637,13 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, return 0; } qemu_put_be32(f, blen); +if (f->ops->writev_buffer) { +add_to_iovec(f, f->buf + f->buf_index, blen); +} f->buf_index += blen; +if (f->buf_index == IO_BUF_SIZE) { +qemu_fflush(f); +} return blen + sizeof(int32_t); } -- 1.9.1
[Qemu-devel] [v2 0/2] Fix flaw of qemu_put_compression_data
The implementation of qemu_put_compression_data only considers the case where the QEMUFile is not writable; it can't work with a writable QEMUFile and does not provide any measure to prevent users from using it with a writable QEMUFile. For safety, it should be improved to avoid some issues. ram_save_compressed_page can be refined based on the change of qemu_put_compression_data, very small improvement, but code looks better. Liang Li (2): qemu-file: Fix qemu_put_compression_data flaw migration: refine ram_save_compressed_page migration/qemu-file.c | 23 +-- migration/ram.c | 20 2 files changed, 29 insertions(+), 14 deletions(-) -- 1.9.1
[Qemu-devel] [v2 2/2] migration: refine ram_save_compressed_page
Use qemu_put_compression_data to do the compression directly instead of using do_compress_ram_page, avoid some data copy. very small improvement, but the code looks better. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 20 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 1eb155a..44b3edc 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -911,22 +911,18 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block, uint64_t *bytes_transferred) { int pages = -1; -uint64_t bytes_xmit; +uint64_t bytes_xmit = 0; uint8_t *p; int ret; p = block->host + offset; -bytes_xmit = 0; ret = ram_control_save_page(f, block->offset, offset, TARGET_PAGE_SIZE, _xmit); if (bytes_xmit) { *bytes_transferred += bytes_xmit; pages = 1; } -if (block == last_sent_block) { -offset |= RAM_SAVE_FLAG_CONTINUE; -} if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_xmit > 0) { @@ -946,17 +942,17 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block, flush_compressed_data(f); pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { -set_compress_params(_param[0], block, offset); -/* Use the qemu thread to compress the data to make sure the - * first page is sent out before other pages - */ -bytes_xmit = do_compress_ram_page(_param[0]); -acct_info.norm_pages++; -qemu_put_qemu_file(f, comp_param[0].file); +/* Make sure the first page is sent out before other pages */ +bytes_xmit = save_page_header(f, block, offset | + RAM_SAVE_FLAG_COMPRESS_PAGE); +bytes_xmit += qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, +migrate_compress_level()); *bytes_transferred += bytes_xmit; +acct_info.norm_pages++; pages = 1; } } else { +offset |= RAM_SAVE_FLAG_CONTINUE; pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { pages = compress_page_with_multi_thread(f, block, offset, -- 1.9.1
[Qemu-devel] [PATCH 1/2] qemu-file: fix flaws of qemu_put_compression_data
There are some flaws in qemu_put_compression_data, this patch tries to fix it. Now it can be used by other code. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/qemu-file.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 0bbd257..ef9cd4a 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -616,7 +616,9 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t); if (blen < compressBound(size)) { -return 0; +if (f->ops->writev_buffer || f->ops->put_buffer) { +qemu_fflush(f); +} } if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *), (Bytef *)p, size, level) != Z_OK) { @@ -624,7 +626,13 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, return 0; } qemu_put_be32(f, blen); +if (f->ops->writev_buffer) { +add_to_iovec(f, f->buf + f->buf_index, blen); +} f->buf_index += blen; +if (f->buf_index == IO_BUF_SIZE) { +qemu_fflush(f); +} return blen + sizeof(int32_t); } -- 1.9.1
[Qemu-devel] [PATCH 0/2] fix the flaws of qemu_put_compression_data
This patch series fixes the flaws in the qemu_put_compression_data function and cleans up the code based on the change. Liang Li (2): qemu-file: fix flaws of qemu_put_compression_data migration: code clean up. migration/qemu-file.c | 10 +- migration/ram.c | 20 2 files changed, 17 insertions(+), 13 deletions(-) -- 1.9.1
[Qemu-devel] [PATCH 2/2] migration: code clean up.
Use qemu_put_compression_data to do the compression directly instead of using do_compress_ram_page, avoid some data copy. very small improvement, but the code looks better. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 20 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 1eb155a..44b3edc 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -911,22 +911,18 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block, uint64_t *bytes_transferred) { int pages = -1; -uint64_t bytes_xmit; +uint64_t bytes_xmit = 0; uint8_t *p; int ret; p = block->host + offset; -bytes_xmit = 0; ret = ram_control_save_page(f, block->offset, offset, TARGET_PAGE_SIZE, _xmit); if (bytes_xmit) { *bytes_transferred += bytes_xmit; pages = 1; } -if (block == last_sent_block) { -offset |= RAM_SAVE_FLAG_CONTINUE; -} if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_xmit > 0) { @@ -946,17 +942,17 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block, flush_compressed_data(f); pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { -set_compress_params(_param[0], block, offset); -/* Use the qemu thread to compress the data to make sure the - * first page is sent out before other pages - */ -bytes_xmit = do_compress_ram_page(_param[0]); -acct_info.norm_pages++; -qemu_put_qemu_file(f, comp_param[0].file); +/* Make sure the first page is sent out before other pages */ +bytes_xmit = save_page_header(f, block, offset | + RAM_SAVE_FLAG_COMPRESS_PAGE); +bytes_xmit += qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, +migrate_compress_level()); *bytes_transferred += bytes_xmit; +acct_info.norm_pages++; pages = 1; } } else { +offset |= RAM_SAVE_FLAG_CONTINUE; pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { pages = compress_page_with_multi_thread(f, block, offset, -- 1.9.1
[Qemu-devel] [PATCH 1/2] qemu-file: fix flaws of qemu_put_compression_data
There are some flaws in qemu_put_compression_data, this patch tries to fix it. Now it can be used by other code. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/qemu-file.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 0bbd257..ef9cd4a 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -616,7 +616,9 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t); if (blen < compressBound(size)) { -return 0; +if (f->ops->writev_buffer || f->ops->put_buffer) { +qemu_fflush(f); +} } if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *), (Bytef *)p, size, level) != Z_OK) { @@ -624,7 +626,13 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, return 0; } qemu_put_be32(f, blen); +if (f->ops->writev_buffer) { +add_to_iovec(f, f->buf + f->buf_index, blen); +} f->buf_index += blen; +if (f->buf_index == IO_BUF_SIZE) { +qemu_fflush(f); +} return blen + sizeof(int32_t); } -- 1.9.1
[Qemu-devel] [PATCH 2/2] migration: code clean up.
Use qemu_put_compression_data to do the compression directly instead of using do_compress_ram_page, avoid some data copy. very small improvement, but the code looks better. Signed-off-by: Liang Li <liang.z...@intel.com> --- migration/ram.c | 20 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 1eb155a..44b3edc 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -911,22 +911,18 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block, uint64_t *bytes_transferred) { int pages = -1; -uint64_t bytes_xmit; +uint64_t bytes_xmit = 0; uint8_t *p; int ret; p = block->host + offset; -bytes_xmit = 0; ret = ram_control_save_page(f, block->offset, offset, TARGET_PAGE_SIZE, _xmit); if (bytes_xmit) { *bytes_transferred += bytes_xmit; pages = 1; } -if (block == last_sent_block) { -offset |= RAM_SAVE_FLAG_CONTINUE; -} if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_xmit > 0) { @@ -946,17 +942,17 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block, flush_compressed_data(f); pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { -set_compress_params(_param[0], block, offset); -/* Use the qemu thread to compress the data to make sure the - * first page is sent out before other pages - */ -bytes_xmit = do_compress_ram_page(_param[0]); -acct_info.norm_pages++; -qemu_put_qemu_file(f, comp_param[0].file); +/* Make sure the first page is sent out before other pages */ +bytes_xmit = save_page_header(f, block, offset | + RAM_SAVE_FLAG_COMPRESS_PAGE); +bytes_xmit += qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, +migrate_compress_level()); *bytes_transferred += bytes_xmit; +acct_info.norm_pages++; pages = 1; } } else { +offset |= RAM_SAVE_FLAG_CONTINUE; pages = save_zero_page(f, block, offset, p, bytes_transferred); if (pages == -1) { pages = compress_page_with_multi_thread(f, block, offset, -- 1.9.1
[Qemu-devel] [PATCH 0/2] fix the flaws of qemu_put_compression_data
This patch series fixes the flaws in the qemu_put_compression_data function and cleans up the code based on the change. Liang Li (2): qemu-file: fix flaws of qemu_put_compression_data migration: code clean up. migration/qemu-file.c | 10 +- migration/ram.c | 20 2 files changed, 17 insertions(+), 13 deletions(-) -- 1.9.1
[Qemu-devel] [v2 RESEND 1/2] cutils: add avx2 instruction optimization
buffer_find_nonzero_offset() is a hot function during live migration. Now it use SSE2 intructions for optimization. For platform supports AVX2 instructions, use the AVX2 instructions for optimization can help to improve the performance about 30% comparing to SSE2. Zero page check can be faster with this optimization, the test result shows that for an 8GB RAM idle guest, this patch can help to shorten the total live migration time about 6%. This patch use the ifunc mechanism to select the proper function when running, for platform supports AVX2, excute the AVX2 instructions, else, excute the original code. Signed-off-by: Liang Li <liang.z...@intel.com> --- include/qemu-common.h | 28 +++-- util/Makefile.objs| 2 ++ util/avx2.c | 68 +++ util/cutils.c | 47 +-- 4 files changed, 136 insertions(+), 9 deletions(-) create mode 100644 util/avx2.c diff --git a/include/qemu-common.h b/include/qemu-common.h index 2f74540..9fa7501 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -484,15 +484,29 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size); #endif #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 -static inline bool -can_use_buffer_find_nonzero_offset(const void *buf, size_t len) -{ -return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR - * sizeof(VECTYPE)) == 0 -&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0); -} +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len); + size_t buffer_find_nonzero_offset(const void *buf, size_t len); +extern bool +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len); + +extern size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len); + +extern bool +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len); + +extern size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len); + +__asm__(".type can_use_buffer_find_nonzero_offset, \%gnu_indirect_function"); +__asm__(".type buffer_find_nonzero_offset, \%gnu_indirect_function"); + + 
+void *can_use_buffer_find_nonzero_offset_ifunc(void) \ + __asm__("can_use_buffer_find_nonzero_offset"); + +void *buffer_find_nonzero_offset_ifunc(void) \ + __asm__("buffer_find_nonzero_offset"); /* * helper to parse debug environment variables */ diff --git a/util/Makefile.objs b/util/Makefile.objs index d7cc399..6aacad7 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -1,4 +1,5 @@ util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o +util-obj-y += avx2.o util-obj-$(CONFIG_POSIX) += compatfd.o util-obj-$(CONFIG_POSIX) += event_notifier-posix.o util-obj-$(CONFIG_POSIX) += mmap-alloc.o @@ -29,3 +30,4 @@ util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o util-obj-y += qemu-coroutine-sleep.o util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o util-obj-y += buffer.o +avx2.o-cflags := $(AVX2_CFLAGS) diff --git a/util/avx2.c b/util/avx2.c new file mode 100644 index 000..d90289b --- /dev/null +++ b/util/avx2.c @@ -0,0 +1,68 @@ +#include "qemu-common.h" + +#ifdef __AVX2__ +#include +#define AVX2_VECTYPE__m256i +#define AVX2_SPLAT(p) _mm256_set1_epi8(*(p)) +#define AVX2_ALL_EQ(v1, v2) \ +(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0x) +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2)) + +inline bool +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ +return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR + * sizeof(AVX2_VECTYPE)) == 0 +&& ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); +} + +size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) +{ +const AVX2_VECTYPE *p = buf; +const AVX2_VECTYPE zero = (AVX2_VECTYPE){0}; +size_t i; + +assert(can_use_buffer_find_nonzero_offset_avx2(buf, len)); + +if (!len) { +return 0; +} + +for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) { +if (!AVX2_ALL_EQ(p[i], zero)) { +return i * sizeof(AVX2_VECTYPE); +} +} + +for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; + i < len / sizeof(AVX2_VECTYPE); + i += 
BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { +AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]); +AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]); +AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]); +AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]); +AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1); +AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3); +if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) { +break; +} +} + +return i * sizeof(AVX2_VECTYPE); +} + +#else +/* u