[Qemu-devel] [QEMU v2 7/9] bitmap: Add a new bitmap_move function

2016-07-14 Thread Liang Li
Sometimes we need to move a portion of a bitmap to another place
in a large bitmap. If the source and destination overlap,
bitmap_copy() cannot work correctly, so we need a new function to
do this work.
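
For example (illustrative only, not part of the patch), shifting the
contents of a bitmap down by one word has overlapping source and
destination, which requires memmove() semantics:

    unsigned long bmap[4];
    bitmap_fill(bmap, 4 * BITS_PER_LONG);
    /* src (bmap + 1) and dst (bmap) overlap, so the memcpy()-based
     * bitmap_copy() would be undefined behavior here */
    bitmap_move(bmap, bmap + 1, 3 * BITS_PER_LONG);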

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 include/qemu/bitmap.h | 13 +
 1 file changed, 13 insertions(+)

diff --git a/include/qemu/bitmap.h b/include/qemu/bitmap.h
index ec5146f..6ac89ca 100644
--- a/include/qemu/bitmap.h
+++ b/include/qemu/bitmap.h
@@ -37,6 +37,7 @@
  * bitmap_set(dst, pos, nbits) Set specified bit area
  * bitmap_set_atomic(dst, pos, nbits)   Set specified bit area with atomic ops
  * bitmap_clear(dst, pos, nbits)   Clear specified bit area
+ * bitmap_move(dst, src, nbits) Move *src to *dst
  * bitmap_test_and_clear_atomic(dst, pos, nbits)Test and clear area
  * bitmap_find_next_zero_area(buf, len, pos, n, mask)  Find bit free area
  */
@@ -136,6 +137,18 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
     }
 }
 
+static inline void bitmap_move(unsigned long *dst, const unsigned long *src,
+                               long nbits)
+{
+    if (small_nbits(nbits)) {
+        unsigned long tmp = *src;
+        *dst = tmp;
+    } else {
+        long len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+        memmove(dst, src, len);
+    }
+}
+
 static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
  const unsigned long *src2, long nbits)
 {
-- 
1.9.1




[Qemu-devel] [QEMU v2 9/9] migration: skip free pages during live migration

2016-07-14 Thread Liang Li
After sending out the request for free pages, the live migration
process starts without waiting for the free page bitmap to become
ready. If the free page bitmap is not ready when the first
migration_bitmap_sync() after ram_save_setup() runs, the free page
bitmap is ignored; this means the free pages are not filtered out
in that case.

The current implementation cannot work with postcopy: if postcopy
is enabled, we simply ignore the free pages. This will be made to
work later.
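
The flow as implemented below: ram_save_setup() allocates the free
page bitmap and issues the request; every migration_bitmap_sync()
sets ignore_freepage_rsp, so a response that arrives after a sync
has already run is discarded; and ram_handle_free_page() only
consumes a REQ_DONE response whose req_id matches the last request,
after which it filters the free pages out of the migration bitmap,
re-syncs the dirty bitmap and leaves the bulk stage.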

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 86 +
 1 file changed, 86 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index 815bc0e..223d243 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -43,6 +43,8 @@
 #include "trace.h"
 #include "exec/ram_addr.h"
 #include "qemu/rcu_queue.h"
+#include "sysemu/balloon.h"
+#include "sysemu/kvm.h"
 
 #ifdef DEBUG_MIGRATION_RAM
 #define DPRINTF(fmt, ...) \
@@ -228,6 +230,8 @@ static QemuMutex migration_bitmap_mutex;
 static uint64_t migration_dirty_pages;
 static uint32_t last_version;
 static bool ram_bulk_stage;
+static bool ignore_freepage_rsp;
+static uint64_t free_page_req_id;
 
 /* used by the search for pages to send */
 struct PageSearchStatus {
@@ -244,6 +248,7 @@ static struct BitmapRcu {
 struct rcu_head rcu;
 /* Main migration bitmap */
 unsigned long *bmap;
+unsigned long *free_page_bmap;
 /* bitmap of pages that haven't been sent even once
  * only maintained and used in postcopy at the moment
  * where it's used to send the dirtymap at the start
@@ -636,6 +641,7 @@ static void migration_bitmap_sync(void)
     rcu_read_unlock();
     qemu_mutex_unlock(&migration_bitmap_mutex);
 
+    ignore_freepage_rsp = true;
     trace_migration_bitmap_sync_end(migration_dirty_pages
                                     - num_dirty_pages_init);
     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
@@ -1409,6 +1415,7 @@ static void migration_bitmap_free(struct BitmapRcu *bmap)
 {
     g_free(bmap->bmap);
     g_free(bmap->unsentmap);
+    g_free(bmap->free_page_bmap);
     g_free(bmap);
 }
 
@@ -1479,6 +1486,77 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
     }
 }
 
+static void filter_out_guest_free_page(unsigned long *free_page_bmap,
+                                       long nbits)
+{
+    long i, page_count = 0, len;
+    unsigned long *bitmap;
+
+    tighten_guest_free_page_bmap(free_page_bmap);
+    qemu_mutex_lock(&migration_bitmap_mutex);
+    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+    slow_bitmap_complement(bitmap, free_page_bmap, nbits);
+
+    len = (last_ram_offset() >> TARGET_PAGE_BITS) / BITS_PER_LONG;
+    for (i = 0; i < len; i++) {
+        page_count += hweight_long(bitmap[i]);
+    }
+
+    migration_dirty_pages = page_count;
+    qemu_mutex_unlock(&migration_bitmap_mutex);
+}
+
+static void ram_request_free_page(unsigned long *bmap, unsigned long max_pfn)
+{
+    BalloonReqStatus status;
+
+    free_page_req_id++;
+    status = balloon_get_free_pages(bmap, max_pfn / BITS_PER_BYTE,
+                                    free_page_req_id);
+    if (status == REQ_START) {
+        ignore_freepage_rsp = false;
+    }
+}
+
+static void ram_handle_free_page(void)
+{
+    unsigned long nbits, req_id = 0;
+    RAMBlock *pc_ram_block;
+    BalloonReqStatus status;
+
+    status = balloon_free_page_ready(&req_id);
+    switch (status) {
+    case REQ_DONE:
+        if (req_id != free_page_req_id) {
+            return;
+        }
+        rcu_read_lock();
+        pc_ram_block = QLIST_FIRST_RCU(&ram_list.blocks);
+        nbits = pc_ram_block->used_length >> TARGET_PAGE_BITS;
+        filter_out_guest_free_page(migration_bitmap_rcu->free_page_bmap,
+                                   nbits);
+        rcu_read_unlock();
+
+        qemu_mutex_lock_iothread();
+        migration_bitmap_sync();
+        qemu_mutex_unlock_iothread();
+        /*
+         * bulk stage assumes (in migration_bitmap_find_and_reset_dirty) that
+         * every page is dirty, which is no longer true at this point.
+         */
+        ram_bulk_stage = false;
+        last_seen_block = NULL;
+        last_sent_block = NULL;
+        last_offset = 0;
+        break;
+    case REQ_ERROR:
+        ignore_freepage_rsp = true;
+        error_report("failed to get free page");
+        break;
+    default:
+        break;
+    }
+}
+
 /*
  * 'expected' is the value you expect the bitmap mostly to be full
  * of; it won't bother printing lines that are all this value.
@@ -1944,6 +2022,11 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     qemu_mutex_unlock_ramlist();
     qemu_mutex_unlock_iothread();
 
+    if (balloon_free_pages_support() && !migrate_postcopy_ram()) {
+        unsigned long max_pfn = get_guest_max_pfn();
+        migration_bitmap_rcu->free_page_bmap = bitmap_new(max_pfn);
+        ram_

[Qemu-devel] [QEMU v2 6/9] balloon: migrate vq elem to destination

2016-07-14 Thread Liang Li
After live migration, 'guest-stats' can't get the expected memory
status in the guest. This issue is caused by commit 4eae2a657d.
The value of 's->stats_vq_elem' will be NULL after live migration,
and the check in the function 'balloon_stats_poll_cb()' will
prevent 'virtio_notify()' from executing, so the guest will not
update its memory status.

Commit 4eae2a657d is doing the right thing, but 's->stats_vq_elem'
should be treated as part of the balloon device state and migrated
to the destination if it's not NULL, so that everything keeps
working.

For the same reason, 's->misc_vq_elem' should be migrated to the
destination too.
Michael has another idea for solving this issue, but he is busy at
the moment; this patch can be used for testing before his patch is
ready.
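
For reference, this is the check in balloon_stats_poll_cb() that
short-circuits the notification when 'stats_vq_elem' is NULL (quoted
from the current code):

    if (s->stats_vq_elem == NULL || !balloon_stats_supported(s)) {
        /* re-schedule */
        balloon_stats_change_timer(s, s->stats_poll_interval);
        return;
    }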

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 hw/virtio/virtio-balloon.c | 36 ++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index b0c09a7..f9bf26d 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -31,6 +31,7 @@
 #include "hw/virtio/virtio-access.h"
 
 #define BALLOON_PAGE_SIZE  (1 << VIRTIO_BALLOON_PFN_SHIFT)
+#define BALLOON_VERSION 2
 
 static void balloon_page(void *addr, int deflate)
 {
@@ -610,15 +611,33 @@ static void virtio_balloon_save(QEMUFile *f, void *opaque)
 static void virtio_balloon_save_device(VirtIODevice *vdev, QEMUFile *f)
 {
 VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+uint16_t elem_num = 0;
 
 qemu_put_be32(f, s->num_pages);
 qemu_put_be32(f, s->actual);
+if (s->stats_vq_elem != NULL) {
+elem_num = 1;
+}
+qemu_put_be16(f, elem_num);
+if (elem_num) {
+qemu_put_virtqueue_element(f, s->stats_vq_elem);
+}
+
+elem_num = 0;
+if (s->misc_vq_elem != NULL) {
+elem_num = 1;
+}
+qemu_put_be16(f, elem_num);
+if (elem_num) {
+qemu_put_virtqueue_element(f, s->misc_vq_elem);
+}
 }
 
 static int virtio_balloon_load(QEMUFile *f, void *opaque, int version_id)
 {
-    if (version_id != 1)
+    if (version_id < 1 || version_id > BALLOON_VERSION) {
         return -EINVAL;
+    }
 
 return virtio_load(VIRTIO_DEVICE(opaque), f, version_id);
 }
@@ -627,9 +646,22 @@ static int virtio_balloon_load_device(VirtIODevice *vdev, QEMUFile *f,
   int version_id)
 {
 VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+uint16_t elem_num = 0;
 
 s->num_pages = qemu_get_be32(f);
 s->actual = qemu_get_be32(f);
+if (version_id == BALLOON_VERSION) {
+elem_num = qemu_get_be16(f);
+if (elem_num == 1) {
+s->stats_vq_elem =
+qemu_get_virtqueue_element(f, sizeof(VirtQueueElement));
+}
+elem_num = qemu_get_be16(f);
+if (elem_num == 1) {
+s->misc_vq_elem =
+qemu_get_virtqueue_element(f, sizeof(VirtQueueElement));
+}
+}
 
 if (balloon_stats_enabled(s)) {
 balloon_stats_change_timer(s, s->stats_poll_interval);
@@ -665,7 +697,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
 reset_stats(s);
 s->req_status = REQ_INIT;
 
-register_savevm(dev, "virtio-balloon", -1, 1,
+register_savevm(dev, "virtio-balloon", -1, BALLOON_VERSION,
 virtio_balloon_save, virtio_balloon_load, s);
 }
 
-- 
1.9.1




[Qemu-devel] [QEMU v2 4/9] virtio-balloon: update linux head file for new feature

2016-07-14 Thread Liang Li
Update the new feature bit definition for the new virtqueue and
the request header struct to stay consistent with the kernel side.
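
With these definitions the request header is a fixed 16 bytes on the
wire: cmd (2 bytes) + reserved[3] (6 bytes) + param (8 bytes), so
either side can validate a received buffer with a simple length
check, as the matching guest driver does with sizeof(vb->req_hdr).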

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 include/standard-headers/linux/virtio_balloon.h | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/include/standard-headers/linux/virtio_balloon.h 
b/include/standard-headers/linux/virtio_balloon.h
index d577359..797a868 100644
--- a/include/standard-headers/linux/virtio_balloon.h
+++ b/include/standard-headers/linux/virtio_balloon.h
@@ -35,6 +35,7 @@
 #define VIRTIO_BALLOON_F_STATS_VQ  1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM    2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_PAGE_BITMAP   3 /* Send page info with bitmap */
+#define VIRTIO_BALLOON_F_MISC_VQ   4 /* Misc info virtqueue */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
@@ -101,4 +102,25 @@ struct balloon_bmap_hdr {
__virtio64 bmap_len;
 };
 
+enum balloon_req_id {
+   /* Get free pages information */
+   BALLOON_GET_FREE_PAGES,
+};
+
+enum balloon_flag {
+   /* Have more data for a request */
+   BALLOON_FLAG_CONT,
+   /* No more data for a request */
+   BALLOON_FLAG_DONE,
+};
+
+struct balloon_req_hdr {
+   /* Used to distinguish different request */
+   __virtio16 cmd;
+   /* Reserved */
+   __virtio16 reserved[3];
+   /* Request parameter */
+   __virtio64 param;
+};
+
 #endif /* _LINUX_VIRTIO_BALLOON_H */
-- 
1.9.1




[Qemu-devel] [QEMU v2 2/9] virtio-balloon: update linux head file

2016-07-14 Thread Liang Li
Update the new feature bit definition and the page bitmap header
struct to stay consistent with the kernel side.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 include/standard-headers/linux/virtio_balloon.h | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/include/standard-headers/linux/virtio_balloon.h 
b/include/standard-headers/linux/virtio_balloon.h
index 9d06ccd..d577359 100644
--- a/include/standard-headers/linux/virtio_balloon.h
+++ b/include/standard-headers/linux/virtio_balloon.h
@@ -34,6 +34,7 @@
 #define VIRTIO_BALLOON_F_MUST_TELL_HOST    0 /* Tell before reclaiming pages */
 #define VIRTIO_BALLOON_F_STATS_VQ  1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM    2 /* Deflate balloon on OOM */
+#define VIRTIO_BALLOON_F_PAGE_BITMAP   3 /* Send page info with bitmap */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
@@ -82,4 +83,22 @@ struct virtio_balloon_stat {
__virtio64 val;
 } QEMU_PACKED;
 
+/* Page bitmap header structure */
+struct balloon_bmap_hdr {
+   /* Used to distinguish different request */
+   __virtio16 cmd;
+   /* Shift width of page in the bitmap */
+   __virtio16 page_shift;
+   /* flag used to identify different status */
+   __virtio16 flag;
+   /* Reserved */
+   __virtio16 reserved;
+   /* ID of the request */
+   __virtio64 req_id;
+   /* The pfn of 0 bit in the bitmap */
+   __virtio64 start_pfn;
+   /* The length of the bitmap, in bytes */
+   __virtio64 bmap_len;
+};
+
 #endif /* _LINUX_VIRTIO_BALLOON_H */
-- 
1.9.1




[Qemu-devel] [QEMU v2 1/9] virtio-balloon: Remove needless precompiled directive

2016-07-14 Thread Liang Li
Since there is a wrapper around madvise(), the virtio-balloon
code is able to work without the preprocessor directive, so the
directive can be removed.
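
A rough sketch of why the guard is redundant (the real wrapper lives
in QEMU's osdep code; this is only the idea, not the actual
implementation):

    int qemu_madvise(void *addr, size_t len, int advice)
    {
    #if defined(CONFIG_MADVISE)
        return madvise(addr, len, advice);
    #elif defined(CONFIG_POSIX_MADVISE)
        return posix_madvise(addr, len, advice);
    #else
        errno = ENOSYS;   /* no madvise() on this host: harmless failure */
        return -1;
    #endif
    }

On hosts without madvise(), the call degrades to a harmless runtime
failure instead of a build error, so the #if defined(__linux__)
guard around balloon_page() adds nothing.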

Signed-off-by: Liang Li <liang.z...@intel.com>
Suggested-by: Thomas Huth <th...@redhat.com>
---
 hw/virtio/virtio-balloon.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 1a22e6d..62931b3 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -34,13 +34,11 @@
 
 static void balloon_page(void *addr, int deflate)
 {
-#if defined(__linux__)
 if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
  kvm_has_sync_mmu())) {
 qemu_madvise(addr, BALLOON_PAGE_SIZE,
 deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED);
 }
-#endif
 }
 
 static const char *balloon_stat_names[] = {
-- 
1.9.1




[Qemu-devel] [QEMU v2 8/9] kvm: Add two new arch specific functions

2016-07-14 Thread Liang Li
Add a new function to get the VM's max pfn and a new function
to filter out the holes in the raw free page bitmap to get a
tight free page bitmap. They are implemented on x86 and should
be implemented on the other arches for live migration
optimization.
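
As a concrete illustration of the x86 implementation below: a guest
whose RAM is split (the exact split is machine-dependent) into
below_4g_mem_size = 3GB and above_4g_mem_size = 3GB has its high
memory mapped starting at 4GB, so get_guest_max_pfn() reports
(4GB + 3GB) >> TARGET_PAGE_BITS, and tighten_guest_free_page_bmap()
moves the bits for the above-4G range down so that they immediately
follow the below-4G bits, then clears the bits that covered the
PCI hole.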

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 include/sysemu/kvm.h | 18 ++
 target-arm/kvm.c | 14 ++
 target-i386/kvm.c| 37 +
 target-mips/kvm.c| 14 ++
 target-ppc/kvm.c | 14 ++
 target-s390x/kvm.c   | 14 ++
 6 files changed, 111 insertions(+)

diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index ad6f837..fd0956f 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -230,6 +230,24 @@ int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
   target_ulong len, int type);
 void kvm_remove_all_breakpoints(CPUState *cpu);
 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap);
+
+/**
+ * tighten_guest_free_page_bmap - process the free page bitmap from
+ * the guest to get a tight page bitmap which does not contain
+ * holes.
+ * @bmap: raw guest free page bitmap
+ * Returns: a tight guest free page bitmap; the nth bit in the
+ * returned bitmap and the nth bit in the migration bitmap
+ * should correspond to the same guest RAM page.
+ */
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap);
+
+/**
+ * get_guest_max_pfn - get the max pfn of guest
+ * Returns: the max pfn of guest
+ */
+unsigned long get_guest_max_pfn(void);
+
 #ifndef _WIN32
 int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset);
 #endif
diff --git a/target-arm/kvm.c b/target-arm/kvm.c
index 5c2bd7a..1cfccb3 100644
--- a/target-arm/kvm.c
+++ b/target-arm/kvm.c
@@ -626,3 +626,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
 {
     return (data - 32) & 0xffff;
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+/* To be done */
+
+return 0;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+/* To be done */
+
+return bmap;
+}
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 9327523..f0503dd 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -3378,3 +3378,40 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
 {
 abort();
 }
+
+#define _4G (1ULL << 32)
+
+unsigned long get_guest_max_pfn(void)
+{
+PCMachineState *pcms = PC_MACHINE(current_machine);
+ram_addr_t above_4g_mem = pcms->above_4g_mem_size;
+unsigned long max_pfn;
+
+if (above_4g_mem) {
+max_pfn = (_4G + above_4g_mem) >> TARGET_PAGE_BITS;
+} else {
+max_pfn = pcms->below_4g_mem_size >> TARGET_PAGE_BITS;
+}
+
+return max_pfn;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+PCMachineState *pcms = PC_MACHINE(current_machine);
+ram_addr_t above_4g_mem = pcms->above_4g_mem_size;
+
+if (above_4g_mem) {
+unsigned long *src, *dst, len, pos;
+ram_addr_t below_4g_mem = pcms->below_4g_mem_size;
+src = bmap + (_4G >> TARGET_PAGE_BITS) / BITS_PER_LONG;
+dst = bmap + (below_4g_mem >> TARGET_PAGE_BITS) / BITS_PER_LONG;
+bitmap_move(dst, src, above_4g_mem >> TARGET_PAGE_BITS);
+
+pos = (above_4g_mem + below_4g_mem) >> TARGET_PAGE_BITS;
+len = (_4G - below_4g_mem) >> TARGET_PAGE_BITS;
+bitmap_clear(bmap, pos, len);
+}
+
+return bmap;
+}
diff --git a/target-mips/kvm.c b/target-mips/kvm.c
index f3f832d..ba39827 100644
--- a/target-mips/kvm.c
+++ b/target-mips/kvm.c
@@ -1047,3 +1047,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
 {
 abort();
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+/* To be done */
+
+return 0;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+/* To be done */
+
+return bmap;
+}
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 884d564..ff67b3e 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -2630,3 +2630,17 @@ int kvmppc_enable_hwrng(void)
 
 return kvmppc_enable_hcall(kvm_state, H_RANDOM);
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+/* To be done */
+
+return 0;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+/* To be done */
+
+return bmap;
+}
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index 2991bff..2e5c763 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -2271,3 +2271,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
 {
 abort();
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+/* To be done */
+
+return 0;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+/* To be done */
+
+return bmap;
+}
-- 
1.9.1




[Qemu-devel] [QEMU v2 3/9] virtio-balloon: speed up inflating & deflating process

2016-07-14 Thread Liang Li
The implementation of the current virtio-balloon is not very
efficient. The time spent in the different stages of inflating
the balloon to 7GB of an 8GB idle guest:

a. allocating pages (6.5%)
b. sending PFNs to host (68.3%)
c. address translation (6.1%)
d. madvise (19%)

It takes about 4126ms for the inflating process to complete.
Debugging shows that the bottlenecks are stages b and d.

If we use a bitmap to send the page info instead of the PFNs, we
can reduce the overhead in stage b quite a lot. Furthermore, we
can do the address translation and call madvise() with a bulk of
RAM pages instead of the current page-by-page way, so the overhead
of stages c and d can also be reduced a lot.
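
As a back-of-the-envelope illustration of the savings in stage b:
7GB of 4KB pages is 7 * 2^18 = 1,835,008 pages. Sent as 32-bit PFNs
that is about 7MB of payload; sent as a bitmap with one bit per page
it is about 224KB, roughly a 32x reduction in the data pushed
through the virtqueue.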

This patch is the QEMU-side implementation, which is intended to
speed up the inflating & deflating process by adding a new feature
to the virtio-balloon device. With this new feature, inflating the
balloon to 7GB of an 8GB idle guest only takes 590ms; the
performance improvement is about 85%.

TODO: optimize stage a by allocating/freeing a chunk of pages
instead of a single page at a time.

Signed-off-by: Liang Li <liang.z...@intel.com>
Suggested-by: Michael S. Tsirkin <m...@redhat.com>
---
 hw/virtio/virtio-balloon.c | 144 ++---
 1 file changed, 123 insertions(+), 21 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 62931b3..a7152c8 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -52,6 +52,77 @@ static const char *balloon_stat_names[] = {
[VIRTIO_BALLOON_S_NR] = NULL
 };
 
+static void do_balloon_bulk_pages(ram_addr_t base_pfn, uint16_t page_shift,
+  unsigned long len, bool deflate)
+{
+ram_addr_t size, processed, chunk, base;
+MemoryRegionSection section = {.mr = NULL};
+
+size = len << page_shift;
+base = base_pfn << page_shift;
+
+for (processed = 0; processed < size; processed += chunk) {
+chunk = size - processed;
+while (chunk >= TARGET_PAGE_SIZE) {
+section = memory_region_find(get_system_memory(),
+ base + processed, chunk);
+if (!section.mr) {
+chunk = QEMU_ALIGN_DOWN(chunk / 2, TARGET_PAGE_SIZE);
+} else {
+break;
+}
+}
+
+if (section.mr &&
+(int128_nz(section.size) && memory_region_is_ram(section.mr))) {
+void *addr = section.offset_within_region +
+   memory_region_get_ram_ptr(section.mr);
+qemu_madvise(addr, chunk,
+ deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED);
+} else {
+qemu_log_mask(LOG_GUEST_ERROR,
+  "Invalid guest RAM range [0x%lx, 0x%lx]\n",
+  base + processed, chunk);
+chunk = TARGET_PAGE_SIZE;
+}
+}
+}
+
+static void balloon_bulk_pages(struct balloon_bmap_hdr *hdr,
+   unsigned long *bitmap, bool deflate)
+{
+ram_addr_t base_pfn = hdr->start_pfn;
+uint16_t page_shift = hdr->page_shift;
+unsigned long len = hdr->bmap_len;
+unsigned long current = 0, end = len * BITS_PER_BYTE;
+
+if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
+ kvm_has_sync_mmu())) {
+while (current < end) {
+unsigned long one = find_next_bit(bitmap, end, current);
+
+if (one < end) {
+unsigned long pages, zero;
+
+zero = find_next_zero_bit(bitmap, end, one + 1);
+if (zero >= end) {
+pages = end - one;
+} else {
+pages = zero - one;
+}
+
+if (pages) {
+do_balloon_bulk_pages(base_pfn + one, page_shift,
+  pages, deflate);
+}
+current = one + pages;
+} else {
+current = one;
+}
+}
+}
+}
+
 /*
  * reset_stats - Mark all items in the stats array as unset
  *
@@ -72,6 +143,13 @@ static bool balloon_stats_supported(const VirtIOBalloon *s)
 return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
 }
 
+static bool balloon_page_bitmap_supported(const VirtIOBalloon *s)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
+}
+
 static bool balloon_stats_enabled(const VirtIOBalloon *s)
 {
 return s->stats_poll_interval > 0;
@@ -213,32 +291,54 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 for (;;) {
 size_t offset = 0;
 uint32_t pfn;
+
 elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
 if (!elem) {

[Qemu-devel] [QEMU v2 0/9] Fast (de)inflating & fast live migration

2016-07-14 Thread Liang Li
This patch set intends to make two optimizations: one speeds up
the (de)inflating process of the virtio balloon, and the other
speeds up the live migration process. We put them together because
both of them require changes to the virtio balloon spec.

The main idea of speeding up the (de)inflating process is to use a
bitmap to send the page information to the host instead of the
PFNs, to reduce the overhead of virtio data transmission, address
translation and madvise(). This can help to improve performance by
about 85%.

The idea of speeding up live migration is to skip the processing of
the guest's free pages in the first round of data copy, reducing
needless data processing; this can help to save a lot of CPU cycles
and network bandwidth. We get the guest's free page information
through the virtqueue of virtio-balloon and filter out these free
pages during live migration. For an idle 8GB guest, this can help
to shorten the total live migration time from about 2 seconds to
about 500 ms in a 10Gbps network environment.

Changes from v1 to v2:
* Abandon the patch for dropping page cache.
* Get a struct from vq instead of separate variables.
* Use two separate APIs to request free pages and query the status.
* Changed the virtio balloon interface.
* Addressed some of the comments of v1.

Liang Li (9):
  virtio-balloon: Remove needless precompiled directive
  virtio-balloon: update linux head file
  virtio-balloon: speed up inflating & deflating process
  virtio-balloon: update linux head file for new feature
  balloon: get free page info from guest
  balloon: migrate vq elem to destination
  bitmap: Add a new bitmap_move function
  kvm: Add two new arch specific functions
  migration: skip free pages during live migration

 balloon.c   |  47 +++-
 hw/virtio/virtio-balloon.c  | 304 ++--
 include/hw/virtio/virtio-balloon.h  |  18 +-
 include/qemu/bitmap.h   |  13 +
 include/standard-headers/linux/virtio_balloon.h |  41 
 include/sysemu/balloon.h|  18 +-
 include/sysemu/kvm.h|  18 ++
 migration/ram.c |  86 +++
 target-arm/kvm.c|  14 ++
 target-i386/kvm.c   |  37 +++
 target-mips/kvm.c   |  14 ++
 target-ppc/kvm.c|  14 ++
 target-s390x/kvm.c  |  14 ++
 13 files changed, 608 insertions(+), 30 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH v2] balloon: Fix failure of updating guest memory status

2016-07-05 Thread Liang Li
After live migration, 'guest-stats' can't get the expected memory
status in the guest. This issue is caused by commit 4eae2a657d.
The value of 's->stats_vq_elem' will be NULL after live migration,
and the check in the function 'balloon_stats_poll_cb()' will
prevent 'virtio_notify()' from executing, so the guest will not
update its memory status.

Commit 4eae2a657d is doing the right thing, but 's->stats_vq_elem'
should be treated as part of the balloon device state and migrated
to the destination if it's not NULL, so that everything keeps
working.

Signed-off-by: Liang Li <liang.z...@intel.com>
Suggested-by: Paolo Bonzini <pbonz...@redhat.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Ladi Prosek <lpro...@redhat.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
---
 hw/virtio/virtio-balloon.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 557d3f9..64e80c6 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -31,6 +31,7 @@
 #include "hw/virtio/virtio-access.h"
 
 #define BALLOON_PAGE_SIZE  (1 << VIRTIO_BALLOON_PFN_SHIFT)
+#define BALLOON_VERSION 2
 
 static void balloon_page(void *addr, int deflate)
 {
@@ -404,15 +405,24 @@ static void virtio_balloon_save(QEMUFile *f, void *opaque)
 static void virtio_balloon_save_device(VirtIODevice *vdev, QEMUFile *f)
 {
 VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+uint16_t elem_num = 0;
 
 qemu_put_be32(f, s->num_pages);
 qemu_put_be32(f, s->actual);
+if (s->stats_vq_elem != NULL) {
+elem_num = 1;
+}
+qemu_put_be16(f, elem_num);
+if (elem_num) {
+qemu_put_virtqueue_element(f, s->stats_vq_elem);
+}
 }
 
 static int virtio_balloon_load(QEMUFile *f, void *opaque, int version_id)
 {
-    if (version_id != 1)
+    if (version_id < 1 || version_id > BALLOON_VERSION) {
         return -EINVAL;
+    }
 
 return virtio_load(VIRTIO_DEVICE(opaque), f, version_id);
 }
@@ -421,9 +431,17 @@ static int virtio_balloon_load_device(VirtIODevice *vdev, QEMUFile *f,
   int version_id)
 {
 VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+uint16_t elem_num = 0;
 
 s->num_pages = qemu_get_be32(f);
 s->actual = qemu_get_be32(f);
+if (version_id == BALLOON_VERSION) {
+elem_num = qemu_get_be16(f);
+if (elem_num == 1) {
+s->stats_vq_elem =
+qemu_get_virtqueue_element(f, sizeof(VirtQueueElement));
+}
+}
 
 if (balloon_stats_enabled(s)) {
 balloon_stats_change_timer(s, s->stats_poll_interval);
@@ -455,7 +473,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
 
 reset_stats(s);
 
-register_savevm(dev, "virtio-balloon", -1, 1,
+register_savevm(dev, "virtio-balloon", -1, BALLOON_VERSION,
 virtio_balloon_save, virtio_balloon_load, s);
 }
 
-- 
1.8.3.1




[Qemu-devel] [PATCH] balloon: Fix failure of updating guest memory status

2016-06-30 Thread Liang Li
After live migration, 'guest-stats' can't get the expected memory
status in the guest. This issue is caused by commit 4eae2a657d.
The value of 's->stats_vq_elem' will be NULL after live migration,
and the check in the function 'balloon_stats_poll_cb()' will
prevent 'virtio_notify()' from executing, so the guest will not
update its memory status.

Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Ladi Prosek <lpro...@redhat.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
---
 hw/virtio/virtio-balloon.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 557d3f9..cc6947f 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -98,13 +98,19 @@ static void balloon_stats_poll_cb(void *opaque)
 {
 VirtIOBalloon *s = opaque;
 VirtIODevice *vdev = VIRTIO_DEVICE(s);
+VirtQueueElement elem = {0};
 
-if (s->stats_vq_elem == NULL || !balloon_stats_supported(s)) {
+if (!balloon_stats_supported(s)) {
 /* re-schedule */
 balloon_stats_change_timer(s, s->stats_poll_interval);
 return;
 }
 
+    if (s->stats_vq_elem == NULL) {
+        virtqueue_push(s->svq, &elem, 0);
+        virtio_notify(vdev, s->svq);
+        return;
+    }
 virtqueue_push(s->svq, s->stats_vq_elem, s->stats_vq_offset);
 virtio_notify(vdev, s->svq);
 g_free(s->stats_vq_elem);
-- 
1.9.1




[Qemu-devel] [PATCH v2 kernel 7/7] virtio-balloon: tell host vm's free page info

2016-06-29 Thread Liang Li
Support the request for vm's free page information, response with
a page bitmap. QEMU can make use of this free page bitmap to speed
up live migration process by skipping process the free pages.
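
The resulting handshake, as implemented below: the host queues a
balloon_req_hdr (cmd = BALLOON_GET_FREE_PAGES, param = request id)
on the misc virtqueue; the guest walks the pfn space [0, max_pfn)
in pfn_limit-sized chunks and pushes a balloon_bmap_hdr plus bitmap
for each chunk, flagged BALLOON_FLAG_CONT for intermediate chunks
and BALLOON_FLAG_DONE for the final one; it then re-posts the
request header as an in-buffer so the host can issue the next
request.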

Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Cornelia Huck <cornelia.h...@de.ibm.com>
Cc: Amit Shah <amit.s...@redhat.com>
---
 drivers/virtio/virtio_balloon.c | 104 +---
 1 file changed, 98 insertions(+), 6 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 2d18ff6..5ca4ad3 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -62,10 +62,13 @@ module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 
 extern unsigned long get_max_pfn(void);
+extern int get_free_pages(unsigned long start_pfn, unsigned long end_pfn,
+   unsigned long *bitmap, unsigned long len);
+
 
 struct virtio_balloon {
struct virtio_device *vdev;
-   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *misc_vq;
 
/* The balloon servicing is delegated to a freezable workqueue. */
struct work_struct update_balloon_stats_work;
@@ -89,6 +92,8 @@ struct virtio_balloon {
unsigned long pfn_limit;
/* Used to record the processed pfn range */
unsigned long min_pfn, max_pfn, start_pfn, end_pfn;
+   /* Request header */
+   struct balloon_req_hdr req_hdr;
/*
 * The pages we've told the Host we're not using are enqueued
 * at vb_dev_info->pages list.
@@ -373,6 +378,49 @@ static void update_balloon_stats(struct virtio_balloon *vb)
pages_to_bytes(available));
 }
 
+static void update_free_pages_stats(struct virtio_balloon *vb,
+				unsigned long req_id)
+{
+	struct scatterlist sg_in, sg_out;
+	unsigned long pfn = 0, bmap_len, max_pfn;
+	struct virtqueue *vq = vb->misc_vq;
+	struct balloon_bmap_hdr *hdr = vb->bmap_hdr;
+	int ret = 1;
+
+	max_pfn = get_max_pfn();
+	mutex_lock(&vb->balloon_lock);
+	while (pfn < max_pfn) {
+		memset(vb->page_bitmap, 0, vb->bmap_len);
+		ret = get_free_pages(pfn, pfn + vb->pfn_limit,
+			vb->page_bitmap, vb->bmap_len * BITS_PER_BYTE);
+		hdr->cmd = cpu_to_virtio16(vb->vdev, BALLOON_GET_FREE_PAGES);
+		hdr->page_shift = cpu_to_virtio16(vb->vdev, PAGE_SHIFT);
+		hdr->req_id = cpu_to_virtio64(vb->vdev, req_id);
+		hdr->start_pfn = cpu_to_virtio64(vb->vdev, pfn);
+		bmap_len = vb->pfn_limit / BITS_PER_BYTE;
+		if (!ret) {
+			hdr->flag = cpu_to_virtio16(vb->vdev,
+						BALLOON_FLAG_DONE);
+			if (pfn + vb->pfn_limit > max_pfn)
+				bmap_len = (max_pfn - pfn) / BITS_PER_BYTE;
+		} else
+			hdr->flag = cpu_to_virtio16(vb->vdev,
+						BALLOON_FLAG_CONT);
+		hdr->bmap_len = cpu_to_virtio64(vb->vdev, bmap_len);
+		sg_init_one(&sg_out, hdr,
+			 sizeof(struct balloon_bmap_hdr) + bmap_len);
+
+		virtqueue_add_outbuf(vq, &sg_out, 1, vb, GFP_KERNEL);
+		virtqueue_kick(vq);
+		pfn += vb->pfn_limit;
+	}
+
+	sg_init_one(&sg_in, &vb->req_hdr, sizeof(vb->req_hdr));
+	virtqueue_add_inbuf(vq, &sg_in, 1, &vb->req_hdr, GFP_KERNEL);
+	virtqueue_kick(vq);
+	mutex_unlock(&vb->balloon_lock);
+}
+
 /*
  * While most virtqueues communicate guest-initiated requests to the 
hypervisor,
  * the stats queue operates in reverse.  The driver initializes the virtqueue
@@ -511,18 +559,49 @@ static void update_balloon_size_func(struct work_struct 
*work)
queue_work(system_freezable_wq, work);
 }
 
+static void misc_handle_rq(struct virtio_balloon *vb)
+{
+	struct balloon_req_hdr *ptr_hdr;
+	unsigned int len;
+
+	ptr_hdr = virtqueue_get_buf(vb->misc_vq, &len);
+	if (!ptr_hdr || len != sizeof(vb->req_hdr))
+		return;
+
+	switch (ptr_hdr->cmd) {
+	case BALLOON_GET_FREE_PAGES:
+		update_free_pages_stats(vb, ptr_hdr->param);
+		break;
+	default:
+		break;
+	}
+}
+
+static void misc_request(struct virtqueue *vq)
+{
+   struct virtio_balloon *vb = vq->vdev->priv;
+
+   misc_handle_rq(vb);
+}
+
 static int init_vqs(struct virtio_balloon *vb)
 {
-   struct virtqueue *vqs[3];
-  

[Qemu-devel] [PATCH v2 kernel 1/7] virtio-balloon: rework deflate to add page to a list

2016-06-29 Thread Liang Li
This will allow faster notifications using a bitmap down the road.
balloon_pfn_to_page() can be removed because it is no longer used.

Signed-off-by: Liang Li <liang.z...@intel.com>
Signed-off-by: Michael S. Tsirkin <m...@redhat.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Cornelia Huck <cornelia.h...@de.ibm.com>
Cc: Amit Shah <amit.s...@redhat.com>
---
 drivers/virtio/virtio_balloon.c | 22 --
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 476c0e3..8d649a2 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -98,12 +98,6 @@ static u32 page_to_balloon_pfn(struct page *page)
return pfn * VIRTIO_BALLOON_PAGES_PER_PAGE;
 }
 
-static struct page *balloon_pfn_to_page(u32 pfn)
-{
-   BUG_ON(pfn % VIRTIO_BALLOON_PAGES_PER_PAGE);
-   return pfn_to_page(pfn / VIRTIO_BALLOON_PAGES_PER_PAGE);
-}
-
 static void balloon_ack(struct virtqueue *vq)
 {
struct virtio_balloon *vb = vq->vdev->priv;
@@ -176,18 +170,16 @@ static unsigned fill_balloon(struct virtio_balloon *vb, 
size_t num)
return num_allocated_pages;
 }
 
-static void release_pages_balloon(struct virtio_balloon *vb)
+static void release_pages_balloon(struct virtio_balloon *vb,
+				 struct list_head *pages)
 {
-	unsigned int i;
-	struct page *page;
+	struct page *page, *next;
 
-	/* Find pfns pointing at start of each page, get pages and free them. */
-	for (i = 0; i < vb->num_pfns; i += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-		page = balloon_pfn_to_page(virtio32_to_cpu(vb->vdev,
-							   vb->pfns[i]));
+	list_for_each_entry_safe(page, next, pages, lru) {
 		if (!virtio_has_feature(vb->vdev,
 					VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
 			adjust_managed_page_count(page, 1);
+		list_del(&page->lru);
 		put_page(page); /* balloon reference */
 	}
 }
@@ -197,6 +189,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
 	unsigned num_freed_pages;
 	struct page *page;
 	struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
+	LIST_HEAD(pages);
 
/* We can only do one array worth at a time. */
num = min(num, ARRAY_SIZE(vb->pfns));
@@ -208,6 +201,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
 		if (!page)
 			break;
 		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
+		list_add(&page->lru, &pages);
 		vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
}
 
@@ -219,7 +213,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
 	 */
 	if (vb->num_pfns != 0)
 		tell_host(vb, vb->deflate_vq);
-	release_pages_balloon(vb);
+	release_pages_balloon(vb, &pages);
mutex_unlock(>balloon_lock);
return num_freed_pages;
 }
-- 
1.8.3.1




[Qemu-devel] [PATCH v2 kernel 3/7] mm: add a function to get the max pfn

2016-06-29 Thread Liang Li
Expose the function to get the max pfn, so it can be used in the
virtio-balloon device driver.

Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Mel Gorman <mgor...@techsingularity.net>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Cornelia Huck <cornelia.h...@de.ibm.com>
Cc: Amit Shah <amit.s...@redhat.com>
---
 mm/page_alloc.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6903b69..2083b40 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4515,6 +4515,12 @@ void show_free_areas(unsigned int filter)
show_swap_cache_info();
 }
 
+unsigned long get_max_pfn(void)
+{
+   return max_pfn;
+}
+EXPORT_SYMBOL(get_max_pfn);
+
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
zoneref->zone = zone;
-- 
1.8.3.1




[Qemu-devel] [PATCH v2 kernel 6/7] mm: add the related functions to get free page info

2016-06-29 Thread Liang Li
Save the free page info into a page bitmap; it will be used in the
virtio-balloon device driver.
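
A minimal caller sketch (the names below are illustrative, not part
of the patch):

    unsigned long nbits = 1UL << 20;            /* scan pfns [0, 1M) */
    unsigned long *bmap = kzalloc(BITS_TO_LONGS(nbits) * sizeof(long),
                                  GFP_KERNEL);
    int more = bmap ? get_free_pages(0, nbits, bmap, nbits) : 0;
    /* a set bit n means pfn n was free while the zone lock was held;
     * more == 1 means nbits < max_pfn and the scan should continue */

Note the bits are only a snapshot: a page reported free may be
allocated again as soon as the zone lock is dropped.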

Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Mel Gorman <mgor...@techsingularity.net>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Cornelia Huck <cornelia.h...@de.ibm.com>
Cc: Amit Shah <amit.s...@redhat.com>
---
 mm/page_alloc.c | 46 ++
 1 file changed, 46 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2083b40..c2a6669 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4521,6 +4521,52 @@ unsigned long get_max_pfn(void)
 }
 EXPORT_SYMBOL(get_max_pfn);
 
+static void mark_free_pages_bitmap(struct zone *zone, unsigned long start_pfn,
+	unsigned long end_pfn, unsigned long *bitmap, unsigned long len)
+{
+	unsigned long pfn, flags, page_num;
+	unsigned int order, t;
+	struct list_head *curr;
+
+	if (zone_is_empty(zone))
+		return;
+	end_pfn = min(start_pfn + len, end_pfn);
+	spin_lock_irqsave(&zone->lock, flags);
+
+	for_each_migratetype_order(order, t) {
+		list_for_each(curr, &zone->free_area[order].free_list[t]) {
+			pfn = page_to_pfn(list_entry(curr, struct page, lru));
+			if (pfn >= start_pfn && pfn <= end_pfn) {
+				page_num = 1UL << order;
+				if (pfn + page_num > end_pfn)
+					page_num = end_pfn - pfn;
+				bitmap_set(bitmap, pfn - start_pfn, page_num);
+			}
+		}
+	}
+
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+int get_free_pages(unsigned long start_pfn, unsigned long end_pfn,
+   unsigned long *bitmap, unsigned long len)
+{
+   struct zone *zone;
+   int ret = 0;
+
+   if (bitmap == NULL || start_pfn > end_pfn || start_pfn >= max_pfn)
+   return 0;
+   if (end_pfn < max_pfn)
+   ret = 1;
+   if (end_pfn >= max_pfn)
+   ret = 0;
+
+   for_each_populated_zone(zone)
+   mark_free_pages_bitmap(zone, start_pfn, end_pfn, bitmap, len);
+   return ret;
+}
+EXPORT_SYMBOL(get_free_pages);
+
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
zoneref->zone = zone;
-- 
1.8.3.1




[Qemu-devel] [PATCH v2 kernel 4/7] virtio-balloon: speed up inflate/deflate process

2016-06-29 Thread Liang Li
The implementation of the current virtio-balloon is not very
efficient. The time spent in the different stages of inflating
the balloon to 7GB of an 8GB idle guest:

a. allocating pages (6.5%)
b. sending PFNs to host (68.3%)
c. address translation (6.1%)
d. madvise (19%)

It takes about 4126ms for the inflating process to complete.
Debugging shows that the bottlenecks are stages b and d.

If we use a bitmap to send the page info instead of the PFNs, we
can reduce the overhead in stage b quite a lot. Furthermore, we
can do the address translation and call madvise() with a bulk of
RAM pages instead of the current page-by-page way, so the overhead
of stages c and d can also be reduced a lot.

This patch is the kernel-side implementation, which is intended to
speed up the inflating & deflating process by adding a new feature
to the virtio-balloon device. With this new feature, inflating the
balloon to 7GB of an 8GB idle guest only takes 590ms; the
performance improvement is about 85%.

TODO: optimize stage a by allocating/freeing a chunk of pages
instead of a single page at a time.
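
For the VIRTIO_BALLOON_PFNS_LIMIT chosen below, the per-pass memory
cost is easy to bound: 32GB of 4KB pages is 2^23 = 8,388,608 pfns,
so each scan chunk needs a bitmap of 8,388,608 / 8 = 1MB, regardless
of how large the guest actually is.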

Signed-off-by: Liang Li <liang.z...@intel.com>
Suggested-by: Michael S. Tsirkin <m...@redhat.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Cornelia Huck <cornelia.h...@de.ibm.com>
Cc: Amit Shah <amit.s...@redhat.com>
---
 drivers/virtio/virtio_balloon.c | 184 +++-
 1 file changed, 162 insertions(+), 22 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8d649a2..2d18ff6 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -41,10 +41,28 @@
 #define OOM_VBALLOON_DEFAULT_PAGES 256
 #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
 
+/*
+ * VIRTIO_BALLOON_PFNS_LIMIT is used to limit the size of the page bitmap
+ * and prevent a very large page bitmap. There are two reasons for this:
+ * 1) to save memory.
+ * 2) allocating a large bitmap may fail.
+ *
+ * The actual limit of pfn is determined by:
+ * pfn_limit = min(max_pfn, VIRTIO_BALLOON_PFNS_LIMIT);
+ *
+ * If the system has more pages than VIRTIO_BALLOON_PFNS_LIMIT, we will
+ * scan the page list and send the PFNs in several passes. To reduce the
+ * overhead of scanning the page list, VIRTIO_BALLOON_PFNS_LIMIT should be
+ * set to a value that covers most cases.
+ */
+#define VIRTIO_BALLOON_PFNS_LIMIT ((32 * (1ULL << 30)) >> PAGE_SHIFT) /* 32GB */
+
 static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
 module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 
+extern unsigned long get_max_pfn(void);
+
 struct virtio_balloon {
struct virtio_device *vdev;
struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
@@ -62,6 +80,15 @@ struct virtio_balloon {
 
/* Number of balloon pages we've told the Host we're not using. */
unsigned int num_pages;
+   /* Pointer of the bitmap header. */
+   void *bmap_hdr;
+   /* Bitmap and length used to tell the host the pages */
+   unsigned long *page_bitmap;
+   unsigned long bmap_len;
+   /* Pfn limit */
+   unsigned long pfn_limit;
+   /* Used to record the processed pfn range */
+   unsigned long min_pfn, max_pfn, start_pfn, end_pfn;
/*
 * The pages we've told the Host we're not using are enqueued
 * at vb_dev_info->pages list.
@@ -105,12 +132,45 @@ static void balloon_ack(struct virtqueue *vq)
 	wake_up(&vb->acked);
 }
 
+static inline void init_pfn_range(struct virtio_balloon *vb)
+{
+   vb->min_pfn = ULONG_MAX;
+   vb->max_pfn = 0;
+}
+
+static inline void update_pfn_range(struct virtio_balloon *vb,
+struct page *page)
+{
+   unsigned long balloon_pfn = page_to_balloon_pfn(page);
+
+   if (balloon_pfn < vb->min_pfn)
+   vb->min_pfn = balloon_pfn;
+   if (balloon_pfn > vb->max_pfn)
+   vb->max_pfn = balloon_pfn;
+}
+
 static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 {
struct scatterlist sg;
unsigned int len;
 
-	sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP)) {
+   struct balloon_bmap_hdr *hdr = vb->bmap_hdr;
+   unsigned long bmap_len;
+
+   /* cmd and req_id are not used here, set them to 0 */
+   hdr->cmd = cpu_to_virtio16(vb->vdev, 0);
+   hdr->page_shift = cpu_to_virtio16(vb->vdev, PAGE_SHIFT);
+   hdr->reserved = cpu_to_virtio16(vb->vdev, 0);
+   hdr->req_id = cpu_to_virtio64(vb->vdev, 0);
+   hdr->start_pfn = cpu_to_virtio64(vb->vdev, vb->start_pfn);
+   bmap_len = min(vb->bmap_

[Qemu-devel] [PATCH v2 kernel 5/7] virtio-balloon: define feature bit and head for misc virt queue

2016-06-29 Thread Liang Li
Define a new feature bit which supports a new virtqueue. This
new virtqueue is for information exchange between the hypervisor
and the guest. The VMM/hypervisor can make use of this virtqueue
to request that the guest do some operations, e.g. drop page
caches, synchronize the file system, etc., and it can get some of
the guest's runtime information through this virtqueue, e.g. the
guest's free page information, which can be used for live
migration optimization.

Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Cornelia Huck <cornelia.h...@de.ibm.com>
Cc: Amit Shah <amit.s...@redhat.com>
---
 include/uapi/linux/virtio_balloon.h | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index d3b182a..be4880f 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -35,6 +35,7 @@
 #define VIRTIO_BALLOON_F_STATS_VQ  1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_PAGE_BITMAP   3 /* Send page info with bitmap */
+#define VIRTIO_BALLOON_F_MISC_VQ   4 /* Misc info virtqueue */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
@@ -101,4 +102,25 @@ struct balloon_bmap_hdr {
__virtio64 bmap_len;
 };
 
+enum balloon_req_id {
+   /* Get free pages information */
+   BALLOON_GET_FREE_PAGES,
+};
+
+enum balloon_flag {
+   /* Have more data for a request */
+   BALLOON_FLAG_CONT,
+   /* No more data for a request */
+   BALLOON_FLAG_DONE,
+};
+
+struct balloon_req_hdr {
+   /* Used to distinguish different request */
+   __virtio16 cmd;
+   /* Reserved */
+   __virtio16 reserved[3];
+   /* Request parameter */
+   __virtio64 param;
+};
+
 #endif /* _LINUX_VIRTIO_BALLOON_H */
-- 
1.8.3.1




[Qemu-devel] [PATCH v2 kernel 2/7] virtio-balloon: define new feature bit and page bitmap head

2016-06-29 Thread Liang Li
Add a new feature which supports sending the page information with
a bitmap. The current implementation uses a PFN array, which is not
very efficient. Using a bitmap can improve the performance of
inflating/deflating significantly.

The page bitmap header will be used to tell the host some
information about the page bitmap, e.g. the page size, the page
bitmap length and the start pfn.
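
The header defined below is a fixed 32 bytes: cmd (2) +
page_shift (2) + flag (2) + reserved (2) + req_id (8) +
start_pfn (8) + bmap_len (8); in this series the bitmap payload is
sent immediately after the header in the same buffer.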

Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Cornelia Huck <cornelia.h...@de.ibm.com>
Cc: Amit Shah <amit.s...@redhat.com>
---
 include/uapi/linux/virtio_balloon.h | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 343d7dd..d3b182a 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -34,6 +34,7 @@
 #define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
 #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
+#define VIRTIO_BALLOON_F_PAGE_BITMAP   3 /* Send page info with bitmap */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
@@ -82,4 +83,22 @@ struct virtio_balloon_stat {
__virtio64 val;
 } __attribute__((packed));
 
+/* Page bitmap header structure */
+struct balloon_bmap_hdr {
+   /* Used to distinguish different request */
+   __virtio16 cmd;
+   /* Shift width of page in the bitmap */
+   __virtio16 page_shift;
+   /* flag used to identify different status */
+   __virtio16 flag;
+   /* Reserved */
+   __virtio16 reserved;
+   /* ID of the request */
+   __virtio64 req_id;
+   /* The pfn of 0 bit in the bitmap */
+   __virtio64 start_pfn;
+   /* The length of the bitmap, in bytes */
+   __virtio64 bmap_len;
+};
+
 #endif /* _LINUX_VIRTIO_BALLOON_H */
-- 
1.8.3.1




[Qemu-devel] [PATCH v2 kernel 0/7] Extend virtio-balloon for fast (de)inflating & fast live migration

2016-06-29 Thread Liang Li
This patch set contains two parts of changes to the virtio-balloon. 

One is the change for speeding up the inflating & deflating process.
The main idea of this optimization is to use a bitmap to send the
page information to the host instead of the PFNs, to reduce the
overhead of virtio data transmission, address translation and
madvise(). This can help to improve performance by about 85%.

The other change is for speeding up live migration. By skipping the
processing of the guest's free pages in the first round of data
copy, we reduce needless data processing; this can help to save a
lot of CPU cycles and network bandwidth. We put the guest's free
page information in a bitmap and send it to the host with the
virtqueue of virtio-balloon. For an idle 8GB guest, this can help
to shorten the total live migration time from about 2 seconds to
about 500 ms in a 10Gbps network environment.


Changes from v1 to v2:
* Abandon the patch for dropping page cache.
* Put some structures to uapi head file.
* Use a new way to determine the page bitmap size.
* Use a unified way to send the free page information with the bitmap 
* Address the issues referred in MST's comments

Liang Li (7):
  virtio-balloon: rework deflate to add page to a list
  virtio-balloon: define new feature bit and page bitmap head
  mm: add a function to get the max pfn
  virtio-balloon: speed up inflate/deflate process
  virtio-balloon: define feature bit and head for misc virt queue
  mm: add the related functions to get free page info
  virtio-balloon: tell host vm's free page info

 drivers/virtio/virtio_balloon.c | 306 +++-
 include/uapi/linux/virtio_balloon.h |  41 +
 mm/page_alloc.c |  52 ++
 3 files changed, 359 insertions(+), 40 deletions(-)

-- 
1.8.3.1




[Qemu-devel] [QEMU 7/7] migration: skip free pages during live migration

2016-06-13 Thread Liang Li
After sending out the request for free pages, the live migration
process starts without waiting for the free page bitmap to become
ready. If the free page bitmap is not ready when the first
migration_bitmap_sync() after ram_save_setup() runs, the free page
bitmap is ignored; this means the free pages are not filtered out
in that case.

The current implementation cannot work with postcopy: if postcopy
is enabled, we simply ignore the free pages. This will be made to
work later.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 93 +
 1 file changed, 93 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index 844ea46..5f1c3ff 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -43,6 +43,8 @@
 #include "trace.h"
 #include "exec/ram_addr.h"
 #include "qemu/rcu_queue.h"
+#include "sysemu/balloon.h"
+#include "sysemu/kvm.h"
 
 #ifdef DEBUG_MIGRATION_RAM
 #define DPRINTF(fmt, ...) \
@@ -228,6 +230,7 @@ static QemuMutex migration_bitmap_mutex;
 static uint64_t migration_dirty_pages;
 static uint32_t last_version;
 static bool ram_bulk_stage;
+static bool ignore_freepage_rsp;
 
 /* used by the search for pages to send */
 struct PageSearchStatus {
@@ -244,6 +247,7 @@ static struct BitmapRcu {
 struct rcu_head rcu;
 /* Main migration bitmap */
 unsigned long *bmap;
+unsigned long *free_page_bmap;
 /* bitmap of pages that haven't been sent even once
  * only maintained and used in postcopy at the moment
  * where it's used to send the dirtymap at the start
@@ -639,6 +643,7 @@ static void migration_bitmap_sync(void)
     rcu_read_unlock();
     qemu_mutex_unlock(&migration_bitmap_mutex);
 
+    ignore_freepage_rsp = true;
     trace_migration_bitmap_sync_end(migration_dirty_pages
                                     - num_dirty_pages_init);
     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
@@ -1417,6 +1422,7 @@ static void migration_bitmap_free(struct BitmapRcu *bmap)
 {
     g_free(bmap->bmap);
     g_free(bmap->unsentmap);
+    g_free(bmap->free_page_bmap);
     g_free(bmap);
 }
 
@@ -1487,6 +1493,85 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
     }
 }
 
+static void filter_out_guest_free_page(unsigned long *free_page_bmap,
+                                       long nbits)
+{
+    long i, page_count = 0, len;
+    unsigned long *bitmap;
+
+    tighten_guest_free_page_bmap(free_page_bmap);
+    qemu_mutex_lock(&migration_bitmap_mutex);
+    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+    slow_bitmap_complement(bitmap, free_page_bmap, nbits);
+
+    len = (last_ram_offset() >> TARGET_PAGE_BITS) / BITS_PER_LONG;
+    for (i = 0; i < len; i++) {
+        page_count += hweight_long(bitmap[i]);
+    }
+
+    migration_dirty_pages = page_count;
+    qemu_mutex_unlock(&migration_bitmap_mutex);
+}
+
+static void ram_request_free_page(unsigned long *bmap, unsigned long max_pfn)
+{
+    BalloonReqStatus status;
+
+    status = balloon_get_free_pages(bmap, max_pfn);
+    switch (status) {
+    case REQ_DONE:
+        ignore_freepage_rsp = false;
+        break;
+    case REQ_ERROR:
+        error_report("Error happened when requesting free pages");
+        break;
+    default:
+        error_report("unexpected response status: %d", status);
+        break;
+    }
+}
+
+static void ram_handle_free_page(void)
+{
+    unsigned long nbits;
+    RAMBlock *pc_ram_block;
+    BalloonReqStatus status;
+
+    status = balloon_get_free_pages(migration_bitmap_rcu->free_page_bmap,
+                                    get_guest_max_pfn());
+    switch (status) {
+    case REQ_DONE:
+        rcu_read_lock();
+        pc_ram_block = QLIST_FIRST_RCU(&ram_list.blocks);
+        nbits = pc_ram_block->used_length >> TARGET_PAGE_BITS;
+        filter_out_guest_free_page(migration_bitmap_rcu->free_page_bmap,
+                                   nbits);
+        rcu_read_unlock();
+
+        qemu_mutex_lock_iothread();
+        migration_bitmap_sync();
+        qemu_mutex_unlock_iothread();
+        /*
+         * bulk stage assumes (in migration_bitmap_find_and_reset_dirty) that
+         * every page is dirty, which is no longer true at this point.
+         */
+        ram_bulk_stage = false;
+        last_seen_block = NULL;
+        last_sent_block = NULL;
+        last_offset = 0;
+        break;
+    case REQ_ERROR:
+        ignore_freepage_rsp = true;
+        error_report("failed to get free page");
+        break;
+    case REQ_INVALID_PARAM:
+        ignore_freepage_rsp = true;
+        error_report("buffer overflow");
+        break;
+    default:
+        break;
+    }
+}
+
 /*
  * 'expected' is the value you expect the bitmap mostly to be full
  * of; it won't bother printing lines that are all this value.
@@ -1950,6 +2035,11 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 qemu_mutex_unlock_ramlist();
   

[Qemu-devel] [QEMU 5/7] bitmap: Add a new bitmap_move function

2016-06-13 Thread Liang Li
Sometimes we need to move a portion of a bitmap to another place
in a large bitmap. If the source and destination overlap,
bitmap_copy() cannot work correctly, so we need a new function to
do this work.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 include/qemu/bitmap.h | 13 +
 1 file changed, 13 insertions(+)

diff --git a/include/qemu/bitmap.h b/include/qemu/bitmap.h
index ec5146f..6ac89ca 100644
--- a/include/qemu/bitmap.h
+++ b/include/qemu/bitmap.h
@@ -37,6 +37,7 @@
  * bitmap_set(dst, pos, nbits) Set specified bit area
  * bitmap_set_atomic(dst, pos, nbits)   Set specified bit area with atomic ops
  * bitmap_clear(dst, pos, nbits)   Clear specified bit area
+ * bitmap_move(dst, src, nbits) Move *src to *dst
  * bitmap_test_and_clear_atomic(dst, pos, nbits)Test and clear area
  * bitmap_find_next_zero_area(buf, len, pos, n, mask)  Find bit free area
  */
@@ -136,6 +137,18 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
     }
 }
 
+static inline void bitmap_move(unsigned long *dst, const unsigned long *src,
+                               long nbits)
+{
+    if (small_nbits(nbits)) {
+        unsigned long tmp = *src;
+        *dst = tmp;
+    } else {
+        long len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+        memmove(dst, src, len);
+    }
+}
+
 static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
  const unsigned long *src2, long nbits)
 {
-- 
1.9.1




[Qemu-devel] [QEMU 6/7] kvm: Add two new arch specific functions

2016-06-13 Thread Liang Li
Add a new function to get the VM's max pfn and a new function
to filter out the holes to get a tight free page bitmap.
They are implemented on x86, and all the arches should implement
them for live migration optimization.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 include/sysemu/kvm.h |  2 ++
 target-arm/kvm.c | 14 ++
 target-i386/kvm.c| 35 +++
 target-mips/kvm.c| 14 ++
 target-ppc/kvm.c | 14 ++
 target-s390x/kvm.c   | 14 ++
 6 files changed, 93 insertions(+)

diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index ad6f837..50915f9 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -230,6 +230,8 @@ int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
   target_ulong len, int type);
 void kvm_remove_all_breakpoints(CPUState *cpu);
 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap);
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap);
+unsigned long get_guest_max_pfn(void);
 #ifndef _WIN32
 int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset);
 #endif
diff --git a/target-arm/kvm.c b/target-arm/kvm.c
index 83da447..6464542 100644
--- a/target-arm/kvm.c
+++ b/target-arm/kvm.c
@@ -627,3 +627,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
 {
     return (data - 32) & 0xffff;
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+/* To be done */
+
+return 0;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+/* To be done */
+
+return bmap;
+}
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index abf50e6..0b394cb 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -3327,3 +3327,38 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
 {
 abort();
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+PCMachineState *pcms = PC_MACHINE(current_machine);
+ram_addr_t above_4g_mem = pcms->above_4g_mem_size;
+unsigned long max_pfn;
+
+if (above_4g_mem) {
+max_pfn = ((1ULL << 32) + above_4g_mem) >> TARGET_PAGE_BITS;
+} else {
+max_pfn = pcms->below_4g_mem_size >> TARGET_PAGE_BITS;
+}
+
+return max_pfn;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+PCMachineState *pcms = PC_MACHINE(current_machine);
+ram_addr_t above_4g_mem = pcms->above_4g_mem_size;
+
+if (above_4g_mem) {
+unsigned long *src, *dst, len, pos;
+ram_addr_t below_4g_mem = pcms->below_4g_mem_size;
+src = bmap + ((1ULL << 32) >> TARGET_PAGE_BITS) / BITS_PER_LONG;
+dst = bmap + (below_4g_mem >> TARGET_PAGE_BITS) / BITS_PER_LONG;
+bitmap_move(dst, src, above_4g_mem >> TARGET_PAGE_BITS);
+
+pos = (above_4g_mem + below_4g_mem) >> TARGET_PAGE_BITS;
+len = ((1ULL << 32) - below_4g_mem) >> TARGET_PAGE_BITS;
+bitmap_clear(bmap, pos, len);
+}
+
+return bmap;
+}
diff --git a/target-mips/kvm.c b/target-mips/kvm.c
index a854e4d..89a54e5 100644
--- a/target-mips/kvm.c
+++ b/target-mips/kvm.c
@@ -1048,3 +1048,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
 {
 abort();
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+/* To be done */
+
+return 0;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+/* To be done */
+
+return bmap;
+}
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 24d6032..e222b31 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -2579,3 +2579,17 @@ int kvmppc_enable_hwrng(void)
 
 return kvmppc_enable_hcall(kvm_state, H_RANDOM);
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+/* To be done */
+
+return 0;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+/* To be done */
+
+return bmap;
+}
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index 8f46fd0..893755b 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -2271,3 +2271,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
 {
 abort();
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+/* To be done */
+
+return 0;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+/* To be done */
+
+return bmap;
+}
-- 
1.9.1
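
To see concretely what the x86 tighten_guest_free_page_bmap() above does,
take an assumed guest with 3GB below and 3GB above the 4GB boundary
(4KB pages, so TARGET_PAGE_BITS = 12):

    max_pfn                     = (4 GB + 3 GB) >> 12 = 0x1C0000
    src (first pfn above 4 GB)  = 4 GB >> 12          = 0x100000
    dst (first pfn of the hole) = 3 GB >> 12          = 0xC0000
    bitmap_move:  0xC0000 pfns of bitmap from src down to dst
    bitmap_clear: pfns [0x180000, 0x1C0000)  (the now-stale tail)

Note that the source range [0x100000, 0x1C0000) and the destination range
[0xC0000, 0x180000) overlap, which is exactly why the memmove()-based
bitmap_move() from the previous patch is needed rather than bitmap_copy().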




[Qemu-devel] [QEMU 4/7] balloon: get free page info from guest

2016-06-13 Thread Liang Li
Add a new feature to get the free page information from the guest;
the free page information is saved in a bitmap. Please note that
'free page' only means these pages were free before the request:
some of the pages may no longer be free during the process of
sending the free page bitmap to QEMU.
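
The wire format this implies can be sketched as follows (field order
follows the iov_to_buf() calls in virtio_balloon_handle_resp() below; the
struct name and comments are illustrative, not from the patch):

    struct free_page_resp {
        uint32_t id;         /* BALLOON_GET_FREE_PAGES */
        uint32_t page_shift; /* guest page size = 1 << page_shift */
        uint64_t base_pfn;   /* first pfn covered by the bitmap */
        uint64_t bmap_len;   /* bitmap length in bytes */
        /* followed by bmap_len bytes of free page bitmap */
    };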

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 balloon.c  | 24 +++-
 hw/virtio/virtio-balloon.c | 75 +-
 include/hw/virtio/virtio-balloon.h |  4 ++
 include/sysemu/balloon.h   |  8 
 4 files changed, 108 insertions(+), 3 deletions(-)

diff --git a/balloon.c b/balloon.c
index 3d96111..c74c472 100644
--- a/balloon.c
+++ b/balloon.c
@@ -37,6 +37,7 @@
 static QEMUBalloonEvent *balloon_event_fn;
 static QEMUBalloonStatus *balloon_stat_fn;
 static QEMUBalloonDropCache *balloon_drop_cache_fn;
+static QEMUBalloonGetFreePage *balloon_get_free_page_fn;
 static void *balloon_opaque;
 static bool balloon_inhibited;
 
@@ -68,10 +69,11 @@ static bool have_balloon(Error **errp)
 int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
  QEMUBalloonStatus *stat_func,
  QEMUBalloonDropCache *drop_cache_func,
+ QEMUBalloonGetFreePage *get_free_page_func,
  void *opaque)
 {
 if (balloon_event_fn || balloon_stat_fn || balloon_drop_cache_fn
-|| balloon_opaque) {
+|| balloon_get_free_page_fn || balloon_opaque) {
 /* We're already registered one balloon handler.  How many can
  * a guest really have?
  */
@@ -80,6 +82,7 @@ int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
 balloon_event_fn = event_func;
 balloon_stat_fn = stat_func;
 balloon_drop_cache_fn = drop_cache_func;
+balloon_get_free_page_fn = get_free_page_func;
 balloon_opaque = opaque;
 return 0;
 }
@@ -92,6 +95,7 @@ void qemu_remove_balloon_handler(void *opaque)
 balloon_event_fn = NULL;
 balloon_stat_fn = NULL;
 balloon_drop_cache_fn = NULL;
+balloon_get_free_page_fn = NULL;
 balloon_opaque = NULL;
 }
 
@@ -141,3 +145,21 @@ void qmp_balloon_drop_cache(DropCacheType type, Error 
**errp)
 
 balloon_drop_cache_fn(balloon_opaque, type);
 }
+
+bool balloon_free_pages_support(void)
+{
+return balloon_get_free_page_fn ? true : false;
+}
+
+BalloonReqStatus balloon_get_free_pages(unsigned long *bitmap, unsigned long len)
+{
+if (!balloon_get_free_page_fn) {
+return REQ_UNSUPPORT;
+}
+
+if (!bitmap) {
+return REQ_INVALID_PARAM;
+}
+
+return balloon_get_free_page_fn(balloon_opaque, bitmap, len);
+}
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 4757ba5..30ba074 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -38,6 +38,7 @@
 
 enum balloon_req_id {
BALLOON_DROP_CACHE,
+   BALLOON_GET_FREE_PAGES,
 };
 
 static void balloon_page(void *addr, int deflate)
@@ -435,7 +436,8 @@ static void virtio_balloon_handle_resp(VirtIODevice *vdev, 
VirtQueue *vq)
 VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
 VirtQueueElement *elem;
 size_t offset = 0;
-uint32_t tmp32, id = 0;
+uint32_t tmp32, id = 0, page_shift;
+uint64_t base_pfn, tmp64, bmap_len;
 
 elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
 if (!elem) {
@@ -457,6 +459,32 @@ static void virtio_balloon_handle_resp(VirtIODevice *vdev, 
VirtQueue *vq)
 case BALLOON_DROP_CACHE:
 s->req_status = REQ_DONE;
 break;
+case BALLOON_GET_FREE_PAGES:
+iov_to_buf(elem->out_sg, elem->out_num, offset,
+   &tmp32, sizeof(uint32_t));
+page_shift = virtio_ldl_p(vdev, &tmp32);
+offset += sizeof(uint32_t);
+s->page_shift = page_shift;
+
+iov_to_buf(elem->out_sg, elem->out_num, offset,
+   &tmp64, sizeof(uint64_t));
+base_pfn = virtio_ldq_p(vdev, &tmp64);
+offset += sizeof(uint64_t);
+s->base_pfn = base_pfn;
+
+iov_to_buf(elem->out_sg, elem->out_num, offset,
+   &tmp64, sizeof(uint64_t));
+bmap_len = virtio_ldq_p(vdev, &tmp64);
+offset += sizeof(uint64_t);
+if (s->bmap_len < bmap_len) {
+s->req_status = REQ_INVALID_PARAM;
+return;
+}
+
+iov_to_buf(elem->out_sg, elem->out_num, offset,
+   s->free_page_bmap, bmap_len);
+s->req_status = REQ_DONE;
+break;
 default:
 break;
 }
@@ -574,6 +602,48 @@ static int virtio_balloon_drop_cache(void *opaque, 
unsigned long type)
 return REQ_DONE;
 }
 
+static BalloonReqStatus virtio_balloon_free_pages(void *opaque,
+  unsigned long *bitmap,
+  unsigned long bmap_len)
+{
+VirtIOBalloon *s = opaque;
+VirtIODevice *vd

[Qemu-devel] [QEMU 3/7] Add the hmp and qmp interface for dropping cache

2016-06-13 Thread Liang Li
Add the HMP and QMP interfaces to drop the VM's page cache; users
can control the type of cache they want the VM to drop.
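
A usage sketch (hedged: the concrete DropCacheType strings live in the
qapi-schema.json hunk, which is truncated below, and the QMP command name
is only inferred from the qmp_balloon_drop_cache() handler, so 'pagecache'
here is an assumed example value):

    (qemu) balloon_drop_cache pagecache

    -> { "execute": "balloon-drop-cache",
         "arguments": { "type": "pagecache" } }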

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 balloon.c| 19 +++
 hmp-commands.hx  | 15 +++
 hmp.c| 22 ++
 hmp.h|  3 +++
 monitor.c| 18 ++
 qapi-schema.json | 35 +++
 qmp-commands.hx  | 23 +++
 7 files changed, 135 insertions(+)

diff --git a/balloon.c b/balloon.c
index 0fb34bf..3d96111 100644
--- a/balloon.c
+++ b/balloon.c
@@ -122,3 +122,22 @@ void qmp_balloon(int64_t target, Error **errp)
 trace_balloon_event(balloon_opaque, target);
 balloon_event_fn(balloon_opaque, target);
 }
+
+void qmp_balloon_drop_cache(DropCacheType type, Error **errp)
+{
+if (!have_balloon(errp)) {
+return;
+}
+
+if (!balloon_drop_cache_fn) {
+error_setg(errp, QERR_UNSUPPORTED);
+return;
+}
+if (type < 0 || type >= DROP_CACHE_TYPE__MAX) {
+error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "type",
+   "a value in the range [0, 3]");
+return;
+}
+
+balloon_drop_cache_fn(balloon_opaque, type);
+}
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 98b4b1a..c73572c 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1378,6 +1378,21 @@ Request VM to change its memory allocation to 
@var{value} (in MB).
 ETEXI
 
 {
+.name   = "balloon_drop_cache",
+.args_type  = "type:s",
+.params = "type",
+.help   = "request VM to drop its page caches",
+.mhandler.cmd = hmp_balloon_drop_cache,
+.command_completion = balloon_drop_cache_completion
+},
+
+STEXI
+@item balloon_drop_cache @var{type}
+@findex balloon_drop_cache
Request VM to drop its page caches.
+ETEXI
+
+{
 .name   = "set_link",
 .args_type  = "name:s,up:b",
 .params = "name on|off",
diff --git a/hmp.c b/hmp.c
index a4b1d3d..3aa1062 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1061,6 +1061,28 @@ void hmp_balloon(Monitor *mon, const QDict *qdict)
 }
 }
 
+void hmp_balloon_drop_cache(Monitor *mon, const QDict *qdict)
+{
+const char *type = qdict_get_str(qdict, "type");
+Error *err = NULL;
+int i;
+
+for (i = 0; i < DROP_CACHE_TYPE__MAX; i++) {
+if (strcmp(type, DropCacheType_lookup[i]) == 0) {
+qmp_balloon_drop_cache(1 + i, &err);
+break;
+}
+}
+
+if (i == DROP_CACHE_TYPE__MAX) {
+error_setg(&err, QERR_INVALID_PARAMETER, type);
+}
+
+if (err) {
+error_report_err(err);
+}
+}
+
 void hmp_block_resize(Monitor *mon, const QDict *qdict)
 {
 const char *device = qdict_get_str(qdict, "device");
diff --git a/hmp.h b/hmp.h
index 093d65f..6bb6499 100644
--- a/hmp.h
+++ b/hmp.h
@@ -55,6 +55,7 @@ void hmp_nmi(Monitor *mon, const QDict *qdict);
 void hmp_set_link(Monitor *mon, const QDict *qdict);
 void hmp_block_passwd(Monitor *mon, const QDict *qdict);
 void hmp_balloon(Monitor *mon, const QDict *qdict);
+void hmp_balloon_drop_cache(Monitor *mon, const QDict *qdict);
 void hmp_block_resize(Monitor *mon, const QDict *qdict);
 void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict);
 void hmp_snapshot_blkdev_internal(Monitor *mon, const QDict *qdict);
@@ -120,6 +121,8 @@ void watchdog_action_completion(ReadLineState *rs, int 
nb_args,
 const char *str);
 void migrate_set_capability_completion(ReadLineState *rs, int nb_args,
const char *str);
+void balloon_drop_cache_completion(ReadLineState *rs, int nb_args,
+   const char *str);
 void migrate_set_parameter_completion(ReadLineState *rs, int nb_args,
   const char *str);
 void host_net_add_completion(ReadLineState *rs, int nb_args, const char *str);
diff --git a/monitor.c b/monitor.c
index a27e115..eefdf3d 100644
--- a/monitor.c
+++ b/monitor.c
@@ -3367,6 +3367,24 @@ void migrate_set_parameter_completion(ReadLineState *rs, 
int nb_args,
 }
 }
 
+void balloon_drop_cache_completion(ReadLineState *rs, int nb_args,
+   const char *str)
+{
+size_t len;
+
+len = strlen(str);
+readline_set_completion_index(rs, len);
+if (nb_args == 2) {
+int i;
+for (i = 0; i < DROP_CACHE_TYPE__MAX; i++) {
+const char *name = DropCacheType_lookup[i];
+if (!strncmp(str, name, len)) {
+readline_add_completion(rs, name);
+}
+}
+}
+}
+
 void host_net_add_completion(ReadLineState *rs, int nb_args, const char *str)
 {
 int i;
diff --git a/qapi-schema.json b/qapi-schema.json
index 8483bdf..117f70a 100644
--- a/qapi-schema.json
+++

[Qemu-devel] [QEMU 2/7] virtio-balloon: add drop cache support

2016-06-13 Thread Liang Li
virtio-balloon can make use of the amount of free memory to determine
the amount of memory to be filled in the balloon, but the amount of
free memory will be affected by the page cache, which can be reclaimed.
Dropping the cache before reading the amount of free memory is very
helpful to reflect the exact amount of memory that can be reclaimed.

This patch adds a new feature to the balloon device to support this
operation: the hypervisor can request the VM to drop its cache, so as to
reclaim more memory.
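
For context (general Linux knowledge, not part of this patch): the
analogous manual operation inside a Linux guest is writing to
/proc/sys/vm/drop_caches, which is presumably what the guest driver does
when it receives this request:

    echo 1 > /proc/sys/vm/drop_caches   # page cache only
    echo 2 > /proc/sys/vm/drop_caches   # reclaimable slab (dentries, inodes)
    echo 3 > /proc/sys/vm/drop_caches   # both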

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 balloon.c   | 10 ++-
 hw/virtio/virtio-balloon.c  | 85 -
 include/hw/virtio/virtio-balloon.h  | 19 +-
 include/standard-headers/linux/virtio_balloon.h |  1 +
 include/sysemu/balloon.h|  5 +-
 5 files changed, 115 insertions(+), 5 deletions(-)

diff --git a/balloon.c b/balloon.c
index f2ef50c..0fb34bf 100644
--- a/balloon.c
+++ b/balloon.c
@@ -36,6 +36,7 @@
 
 static QEMUBalloonEvent *balloon_event_fn;
 static QEMUBalloonStatus *balloon_stat_fn;
+static QEMUBalloonDropCache *balloon_drop_cache_fn;
 static void *balloon_opaque;
 static bool balloon_inhibited;
 
@@ -65,9 +66,12 @@ static bool have_balloon(Error **errp)
 }
 
 int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
- QEMUBalloonStatus *stat_func, void *opaque)
+ QEMUBalloonStatus *stat_func,
+ QEMUBalloonDropCache *drop_cache_func,
+ void *opaque)
 {
-if (balloon_event_fn || balloon_stat_fn || balloon_opaque) {
+if (balloon_event_fn || balloon_stat_fn || balloon_drop_cache_fn
+|| balloon_opaque) {
 /* We're already registered one balloon handler.  How many can
  * a guest really have?
  */
@@ -75,6 +79,7 @@ int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
 }
 balloon_event_fn = event_func;
 balloon_stat_fn = stat_func;
+balloon_drop_cache_fn = drop_cache_func;
 balloon_opaque = opaque;
 return 0;
 }
@@ -86,6 +91,7 @@ void qemu_remove_balloon_handler(void *opaque)
 }
 balloon_event_fn = NULL;
 balloon_stat_fn = NULL;
+balloon_drop_cache_fn = NULL;
 balloon_opaque = NULL;
 }
 
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 8cf74c2..4757ba5 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -36,6 +36,10 @@
 
 #define BALLOON_PAGE_SIZE  (1 << VIRTIO_BALLOON_PFN_SHIFT)
 
+enum balloon_req_id {
+   BALLOON_DROP_CACHE,
+};
+
 static void balloon_page(void *addr, int deflate)
 {
 #if defined(__linux__)
@@ -154,6 +158,12 @@ static bool balloon_page_bitmap_supported(const 
VirtIOBalloon *s)
 return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
 }
 
+static bool balloon_misc_supported(const VirtIOBalloon *s)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(s);
+return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_MISC);
+}
+
 static bool balloon_stats_enabled(const VirtIOBalloon *s)
 {
 return s->stats_poll_interval > 0;
@@ -420,6 +430,39 @@ out:
 }
 }
 
+static void virtio_balloon_handle_resp(VirtIODevice *vdev, VirtQueue *vq)
+{
+VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+VirtQueueElement *elem;
+size_t offset = 0;
+uint32_t tmp32, id = 0;
+
+elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+if (!elem) {
+s->req_status = REQ_ERROR;
+return;
+}
+
+s->misc_vq_elem = elem;
+
+if (!elem->out_num) {
+return;
+}
+
+iov_to_buf(elem->out_sg, elem->out_num, offset,
+   &tmp32, sizeof(uint32_t));
+id = virtio_ldl_p(vdev, &tmp32);
+offset += sizeof(uint32_t);
+switch (id) {
+case BALLOON_DROP_CACHE:
+s->req_status = REQ_DONE;
+break;
+default:
+break;
+}
+
+}
+
 static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
 {
 VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
@@ -490,6 +533,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice 
*vdev, uint64_t f,
 f |= dev->host_features;
 virtio_add_feature(, VIRTIO_BALLOON_F_STATS_VQ);
 virtio_add_feature(, VIRTIO_BALLOON_F_PAGE_BITMAP);
+virtio_add_feature(, VIRTIO_BALLOON_F_MISC);
 return f;
 }
 
@@ -500,6 +544,36 @@ static void virtio_balloon_stat(void *opaque, BalloonInfo 
*info)
  VIRTIO_BALLOON_PFN_SHIFT);
 }
 
+static int virtio_balloon_drop_cache(void *opaque, unsigned long type)
+{
+VirtIOBalloon *s = opaque;
+VirtIODevice *vdev = VIRTIO_DEVICE(s);
+VirtQueueElement *elem = s->misc_vq_elem;
+int len;
+
+if (!balloon_misc_supported(s)) {
+return REQ_UNSUPPORT;
+}
+
+if (elem == NULL || !elem->in_num) {
+elem = virtqueue_pop(s->mvq, sizeof(VirtQueueElem

[Qemu-devel] [QEMU 0/7] Fast balloon and fast live migration

2016-06-13 Thread Liang Li
This patch set is intended to speed up the inflating/deflating
process of virtio-balloon and speed up live migration by skipping
the processing of the guest's free pages.

The virtio-balloon is extended to support some new features, so
as to make things faster.

Liang Li (7):
  balloon: speed up inflating & deflating process
  virtio-balloon: add drop cache support
  Add the hmp and qmp interface for dropping cache
  balloon: get free page info from guest
  bitmap: Add a new bitmap_move function
  kvm: Add two new arch specific functions
  migration: skip free pages during live migration

 balloon.c   |  51 +++-
 hmp-commands.hx |  15 ++
 hmp.c   |  22 ++
 hmp.h   |   3 +
 hw/virtio/virtio-balloon.c  | 315 ++--
 include/hw/virtio/virtio-balloon.h  |  23 +-
 include/qemu/bitmap.h   |  13 +
 include/standard-headers/linux/virtio_balloon.h |   2 +
 include/sysemu/balloon.h|  13 +-
 include/sysemu/kvm.h|   2 +
 migration/ram.c |  93 +++
 monitor.c   |  18 ++
 qapi-schema.json|  35 +++
 qmp-commands.hx |  23 ++
 target-arm/kvm.c|  14 ++
 target-i386/kvm.c   |  35 +++
 target-mips/kvm.c   |  14 ++
 target-ppc/kvm.c|  14 ++
 target-s390x/kvm.c  |  14 ++
 19 files changed, 693 insertions(+), 26 deletions(-)

-- 
1.9.1




[Qemu-devel] [QEMU 1/7] balloon: speed up inflating & deflating process

2016-06-13 Thread Liang Li
The implementation of the current virtio-balloon is not very efficient.
Below is the test result of the time spent on inflating the balloon to
3GB of a 4GB idle guest:

a. allocating pages (6.5%, 103ms)
b. sending PFNs to host (68.3%, 787ms)
c. address translation (6.1%, 96ms)
d. madvise (19%, 300ms)

It takes about 1577ms for the whole inflating process to complete. The
test shows that the bottleneck is stage b and stage d.

If using a bitmap to send the page info instead of the PFNs, we can
reduce the overhead spent on stage b quite a lot. Furthermore, it's
possible to do the address translation and the madvise with a bulk
of pages, instead of the current page-by-page way, so the overhead of
stage c and stage d can also be reduced a lot.
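
A back-of-the-envelope check of the stage b saving (assuming 4KB pages
and the 32-bit PFNs the existing protocol sends):

    pages to inflate:  3 GB / 4 KB     = 786,432
    PFN array:         786,432 * 4 B   = 3 MB of virtqueue traffic
    bitmap over 4 GB:  1,048,576 bits  = 128 KB

i.e. roughly a 24x reduction of the data sent in stage b.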

This patch is the QEMU side implementation which is intended to speed
up the inflating & deflating process by adding a new feature to the
virtio-balloon device. Now, inflating the balloon to 3GB of a 4GB
idle guest only takes 210ms; it's about 8 times as fast as before.

TODO: optimize stage a by allocating/freeing a chunk of pages instead
of a single page at a time.
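
The bulk madvise in stages c and d works by walking runs of set bits; a
standalone sketch of that idea (not QEMU code; plain shifts stand in for
find_next_bit()/find_next_zero_bit()):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t bmap = 0x0ff00f0fULL;  /* assumed pattern of ballooned pages */
        int bit = 0, nbits = 64;

        while (bit < nbits) {
            while (bit < nbits && !((bmap >> bit) & 1)) {
                bit++;                  /* find_next_bit() */
            }
            int start = bit;
            while (bit < nbits && ((bmap >> bit) & 1)) {
                bit++;                  /* find_next_zero_bit() */
            }
            if (bit > start) {
                printf("pfns %d..%d -> one madvise() of %d pages\n",
                       start, bit - 1, bit - start);
            }
        }
        return 0;
    }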

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 hw/virtio/virtio-balloon.c  | 159 
 include/standard-headers/linux/virtio_balloon.h |   1 +
 2 files changed, 139 insertions(+), 21 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 8c15e09..8cf74c2 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -47,6 +47,76 @@ static void balloon_page(void *addr, int deflate)
 #endif
 }
 
+static void do_balloon_bulk_pages(ram_addr_t base_pfn, int page_shift,
+  unsigned long len, bool deflate)
+{
+ram_addr_t size, processed, chunk, base;
+void *addr;
+MemoryRegionSection section = {.mr = NULL};
+
+size = (len << page_shift);
+base = (base_pfn << page_shift);
+
+for (processed = 0; processed < size; processed += chunk) {
+chunk = size - processed;
+while (chunk >= TARGET_PAGE_SIZE) {
+section = memory_region_find(get_system_memory(),
+ base + processed, chunk);
+if (!section.mr) {
+chunk = QEMU_ALIGN_DOWN(chunk / 2, TARGET_PAGE_SIZE);
+} else {
+break;
+}
+}
+
+if (section.mr &&
+(int128_nz(section.size) && memory_region_is_ram(section.mr))) {
+addr = section.offset_within_region +
+   memory_region_get_ram_ptr(section.mr);
+qemu_madvise(addr, chunk,
+ deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED);
+} else {
+fprintf(stderr, "can't find the chunk, skip\n");
+chunk = TARGET_PAGE_SIZE;
+}
+}
+}
+
+static void balloon_bulk_pages(ram_addr_t base_pfn, unsigned long *bitmap,
+   unsigned long len, int page_shift, bool deflate)
+{
+#if defined(__linux__)
+unsigned long end  = len * 8;
+unsigned long current = 0;
+
+if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
+ kvm_has_sync_mmu())) {
+while (current < end) {
+unsigned long one = find_next_bit(bitmap, end, current);
+
+if (one < end) {
+unsigned long zero = find_next_zero_bit(bitmap, end, one + 1);
+unsigned long page_length;
+
+if (zero >= end) {
+page_length = end - one;
+} else {
+page_length = zero - one;
+}
+
+if (page_length) {
+do_balloon_bulk_pages(base_pfn + one, page_shift,
+  page_length, deflate);
+}
+current = one + page_length;
+} else {
+current = one;
+}
+}
+}
+#endif
+}
+
 static const char *balloon_stat_names[] = {
[VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
[VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
@@ -78,6 +148,12 @@ static bool balloon_stats_supported(const VirtIOBalloon *s)
 return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
 }
 
+static bool balloon_page_bitmap_supported(const VirtIOBalloon *s)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(s);
+return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
+}
+
 static bool balloon_stats_enabled(const VirtIOBalloon *s)
 {
 return s->stats_poll_interval > 0;
@@ -224,27 +300,66 @@ static void virtio_balloon_handle_output(VirtIODevice 
*vdev, VirtQueue *vq)
 return;
 }
 
-while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
-ram_addr_t pa;
-ram_addr_t addr;
- 

[Qemu-devel] [PATCH RFC v2 QEMU] balloon: speed up inflating & deflating process

2016-05-27 Thread Liang Li
The implementation of the current virtio-balloon is not very efficient.
Below is the test result of the time spent on inflating the balloon to
3GB of a 4GB idle guest:

a. allocating pages (6.5%, 103ms)
b. sending PFNs to host (68.3%, 787ms)
c. address translation (6.1%, 96ms)
d. madvise (19%, 300ms)

It takes about 1577ms for the whole inflating process to complete. The
test shows that the bottleneck is stage b and stage d.

If using a bitmap to send the page info instead of the PFNs, we can
reduce the overhead spent on stage b quite a lot. Furthermore, it's
possible to do the address translation and the madvise with a bulk
of pages, instead of the current page-by-page way, so the overhead of
stage c and stage d can also be reduced a lot.

This patch is the QEMU side implementation which is intended to speed
up the inflating & deflating process by adding a new feature to the
virtio-balloon device. Now, inflating the balloon to 3GB of a 4GB
idle guest only takes 210ms; it's about 8 times as fast as before.

TODO: optimize stage a by allocating/freeing a chunk of pages instead
of a single page at a time.

v2 changes:
change the interface

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 hw/virtio/virtio-balloon.c  | 159 
 include/standard-headers/linux/virtio_balloon.h |   1 +
 2 files changed, 139 insertions(+), 21 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 8c15e09..d6f423c 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -47,6 +47,76 @@ static void balloon_page(void *addr, int deflate)
 #endif
 }
 
+static void do_balloon_bulk_pages(ram_addr_t base_pfn, int page_shift,
+  unsigned long len, bool deflate)
+{
+ram_addr_t size, processed, chunk, base;
+void *addr;
+MemoryRegionSection section = {.mr = NULL};
+
+size = (len << page_shift);
+base = (base_pfn << page_shift);
+
+for (processed = 0; processed < size; processed += chunk) {
+chunk = size - processed;
+while (chunk >= TARGET_PAGE_SIZE) {
+section = memory_region_find(get_system_memory(),
+ base + processed, chunk);
+if (!section.mr) {
+chunk = QEMU_ALIGN_DOWN(chunk / 2, TARGET_PAGE_SIZE);
+} else {
+break;
+}
+}
+
+if (section.mr &&
+(int128_nz(section.size) && memory_region_is_ram(section.mr))) {
+addr = section.offset_within_region +
+   memory_region_get_ram_ptr(section.mr);
+qemu_madvise(addr, chunk,
+   deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED);
+} else {
+fprintf(stderr, "can't find the chunk, skip\n");
+chunk = TARGET_PAGE_SIZE;
+}
+}
+}
+
+static void balloon_bulk_pages(ram_addr_t base_pfn, unsigned long *bitmap,
+   unsigned long len, int page_shift, bool deflate)
+{
+#if defined(__linux__)
+unsigned long end  = len * 8;
+unsigned long current = 0;
+
+if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
+ kvm_has_sync_mmu())) {
+while (current < end) {
+unsigned long one = find_next_bit(bitmap, end, current);
+
+if (one < end) {
+unsigned long zero = find_next_zero_bit(bitmap, end, one + 1);
+unsigned long page_length;
+
+if (zero >= end) {
+page_length = end - one;
+} else {
+page_length = zero - one;
+}
+
+if (page_length) {
+do_balloon_bulk_pages(base_pfn + one, page_shift,
+  page_length, deflate);
+}
+current = one + page_length;
+} else {
+current = one;
+}
+}
+}
+#endif
+}
+
 static const char *balloon_stat_names[] = {
[VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
[VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
@@ -78,6 +148,12 @@ static bool balloon_stats_supported(const VirtIOBalloon *s)
 return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
 }
 
+static bool balloon_page_bitmap_supported(const VirtIOBalloon *s)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(s);
+return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
+}
+
 static bool balloon_stats_enabled(const VirtIOBalloon *s)
 {
 return s->stats_poll_interval > 0;
@@ -224,27 +300,66 @@ static void virtio_balloon_handle_output(VirtIODevice 
*vdev, VirtQueue *vq)
 return;
 }
 
-while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
-ram_addr_t pa;
-   

[Qemu-devel] [PATCH RFC v2 kernel] balloon: speed up inflating/deflating process

2016-05-27 Thread Liang Li
The implementation of the current virtio-balloon is not very efficient.
Below is the test result of the time spent on inflating the balloon to
3GB of a 4GB idle guest:

a. allocating pages (6.5%, 103ms)
b. sending PFNs to host (68.3%, 787ms)
c. address translation (6.1%, 96ms)
d. madvise (19%, 300ms)

It takes about 1577ms for the whole inflating process to complete. The
test shows that the bottleneck is stage b and stage d.

If using a bitmap to send the page info instead of the PFNs, we can
reduce the overhead spent on stage b quite a lot. Furthermore, it's
possible to do the address translation and the madvise with a bulk
of pages, instead of the current page-by-page way, so the overhead of
stage c and stage d can also be reduced a lot.

This patch is the kernel side implementation which is intended to speed
up the inflating & deflating process by adding a new feature to the
virtio-balloon device. Now, inflating the balloon to 3GB of a 4GB
idle guest only takes 200ms; it's about 8 times as fast as before.

TODO: optimize stage a by allocating/freeing a chunk of pages instead
of a single page at a time.

v2 change:
1. Use a small page bitmap instead of a large one.
2. Address some of the comments on v1.

Signed-off-by: Liang Li <liang.z...@intel.com>
Suggested-by: Michael S. Tsirkin <m...@redhat.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Cornelia Huck <cornelia.h...@de.ibm.com>
Cc: Amit Shah <amit.s...@redhat.com>
---
 drivers/virtio/virtio_balloon.c | 207 ++--
 include/uapi/linux/virtio_balloon.h |   1 +
 2 files changed, 200 insertions(+), 8 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 476c0e3..823b4e4 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -40,11 +40,19 @@
 #define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256
 #define OOM_VBALLOON_DEFAULT_PAGES 256
 #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
+#define VIRTIO_BALLOON_PFNS_LIMIT ((2 * (1ULL << 30)) >> PAGE_SHIFT) /* 2GB */
 
 static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
 module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 
+struct balloon_bmap_hdr {
+   __virtio32 type;
+   __virtio32 page_shift;
+   __virtio64 start_pfn;
+   __virtio64 bmap_len;
+};
+
 struct virtio_balloon {
struct virtio_device *vdev;
struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
@@ -62,6 +70,13 @@ struct virtio_balloon {
 
/* Number of balloon pages we've told the Host we're not using. */
unsigned int num_pages;
+   /* Bitmap and length used to tell the host the pages */
+   unsigned long *page_bitmap;
+   unsigned long bmap_len;
+   /* Used to record the processed pfn range */
+   unsigned long min_pfn, max_pfn, start_pfn, end_pfn;
+   /* Used for sending page bitmap and header */
+   struct scatterlist sg[2];
/*
 * The pages we've told the Host we're not using are enqueued
 * at vb_dev_info->pages list.
@@ -111,15 +126,39 @@ static void balloon_ack(struct virtqueue *vq)
 wake_up(&vb->acked);
 }
 
+static inline void init_pfn_range(struct virtio_balloon *vb)
+{
+   vb->min_pfn = (1UL << 48);
+   vb->max_pfn = 0;
+}
+
 static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 {
-   struct scatterlist sg;
 unsigned int len;
 
-   sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP)) {
+   struct balloon_bmap_hdr hdr;
+   unsigned long bmap_len;
+
+   hdr.type = cpu_to_virtio32(vb->vdev, 0);
+   hdr.page_shift = cpu_to_virtio32(vb->vdev, PAGE_SHIFT);
+   hdr.start_pfn = cpu_to_virtio64(vb->vdev, vb->start_pfn);
+   bmap_len = min(vb->bmap_len,
+ (vb->end_pfn - vb->start_pfn) / BITS_PER_BYTE);
+   hdr.bmap_len = cpu_to_virtio64(vb->vdev, bmap_len);
+   sg_set_buf(&vb->sg[0], &hdr, sizeof(hdr));
+   sg_set_buf(&vb->sg[1], vb->page_bitmap, bmap_len);
+   virtqueue_add_outbuf(vq, vb->sg, 2, vb, GFP_KERNEL);
+   } else {
+   struct scatterlist sg;
+
+   sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+   /* We should always be able to add one buffer to an
+   * empty queue.
+   */
+   virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
+   }
 
-   /* We should always be able to add one buffer to an empty queue. */
-   virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
virtqueue_kick(vq);
 
/* When host has read buffer, this completes via balloon_ack */
@@ -13

[Qemu-devel] [PATCH RFC kernel] balloon: speed up inflating/deflating process

2016-05-20 Thread Liang Li
The implementation of the current virtio-balloon is not very efficient.
Below is the test result of the time spent on inflating the balloon to
3GB of a 4GB idle guest:

a. allocating pages (6.5%, 103ms)
b. sending PFNs to host (68.3%, 787ms)
c. address translation (6.1%, 96ms)
d. madvise (19%, 300ms)

It takes about 1577ms for the whole inflating process to complete. The
test shows that the bottleneck is stage b and stage d.

If using a bitmap to send the page info instead of the PFNs, we can
reduce the overhead spent on stage b quite a lot. Furthermore, it's
possible to do the address translation and the madvise with a bulk
of pages, instead of the current page-by-page way, so the overhead of
stage c and stage d can also be reduced a lot.

This patch is the kernel side implementation which is intended to speed
up the inflating & deflating process by adding a new feature to the
virtio-balloon device. Now, inflating the balloon to 3GB of a 4GB
idle guest only takes 175ms; it's about 9 times as fast as before.

TODO: optimize stage a by allocating/freeing a chunk of pages instead
of a single page at a time.
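
For scale (numbers assumed: a 4GB guest with 4KB pages), the bitmap that
balloon_page_bitmap_init() below allocates is:

    max_pfn    = 4 GB / 4 KB            = 1,048,576
    bmap_bytes = ALIGN(1048576, 64) / 8 = 131,072 bytes (128 KB)

This is the 'large' whole-memory bitmap that the v2 repost of this patch
shrinks ("Use a small page bitmap instead of a large one").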

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 drivers/virtio/virtio_balloon.c | 199 ++--
 include/uapi/linux/virtio_balloon.h |   1 +
 mm/page_alloc.c |   6 ++
 3 files changed, 198 insertions(+), 8 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 7b6d74f..5330b6f 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -45,6 +45,8 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
 module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 
+extern unsigned long get_max_pfn(void);
+
 struct virtio_balloon {
struct virtio_device *vdev;
struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
@@ -62,6 +64,9 @@ struct virtio_balloon {
 
/* Number of balloon pages we've told the Host we're not using. */
unsigned int num_pages;
+   unsigned long *page_bitmap;
+   unsigned long start_pfn, end_pfn;
+   unsigned long bmap_len;
/*
 * The pages we've told the Host we're not using are enqueued
 * at vb_dev_info->pages list.
@@ -111,15 +116,66 @@ static void balloon_ack(struct virtqueue *vq)
 wake_up(&vb->acked);
 }
 
+static int balloon_page_bitmap_init(struct virtio_balloon *vb)
+{
+   unsigned long max_pfn, bmap_bytes;
+
+   max_pfn = get_max_pfn();
+   bmap_bytes = ALIGN(max_pfn, BITS_PER_LONG) / BITS_PER_BYTE;
+   if (!vb->page_bitmap)
+   vb->page_bitmap = kzalloc(bmap_bytes, GFP_KERNEL);
+   else {
+   if (bmap_bytes <= vb->bmap_len)
+   memset(vb->page_bitmap, 0, bmap_bytes);
+   else {
+   kfree(vb->page_bitmap);
+   vb->page_bitmap = kzalloc(bmap_bytes, GFP_KERNEL);
+   }
+   }
+   if (!vb->page_bitmap) {
 dev_err(&vb->vdev->dev, "%s failure: allocate page bitmap\n",
+__func__);
+   return -ENOMEM;
+   }
+   vb->bmap_len = bmap_bytes;
+   vb->start_pfn = max_pfn;
+   vb->end_pfn = 0;
+
+   return 0;
+}
+
 static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 {
-   struct scatterlist sg;
unsigned int len;
 
-   sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP)) {
+   u32 page_shift = PAGE_SHIFT;
+   unsigned long start_pfn, end_pfn, flags = 0, bmap_len;
+   struct scatterlist sg[5];
+
+   start_pfn = rounddown(vb->start_pfn, BITS_PER_LONG);
+   end_pfn = roundup(vb->end_pfn, BITS_PER_LONG);
+   bmap_len = (end_pfn - start_pfn) / BITS_PER_LONG * sizeof(long);
+
+   sg_init_table(sg, 5);
+   sg_set_buf(&sg[0], &flags, sizeof(flags));
+   sg_set_buf(&sg[1], &start_pfn, sizeof(start_pfn));
+   sg_set_buf(&sg[2], &page_shift, sizeof(page_shift));
+   sg_set_buf(&sg[3], &bmap_len, sizeof(bmap_len));
+   sg_set_buf(&sg[4], vb->page_bitmap +
+(start_pfn / BITS_PER_LONG), bmap_len);
+   virtqueue_add_outbuf(vq, sg, 5, vb, GFP_KERNEL);
+
+   } else {
+   struct scatterlist sg;
+
+   sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+   /* We should always be able to add one buffer to an
+   * empty queue.
+   */
+   virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
+   }
 
-   /* We should always be able to add one buffer to an empty queue. */
-   virtqueue_add_outbuf(vq, , 1, vb, GFP_KE

[Qemu-devel] [PATCH RFC QEMU] balloon: speed up inflating & deflating process

2016-05-20 Thread Liang Li
The implementation of the current virtio-balloon is not very efficient.
Below is the test result of the time spent on inflating the balloon to
3GB of a 4GB idle guest:

a. allocating pages (6.5%, 103ms)
b. sending PFNs to host (68.3%, 787ms)
c. address translation (6.1%, 96ms)
d. madvise (19%, 300ms)

It takes about 1577ms for the whole inflating process to complete. The
test shows that the bottleneck is stage b and stage d.

If using a bitmap to send the page info instead of the PFNs, we can
reduce the overhead spent on stage b quite a lot. Furthermore, it's
possible to do the address translation and the madvise with a bulk
of pages, instead of the current page-by-page way, so the overhead of
stage c and stage d can also be reduced a lot.

This patch is the QEMU side implementation which is intended to speed
up the inflating & deflating process by adding a new feature to the
virtio-balloon device. Now, inflating the balloon to 3GB of a 4GB
idle guest only takes 175ms; it's about 9 times as fast as before.

TODO: optimize stage a by allocating/freeing a chunk of pages instead
of a single page at a time.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 hw/virtio/virtio-balloon.c  | 159 
 include/standard-headers/linux/virtio_balloon.h |   1 +
 2 files changed, 139 insertions(+), 21 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 9dbe681..ce67465 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -47,6 +47,76 @@ static void balloon_page(void *addr, int deflate)
 #endif
 }
 
+static void do_balloon_bulk_pages(ram_addr_t base_pfn, int page_shift,
+  unsigned long len, bool deflate)
+{
+ram_addr_t size, processed, chunk, base;
+void *addr;
+MemoryRegionSection section = {.mr = NULL};
+
+size = (len << page_shift);
+base = (base_pfn << page_shift);
+
+for (processed = 0; processed < size; processed += chunk) {
+chunk = size - processed;
+while (chunk >= TARGET_PAGE_SIZE) {
+section = memory_region_find(get_system_memory(),
+ base + processed, chunk);
+if (!section.mr) {
+chunk = QEMU_ALIGN_DOWN(chunk / 2, TARGET_PAGE_SIZE);
+} else {
+break;
+}
+}
+
+if (section.mr &&
+(int128_nz(section.size) && memory_region_is_ram(section.mr))) {
+addr = section.offset_within_region +
+   memory_region_get_ram_ptr(section.mr);
+qemu_madvise(addr, chunk,
+   deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED);
+} else {
+fprintf(stderr, "can't find the chunk, skip\n");
+chunk = TARGET_PAGE_SIZE;
+}
+}
+}
+
+static void balloon_bulk_pages(ram_addr_t base_pfn, unsigned long *bitmap,
+   unsigned long len, int page_shift, bool deflate)
+{
+#if defined(__linux__)
+unsigned long end  = len * 8;
+unsigned long current = 0;
+
+if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
+ kvm_has_sync_mmu())) {
+while (current < end) {
+unsigned long one = find_next_bit(bitmap, end, current);
+
+if (one < end) {
+unsigned long zero = find_next_zero_bit(bitmap, end, one + 1);
+unsigned long page_length;
+
+if (zero >= end) {
+page_length = end - one;
+} else {
+page_length = zero - one;
+}
+
+if (page_length) {
+do_balloon_bulk_pages(base_pfn + one, page_shift,
+  page_length, deflate);
+}
+current = one + page_length;
+} else {
+current = one;
+}
+}
+}
+#endif
+}
+
 static const char *balloon_stat_names[] = {
[VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
[VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
@@ -78,6 +148,12 @@ static bool balloon_stats_supported(const VirtIOBalloon *s)
 return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
 }
 
+static bool balloon_page_bitmap_supported(const VirtIOBalloon *s)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(s);
+return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
+}
+
 static bool balloon_stats_enabled(const VirtIOBalloon *s)
 {
 return s->stats_poll_interval > 0;
@@ -223,27 +299,66 @@ static void virtio_balloon_handle_output(VirtIODevice 
*vdev, VirtQueue *vq)
 return;
 }
 
-while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
-ram_addr_t pa;
-ram_addr_t addr;
- 

[Qemu-devel] [PATCH v2 9/9] migration: code clean up

2016-05-05 Thread Liang Li
Use 'QemuMutex comp_done_lock' and 'QemuCond comp_done_cond' instead
of 'QemuMutex *comp_done_lock' and 'QemuCond *comp_done_cond', to keep
consistent with 'QemuMutex decomp_done_lock' and
'QemuCond decomp_done_cond'.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 36 +++-
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 51de958..5076862 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -278,8 +278,8 @@ static QemuThread *compress_threads;
  * one of the compression threads has finished the compression.
  * comp_done_lock is used to co-work with comp_done_cond.
  */
-static QemuMutex *comp_done_lock;
-static QemuCond *comp_done_cond;
+static QemuMutex comp_done_lock;
+static QemuCond comp_done_cond;
 /* The empty QEMUFileOps will be used by file in CompressParam */
 static const QEMUFileOps empty_ops = { };
 
@@ -308,10 +308,10 @@ static void *do_data_compress(void *opaque)
 
 do_compress_ram_page(param->file, block, offset);
 
-qemu_mutex_lock(comp_done_lock);
+qemu_mutex_lock(&comp_done_lock);
 param->done = true;
-qemu_cond_signal(comp_done_cond);
-qemu_mutex_unlock(comp_done_lock);
+qemu_cond_signal(&comp_done_cond);
+qemu_mutex_unlock(&comp_done_lock);
 
 qemu_mutex_lock(>mutex);
 } else {
@@ -351,16 +351,12 @@ void migrate_compress_threads_join(void)
 qemu_mutex_destroy(&comp_param[i].mutex);
 qemu_cond_destroy(&comp_param[i].cond);
 }
-qemu_mutex_destroy(comp_done_lock);
-qemu_cond_destroy(comp_done_cond);
+qemu_mutex_destroy(&comp_done_lock);
+qemu_cond_destroy(&comp_done_cond);
 g_free(compress_threads);
 g_free(comp_param);
-g_free(comp_done_cond);
-g_free(comp_done_lock);
 compress_threads = NULL;
 comp_param = NULL;
-comp_done_cond = NULL;
-comp_done_lock = NULL;
 }
 
 void migrate_compress_threads_create(void)
@@ -374,10 +370,8 @@ void migrate_compress_threads_create(void)
 thread_count = migrate_compress_threads();
 compress_threads = g_new0(QemuThread, thread_count);
 comp_param = g_new0(CompressParam, thread_count);
-comp_done_cond = g_new0(QemuCond, 1);
-comp_done_lock = g_new0(QemuMutex, 1);
-qemu_cond_init(comp_done_cond);
-qemu_mutex_init(comp_done_lock);
+qemu_cond_init(&comp_done_cond);
+qemu_mutex_init(&comp_done_lock);
 for (i = 0; i < thread_count; i++) {
 /* com_param[i].file is just used as a dummy buffer to save data, set
  * it's ops to empty.
@@ -840,13 +834,13 @@ static void flush_compressed_data(QEMUFile *f)
 }
 thread_count = migrate_compress_threads();
 
-qemu_mutex_lock(comp_done_lock);
+qemu_mutex_lock(&comp_done_lock);
 for (idx = 0; idx < thread_count; idx++) {
 while (!comp_param[idx].done) {
-qemu_cond_wait(comp_done_cond, comp_done_lock);
+qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 }
 }
-qemu_mutex_unlock(comp_done_lock);
+qemu_mutex_unlock(&comp_done_lock);
 
 for (idx = 0; idx < thread_count; idx++) {
 qemu_mutex_lock(&comp_param[idx].mutex);
@@ -872,7 +866,7 @@ static int compress_page_with_multi_thread(QEMUFile *f, 
RAMBlock *block,
 int idx, thread_count, bytes_xmit = -1, pages = -1;
 
 thread_count = migrate_compress_threads();
-qemu_mutex_lock(comp_done_lock);
+qemu_mutex_lock(&comp_done_lock);
 while (true) {
 for (idx = 0; idx < thread_count; idx++) {
 if (comp_param[idx].done) {
@@ -891,10 +885,10 @@ static int compress_page_with_multi_thread(QEMUFile *f, 
RAMBlock *block,
 if (pages > 0) {
 break;
 } else {
-qemu_cond_wait(comp_done_cond, comp_done_lock);
+qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 }
 }
-qemu_mutex_unlock(comp_done_lock);
+qemu_mutex_unlock(&comp_done_lock);
 
 return pages;
 }
-- 
1.9.1




[Qemu-devel] [PATCH v2 5/9] migration: refine ram_save_compressed_page

2016-05-05 Thread Liang Li
Use qemu_put_compression_data to do the compression directly
instead of using do_compress_ram_page, avoiding some data copying.
This is a very small improvement; at the same time, add code to check
whether the compression was successful.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 27 +--
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 8a21c2c..1a45227 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -929,24 +929,20 @@ static int ram_save_compressed_page(QEMUFile *f, 
PageSearchStatus *pss,
 uint64_t *bytes_transferred)
 {
 int pages = -1;
-uint64_t bytes_xmit;
+uint64_t bytes_xmit = 0;
 uint8_t *p;
-int ret;
+int ret, blen;
 RAMBlock *block = pss->block;
 ram_addr_t offset = pss->offset;
 
 p = block->host + offset;
 
-bytes_xmit = 0;
 ret = ram_control_save_page(f, block->offset,
 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 if (bytes_xmit) {
 *bytes_transferred += bytes_xmit;
 pages = 1;
 }
-if (block == last_sent_block) {
-offset |= RAM_SAVE_FLAG_CONTINUE;
-}
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
 if (bytes_xmit > 0) {
@@ -966,19 +962,22 @@ static int ram_save_compressed_page(QEMUFile *f, 
PageSearchStatus *pss,
 flush_compressed_data(f);
 pages = save_zero_page(f, block, offset, p, bytes_transferred);
 if (pages == -1) {
-set_compress_params(&comp_param[0], block, offset);
-/* Use the qemu thread to compress the data to make sure the
- * first page is sent out before other pages
- */
-bytes_xmit = do_compress_ram_page(&comp_param[0]);
-if (bytes_xmit > 0) {
+/* Make sure the first page is sent out before other pages */
+bytes_xmit = save_page_header(f, block, offset |
+  RAM_SAVE_FLAG_COMPRESS_PAGE);
+blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
+ migrate_compress_level());
+if (blen > 0) {
+*bytes_transferred += bytes_xmit + blen;
 acct_info.norm_pages++;
-qemu_put_qemu_file(f, comp_param[0].file);
-*bytes_transferred += bytes_xmit;
 pages = 1;
+} else {
+qemu_file_set_error(f, blen);
+error_report("compressed data failed!");
 }
 }
 } else {
+offset |= RAM_SAVE_FLAG_CONTINUE;
 pages = save_zero_page(f, block, offset, p, bytes_transferred);
 if (pages == -1) {
 pages = compress_page_with_multi_thread(f, block, offset,
-- 
1.9.1




[Qemu-devel] [PATCH v2 4/9] qemu-file: Fix qemu_put_compression_data flaw

2016-05-05 Thread Liang Li
Currently, qemu_put_compression_data can only work with a non-writable
QEMUFile, and can't work with a writable QEMUFile. But it does
not provide any measure to prevent users from using it with a
writable QEMUFile.

We should fix this flaw to make it work with a writable QEMUFile.
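
For a feel of the numbers (zlib's documented bound; IO_BUF_SIZE is assumed
to be the 32KB QEMU uses):

    compressBound(TARGET_PAGE_SIZE = 4096) ~= 4096 + 4096/4096 + 13 = 4110 bytes

so whenever less than about 4.1KB (plus the 4-byte length word) is left in
the IO buffer, the unpatched function simply returned 0; the fix below
flushes a writable file first and returns -1 only if a freshly flushed
buffer still cannot hold a worst-case compressed page.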

Signed-off-by: Liang Li <liang.z...@intel.com>
Suggested-by: Juan Quintela <quint...@redhat.com>
---
 migration/qemu-file.c | 23 +--
 migration/ram.c   | 18 +-
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 6f4a129..b0ef1f3 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -607,8 +607,14 @@ uint64_t qemu_get_be64(QEMUFile *f)
 return v;
 }
 
-/* compress size bytes of data start at p with specific compression
+/* Compress size bytes of data start at p with specific compression
  * level and store the compressed data to the buffer of f.
+ *
+ * When f is not writable, return -1 if f has no space to save the
+ * compressed data.
+ * When f is writable and it has no space to save the compressed
+ * do fflush first, if f still has no space to save the compressed
+ * data, return -1.
  */
 
 ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
@@ -617,7 +623,14 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const 
uint8_t *p, size_t size,
 ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t);
 
 if (blen < compressBound(size)) {
-return 0;
+if (!qemu_file_is_writable(f)) {
+return -1;
+}
+qemu_fflush(f);
+blen = IO_BUF_SIZE - sizeof(int32_t);
+if (blen < compressBound(size)) {
+return -1;
+}
 }
 if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *)&blen,
   (Bytef *)p, size, level) != Z_OK) {
@@ -625,7 +638,13 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const 
uint8_t *p, size_t size,
 return 0;
 }
 qemu_put_be32(f, blen);
+if (f->ops->writev_buffer) {
+add_to_iovec(f, f->buf + f->buf_index, blen);
+}
 f->buf_index += blen;
+if (f->buf_index == IO_BUF_SIZE) {
+qemu_fflush(f);
+}
 return blen + sizeof(int32_t);
 }
 
diff --git a/migration/ram.c b/migration/ram.c
index 5d544c0..8a21c2c 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -821,7 +821,13 @@ static int do_compress_ram_page(CompressParam *param)
   RAM_SAVE_FLAG_COMPRESS_PAGE);
 blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
  migrate_compress_level());
-bytes_sent += blen;
+if (blen < 0) {
+bytes_sent = 0;
+qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
+error_report("compressed data failed!");
+} else {
+bytes_sent += blen;
+}
 
 return bytes_sent;
 }
@@ -965,10 +971,12 @@ static int ram_save_compressed_page(QEMUFile *f, 
PageSearchStatus *pss,
  * first page is sent out before other pages
  */
 bytes_xmit = do_compress_ram_page(&comp_param[0]);
-acct_info.norm_pages++;
-qemu_put_qemu_file(f, comp_param[0].file);
-*bytes_transferred += bytes_xmit;
-pages = 1;
+if (bytes_xmit > 0) {
+acct_info.norm_pages++;
+qemu_put_qemu_file(f, comp_param[0].file);
+*bytes_transferred += bytes_xmit;
+pages = 1;
+}
 }
 } else {
 pages = save_zero_page(f, block, offset, p, bytes_transferred);
-- 
1.9.1




[Qemu-devel] [PATCH v2 6/9] migration: protect the quit flag by lock

2016-05-05 Thread Liang Li
quit_comp_thread and quit_decomp_thread are accessed by several
threads; it's better to protect them with locks. We use a per-thread
flag to replace the global one, and the new flag is protected
by a lock.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 32 
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 1a45227..50eae7c 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -253,6 +253,7 @@ static struct BitmapRcu {
 struct CompressParam {
 bool start;
 bool done;
+bool quit;
 QEMUFile *file;
 QemuMutex mutex;
 QemuCond cond;
@@ -264,6 +265,7 @@ typedef struct CompressParam CompressParam;
 struct DecompressParam {
 bool start;
 bool done;
+bool quit;
 QemuMutex mutex;
 QemuCond cond;
 void *des;
@@ -284,8 +286,6 @@ static QemuCond *comp_done_cond;
 static const QEMUFileOps empty_ops = { };
 
 static bool compression_switch;
-static bool quit_comp_thread;
-static bool quit_decomp_thread;
 static DecompressParam *decomp_param;
 static QemuThread *decompress_threads;
 static QemuMutex decomp_done_lock;
@@ -297,18 +297,18 @@ static void *do_data_compress(void *opaque)
 {
 CompressParam *param = opaque;
 
-while (!quit_comp_thread) {
+while (!param->quit) {
 qemu_mutex_lock(>mutex);
-/* Re-check the quit_comp_thread in case of
+/* Re-check the quit flag in case of
  * terminate_compression_threads is called just before
  * qemu_mutex_lock(>mutex) and after
- * while(!quit_comp_thread), re-check it here can make
+ * while(!param->quit), re-check it here can make
  * sure the compression thread terminate as expected.
  */
-while (!param->start && !quit_comp_thread) {
+while (!param->start && !param->quit) {
 qemu_cond_wait(&param->cond, &param->mutex);
 }
-if (!quit_comp_thread) {
+if (!param->quit) {
 do_compress_ram_page(param);
 }
 param->start = false;
@@ -328,9 +328,9 @@ static inline void terminate_compression_threads(void)
 int idx, thread_count;
 
 thread_count = migrate_compress_threads();
-quit_comp_thread = true;
 for (idx = 0; idx < thread_count; idx++) {
 qemu_mutex_lock(&comp_param[idx].mutex);
+comp_param[idx].quit = true;
 qemu_cond_signal(&comp_param[idx].cond);
 qemu_mutex_unlock(&comp_param[idx].mutex);
 }
@@ -370,7 +370,6 @@ void migrate_compress_threads_create(void)
 if (!migrate_use_compression()) {
 return;
 }
-quit_comp_thread = false;
 compression_switch = true;
 thread_count = migrate_compress_threads();
 compress_threads = g_new0(QemuThread, thread_count);
@@ -385,6 +384,7 @@ void migrate_compress_threads_create(void)
  */
 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 comp_param[i].done = true;
+comp_param[i].quit = false;
 qemu_mutex_init(&comp_param[i].mutex);
 qemu_cond_init(&comp_param[i].cond);
 qemu_thread_create(compress_threads + i, "compress",
@@ -863,12 +863,12 @@ static void flush_compressed_data(QEMUFile *f)
 for (idx = 0; idx < thread_count; idx++) {
 if (!comp_param[idx].done) {
 qemu_mutex_lock(comp_done_lock);
-while (!comp_param[idx].done && !quit_comp_thread) {
+while (!comp_param[idx].done && !comp_param[idx].quit) {
 qemu_cond_wait(comp_done_cond, comp_done_lock);
 }
 qemu_mutex_unlock(comp_done_lock);
 }
-if (!quit_comp_thread) {
+if (!comp_param[idx].quit) {
 len = qemu_put_qemu_file(f, comp_param[idx].file);
 bytes_transferred += len;
 }
@@ -2200,12 +2200,12 @@ static void *do_data_decompress(void *opaque)
 DecompressParam *param = opaque;
 unsigned long pagesize;
 
-while (!quit_decomp_thread) {
+while (!param->quit) {
 qemu_mutex_lock(&param->mutex);
-while (!param->start && !quit_decomp_thread) {
+while (!param->start && !param->quit) {
 qemu_cond_wait(&param->cond, &param->mutex);
 }
-if (!quit_decomp_thread) {
+if (!param->quit) {
 pagesize = TARGET_PAGE_SIZE;
 /* uncompress() can fail in some cases, especially
 * when the page is dirtied while doing the compression; it's
@@ -2252,7 +2252,6 @@ void migrate_decompress_threads_create(void)
 thread_count = migrate_decompress_threads();
 decompress_threads = g_new0(QemuThread, thread_count);
 decomp_param = g_new0(DecompressParam, thread_count);
-quit_decomp_thread = false;
 qemu_mutex_init(&decomp_done_lock);
 qemu_cond_init(&decomp_done_cond);
 for (i = 0; i < thread_count; i++) {
@@ -2260,6 +2259,7 @@ 

[Qemu-devel] [PATCH v2 3/9] migration: remove useless code

2016-05-05 Thread Liang Li
page_buffer is set twice; remove the redundant earlier assignment.

Signed-off-by: Liang Li <liang.z...@intel.com>
Reviewed-by: Dr. David Alan Gilbert <dgilb...@redhat.com>
Reviewed-by: Juan Quintela <quint...@redhat.com>
---
 migration/ram.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/migration/ram.c b/migration/ram.c
index 8a59a08..5d544c0 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2356,7 +2356,6 @@ static int ram_load_postcopy(QEMUFile *f)
 ret = -EINVAL;
 break;
 }
-page_buffer = host;
 /*
  * Postcopy requires that we place whole host pages atomically.
  * To make it atomic, the data is read into a temporary page
-- 
1.9.1




[Qemu-devel] [PATCH v2 7/9] migration: refine the compression code

2016-05-05 Thread Liang Li
The current code for multi-thread compression is not clear,
especially in its use of locks. Refine the code
to make it clear.
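
As an aside, the handshake the refactored do_data_compress() implements
can be sketched standalone (illustrative pthread code, not QEMU code): the
worker owns its mutex except while doing the actual work, a non-NULL
param->block means "work pending", and a second condvar reports
completion:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct param {
        pthread_mutex_t mutex;
        pthread_cond_t cond;        /* "work available or quit" */
        pthread_cond_t done_cond;   /* "work finished" */
        bool quit;
        bool done;
        const char *block;          /* non-NULL: work pending */
    };

    static void *worker(void *opaque)
    {
        struct param *p = opaque;

        pthread_mutex_lock(&p->mutex);
        while (!p->quit) {
            if (p->block) {
                const char *block = p->block;
                p->block = NULL;
                pthread_mutex_unlock(&p->mutex);

                printf("compressing %s\n", block); /* lock dropped for work */

                pthread_mutex_lock(&p->mutex);
                p->done = true;
                pthread_cond_signal(&p->done_cond);
            } else {
                pthread_cond_wait(&p->cond, &p->mutex);
            }
        }
        pthread_mutex_unlock(&p->mutex);
        return NULL;
    }

    int main(void)
    {
        struct param p = {
            PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
            PTHREAD_COND_INITIALIZER, false, false, NULL
        };
        pthread_t t;

        pthread_create(&t, NULL, worker, &p);

        pthread_mutex_lock(&p.mutex);
        p.block = "page 0";             /* publish work ... */
        pthread_cond_signal(&p.cond);
        while (!p.done) {               /* ... and wait for completion */
            pthread_cond_wait(&p.done_cond, &p.mutex);
        }
        p.quit = true;                  /* then ask the worker to exit */
        pthread_cond_signal(&p.cond);
        pthread_mutex_unlock(&p.mutex);

        pthread_join(t, NULL);
        return 0;
    }

(QEMU's real code uses one global comp_done_lock/comp_done_cond pair
shared by all workers rather than a per-worker done condvar.)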

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 84 +++--
 1 file changed, 40 insertions(+), 44 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 50eae7c..f44f833 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -251,7 +251,6 @@ static struct BitmapRcu {
 } *migration_bitmap_rcu;
 
 struct CompressParam {
-bool start;
 bool done;
 bool quit;
 QEMUFile *file;
@@ -291,34 +290,36 @@ static QemuThread *decompress_threads;
 static QemuMutex decomp_done_lock;
 static QemuCond decomp_done_cond;
 
-static int do_compress_ram_page(CompressParam *param);
+static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
+ram_addr_t offset);
 
 static void *do_data_compress(void *opaque)
 {
 CompressParam *param = opaque;
+RAMBlock *block;
+ram_addr_t offset;
 
+qemu_mutex_lock(&param->mutex);
 while (!param->quit) {
-qemu_mutex_lock(&param->mutex);
-/* Re-check the quit flag in case of
- * terminate_compression_threads is called just before
- * qemu_mutex_lock(&param->mutex) and after
- * while(!param->quit), re-check it here can make
- * sure the compression thread terminate as expected.
- */
-while (!param->start && !param->quit) {
+if (param->block) {
+block = param->block;
+offset = param->offset;
+param->block = NULL;
+qemu_mutex_unlock(&param->mutex);
+
+do_compress_ram_page(param->file, block, offset);
+
+qemu_mutex_lock(comp_done_lock);
+param->done = true;
+qemu_cond_signal(comp_done_cond);
+qemu_mutex_unlock(comp_done_lock);
+
+qemu_mutex_lock(&param->mutex);
+} else {
 qemu_cond_wait(&param->cond, &param->mutex);
 }
-if (!param->quit) {
-do_compress_ram_page(param);
-}
-param->start = false;
-qemu_mutex_unlock(&param->mutex);
-
-qemu_mutex_lock(comp_done_lock);
-param->done = true;
-qemu_cond_signal(comp_done_cond);
-qemu_mutex_unlock(comp_done_lock);
 }
+qemu_mutex_unlock(&param->mutex);
 
 return NULL;
 }
@@ -808,18 +809,15 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus 
*pss,
 return pages;
 }
 
-static int do_compress_ram_page(CompressParam *param)
+static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
+ram_addr_t offset)
 {
 int bytes_sent, blen;
-uint8_t *p;
-RAMBlock *block = param->block;
-ram_addr_t offset = param->offset;
+uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 
-p = block->host + (offset & TARGET_PAGE_MASK);
-
-bytes_sent = save_page_header(param->file, block, offset |
+bytes_sent = save_page_header(f, block, offset |
   RAM_SAVE_FLAG_COMPRESS_PAGE);
-blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
+blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
  migrate_compress_level());
 if (blen < 0) {
 bytes_sent = 0;
@@ -832,15 +830,6 @@ static int do_compress_ram_page(CompressParam *param)
 return bytes_sent;
 }
 
-static inline void start_compression(CompressParam *param)
-{
-param->done = false;
-qemu_mutex_lock(&param->mutex);
-param->start = true;
-qemu_cond_signal(&param->cond);
-qemu_mutex_unlock(&param->mutex);
-}
-
 static inline void start_decompression(DecompressParam *param)
 {
 param->done = false;
@@ -860,18 +849,22 @@ static void flush_compressed_data(QEMUFile *f)
 return;
 }
 thread_count = migrate_compress_threads();
+
+qemu_mutex_lock(comp_done_lock);
 for (idx = 0; idx < thread_count; idx++) {
-if (!comp_param[idx].done) {
-qemu_mutex_lock(comp_done_lock);
-while (!comp_param[idx].done && !comp_param[idx].quit) {
-qemu_cond_wait(comp_done_cond, comp_done_lock);
-}
-qemu_mutex_unlock(comp_done_lock);
+while (!comp_param[idx].done) {
+qemu_cond_wait(comp_done_cond, comp_done_lock);
 }
+}
+qemu_mutex_unlock(comp_done_lock);
+
+for (idx = 0; idx < thread_count; idx++) {
+qemu_mutex_lock(&comp_param[idx].mutex);
 if (!comp_param[idx].quit) {
 len = qemu_put_qemu_file(f, comp_param[idx].file);
 bytes_transferred += len;
 }
+qemu_mutex_unlock(&comp_param[idx].mutex);
 }
 }
 
@@ -893,9 +886,12 @@ static int compress_page_with_multi_thread(QEMUFile *f, 
RAMBlock *block,
 while (true) {
 for (idx = 0; idx

[Qemu-devel] [PATCH v2 8/9] migration: refine the decompression code

2016-05-05 Thread Liang Li
The current code for multi-thread decompression is not clear,
especially in its use of locks. Refine the code
to make it clear.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 50 +-
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index f44f833..51de958 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -262,7 +262,6 @@ struct CompressParam {
 typedef struct CompressParam CompressParam;
 
 struct DecompressParam {
-bool start;
 bool done;
 bool quit;
 QemuMutex mutex;
@@ -830,15 +829,6 @@ static int do_compress_ram_page(QEMUFile *f, RAMBlock 
*block,
 return bytes_sent;
 }
 
-static inline void start_decompression(DecompressParam *param)
-{
-param->done = false;
-qemu_mutex_lock(>mutex);
-param->start = true;
-qemu_cond_signal(>cond);
-qemu_mutex_unlock(>mutex);
-}
-
 static uint64_t bytes_transferred;
 
 static void flush_compressed_data(QEMUFile *f)
@@ -2195,30 +2185,37 @@ static void *do_data_decompress(void *opaque)
 {
 DecompressParam *param = opaque;
 unsigned long pagesize;
+uint8_t *des;
+int len;
 
+qemu_mutex_lock(&param->mutex);
 while (!param->quit) {
-qemu_mutex_lock(&param->mutex);
-while (!param->start && !param->quit) {
-qemu_cond_wait(&param->cond, &param->mutex);
-}
-if (!param->quit) {
+if (param->des) {
+des = param->des;
+len = param->len;
+param->des = 0;
+qemu_mutex_unlock(&param->mutex);
+
 pagesize = TARGET_PAGE_SIZE;
 /* uncompress() will return failed in some case, especially
  * when the page is dirted when doing the compression, it's
  * not a problem because the dirty page will be retransferred
  * and uncompress() won't break the data in other pages.
  */
-uncompress((Bytef *)param->des, &pagesize,
-   (const Bytef *)param->compbuf, param->len);
-}
-param->start = false;
-qemu_mutex_unlock(&param->mutex);
+uncompress((Bytef *)des, &pagesize,
+   (const Bytef *)param->compbuf, len);
 
-qemu_mutex_lock(&decomp_done_lock);
-param->done = true;
-qemu_cond_signal(&decomp_done_cond);
-qemu_mutex_unlock(&decomp_done_lock);
+qemu_mutex_lock(&decomp_done_lock);
+param->done = true;
+qemu_cond_signal(&decomp_done_cond);
+qemu_mutex_unlock(&decomp_done_lock);
+
+qemu_mutex_lock(&param->mutex);
+} else {
+qemu_cond_wait(&param->cond, &param->mutex);
+}
 }
+qemu_mutex_unlock(&param->mutex);
 
 return NULL;
 }
@@ -2295,10 +2292,13 @@ static void decompress_data_with_multi_threads(QEMUFile *f,
 while (true) {
 for (idx = 0; idx < thread_count; idx++) {
 if (decomp_param[idx].done) {
+decomp_param[idx].done = false;
+qemu_mutex_lock(&decomp_param[idx].mutex);
 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
 decomp_param[idx].des = host;
 decomp_param[idx].len = len;
-start_decompression(&decomp_param[idx]);
+qemu_cond_signal(&decomp_param[idx].cond);
+qemu_mutex_unlock(&decomp_param[idx].mutex);
 break;
 }
 }
-- 
1.9.1




[Qemu-devel] [PATCH v2 2/9] migration: Fix a potential issue

2016-05-05 Thread Liang Li
At the end of live migration and before vm_start() on the destination
side, we should make sure all the decompression tasks are finished; if
this cannot be guaranteed, the VM may get incorrect memory data, or
memory already updated by the guest may be overwritten by a
decompression thread. Add code to fix this potential issue.

Suggested-by: David Alan Gilbert <dgilb...@redhat.com>
Suggested-by: Juan Quintela <quint...@redhat.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index 7ab6ab5..8a59a08 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2220,6 +2220,24 @@ static void *do_data_decompress(void *opaque)
 return NULL;
 }
 
+static void wait_for_decompress_done(void)
+{
+int idx, thread_count;
+
+if (!migrate_use_compression()) {
+return;
+}
+
+thread_count = migrate_decompress_threads();
+qemu_mutex_lock(&decomp_done_lock);
+for (idx = 0; idx < thread_count; idx++) {
+while (!decomp_param[idx].done) {
+qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
+}
+}
+qemu_mutex_unlock(&decomp_done_lock);
+}
+
 void migrate_decompress_threads_create(void)
 {
 int i, thread_count;
@@ -2554,6 +2572,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
 }
 }
 
+wait_for_decompress_done();
 rcu_read_unlock();
 DPRINTF("Completed load of VM with exit code %d seq iteration "
 "%" PRIu64 "\n", ret, seq_iter);
-- 
1.9.1




[Qemu-devel] [PATCH v2 1/9] migration: Fix multi-thread compression bug

2016-05-05 Thread Liang Li
Recently, a bug related to the multi-thread compression feature for
live migration was reported. The destination side will be blocked
during live migration if there is a heavy workload on the host and a
memory-intensive workload in the guest; this is most likely to happen
when there is only one decompression thread.

Some parts of the decompression code are incorrect:
1. The main thread, which receives data from the source side, will
enter a busy loop waiting for a free decompression thread.
2. A lock is needed to protect decomp_param[idx]->start, because it
is checked in the main thread and updated in the decompression
thread.

Fix these two issues by following the code pattern used for compression.
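
The essence of that pattern, as a minimal self-contained sketch (plain
pthreads with illustrative names, not the actual QEMU API): a done flag
guarded by a dedicated lock, plus a condition variable, so the main
thread sleeps instead of spinning:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  done_cond = PTHREAD_COND_INITIALIZER;
    static bool done = true;              /* the worker starts out idle */

    /* worker side: mark the slot free again and wake the main thread */
    static void worker_finish(void)
    {
        pthread_mutex_lock(&done_lock);
        done = true;
        pthread_cond_signal(&done_cond);
        pthread_mutex_unlock(&done_lock);
    }

    /* main-thread side: sleep until a worker is free, then claim it */
    static void claim_free_worker(void)
    {
        pthread_mutex_lock(&done_lock);
        while (!done) {
            pthread_cond_wait(&done_cond, &done_lock);
        }
        done = false;
        pthread_mutex_unlock(&done_lock);
    }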

Signed-off-by: Liang Li <liang.z...@intel.com>
Reported-by: Daniel P. Berrange <berra...@redhat.com>
Reviewed-by: Daniel P. Berrange <berra...@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilb...@redhat.com>
Reviewed-by: Juan Quintela <quint...@redhat.com>
Tested-by: Daniel P. Berrange <berra...@redhat.com>

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 38 +++---
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 3f05738..7ab6ab5 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -263,6 +263,7 @@ typedef struct CompressParam CompressParam;
 
 struct DecompressParam {
 bool start;
+bool done;
 QemuMutex mutex;
 QemuCond cond;
 void *des;
@@ -287,6 +288,8 @@ static bool quit_comp_thread;
 static bool quit_decomp_thread;
 static DecompressParam *decomp_param;
 static QemuThread *decompress_threads;
+static QemuMutex decomp_done_lock;
+static QemuCond decomp_done_cond;
 
 static int do_compress_ram_page(CompressParam *param);
 
@@ -834,6 +837,7 @@ static inline void start_compression(CompressParam *param)
 
 static inline void start_decompression(DecompressParam *param)
 {
+param->done = false;
 qemu_mutex_lock(&param->mutex);
 param->start = true;
 qemu_cond_signal(&param->cond);
@@ -2193,19 +2197,24 @@ static void *do_data_decompress(void *opaque)
 qemu_mutex_lock(&param->mutex);
 while (!param->start && !quit_decomp_thread) {
 qemu_cond_wait(&param->cond, &param->mutex);
+}
+if (!quit_decomp_thread) {
 pagesize = TARGET_PAGE_SIZE;
-if (!quit_decomp_thread) {
-/* uncompress() will return failed in some case, especially
- * when the page is dirted when doing the compression, it's
- * not a problem because the dirty page will be retransferred
- * and uncompress() won't break the data in other pages.
- */
-uncompress((Bytef *)param->des, &pagesize,
-   (const Bytef *)param->compbuf, param->len);
-}
-param->start = false;
+/* uncompress() will return failed in some case, especially
+ * when the page is dirted when doing the compression, it's
+ * not a problem because the dirty page will be retransferred
+ * and uncompress() won't break the data in other pages.
+ */
+uncompress((Bytef *)param->des, &pagesize,
+   (const Bytef *)param->compbuf, param->len);
 }
+param->start = false;
 qemu_mutex_unlock(&param->mutex);
+
+qemu_mutex_lock(&decomp_done_lock);
+param->done = true;
+qemu_cond_signal(&decomp_done_cond);
+qemu_mutex_unlock(&decomp_done_lock);
 }
 
 return NULL;
@@ -2219,10 +2228,13 @@ void migrate_decompress_threads_create(void)
 decompress_threads = g_new0(QemuThread, thread_count);
 decomp_param = g_new0(DecompressParam, thread_count);
 quit_decomp_thread = false;
+qemu_mutex_init(&decomp_done_lock);
+qemu_cond_init(&decomp_done_cond);
 for (i = 0; i < thread_count; i++) {
 qemu_mutex_init(&decomp_param[i].mutex);
 qemu_cond_init(&decomp_param[i].cond);
 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
+decomp_param[i].done = true;
 qemu_thread_create(decompress_threads + i, "decompress",
do_data_decompress, decomp_param + i,
QEMU_THREAD_JOINABLE);
@@ -2258,9 +2270,10 @@ static void decompress_data_with_multi_threads(QEMUFile *f,
 int idx, thread_count;
 
 thread_count = migrate_decompress_threads();
+qemu_mutex_lock(&decomp_done_lock);
 while (true) {
 for (idx = 0; idx < thread_count; idx++) {
-if (!decomp_param[idx].start) {
+if (decomp_param[idx].done) {
 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
 decomp_param[idx].des = host;
 decomp_param[idx].len = len;
@@ -2270,8 +2283,11 @@ static void decompress_data_with_multi_threads(QEMUFile *f,
 }

[Qemu-devel] [PATCH v2 0/9] live migration bug fix and refine

2016-05-05 Thread Liang Li
This patch set fixes a bug which can block live migration, and another
potential issue, when using multi-thread (de)compression.

The last patches refine the code and make the use of locks clearer.
Some of the code snippets are from Juan's multiple-fd patches, with
very small changes. Thanks for Juan's work.


Liang Li (9):
  migration: Fix multi-thread compression bug
  migration: Fix a potential issue
  migration: remove useless code
  qemu-file: Fix qemu_put_compression_data flaw
  migration: refine ram_save_compressed_page
  migration: protect the quit flag by lock
  migration: refine the compression code
  migration: refine the decompression code
  migration: code clean up

 migration/qemu-file.c |  23 -
 migration/ram.c   | 251 --
 2 files changed, 162 insertions(+), 112 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH 2/5] migration: Fix a potential issue

2016-05-03 Thread Liang Li
At the end of live migration and before vm_start() on the destination
side, we should make sure all the decompression tasks are finished; if
this cannot be guaranteed, the VM may get incorrect memory data, or
memory already updated by the guest may be overwritten by a
decompression thread. Add code to fix this potential issue.

Suggested-by: David Alan Gilbert <dgilb...@redhat.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
---
 include/migration/migration.h |  1 +
 migration/migration.c |  2 +-
 migration/ram.c   | 20 
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index ac2c12c..1c9051e 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -223,6 +223,7 @@ void migrate_compress_threads_create(void);
 void migrate_compress_threads_join(void);
 void migrate_decompress_threads_create(void);
 void migrate_decompress_threads_join(void);
+void wait_for_decompress_done(void);
 uint64_t ram_bytes_remaining(void);
 uint64_t ram_bytes_transferred(void);
 uint64_t ram_bytes_total(void);
diff --git a/migration/migration.c b/migration/migration.c
index 991313a..5228c28 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -347,7 +347,7 @@ static void process_incoming_migration_bh(void *opaque)
 /* If global state section was not received or we are in running
state, we need to obey autostart. Any other state is set with
runstate_set. */
-
+wait_for_decompress_done();
 if (!global_state_received() ||
 global_state_get_runstate() == RUN_STATE_RUNNING) {
 if (autostart) {
diff --git a/migration/ram.c b/migration/ram.c
index 7ab6ab5..4459b38 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2220,6 +2220,26 @@ static void *do_data_decompress(void *opaque)
 return NULL;
 }
 
+void wait_for_decompress_done(void)
+{
+int idx, thread_count;
+
+if (!migrate_use_compression()) {
+return;
+}
+thread_count = migrate_decompress_threads();
+for (idx = 0; idx < thread_count; idx++) {
+if (!decomp_param[idx].done) {
+qemu_mutex_lock(&decomp_done_lock);
+while (!decomp_param[idx].done) {
+qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
+}
+qemu_mutex_unlock(&decomp_done_lock);
+}
+}
+
+}
+
 void migrate_decompress_threads_create(void)
 {
 int i, thread_count;
-- 
1.9.1




[Qemu-devel] [PATCH 5/5] migration: refine ram_save_compressed_page

2016-05-03 Thread Liang Li
Use qemu_put_compression_data to do the compression directly,
instead of using do_compress_ram_page, to avoid some data copying.
This is a very small improvement; at the same time, add code to
check whether the compression was successful.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 28 +---
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 7e62d8d..ec2c0bf 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -927,24 +927,20 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
 uint64_t *bytes_transferred)
 {
 int pages = -1;
-uint64_t bytes_xmit;
+uint64_t bytes_xmit = 0;
 uint8_t *p;
-int ret;
+int ret, blen;
 RAMBlock *block = pss->block;
 ram_addr_t offset = pss->offset;
 
 p = block->host + offset;
 
-bytes_xmit = 0;
 ret = ram_control_save_page(f, block->offset,
 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 if (bytes_xmit) {
 *bytes_transferred += bytes_xmit;
 pages = 1;
 }
-if (block == last_sent_block) {
-offset |= RAM_SAVE_FLAG_CONTINUE;
-}
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
 if (bytes_xmit > 0) {
@@ -964,17 +960,19 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
 flush_compressed_data(f);
 pages = save_zero_page(f, block, offset, p, bytes_transferred);
 if (pages == -1) {
-set_compress_params(&comp_param[0], block, offset);
-/* Use the qemu thread to compress the data to make sure the
- * first page is sent out before other pages
- */
-bytes_xmit = do_compress_ram_page(&comp_param[0]);
-acct_info.norm_pages++;
-qemu_put_qemu_file(f, comp_param[0].file);
-*bytes_transferred += bytes_xmit;
-pages = 1;
+/* Make sure the first page is sent out before other pages */
+bytes_xmit = save_page_header(f, block, offset |
+  RAM_SAVE_FLAG_COMPRESS_PAGE);
+blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
+ migrate_compress_level());
+if (blen > 0) {
+*bytes_transferred += bytes_xmit + blen;
+acct_info.norm_pages++;
+pages = 1;
+}
 }
 } else {
+offset |= RAM_SAVE_FLAG_CONTINUE;
 pages = save_zero_page(f, block, offset, p, bytes_transferred);
 if (pages == -1) {
 pages = compress_page_with_multi_thread(f, block, offset,
-- 
1.9.1




[Qemu-devel] [PATCH 3/5] migration: remove useless code

2016-05-03 Thread Liang Li
page_buffer is set twice in a row; remove the redundant first assignment.

Signed-off-by: Liang Li <liang.z...@intel.com>
Reviewed-by: Dr. David Alan Gilbert <dgilb...@redhat.com>
---
 migration/ram.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/migration/ram.c b/migration/ram.c
index 4459b38..bc34bc5 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2358,7 +2358,6 @@ static int ram_load_postcopy(QEMUFile *f)
 ret = -EINVAL;
 break;
 }
-page_buffer = host;
 /*
  * Postcopy requires that we place whole host pages atomically.
  * To make it atomic, the data is read into a temporary page
-- 
1.9.1




[Qemu-devel] [PATCH 4/5] qemu-file: Fix qemu_put_compression_data flaw

2016-05-03 Thread Liang Li
Currently, qemu_put_compression_data can only work with a non-writable
QEMUFile, and can't work with a writable QEMUFile. But it does
not provide any measure to prevent users from using it with a
writable QEMUFile.

We should fix this flaw to make it work with a writable QEMUFile.

Suggested-by: Juan Quintela <quint...@redhat.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/qemu-file.c | 23 +--
 migration/ram.c   |  6 +-
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 6f4a129..b0ef1f3 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -607,8 +607,14 @@ uint64_t qemu_get_be64(QEMUFile *f)
 return v;
 }
 
-/* compress size bytes of data start at p with specific compression
+/* Compress size bytes of data start at p with specific compression
  * level and store the compressed data to the buffer of f.
+ *
+ * When f is not writable, return -1 if f has no space to save the
+ * compressed data.
+ * When f is writable and it has no space to save the compressed data,
+ * do fflush first, if f still has no space to save the compressed
+ * data, return -1.
  */
 
 ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
@@ -617,7 +623,14 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
 ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t);
 
 if (blen < compressBound(size)) {
-return 0;
+if (!qemu_file_is_writable(f)) {
+return -1;
+}
+qemu_fflush(f);
+blen = IO_BUF_SIZE - sizeof(int32_t);
+if (blen < compressBound(size)) {
+return -1;
+}
 }
 if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *)&blen,
   (Bytef *)p, size, level) != Z_OK) {
@@ -625,7 +638,13 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
 return 0;
 }
 qemu_put_be32(f, blen);
+if (f->ops->writev_buffer) {
+add_to_iovec(f, f->buf + f->buf_index, blen);
+}
 f->buf_index += blen;
+if (f->buf_index == IO_BUF_SIZE) {
+qemu_fflush(f);
+}
 return blen + sizeof(int32_t);
 }
 
diff --git a/migration/ram.c b/migration/ram.c
index bc34bc5..7e62d8d 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -821,7 +821,11 @@ static int do_compress_ram_page(CompressParam *param)
   RAM_SAVE_FLAG_COMPRESS_PAGE);
 blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
  migrate_compress_level());
-bytes_sent += blen;
+if (blen < 0) {
+error_report("Insufficient buffer for compressed data!");
+} else {
+bytes_sent += blen;
+}
 
 return bytes_sent;
 }
-- 
1.9.1




[Qemu-devel] [PATCH 1/5] migration: Fix multi-thread compression bug

2016-05-03 Thread Liang Li
Recently, a bug related to the multi-thread compression feature for
live migration was reported. The destination side will be blocked
during live migration if there is a heavy workload on the host and a
memory-intensive workload in the guest; this is most likely to happen
when there is only one decompression thread.

Some parts of the decompression code are incorrect:
1. The main thread, which receives data from the source side, will
enter a busy loop waiting for a free decompression thread.
2. A lock is needed to protect decomp_param[idx]->start, because it
is checked in the main thread and updated in the decompression
thread.

Fix these two issues by following the code pattern used for compression.

Signed-off-by: Liang Li <liang.z...@intel.com>
Reported-by: Daniel P. Berrange <berra...@redhat.com>
Reviewed-by: Daniel P. Berrange <berra...@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilb...@redhat.com>
Tested-by: Daniel P. Berrange <berra...@redhat.com>
---
 migration/ram.c | 38 +++---
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 3f05738..7ab6ab5 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -263,6 +263,7 @@ typedef struct CompressParam CompressParam;
 
 struct DecompressParam {
 bool start;
+bool done;
 QemuMutex mutex;
 QemuCond cond;
 void *des;
@@ -287,6 +288,8 @@ static bool quit_comp_thread;
 static bool quit_decomp_thread;
 static DecompressParam *decomp_param;
 static QemuThread *decompress_threads;
+static QemuMutex decomp_done_lock;
+static QemuCond decomp_done_cond;
 
 static int do_compress_ram_page(CompressParam *param);
 
@@ -834,6 +837,7 @@ static inline void start_compression(CompressParam *param)
 
 static inline void start_decompression(DecompressParam *param)
 {
+param->done = false;
 qemu_mutex_lock(&param->mutex);
 param->start = true;
 qemu_cond_signal(&param->cond);
@@ -2193,19 +2197,24 @@ static void *do_data_decompress(void *opaque)
 qemu_mutex_lock(&param->mutex);
 while (!param->start && !quit_decomp_thread) {
 qemu_cond_wait(&param->cond, &param->mutex);
+}
+if (!quit_decomp_thread) {
 pagesize = TARGET_PAGE_SIZE;
-if (!quit_decomp_thread) {
-/* uncompress() will return failed in some case, especially
- * when the page is dirted when doing the compression, it's
- * not a problem because the dirty page will be retransferred
- * and uncompress() won't break the data in other pages.
- */
-uncompress((Bytef *)param->des, &pagesize,
-   (const Bytef *)param->compbuf, param->len);
-}
-param->start = false;
+/* uncompress() will return failed in some case, especially
+ * when the page is dirted when doing the compression, it's
+ * not a problem because the dirty page will be retransferred
+ * and uncompress() won't break the data in other pages.
+ */
+uncompress((Bytef *)param->des, &pagesize,
+   (const Bytef *)param->compbuf, param->len);
 }
+param->start = false;
 qemu_mutex_unlock(&param->mutex);
+
+qemu_mutex_lock(&decomp_done_lock);
+param->done = true;
+qemu_cond_signal(&decomp_done_cond);
+qemu_mutex_unlock(&decomp_done_lock);
 }
 
 return NULL;
@@ -2219,10 +2228,13 @@ void migrate_decompress_threads_create(void)
 decompress_threads = g_new0(QemuThread, thread_count);
 decomp_param = g_new0(DecompressParam, thread_count);
 quit_decomp_thread = false;
+qemu_mutex_init(&decomp_done_lock);
+qemu_cond_init(&decomp_done_cond);
 for (i = 0; i < thread_count; i++) {
 qemu_mutex_init(&decomp_param[i].mutex);
 qemu_cond_init(&decomp_param[i].cond);
 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
+decomp_param[i].done = true;
 qemu_thread_create(decompress_threads + i, "decompress",
do_data_decompress, decomp_param + i,
QEMU_THREAD_JOINABLE);
@@ -2258,9 +2270,10 @@ static void decompress_data_with_multi_threads(QEMUFile *f,
 int idx, thread_count;
 
 thread_count = migrate_decompress_threads();
+qemu_mutex_lock(&decomp_done_lock);
 while (true) {
 for (idx = 0; idx < thread_count; idx++) {
-if (!decomp_param[idx].start) {
+if (decomp_param[idx].done) {
 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
 decomp_param[idx].des = host;
 decomp_param[idx].len = len;
@@ -2270,8 +2283,11 @@ static void decompress_data_with_multi_threads(QEMUFile *f,
 }
 if (idx < thread_count) {
 break;
+} else {
+qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
 }
 }
+qemu_mutex_unlock(&decomp_done_lock);
 }
 
 /*
-- 
1.9.1




[Qemu-devel] [PATCH 0/5] live migration bug fix and refine

2016-05-03 Thread Liang Li
This patch set fixes a bug which can block live migration, and another
potential issue, when using multi-thread (de)compression.

The last three patches were submitted before; they are put here together.

Liang Li (5):
  migration: Fix multi-thread compression bug
  migration: Fix a potential issue
  migration: remove useless code
  qemu-file: Fix qemu_put_compression_data flaw
  migration: refine ram_save_compressed_page

 include/migration/migration.h |  1 +
 migration/migration.c |  2 +-
 migration/qemu-file.c | 23 ++-
 migration/ram.c   | 93 ++-
 4 files changed, 88 insertions(+), 31 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH] migration: Fix multi-thread compression bug

2016-05-03 Thread Liang Li
Recently, a bug related to the multi-thread compression feature for
live migration was reported. The destination side will be blocked
during live migration if there is a heavy workload on the host and a
memory-intensive workload in the guest; this is most likely to happen
when there is only one decompression thread.

Some parts of the decompression code are incorrect:
1. The main thread, which receives data from the source side, will
enter a busy loop waiting for a free decompression thread.
2. A lock is needed to protect decomp_param[idx]->start, because it
is checked in the main thread and updated in the decompression
thread.

Fix these two issues by following the code pattern used for compression.

Reported-by: Daniel P. Berrange <berra...@redhat.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 38 +++---
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 3f05738..7ab6ab5 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -263,6 +263,7 @@ typedef struct CompressParam CompressParam;
 
 struct DecompressParam {
 bool start;
+bool done;
 QemuMutex mutex;
 QemuCond cond;
 void *des;
@@ -287,6 +288,8 @@ static bool quit_comp_thread;
 static bool quit_decomp_thread;
 static DecompressParam *decomp_param;
 static QemuThread *decompress_threads;
+static QemuMutex decomp_done_lock;
+static QemuCond decomp_done_cond;
 
 static int do_compress_ram_page(CompressParam *param);
 
@@ -834,6 +837,7 @@ static inline void start_compression(CompressParam *param)
 
 static inline void start_decompression(DecompressParam *param)
 {
+param->done = false;
 qemu_mutex_lock(&param->mutex);
 param->start = true;
 qemu_cond_signal(&param->cond);
@@ -2193,19 +2197,24 @@ static void *do_data_decompress(void *opaque)
 qemu_mutex_lock(&param->mutex);
 while (!param->start && !quit_decomp_thread) {
 qemu_cond_wait(&param->cond, &param->mutex);
+}
+if (!quit_decomp_thread) {
 pagesize = TARGET_PAGE_SIZE;
-if (!quit_decomp_thread) {
-/* uncompress() will return failed in some case, especially
- * when the page is dirted when doing the compression, it's
- * not a problem because the dirty page will be retransferred
- * and uncompress() won't break the data in other pages.
- */
-uncompress((Bytef *)param->des, &pagesize,
-   (const Bytef *)param->compbuf, param->len);
-}
-param->start = false;
+/* uncompress() will return failed in some case, especially
+ * when the page is dirted when doing the compression, it's
+ * not a problem because the dirty page will be retransferred
+ * and uncompress() won't break the data in other pages.
+ */
+uncompress((Bytef *)param->des, &pagesize,
+   (const Bytef *)param->compbuf, param->len);
 }
+param->start = false;
 qemu_mutex_unlock(&param->mutex);
+
+qemu_mutex_lock(&decomp_done_lock);
+param->done = true;
+qemu_cond_signal(&decomp_done_cond);
+qemu_mutex_unlock(&decomp_done_lock);
 }
 
 return NULL;
@@ -2219,10 +2228,13 @@ void migrate_decompress_threads_create(void)
 decompress_threads = g_new0(QemuThread, thread_count);
 decomp_param = g_new0(DecompressParam, thread_count);
 quit_decomp_thread = false;
+qemu_mutex_init(&decomp_done_lock);
+qemu_cond_init(&decomp_done_cond);
 for (i = 0; i < thread_count; i++) {
 qemu_mutex_init(&decomp_param[i].mutex);
 qemu_cond_init(&decomp_param[i].cond);
 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
+decomp_param[i].done = true;
 qemu_thread_create(decompress_threads + i, "decompress",
do_data_decompress, decomp_param + i,
QEMU_THREAD_JOINABLE);
@@ -2258,9 +2270,10 @@ static void decompress_data_with_multi_threads(QEMUFile *f,
 int idx, thread_count;
 
 thread_count = migrate_decompress_threads();
+qemu_mutex_lock(&decomp_done_lock);
 while (true) {
 for (idx = 0; idx < thread_count; idx++) {
-if (!decomp_param[idx].start) {
+if (decomp_param[idx].done) {
 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
 decomp_param[idx].des = host;
 decomp_param[idx].len = len;
@@ -2270,8 +2283,11 @@ static void decompress_data_with_multi_threads(QEMUFile *f,
 }
 if (idx < thread_count) {
 break;
+} else {
+qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
 }
 }
+qemu_mutex_unlock(&decomp_done_lock);
 }
 
 /*
-- 
1.9.1




[Qemu-devel] [PATCH kernel 1/2] mm: add the related functions to build the free page bitmap

2016-04-19 Thread Liang Li
The free page bitmap will be sent to QEMU through the virtio interface
and used for live migration optimization.
Dropping the cache before building the free page bitmap yields more
free pages. Whether to drop the cache is decided by the user.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 fs/drop_caches.c   | 22 ++
 include/linux/fs.h |  1 +
 mm/page_alloc.c| 46 ++
 3 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index d72d52b..f488086 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -50,14 +50,8 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
if (write) {
static int stfu;
 
-   if (sysctl_drop_caches & 1) {
-   iterate_supers(drop_pagecache_sb, NULL);
-   count_vm_event(DROP_PAGECACHE);
-   }
-   if (sysctl_drop_caches & 2) {
-   drop_slab();
-   count_vm_event(DROP_SLAB);
-   }
+   drop_cache(sysctl_drop_caches);
+
if (!stfu) {
pr_info("%s (%d): drop_caches: %d\n",
current->comm, task_pid_nr(current),
@@ -67,3 +61,15 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
}
return 0;
 }
+
+void drop_cache(int drop_ctl)
+{
+   if (drop_ctl & 1) {
+   iterate_supers(drop_pagecache_sb, NULL);
+   count_vm_event(DROP_PAGECACHE);
+   }
+   if (drop_ctl & 2) {
+   drop_slab();
+   count_vm_event(DROP_SLAB);
+   }
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 70e61b5..b8a0bc0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2864,6 +2864,7 @@ extern void drop_super(struct super_block *sb);
 extern void iterate_supers(void (*)(struct super_block *, void *), void *);
 extern void iterate_supers_type(struct file_system_type *,
void (*)(struct super_block *, void *), void *);
+extern void drop_cache(int drop_ctl);
 
 extern int dcache_dir_open(struct inode *, struct file *);
 extern int dcache_dir_close(struct inode *, struct file *);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 59de90d..4799983 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -63,6 +63,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -4029,6 +4030,51 @@ void show_free_areas(unsigned int filter)
show_swap_cache_info();
 }
 
+static void mark_free_pages_bitmap(struct zone *zone,
+   unsigned long *bitmap, unsigned long len)
+{
+   unsigned long pfn, flags, i, limit;
+   unsigned int order, t;
+   struct list_head *curr;
+
+   if (zone_is_empty(zone))
+   return;
+
+   spin_lock_irqsave(&zone->lock, flags);
+
+   limit = min(len, max_pfn);
+   for_each_migratetype_order(order, t) {
+   list_for_each(curr, &zone->free_area[order].free_list[t]) {
+   pfn = page_to_pfn(list_entry(curr, struct page, lru));
+   for (i = 0; i < (1UL << order); i++) {
+   if ((pfn + i) < limit)
+   set_bit_le(pfn + i, bitmap);
+   else
+   break;
+   }
+   }
+   }
+
+   spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+unsigned long get_max_pfn(void)
+{
+   return max_pfn;
+}
+EXPORT_SYMBOL(get_max_pfn);
+
+void get_free_pages(unsigned long *bitmap, unsigned long len, int drop)
+{
+   struct zone *zone;
+
+   drop_cache(drop);
+
+   for_each_populated_zone(zone)
+   mark_free_pages_bitmap(zone, bitmap, len);
+}
+EXPORT_SYMBOL(get_free_pages);
+
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
zoneref->zone = zone;
-- 
1.8.3.1




[Qemu-devel] [PATCH kernel 2/2] virtio-balloon: extend balloon driver to support the new feature

2016-04-19 Thread Liang Li
Extend the virtio balloon to support the new feature
VIRTIO_BALLOON_F_GET_FREE_PAGES, so that we can use it to send the
free page bitmap from the guest to QEMU; the free page bitmap will
be used for live migration optimization.
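
For scale (an illustrative calculation, not part of the patch): a guest
with 8GB of RAM and 4KB pages has 8GB / 4KB = 2,097,152 page frames, so
update_free_pages_stats() below allocates ALIGN(2097152, 64) / 8 =
262,144 bytes, i.e. a 256KB bitmap, small enough to hand to the host in
a single scatter-gather entry.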

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 drivers/virtio/virtio_balloon.c | 100 ++--
 include/uapi/linux/virtio_balloon.h |   1 +
 2 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 7b6d74f..cf17694 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -45,9 +45,17 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
 module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 
+extern void get_free_pages(unsigned long *free_page_bitmap,
+   unsigned long len, int drop);
+extern unsigned long get_max_pfn(void);
+
+struct cache_drop_ctrl {
+   u64 ctrl;
+};
+
 struct virtio_balloon {
struct virtio_device *vdev;
-   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_pages_vq;
 
/* The balloon servicing is delegated to a freezable workqueue. */
struct work_struct update_balloon_stats_work;
@@ -77,6 +85,10 @@ struct virtio_balloon {
unsigned int num_pfns;
u32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
 
+   unsigned long *free_pages;
+   unsigned long bmap_len;
+   struct cache_drop_ctrl cache_drop;
+
/* Memory statistics */
struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
 
@@ -256,6 +268,64 @@ static void update_balloon_stats(struct virtio_balloon *vb)
pages_to_bytes(available));
 }
 
+static void update_free_pages_stats(struct virtio_balloon *vb)
+{
+   unsigned long bitmap_bytes, max_pfn;
+
+   max_pfn = get_max_pfn();
+   bitmap_bytes = ALIGN(max_pfn, BITS_PER_LONG) / 8;
+
+   if (!vb->free_pages)
+   vb->free_pages = kzalloc(bitmap_bytes, GFP_KERNEL);
+   else {
+   if (bitmap_bytes < vb->bmap_len)
+   memset(vb->free_pages, 0, bitmap_bytes);
+   else {
+   kfree(vb->free_pages);
+   vb->free_pages = kzalloc(bitmap_bytes, GFP_KERNEL);
+   }
+   }
+   if (!vb->free_pages) {
+   vb->bmap_len = 0;
+   return;
+   }
+
+   vb->bmap_len = bitmap_bytes;
+   get_free_pages(vb->free_pages, max_pfn, vb->cache_drop.ctrl);
+}
+
+static void free_pages_handle_rq(struct virtio_balloon *vb)
+{
+   struct virtqueue *vq;
+   struct scatterlist sg[2];
+   unsigned int len;
+   struct cache_drop_ctrl *ptr_cache_drop;
+   struct scatterlist sg_in;
+
+   vq = vb->free_pages_vq;
+   ptr_cache_drop = virtqueue_get_buf(vq, &len);
+
+   if (!ptr_cache_drop || len != sizeof(vb->cache_drop))
+   return;
+   update_free_pages_stats(vb);
+   sg_init_table(sg, 2);
+   sg_set_buf(&sg[0], &(vb->bmap_len), sizeof(vb->bmap_len));
+   sg_set_buf(&sg[1], vb->free_pages, vb->bmap_len);
+
+   sg_init_one(&sg_in, &vb->cache_drop, sizeof(vb->cache_drop));
+
+   virtqueue_add_outbuf(vq, &sg[0], 2, vb, GFP_KERNEL);
+   virtqueue_add_inbuf(vq, &sg_in, 1, &vb->cache_drop, GFP_KERNEL);
+   virtqueue_kick(vq);
+}
+
+static void free_pages_rq(struct virtqueue *vq)
+{
+   struct virtio_balloon *vb = vq->vdev->priv;
+
+   free_pages_handle_rq(vb);
+}
+
 /*
  * While most virtqueues communicate guest-initiated requests to the 
hypervisor,
  * the stats queue operates in reverse.  The driver initializes the virtqueue
@@ -392,16 +462,22 @@ static void update_balloon_size_func(struct work_struct *work)
 
 static int init_vqs(struct virtio_balloon *vb)
 {
-   struct virtqueue *vqs[3];
-   vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request 
};
-   static const char * const names[] = { "inflate", "deflate", "stats" };
+   struct virtqueue *vqs[4];
+   vq_callback_t *callbacks[] = { balloon_ack, balloon_ack,
+stats_request, free_pages_rq };
+   const char *names[] = { "inflate", "deflate", "stats", "free_pages" };
int err, nvqs;
 
/*
 * We expect two virtqueues: inflate and deflate, and
 * optionally stat.
 */
-   nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
+   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_GET_FREE_PAGES))
+   nvqs = 4;
+   else
+   nvqs = virtio_has_feature(vb->vdev,
+ VIRTIO_BALLOON_F_STATS_VQ) 

[Qemu-devel] [PATCH kernel 0/2] speed up live migration by skipping free pages

2016-04-19 Thread Liang Li
The current QEMU live migration implementation marks all of the
guest's RAM pages as dirtied in the ram bulk stage; all these pages
will be processed, which consumes quite a lot of CPU cycles and
network bandwidth.

From the guest's point of view, it doesn't care about the content of
free pages. We can make use of this fact and skip processing the free
pages; this saves a lot of CPU cycles and reduces the network traffic
significantly, while obviously speeding up the live migration process.

This patch set is the kernel side implementation.

The virtio-balloon driver is extended to send the free page bitmap
from guest to QEMU.

After getting the free page bitmap, QEMU can use it to filter out
the guest's free pages. This makes the live migration process much
more efficient.

In order to skip more free pages, we add an interface to let the user
decide whether to drop the cache in the guest during live migration.

Liang Li (2):
  mm: add the related functions to build the free page bitmap
  virtio-balloon: extend balloon driver to support the new feature

 drivers/virtio/virtio_balloon.c | 100 ++--
 fs/drop_caches.c|  22 +---
 include/linux/fs.h  |   1 +
 include/uapi/linux/virtio_balloon.h |   1 +
 mm/page_alloc.c |  46 +
 5 files changed, 157 insertions(+), 13 deletions(-)

-- 
1.8.3.1




[Qemu-devel] [PATCH QEMU 2/5] kvm: Add two new arch specific functions

2016-04-19 Thread Liang Li
Add a new function to get the VM's max pfn, and a new function to
filter out the holes so as to get a tight free page bitmap. They are
implemented for x86; all the other arches should implement them for
live migration optimization.
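
A worked example may help; the sizes here are assumed for illustration
only. A guest with 6GB of RAM and below_4g_mem_size = 2GB has RAM at
guest-physical [0, 2GB) and [4GB, 8GB), with a 2GB PCI hole in between.
With 4KB pages and 64-bit longs, tighten_guest_free_page_bmap() then
computes:

    src = bmap + ((1ULL << 32) >> 12) / 64;  /* bmap + 16384: word of pfn 1M   */
    dst = bmap + ((2ULL << 30) >> 12) / 64;  /* bmap +  8192: word of pfn 512K */
    bitmap_move(dst, src, (4ULL << 30) >> 12);       /* shift 1M bits down     */
    bitmap_clear(bmap, (6ULL << 30) >> 12,           /* clear the stale tail   */
                 (2ULL << 30) >> 12);

leaving a tight bitmap whose bits line up with the contiguous RAMBlock
layout that QEMU migrates.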

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 include/sysemu/kvm.h |  2 ++
 target-arm/kvm.c | 14 ++
 target-i386/kvm.c| 35 +++
 target-mips/kvm.c| 14 ++
 target-ppc/kvm.c | 14 ++
 target-s390x/kvm.c   | 14 ++
 6 files changed, 93 insertions(+)

diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 0e18f15..5263304 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -228,6 +228,8 @@ int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
   target_ulong len, int type);
 void kvm_remove_all_breakpoints(CPUState *cpu);
 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap);
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap);
+unsigned long get_guest_max_pfn(void);
 #ifndef _WIN32
 int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset);
 #endif
diff --git a/target-arm/kvm.c b/target-arm/kvm.c
index 3671032..59e9417 100644
--- a/target-arm/kvm.c
+++ b/target-arm/kvm.c
@@ -626,3 +626,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
 {
 return (data - 32) & 0x;
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+/* To be done */
+
+return 0;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+/* To be done */
+
+return bmap;
+}
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 799fdfa..76a33bd 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -3334,3 +3334,38 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
 {
 abort();
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+PCMachineState *pcms = PC_MACHINE(current_machine);
+ram_addr_t above_4g_mem = pcms->above_4g_mem_size;
+unsigned long max_pfn;
+
+if (above_4g_mem) {
+max_pfn = ((1ULL << 32) + above_4g_mem) >> TARGET_PAGE_BITS;
+} else {
+max_pfn = pcms->below_4g_mem_size >> TARGET_PAGE_BITS;
+}
+
+return max_pfn;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+PCMachineState *pcms = PC_MACHINE(current_machine);
+ram_addr_t above_4g_mem = pcms->above_4g_mem_size;
+
+if (above_4g_mem) {
+unsigned long *src, *dst, len, pos;
+ram_addr_t below_4g_mem = pcms->below_4g_mem_size;
+src = bmap + ((1ULL << 32) >> TARGET_PAGE_BITS) / BITS_PER_LONG;
+dst = bmap + (below_4g_mem >> TARGET_PAGE_BITS) / BITS_PER_LONG;
+bitmap_move(dst, src, above_4g_mem >> TARGET_PAGE_BITS);
+
+pos = (above_4g_mem + below_4g_mem) >> TARGET_PAGE_BITS;
+len = ((1ULL << 32) - below_4g_mem) >> TARGET_PAGE_BITS;
+bitmap_clear(bmap, pos, len);
+}
+
+return bmap;
+}
diff --git a/target-mips/kvm.c b/target-mips/kvm.c
index 950bc05..23fdc50 100644
--- a/target-mips/kvm.c
+++ b/target-mips/kvm.c
@@ -1048,3 +1048,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
 {
 abort();
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+/* To be done */
+
+return 0;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+/* To be done */
+
+return bmap;
+}
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index c4c8146..bf4c60f 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -2579,3 +2579,17 @@ int kvmppc_enable_hwrng(void)
 
 return kvmppc_enable_hcall(kvm_state, H_RANDOM);
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+/* To be done */
+
+return 0;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+/* To be done */
+
+return bmap;
+}
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index e1859ca..7f5e1b8 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -2250,3 +2250,17 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
 {
 abort();
 }
+
+unsigned long get_guest_max_pfn(void)
+{
+/* To be done */
+
+return 0;
+}
+
+unsigned long *tighten_guest_free_page_bmap(unsigned long *bmap)
+{
+/* To be done */
+
+return bmap;
+}
-- 
1.8.3.1




[Qemu-devel] [PATCH QEMU 4/5] migration: filter out free pages during live migration

2016-04-19 Thread Liang Li
After sending out the request for free pages, the live migration
process will start without waiting for the free page bitmap to be
ready. If the free page bitmap is not ready when doing the first
migration_bitmap_sync() after ram_save_setup(), the free page
bitmap will be ignored; this means the free pages will not be
filtered out in this case.
The current implementation cannot work with postcopy: if postcopy
is enabled, we simply ignore the free pages. Will make it work
later.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 101 
 1 file changed, 101 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index 3f05738..3944426 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -41,6 +41,8 @@
 #include "trace.h"
 #include "exec/ram_addr.h"
 #include "qemu/rcu_queue.h"
+#include "sysemu/balloon.h"
+#include "sysemu/kvm.h"
 
 #ifdef DEBUG_MIGRATION_RAM
 #define DPRINTF(fmt, ...) \
@@ -226,6 +228,8 @@ static QemuMutex migration_bitmap_mutex;
 static uint64_t migration_dirty_pages;
 static uint32_t last_version;
 static bool ram_bulk_stage;
+static bool ignore_freepage_rsp;
+static bool drop_page_cache;
 
 /* used by the search for pages to send */
 struct PageSearchStatus {
@@ -242,6 +246,7 @@ static struct BitmapRcu {
 struct rcu_head rcu;
 /* Main migration bitmap */
 unsigned long *bmap;
+unsigned long *free_page_bmap;
 /* bitmap of pages that haven't been sent even once
  * only maintained and used in postcopy at the moment
  * where it's used to send the dirtymap at the start
@@ -639,6 +644,7 @@ static void migration_bitmap_sync(void)
 rcu_read_unlock();
 qemu_mutex_unlock(_bitmap_mutex);
 
+ignore_freepage_rsp = true;
 trace_migration_bitmap_sync_end(migration_dirty_pages
 - num_dirty_pages_init);
 num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
@@ -1417,6 +1423,9 @@ static void migration_bitmap_free(struct BitmapRcu *bmap)
 {
 g_free(bmap->bmap);
 g_free(bmap->unsentmap);
+if (balloon_free_pages_support() && !migrate_postcopy_ram()) {
+g_free(bmap->free_page_bmap);
+}
 g_free(bmap);
 }
 
@@ -1487,6 +1496,90 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
 }
 }
 
+static void filter_out_guest_free_page(unsigned long *free_page_bmap,
+   long nbits)
+{
+long i, page_count = 0, len;
+unsigned long *bitmap;
+
+tighten_guest_free_page_bmap(free_page_bmap);
+qemu_mutex_lock(&migration_bitmap_mutex);
+bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+slow_bitmap_complement(bitmap, free_page_bmap, nbits);
+
+len = (last_ram_offset() >> TARGET_PAGE_BITS) / BITS_PER_LONG;
+for (i = 0; i < len; i++) {
+page_count += hweight_long(bitmap[i]);
+}
+
+migration_dirty_pages = page_count;
+qemu_mutex_unlock(&migration_bitmap_mutex);
+}
+
+static void ram_request_free_page(unsigned long *bmap, unsigned long max_pfn)
+{
+FreePageStatus status;
+
+/* drop_page_cache should be set by the user; the related code will be
+ * added later, set it to true temporarily.
+ */
+drop_page_cache = true;
+
+status = balloon_get_free_pages(bmap, max_pfn, drop_page_cache);
+switch (status) {
+case FREE_PAGE_REQ:
+ignore_freepage_rsp = false;
+break;
+case FREE_PAGE_ERROR:
+error_report("Errro happend when request free page");
+break;
+default:
+error_report("unexpected response status: %d", status);
+break;
+}
+}
+
+static void ram_handle_free_page(void)
+{
+unsigned long nbits;
+RAMBlock *pc_ram_block;
+FreePageStatus status;
+
+status = balloon_get_free_pages(migration_bitmap_rcu->free_page_bmap,
+get_guest_max_pfn(), drop_page_cache);
+switch (status) {
+case FREE_PAGE_READY:
+rcu_read_lock();
+pc_ram_block = QLIST_FIRST_RCU(&ram_list.blocks);
+nbits = pc_ram_block->used_length >> TARGET_PAGE_BITS;
+filter_out_guest_free_page(migration_bitmap_rcu->free_page_bmap, nbits);
+rcu_read_unlock();
+
+qemu_mutex_lock_iothread();
+migration_bitmap_sync();
+qemu_mutex_unlock_iothread();
+/*
+ * bulk stage assumes in (migration_bitmap_find_and_reset_dirty) that
+ * every page is dirty, that's no longer true at this point.
+ */
+ram_bulk_stage = false;
+last_seen_block = NULL;
+last_sent_block = NULL;
+last_offset = 0;
+break;
+case FREE_PAGE_ERROR:
+ignore_freepage_rsp = true;
+error_report("failed to get free page");
+break;
+case FREE_PAGE_INVALID_PARAM:
+ignore_freepage_rsp = true;
+err

[Qemu-devel] [PATCH QEMU 5/5] migration: Add the interface for cache drop control

2016-04-19 Thread Liang Li
Whether to drop the cache, and which kind of cache to drop, depends
on the user. Add the related QMP and HMP interfaces to query and set
the cache control value.
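
As a usage sketch (a hypothetical session; the exact parameter spelling
is defined by the qapi-schema.json hunk, which this mail does not show
in full), the value would be set from the HMP monitor before starting
migration:

    (qemu) migrate_set_parameter x_drop_cache 3
    (qemu) migrate -d tcp:dest-host:4444

where, per the comment added in migration.c below, 0 means no drop,
1 drops the clean page cache, 2 drops the slab cache, and 3 drops both.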

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 hmp.c |  8 
 include/migration/migration.h |  1 +
 migration/migration.c | 31 ++-
 migration/ram.c   | 10 ++
 qapi-schema.json  | 25 ++---
 5 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/hmp.c b/hmp.c
index d510236..17f418e 100644
--- a/hmp.c
+++ b/hmp.c
@@ -286,6 +289,9 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
 monitor_printf(mon, " %s: %" PRId64,
 
MigrationParameter_lookup[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT],
 params->x_cpu_throttle_increment);
+monitor_printf(mon, " %s: %" PRId64,
+MigrationParameter_lookup[MIGRATION_PARAMETER_X_DROP_CACHE],
+params->x_drop_cache);
 monitor_printf(mon, "\n");
 }
 
@@ -1242,6 +1245,7 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
 bool has_decompress_threads = false;
 bool has_x_cpu_throttle_initial = false;
 bool has_x_cpu_throttle_increment = false;
+bool has_x_drop_cache = false;
 int i;
 
 for (i = 0; i < MIGRATION_PARAMETER__MAX; i++) {
@@ -1262,12 +1266,16 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
 case MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT:
 has_x_cpu_throttle_increment = true;
 break;
+case MIGRATION_PARAMETER_X_DROP_CACHE:
+has_x_drop_cache = true;
+break;
 }
 qmp_migrate_set_parameters(has_compress_level, value,
has_compress_threads, value,
has_decompress_threads, value,
has_x_cpu_throttle_initial, value,
has_x_cpu_throttle_increment, value,
+   has_x_drop_cache, value,
 &err);
 break;
 }
diff --git a/include/migration/migration.h b/include/migration/migration.h
index ac2c12c..873e3bc 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -283,6 +283,7 @@ bool migrate_use_compression(void);
 int migrate_compress_level(void);
 int migrate_compress_threads(void);
 int migrate_decompress_threads(void);
+int migrate_drop_cache(void);
 bool migrate_use_events(void);
 
 /* Sending on the return path - generic and then for each message type */
diff --git a/migration/migration.c b/migration/migration.c
index 991313a..ecd07b8 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -52,6 +52,13 @@
 /* Define default autoconverge cpu throttle migration parameters */
 #define DEFAULT_MIGRATE_X_CPU_THROTTLE_INITIAL 20
 #define DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT 10
+/* Default cache drop control
+ * 0: no drop
+ * 1: drop clean page cache
+ * 2: drop slab cache
+ * 3: drop both clean and slab cache
+ */
+#define DEFAULT_MIGRATE_X_DROP_CACHE 0
 
 /* Migration XBZRLE default cache size */
 #define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024)
@@ -91,6 +98,8 @@ MigrationState *migrate_get_current(void)
 DEFAULT_MIGRATE_X_CPU_THROTTLE_INITIAL,
 .parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT] =
 DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT,
+.parameters[MIGRATION_PARAMETER_X_DROP_CACHE] =
+DEFAULT_MIGRATE_X_DROP_CACHE,
 };
 
 if (!once) {
@@ -525,6 +534,7 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
 s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL];
 params->x_cpu_throttle_increment =
 s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT];
+params->x_drop_cache = s->parameters[MIGRATION_PARAMETER_X_DROP_CACHE];
 
 return params;
 }
@@ -721,7 +731,9 @@ void qmp_migrate_set_parameters(bool has_compress_level,
 bool has_x_cpu_throttle_initial,
 int64_t x_cpu_throttle_initial,
 bool has_x_cpu_throttle_increment,
-int64_t x_cpu_throttle_increment, Error **errp)
+int64_t x_cpu_throttle_increment,
+bool has_x_drop_cache,
+int64_t x_drop_cache, Error **errp)
 {
 MigrationState *s = migrate_get_current();
 
@@ -756,6 +768,11 @@ void qmp_migrate_set_parameters(bool has_compress_level,
"x_cpu_throttle_increment",
"an integer in the range of 1 to 99");
 }
+if (h

[Qemu-devel] [PATCH QEMU 3/5] virtio-balloon: Add a new feature to balloon device

2016-04-19 Thread Liang Li
Extend the virtio balloon device to support a new feature; this
new feature can help to get the guest's free page information, which
can be used for live migration optimization.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 balloon.c   | 29 +++-
 hw/virtio/virtio-balloon.c  | 92 -
 include/hw/virtio/virtio-balloon.h  | 30 +++-
 include/standard-headers/linux/virtio_balloon.h |  1 +
 include/sysemu/balloon.h| 12 +++-
 5 files changed, 159 insertions(+), 5 deletions(-)

diff --git a/balloon.c b/balloon.c
index f2ef50c..346e215 100644
--- a/balloon.c
+++ b/balloon.c
@@ -36,6 +36,7 @@
 
 static QEMUBalloonEvent *balloon_event_fn;
 static QEMUBalloonStatus *balloon_stat_fn;
+static QEMUBalloonGetFreePage *balloon_get_free_page_fn;
 static void *balloon_opaque;
 static bool balloon_inhibited;
 
@@ -65,9 +66,12 @@ static bool have_balloon(Error **errp)
 }
 
 int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
- QEMUBalloonStatus *stat_func, void *opaque)
+ QEMUBalloonStatus *stat_func,
+ QEMUBalloonGetFreePage *get_free_page_func,
+ void *opaque)
 {
-if (balloon_event_fn || balloon_stat_fn || balloon_opaque) {
+if (balloon_event_fn || balloon_stat_fn || balloon_get_free_page_fn
+|| balloon_opaque) {
 /* We're already registered one balloon handler.  How many can
  * a guest really have?
  */
@@ -75,6 +79,7 @@ int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
 }
 balloon_event_fn = event_func;
 balloon_stat_fn = stat_func;
+balloon_get_free_page_fn = get_free_page_func;
 balloon_opaque = opaque;
 return 0;
 }
@@ -86,6 +91,7 @@ void qemu_remove_balloon_handler(void *opaque)
 }
 balloon_event_fn = NULL;
 balloon_stat_fn = NULL;
+balloon_get_free_page_fn = NULL;
 balloon_opaque = NULL;
 }
 
@@ -116,3 +122,22 @@ void qmp_balloon(int64_t target, Error **errp)
 trace_balloon_event(balloon_opaque, target);
 balloon_event_fn(balloon_opaque, target);
 }
+
+bool balloon_free_pages_support(void)
+{
+return balloon_get_free_page_fn ? true : false;
+}
+
+FreePageStatus balloon_get_free_pages(unsigned long *bitmap,
+  unsigned long len, int drop_cache)
+{
+if (!balloon_get_free_page_fn) {
+return FREE_PAGE_UNSUPPORT;
+}
+
+if (!bitmap) {
+return FREE_PAGE_INVALID_PARAM;
+}
+
+return balloon_get_free_page_fn(balloon_opaque, bitmap, len, drop_cache);
+}
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index 9dbe681..0abf375 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -78,6 +78,12 @@ static bool balloon_stats_supported(const VirtIOBalloon *s)
 return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
 }
 
+static bool balloon_free_pages_supported(const VirtIOBalloon *s)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(s);
+return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_GET_FREE_PAGE);
+}
+
 static bool balloon_stats_enabled(const VirtIOBalloon *s)
 {
 return s->stats_poll_interval > 0;
@@ -304,6 +310,38 @@ out:
 }
 }
 
+static void virtio_balloon_get_free_pages(VirtIODevice *vdev, VirtQueue *vq)
+{
+VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+VirtQueueElement *elem;
+size_t offset = 0;
+uint64_t bitmap_bytes = 0;
+
+elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+if (!elem) {
+s->req_status = REQ_ERROR;
+return;
+}
+
+s->free_page_vq_elem = elem;
+
+if (!elem->out_num) {
+return;
+}
+
+iov_to_buf(elem->out_sg, elem->out_num, offset,
+   &bitmap_bytes, sizeof(uint64_t));
+
+if (s->bmap_len < bitmap_bytes) {
+s->req_status = REQ_INVALID_PARAM;
+return;
+}
+offset += sizeof(uint64_t);
+iov_to_buf(elem->out_sg, elem->out_num, offset,
+   s->free_page_bmap, bitmap_bytes);
+s->req_status = REQ_DONE;
+}
+
 static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
 {
 VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
@@ -373,6 +411,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
 VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
 f |= dev->host_features;
 virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ);
+virtio_add_feature(&f, VIRTIO_BALLOON_F_GET_FREE_PAGE);
 return f;
 }
 
@@ -383,6 +422,48 @@ static void virtio_balloon_stat(void *opaque, BalloonInfo *info)
  VIRTIO_BALLOON_PFN_SHIFT);
 }
 
+static FreePageStatus virtio_balloon_free_pages(void *opaque,
+unsigned long *bitmap,
+   

[Qemu-devel] [PATCH QEMU 1/5] bitmap: Add a new bitmap_move function

2016-04-19 Thread Liang Li
Sometimes it is necessary to move a portion of a bitmap to another
place in a large bitmap. If the source and destination overlap,
bitmap_copy cannot work correctly, so we need a new function to do
this work.
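
A sketch of the intended use (illustrative values; assumes QEMU's
include/qemu/bitmap.h with the function added below, on a 64-bit host
where BITS_PER_LONG is 64): shift bits downward within one bitmap,
where source and destination overlap:

    unsigned long bmap[6];
    /* suppose bits [128, 384) hold data that must be moved to bit 64 */
    bitmap_move(bmap + 1, bmap + 2, 256);  /* dst = bit 64, src = bit 128 */
    bitmap_clear(bmap, 320, 64);           /* clear the stale tail        */

bitmap_copy() is memcpy()-based and would corrupt the overlapping
words; bitmap_move() uses memmove() and handles them. Note that, like
bitmap_copy(), it can only move ranges that start on an unsigned long
boundary.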

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 include/qemu/bitmap.h | 13 +
 1 file changed, 13 insertions(+)

diff --git a/include/qemu/bitmap.h b/include/qemu/bitmap.h
index 0e33fa5..ce07444 100644
--- a/include/qemu/bitmap.h
+++ b/include/qemu/bitmap.h
@@ -38,6 +38,7 @@
  * bitmap_set(dst, pos, nbits) Set specified bit area
  * bitmap_set_atomic(dst, pos, nbits)   Set specified bit area with atomic ops
  * bitmap_clear(dst, pos, nbits)   Clear specified bit area
+ * bitmap_move(dst, src, nbits) Move *src to *dst
  * bitmap_test_and_clear_atomic(dst, pos, nbits)Test and clear area
  * bitmap_find_next_zero_area(buf, len, pos, n, mask)  Find bit free area
  */
@@ -137,6 +138,18 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
 }
 }
 
+static inline void bitmap_move(unsigned long *dst, const unsigned long *src,
+   long nbits)
+{
+if (small_nbits(nbits)) {
+unsigned long tmp = *src;
+*dst = tmp;
+} else {
+long len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+memmove(dst, src, len);
+}
+}
+
 static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
  const unsigned long *src2, long nbits)
 {
-- 
1.8.3.1




[Qemu-devel] [PATCH QEMU 0/5] speed up live migration by skipping free pages

2016-04-19 Thread Liang Li
The current QEMU live migration implementation marks all of the
guest's RAM pages as dirtied in the ram bulk stage; all these pages
will be processed, which consumes quite a lot of CPU cycles and
network bandwidth.

From the guest's point of view, it doesn't care about the content of
free pages. We can make use of this fact and skip processing the free
pages; this saves a lot of CPU cycles and reduces the network traffic
significantly, while obviously speeding up the live migration process.

This patch set is the QEMU side implementation.

The virtio-balloon is extended so that QEMU can get the free pages
information from the guest.

After getting the free page bitmap, QEMU can use it to filter out
the guest's free pages. This makes the live migration process much
more efficient.

In order to skip more free pages, we add an interface to let the user
decide whether to drop the cache in the guest during live migration.

Performance data


Test environment:

CPU: Intel (R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
Host RAM: 64GB
Host Linux Kernel:  4.4.0   Host OS: CentOS 7.1
Guest Linux Kernel:  4.5.rc6    Guest OS: CentOS 6.6
Network:  Intel X710 with 10 Gigabit connection
Guest RAM: 8GB

Case 1: Idle guest just boots:
====================================================
                    |  original  |  skip free page
----------------------------------------------------
total time(ms)      |      1505  |             573
----------------------------------------------------
transferred ram(KB) |    399792  |          379057
====================================================

Case 2: The guest has run a memory-consuming workload, which is
terminated before live migration.
====================================================
                    |  original  |  skip free page
----------------------------------------------------
total time(ms)      |     10641  |             597
----------------------------------------------------
transferred ram(KB) |   8350829  |          389900
====================================================

Case 3: The guest has built the Linux kernel; the build completed
before live migration. The page cache is dropped during live
migration in this case.
====================================================
                    |  original  |  skip free page
----------------------------------------------------
total time(ms)      |      2636  |             914
----------------------------------------------------
transferred ram(KB) |   1316747  |          421980
====================================================

Liang Li (5):
  bitmap: Add a new bitmap_move function
  kvm: Add two new arch specific functions
  virtio-balloon: Add a new feature to balloon device
  migration: filter out free pages during live migration
  migration: Add the interface for cache drop control

 balloon.c   | 29 +++-
 hmp.c   |  8 +++
 hw/virtio/virtio-balloon.c  | 92 +++-
 include/hw/virtio/virtio-balloon.h  | 30 +++-
 include/migration/migration.h   |  1 +
 include/qemu/bitmap.h   | 13 
 include/standard-headers/linux/virtio_balloon.h |  1 +
 include/sysemu/balloon.h| 12 +++-
 include/sysemu/kvm.h|  2 +
 migration/migration.c   | 31 +++-
 migration/ram.c | 95 +
 qapi-schema.json| 25 ++-
 target-arm/kvm.c| 14 
 target-i386/kvm.c   | 35 +
 target-mips/kvm.c   | 14 
 target-ppc/kvm.c| 14 
 target-s390x/kvm.c  | 14 
 17 files changed, 421 insertions(+), 9 deletions(-)

-- 
1.8.3.1




[Qemu-devel] [PATCH] migration: remove useless code

2016-04-18 Thread Liang Li
page_buffer is set twice in a row; remove the redundant first assignment.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/migration/ram.c b/migration/ram.c
index 3f05738..31d40f4 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2322,7 +2322,6 @@ static int ram_load_postcopy(QEMUFile *f)
 ret = -EINVAL;
 break;
 }
-page_buffer = host;
 /*
  * Postcopy requires that we place whole host pages atomically.
  * To make it atomic, the data is read into a temporary page
-- 
1.9.1




[Qemu-devel] [RFC Design Doc] Speed up live migration by skipping free pages

2016-03-22 Thread Liang Li
I have sent the RFC version patch set for live migration optimization
by skipping processing the free pages in the ram bulk stage and
received a lot of comments. The related threads can be found at:

https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00715.html
https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00714.html
https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00717.html
https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00716.html
https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00718.html

https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00719.html 
https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00720.html
https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg00721.html

To make things easier, I wrote this doc about the possible designs
and my choices. Comments are welcome! 

Content
===
1. Background
2. Why not use virtio-balloon
3. Virtio interface
4. Constructing free page bitmap
5. Tighten free page bitmap
6. Handling page cache in the guest
7. APIs for live migration
8. Pseudo code

Details
===
1. Background
As we know, the current QEMU live migration implementation marks all of
the guest's RAM pages as dirtied in the ram bulk stage. All these pages
will be checked for zero content first, and whether the page content is
sent to the destination depends on the result of that check; this
process consumes quite a lot of CPU cycles and network bandwidth.

From the guest's point of view, there are some pages currently not used
by the guest, and the guest doesn't care about the content in these
pages. Free pages are exactly this kind of page. We can make use of
this fact and skip processing the free pages in the ram bulk stage;
this saves a lot of CPU cycles and reduces the network traffic while
obviously speeding up the live migration process.

Usually, only the guest has the information about its free pages. But
it's possible to let the guest tell QEMU its free page information by
some mechanism, e.g. through the virtio interface. Once QEMU gets the
free page information, it can skip processing these free pages in the
ram bulk stage by clearing the corresponding bits of the migration
bitmap.
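As an illustration, the filtering step amounts to a bitwise AND-NOT over
the two bitmaps (a minimal sketch with hypothetical names; the actual
QEMU patches in this archive do the equivalent via
slow_bitmap_complement() over the dirty-memory blocks):

#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* migration_bmap: 1 = page must be sent; free_bmap: 1 = the guest
 * reported the page as free.  Clear every bit whose page is free. */
static void filter_free_pages(unsigned long *migration_bmap,
                              const unsigned long *free_bmap,
                              unsigned long nbits)
{
    unsigned long i;
    unsigned long nlongs = (nbits + BITS_PER_LONG - 1) / BITS_PER_LONG;

    for (i = 0; i < nlongs; i++) {
        migration_bmap[i] &= ~free_bmap[i];
    }
}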

2. Why not use virtio-balloon
Actually, virtio-balloon can do a similar thing by inflating the
balloon before live migration, but its performance is poor: for an 8GB
idle guest that has just booted, it takes about 5.7 seconds to inflate
the balloon to 7GB, while it takes only 25ms to get a valid free page
bitmap from the guest. There are several reasons for the poor
performance of virtio-balloon:
a. allocating pages (5%, 304ms)
b. sending PFNs to host (71%, 4194ms)
c. address translation and madvise() operation (24%, 1423ms)
Debugging shows the time spent on these operations is listed in the
brackets above. By changing VIRTIO_BALLOON_ARRAY_PFNS_MAX to a large
value, such as 16384, the time spent on sending the PFNs can be reduced
to about 400ms, but it's still too long.

Obviously, the virtio-balloon mechanism has a bigger performance impact
on the guest than the approach we are trying to implement.

3. Virtio interface
There are three different ways of using the virtio interface to
send the free page information.
a. Extend the current virtio device
The virtio spec has already defined some virtio devices, and we can
extend one of these devices so as to use it to transport the free page
information. It requires modifying the virtio spec.

b. Implement a new virtio device
Implementing a brand new virtio device to exchange information
between host and guest is another choice. It requires modifying the
virtio spec too.

c. Make use of virtio-serial (Amit's suggestion, my choice)
It's possible to make use of virtio-serial for communication between
host and guest; the benefit of this solution is that there is no need
to modify the virtio spec.

4. Construct free page bitmap
To minimize the space for saving the free page information, it's better
to use a bitmap to describe the free pages. There are two ways to
construct the free page bitmap.

a. Construct the free page bitmap on demand (my choice)
The guest can allocate memory for the free page bitmap only when it
receives the request from QEMU, and set the free page bitmap by
traversing the free page list, as sketched below. The advantage of this
way is that it's quite simple and easy to implement. The disadvantage
is that the traversing operation may take quite a long time when there
are a lot of free pages. (About 20ms for 7GB of free pages.)
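In outline, option (a) reduces to the following userspace model
(hypothetical types and helpers; the real in-kernel version posted
elsewhere in this archive walks zone->free_area[order].free_list[t]
under the zone lock):

#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

struct free_block {            /* models one entry on a buddy free list */
    unsigned long pfn;         /* first page frame number of the block */
    unsigned int order;        /* the block covers 1UL << order pages */
};

static void set_pfn_bit(unsigned long *bmap, unsigned long pfn)
{
    bmap[pfn / BITS_PER_LONG] |= 1UL << (pfn % BITS_PER_LONG);
}

/* Walk a snapshot of the free lists and mark every free page. */
static void build_free_page_bitmap(const struct free_block *blocks,
                                   unsigned long nblocks,
                                   unsigned long *bmap)
{
    unsigned long n, i;

    for (n = 0; n < nblocks; n++) {
        for (i = 0; i < (1UL << blocks[n].order); i++) {
            set_pfn_bit(bmap, blocks[n].pfn + i);
        }
    }
}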

b. Update free page bitmap when allocating/freeing pages
Another choice is to allocate the memory for the free page bitmap when
the guest boots, and then update the free page bitmap when
allocating/freeing pages, as sketched below. It needs more modification
to the memory management code in the guest. The advantage of this way
is that the guest can respond to QEMU's request for a free page bitmap
very quickly, no matter how many free pages there are in the guest. Do
the kernel guys like this?
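Option (b) would instead hook the allocation and free paths; roughly
like the sketch below (the hook names are hypothetical, not an existing
kernel API; set_pfn_bit() and BITS_PER_LONG are from the sketch above):

/* The bitmap is allocated once at boot and kept up to date, so a
 * request from QEMU can be answered immediately. */
static unsigned long *live_free_bmap;

static void clear_pfn_bit(unsigned long *bmap, unsigned long pfn)
{
    bmap[pfn / BITS_PER_LONG] &= ~(1UL << (pfn % BITS_PER_LONG));
}

static void on_pages_allocated(unsigned long pfn, unsigned int order)
{
    unsigned long i;

    for (i = 0; i < (1UL << order); i++) {
        clear_pfn_bit(live_free_bmap, pfn + i);  /* no longer free */
    }
}

static void on_pages_freed(unsigned long pfn, unsigned int order)
{
    unsigned long i;

    for (i = 0; i < (1UL << order); i++) {
        set_pfn_bit(live_free_bmap, pfn + i);    /* free again */
    }
}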

5. Tighten free page bitmap

[Qemu-devel] [PATCH v6 2/2] cutils: add avx2 instruction optimization

2016-03-07 Thread Liang Li
buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms that
support AVX2 instructions, using AVX2 can improve the performance of
buffer_find_nonzero_offset() by about 30% compared to SSE2.

Live migration can be faster with this optimization; the test result
shows that for an 8GiB RAM idle guest that has just booted, this patch
can help to shorten the total live migration time by about 6%.

This patch uses the ifunc mechanism to select the proper function at
run time: on platforms that support AVX2, the AVX2 version is executed;
otherwise, the original version is executed.
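The ifunc mechanism in isolation looks like this (a minimal sketch with
hypothetical names, not the patch itself; __builtin_cpu_supports() is a
GCC built-in, and the resolver runs once at load time, before ordinary
constructors):

#include <stddef.h>

/* Placeholder implementations; the real ones scan the buffer. */
static size_t impl_generic(const void *buf, size_t len)
{
    (void)buf;
    return len;
}

static size_t impl_avx2(const void *buf, size_t len)
{
    (void)buf;
    return len;
}

/* The dynamic loader calls the resolver once and binds the symbol to
 * whichever implementation it returns. */
static size_t (*resolve_find_nonzero(void))(const void *, size_t)
{
    __builtin_cpu_init();  /* required before __builtin_cpu_supports here */
    return __builtin_cpu_supports("avx2") ? impl_avx2 : impl_generic;
}

size_t find_nonzero(const void *buf, size_t len)
    __attribute__((ifunc("resolve_find_nonzero")));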

Signed-off-by: Liang Li <liang.z...@intel.com>
Suggested-by: Paolo Bonzini <pbonz...@redhat.com>
Suggested-by: Richard Henderson <r...@twiddle.net>
Reviewed-by: Paolo Bonzini <pbonz...@redhat.com>
---
 include/qemu-common.h |   8 +---
 util/cutils.c | 124 --
 2 files changed, 121 insertions(+), 11 deletions(-)

diff --git a/include/qemu-common.h b/include/qemu-common.h
index ced2994..887ca71 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -476,13 +476,7 @@ void qemu_hexdump(const char *buf, FILE *fp, const char 
*prefix, size_t size);
 #endif
 
 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-static inline bool
-can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-   * sizeof(VECTYPE)) == 0
-&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
 size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 
 /*
diff --git a/util/cutils.c b/util/cutils.c
index 59e1f70..c3dd534 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -160,6 +160,14 @@ int qemu_fdatasync(int fd)
 #endif
 }
 
+static bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+{
+return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+   * sizeof(VECTYPE)) == 0
+&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
+}
+
 /*
  * Searches for an area with non-zero content in a buffer
  *
@@ -168,8 +176,8 @@ int qemu_fdatasync(int fd)
  * and addr must be a multiple of sizeof(VECTYPE) due to
  * restriction of optimizations in this function.
  *
- * can_use_buffer_find_nonzero_offset() can be used to check
- * these requirements.
+ * can_use_buffer_find_nonzero_offset_inner() can be used to
+ * check these requirements.
  *
  * The return value is the offset of the non-zero area rounded
  * down to a multiple of sizeof(VECTYPE) for the first
@@ -180,13 +188,13 @@ int qemu_fdatasync(int fd)
  * If the buffer is all zero the return value is equal to len.
  */
 
-size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
 {
 const VECTYPE *p = buf;
 const VECTYPE zero = (VECTYPE){0};
 size_t i;
 
-assert(can_use_buffer_find_nonzero_offset(buf, len));
+assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
 
 if (!len) {
 return 0;
@@ -216,6 +224,114 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t 
len)
 }
 
 /*
+ * GCC before version 4.9 has a bug which causes the target
+ * attribute to work incorrectly and the build to fail in some
+ * cases, so restrict the optimization to GCC 4.9+.
+ */
+
+#if defined CONFIG_AVX2_OPT && QEMU_GNUC_PREREQ(4, 9)
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#include <immintrin.h>
+#include <cpuid.h>
+
+#define AVX2_VECTYPE __m256i
+#define AVX2_SPLAT(p)   _mm256_set1_epi8(*(p))
+#define AVX2_ALL_EQ(v1, v2) \
+(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
+#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
+
+static bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+   * sizeof(AVX2_VECTYPE)) == 0
+&& ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
+}
+
+static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+const AVX2_VECTYPE *p = buf;
+const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
+size_t i;
+
+assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
+
+if (!len) {
+return 0;
+}
+
+for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+if (!AVX2_ALL_EQ(p[i], zero)) {
+return i * sizeof(AVX2_VECTYPE);
+}
+}
+
+for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+ i < len / sizeof(AVX2_VECTYPE);
+ i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
+AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
+AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
+AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
+AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
+AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
+if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
+break;
+}
+}
+
+return i * sizeof(AVX2_VECTYPE);
+}

[Qemu-devel] [PATCH v6 0/2] add avx2 instruction optimization

2016-03-07 Thread Liang Li
buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms that
support AVX2 instructions, using AVX2 can improve the performance of
zero page checking by about 30% compared to SSE2.
Live migration can be faster with this optimization; the test result
shows that for an 8GB RAM idle guest, this patch can help to shorten
the total live migration time by about 6%.

This patch uses the ifunc mechanism to select the proper function at
run time: on platforms that support AVX2, the AVX2 version is executed;
otherwise, the original version is executed.

With this patch, the QEMU binary can run on platforms with or without
AVX2 support.

Compilers which don't support AVX2 and the ifunc attribute can also
build the source code successfully.

v5 -> v6 changes:
  * Restrict the optimization to GCC 4.9+ to prevent compile failures
in some cases (Paolo's suggestion)
v4 -> v5 changes:
  * Enhance the ifunc attribute detection (Paolo's suggestion)

v3 -> v4 changes:
  * Use the GCC #pragma to make things simple (Paolo's suggestion) 
  * Put avx2 related code in cutils.c (Richard's suggestion)
  * Change the configure, detect ifunc and avx2 attributes together

v2 -> v3 changes:
  * Detect the ifunc attribute support (Paolo's suggestion) 
  * Use the ifunc attribute instead of the inline asm (Richard's suggestion)
  * Change the configure (Juan's suggestion)
Liang Li (2):
  configure: detect ifunc and avx2 attribute
  cutils: add avx2 instruction optimization

 configure |  21 +
 include/qemu-common.h |   8 +---
 util/cutils.c | 124 --
 3 files changed, 142 insertions(+), 11 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH v6 1/2] configure: detect ifunc and avx2 attribute

2016-03-07 Thread Liang Li
Detect whether the compiler supports ifunc and avx2; if so, set
CONFIG_AVX2_OPT, which will be used to turn on the avx2 instruction
optimization.

Suggested-by: Paolo Bonzini <pbonz...@redhat.com>
Suggested-by: Peter Maydell <peter.mayd...@linaro.org>
Signed-off-by: Liang Li <liang.z...@intel.com>
---
 configure | 21 +
 1 file changed, 21 insertions(+)

diff --git a/configure b/configure
index 0c0472a..2b32876 100755
--- a/configure
+++ b/configure
@@ -280,6 +280,7 @@ libusb=""
 usb_redir=""
 opengl=""
 opengl_dmabuf="no"
+avx2_opt="no"
 zlib="yes"
 lzo=""
 snappy=""
@@ -1773,6 +1774,21 @@ EOF
 fi
 
 ##
+# avx2 optimization requirement check
+
+cat > $TMPC << EOF
+static void bar(void) {}
+static void *bar_ifunc(void) {return (void*) bar;}
+static void foo(void) __attribute__((ifunc("bar_ifunc")));
+int main(void) { foo(); return 0; }
+EOF
+if compile_prog "-mavx2" "" ; then
+if readelf --syms $TMPE |grep "IFUNC.*foo" >/dev/null 2>&1; then
+avx2_opt="yes"
+fi
+fi
+
+#
 # zlib check
 
 if test "$zlib" != "no" ; then
@@ -4790,6 +4806,7 @@ echo "bzip2 support $bzip2"
 echo "NUMA host support $numa"
 echo "tcmalloc support  $tcmalloc"
 echo "jemalloc support  $jemalloc"
+echo "avx2 optimization $avx2_opt"
 
 if test "$sdl_too_old" = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -5178,6 +5195,10 @@ if test "$opengl" = "yes" ; then
   fi
 fi
 
+if test "$avx2_opt" = "yes" ; then
+  echo "CONFIG_AVX2_OPT=y" >> $config_host_mak
+fi
+
 if test "$lzo" = "yes" ; then
   echo "CONFIG_LZO=y" >> $config_host_mak
 fi
-- 
1.9.1




[Qemu-devel] [RFC kernel 2/2] virtio-balloon: extend balloon driver to support a new feature

2016-03-03 Thread Liang Li
Extend the virtio balloon to support the new feature
VIRTIO_BALLOON_F_GET_FREE_PAGES, so that we can use it to send the
free pages information from the guest to QEMU, and then optimize the
live migration process.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 drivers/virtio/virtio_balloon.c | 106 ++--
 include/uapi/linux/virtio_balloon.h |   1 +
 2 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 0c3691f..7461d3e 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -45,9 +45,18 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
 module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 
+extern void get_free_pages(unsigned long *free_page_bitmap,
+   unsigned long *free_pages_num,
+   unsigned long lowmem);
+extern unsigned long get_total_pages_count(unsigned long lowmem);
+
+struct mem_layout {
+   unsigned long low_mem;
+};
+
 struct virtio_balloon {
struct virtio_device *vdev;
-   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_pages_vq;
 
/* Where the ballooning thread waits for config to change. */
wait_queue_head_t config_change;
@@ -75,6 +84,11 @@ struct virtio_balloon {
unsigned int num_pfns;
u32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
 
+   unsigned long *free_pages;
+   unsigned long free_pages_len;
+   unsigned long free_pages_num;
+   struct mem_layout mem_config;
+
/* Memory statistics */
int need_stats_update;
struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
@@ -245,6 +259,34 @@ static void update_balloon_stats(struct virtio_balloon *vb)
pages_to_bytes(i.totalram));
 }
 
+static void update_free_pages_stats(struct virtio_balloon *vb)
+{
+   unsigned long total_page_count, bitmap_bytes;
+
+   total_page_count = get_total_pages_count(vb->mem_config.low_mem);
+   bitmap_bytes = ALIGN(total_page_count, BITS_PER_LONG) / 8;
+
+   if (!vb->free_pages)
+   vb->free_pages = kzalloc(bitmap_bytes, GFP_KERNEL);
+   else {
+   if (bitmap_bytes < vb->free_pages_len)
+   memset(vb->free_pages, 0, bitmap_bytes);
+   else {
+   kfree(vb->free_pages);
+   vb->free_pages = kzalloc(bitmap_bytes, GFP_KERNEL);
+   }
+   }
+   if (!vb->free_pages) {
+   vb->free_pages_len = 0;
+   vb->free_pages_num = 0;
+   return;
+   }
+
+   vb->free_pages_len = bitmap_bytes;
+   get_free_pages(vb->free_pages, &vb->free_pages_num,
+  vb->mem_config.low_mem);
+}
+
 /*
  * While most virtqueues communicate guest-initiated requests to the 
hypervisor,
  * the stats queue operates in reverse.  The driver initializes the virtqueue
@@ -278,6 +320,39 @@ static void stats_handle_request(struct virtio_balloon *vb)
virtqueue_kick(vq);
 }
 
+static void free_pages_handle_rq(struct virtio_balloon *vb)
+{
+   struct virtqueue *vq;
+   struct scatterlist sg[3];
+   unsigned int len;
+   struct mem_layout *ptr_mem_layout;
+   struct scatterlist sg_in;
+
+   vq = vb->free_pages_vq;
+   ptr_mem_layout = virtqueue_get_buf(vq, &len);
+
+   if (!ptr_mem_layout)
+   return;
+   update_free_pages_stats(vb);
+   sg_init_table(sg, 3);
+   sg_set_buf(&sg[0], &(vb->free_pages_num), sizeof(vb->free_pages_num));
+   sg_set_buf(&sg[1], &(vb->free_pages_len), sizeof(vb->free_pages_len));
+   sg_set_buf(&sg[2], vb->free_pages, vb->free_pages_len);
+
+   sg_init_one(&sg_in, &vb->mem_config, sizeof(vb->mem_config));
+
+   virtqueue_add_outbuf(vq, &sg[0], 3, vb, GFP_KERNEL);
+   virtqueue_add_inbuf(vq, &sg_in, 1, &vb->mem_config, GFP_KERNEL);
+   virtqueue_kick(vq);
+}
+
+static void free_pages_rq(struct virtqueue *vq)
+{
+   struct virtio_balloon *vb = vq->vdev->priv;
+
+   free_pages_handle_rq(vb);
+}
+
 static void virtballoon_changed(struct virtio_device *vdev)
 {
struct virtio_balloon *vb = vdev->priv;
@@ -386,16 +461,22 @@ static int balloon(void *_vballoon)
 
 static int init_vqs(struct virtio_balloon *vb)
 {
-   struct virtqueue *vqs[3];
-   vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request 
};
-   static const char * const names[] = { "inflate", "deflate", "stats" };
+   struct virtqueue *vqs[4];
+   vq_callback_t *callbacks[] = { balloon_ack, balloon_ack,
+stats_request, free_pages_rq };
+   const char *names[] = { &

[Qemu-devel] [RFC kernel 1/2] mm: Add the functions used to get free pages information

2016-03-03 Thread Liang Li
get_total_pages_count() tries to get the page count of the system
RAM.
get_free_pages() is intended to construct a free page bitmap by
traversing the free list.

The free pages information will be sent to QEMU through virtio
and used for live migration optimization.
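The bitmap indexing in the patch below compensates for the memory hole
under 4GB; as a worked example (assuming 4KB pages, i.e. PAGE_SHIFT ==
12, and lowmem == 3GB; these are illustrative values, not taken from
the patch):

/* PFN_4G  = 0x100000000 >> 12    = 1048576
 * pfn_gap = PFN_4G - (3GB >> 12) = 1048576 - 786432 = 262144
 * so the first page above 4GB (pfn 1048576) lands on bit 786432 and
 * the bitmap has no hole for the unusable window below 4GB. */
static unsigned long pfn_to_bit(unsigned long pfn, unsigned long pfn_4g,
                                unsigned long pfn_gap)
{
    return pfn >= pfn_4g ? pfn - pfn_gap : pfn;
}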

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 mm/page_alloc.c | 57 +
 1 file changed, 57 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 838ca8bb..81922e6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3860,6 +3860,63 @@ void show_free_areas(unsigned int filter)
show_swap_cache_info();
 }
 
+#define PFN_4G (0x100000000 >> PAGE_SHIFT)
+
+unsigned long get_total_pages_count(unsigned long low_mem)
+{
+   if (max_pfn >= PFN_4G) {
+   unsigned long pfn_gap = PFN_4G - (low_mem >> PAGE_SHIFT);
+
+   return max_pfn - pfn_gap;
+   } else
+   return max_pfn;
+}
+EXPORT_SYMBOL(get_total_pages_count);
+
+static void mark_free_pages_bitmap(struct zone *zone,
+unsigned long *free_page_bitmap, unsigned long pfn_gap)
+{
+   unsigned long pfn, flags, i;
+   unsigned int order, t;
+   struct list_head *curr;
+
+   if (zone_is_empty(zone))
+   return;
+
+   spin_lock_irqsave(&zone->lock, flags);
+
+   for_each_migratetype_order(order, t) {
+   list_for_each(curr, &zone->free_area[order].free_list[t]) {
+
+   pfn = page_to_pfn(list_entry(curr, struct page, lru));
+   for (i = 0; i < (1UL << order); i++) {
+   if ((pfn + i) >= PFN_4G)
+   set_bit_le(pfn + i - pfn_gap,
+  free_page_bitmap);
+   else
+   set_bit_le(pfn + i, free_page_bitmap);
+   }
+   }
+   }
+
+   spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+void get_free_pages(unsigned long *free_page_bitmap,
+   unsigned long *free_pages_count,
+   unsigned long low_mem)
+{
+   struct zone *zone;
+   unsigned long pfn_gap;
+
+   pfn_gap = PFN_4G - (low_mem >> PAGE_SHIFT);
+   for_each_populated_zone(zone)
+   mark_free_pages_bitmap(zone, free_page_bitmap, pfn_gap);
+
+   *free_pages_count = global_page_state(NR_FREE_PAGES);
+}
+EXPORT_SYMBOL(get_free_pages);
+
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
zoneref->zone = zone;
-- 
1.8.3.1




[Qemu-devel] [RFC qemu 4/4] migration: filter out guest's free pages in ram bulk stage

2016-03-03 Thread Liang Li
Get the free pages information through virtio and filter out the free
pages in the ram bulk stage. This can significantly reduce the total
live migration time as well as network traffic.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 52 ++--
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index ee2547d..819553b 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -40,6 +40,7 @@
 #include "trace.h"
 #include "exec/ram_addr.h"
 #include "qemu/rcu_queue.h"
+#include "sysemu/balloon.h"
 
 #ifdef DEBUG_MIGRATION_RAM
 #define DPRINTF(fmt, ...) \
@@ -241,6 +242,7 @@ static struct BitmapRcu {
 struct rcu_head rcu;
 /* Main migration bitmap */
 unsigned long *bmap;
+unsigned long *free_pages_bmap;
 /* bitmap of pages that haven't been sent even once
  * only maintained and used in postcopy at the moment
  * where it's used to send the dirtymap at the start
@@ -561,12 +563,7 @@ ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
 unsigned long next;
 
 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
-if (ram_bulk_stage && nr > base) {
-next = nr + 1;
-} else {
-next = find_next_bit(bitmap, size, nr);
-}
-
+next = find_next_bit(bitmap, size, nr);
 *ram_addr_abs = next << TARGET_PAGE_BITS;
 return (next - base) << TARGET_PAGE_BITS;
 }
@@ -1415,6 +1412,9 @@ void free_xbzrle_decoded_buf(void)
 static void migration_bitmap_free(struct BitmapRcu *bmap)
 {
 g_free(bmap->bmap);
+if (balloon_free_pages_support()) {
+g_free(bmap->free_pages_bmap);
+}
 g_free(bmap->unsentmap);
 g_free(bmap);
 }
@@ -1873,6 +1873,28 @@ err:
 return ret;
 }
 
+static void filter_out_guest_free_pages(unsigned long *free_pages_bmap)
+{
+RAMBlock *block;
+DirtyMemoryBlocks *blocks;
+unsigned long end, page;
+
+blocks = atomic_rcu_read(&ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION]);
+block = QLIST_FIRST_RCU(&ram_list.blocks);
+end = TARGET_PAGE_ALIGN(block->offset +
+block->used_length) >> TARGET_PAGE_BITS;
+page = block->offset >> TARGET_PAGE_BITS;
+
+while (page < end) {
+unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
+unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
+unsigned long num = MIN(end - page, DIRTY_MEMORY_BLOCK_SIZE - offset);
+unsigned long *p = free_pages_bmap + BIT_WORD(page);
+
+slow_bitmap_complement(blocks->blocks[idx], p, num);
+page += num;
+}
+}
 
 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
  * long-running RCU critical section.  When rcu-reclaims in the code
@@ -1884,6 +1906,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 {
 RAMBlock *block;
 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
+uint64_t free_pages_count = 0;
 
 dirty_rate_high_cnt = 0;
 bitmap_sync_count = 0;
@@ -1931,6 +1954,9 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
+if (balloon_free_pages_support()) {
+migration_bitmap_rcu->free_pages_bmap = bitmap_new(ram_bitmap_pages);
+}
 
 if (migrate_postcopy_ram()) {
 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
@@ -1945,6 +1971,20 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 DIRTY_MEMORY_MIGRATION);
 }
 memory_global_dirty_log_start();
+
+if (balloon_free_pages_support() &&
+balloon_get_free_pages(migration_bitmap_rcu->free_pages_bmap,
+   &free_pages_count) == 0) {
+qemu_mutex_unlock_iothread();
+while (balloon_get_free_pages(migration_bitmap_rcu->free_pages_bmap,
+  &free_pages_count) == 0) {
+usleep(1000);
+}
+qemu_mutex_lock_iothread();
+
+filter_out_guest_free_pages(migration_bitmap_rcu->free_pages_bmap);
+}
+
 migration_bitmap_sync();
 qemu_mutex_unlock_ramlist();
 qemu_mutex_unlock_iothread();
-- 
1.8.3.1




[Qemu-devel] [RFC kernel 0/2] A PV solution for KVM live migration optimization

2016-03-03 Thread Liang Li
The current QEMU live migration implementation marks all of the
guest's RAM pages as dirtied in the ram bulk stage; all these pages
will be processed, and that takes quite a lot of CPU cycles.

From the guest's point of view, it doesn't care about the content in
free pages. We can make use of this fact and skip processing the free
pages in the ram bulk stage; this saves a lot of CPU cycles and
reduces the network traffic significantly while obviously speeding up
the live migration process.

This patch set is the kernel side implementation.

It gets the free pages information by traversing
zone->free_area[order].free_list and constructs a free page bitmap.
The virtio-balloon driver is extended so as to send the free page
bitmap to QEMU for live migration optimization.

Performance data


Test environment:

CPU: Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
Host RAM: 64GB
Host Linux Kernel:  4.2.0   Host OS: CentOS 7.1
Guest Linux Kernel: 4.5.rc6 Guest OS: CentOS 6.6
Network:  X540-AT2 with 10 Gigabit connection
Guest RAM: 8GB

Case 1: Idle guest just boots:

==========================================
                    | original  |    pv
------------------------------------------
total time(ms)      |    1894   |   421
------------------------------------------
transferred ram(KB) |   398017  |  353242
==========================================



Case 2: The guest has previously run a memory-consuming workload; the
workload is terminated just before live migration.

==========================================
                    | original  |    pv
------------------------------------------
total time(ms)      |    7436   |   552
------------------------------------------
transferred ram(KB) |  8146291  |  361375
==========================================


Liang Li (2):
  mm: Add the functions used to get free pages information
  virtio-balloon: extend balloon driver to support a new feature

 drivers/virtio/virtio_balloon.c | 108 ++--
 include/uapi/linux/virtio_balloon.h |   1 +
 mm/page_alloc.c |  58 +++
 3 files changed, 162 insertions(+), 5 deletions(-)

-- 
1.8.3.1




[Qemu-devel] [RFC qemu 2/4] virtio-balloon: Add a new feature to balloon device

2016-03-03 Thread Liang Li
Extend the virtio balloon device to support a new feature. This new
feature can help to get the guest's free pages information, which can
be used for live migration optimization.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 balloon.c   | 30 -
 hw/virtio/virtio-balloon.c  | 81 -
 include/hw/virtio/virtio-balloon.h  | 17 +-
 include/standard-headers/linux/virtio_balloon.h |  1 +
 include/sysemu/balloon.h| 10 ++-
 5 files changed, 134 insertions(+), 5 deletions(-)

diff --git a/balloon.c b/balloon.c
index f2ef50c..a37717e 100644
--- a/balloon.c
+++ b/balloon.c
@@ -36,6 +36,7 @@
 
 static QEMUBalloonEvent *balloon_event_fn;
 static QEMUBalloonStatus *balloon_stat_fn;
+static QEMUBalloonFreePages *balloon_free_pages_fn;
 static void *balloon_opaque;
 static bool balloon_inhibited;
 
@@ -65,9 +66,12 @@ static bool have_balloon(Error **errp)
 }
 
 int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
- QEMUBalloonStatus *stat_func, void *opaque)
+ QEMUBalloonStatus *stat_func,
+ QEMUBalloonFreePages *free_pages_func,
+ void *opaque)
 {
-if (balloon_event_fn || balloon_stat_fn || balloon_opaque) {
+if (balloon_event_fn || balloon_stat_fn || balloon_free_pages_fn
+|| balloon_opaque) {
 /* We're already registered one balloon handler.  How many can
  * a guest really have?
  */
@@ -75,6 +79,7 @@ int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
 }
 balloon_event_fn = event_func;
 balloon_stat_fn = stat_func;
+balloon_free_pages_fn = free_pages_func;
 balloon_opaque = opaque;
 return 0;
 }
@@ -86,6 +91,7 @@ void qemu_remove_balloon_handler(void *opaque)
 }
 balloon_event_fn = NULL;
 balloon_stat_fn = NULL;
+balloon_free_pages_fn = NULL;
 balloon_opaque = NULL;
 }
 
@@ -116,3 +122,23 @@ void qmp_balloon(int64_t target, Error **errp)
 trace_balloon_event(balloon_opaque, target);
 balloon_event_fn(balloon_opaque, target);
 }
+
+bool balloon_free_pages_support(void)
+{
+return balloon_free_pages_fn ? true : false;
+}
+
+int balloon_get_free_pages(unsigned long *free_pages_bitmap,
+   unsigned long *free_pages_count)
+{
+if (!balloon_free_pages_fn) {
+return -1;
+}
+
+if (!free_pages_bitmap || !free_pages_count) {
+return -1;
+}
+
+return balloon_free_pages_fn(balloon_opaque,
+ free_pages_bitmap, free_pages_count);
+ }
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index e9c30e9..a5b9d08 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -76,6 +76,12 @@ static bool balloon_stats_supported(const VirtIOBalloon *s)
 return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
 }
 
+static bool balloon_free_pages_supported(const VirtIOBalloon *s)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(s);
+return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_GET_FREE_PAGES);
+}
+
 static bool balloon_stats_enabled(const VirtIOBalloon *s)
 {
 return s->stats_poll_interval > 0;
@@ -293,6 +299,37 @@ out:
 }
 }
 
+static void virtio_balloon_get_free_pages(VirtIODevice *vdev, VirtQueue *vq)
+{
+VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+VirtQueueElement *elem;
+size_t offset = 0;
+uint64_t bitmap_bytes = 0, free_pages_count = 0;
+
+elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+if (!elem) {
+return;
+}
+s->free_pages_vq_elem = elem;
+
+if (!elem->out_num) {
+return;
+}
+
+iov_to_buf(elem->out_sg, elem->out_num, offset,
+   &free_pages_count, sizeof(uint64_t));
+
+offset += sizeof(uint64_t);
+iov_to_buf(elem->out_sg, elem->out_num, offset,
+   &bitmap_bytes, sizeof(uint64_t));
+
+offset += sizeof(uint64_t);
+iov_to_buf(elem->out_sg, elem->out_num, offset,
+   s->free_pages_bitmap, bitmap_bytes);
+s->req_status = DONE;
+s->free_pages_count = free_pages_count;
+}
+
 static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
 {
 VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
@@ -362,6 +399,7 @@ static uint64_t virtio_balloon_get_features(VirtIODevice 
*vdev, uint64_t f,
 VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
 f |= dev->host_features;
 virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ);
+virtio_add_feature(&f, VIRTIO_BALLOON_F_GET_FREE_PAGES);
 return f;
 }
 
@@ -372,6 +410,45 @@ static void virtio_balloon_stat(void *opaque, BalloonInfo 
*info)
  VIRTIO_BALLOON_PFN_SHIFT);
 }
 
+static int virtio_balloon_free_pages(void *opaque,
+ unsigned long *free_pages_bitmap,
+  

[Qemu-devel] [RFC qemu 1/4] pc: Add code to get the lowmem from PCMachineState

2016-03-03 Thread Liang Li
The lowmem will be used by the following patch to get
a correct free pages bitmap.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 hw/i386/pc.c | 5 +
 hw/i386/pc_piix.c| 1 +
 hw/i386/pc_q35.c | 1 +
 include/hw/i386/pc.h | 3 ++-
 4 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 0aeefd2..f794a84 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1115,6 +1115,11 @@ void pc_hot_add_cpu(const int64_t id, Error **errp)
 object_unref(OBJECT(cpu));
 }
 
+ram_addr_t pc_get_lowmem(PCMachineState *pcms)
+{
+   return pcms->lowmem;
+}
+
 void pc_cpus_init(PCMachineState *pcms)
 {
 int i;
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 6f8c2cd..268a08c 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -113,6 +113,7 @@ static void pc_init1(MachineState *machine,
 }
 }
 
+pcms->lowmem = lowmem;
 if (machine->ram_size >= lowmem) {
 pcms->above_4g_mem_size = machine->ram_size - lowmem;
 pcms->below_4g_mem_size = lowmem;
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 46522c9..8d9bd39 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -101,6 +101,7 @@ static void pc_q35_init(MachineState *machine)
 }
 }
 
+pcms->lowmem = lowmem;
 if (machine->ram_size >= lowmem) {
 pcms->above_4g_mem_size = machine->ram_size - lowmem;
 pcms->below_4g_mem_size = lowmem;
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 8b3546e..3694c91 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -60,7 +60,7 @@ struct PCMachineState {
 bool nvdimm;
 
 /* RAM information (sizes, addresses, configuration): */
-ram_addr_t below_4g_mem_size, above_4g_mem_size;
+ram_addr_t below_4g_mem_size, above_4g_mem_size, lowmem;
 
 /* CPU and apic information: */
 bool apic_xrupt_override;
@@ -229,6 +229,7 @@ void pc_hot_add_cpu(const int64_t id, Error **errp);
 void pc_acpi_init(const char *default_dsdt);
 
 void pc_guest_info_init(PCMachineState *pcms);
+ram_addr_t pc_get_lowmem(PCMachineState *pcms);
 
 #define PCI_HOST_PROP_PCI_HOLE_START   "pci-hole-start"
 #define PCI_HOST_PROP_PCI_HOLE_END "pci-hole-end"
-- 
1.8.3.1




[Qemu-devel] [RFC qemu 0/4] A PV solution for live migration optimization

2016-03-03 Thread Liang Li
The current QEMU live migration implementation marks all of the
guest's RAM pages as dirtied in the ram bulk stage; all these pages
will be processed, and that takes quite a lot of CPU cycles.

From the guest's point of view, it doesn't care about the content in
free pages. We can make use of this fact and skip processing the free
pages in the ram bulk stage; this saves a lot of CPU cycles and
reduces the network traffic significantly while obviously speeding up
the live migration process.

This patch set is the QEMU side implementation.

The virtio-balloon is extended so that QEMU can get the free pages
information from the guest through virtio.

After getting the free pages information (a bitmap), QEMU can use it
to filter out the guest's free pages in the ram bulk stage. This makes
the live migration process much more efficient.

This RFC version doesn't take post-copy and RDMA into consideration;
maybe both of them can benefit from this PV solution with some extra
modifications.

Performance data


Test environment:

CPU: Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
Host RAM: 64GB
Host Linux Kernel:  4.2.0   Host OS: CentOS 7.1
Guest Linux Kernel: 4.5.rc6 Guest OS: CentOS 6.6
Network:  X540-AT2 with 10 Gigabit connection
Guest RAM: 8GB

Case 1: Idle guest just boots:

==========================================
                    | original  |    pv
------------------------------------------
total time(ms)      |    1894   |   421
------------------------------------------
transferred ram(KB) |   398017  |  353242
==========================================



Case 2: The guest has previously run a memory-consuming workload; the
workload is terminated just before live migration.

==========================================
                    | original  |    pv
------------------------------------------
total time(ms)      |    7436   |   552
------------------------------------------
transferred ram(KB) |  8146291  |  361375
==========================================

Liang Li (4):
  pc: Add code to get the lowmem from PCMachineState
  virtio-balloon: Add a new feature to balloon device
  migration: not set migration bitmap in setup stage
  migration: filter out guest's free pages in ram bulk stage

 balloon.c   | 30 -
 hw/i386/pc.c|  5 ++
 hw/i386/pc_piix.c   |  1 +
 hw/i386/pc_q35.c|  1 +
 hw/virtio/virtio-balloon.c  | 81 -
 include/hw/i386/pc.h|  3 +-
 include/hw/virtio/virtio-balloon.h  | 17 +-
 include/standard-headers/linux/virtio_balloon.h |  1 +
 include/sysemu/balloon.h| 10 ++-
 migration/ram.c | 64 +++
 10 files changed, 195 insertions(+), 18 deletions(-)

-- 
1.8.3.1




[Qemu-devel] [RFC qemu 3/4] migration: not set migration bitmap in setup stage

2016-03-03 Thread Liang Li
Set ram_list.dirty_memory instead of the migration bitmap; the
migration bitmap will be updated when doing migration_bitmap_sync().
Set migration_dirty_pages to 0; it will be updated by
migration_bitmap_sync() too.

The following patch is based on this change.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 704f6a9..ee2547d 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1931,19 +1931,19 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
-bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
 
 if (migrate_postcopy_ram()) {
 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
 }
 
-/*
- * Count the total number of pages used by ram blocks not including any
- * gaps due to alignment or unplugs.
- */
-migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
+migration_dirty_pages = 0;
 
+QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+cpu_physical_memory_set_dirty_range(block->offset,
+block->used_length,
+DIRTY_MEMORY_MIGRATION);
+}
 memory_global_dirty_log_start();
 migration_bitmap_sync();
 qemu_mutex_unlock_ramlist();
-- 
1.8.3.1




[Qemu-devel] [PATCH v3 1/2] qemu-file: Fix qemu_put_compression_data flaw

2016-02-24 Thread Liang Li
Currently, qemu_put_compression_data can only work with a non-writable
QEMUFile and can't work with a writable QEMUFile, but it does not
provide any measure to prevent users from using it with a writable
one.

We should fix this flaw to make it work with a writable QEMUFile.
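In caller terms, the contract after this patch is the following (a
short sketch mirroring the do_compress_ram_page() hunk below; a
negative return now means "no buffer space" and must not be added to
the byte counters):

    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    if (blen < 0) {
        error_report("Insufficient buffer for compressed data!");
    } else {
        bytes_sent += blen;
    }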

Suggested-by: Juan Quintela <quint...@redhat.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/qemu-file.c |   23 +--
 migration/ram.c   |6 +-
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 6f4a129..b0ef1f3 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -607,8 +607,14 @@ uint64_t qemu_get_be64(QEMUFile *f)
 return v;
 }
 
-/* compress size bytes of data start at p with specific compression
+/* Compress size bytes of data start at p with specific compression
  * level and store the compressed data to the buffer of f.
+ *
+ * When f is not writable, return -1 if f has no space to save the
+ * compressed data.
+ * When f is writable and has no space to save the compressed data,
+ * do a flush first; if f still has no space to save the compressed
+ * data, return -1.
  */
 
 ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
@@ -617,7 +623,14 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const 
uint8_t *p, size_t size,
 ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t);
 
 if (blen < compressBound(size)) {
-return 0;
+if (!qemu_file_is_writable(f)) {
+return -1;
+}
+qemu_fflush(f);
+blen = IO_BUF_SIZE - sizeof(int32_t);
+if (blen < compressBound(size)) {
+return -1;
+}
 }
 if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *)&blen,
   (Bytef *)p, size, level) != Z_OK) {
@@ -625,7 +638,13 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const 
uint8_t *p, size_t size,
 return 0;
 }
 qemu_put_be32(f, blen);
+if (f->ops->writev_buffer) {
+add_to_iovec(f, f->buf + f->buf_index, blen);
+}
 f->buf_index += blen;
+if (f->buf_index == IO_BUF_SIZE) {
+qemu_fflush(f);
+}
 return blen + sizeof(int32_t);
 }
 
diff --git a/migration/ram.c b/migration/ram.c
index 704f6a9..4de13c2 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -817,7 +817,11 @@ static int do_compress_ram_page(CompressParam *param)
   RAM_SAVE_FLAG_COMPRESS_PAGE);
 blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
  migrate_compress_level());
-bytes_sent += blen;
+if (blen < 0) {
+error_report("Insufficient buffer for compressed data!");
+} else {
+bytes_sent += blen;
+}
 
 return bytes_sent;
 }
-- 
1.7.1




[Qemu-devel] [PATCH v3 2/2] migration: refine ram_save_compressed_page

2016-02-24 Thread Liang Li
Use qemu_put_compression_data to do the compression directly instead
of using do_compress_ram_page, avoiding some data copying. It is a
very small improvement; at the same time, add code to check whether
the compression was successful.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c |   28 +---
 1 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 4de13c2..9be00eb 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -922,24 +922,20 @@ static int ram_save_compressed_page(QEMUFile *f, 
PageSearchStatus *pss,
 uint64_t *bytes_transferred)
 {
 int pages = -1;
-uint64_t bytes_xmit;
+uint64_t bytes_xmit = 0;
 uint8_t *p;
-int ret;
+int ret, blen;
 RAMBlock *block = pss->block;
 ram_addr_t offset = pss->offset;
 
 p = block->host + offset;
 
-bytes_xmit = 0;
 ret = ram_control_save_page(f, block->offset,
 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 if (bytes_xmit) {
 *bytes_transferred += bytes_xmit;
 pages = 1;
 }
-if (block == last_sent_block) {
-offset |= RAM_SAVE_FLAG_CONTINUE;
-}
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
 if (bytes_xmit > 0) {
@@ -959,17 +955,19 @@ static int ram_save_compressed_page(QEMUFile *f, 
PageSearchStatus *pss,
 flush_compressed_data(f);
 pages = save_zero_page(f, block, offset, p, bytes_transferred);
 if (pages == -1) {
-set_compress_params(&comp_param[0], block, offset);
-/* Use the qemu thread to compress the data to make sure the
- * first page is sent out before other pages
- */
-bytes_xmit = do_compress_ram_page(_param[0]);
-acct_info.norm_pages++;
-qemu_put_qemu_file(f, comp_param[0].file);
-*bytes_transferred += bytes_xmit;
-pages = 1;
+/* Make sure the first page is sent out before other pages */
+bytes_xmit = save_page_header(f, block, offset |
+  RAM_SAVE_FLAG_COMPRESS_PAGE);
+blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
+ migrate_compress_level());
+if (blen > 0) {
+*bytes_transferred += bytes_xmit + blen;
+acct_info.norm_pages++;
+pages = 1;
+}
 }
 } else {
+offset |= RAM_SAVE_FLAG_CONTINUE;
 pages = save_zero_page(f, block, offset, p, bytes_transferred);
 if (pages == -1) {
 pages = compress_page_with_multi_thread(f, block, offset,
-- 
1.7.1




[Qemu-devel] [PATCH v3 0/2] Fix flaw of qemu_put_compression_data

2016-02-24 Thread Liang Li
The implementation of qemu_put_compression_data only considers the
case where the QEMUFile is not writable; it can't work with a writable
QEMUFile and does not provide any measure to prevent users from using
it with one. For safety, it should be improved to avoid such issues.

ram_save_compressed_page can be refined based on the change to
qemu_put_compression_data; a very small improvement, but the code
looks better.

Liang Li (2):
  qemu-file: Fix qemu_put_compression_data flaw
  migration: refine ram_save_compressed_page

 migration/qemu-file.c |   23 +--
 migration/ram.c   |   34 ++
 2 files changed, 39 insertions(+), 18 deletions(-)




[Qemu-devel] [PATCH v5 1/2] configure: detect ifunc and avx2 attribute

2016-01-26 Thread Liang Li
Detect whether the compiler supports ifunc and avx2; if so, set
CONFIG_AVX2_OPT, which will be used to turn on the avx2 instruction
optimization.

Suggested-by: Paolo Bonzini <pbonz...@redhat.com>
Suggested-by: Peter Maydell <peter.mayd...@linaro.org>
Signed-off-by: Liang Li <liang.z...@intel.com>
---
 configure | 21 +
 1 file changed, 21 insertions(+)

diff --git a/configure b/configure
index 3506e44..a50dcf5 100755
--- a/configure
+++ b/configure
@@ -311,6 +311,7 @@ smartcard=""
 libusb=""
 usb_redir=""
 opengl=""
+avx2_opt="no"
 zlib="yes"
 lzo=""
 snappy=""
@@ -1832,6 +1833,21 @@ EOF
 fi
 
 ##
+# avx2 optimization requirement check
+
+cat > $TMPC << EOF
+static void bar(void) {}
+static void *bar_ifunc(void) {return (void*) bar;}
+static void foo(void) __attribute__((ifunc("bar_ifunc")));
+int main(void) { foo(); return 0; }
+EOF
+if compile_prog "-mavx2" "" ; then
+if readelf --syms $TMPE |grep "IFUNC.*foo" >/dev/null 2>&1; then
+avx2_opt="yes"
+fi
+fi
+
+#
 # zlib check
 
 if test "$zlib" != "no" ; then
@@ -4922,6 +4938,7 @@ echo "bzip2 support $bzip2"
 echo "NUMA host support $numa"
 echo "tcmalloc support  $tcmalloc"
 echo "jemalloc support  $jemalloc"
+echo "avx2 optimization $avx2_opt"
 
 if test "$sdl_too_old" = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -5306,6 +5323,10 @@ if test "$opengl" = "yes" ; then
   echo "OPENGL_LIBS=$opengl_libs" >> $config_host_mak
 fi
 
+if test "$avx2_opt" = "yes" ; then
+  echo "CONFIG_AVX2_OPT=y" >> $config_host_mak
+fi
+
 if test "$lzo" = "yes" ; then
   echo "CONFIG_LZO=y" >> $config_host_mak
 fi
-- 
1.9.1




[Qemu-devel] [PATCH v5 2/2] cutils: add avx2 instruction optimization

2016-01-26 Thread Liang Li
buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms that
support AVX2 instructions, using AVX2 can improve the performance of
buffer_find_nonzero_offset() by about 30% compared to SSE2.

Live migration can be faster with this optimization; the test result
shows that for an 8GiB RAM idle guest that has just booted, this patch
can help to shorten the total live migration time by about 6%.

This patch uses the ifunc mechanism to select the proper function at
run time: on platforms that support AVX2, the AVX2 version is executed;
otherwise, the original version is executed.

Signed-off-by: Liang Li <liang.z...@intel.com>
Suggested-by: Paolo Bonzini <pbonz...@redhat.com>
Suggested-by: Richard Henderson <r...@twiddle.net>
Reviewed-by: Paolo Bonzini <pbonz...@redhat.com>
---
 include/qemu-common.h |   8 +---
 util/cutils.c | 118 --
 2 files changed, 115 insertions(+), 11 deletions(-)

diff --git a/include/qemu-common.h b/include/qemu-common.h
index 22b010c..f4c8c24 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -483,13 +483,7 @@ void qemu_hexdump(const char *buf, FILE *fp, const char 
*prefix, size_t size);
 #endif
 
 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-static inline bool
-can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-   * sizeof(VECTYPE)) == 0
-&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
 size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 
 /*
diff --git a/util/cutils.c b/util/cutils.c
index cfeb848..5c8ee5c 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -161,6 +161,14 @@ int qemu_fdatasync(int fd)
 #endif
 }
 
+static bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+{
+return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+   * sizeof(VECTYPE)) == 0
+&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
+}
+
 /*
  * Searches for an area with non-zero content in a buffer
  *
@@ -169,8 +177,8 @@ int qemu_fdatasync(int fd)
  * and addr must be a multiple of sizeof(VECTYPE) due to
  * restriction of optimizations in this function.
  *
- * can_use_buffer_find_nonzero_offset() can be used to check
- * these requirements.
+ * can_use_buffer_find_nonzero_offset_inner() can be used to
+ * check these requirements.
  *
  * The return value is the offset of the non-zero area rounded
  * down to a multiple of sizeof(VECTYPE) for the first
@@ -181,13 +189,13 @@ int qemu_fdatasync(int fd)
  * If the buffer is all zero the return value is equal to len.
  */
 
-size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
 {
 const VECTYPE *p = buf;
 const VECTYPE zero = (VECTYPE){0};
 size_t i;
 
-assert(can_use_buffer_find_nonzero_offset(buf, len));
+assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
 
 if (!len) {
 return 0;
@@ -216,6 +224,108 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t 
len)
 return i * sizeof(VECTYPE);
 }
 
+#ifdef CONFIG_AVX2_OPT
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#include <immintrin.h>
+#include <cpuid.h>
+
+#define AVX2_VECTYPE __m256i
+#define AVX2_SPLAT(p)   _mm256_set1_epi8(*(p))
+#define AVX2_ALL_EQ(v1, v2) \
+(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
+#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
+
+static bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+   * sizeof(AVX2_VECTYPE)) == 0
+&& ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
+}
+
+static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+const AVX2_VECTYPE *p = buf;
+const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
+size_t i;
+
+assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
+
+if (!len) {
+return 0;
+}
+
+for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+if (!AVX2_ALL_EQ(p[i], zero)) {
+return i * sizeof(AVX2_VECTYPE);
+}
+}
+
+for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+ i < len / sizeof(AVX2_VECTYPE);
+ i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
+AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
+AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
+AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
+AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
+AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
+if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
+break;
+}
+}
+
+return i * sizeof(AVX2_VECTYPE);
+}

[Qemu-devel] [PATCH v5 0/2] add avx2 instruction optimization

2016-01-26 Thread Liang Li
buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms
that support AVX2 instructions, using AVX2 can improve the performance
of zero page checking by about 30% compared to SSE2.
Live migration can be faster with this optimization; the test result
shows that for an 8GB RAM idle guest, this patch can help to shorten
the total live migration time by about 6%.

This patch uses the ifunc mechanism to select the proper function at
run time: on platforms that support AVX2, the AVX2 version is
executed; otherwise, the original version is executed.

With this patch, the QEMU binary can run on platforms with or without
AVX2 support.

Compilers which don't support AVX2 and the ifunc attribute can also
build the source code successfully.

v4 -> v5 changes:
  * Enhance the ifunc attribute detection (Paolo's suggestion)

v3 -> v4 changes:
  * Use the GCC #pragma to make things simple (Paolo's suggestion) 
  * Put avx2 related code in cutils.c (Richard's suggestion)
  * Change the configure, detect ifunc and avx2 attributes together

v2 -> v3 changes:
  * Detect the ifunc attribute support (Paolo's suggestion) 
  * Use the ifunc attribute instead of the inline asm (Richard's suggestion)
  * Change the configure (Juan's suggestion)

Liang Li (2):
  configure: detect ifunc and avx2 attribute
  cutils: add avx2 instruction optimization

 configure |  21 +
 include/qemu-common.h |   8 +---
 util/cutils.c | 118 --
 3 files changed, 136 insertions(+), 11 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH] migration: remove useless code.

2016-01-26 Thread Liang Li
Since 's->state' will be set in migrate_init(), there is no
need to set it before calling migrate_init(). The code and
the related comments can be removed.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/migration.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index aaca451..ae38242 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1006,12 +1006,6 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
 return;
 }
 
-/* We are starting a new migration, so we want to start in a clean
-   state.  This change is only needed if previous migration
-   failed/was cancelled.  We don't use migrate_set_state() because
-   we are setting the initial state, not changing it. */
-s->state = MIGRATION_STATUS_NONE;
-
 s = migrate_init();
 
 if (strstart(uri, "tcp:", )) {
-- 
1.9.1




[Qemu-devel] [PATCH v4 0/3] add avx2 instruction optimization

2016-01-20 Thread Liang Li
buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms
that support AVX2 instructions, using AVX2 can improve the performance
by about 30% compared to SSE2.
Zero page checking can be faster with this optimization; the test
result shows that for an 8GB RAM idle guest, this patch can help to
shorten the total live migration time by about 6%.

This patch uses the ifunc mechanism to select the proper function at
run time: on platforms that support AVX2, the AVX2 version is
executed; otherwise, the original version is executed.

With this patch, the QEMU binary can run on platforms with or without
AVX2 support.

Compilers which don't support AVX2 and the ifunc attribute can also
build the source code successfully.

v3 -> v4 changes:
  * Use the GCC #pragma to make things simple (Paolo's suggestion) 
  * Put avx2 related code in cutils.c (Richard's suggestion)
  * Change the configure, detect ifunc and avx2 attributes together

v2 -> v3 changes:
  * Detect the ifunc attribute support (Paolo's suggestion) 
  * Use the ifunc attribute instead of the inline asm (Richard's suggestion)
  * Change the configure (Juan's suggestion)

Liang Li (2):
  configure: detect ifunc and avx2 attribute
  cutils: add avx2 instruction optimization

 configure |  20 +
 include/qemu-common.h |   8 +---
 util/cutils.c | 118 --
 3 files changed, 135 insertions(+), 11 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH v4 2/2] cutils: add avx2 instruction optimization

2016-01-20 Thread Liang Li
buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms
that support AVX2 instructions, using AVX2 can improve the performance
by about 30% compared to SSE2.

Zero page checking can be faster with this optimization; the test
result shows that for an 8GiB RAM idle guest that has just booted,
this patch can help to shorten the total live migration time by about
6%.

This patch uses the ifunc mechanism to select the proper function at
run time: on platforms that support AVX2, the AVX2 version is
executed; otherwise, the original version is executed.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 include/qemu-common.h |   8 +---
 util/cutils.c | 118 --
 2 files changed, 115 insertions(+), 11 deletions(-)

diff --git a/include/qemu-common.h b/include/qemu-common.h
index 22b010c..f4c8c24 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -483,13 +483,7 @@ void qemu_hexdump(const char *buf, FILE *fp, const char 
*prefix, size_t size);
 #endif
 
 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-static inline bool
-can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-   * sizeof(VECTYPE)) == 0
-&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
 size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 
 /*
diff --git a/util/cutils.c b/util/cutils.c
index cfeb848..5c8ee5c 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -161,6 +161,14 @@ int qemu_fdatasync(int fd)
 #endif
 }
 
+static bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+{
+return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+   * sizeof(VECTYPE)) == 0
+&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
+}
+
 /*
  * Searches for an area with non-zero content in a buffer
  *
@@ -169,8 +177,8 @@ int qemu_fdatasync(int fd)
  * and addr must be a multiple of sizeof(VECTYPE) due to
  * restriction of optimizations in this function.
  *
- * can_use_buffer_find_nonzero_offset() can be used to check
- * these requirements.
+ * can_use_buffer_find_nonzero_offset_inner() can be used to
+ * check these requirements.
  *
  * The return value is the offset of the non-zero area rounded
  * down to a multiple of sizeof(VECTYPE) for the first
@@ -181,13 +189,13 @@ int qemu_fdatasync(int fd)
  * If the buffer is all zero the return value is equal to len.
  */
 
-size_t buffer_find_nonzero_offset(const void *buf, size_t len)
+static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
 {
 const VECTYPE *p = buf;
 const VECTYPE zero = (VECTYPE){0};
 size_t i;
 
-assert(can_use_buffer_find_nonzero_offset(buf, len));
+assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
 
 if (!len) {
 return 0;
@@ -216,6 +224,108 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t 
len)
 return i * sizeof(VECTYPE);
 }
 
+#ifdef CONFIG_AVX2_OPT
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#include <immintrin.h>
+#include <cpuid.h>
+
+#define AVX2_VECTYPE __m256i
+#define AVX2_SPLAT(p)   _mm256_set1_epi8(*(p))
+#define AVX2_ALL_EQ(v1, v2) \
+(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
+#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
+
+static bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+   * sizeof(AVX2_VECTYPE)) == 0
+&& ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
+}
+
+static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+const AVX2_VECTYPE *p = buf;
+const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
+size_t i;
+
+assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
+
+if (!len) {
+return 0;
+}
+
+for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+if (!AVX2_ALL_EQ(p[i], zero)) {
+return i * sizeof(AVX2_VECTYPE);
+}
+}
+
+for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+ i < len / sizeof(AVX2_VECTYPE);
+ i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
+AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
+AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
+AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
+AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
+AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
+if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
+break;
+}
+}
+
+return i * sizeof(AVX2_VECTYPE);
+}
+
+static bool avx2_support(void)
+{
+int a, b, c, d;
+
+if (__get_cpuid_max(0, NULL) < 7) {
+return false;
+}
+
+__cpuid_count(7, 0, a, b, c, d);
+
+return b & bit_AVX2;
+}

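As background, the hunk above relies on GCC's per-function target
selection. A minimal, self-contained sketch of the same #pragma pattern
(the function names here are illustrative, not from the patch):

    #include <cpuid.h>

    #pragma GCC push_options
    #pragma GCC target("avx2")
    #include <immintrin.h>

    /* Compiled with AVX2 codegen even if the file's global flags lack
     * -mavx2; it must only be called after a runtime CPUID check. */
    static int all_zero_avx2(const void *buf)
    {
        __m256i v = _mm256_loadu_si256(buf);
        return _mm256_testz_si256(v, v);  /* 1 iff all 256 bits are 0 */
    }
    #pragma GCC pop_options

    static int all_zero_32bytes(const unsigned char *buf)
    {
        if (__get_cpuid_max(0, NULL) >= 7) {
            unsigned a, b, c, d;
            __cpuid_count(7, 0, a, b, c, d);
            if (b & bit_AVX2) {
                return all_zero_avx2(buf);
            }
        }
        for (int i = 0; i < 32; i++) {
            if (buf[i]) {
                return 0;
            }
        }
        return 1;
    }
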
[Qemu-devel] [PATCH v4 1/2] configure: detect ifunc and avx2 attribute

2016-01-20 Thread Liang Li
Detect whether the compiler supports the ifunc attribute and avx2; if
so, set CONFIG_AVX2_OPT, which will be used to turn on the avx2
instruction optimization.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 configure | 20 
 1 file changed, 20 insertions(+)

diff --git a/configure b/configure
index 44ac9ab..b7f4661 100755
--- a/configure
+++ b/configure
@@ -310,6 +310,7 @@ smartcard=""
 libusb=""
 usb_redir=""
 opengl=""
+avx2_opt=""
 zlib="yes"
 lzo=""
 snappy=""
@@ -1827,6 +1828,20 @@ EOF
 fi
 
 ##
+# avx2 optimization requirement check
+
+cat > $TMPC << EOF
+static void bar(void) {}
+static void foo(void) __attribute__((ifunc("bar")));
+int main(void) { foo(); return 0; }
+EOF
+if compile_prog "" "-mavx2" ; then
+avx2_opt="yes"
+else
+avx2_opt="no"
+fi
+
+#
 # zlib check
 
 if test "$zlib" != "no" ; then
@@ -4855,6 +4870,7 @@ echo "bzip2 support $bzip2"
 echo "NUMA host support $numa"
 echo "tcmalloc support  $tcmalloc"
 echo "jemalloc support  $jemalloc"
+echo "avx2 optimization $avx2_opt"
 
 if test "$sdl_too_old" = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -5236,6 +5252,10 @@ if test "$opengl" = "yes" ; then
   echo "OPENGL_LIBS=$opengl_libs" >> $config_host_mak
 fi
 
+if test "$avx2_opt" = "yes" ; then
+  echo "CONFIG_AVX2_OPT=y" >> $config_host_mak
+fi
+
 if test "$lzo" = "yes" ; then
   echo "CONFIG_LZO=y" >> $config_host_mak
 fi
-- 
1.9.1




[Qemu-devel] [PATCH v4 0/2] add avx2 instruction optimization

2016-01-20 Thread Liang Li
buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms that
support AVX2 instructions, using AVX2 for the optimization improves its
performance by about 30% compared to SSE2.
Zero page checking becomes faster with this optimization; the test
result shows that for an 8GB RAM idle guest, this patch shortens the
total live migration time by about 6%.

This patch uses the ifunc mechanism to select the proper function at
runtime: on platforms that support AVX2, the AVX2 instructions are
executed; otherwise, the original instructions are executed.

With this patch, the QEMU binary can run both on platforms that support
AVX2 and on those that do not.

Compilers that don't support the AVX2 and ifunc attributes can also
build the source code successfully.

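For reference, a minimal sketch of the ifunc mechanism this cover
letter describes (GNU toolchain only; all names below are made up for
illustration, they are not the series' symbols):

    #include <stddef.h>
    #include <cpuid.h>

    static size_t scan_generic(const void *buf, size_t len)
    {
        const unsigned char *p = buf;
        size_t i;

        for (i = 0; i < len; i++) {
            if (p[i]) {
                break;
            }
        }
        return i;
    }

    /* Stand-in for a variant built with -mavx2; it reuses the generic
     * loop so the sketch stays self-contained. */
    static size_t scan_avx2(const void *buf, size_t len)
    {
        return scan_generic(buf, len);
    }

    /* The resolver runs once, when the dynamic linker binds the
     * symbol, so the CPUID test is paid a single time per process. */
    static size_t (*resolve_scan(void))(const void *, size_t)
    {
        unsigned a, b, c, d;

        if (__get_cpuid_max(0, NULL) >= 7) {
            __cpuid_count(7, 0, a, b, c, d);
            if (b & bit_AVX2) {
                return scan_avx2;
            }
        }
        return scan_generic;
    }

    size_t scan(const void *buf, size_t len)
        __attribute__((ifunc("resolve_scan")));
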
v3 -> v4 changes:
  * Use the GCC #pragma to make things simple (Paolo's suggestion) 
  * Put avx2 related code in cutils.c (Richard's suggestion)
  * Change the configure, detect ifunc and avx2 attributes together

v2 -> v3 changes:
  * Detect the ifunc attribute support (Paolo's suggestion) 
  * Use the ifunc attribute instead of the inline asm (Richard's suggestion)
  * Change the configure (Juan's suggestion)

Liang Li (2):
  configure: detect ifunc and avx2 attribute
  cutils: add avx2 instruction optimization

 configure |  20 +
 include/qemu-common.h |   8 +---
 util/cutils.c | 118 --
 3 files changed, 135 insertions(+), 11 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH RESEND v2 2/2] migration: refine ram_save_compressed_page

2016-01-15 Thread Liang Li
Use qemu_put_compression_data to do the compression directly
instead of using do_compress_ram_page, avoiding a data copy.
A very small improvement, but the code looks better.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 20 
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 4e606ab..48ebef0 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -917,22 +917,18 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
 uint64_t *bytes_transferred)
 {
 int pages = -1;
-uint64_t bytes_xmit;
+uint64_t bytes_xmit = 0;
 uint8_t *p;
 int ret;
 
 p = block->host + offset;
 
-bytes_xmit = 0;
 ret = ram_control_save_page(f, block->offset,
 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 if (bytes_xmit) {
 *bytes_transferred += bytes_xmit;
 pages = 1;
 }
-if (block == last_sent_block) {
-offset |= RAM_SAVE_FLAG_CONTINUE;
-}
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
 if (bytes_xmit > 0) {
@@ -952,17 +948,17 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
 flush_compressed_data(f);
 pages = save_zero_page(f, block, offset, p, bytes_transferred);
 if (pages == -1) {
-set_compress_params(&comp_param[0], block, offset);
-/* Use the qemu thread to compress the data to make sure the
- * first page is sent out before other pages
- */
-bytes_xmit = do_compress_ram_page(&comp_param[0]);
-acct_info.norm_pages++;
-qemu_put_qemu_file(f, comp_param[0].file);
+/* Make sure the first page is sent out before other pages */
+bytes_xmit = save_page_header(f, block, offset |
+  RAM_SAVE_FLAG_COMPRESS_PAGE);
+bytes_xmit += qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
+migrate_compress_level());
 *bytes_transferred += bytes_xmit;
+acct_info.norm_pages++;
 pages = 1;
 }
 } else {
+offset |= RAM_SAVE_FLAG_CONTINUE;
 pages = save_zero_page(f, block, offset, p, bytes_transferred);
 if (pages == -1) {
 pages = compress_page_with_multi_thread(f, block, offset,
-- 
1.9.1
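For orientation, the bytes this path emits per compressed page, in
stream order (a reading aid derived from the calls above, not a formal
wire-format specification):

    /* save_page_header(f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE):
     *     8-byte big-endian (offset | flags) word; if
     *     RAM_SAVE_FLAG_CONTINUE is not set, also a 1-byte length plus
     *     the RAMBlock idstr
     * qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, level):
     *     4-byte big-endian compressed length, then the zlib stream
     *     produced by compress2() at migrate_compress_level()
     */
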




[Qemu-devel] (no subject)

2016-01-15 Thread Liang Li
Subject: [PATCH RESEND v2 0/2] Fix flaw of qemu_put_compression_data

The implementation of qemu_put_compression_data only considers the case
where the QEMUFile is not writable; it can't work with a writable
QEMUFile and does not provide any measure to prevent users from using
it with one. For safety, it should be improved to avoid some issues.

ram_save_compressed_page can be refined based on the change to
qemu_put_compression_data; a very small improvement, but the code looks
better.

Liang Li (2):
  qemu-file: Fix qemu_put_compression_data flaw
  migration: refine ram_save_compressed_page

 migration/qemu-file.c | 23 +--
 migration/ram.c   | 20 
 2 files changed, 29 insertions(+), 14 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH RESEND v2 1/2] qemu-file: Fix qemu_put_compression_data flaw

2016-01-15 Thread Liang Li
The current qemu_put_compression_data can only work with a
non-writable QEMUFile and can't work with a writable QEMUFile. But it
does not provide any measure to prevent users from using it with a
writable QEMUFile.

We should fix this flaw to make it work with a writable QEMUFile.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/qemu-file.c | 23 +--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 0bbd257..b956ab6 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -606,8 +606,14 @@ uint64_t qemu_get_be64(QEMUFile *f)
 return v;
 }
 
-/* compress size bytes of data start at p with specific compression
+/* Compress size bytes of data start at p with specific compression
  * level and store the compressed data to the buffer of f.
+ *
+ * When f is not writable, return 0 if f has no space to save the
+ * compressed data.
+ * When f is writable and it has no space to save the compressed data,
+ * do qemu_fflush first; if f still has no space to save the compressed
+ * data, return 0.
  */
 
 ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
@@ -616,7 +622,14 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
 ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t);
 
 if (blen < compressBound(size)) {
-return 0;
+if (!qemu_file_is_writable(f)) {
+return 0;
+}
+qemu_fflush(f);
+blen = IO_BUF_SIZE - sizeof(int32_t);
+if (blen < compressBound(size)) {
+return 0;
+}
 }
 if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *)&blen,
   (Bytef *)p, size, level) != Z_OK) {
@@ -624,7 +637,13 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
 return 0;
 }
 qemu_put_be32(f, blen);
+if (f->ops->writev_buffer) {
+add_to_iovec(f, f->buf + f->buf_index, blen);
+}
 f->buf_index += blen;
+if (f->buf_index == IO_BUF_SIZE) {
+qemu_fflush(f);
+}
 return blen + sizeof(int32_t);
 }
 
-- 
1.9.1
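To make the fixed control flow easier to follow, here is a
self-contained model of it; struct model_file and IO_BUF_SIZE are
stand-ins for QEMUFile internals, and it folds the be32 length prefix
into buf_index directly, which the real code does via qemu_put_be32:

    #include <stddef.h>
    #include <stdint.h>
    #include <sys/types.h>
    #include <zlib.h>

    #define IO_BUF_SIZE 32768

    struct model_file {
        unsigned char buf[IO_BUF_SIZE];
        size_t buf_index;
    };

    static void model_flush(struct model_file *f)
    {
        /* a real QEMUFile hands buf[0..buf_index) to its backend */
        f->buf_index = 0;
    }

    /* If the staging buffer cannot hold a worst-case compressed
     * block, flush and re-check; only then compress in place. */
    static ssize_t model_put_compressed(struct model_file *f,
                                        const unsigned char *p,
                                        size_t size, int level)
    {
        uLongf blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t);

        if (blen < compressBound(size)) {
            model_flush(f);
            blen = IO_BUF_SIZE - sizeof(int32_t);
            if (blen < compressBound(size)) {
                return 0;   /* cannot fit even in an empty buffer */
            }
        }
        if (compress2(f->buf + f->buf_index + sizeof(int32_t), &blen,
                      p, size, level) != Z_OK) {
            return 0;
        }
        f->buf_index += blen + sizeof(int32_t);
        return blen + sizeof(int32_t);
    }
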




[Qemu-devel] [PATCH RESEND v2 0/2] Fix flaw of qemu_put_compression_data

2016-01-15 Thread Liang Li
The implementation of qemu_put_compression_data only considers the case
where the QEMUFile is not writable; it can't work with a writable
QEMUFile and does not provide any measure to prevent users from using
it with one. For safety, it should be improved to avoid some issues.

ram_save_compressed_page can be refined based on the change to
qemu_put_compression_data; a very small improvement, but the code looks
better.

Liang Li (2):
  qemu-file: Fix qemu_put_compression_data flaw
  migration: refine ram_save_compressed_page

 migration/qemu-file.c | 23 +--
 migration/ram.c   | 20 
 2 files changed, 29 insertions(+), 14 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH] migration: not send zero page header in ram bulk stage

2016-01-15 Thread Liang Li
Now that a VM's RAM pages are initialized to zero (the VM's RAM is
allocated with mmap() and the MAP_ANONYMOUS option, or with mmap()
without MAP_SHARED if hugetlbfs is used), there is no need to send the
zero page header to the destination.

For a guest that uses only a small portion of its RAM, this change
avoids allocating all of the guest's RAM pages on the destination node
after live migration. Another benefit is that the destination QEMU can
save a lot of CPU cycles on zero page checking.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 4e606ab..c4821d1 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -705,10 +705,12 @@ static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
 
 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 acct_info.dup_pages++;
-*bytes_transferred += save_page_header(f, block,
-   offset | RAM_SAVE_FLAG_COMPRESS);
-qemu_put_byte(f, 0);
-*bytes_transferred += 1;
+if (!ram_bulk_stage) {
+*bytes_transferred += save_page_header(f, block, offset |
+   RAM_SAVE_FLAG_COMPRESS);
+qemu_put_byte(f, 0);
+*bytes_transferred += 1;
+}
 pages = 1;
 }
 
-- 
1.9.1
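The zero-initialization assumption above is easy to check outside QEMU
with a few lines (Linux/POSIX; a standalone demo, not QEMU code):

    #include <assert.h>
    #include <stddef.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 4096;
        unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        assert(p != MAP_FAILED);
        /* anonymous mappings read back as zero until first written,
         * so a never-dirtied guest page needs no data on the wire */
        for (size_t i = 0; i < len; i++) {
            assert(p[i] == 0);
        }
        munmap(p, len);
        return 0;
    }
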




[Qemu-devel] [v3 1/3] cutils: add avx2 instruction optimization

2015-12-08 Thread Liang Li
buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms that
support AVX2 instructions, using AVX2 for the optimization improves its
performance by about 30% compared to SSE2.
Zero page checking becomes faster with this optimization; the test
result shows that for an 8GB RAM idle guest, this patch shortens the
total live migration time by about 6%.

This patch uses the ifunc mechanism to select the proper function at
runtime: on platforms that support AVX2, the AVX2 instructions are
executed; otherwise, the original code is executed.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 include/qemu-common.h   | 13 +-
 util/Makefile.objs  |  2 ++
 util/buffer-zero-avx2.c | 54 
 util/cutils.c   | 65 +++--
 4 files changed, 125 insertions(+), 9 deletions(-)
 create mode 100644 util/buffer-zero-avx2.c

diff --git a/include/qemu-common.h b/include/qemu-common.h
index 405364f..be8ba79 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -484,15 +484,14 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
 #endif
 
 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-static inline bool
-can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-   * sizeof(VECTYPE)) == 0
-&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
 size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 
+#if defined CONFIG_IFUNC && defined CONFIG_AVX2
+bool can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+#endif
+
 /*
  * helper to parse debug environment variables
  */
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 89dd80e..a130b35 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -1,4 +1,5 @@
 util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
+util-obj-$(CONFIG_AVX2) += buffer-zero-avx2.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
@@ -30,3 +31,4 @@ util-obj-y += qemu-coroutine-sleep.o
 util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
 util-obj-y += buffer.o
 util-obj-y += timed-average.o
+buffer-zero-avx2.o-cflags  := $(AVX2_CFLAGS)
diff --git a/util/buffer-zero-avx2.c b/util/buffer-zero-avx2.c
new file mode 100644
index 000..b9da0e3
--- /dev/null
+++ b/util/buffer-zero-avx2.c
@@ -0,0 +1,54 @@
+#include "qemu-common.h"
+
+#if defined CONFIG_IFUNC && defined CONFIG_AVX2
+#include <immintrin.h>
+#define AVX2_VECTYPE        __m256i
+#define AVX2_SPLAT(p)   _mm256_set1_epi8(*(p))
+#define AVX2_ALL_EQ(v1, v2) \
+(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
+#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
+
+inline bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+   * sizeof(AVX2_VECTYPE)) == 0
+&& ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
+}
+
+size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+const AVX2_VECTYPE *p = buf;
+const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
+size_t i;
+
+assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
+
+if (!len) {
+return 0;
+}
+
+for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+if (!AVX2_ALL_EQ(p[i], zero)) {
+return i * sizeof(AVX2_VECTYPE);
+}
+}
+
+for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+ i < len / sizeof(AVX2_VECTYPE);
+ i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
+AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
+AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
+AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
+AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
+AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
+if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
+break;
+}
+}
+
+return i * sizeof(AVX2_VECTYPE);
+}
+
+#endif
diff --git a/util/cutils.c b/util/cutils.c
index cfeb848..3631c02 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include 
+#include <cpuid.h>
 
 #include "qemu/sockets.h"
 #include "qemu/iov.h"
@@ -161,6 +162,14 @@ int qemu_fdatasync(int fd)
 #endif
 }
 
+static inline bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
+{
+return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+   * sizeof(VECTYPE)) == 0
+&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
+}

[Qemu-devel] [v3 3/3] configure: add options to config avx2

2015-12-08 Thread Liang Li
Add the '--enable-avx2' and '--disable-avx2' options so as to configure
the AVX2 instruction optimization.

If '--disable-avx2' is not set, configure will detect whether the
compiler supports the AVX2 option; if yes, the AVX2 optimization is
enabled, otherwise it is disabled.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 configure | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/configure b/configure
index 394db3b..94e45fa 100755
--- a/configure
+++ b/configure
@@ -311,6 +311,7 @@ libusb=""
 usb_redir=""
 opengl=""
 ifunc=""
+avx2=""
 zlib="yes"
 lzo=""
 snappy=""
@@ -1063,6 +1064,10 @@ for opt do
   ;;
   --enable-usb-redir) usb_redir="yes"
   ;;
+  --disable-avx2) avx2="no"
+  ;;
+  --enable-avx2) avx2="yes"
+  ;;
   --disable-zlib-test) zlib="no"
   ;;
   --disable-lzo) lzo="no"
@@ -1378,6 +1383,7 @@ disabled with --disable-FEATURE, default is enabled if available:
   smartcard   smartcard support (libcacard)
   libusb  libusb (for usb passthrough)
   usb-redir   usb network redirection support
+  avx2support of avx2 instruction
   lzo support of lzo compression library
   snappy  support of snappy compression library
   bzip2   support of bzip2 compression library
@@ -1841,6 +1847,23 @@ else
 ifunc="no"
 fi
 
+
+# avx2 check
+
+if test "$avx2" != "no" ; then
+cat > $TMPC << EOF
+int main(void) { return 0; }
+EOF
+if compile_prog "" "-mavx2" ; then
+avx2="yes"
+else
+if test "$avx2" = "yes" ; then
+feature_not_found "avx2" "Your compiler doesn't support avx2"
+fi
+avx2="no"
+fi
+fi
+
 #
 # zlib check
 
@@ -4853,6 +4876,7 @@ echo "TPM passthrough   $tpm_passthrough"
 echo "QOM debugging $qom_cast_debug"
 echo "vhdx  $vhdx"
 echo "ifunc support $ifunc"
+echo "avx2 support  $avx2"
 echo "lzo support   $lzo"
 echo "snappy support$snappy"
 echo "bzip2 support $bzip2"
@@ -5241,6 +5265,12 @@ if test "$ifunc" = "yes" ; then
   echo "CONFIG_IFUNC=y" >> $config_host_mak
 fi
 
+if test "$avx2" = "yes" ; then
+  avx2_cflags=" -mavx2"
+  echo "AVX2_CFLAGS=$avx2_cflags" >> $config_host_mak
+  echo "CONFIG_AVX2=y" >> $config_host_mak
+fi
+
 if test "$lzo" = "yes" ; then
   echo "CONFIG_LZO=y" >> $config_host_mak
 fi
-- 
1.9.1
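Given the logic above, the three resulting behaviors, seen from the
command line, would look like this (a sketch; run from a QEMU source
tree):

    $ ./configure --enable-avx2    # fail via feature_not_found if the
                                   # compiler cannot build with -mavx2
    $ ./configure --disable-avx2   # force the optimization off
    $ ./configure                  # default: auto-detect and enable
                                   # when -mavx2 compiles
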




[Qemu-devel] [v3 2/3] configure: detect ifunc attribute

2015-12-08 Thread Liang Li
Detect whether the compiler supports the ifunc attribute; the avx2
optimization depends on the ifunc attribute.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 configure | 20 
 1 file changed, 20 insertions(+)

diff --git a/configure b/configure
index b9552fd..394db3b 100755
--- a/configure
+++ b/configure
@@ -310,6 +310,7 @@ smartcard=""
 libusb=""
 usb_redir=""
 opengl=""
+ifunc=""
 zlib="yes"
 lzo=""
 snappy=""
@@ -1827,6 +1828,20 @@ EOF
 fi
 
 ##
+# ifunc check
+
+cat > $TMPC << EOF
+static void bar(void) {}
+static void foo(void) __attribute__((ifunc("bar")));
+int main(void) { foo(); return 0; }
+EOF
+if compile_prog "" "" ; then
+ifunc="yes"
+else
+ifunc="no"
+fi
+
+#
 # zlib check
 
 if test "$zlib" != "no" ; then
@@ -4837,6 +4852,7 @@ echo "libssh2 support   $libssh2"
 echo "TPM passthrough   $tpm_passthrough"
 echo "QOM debugging $qom_cast_debug"
 echo "vhdx  $vhdx"
+echo "ifunc support $ifunc"
 echo "lzo support   $lzo"
 echo "snappy support$snappy"
 echo "bzip2 support $bzip2"
@@ -5221,6 +5237,10 @@ if test "$opengl" = "yes" ; then
   echo "OPENGL_LIBS=$opengl_libs" >> $config_host_mak
 fi
 
+if test "$ifunc" = "yes" ; then
+  echo "CONFIG_IFUNC=y" >> $config_host_mak
+fi
+
 if test "$lzo" = "yes" ; then
   echo "CONFIG_LZO=y" >> $config_host_mak
 fi
-- 
1.9.1




[Qemu-devel] [v3 0/3] add avx2 instruction optimization

2015-12-08 Thread Liang Li
buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms that
support AVX2 instructions, using AVX2 for the optimization improves its
performance by about 30% compared to SSE2.
Zero page checking becomes faster with this optimization; the test
result shows that for an 8GB RAM idle guest, this patch shortens the
total live migration time by about 6%.

This patch uses the ifunc mechanism to select the proper function at
runtime: on platforms that support AVX2, the AVX2 instructions are
executed; otherwise, the original code is executed.

With this patch, the QEMU binary can run both on platforms that support
AVX2 and on those that do not.

Compilers that don't support the AVX2 or ifunc attribute can still
build the source code successfully.


v2 -> v3 changes:
  * Detect the ifunc attribute support (Paolo's suggestion) 
  * Use the ifunc attribute instead of the inline asm (Richard's suggestion)
  * Change the configure (Juan's suggestion)


Liang Li (3):
  cutils: add avx2 instruction optimization
  configure: detect ifunc attribute
  configure: add options to config avx2

 configure   | 50 +
 include/qemu-common.h   | 13 +-
 util/Makefile.objs  |  2 ++
 util/buffer-zero-avx2.c | 54 
 util/cutils.c   | 65 +++--
 5 files changed, 175 insertions(+), 9 deletions(-)
 create mode 100644 util/buffer-zero-avx2.c

-- 
1.9.1




[Qemu-devel] [v2 1/2] qemu-file: Fix qemu_put_compression_data flaw

2015-12-07 Thread Liang Li
The current qemu_put_compression_data can only work with a
non-writable QEMUFile and can't work with a writable QEMUFile. But it
does not provide any measure to prevent users from using it with a
writable QEMUFile.

We should fix this flaw to make it work with a writable QEMUFile.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/qemu-file.c | 23 +--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 0bbd257..b956ab6 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -606,8 +606,14 @@ uint64_t qemu_get_be64(QEMUFile *f)
 return v;
 }
 
-/* compress size bytes of data start at p with specific compression
+/* Compress size bytes of data start at p with specific compression
  * level and store the compressed data to the buffer of f.
+ *
+ * When f is not writable, return 0 if f has no space to save the
+ * compressed data.
+ * When f is writable and it has no space to save the compressed data,
+ * do qemu_fflush first; if f still has no space to save the compressed
+ * data, return 0.
  */
 
 ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
@@ -616,7 +622,14 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
 ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t);
 
 if (blen < compressBound(size)) {
-return 0;
+if (!qemu_file_is_writable(f)) {
+return 0;
+}
+qemu_fflush(f);
+blen = IO_BUF_SIZE - sizeof(int32_t);
+if (blen < compressBound(size)) {
+return 0;
+}
 }
 if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *)&blen,
   (Bytef *)p, size, level) != Z_OK) {
@@ -624,7 +637,13 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
 return 0;
 }
 qemu_put_be32(f, blen);
+if (f->ops->writev_buffer) {
+add_to_iovec(f, f->buf + f->buf_index, blen);
+}
 f->buf_index += blen;
+if (f->buf_index == IO_BUF_SIZE) {
+qemu_fflush(f);
+}
 return blen + sizeof(int32_t);
 }
 
-- 
1.9.1




[Qemu-devel] [v2 0/2] Fix flaw of qemu_put_compression_data

2015-12-07 Thread Liang Li
The implementation of qemu_put_compression_data only considers the case
where the QEMUFile is not writable; it can't work with a writable
QEMUFile and does not provide any measure to prevent users from using
it with one. For safety, it should be improved to avoid some issues.

ram_save_compressed_page can be refined based on the change to
qemu_put_compression_data; a very small improvement, but the code looks
better.

Liang Li (2):
  qemu-file: Fix qemu_put_compression_data flaw
  migration: refine ram_save_compressed_page

 migration/qemu-file.c | 23 +--
 migration/ram.c   | 20 
 2 files changed, 29 insertions(+), 14 deletions(-)

-- 
1.9.1




[Qemu-devel] [v2 2/2] migration: refine ram_save_compressed_page

2015-12-07 Thread Liang Li
Use qemu_put_compression_data to do the compression directly
instead of using do_compress_ram_page, avoiding a data copy.
A very small improvement, but the code looks better.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 20 
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 1eb155a..44b3edc 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -911,22 +911,18 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
 uint64_t *bytes_transferred)
 {
 int pages = -1;
-uint64_t bytes_xmit;
+uint64_t bytes_xmit = 0;
 uint8_t *p;
 int ret;
 
 p = block->host + offset;
 
-bytes_xmit = 0;
 ret = ram_control_save_page(f, block->offset,
 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 if (bytes_xmit) {
 *bytes_transferred += bytes_xmit;
 pages = 1;
 }
-if (block == last_sent_block) {
-offset |= RAM_SAVE_FLAG_CONTINUE;
-}
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
 if (bytes_xmit > 0) {
@@ -946,17 +942,17 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
 flush_compressed_data(f);
 pages = save_zero_page(f, block, offset, p, bytes_transferred);
 if (pages == -1) {
-set_compress_params(&comp_param[0], block, offset);
-/* Use the qemu thread to compress the data to make sure the
- * first page is sent out before other pages
- */
-bytes_xmit = do_compress_ram_page(&comp_param[0]);
-acct_info.norm_pages++;
-qemu_put_qemu_file(f, comp_param[0].file);
+/* Make sure the first page is sent out before other pages */
+bytes_xmit = save_page_header(f, block, offset |
+  RAM_SAVE_FLAG_COMPRESS_PAGE);
+bytes_xmit += qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
+migrate_compress_level());
 *bytes_transferred += bytes_xmit;
+acct_info.norm_pages++;
 pages = 1;
 }
 } else {
+offset |= RAM_SAVE_FLAG_CONTINUE;
 pages = save_zero_page(f, block, offset, p, bytes_transferred);
 if (pages == -1) {
 pages = compress_page_with_multi_thread(f, block, offset,
-- 
1.9.1




[Qemu-devel] [PATCH 1/2] qemu-file: fix flaws of qemu_put_compression_data

2015-12-03 Thread Liang Li
There are some flaws in qemu_put_compression_data; this patch tries
to fix them. Now it can be used by other code.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/qemu-file.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 0bbd257..ef9cd4a 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -616,7 +616,9 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
 ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t);
 
 if (blen < compressBound(size)) {
-return 0;
+if (f->ops->writev_buffer || f->ops->put_buffer) {
+qemu_fflush(f);
+}
 }
 if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *)&blen,
   (Bytef *)p, size, level) != Z_OK) {
@@ -624,7 +626,13 @@ ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
 return 0;
 }
 qemu_put_be32(f, blen);
+if (f->ops->writev_buffer) {
+add_to_iovec(f, f->buf + f->buf_index, blen);
+}
 f->buf_index += blen;
+if (f->buf_index == IO_BUF_SIZE) {
+qemu_fflush(f);
+}
 return blen + sizeof(int32_t);
 }
 
-- 
1.9.1




[Qemu-devel] [PATCH 0/2] fix the flaws of qemu_put_compression_data

2015-12-03 Thread Liang Li
This patch set fixes the flaws in the qemu_put_compression_data
function and cleans up the code based on that change.

Liang Li (2):
  qemu-file: fix flaws of qemu_put_compression_data
  migration: code clean up.

 migration/qemu-file.c | 10 +-
 migration/ram.c   | 20 
 2 files changed, 17 insertions(+), 13 deletions(-)

-- 
1.9.1




[Qemu-devel] [PATCH 2/2] migration: code clean up.

2015-12-03 Thread Liang Li
Use qemu_put_compression_data to do the compression directly
instead of using do_compress_ram_page, avoiding a data copy.
A very small improvement, but the code looks better.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 migration/ram.c | 20 
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 1eb155a..44b3edc 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -911,22 +911,18 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
 uint64_t *bytes_transferred)
 {
 int pages = -1;
-uint64_t bytes_xmit;
+uint64_t bytes_xmit = 0;
 uint8_t *p;
 int ret;
 
 p = block->host + offset;
 
-bytes_xmit = 0;
 ret = ram_control_save_page(f, block->offset,
 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 if (bytes_xmit) {
 *bytes_transferred += bytes_xmit;
 pages = 1;
 }
-if (block == last_sent_block) {
-offset |= RAM_SAVE_FLAG_CONTINUE;
-}
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
 if (bytes_xmit > 0) {
@@ -946,17 +942,17 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
 flush_compressed_data(f);
 pages = save_zero_page(f, block, offset, p, bytes_transferred);
 if (pages == -1) {
-set_compress_params(&comp_param[0], block, offset);
-/* Use the qemu thread to compress the data to make sure the
- * first page is sent out before other pages
- */
-bytes_xmit = do_compress_ram_page(&comp_param[0]);
-acct_info.norm_pages++;
-qemu_put_qemu_file(f, comp_param[0].file);
+/* Make sure the first page is sent out before other pages */
+bytes_xmit = save_page_header(f, block, offset |
+  RAM_SAVE_FLAG_COMPRESS_PAGE);
+bytes_xmit += qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
+migrate_compress_level());
 *bytes_transferred += bytes_xmit;
+acct_info.norm_pages++;
 pages = 1;
 }
 } else {
+offset |= RAM_SAVE_FLAG_CONTINUE;
 pages = save_zero_page(f, block, offset, p, bytes_transferred);
 if (pages == -1) {
 pages = compress_page_with_multi_thread(f, block, offset,
-- 
1.9.1




[Qemu-devel] [v2 RESEND 1/2] cutils: add avx2 instruction optimization

2015-11-09 Thread Liang Li
buffer_find_nonzero_offset() is a hot function during live migration.
It currently uses SSE2 instructions for optimization. On platforms that
support AVX2 instructions, using AVX2 for the optimization improves its
performance by about 30% compared to SSE2.
Zero page checking becomes faster with this optimization; the test
result shows that for an 8GB RAM idle guest, this patch shortens the
total live migration time by about 6%.

This patch uses the ifunc mechanism to select the proper function at
runtime: on platforms that support AVX2, the AVX2 instructions are
executed; otherwise, the original code is executed.

Signed-off-by: Liang Li <liang.z...@intel.com>
---
 include/qemu-common.h | 28 +++--
 util/Makefile.objs|  2 ++
 util/avx2.c   | 68 +++
 util/cutils.c | 47 +--
 4 files changed, 136 insertions(+), 9 deletions(-)
 create mode 100644 util/avx2.c

diff --git a/include/qemu-common.h b/include/qemu-common.h
index 2f74540..9fa7501 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -484,15 +484,29 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
 #endif
 
 #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8
-static inline bool
-can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
-{
-return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
-   * sizeof(VECTYPE)) == 0
-&& ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
-}
+bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
+
 size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 
+extern bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+
+extern size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
+
+extern bool
+can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len);
+
+extern size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len);
+
+__asm__(".type can_use_buffer_find_nonzero_offset, \%gnu_indirect_function");
+__asm__(".type buffer_find_nonzero_offset, \%gnu_indirect_function");
+
+
+void *can_use_buffer_find_nonzero_offset_ifunc(void) \
+ __asm__("can_use_buffer_find_nonzero_offset");
+
+void *buffer_find_nonzero_offset_ifunc(void) \
+ __asm__("buffer_find_nonzero_offset");
 /*
  * helper to parse debug environment variables
  */
diff --git a/util/Makefile.objs b/util/Makefile.objs
index d7cc399..6aacad7 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -1,4 +1,5 @@
 util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
+util-obj-y += avx2.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
 util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
 util-obj-$(CONFIG_POSIX) += mmap-alloc.o
@@ -29,3 +30,4 @@ util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
 util-obj-y += qemu-coroutine-sleep.o
 util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
 util-obj-y += buffer.o
+avx2.o-cflags  := $(AVX2_CFLAGS)
diff --git a/util/avx2.c b/util/avx2.c
new file mode 100644
index 000..d90289b
--- /dev/null
+++ b/util/avx2.c
@@ -0,0 +1,68 @@
+#include "qemu-common.h"
+
+#ifdef __AVX2__
+#include <immintrin.h>
+#define AVX2_VECTYPE        __m256i
+#define AVX2_SPLAT(p)   _mm256_set1_epi8(*(p))
+#define AVX2_ALL_EQ(v1, v2) \
+(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
+#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
+
+inline bool
+can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
+   * sizeof(AVX2_VECTYPE)) == 0
+&& ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0);
+}
+
+size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
+{
+const AVX2_VECTYPE *p = buf;
+const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
+size_t i;
+
+assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
+
+if (!len) {
+return 0;
+}
+
+for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
+if (!AVX2_ALL_EQ(p[i], zero)) {
+return i * sizeof(AVX2_VECTYPE);
+}
+}
+
+for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
+ i < len / sizeof(AVX2_VECTYPE);
+ i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
+AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
+AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
+AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
+AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
+AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
+AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
+if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
+break;
+}
+}
+
+return i * sizeof(AVX2_VECTYPE);
+}
+
+#else
+/* u
