[PATCH v18 09/10] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

2017-11-29 Thread Wei Wang
Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates support
for reporting guest free page hints to the host via virtio-balloon.

The host requests the guest to report free pages by sending a new cmd
id to the guest via the free_page_report_cmd_id configuration register.

When the guest starts to report, the first element added to the free page
vq is the cmd id given by the host. When the guest finishes reporting
all the free pages, VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID is added
to the vq to tell the host that the reporting is done. The host may also
request the guest to stop reporting in advance by sending the stop cmd id
to the guest via the configuration register.
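
For illustration, a minimal sketch of the guest-side flow described above;
the helpers add_outbuf_u32() and report_free_pages_to_vq(), and the config
field name used with virtio_cread(), are hypothetical and not part of this
patch:

/* Illustrative sketch only; helper names are hypothetical. */
static void free_page_report_flow(struct virtio_balloon *vb)
{
	uint32_t cmd_id;

	/* The host writes a new cmd id to the config register to start */
	virtio_cread(vb->vdev, struct virtio_balloon_config,
		     free_page_report_cmd_id, &cmd_id);

	/* 1. The first element added to the free page vq is the cmd id */
	add_outbuf_u32(vb->free_page_vq, &cmd_id);

	/* 2. The free page hints are then added to the vq as inbufs */
	report_free_pages_to_vq(vb);

	/* 3. The stop id tells the host this round of reporting is done */
	vb->stop_cmd_id = VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID;
	add_outbuf_u32(vb->free_page_vq, &vb->stop_cmd_id);
}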

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Michal Hocko <mho...@kernel.org>
---
 drivers/virtio/virtio_balloon.c | 202 +---
 include/uapi/linux/virtio_balloon.h |   4 +
 2 files changed, 167 insertions(+), 39 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 2c21c5a..035bd3a 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -55,7 +55,12 @@ static struct vfsmount *balloon_mnt;
 
 struct virtio_balloon {
struct virtio_device *vdev;
-   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
+
+   /* Balloon's own wq for cpu-intensive work items */
+   struct workqueue_struct *balloon_wq;
+   /* The free page reporting work item submitted to the balloon wq */
+   struct work_struct report_free_page_work;
 
/* The balloon servicing is delegated to a freezable workqueue. */
struct work_struct update_balloon_stats_work;
@@ -65,6 +70,13 @@ struct virtio_balloon {
spinlock_t stop_update_lock;
bool stop_update;
 
+   /* Start to report free pages */
+   bool report_free_page;
+   /* Stores the cmd id given by host to start the free page reporting */
+   uint32_t start_cmd_id;
+   /* Stores STOP_ID as a sign to tell host that the reporting is done */
+   uint32_t stop_cmd_id;
+
/* Waiting for host to ack the pages we released. */
wait_queue_head_t acked;
 
@@ -159,7 +171,8 @@ static void send_one_desc(struct virtio_balloon *vb,
  uint64_t addr,
  uint32_t len,
  bool inbuf,
- bool batch)
+ bool batch,
+ bool wait)
 {
int err;
unsigned int size;
@@ -178,8 +191,12 @@ static void send_one_desc(struct virtio_balloon *vb,
BUG_ON(err);
 
/* If batching is requested, we batch till the vq is full */
-   if (!batch || !vq->num_free)
-   kick_and_wait(vq, vb->acked);
+   if (!batch || !vq->num_free) {
+   if (wait)
+   kick_and_wait(vq, vb->acked);
+   else
+   virtqueue_kick(vq);
+   }
 }
 
 /*
@@ -212,11 +229,11 @@ static void tell_host_sgs(struct virtio_balloon *vb,
addr = pfn_start << PAGE_SHIFT;
len = (pfn_end - pfn_start) << PAGE_SHIFT;
while (len > max_len) {
-   send_one_desc(vb, vq, addr, max_len, true, true);
+   send_one_desc(vb, vq, addr, max_len, true, true, true);
addr += max_len;
len -= max_len;
}
-   send_one_desc(vb, vq, addr, len, true, true);
+   send_one_desc(vb, vq, addr, len, true, true, true);
pfn_start = pfn_end + 1;
}
 
@@ -401,7 +418,7 @@ static unsigned int leak_balloon_sg_oom(struct virtio_balloon *vb)
list_add(&page->lru, &pages);
vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
send_one_desc(vb, vq, virt_to_phys(page_address(page)),
- PAGE_SIZE, true, true);
+ PAGE_SIZE, true, true, true);
release_pages_balloon(vb, &pages);
}
 
@@ -491,17 +508,6 @@ static void stats_handle_request(struct virtio_balloon *vb)
virtqueue_kick(vq);
 }
 
-static void virtballoon_changed(struct virtio_device *vdev)
-{
-   struct virtio_balloon *vb = vdev->priv;
-   unsigned long flags;
-
-   spin_lock_irqsave(&vb->stop_update_lock, flags);
-   if (!vb->stop_update)
-   queue_work(system_freezable_wq, &vb->update_balloon_size_work);
-   spin_unlock_irqrestore(&vb->stop_update_lock, flags);
-}
-
 static inline s64 towards_target(struct virtio_balloon *vb)
 {
s64 target;
@@ -518,6 +524,36 @@ static inline s64 towards_target(struct virtio_balloon *vb)
return target - vb->num_pages;
 }
 
+static void virtballoon_changed(struct virtio_device *vdev)
+{
+   struct virtio_ba

[PATCH v18 05/10] xbitmap: add more operations

2017-11-29 Thread Wei Wang
This patch adds support to find the next 1 or 0 bit in an xbitmap range and
to clear a range of bits.

More possible optimizations to add in the future:
1) xb_set_bit_range: set a range of bits.
2) when searching for a bit, if the bit is not found in the slot, move on to
the next slot directly.
3) add tags to help searching.
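
For illustration, a minimal usage sketch of the new operations, assuming
xb_find_next_set_bit() returns a value greater than or equal to the end
argument when no set bit is found; per the developer notes in the patch,
the caller is responsible for serializing these calls:

/* Illustrative sketch only; not part of the patch. */
static int xbitmap_usage_example(void)
{
	struct xb xb;
	unsigned long bit, end = 10000;
	int err;

	xb_init(&xb);

	/* Preload memory and set a bit in one call; may return -ENOMEM */
	err = xb_preload_and_set_bit(&xb, 1234, GFP_KERNEL);
	if (err)
		return err;

	/* Walk the set bits in [0, end) */
	for (bit = xb_find_next_set_bit(&xb, 0, end); bit < end;
	     bit = xb_find_next_set_bit(&xb, bit + 1, end))
		pr_debug("bit %lu is set\n", bit);

	/* Clear the whole range; empty ida bitmaps are freed */
	xb_clear_bit_range(&xb, 0, end);

	return 0;
}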

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Cc: Matthew Wilcox <mawil...@microsoft.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Michal Hocko <mho...@kernel.org>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
Suggested-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/xbitmap.h  |   8 +-
 lib/xbitmap.c| 180 +++
 tools/include/linux/bitmap.h |  34 
 tools/include/linux/kernel.h |   2 +
 4 files changed, 223 insertions(+), 1 deletion(-)

diff --git a/include/linux/xbitmap.h b/include/linux/xbitmap.h
index b4d8375..eddf0d5e 100644
--- a/include/linux/xbitmap.h
+++ b/include/linux/xbitmap.h
@@ -33,8 +33,14 @@ static inline void xb_init(struct xb *xb)
 }
 
 int xb_set_bit(struct xb *xb, unsigned long bit);
+int xb_preload_and_set_bit(struct xb *xb, unsigned long bit, gfp_t gfp);
 bool xb_test_bit(const struct xb *xb, unsigned long bit);
-int xb_clear_bit(struct xb *xb, unsigned long bit);
+void xb_clear_bit(struct xb *xb, unsigned long bit);
+unsigned long xb_find_next_set_bit(struct xb *xb, unsigned long start,
+  unsigned long end);
+unsigned long xb_find_next_zero_bit(struct xb *xb, unsigned long start,
+   unsigned long end);
+void xb_clear_bit_range(struct xb *xb, unsigned long start, unsigned long end);
 
 static inline bool xb_empty(const struct xb *xb)
 {
diff --git a/lib/xbitmap.c b/lib/xbitmap.c
index 182aa29..816dd3e 100644
--- a/lib/xbitmap.c
+++ b/lib/xbitmap.c
@@ -3,6 +3,13 @@
 #include 
 #include 
 
+/*
+ * Developer notes: locks are required to guarantee there are no concurrent
+ * calls to xb_set_bit, xb_clear_bit, xb_clear_bit_range, xb_test_bit,
+ * xb_find_next_set_bit, or xb_find_next_zero_bit operating on the same
+ * ida bitmap.
+ */
+
 /**
  *  xb_set_bit - set a bit in the xbitmap
  *  @xb: the xbitmap tree used to record the bit
@@ -70,6 +77,28 @@ int xb_set_bit(struct xb *xb, unsigned long bit)
 EXPORT_SYMBOL(xb_set_bit);
 
 /**
+ *  xb_preload_and_set_bit - preload the memory and set a bit in the xbitmap
+ *  @xb: the xbitmap tree used to record the bit
+ *  @bit: index of the bit to set
+ *
+ * A wrapper of the xb_preload() and xb_set_bit().
+ * Returns: 0 on success; -EAGAIN or -ENOMEM on error.
+ */
+int xb_preload_and_set_bit(struct xb *xb, unsigned long bit, gfp_t gfp)
+{
+   int ret = 0;
+
+   if (!xb_preload(gfp))
+   return -ENOMEM;
+
+   ret = xb_set_bit(xb, bit);
+   xb_preload_end();
+
+   return ret;
+}
+EXPORT_SYMBOL(xb_preload_and_set_bit);
+
+/**
  * xb_clear_bit - clear a bit in the xbitmap
  * @xb: the xbitmap tree used to record the bit
  * @bit: index of the bit to clear
@@ -115,6 +144,56 @@ void xb_clear_bit(struct xb *xb, unsigned long bit)
 EXPORT_SYMBOL(xb_clear_bit);
 
 /**
+ * xb_clear_bit_range - clear a range of bits in the xbitmap
+ * @start: the start of the bit range, inclusive
+ * @end: the end of the bit range, inclusive
+ *
+ * This function is used to clear a range of bits in the xbitmap. If all the
+ * bits of the bitmap are 0, the bitmap will be freed.
+ */
+void xb_clear_bit_range(struct xb *xb, unsigned long start, unsigned long end)
+{
+   struct radix_tree_root *root = &xb->xbrt;
+   struct radix_tree_node *node;
+   void **slot;
+   struct ida_bitmap *bitmap;
+   unsigned int nbits;
+
+   for (; start < end; start = (start | (IDA_BITMAP_BITS - 1)) + 1) {
+   unsigned long index = start / IDA_BITMAP_BITS;
+   unsigned long bit = start % IDA_BITMAP_BITS;
+
+   bitmap = __radix_tree_lookup(root, index, &node, &slot);
+   if (radix_tree_exception(bitmap)) {
+   unsigned long ebit = bit + 2;
+   unsigned long tmp = (unsigned long)bitmap;
+
+   nbits = min(end - start + 1, BITS_PER_LONG - ebit);
+
+   if (ebit >= BITS_PER_LONG)
+   continue;
+   bitmap_clear(&tmp, ebit, nbits);
+   if (tmp == RADIX_TREE_EXCEPTIONAL_ENTRY)
+   __radix_tree_delete(root, node, slot);
+   else
+   rcu_assign_pointer(*slot, (void *)tmp);
+   } else if (bitmap) {
+   nbits = min(end - start + 1, IDA_BITMAP_BITS - bit);
+
+   if (nbits != IDA_BITMAP_BITS)
+   bitmap_clear(bitmap->bitmap, bit, nbits);
+
+   if (nbits == IDA_BITMAP_BITS ||
+   bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) {
+   kfree(bitmap);
+   __radix_tree_delete(r

[PATCH v18 08/10] mm: support reporting free page blocks

2017-11-29 Thread Wei Wang
This patch adds support to walk through the free page blocks in the
system and report them via a callback function. Some page blocks may
leave the free list after zone->lock is released, so it is the caller's
responsibility to either detect or prevent the use of such pages.

One example use of this patch is to accelerate live migration by skipping
the transfer of free pages reported by the guest. A popular method used
by the hypervisor to track which part of memory is written during live
migration is to write-protect all the guest memory. So, those pages that
are reported as free pages but are written after the report function
returns will be captured by the hypervisor, and they will be added to the
next round of memory transfer.
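
For illustration, a minimal sketch of a caller of walk_free_mem_block(),
counting free pages in blocks of order 9 and above; the callback and struct
names are hypothetical and not part of this patch:

/* Illustrative sketch only; not part of the patch. */
struct free_count {
	unsigned long pages;
	unsigned long limit;
};

static bool count_pfn_range(void *opaque, unsigned long pfn, unsigned long num)
{
	struct free_count *fc = opaque;

	/* Runs with zone->lock held: must not sleep or allocate here */
	fc->pages += num;

	/* Returning false asks walk_free_mem_block() to stop the walk */
	return fc->pages < fc->limit;
}

static unsigned long count_free_pages(unsigned long limit)
{
	struct free_count fc = { .pages = 0, .limit = limit };

	walk_free_mem_block(&fc, 9, count_pfn_range);
	return fc.pages;
}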

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Michal Hocko <mho...@kernel.org>
Cc: Michael S. Tsirkin <m...@redhat.com>
Acked-by: Michal Hocko <mho...@kernel.org>
---
 include/linux/mm.h |  6 
 mm/page_alloc.c| 91 ++
 2 files changed, 97 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ee07314..c1339be 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1924,6 +1924,12 @@ extern void free_area_init_node(int nid, unsigned long * zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
 extern void free_initmem(void);
 
+extern void walk_free_mem_block(void *opaque,
+   int min_order,
+   bool (*report_pfn_range)(void *opaque,
+unsigned long pfn,
+unsigned long num));
+
 /*
  * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
  * into the buddy system. The freed pages will be poisoned with pattern
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d4096f4..0f4a197 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4892,6 +4892,97 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
show_swap_cache_info();
 }
 
+/*
+ * Walk through a free page list and report the found pfn range via the
+ * callback.
+ *
+ * Return false if the callback requests to stop reporting. Otherwise,
+ * return true.
+ */
+static bool walk_free_page_list(void *opaque,
+   struct zone *zone,
+   int order,
+   enum migratetype mt,
+   bool (*report_pfn_range)(void *,
+unsigned long,
+unsigned long))
+{
+   struct page *page;
+   struct list_head *list;
+   unsigned long pfn, flags;
+   bool ret;
+
+   spin_lock_irqsave(&zone->lock, flags);
+   list = &zone->free_area[order].free_list[mt];
+   list_for_each_entry(page, list, lru) {
+   pfn = page_to_pfn(page);
+   ret = report_pfn_range(opaque, pfn, 1 << order);
+   if (!ret)
+   break;
+   }
+   spin_unlock_irqrestore(&zone->lock, flags);
+
+   return ret;
+}
+
+/**
+ * walk_free_mem_block - Walk through the free page blocks in the system
+ * @opaque: the context passed from the caller
+ * @min_order: the minimum order of free lists to check
+ * @report_pfn_range: the callback to report the pfn range of the free pages
+ *
+ * If the callback returns false, stop iterating the list of free page blocks.
+ * Otherwise, continue to report.
+ *
+ * Please note that there are no locking guarantees for the callback and
+ * that the reported pfn range might be freed or disappear after the
+ * callback returns so the caller has to be very careful how it is used.
+ *
+ * The callback itself must not sleep or perform any operations which would
+ * require any memory allocations directly (not even GFP_NOWAIT/GFP_ATOMIC)
+ * or via any lock dependency. It is generally advisable to implement
+ * the callback as simple as possible and defer any heavy lifting to a
+ * different context.
+ *
+ * There is no guarantee that each free range will be reported only once
+ * during one walk_free_mem_block invocation.
+ *
+ * pfn_to_page on the given range is strongly discouraged and if there is
+ * an absolute need for that make sure to contact MM people to discuss
+ * potential problems.
+ *
+ * The function itself might sleep so it cannot be called from atomic
+ * contexts.
+ *
+ * In general low orders tend to be very volatile and so it makes more
+ * sense to query larger ones first for various optimizations which like
+ * ballooning etc... This will reduce the overhead as well.
+ */
+void walk_free_mem_block(void *opaque,
+int min_order,
+bool (*report_pfn_range)(void *opaque,
+ unsigned long pfn,
+ unsigned long num))
+{
+  

[PATCH v18 10/10] virtio-balloon: don't report free pages when page poisoning is enabled

2017-11-29 Thread Wei Wang
The guest free pages should not be discarded by the live migration thread
when page poisoning is enabled with PAGE_POISONING_NO_SANITY=n, because
skipping the transfer of such poisoned free pages will trigger false
positives when new pages are allocated and checked on the destination.
This patch skips the reporting of free pages in the above case.
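
For illustration, the condition added below can be read as the following
sketch; only when the poison pattern is actually verified on allocation
(PAGE_POISONING_NO_SANITY=n) does dropping a reported free page risk a
false positive. The helper name is hypothetical:

/* Illustrative restatement only; not part of the patch. */
static bool free_page_report_is_safe(void)
{
	if (!page_poisoning_enabled())
		return true;	/* no poison pattern to corrupt */
	if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
		return true;	/* poison is written but never checked */
	return false;		/* poison is checked on alloc: don't report */
}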

Reported-by: Michael S. Tsirkin <m...@redhat.com>
Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Cc: Michal Hocko <mho...@suse.com>
---
 drivers/virtio/virtio_balloon.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 035bd3a..6ac4cff 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -652,7 +652,9 @@ static void report_free_page(struct work_struct *work)
/* Start by sending the obtained cmd id to the host with an outbuf */
send_one_desc(vb, vb->free_page_vq, virt_to_phys(&vb->start_cmd_id),
  sizeof(uint32_t), false, true, false);
-   walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
+   if (!(page_poisoning_enabled() &&
+   !IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY)))
+   walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
/*
 * End by sending the stop id to the host with an outbuf. Use the
 * non-batching mode here to trigger a kick after adding the stop id.
-- 
2.7.4



[PATCH v18 04/10] xbitmap: potential improvement

2017-11-29 Thread Wei Wang
This patch made some changes to the original xbitmap implementation from
the linux-dax tree:

- remove xb_fill() and xb_zero() from xbitmap.h since they are not
  implemented;

- xb_test_bit: changed "ebit > BITS_PER_LONG" to "ebit >= BITS_PER_LONG",
  because bit 64 is beyond the "unsigned long" exceptional entry (0 to 63);

- xb_set_bit: delete the newly inserted radix_tree_node when failing to
  get the per-CPU ida bitmap; this avoids leaking the unused radix tree
  node left in the tree.

- xb_clear_bit: change it to be a void function, since the original
  implementation returns nothing other than 0.

- remove the comment above "#define XB_INDEX_BITS", because it causes
  confusion based on the feedback from the previous discussion;

- xb_preload: with the original implementation, the CPU that successfully
  does __radix_tree_preload() may sleep in kmalloc(), which risks the
  caller of xb_preload() being scheduled to another CPU after being woken
  up, and the new CPU may not have a radix_tree_node pre-allocated there;
  this would be a problem when inserting a node into the tree later. This
  patch moves __radix_tree_preload() after kmalloc() and returns a boolean
  to indicate success or failure. Also, add the __must_check annotation to
  xb_preload() for prudence.
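
For illustration, the calling convention after this change looks roughly
like the sketch below; the wrapper name is hypothetical, and callers must
check the return value of xb_preload() and pair a successful preload with
xb_preload_end():

/* Illustrative sketch only; not part of the patch. */
static int xb_record_bit(struct xb *xb, unsigned long bit, gfp_t gfp)
{
	int err;

	if (!xb_preload(gfp))
		return -ENOMEM;		/* preemption is left enabled on failure */

	err = xb_set_bit(xb, bit);	/* may still return -EAGAIN */
	xb_preload_end();		/* re-enables preemption */

	return err;
}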

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Cc: Matthew Wilcox <mawil...@microsoft.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Michal Hocko <mho...@kernel.org>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
---
 include/linux/xbitmap.h |  5 +
 lib/radix-tree.c| 27 +--
 lib/xbitmap.c   | 24 +---
 3 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/include/linux/xbitmap.h b/include/linux/xbitmap.h
index ed75d87..b4d8375 100644
--- a/include/linux/xbitmap.h
+++ b/include/linux/xbitmap.h
@@ -36,15 +36,12 @@ int xb_set_bit(struct xb *xb, unsigned long bit);
 bool xb_test_bit(const struct xb *xb, unsigned long bit);
 int xb_clear_bit(struct xb *xb, unsigned long bit);
 
-int xb_zero(struct xb *xb, unsigned long start, unsigned long nbits);
-int xb_fill(struct xb *xb, unsigned long start, unsigned long nbits);
-
 static inline bool xb_empty(const struct xb *xb)
 {
return radix_tree_empty(&xb->xbrt);
 }
 
-void xb_preload(gfp_t);
+bool xb_preload(gfp_t);
 
 static inline void xb_preload_end(void)
 {
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 7000ad6..a039588 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -77,9 +77,6 @@ static struct kmem_cache *radix_tree_node_cachep;
RADIX_TREE_MAP_SHIFT))
 #define IDA_PRELOAD_SIZE   (IDA_MAX_PATH * 2 - 1)
 
-/*
- * The XB can go up to unsigned long, but also uses a bitmap.
- */
 #define XB_INDEX_BITS  (BITS_PER_LONG - ilog2(IDA_BITMAP_BITS))
 #define XB_MAX_PATH(DIV_ROUND_UP(XB_INDEX_BITS, \
RADIX_TREE_MAP_SHIFT))
@@ -2145,17 +2142,35 @@ int ida_pre_get(struct ida *ida, gfp_t gfp)
 }
 EXPORT_SYMBOL(ida_pre_get);
 
-void xb_preload(gfp_t gfp)
+/**
+ *  xb_preload - preload for xb_set_bit()
+ *  @gfp_mask: allocation mask to use for preloading
+ *
+ * Preallocate memory to use for the next call to xb_set_bit(). On success,
+ * return true, with preemption disabled. On error, return false with
+ * preemption not disabled.
+ */
+__must_check bool xb_preload(gfp_t gfp)
 {
-   __radix_tree_preload(gfp, XB_PRELOAD_SIZE);
if (!this_cpu_read(ida_bitmap)) {
struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp);
 
if (!bitmap)
-   return;
+   return false;
+   /*
+* The per-CPU variable is updated with preemption enabled.
+* If the calling task is unlucky to be scheduled to another
+* CPU which has no ida_bitmap allocation, it will be detected
+* when setting a bit (i.e. __xb_set_bit()).
+*/
bitmap = this_cpu_cmpxchg(ida_bitmap, NULL, bitmap);
kfree(bitmap);
}
+
+   if (__radix_tree_preload(gfp, XB_PRELOAD_SIZE) < 0)
+   return false;
+
+   return true;
 }
 EXPORT_SYMBOL(xb_preload);
 
diff --git a/lib/xbitmap.c b/lib/xbitmap.c
index 2b547a73..182aa29 100644
--- a/lib/xbitmap.c
+++ b/lib/xbitmap.c
@@ -39,8 +39,10 @@ int xb_set_bit(struct xb *xb, unsigned long bit)
return 0;
}
bitmap = this_cpu_xchg(ida_bitmap, NULL);
-   if (!bitmap)
+   if (!bitmap) {
+   __radix_tree_delete(root, node, slot);
return -EAGAIN;
+   }
memset(bitmap, 0, sizeof(*bitmap));
bitmap->bitmap[0] = tmp >> RADIX_TREE_EXCEPTIONAL_SHIFT;
rcu_assign_pointer(*slot, bitmap);
@@ -54,8 +56,10 @@ int xb_set_bit(struc

[PATCH v18 02/10] radix tree test suite: remove ARRAY_SIZE to avoid redefinition

2017-11-29 Thread Wei Wang
ARRAY_SIZE() has been defined in include/linux/kernel.h, and "make"
emits a redefinition warning for ARRAY_SIZE() in
tools/testing/radix-tree/linux/kernel.h. So, remove ARRAY_SIZE() from there.

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Cc: Matthew Wilcox <mawil...@microsoft.com>
Cc: Andrew Morton <a...@linux-foundation.org>
---
 tools/testing/radix-tree/linux/kernel.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index c3bc3f3..426f32f 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -17,6 +17,4 @@
 #define pr_debug printk
 #define pr_cont printk
 
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-
 #endif /* _KERNEL_H */
-- 
2.7.4




Re: [PATCH v17 6/6] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

2017-11-20 Thread Wei Wang

On 11/14/2017 01:32 AM, Michael S. Tsirkin wrote:

You should Cc Nitesh who is working on a related feature.


OK, I will. We have two more issues which haven't been discussed yet;
please take a look below.




On Mon, Nov 13, 2017 at 06:34:48PM +0800, Wei Wang wrote:

Ping for comments, thanks.

On 11/03/2017 04:13 PM, Wei Wang wrote:

+static void virtballoon_cmd_report_free_page_start(struct virtio_balloon *vb)
+{
+   unsigned long flags;
+
+   vb->report_free_page_stop = false;

this flag is used a lot outside any locks. Why is this safe?
Please add some comments explaining access to this flag.


I will invert the logic as suggested: vb->report_free_page. I also plan to
simplify its usage as below.


The flag is set or cleared in the config handler according to the
new_cmd_id given by the host:

new_cmd_id == 0:          WRITE_ONCE(vb->report_free_page, false); // stop reporting
new_cmd_id != old_cmd_id: WRITE_ONCE(vb->report_free_page, true);  // start reporting



The flag is read by virtio_balloon_send_free_pages() - the callback to 
report free pages:


if (!READ_ONCE(vb->report_free_page))
return false;

I don't find where it could be unsafe then (the flag is written by the 
config handler only).
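
For illustration, a minimal sketch of the scheme described above; the
config-handler function name is hypothetical, and the previously stored
cmd id is assumed to live in vb->start_cmd_id:

/* Illustrative sketch only; not the posted patch. */
static void handle_new_cmd_id(struct virtio_balloon *vb, u32 new_cmd_id)
{
	/* The config handler is the only writer of the flag */
	if (new_cmd_id == 0)
		WRITE_ONCE(vb->report_free_page, false);	/* stop reporting */
	else if (new_cmd_id != vb->start_cmd_id)
		WRITE_ONCE(vb->report_free_page, true);		/* start reporting */
}

static bool virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
					   unsigned long nr_pages)
{
	struct virtio_balloon *vb = opaque;

	/* The callback only reads the flag */
	if (!READ_ONCE(vb->report_free_page))
		return false;	/* ask walk_free_mem_block() to stop */

	/* ... add the pfn range to the free page vq ... */
	return true;
}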







+}
+
   static inline s64 towards_target(struct virtio_balloon *vb)
   {
s64 target;
@@ -597,42 +673,147 @@ static void update_balloon_size_func(struct work_struct 
*work)
queue_work(system_freezable_wq, work);
   }
-static int init_vqs(struct virtio_balloon *vb)
+static bool virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
+  unsigned long nr_pages)
   {
-   struct virtqueue *vqs[3];
-   vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request 
};
-   static const char * const names[] = { "inflate", "deflate", "stats" };
-   int err, nvqs;
+   struct virtio_balloon *vb = (struct virtio_balloon *)opaque;
+   void *addr = (void *)pfn_to_kaddr(pfn);

How do we know all free pages have a kaddr?


For x86_64, it works well since the kernel has all the physical memory
mapped already. But for a 32-bit kernel, yes, the high memory usually
isn't mapped and thus has no kaddr. Essentially, this pfn_to_kaddr
conversion isn't necessary; we do it here because the current API that
virtio has is based on "struct scatterlist", which takes a kaddr, and this
kaddr is then converted back to a physical address in virtqueue_add() when
assigning to desc->addr.


I think a better solution would be to add a new API, which directly 
assigns the caller's guest physical address to desc->addr, similar to 
the previous implementation "add_one_chunk()" 
(https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg02452.html). 
But we can change that to a general virtio API:
virtqueue_add_one_desc(struct virtqueue *_vq, u64 base_addr, u32 size, 
bool in_desc, void *data);
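
For illustration, a rough sketch of how the reporting callback might use
such an API; virtqueue_add_one_desc() is the proposal and does not exist
yet, and a real implementation would cap the length to avoid 32-bit
overflow and handle errors:

/* Illustrative sketch only; the API below is the proposal, not existing code. */
static bool virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
					   unsigned long nr_pages)
{
	struct virtio_balloon *vb = opaque;
	u64 addr = (u64)pfn << PAGE_SHIFT;	/* guest physical address */
	u32 len = nr_pages << PAGE_SHIFT;	/* assumes no u32 overflow */

	/* No pfn_to_kaddr(): hand the physical address straight to the desc */
	virtqueue_add_one_desc(vb->free_page_vq, addr, len, true, vb);

	return true;
}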


What do you think?

Best,
Wei




Re: [virtio-dev] Re: [PATCH v17 6/6] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

2017-11-17 Thread Wei Wang

On 11/17/2017 07:35 PM, Wei Wang wrote:

On 11/16/2017 09:27 PM, Wei Wang wrote:

On 11/16/2017 04:32 AM, Michael S. Tsirkin wrote:

On Fri, Nov 03, 2017 at 04:13:06PM +0800, Wei Wang wrote:

Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
support of reporting hints of guest free pages to the host via
virtio-balloon. The host requests the guest to report the free 
pages by

sending commands via the virtio-balloon configuration registers.

When the guest starts to report, the first element added to the 
free page

vq is a sequence id of the start reporting command. The id is given by
the host, and it indicates whether the following free pages correspond
to the command. For example, the host may stop the report and start 
again
with a new command id. The obsolete pages for the previous start 
command
can be detected by the id mismatch on the host. The id is added 
to the
vq using an output buffer, and the free pages are added to the vq 
using

input buffer.

Here are some explanations about the added configuration registers:
- host2guest_cmd: a register used by the host to send commands to the
guest.
- guest2host_cmd: written by the guest to ACK to the host about the
commands that have been received. The host will clear the 
corresponding

bits on the host2guest_cmd register. The guest also uses this register
to send commands to the host (e.g. when finish free page reporting).
- free_page_cmd_id: the sequence id of the free page report command
given by the host.

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Michal Hocko <mho...@kernel.org>
---

+
+static void report_free_page(struct work_struct *work)
+{
+struct virtio_balloon *vb;
+
+vb = container_of(work, struct virtio_balloon, 
report_free_page_work);

+report_free_page_cmd_id(vb);
+walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
+/*
+ * The last few free page blocks that were added may not reach 
the
+ * batch size, but need a kick to notify the device to handle 
them.

+ */
+virtqueue_kick(vb->free_page_vq);
+report_free_page_end(vb);
+}
+

I think there's an issue here: if pages are poisoned and hypervisor
subsequently drops them, testing them after allocation will
trigger a false positive.

The specific configuration:

PAGE_POISONING on
PAGE_POISONING_NO_SANITY off
PAGE_POISONING_ZERO off


Solutions:
1. disable the feature in that configuration
suggested as an initial step


Thanks for the finding.
Similar to this option: I'm wondering whether we could make
walk_free_mem_block() simply return if that option is on?

That is, at the beginning of the function:
if (!page_poisoning_enabled())
return;




Having thought about it more, I think it would be better to put this logic in
virtio_balloon:


send_free_page_cmd_id(vb, &vb->start_cmd_id);
if (page_poisoning_enabled() &&
!IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);


logic should be inverse:
if (!(page_poisoning_enabled() &&
!IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY)))

Best,
Wei



Re: [virtio-dev] Re: [PATCH v17 6/6] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

2017-11-17 Thread Wei Wang

On 11/16/2017 09:27 PM, Wei Wang wrote:

On 11/16/2017 04:32 AM, Michael S. Tsirkin wrote:

On Fri, Nov 03, 2017 at 04:13:06PM +0800, Wei Wang wrote:

Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
support of reporting hints of guest free pages to the host via
virtio-balloon. The host requests the guest to report the free pages by
sending commands via the virtio-balloon configuration registers.

When the guest starts to report, the first element added to the free 
page

vq is a sequence id of the start reporting command. The id is given by
the host, and it indicates whether the following free pages correspond
to the command. For example, the host may stop the report and start 
again
with a new command id. The obsolete pages for the previous start 
command
can be detected by the id mismatch on the host. The id is added 
to the

vq using an output buffer, and the free pages are added to the vq using
input buffer.

Here are some explanations about the added configuration registers:
- host2guest_cmd: a register used by the host to send commands to the
guest.
- guest2host_cmd: written by the guest to ACK to the host about the
commands that have been received. The host will clear the corresponding
bits on the host2guest_cmd register. The guest also uses this register
to send commands to the host (e.g. when finish free page reporting).
- free_page_cmd_id: the sequence id of the free page report command
given by the host.

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Michal Hocko <mho...@kernel.org>
---

+
+static void report_free_page(struct work_struct *work)
+{
+struct virtio_balloon *vb;
+
+vb = container_of(work, struct virtio_balloon, 
report_free_page_work);

+report_free_page_cmd_id(vb);
+walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
+/*
+ * The last few free page blocks that were added may not reach the
+ * batch size, but need a kick to notify the device to handle 
them.

+ */
+virtqueue_kick(vb->free_page_vq);
+report_free_page_end(vb);
+}
+

I think there's an issue here: if pages are poisoned and hypervisor
subsequently drops them, testing them after allocation will
trigger a false positive.

The specific configuration:

PAGE_POISONING on
PAGE_POISONING_NO_SANITY off
PAGE_POISONING_ZERO off


Solutions:
1. disable the feature in that configuration
suggested as an initial step


Thanks for the finding.
Similar to this option: I'm wondering whether we could make
walk_free_mem_block() simply return if that option is on?

That is, at the beginning of the function:
if (!page_poisoning_enabled())
return;




Having thought about it more, I think it would be better to put this logic in
virtio_balloon:


send_free_page_cmd_id(vb, &vb->start_cmd_id);
if (page_poisoning_enabled() &&
!IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);

send_free_page_cmd_id(vb, &vb->stop_cmd_id);


walk_free_mem_block() should be a more generic API, and this potential 
page poisoning issue is specific to live migration which is only one use 
case of this function, so I think it is better to handle it in the 
special use case itself.


Best,
Wei





Re: [virtio-dev] Re: [PATCH v17 6/6] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

2017-11-16 Thread Wei Wang

On 11/16/2017 04:32 AM, Michael S. Tsirkin wrote:

On Fri, Nov 03, 2017 at 04:13:06PM +0800, Wei Wang wrote:

Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
support of reporting hints of guest free pages to the host via
virtio-balloon. The host requests the guest to report the free pages by
sending commands via the virtio-balloon configuration registers.

When the guest starts to report, the first element added to the free page
vq is a sequence id of the start reporting command. The id is given by
the host, and it indicates whether the following free pages correspond
to the command. For example, the host may stop the report and start again
with a new command id. The obsolete pages for the previous start command
can be detected by the id mismatch on the host. The id is added to the
vq using an output buffer, and the free pages are added to the vq using
an input buffer.

Here are some explanations about the added configuration registers:
- host2guest_cmd: a register used by the host to send commands to the
guest.
- guest2host_cmd: written by the guest to ACK to the host about the
commands that have been received. The host will clear the corresponding
bits on the host2guest_cmd register. The guest also uses this register
to send commands to the host (e.g. when finishing free page reporting).
- free_page_cmd_id: the sequence id of the free page report command
given by the host.

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Michal Hocko <mho...@kernel.org>
---

+
+static void report_free_page(struct work_struct *work)
+{
+   struct virtio_balloon *vb;
+
+   vb = container_of(work, struct virtio_balloon, report_free_page_work);
+   report_free_page_cmd_id(vb);
+   walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
+   /*
+* The last few free page blocks that were added may not reach the
+* batch size, but need a kick to notify the device to handle them.
+*/
+   virtqueue_kick(vb->free_page_vq);
+   report_free_page_end(vb);
+}
+

I think there's an issue here: if pages are poisoned and hypervisor
subsequently drops them, testing them after allocation will
trigger a false positive.

The specific configuration:

PAGE_POISONING on
PAGE_POISONING_NO_SANITY off
PAGE_POISONING_ZERO off


Solutions:
1. disable the feature in that configuration
suggested as an initial step


Thanks for the finding.
Similar to this option: I'm thinking could we make walk_free_mem_block() 
simply return if that option is on?

That is, at the beginning of the function:
if (!page_poisoning_enabled())
return;

I think in most usages, people would not choose to use the poisoning 
option due to the added overhead.



Probably we could make this a separate fix patch following patch 5, and
explain the above reasons in its commit message.



2. pass poison value to host so it can validate page content
before it drops it
3. pass poison value to host so it can init allocated pages with that value

In fact one nice side effect would be that unmap
becomes safe even though free list is not locked anymore.


I haven't got this point yet - how would it bring a performance benefit?


It would be interesting to see whether this last has
any value performance-wise.



Best,
Wei


Re: [PATCH v17 6/6] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

2017-11-16 Thread Wei Wang

On 11/15/2017 09:26 PM, Michael S. Tsirkin wrote:

On Wed, Nov 15, 2017 at 11:47:58AM +0800, Wei Wang wrote:

On 11/15/2017 05:21 AM, Michael S. Tsirkin wrote:

On Tue, Nov 14, 2017 at 08:02:03PM +0800, Wei Wang wrote:

On 11/14/2017 01:32 AM, Michael S. Tsirkin wrote:

- guest2host_cmd: written by the guest to ACK to the host about the
commands that have been received. The host will clear the corresponding
bits on the host2guest_cmd register. The guest also uses this register
to send commands to the host (e.g. when finish free page reporting).

I am not sure what is the role of guest2host_cmd. Reporting of
the correct cmd id seems sufficient indication that guest
received the start command. Not getting any more seems sufficient
to detect stop.


I think the issue is when the host is waiting for the guest to report pages,
it does not know whether the guest is going to report more or the report is
done already. That's why we need a way to let the guest tell the host "the
report is done, don't wait for more", then the host continues to the next
step - sending the non-free pages to the destination. The following method
is a conclusion of other comments, with some new thought. Please have a
check if it is good.

config won't work well for this IMHO.
Writes to config register are hard to synchronize with the VQ.
For example, guest sends free pages, host says stop, meanwhile
guest sends stop for 1st set of pages.

I still don't see an issue with this. Please see below:
(before jumping into the discussion, just make sure I've well explained this
point: now host-to-guest commands are done via config, and guest-to-host
commands are done via the free page vq)

This is fine by me actually. But right now you have guest to host
not going through vq, going through command register instead -
this is how sending stop to host seems to happen.
If you make it go through vq then I think all will be well.


Case: the host starts to request the reporting with cmd_id=1. Some time later,
the host writes "stop" to config; meanwhile the guest happens to finish the reporting
and plans to actively send a "stop" command from the free_page_vq().
   Essentially, this is like a sync between two threads - if we view
the config interrupt handler as one thread, another is the free page
reporting worker thread.

 - what the config handler does is simply:
   1.1:  WRITE_ONCE(vb->reporting_stop, true);

 - what the reporting thread will do is
   2.1:  WRITE_ONCE(vb->reporting_stop, true);
   2.2:  send_stop_to_host_via_vq();

 From the guest point of view, no matter 1.1 is executed first or 2.1 first,
it doesn't make a difference to the end result - vb->reporting_stop is set.

 From the host point of view, it knows that cmd_id=1 has truly stopped the
reporting when it receives a "stop" sign via the vq.
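
To make the two paths above concrete, a minimal sketch (helper names here are
illustrative, not from the posted patch):

/* 1.1: the config interrupt handler only flags the request. */
static void handle_stop_from_config(struct virtio_balloon *vb)
{
	WRITE_ONCE(vb->reporting_stop, true);
}

static void report_free_page_worker(struct virtio_balloon *vb)
{
	while (!READ_ONCE(vb->reporting_stop))
		report_next_free_page_block(vb);	/* assumed helper */

	/* 2.1 + 2.2: set the flag, then tell the host via the free page vq. */
	WRITE_ONCE(vb->reporting_stop, true);
	send_stop_to_host_via_vq(vb);			/* assumed helper */
}

Whichever of 1.1 and 2.1 runs first, the flag ends up set, and the host only
treats the report as finished once the vq-side stop sign arrives.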



How about adding a buffer with "stop" in the VQ instead?
Wastes a VQ entry which you will need to reserve for this
but is it a big deal?

The free page vq is guest-to-host direction.

Yes, for guest to host stop sign.


Using it for host-to-guest
requests will make it bidirectional, which will result in the same issue
described before: https://lkml.org/lkml/2017/10/11/1009 (the first response)

On the other hand, I think adding another new vq for host-to-guest
requesting doesn't make a difference in essence, compared to using config
(same 1.1, 2.1, 2.2 above), but will be more complicated.

I agree with this. Host to guest can just increment the "free command id"
register.



OK, thanks for the suggestions. I think one more issue left here:

Previously, when the guest receives a config interrupt, it blindly adds 
the balloon work item to the workqueue in virtballoon_changed(), because 
only ballooning uses the config.

Now, free page reporting is requested via config, too.

We have the following two options:

Option 1: add "diff = towards_target()" to virtballoon_changed(), and if
diff == 0, it will not add the balloon work item to the wq.


Option 2: add "cmd" for the host-to-guest request, and add the item when 
"cmd | CMD_BALLOON" is true.


I'm inclined to take option 1 now. Which one would you prefer?
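
For reference, option 1 could look roughly like this (a sketch reusing the
existing towards_target() helper; not the final patch):

static void virtballoon_changed(struct virtio_device *vdev)
{
	struct virtio_balloon *vb = vdev->priv;
	unsigned long flags;
	s64 diff = towards_target(vb);

	spin_lock_irqsave(&vb->stop_update_lock, flags);
	/* Only queue balloon work when the target actually changed. */
	if (!vb->stop_update && diff)
		queue_work(system_freezable_wq,
			   &vb->update_balloon_size_work);
	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
}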

Best,
Wei




Re: [PATCH v17 6/6] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

2017-11-14 Thread Wei Wang

On 11/15/2017 05:21 AM, Michael S. Tsirkin wrote:

On Tue, Nov 14, 2017 at 08:02:03PM +0800, Wei Wang wrote:

On 11/14/2017 01:32 AM, Michael S. Tsirkin wrote:

- guest2host_cmd: written by the guest to ACK to the host about the
commands that have been received. The host will clear the corresponding
bits on the host2guest_cmd register. The guest also uses this register
to send commands to the host (e.g. when finish free page reporting).

I am not sure what is the role of guest2host_cmd. Reporting of
the correct cmd id seems sufficient indication that guest
received the start command. Not getting any more seems sufficient
to detect stop.


I think the issue is when the host is waiting for the guest to report pages,
it does not know whether the guest is going to report more or the report is
done already. That's why we need a way to let the guest tell the host "the
report is done, don't wait for more", then the host continues to the next
step - sending the non-free pages to the destination. The following method
is a conclusion of other comments, with some new thought. Please have a
check if it is good.

config won't work well for this IMHO.
Writes to config register are hard to synchronize with the VQ.
For example, guest sends free pages, host says stop, meanwhile
guest sends stop for 1st set of pages.


I still don't see an issue with this. Please see below:
(before jumping into the discussion, just make sure I've well explained 
this point: now host-to-guest commands are done via config, and 
guest-to-host commands are done via the free page vq)


Case: the host starts to request the reporting with cmd_id=1. Some time
later, the host writes "stop" to config; meanwhile the guest happens to finish
the reporting and plans to actively send a "stop" command from the
free_page_vq().
  Essentially, this is like a sync between two threads - if we 
view the config interrupt handler as one thread, another is the free 
page reporting worker thread.


- what the config handler does is simply:
  1.1:  WRITE_ONCE(vb->reporting_stop, true);

- what the reporting thread will do is
  2.1:  WRITE_ONCE(vb->reporting_stop, true);
  2.2:  send_stop_to_host_via_vq();

From the guest point of view, no matter 1.1 is executed first or 2.1 
first, it doesn't make a difference to the end result - 
vb->reporting_stop is set.


From the host point of view, it knows that cmd_id=1 has truly stopped 
the reporting when it receives a "stop" sign via the vq.




How about adding a buffer with "stop" in the VQ instead?
Wastes a VQ entry which you will need to reserve for this
but is it a big deal?


The free page vq is guest-to-host direction. Using it for host-to-guest 
requests will make it bidirectional, which will result in the same issue 
described before: https://lkml.org/lkml/2017/10/11/1009 (the first response)


On the other hand, I think adding another new vq for host-to-guest 
requesting doesn't make a difference in essence, compared to using 
config (same 1.1, 2.1, 2.2 above), but will be more complicated.




Two new configuration registers in total:
- cmd_reg: the command register, combined from the previous host2guest and
guest2host. I think we can use the same register for host requesting and
guest ACKing, since the guest writing will trap to QEMU, that is, all the
writes to the register are performed in QEMU, and we can keep things working
correctly there.
- cmd_id_reg: the sequence id of the free page report command.

-- free page report:
 - host requests the guest to start reporting by "cmd_reg |
REPORT_START";
 - guest ACKs to the host about receiving the start reporting request by
"cmd_reg | REPORT_START", host will clear the flag bit once receiving the
ACK.
 - host requests the guest to stop reporting by "cmd_reg | REPORT_STOP";
 - guest ACKs to the host about receiving the stop reporting request by
"cmd_reg | REPORT_STOP", host will clear the flag once receiving the ACK.
 - guest tells the host about the start of the reporting by writing "cmd
id" into an outbuf, which is added to the free page vq.
 - guest tells the host about the end of the reporting by writing "0"
into an outbuf, which is added to the free page vq. (we reserve "id=0" as
the stop sign)

-- ballooning:
 - host requests the guest to start ballooning by "cmd_reg | BALLOONING";
 - guest ACKs to the host about receiving the request by "cmd_reg |
BALLOONING", host will clear the flag once receiving the ACK.


Some more explanations:
-- Why not let the host request the guest to start the free page reporting
simply by writing a new cmd id to the cmd_id_reg?
The configuration interrupt is shared among all the features - ballooning,
free page reporting, and future feature extensions which need host-to-guest
requests. Some f

Re: [PATCH v17 6/6] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

2017-11-14 Thread Wei Wang

On 11/14/2017 01:32 AM, Michael S. Tsirkin wrote:

- guest2host_cmd: written by the guest to ACK to the host about the
commands that have been received. The host will clear the corresponding
bits on the host2guest_cmd register. The guest also uses this register
to send commands to the host (e.g. when finish free page reporting).

I am not sure what is the role of guest2host_cmd. Reporting of
the correct cmd id seems sufficient indication that guest
received the start command. Not getting any more seems sufficient
to detect stop.



I think the issue is when the host is waiting for the guest to report 
pages, it does not know whether the guest is going to report more or the 
report is done already. That's why we need a way to let the guest tell 
the host "the report is done, don't wait for more", then the host 
continues to the next step - sending the non-free pages to the 
destination. The following method is a conclusion of other comments, 
with some new thought. Please have a check if it is good.


Two new configuration registers in total:
- cmd_reg: the command register, combined from the previous host2guest 
and guest2host. I think we can use the same register for host requesting 
and guest ACKing, since the guest writing will trap to QEMU, that is, 
all the writes to the register are performed in QEMU, and we can keep
things working correctly there.

- cmd_id_reg: the sequence id of the free page report command.

-- free page report:
- host requests the guest to start reporting by "cmd_reg | 
REPORT_START";
- guest ACKs to the host about receiving the start reporting 
request by "cmd_reg | REPORT_START", host will clear the flag bit once 
receiving the ACK.

- host requests the guest to stop reporting by "cmd_reg | REPORT_STOP";
- guest ACKs to the host about receiving the stop reporting request 
by "cmd_reg | REPORT_STOP", host will clear the flag once receiving the ACK.
- guest tells the host about the start of the reporting by writing 
"cmd id" into an outbuf, which is added to the free page vq.
- guest tells the host about the end of the reporting by writing 
"0" into an outbuf, which is added to the free page vq. (we reserve 
"id=0" as the stop sign)


-- ballooning:
- host requests the guest to start ballooning by "cmd_reg | 
BALLOONING";
- guest ACKs to the host about receiving the request by "cmd_reg | 
BALLOONING", host will clear the flag once receiving the ACK.
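
As a rough illustration of the handshake above (the bit values, the stop id
and the cmd_id field are assumptions for this sketch, not the final ABI), the
guest could mark the report boundaries on the free page vq like this:

#define VIRTIO_BALLOON_CMD_BALLOONING		(1 << 0)
#define VIRTIO_BALLOON_CMD_REPORT_START		(1 << 1)
#define VIRTIO_BALLOON_CMD_REPORT_STOP		(1 << 2)
#define VIRTIO_BALLOON_REPORT_STOP_ID		0	/* id 0 reserved as the stop sign */

/* Add the command id (or the stop sign) as an outbuf on the free page vq. */
static void send_report_boundary(struct virtio_balloon *vb, u32 id)
{
	struct scatterlist sg;

	vb->cmd_id = id;				/* assumed driver field */
	sg_init_one(&sg, &vb->cmd_id, sizeof(vb->cmd_id));
	if (!virtqueue_add_outbuf(vb->free_page_vq, &sg, 1, vb, GFP_KERNEL))
		virtqueue_kick(vb->free_page_vq);
}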



Some more explanations:
-- Why not let the host request the guest to start the free page 
reporting simply by writing a new cmd id to the cmd_id_reg?
The configuration interrupt is shared among all the features - 
ballooning, free page reporting, and future feature extensions which 
need host-to-guest requests. Some features may need to add other
feature-specific configuration registers; for example, free page reporting needs
the cmd_id_reg, which is not used by ballooning. The rule here is that the
feature-specific registers are read only when that feature is requested
via the cmd_reg. For example, the cmd_id_reg is read only when "cmd_reg
| REPORT_START" is true. Otherwise, when the driver receives a
configuration interrupt, it has to read both the cmd_reg and cmd_id
registers to know what is requested by the host - consider the case
where ballooning requests are sent frequently while free page reporting
isn't requested: the guest would have to read the cmd_id register every time a
ballooning request is sent by the host, which is not necessary. If
future new features follow this style, there will be more unnecessary 
VMexits to read the unused feature specific registers.
So I think it is good to have a central control of the feature request 
via only one cmd register - reading that one is enough to know what is 
requested by the host.



Best,
Wei


Re: [PATCH v17 6/6] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

2017-11-13 Thread Wei Wang

Ping for comments, thanks.

On 11/03/2017 04:13 PM, Wei Wang wrote:

Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
support of reporting hints of guest free pages to the host via
virtio-balloon. The host requests the guest to report the free pages by
sending commands via the virtio-balloon configuration registers.

When the guest starts to report, the first element added to the free page
vq is a sequence id of the start reporting command. The id is given by
the host, and it indicates whether the following free pages correspond
to the command. For example, the host may stop the report and start again
with a new command id. The obsolete pages for the previous start command
can be detected by the id mismatch on the host. The id is added to the
vq using an output buffer, and the free pages are added to the vq using
an input buffer.

Here are some explanations about the added configuration registers:
- host2guest_cmd: a register used by the host to send commands to the
guest.
- guest2host_cmd: written by the guest to ACK to the host about the
commands that have been received. The host will clear the corresponding
bits on the host2guest_cmd register. The guest also uses this register
to send commands to the host (e.g. when finishing free page reporting).
- free_page_cmd_id: the sequence id of the free page report command
given by the host.

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Michal Hocko <mho...@kernel.org>
---
  drivers/virtio/virtio_balloon.c | 234 
  include/uapi/linux/virtio_balloon.h |  11 ++
  2 files changed, 223 insertions(+), 22 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index b31fc25..4087f04 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -55,7 +55,12 @@ static struct vfsmount *balloon_mnt;
  
  struct virtio_balloon {

struct virtio_device *vdev;
-   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
+
+   /* Balloon's own wq for cpu-intensive work items */
+   struct workqueue_struct *balloon_wq;
+   /* The free page reporting work item submitted to the balloon wq */
+   struct work_struct report_free_page_work;
  
  	/* The balloon servicing is delegated to a freezable workqueue. */

struct work_struct update_balloon_stats_work;
@@ -65,6 +70,10 @@ struct virtio_balloon {
spinlock_t stop_update_lock;
bool stop_update;
  
+	/* Stop reporting free pages */

+   bool report_free_page_stop;
+   uint32_t free_page_cmd_id;
+
/* Waiting for host to ack the pages we released. */
wait_queue_head_t acked;
  
@@ -191,6 +200,30 @@ static void send_balloon_page_sg(struct virtio_balloon *vb,

kick_and_wait(vq, vb->acked);
  }
  
+static void send_free_page_sg(struct virtqueue *vq, void *addr, uint32_t size)

+{
+   int err = 0;
+   unsigned int len;
+
+   /* Detach all the used buffers from the vq */
+   while (virtqueue_get_buf(vq, &len))
+   ;
+
+   /*
+* Since this is an optimization feature, losing a couple of free
+* pages to report isn't important. We simply return without adding
+* the page if the vq is full.
+*/
+   if (vq->num_free) {
+   err = add_one_sg(vq, addr, size);
+   BUG_ON(err);
+   }
+
+   /* Batch till the vq is full */
+   if (!vq->num_free)
+   virtqueue_kick(vq);
+}
+
  /*
   * Send balloon pages in sgs to host. The balloon pages are recorded in the
   * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
@@ -495,9 +528,8 @@ static void stats_handle_request(struct virtio_balloon *vb)
virtqueue_kick(vq);
  }
  
-static void virtballoon_changed(struct virtio_device *vdev)

+static void virtballoon_cmd_balloon_memory(struct virtio_balloon *vb)
  {
-   struct virtio_balloon *vb = vdev->priv;
unsigned long flags;
  
spin_lock_irqsave(&vb->stop_update_lock, flags);

@@ -506,6 +538,50 @@ static void virtballoon_changed(struct virtio_device *vdev)
spin_unlock_irqrestore(&vb->stop_update_lock, flags);
  }
  
+static void virtballoon_cmd_report_free_page_start(struct virtio_balloon *vb)

+{
+   unsigned long flags;
+
+   vb->report_free_page_stop = false;
+   spin_lock_irqsave(&vb->stop_update_lock, flags);
+   if (!vb->stop_update)
+   queue_work(vb->balloon_wq, &vb->report_free_page_work);
+   spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+}
+
+static void virtballoon_changed(struct virtio_device *vdev)
+{
+   struct virtio_balloon *vb = vdev->priv;
+   u32 host2guest_cmd, guest2host_cmd = 0;
+
+   

Re: [PULL] virtio: last minute bugfix

2017-11-07 Thread Wei Wang

On 11/08/2017 03:23 AM, Michael S. Tsirkin wrote:

On Tue, Nov 07, 2017 at 08:13:10PM +0200, Michael S. Tsirkin wrote:

On Tue, Nov 07, 2017 at 09:29:59AM -0800, Linus Torvalds wrote:

On Tue, Nov 7, 2017 at 9:23 AM, Linus Torvalds
 wrote:

I guess I'll take it, but please don't do things like this to me.

Oh no I wont.

The garbage you sent me doesn't even compile cleanly, and is utter shite.

Not acceptable for last-minute bugfixes, and you're now on my shit-list.

 Linus

Sorry about that.

I'll investigate what went wrong.

Will be more careful not to cut corners next time around, just follow
the standard procedure.

All right, my local tests didn't fail on new warnings, and I didn't give
the zero day infrastructure enough time to do its job.

Lesson hopefully learned - don't rush it, give tools the time to do
their job.

Wei, you'll want to respin your 4.15 patchset on top of my fixed tree.
At this point the fix will only land in 4.15, sorry about that.

Thanks everyone.



OK, I'll use the fixed tree. Thanks, Michael.


Best,
Wei


Re: [PATCH v17 4/6] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-11-06 Thread Wei Wang

On 11/04/2017 07:28 PM, Tetsuo Handa wrote:

Wei Wang wrote:

On 11/03/2017 07:25 PM, Tetsuo Handa wrote:

If this is inside vb->balloon_lock mutex (isn't this?), xb_set_page() must not
use __GFP_DIRECT_RECLAIM allocation, for leak_balloon_sg_oom() will be blocked
on vb->balloon_lock mutex.
OK. Since the preload() doesn't need too much memory (< 4K in total),
how about GFP_NOWAIT here?

Maybe GFP_NOWAIT | __GFP_NOWARN ?


Sounds good to me. I also plan to move "xb_set_page()" under mutex_lock, 
that is,


fill_balloon()
{
...
mutex_lock(&vb->balloon_lock);

vb->num_pfns = 0;
while ((page = balloon_page_pop())) {
==>xb_set_page(..,page,..);
balloon_page_enqueue(&vb->vb_dev_info, page);
...
}

As explained in the xbitmap patch, we need the lock to avoid concurrent 
access to the bitmap.
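
For reference, with that flag choice xb_set_page() could look roughly like
this (field and helper signatures are assumptions based on the discussion, not
the posted code):

static int xb_set_page(struct virtio_balloon *vb, struct page *page,
		       unsigned long *pfn_min, unsigned long *pfn_max)
{
	unsigned long pfn = page_to_pfn(page);

	*pfn_min = min(pfn, *pfn_min);
	*pfn_max = max(pfn, *pfn_max);

	/*
	 * Called with balloon_lock held, so no direct reclaim; if the
	 * preload fails we simply don't track this page in the bitmap.
	 */
	return xb_preload_and_set_bit(&vb->page_xb, pfn,
				      GFP_NOWAIT | __GFP_NOWARN);
}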


Best,
Wei



Re: [PATCH v17 1/6] lib/xbitmap: Introduce xbitmap

2017-11-06 Thread Wei Wang

On 11/03/2017 06:55 PM, Tetsuo Handa wrote:

I'm commenting without understanding the logic.

Wei Wang wrote:

+
+bool xb_preload(gfp_t gfp);
+

Want __must_check annotation, for __radix_tree_preload() is marked
with __must_check annotation. By error failing to check result of
xb_preload() will lead to preemption kept disabled unexpectedly.



I don't disagree with this, but I find its wrappers, e.g. 
radix_tree_preload() and radix_tree_maybe_preload(), don't seem to have 
__must_check added.






+int xb_set_bit(struct xb *xb, unsigned long bit)
+{
+   int err;
+   unsigned long index = bit / IDA_BITMAP_BITS;
+   struct radix_tree_root *root = &xb->xbrt;
+   struct radix_tree_node *node;
+   void **slot;
+   struct ida_bitmap *bitmap;
+   unsigned long ebit;
+
+   bit %= IDA_BITMAP_BITS;
+   ebit = bit + 2;
+
+   err = __radix_tree_create(root, index, 0, &node, &slot);
+   if (err)
+   return err;
+   bitmap = rcu_dereference_raw(*slot);
+   if (radix_tree_exception(bitmap)) {
+   unsigned long tmp = (unsigned long)bitmap;
+
+   if (ebit < BITS_PER_LONG) {
+   tmp |= 1UL << ebit;
+   rcu_assign_pointer(*slot, (void *)tmp);
+   return 0;
+   }
+   bitmap = this_cpu_xchg(ida_bitmap, NULL);
+   if (!bitmap)

Please write locking rules, in order to explain how memory
allocated by __radix_tree_create() will not leak.



For the memory allocated by __radix_tree_create(), I think we could add:

if (!bitmap) {
__radix_tree_delete(root, node, slot);
break;
}


For the locking rules, how about adding the following "Developer notes:" 
at the top of the file:


"
Locks are required to ensure that concurrent calls to xb_set_bit, 
xb_preload_and_set_bit, xb_test_bit, xb_clear_bit, xb_clear_bit_range, 
xb_find_next_set_bit and xb_find_next_zero_bit, for the same ida bitmap 
will not happen.

"


+bool xb_test_bit(struct xb *xb, unsigned long bit)
+{
+   unsigned long index = bit / IDA_BITMAP_BITS;
+   const struct radix_tree_root *root = &xb->xbrt;
+   struct ida_bitmap *bitmap = radix_tree_lookup(root, index);
+
+   bit %= IDA_BITMAP_BITS;
+
+   if (!bitmap)
+   return false;
+   if (radix_tree_exception(bitmap)) {
+   bit += RADIX_TREE_EXCEPTIONAL_SHIFT;
+   if (bit > BITS_PER_LONG)

Why not bit >= BITS_PER_LONG here?


Yes, I think it should be ">=" here. Thanks.

Best,
Wei



Re: [PATCH v17 4/6] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-11-04 Thread Wei Wang

On 11/03/2017 07:25 PM, Tetsuo Handa wrote:

Wei Wang wrote:

@@ -164,6 +284,8 @@ static unsigned fill_balloon(struct virtio_balloon *vb, 
size_t num)
break;
}
  
+		if (use_sg && xb_set_page(vb, page, &pfn_min, &pfn_max) < 0)

Isn't this leaking "page" ?



Right, thanks, will add __free_page(page) here.


@@ -184,8 +307,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, 
size_t num)
  
  	num_allocated_pages = vb->num_pfns;

/* Did we get any? */
-   if (vb->num_pfns != 0)
-   tell_host(vb, vb->inflate_vq);
+   if (vb->num_pfns) {
+   if (use_sg)
+   tell_host_sgs(vb, vb->inflate_vq, pfn_min, pfn_max);

Please describe why tell_host_sgs() can work without __GFP_DIRECT_RECLAIM 
allocation,
for tell_host_sgs() is called with vb->balloon_lock mutex held.


Essentially, 
tell_host_sgs() --> send_balloon_page_sg() --> add_one_sg() -->
virtqueue_add_inbuf(..., num=1, ..., GFP_KERNEL)
won't need any memory allocation, because we always add one sg (i.e. 
num=1) each time. That memory
allocation option is only used when multiple sgs are added (i.e. num > 
1) and the implementation inside virtqueue_add_inbuf

need allocation of indirect descriptor table.

We could also add some comments above the function to explain a little 
about this if necessary.
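
For reference, this is roughly what the single-sg call amounts to (a sketch of
add_one_sg(); the token argument and exact body are assumptions, not the
posted code):

static int add_one_sg(struct virtqueue *vq, void *addr, uint32_t size)
{
	struct scatterlist sg;

	sg_init_one(&sg, addr, size);
	/*
	 * num == 1, so virtqueue_add_inbuf() uses a single direct descriptor
	 * and never allocates an indirect table; the GFP flag is effectively
	 * unused on this path.
	 */
	return virtqueue_add_inbuf(vq, &sg, 1, vq, GFP_KERNEL);
}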






@@ -223,7 +353,13 @@ static unsigned leak_balloon(struct virtio_balloon *vb, 
size_t num)
page = balloon_page_dequeue(vb_dev_info);
if (!page)
break;
-   set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
+   if (use_sg) {
+   if (xb_set_page(vb, page, &pfn_min, &pfn_max) < 0)

Isn't this leaking "page" ?


Yes, will make it:

if (xb_set_page(vb, page, &pfn_min, &pfn_max) < 0) {
balloon_page_enqueue(..., page);
break;
}



If this is inside vb->balloon_lock mutex (isn't this?), xb_set_page() must not
use __GFP_DIRECT_RECLAIM allocation, for leak_balloon_sg_oom() will be blocked
on vb->balloon_lock mutex.


OK. Since the preload() doesn't need too much memory (< 4K in total), 
how about GFP_NOWAIT here?



Best,
Wei



Re: [PATCH v17 4/6] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-11-04 Thread Wei Wang

On 11/03/2017 07:25 PM, Tetsuo Handa wrote:

Wei Wang wrote:

@@ -164,6 +284,8 @@ static unsigned fill_balloon(struct virtio_balloon *vb, 
size_t num)
break;
}
  
+		if (use_sg && xb_set_page(vb, page, _min, _max) < 0)

Isn't this leaking "page" ?



Right, thanks, will add __free_page(page) here.


@@ -184,8 +307,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, 
size_t num)
  
  	num_allocated_pages = vb->num_pfns;

/* Did we get any? */
-   if (vb->num_pfns != 0)
-   tell_host(vb, vb->inflate_vq);
+   if (vb->num_pfns) {
+   if (use_sg)
+   tell_host_sgs(vb, vb->inflate_vq, pfn_min, pfn_max);

Please describe why tell_host_sgs() can work without __GFP_DIRECT_RECLAIM 
allocation,
for tell_host_sgs() is called with vb->balloon_lock mutex held.


Essentially, 
tell_host_sgs()-->send_balloon_page_sg()-->add_one_sg()-->virtqueue_add_inbuf( 
, , num=1 ,,GFP_KERNEL)
won't need any memory allocation, because we always add one sg (i.e. 
num=1) each time. That memory
allocation option is only used when multiple sgs are added (i.e. num > 
1) and the implementation inside virtqueue_add_inbuf

need allocation of indirect descriptor table.

We could also add some comments above the function to explain a little 
about this if necessary.






@@ -223,7 +353,13 @@ static unsigned leak_balloon(struct virtio_balloon *vb, 
size_t num)
page = balloon_page_dequeue(vb_dev_info);
if (!page)
break;
-   set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
+   if (use_sg) {
+   if (xb_set_page(vb, page, _min, _max) < 0)

Isn't this leaking "page" ?


Yes, will make it:

if (xb_set_page(vb, page, _min, _max) < 0) {
balloon_page_enqueue(..., page);
break;
}



If this is inside vb->balloon_lock mutex (isn't this?), xb_set_page() must not
use __GFP_DIRECT_RECLAIM allocation, for leak_balloon_sg_oom() will be blocked
on vb->balloon_lock mutex.


OK. Since the preload() doesn't need too much memory (< 4K in total), 
how about GFP_NOWAIT here?



Best,
Wei



Re: [PATCH v1 0/3] Virtio-balloon Improvement

2017-11-03 Thread Wei Wang

On 10/22/2017 11:19 AM, Michael S. Tsirkin wrote:

On Fri, Oct 20, 2017 at 07:54:23PM +0800, Wei Wang wrote:

This patch series intends to summarize the recent contributions made by
Michael S. Tsirkin, Tetsuo Handa, Michal Hocko etc. via reporting and
discussing the related deadlock issues on the mailinglist. Please check
each patch for details.

From a high-level point of view, this patch series achieves:
1) eliminate the deadlock issue fundamentally caused by the inability
to run leak_balloon and fill_balloon concurrently;

We need to think about this carefully. Is it an issue that
leak can now bypass fill? It seems that we can now
try to leak a page before fill was seen by host,
but I did not look into it deeply.

I really like my patch for this better at least for
current kernel. I agree we need to work more on 2+3.



Since we have many customers interested in the "Virtio-balloon 
Enhancement" series,
please review the v17 patches first (it has a dependency on your patch 
for that deadlock fix,
so I included it there too), and we can get back to 2+3 here after that 
series is done. Thanks.


Best,
Wei


[PATCH v17 2/6] radix tree test suite: add tests for xbitmap

2017-11-03 Thread Wei Wang
From: Matthew Wilcox <mawil...@microsoft.com>

Add the following tests for xbitmap:
1) single bit test: single bit set/clear/find;
2) bit range test: set/clear a range of bits and find a 0 or 1 bit in
the range.

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Cc: Matthew Wilcox <mawil...@microsoft.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Michael S. Tsirkin <m...@redhat.com>
---
 tools/include/linux/bitmap.h|  34 
 tools/include/linux/kernel.h|   2 +
 tools/testing/radix-tree/Makefile   |   7 +-
 tools/testing/radix-tree/linux/kernel.h |   2 -
 tools/testing/radix-tree/main.c |   5 +
 tools/testing/radix-tree/test.h |   1 +
 tools/testing/radix-tree/xbitmap.c  | 278 
 7 files changed, 326 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/radix-tree/xbitmap.c

diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index e8b9f51..890dab2 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -36,6 +36,40 @@ static inline void bitmap_zero(unsigned long *dst, int nbits)
}
 }
 
+static inline void __bitmap_clear(unsigned long *map, unsigned int start,
+ int len)
+{
+   unsigned long *p = map + BIT_WORD(start);
+   const unsigned int size = start + len;
+   int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+   unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+   while (len - bits_to_clear >= 0) {
+   *p &= ~mask_to_clear;
+   len -= bits_to_clear;
+   bits_to_clear = BITS_PER_LONG;
+   mask_to_clear = ~0UL;
+   p++;
+   }
+   if (len) {
+   mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+   *p &= ~mask_to_clear;
+   }
+}
+
+static inline __always_inline void bitmap_clear(unsigned long *map,
+   unsigned int start,
+   unsigned int nbits)
+{
+   if (__builtin_constant_p(nbits) && nbits == 1)
+   __clear_bit(start, map);
+   else if (__builtin_constant_p(start & 7) && IS_ALIGNED(start, 8) &&
+__builtin_constant_p(nbits & 7) && IS_ALIGNED(nbits, 8))
+   memset((char *)map + start / 8, 0, nbits / 8);
+   else
+   __bitmap_clear(map, start, nbits);
+}
+
 static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
 {
unsigned int nlongs = BITS_TO_LONGS(nbits);
diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h
index 77d2e94..21e90ee 100644
--- a/tools/include/linux/kernel.h
+++ b/tools/include/linux/kernel.h
@@ -12,6 +12,8 @@
 #define UINT_MAX   (~0U)
 #endif
 
+#define IS_ALIGNED(x, a)   (((x) & ((typeof(x))(a) - 1)) == 0)
+
 #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
 
 #define PERF_ALIGN(x, a)   __PERF_ALIGN_MASK(x, (typeof(x))(a)-1)
diff --git a/tools/testing/radix-tree/Makefile 
b/tools/testing/radix-tree/Makefile
index 6a9480c..fc7cb422 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -5,7 +5,8 @@ LDLIBS+= -lpthread -lurcu
 TARGETS = main idr-test multiorder
 CORE_OFILES := radix-tree.o idr.o linux.o test.o find_bit.o
 OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \
-tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o
+tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o \
+xbitmap.o
 
 ifndef SHIFT
SHIFT=3
@@ -24,6 +25,9 @@ idr-test: idr-test.o $(CORE_OFILES)
 
 multiorder: multiorder.o $(CORE_OFILES)
 
+xbitmap: xbitmap.o $(CORE_OFILES)
+   $(CC) $(CFLAGS) $(LDFLAGS) $^ -o xbitmap
+
 clean:
$(RM) $(TARGETS) *.o radix-tree.c idr.c generated/map-shift.h
 
@@ -33,6 +37,7 @@ $(OFILES): Makefile *.h */*.h generated/map-shift.h \
../../include/linux/*.h \
../../include/asm/*.h \
../../../include/linux/radix-tree.h \
+   ../../../include/linux/xbitmap.h \
../../../include/linux/idr.h
 
 radix-tree.c: ../../../lib/radix-tree.c
diff --git a/tools/testing/radix-tree/linux/kernel.h 
b/tools/testing/radix-tree/linux/kernel.h
index b21a77f..c1e6088 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -16,6 +16,4 @@
 #define pr_debug printk
 #define pr_cont printk
 
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-
 #endif /* _KERNEL_H */
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c
index bc9a784..6f4774e 100644
--- a/tools/testing/radix-tree/main.c
+++ b/tools/testing/radix-tree/main.c
@@ -337,6 +337,11 @@ static void single_thread_tests(bool long_run)
rcu_barrier();
printv(2, "after copy_tag_check: %d allocated, preempt %d\n",
nr_allocated, preempt_count);
+
+   xbitmap_checks();
+   rcu_barrier()

[PATCH v17 3/6] mm/balloon_compaction.c: split balloon page allocation and enqueue

2017-11-03 Thread Wei Wang
From: "Michael S. Tsirkin" 

fill_balloon doing memory allocations under balloon_lock
can cause a deadlock when leak_balloon is called from
virtballoon_oom_notify and tries to take same lock.

To fix, split page allocation and enqueue and do allocations outside
the lock.

Here's a detailed analysis of the deadlock by Tetsuo Handa:

In leak_balloon(), mutex_lock(>balloon_lock) is called in order to
serialize against fill_balloon(). But in fill_balloon(),
alloc_page(GFP_HIGHUSER[_MOVABLE] | __GFP_NOMEMALLOC | __GFP_NORETRY) is
called with vb->balloon_lock mutex held. Since GFP_HIGHUSER[_MOVABLE]
implies __GFP_DIRECT_RECLAIM | __GFP_IO | __GFP_FS, despite __GFP_NORETRY
is specified, this allocation attempt might indirectly depend on somebody
else's __GFP_DIRECT_RECLAIM memory allocation. And such indirect
__GFP_DIRECT_RECLAIM memory allocation might call leak_balloon() via
virtballoon_oom_notify() via blocking_notifier_call_chain() callback via
out_of_memory() when it reached __alloc_pages_may_oom() and held oom_lock
mutex. Since vb->balloon_lock mutex is already held by fill_balloon(), it
will cause OOM lockup. Thus, do not wait for vb->balloon_lock mutex if
leak_balloon() is called from out_of_memory().

Thread1                                    Thread2
fill_balloon()
  takes a balloon_lock
  balloon_page_enqueue()
    alloc_page(GFP_HIGHUSER_MOVABLE)
      direct reclaim (__GFP_FS context)    takes a fs lock
        waits for that fs lock             alloc_page(GFP_NOFS)
                                             __alloc_pages_may_oom()
                                               takes the oom_lock
                                                 out_of_memory()
                                                   blocking_notifier_call_chain()
                                                     leak_balloon()
                                                       tries to take that
                                                       balloon_lock and deadlocks

Reported-by: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
Signed-off-by: Michael S. Tsirkin <m...@redhat.com>
Cc: Michal Hocko <mho...@suse.com>
Cc: Wei Wang <wei.w.w...@intel.com>
Reviewed-by: Wei Wang <wei.w.w...@intel.com>

---
 drivers/virtio/virtio_balloon.c| 23 ++-
 include/linux/balloon_compaction.h | 34 +-
 mm/balloon_compaction.c| 28 +---
 3 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index f0b3a0b..45fe6a8 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -143,16 +143,17 @@ static void set_page_pfns(struct virtio_balloon *vb,
 
 static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 {
-   struct balloon_dev_info *vb_dev_info = >vb_dev_info;
unsigned num_allocated_pages;
+   unsigned int num_pfns;
+   struct page *page;
+   LIST_HEAD(pages);
 
/* We can only do one array worth at a time. */
num = min(num, ARRAY_SIZE(vb->pfns));
 
-   mutex_lock(>balloon_lock);
-   for (vb->num_pfns = 0; vb->num_pfns < num;
-vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-   struct page *page = balloon_page_enqueue(vb_dev_info);
+   for (num_pfns = 0; num_pfns < num;
+num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
+   struct page *page = balloon_page_alloc();
 
if (!page) {
dev_info_ratelimited(>vdev->dev,
@@ -162,6 +163,18 @@ static unsigned fill_balloon(struct virtio_balloon *vb, 
size_t num)
msleep(200);
break;
}
+
+   balloon_page_push(, page);
+   }
+
+   mutex_lock(>balloon_lock);
+
+   vb->num_pfns = 0;
+   while ((page = balloon_page_pop())) {
+   balloon_page_enqueue(>vb_dev_info, page);
+
+   vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
+
set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
if (!virtio_has_feature(vb->vdev,
diff --git a/include/linux/balloon_compaction.h 
b/include/linux/balloon_compaction.h
index 79542b2..bdc055a 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -49,6 +49,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Balloon device information descriptor.
@@ -66,7 +67,9 @@ struct balloon_dev_info {
struct inode *inode;
 };
 
-extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info);
+extern struct page *balloon_page_alloc(void);
+extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
+struct page *page);
 extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
 
 static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
@@ -86,6 +89,35 @@ extern void balloon_page_putback(struct

[PATCH v17 5/6] mm: support reporting free page blocks

2017-11-03 Thread Wei Wang
This patch adds support to walk through the free page blocks in the
system and report them via a callback function. Some page blocks may
leave the free list after zone->lock is released, so it is the caller's
responsibility to either detect or prevent the use of such pages.

One example use of this patch is to accelerate live migration by skipping
the transfer of free pages reported from the guest. A popular method used
by the hypervisor to track which part of memory is written during live
migration is to write-protect all the guest memory. So, those pages that
are reported as free pages but are written after the report function
returns will be captured by the hypervisor, and they will be added to the
next round of memory transfer.
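
To make the intended usage concrete, here is a hedged sketch of a caller (not
part of this patch; the names below are illustrative) that only counts the
reported free pages, respecting the constraint that the callback must not
sleep or allocate:

#include <linux/mm.h>

struct free_page_stats {
	unsigned long nr_free;	/* pages seen so far */
};

/* Callback: runs under zone->lock, so it must stay simple and non-blocking. */
static bool count_free_range(void *opaque, unsigned long pfn, unsigned long num)
{
	struct free_page_stats *stats = opaque;

	stats->nr_free += num;
	return true;		/* keep walking */
}

static unsigned long count_free_pages(int min_order)
{
	struct free_page_stats stats = { 0 };

	/* walk_free_mem_block() may sleep, so call from process context only. */
	walk_free_mem_block(&stats, min_order, count_free_range);
	return stats.nr_free;
}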

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Michal Hocko <mho...@kernel.org>
Cc: Michael S. Tsirkin <m...@redhat.com>
Acked-by: Michal Hocko <mho...@kernel.org>
---
 include/linux/mm.h |  6 
 mm/page_alloc.c| 91 ++
 2 files changed, 97 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 065d99d..fe5a90e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1877,6 +1877,12 @@ extern void free_area_init_node(int nid, unsigned long * 
zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
 extern void free_initmem(void);
 
+extern void walk_free_mem_block(void *opaque,
+   int min_order,
+   bool (*report_pfn_range)(void *opaque,
+unsigned long pfn,
+unsigned long num));
+
 /*
  * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
  * into the buddy system. The freed pages will be poisoned with pattern
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 77e4d3c..2283fcc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4867,6 +4867,97 @@ void show_free_areas(unsigned int filter, nodemask_t 
*nodemask)
show_swap_cache_info();
 }
 
+/*
+ * Walk through a free page list and report the found pfn range via the
+ * callback.
+ *
+ * Return false if the callback requests to stop reporting. Otherwise,
+ * return true.
+ */
+static bool walk_free_page_list(void *opaque,
+   struct zone *zone,
+   int order,
+   enum migratetype mt,
+   bool (*report_pfn_range)(void *,
+unsigned long,
+unsigned long))
+{
+   struct page *page;
+   struct list_head *list;
+   unsigned long pfn, flags;
+   bool ret;
+
+   spin_lock_irqsave(>lock, flags);
+   list = >free_area[order].free_list[mt];
+   list_for_each_entry(page, list, lru) {
+   pfn = page_to_pfn(page);
+   ret = report_pfn_range(opaque, pfn, 1 << order);
+   if (!ret)
+   break;
+   }
+   spin_unlock_irqrestore(>lock, flags);
+
+   return ret;
+}
+
+/**
+ * walk_free_mem_block - Walk through the free page blocks in the system
+ * @opaque: the context passed from the caller
+ * @min_order: the minimum order of free lists to check
+ * @report_pfn_range: the callback to report the pfn range of the free pages
+ *
+ * If the callback returns false, stop iterating the list of free page blocks.
+ * Otherwise, continue to report.
+ *
+ * Please note that there are no locking guarantees for the callback and
+ * that the reported pfn range might be freed or disappear after the
+ * callback returns so the caller has to be very careful how it is used.
+ *
+ * The callback itself must not sleep or perform any operations which would
+ * require any memory allocations directly (not even GFP_NOWAIT/GFP_ATOMIC)
+ * or via any lock dependency. It is generally advisable to implement
+ * the callback as simple as possible and defer any heavy lifting to a
+ * different context.
+ *
+ * There is no guarantee that each free range will be reported only once
+ * during one walk_free_mem_block invocation.
+ *
+ * pfn_to_page on the given range is strongly discouraged and if there is
+ * an absolute need for that make sure to contact MM people to discuss
+ * potential problems.
+ *
+ * The function itself might sleep so it cannot be called from atomic
+ * contexts.
+ *
+ * In general low orders tend to be very volatile and so it makes more
+ * sense to query larger ones first for various optimizations which like
+ * ballooning etc... This will reduce the overhead as well.
+ */
+void walk_free_mem_block(void *opaque,
+int min_order,
+bool (*report_pfn_range)(void *opaque,
+ unsigned long pfn,
+ unsigned long num))
+{
+  

[PATCH v17 4/6] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-11-03 Thread Wei Wang
Add a new feature, VIRTIO_BALLOON_F_SG, which enables the transfer
of balloon (i.e. inflated/deflated) pages using scatter-gather lists
to the host.

The implementation of the previous virtio-balloon is not very
efficient, because the balloon pages are transferred to the
host one by one. Here is the breakdown of the time in percentage
spent on each step of the balloon inflating process (inflating
7GB of an 8GB idle guest).

1) allocating pages (6.5%)
2) sending PFNs to host (68.3%)
3) address translation (6.1%)
4) madvise (19%)

It takes about 4126ms for the inflating process to complete.
The above profiling shows that the bottlenecks are stage 2)
and stage 4).

This patch optimizes step 2) by transferring pages to the host in
sgs. An sg describes a chunk of guest physically contiguous pages.
With this mechanism, step 4) can also be optimized by doing address
translation and madvise() in chunks rather than page by page.

With this new feature, the above ballooning process takes ~492ms
resulting in an improvement of ~88%.

TODO: optimize stage 1) by allocating/freeing a chunk of pages
instead of a single page each time.
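
As a rough sketch of the chunking described above (names here are
illustrative, not from the patch; the real logic is in tell_host_sgs() below,
which additionally caps each sg at a page-aligned UINT_MAX): each run of set
bits in the page xbitmap becomes one scatter-gather entry.

/* Sketch: turn each run of set bits into one (addr, len) sg. */
static void chunk_bitmap_into_sgs(struct xb *xb, unsigned long first,
				  unsigned long last,
				  void (*send_sg)(void *addr, uint32_t len))
{
	unsigned long start = first, end;

	while (start < last) {
		start = xb_find_next_set_bit(xb, start, last);
		if (start == last + 1)		/* no more set bits */
			break;
		end = xb_find_next_zero_bit(xb, start + 1, last);
		/* [start, end) is one run of contiguous pages. */
		send_sg((void *)pfn_to_kaddr(start),
			(end - start) << PAGE_SHIFT);
		start = end;
	}
}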

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
Suggested-by: Michael S. Tsirkin <m...@redhat.com>
Cc: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
---
 drivers/virtio/virtio_balloon.c | 232 +---
 include/uapi/linux/virtio_balloon.h |   1 +
 2 files changed, 215 insertions(+), 18 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 45fe6a8..b31fc25 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -32,6 +32,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 /*
  * Balloon device works in 4K page units.  So each page is pointed to by
@@ -79,6 +81,9 @@ struct virtio_balloon {
/* Synchronize access/update to this struct virtio_balloon elements */
struct mutex balloon_lock;
 
+   /* The xbitmap used to record balloon pages */
+   struct xb page_xb;
+
/* The array of pfns we tell the Host about. */
unsigned int num_pfns;
__virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
@@ -141,15 +146,130 @@ static void set_page_pfns(struct virtio_balloon *vb,
  page_to_balloon_pfn(page) + i);
 }
 
+
+static void kick_and_wait(struct virtqueue *vq, wait_queue_head_t wq_head)
+{
+   unsigned int len;
+
+   virtqueue_kick(vq);
+   wait_event(wq_head, virtqueue_get_buf(vq, ));
+}
+
+static int add_one_sg(struct virtqueue *vq, void *addr, uint32_t size)
+{
+   struct scatterlist sg;
+   unsigned int len;
+
+   sg_init_one(, addr, size);
+
+   /* Detach all the used buffers from the vq */
+   while (virtqueue_get_buf(vq, ))
+   ;
+
+   return virtqueue_add_inbuf(vq, , 1, vq, GFP_KERNEL);
+}
+
+static void send_balloon_page_sg(struct virtio_balloon *vb,
+struct virtqueue *vq,
+void *addr,
+uint32_t size,
+bool batch)
+{
+   int err;
+
+   err = add_one_sg(vq, addr, size);
+   /*
+* This is expected to never fail: there is always at least 1 entry
+* available on the vq, because when the vq is full the worker thread
+* that adds the sg will be put into sleep until at least 1 entry is
+* available to use.
+*/
+   BUG_ON(err);
+
+   /* If batching is requested, we batch till the vq is full */
+   if (!batch || !vq->num_free)
+   kick_and_wait(vq, vb->acked);
+}
+
+/*
+ * Send balloon pages in sgs to host. The balloon pages are recorded in the
+ * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
+ * The page xbitmap is searched for continuous "1" bits, which correspond
+ * to continuous pages, to chunk into sgs.
+ *
+ * @page_xb_start and @page_xb_end form the range of bits in the xbitmap that
+ * need to be searched.
+ */
+static void tell_host_sgs(struct virtio_balloon *vb,
+ struct virtqueue *vq,
+ unsigned long page_xb_start,
+ unsigned long page_xb_end)
+{
+   unsigned long sg_pfn_start, sg_pfn_end;
+   void *sg_addr;
+   uint32_t sg_len, sg_max_len = round_down(UINT_MAX, PAGE_SIZE);
+
+   sg_pfn_start = page_xb_start;
+   while (sg_pfn_start < page_xb_end) {
+   sg_pfn_start = xb_find_next_set_bit(>page_xb, sg_pfn_start,
+   page_xb_end);
+   if (sg_pfn_start == page_xb_end + 1)
+   break;
+   sg_pfn_end = xb_find_next_zero_bit(>page_xb,
+  sg_pfn_start + 1,
+  page_xb_end);
+   sg_addr = (void *)pfn_to_kaddr(sg_pfn_start);
+   sg_len = (

[PATCH v17 6/6] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

2017-11-03 Thread Wei Wang
Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
support of reporting hints of guest free pages to the host via
virtio-balloon. The host requests the guest to report the free pages by
sending commands via the virtio-balloon configuration registers.

When the guest starts to report, the first element added to the free page
vq is a sequence id of the start reporting command. The id is given by
the host, and it indicates whether the following free pages correspond
to the command. For example, the host may stop the report and start again
with a new command id. The obsolete pages for the previous start command
can be detected by the id mismatch on the host. The id is added to the
vq using an output buffer, and the free pages are added to the vq using
input buffers.

Here are some explanations of the added configuration registers:
- host2guest_cmd: a register used by the host to send commands to the
guest.
- guest2host_cmd: written by the guest to ACK to the host about the
commands that have been received. The host will clear the corresponding
bits on the host2guest_cmd register. The guest also uses this register
to send commands to the host (e.g. when it finishes free page reporting).
- free_page_cmd_id: the sequence id of the free page report command
given by the host.
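
As a rough sketch of how the reporting side plugs into walk_free_mem_block()
(the function name below is illustrative; send_free_page_sg() and the
report_free_page_stop flag are from the patch that follows):

/* Sketch: per-range callback that feeds each free block into the free page vq. */
static bool report_free_range_sketch(void *opaque, unsigned long pfn,
				     unsigned long nr_pages)
{
	struct virtio_balloon *vb = opaque;

	if (vb->report_free_page_stop)
		return false;		/* host asked us to stop early */

	/* Free pages are added to the free page vq as input buffers. */
	send_free_page_sg(vb->free_page_vq, (void *)pfn_to_kaddr(pfn),
			  nr_pages << PAGE_SHIFT);
	return true;
}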

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Michal Hocko <mho...@kernel.org>
---
 drivers/virtio/virtio_balloon.c | 234 
 include/uapi/linux/virtio_balloon.h |  11 ++
 2 files changed, 223 insertions(+), 22 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index b31fc25..4087f04 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -55,7 +55,12 @@ static struct vfsmount *balloon_mnt;
 
 struct virtio_balloon {
struct virtio_device *vdev;
-   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
+
+   /* Balloon's own wq for cpu-intensive work items */
+   struct workqueue_struct *balloon_wq;
+   /* The free page reporting work item submitted to the balloon wq */
+   struct work_struct report_free_page_work;
 
/* The balloon servicing is delegated to a freezable workqueue. */
struct work_struct update_balloon_stats_work;
@@ -65,6 +70,10 @@ struct virtio_balloon {
spinlock_t stop_update_lock;
bool stop_update;
 
+   /* Stop reporting free pages */
+   bool report_free_page_stop;
+   uint32_t free_page_cmd_id;
+
/* Waiting for host to ack the pages we released. */
wait_queue_head_t acked;
 
@@ -191,6 +200,30 @@ static void send_balloon_page_sg(struct virtio_balloon *vb,
kick_and_wait(vq, vb->acked);
 }
 
+static void send_free_page_sg(struct virtqueue *vq, void *addr, uint32_t size)
+{
+   int err = 0;
+   unsigned int len;
+
+   /* Detach all the used buffers from the vq */
+   while (virtqueue_get_buf(vq, ))
+   ;
+
+   /*
+* Since this is an optimization feature, losing a couple of free
+* pages to report isn't important. We simply return without adding
+* the page if the vq is full.
+*/
+   if (vq->num_free) {
+   err = add_one_sg(vq, addr, size);
+   BUG_ON(err);
+   }
+
+   /* Batch till the vq is full */
+   if (!vq->num_free)
+   virtqueue_kick(vq);
+}
+
 /*
  * Send balloon pages in sgs to host. The balloon pages are recorded in the
  * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
@@ -495,9 +528,8 @@ static void stats_handle_request(struct virtio_balloon *vb)
virtqueue_kick(vq);
 }
 
-static void virtballoon_changed(struct virtio_device *vdev)
+static void virtballoon_cmd_balloon_memory(struct virtio_balloon *vb)
 {
-   struct virtio_balloon *vb = vdev->priv;
unsigned long flags;
 
spin_lock_irqsave(>stop_update_lock, flags);
@@ -506,6 +538,50 @@ static void virtballoon_changed(struct virtio_device *vdev)
spin_unlock_irqrestore(>stop_update_lock, flags);
 }
 
+static void virtballoon_cmd_report_free_page_start(struct virtio_balloon *vb)
+{
+   unsigned long flags;
+
+   vb->report_free_page_stop = false;
+   spin_lock_irqsave(>stop_update_lock, flags);
+   if (!vb->stop_update)
+   queue_work(vb->balloon_wq, >report_free_page_work);
+   spin_unlock_irqrestore(>stop_update_lock, flags);
+}
+
+static void virtballoon_changed(struct virtio_device *vdev)
+{
+   struct virtio_balloon *vb = vdev->priv;
+   u32 host2guest_cmd, guest2host_cmd = 0;
+
+   if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
+   virtballoon_cmd_balloon_memory(vb);
+   return;
+   }
+
+   virtio_cread(vb->vdev, st

[PATCH v17 1/6] lib/xbitmap: Introduce xbitmap

2017-11-03 Thread Wei Wang
From: Matthew Wilcox <mawil...@microsoft.com>

The eXtensible Bitmap is a sparse bitmap representation which is
efficient for set bits which tend to cluster.  It supports up to
'unsigned long' worth of bits, and this commit adds the bare bones --
xb_set_bit(), xb_clear_bit(), xb_clear_bit_range(), xb_test_bit(),
xb_find_next_set_bit(), xb_find_next_zero_bit().

More possible optimizations to add in the future:
1) xb_set_bit_range: set a range of bits.
2) when searching a bit, if the bit is not found in the slot, move on to
the next slot directly.
3) add Tags to help searching.
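
For orientation, a small usage sketch of the API added here (illustrative
only; the GFP flag and the surrounding context are assumptions):

#include <linux/gfp.h>
#include <linux/xbitmap.h>

static void xb_usage_sketch(void)
{
	DEFINE_XB(xb);
	unsigned long bit;

	/* xb_preload_and_set_bit() wraps xb_preload() + xb_set_bit(). */
	if (xb_preload_and_set_bit(&xb, 100, GFP_KERNEL) ||
	    xb_preload_and_set_bit(&xb, 101, GFP_KERNEL))
		return;				/* allocation failed */

	WARN_ON(!xb_test_bit(&xb, 100));

	/* First set bit in [0, 1024]; a return of 1025 would mean "none". */
	bit = xb_find_next_set_bit(&xb, 0, 1024);
	WARN_ON(bit != 100);
	/* The following zero bit bounds the contiguous run of set bits. */
	bit = xb_find_next_zero_bit(&xb, bit + 1, 1024);
	WARN_ON(bit != 102);

	xb_clear_bit(&xb, 100);
	xb_clear_bit(&xb, 101);
	WARN_ON(!xb_is_empty(&xb));
}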

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Cc: Matthew Wilcox <mawil...@microsoft.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Michal Hocko <mho...@kernel.org>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>

v16->v17 ChangeLog:
1) xb_preload: allocate ida bitmap before __radix_tree_preload() to avoid
kmalloc with preemption disabled. Also change this function to return with
preemption not disabled on error.
2) xb_preload_and_set_bit: a wrapper of xb_preload and xb_set_bit, for
the convenience of usage.

v15->v16 ChangeLog:
1) coding style - separate small functions for bit set/clear/test;
2) Clear a range of bits in a more efficient way:
   A) clear a range of bits from the same ida bitmap directly rather than
  search the bitmap again for each bit;
   B) when the range of bits to clear covers the whole ida bitmap,
  directly free the bitmap - no need to zero the bitmap first.
3) more efficient bit searching, like 2.A.
---
 include/linux/radix-tree.h |   2 +
 include/linux/xbitmap.h|  67 +++
 lib/Makefile   |   2 +-
 lib/radix-tree.c   |  51 +++-
 lib/xbitmap.c  | 283 +
 5 files changed, 402 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/xbitmap.h
 create mode 100644 lib/xbitmap.c

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 567ebb5..1d6d6f6 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -309,6 +309,8 @@ void radix_tree_iter_replace(struct radix_tree_root *,
const struct radix_tree_iter *, void __rcu **slot, void *entry);
 void radix_tree_replace_slot(struct radix_tree_root *,
 void __rcu **slot, void *entry);
+bool __radix_tree_delete(struct radix_tree_root *root,
+struct radix_tree_node *node, void __rcu **slot);
 void __radix_tree_delete_node(struct radix_tree_root *,
  struct radix_tree_node *,
  radix_tree_update_node_t update_node,
diff --git a/include/linux/xbitmap.h b/include/linux/xbitmap.h
new file mode 100644
index 000..00b59c3
--- /dev/null
+++ b/include/linux/xbitmap.h
@@ -0,0 +1,67 @@
+/*
+ * eXtensible Bitmaps
+ * Copyright (c) 2017 Microsoft Corporation <mawil...@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * eXtensible Bitmaps provide an unlimited-size sparse bitmap facility.
+ * All bits are initially zero.
+ */
+
+#ifndef __XBITMAP_H__
+#define __XBITMAP_H__
+
+#include 
+
+struct xb {
+   struct radix_tree_root xbrt;
+};
+
+#define XB_INIT {  \
+   .xbrt = RADIX_TREE_INIT(IDR_RT_MARKER | GFP_NOWAIT),\
+}
+#define DEFINE_XB(name)struct xb name = XB_INIT
+
+static inline void xb_init(struct xb *xb)
+{
+   INIT_RADIX_TREE(>xbrt, IDR_RT_MARKER | GFP_NOWAIT);
+}
+
+int xb_set_bit(struct xb *xb, unsigned long bit);
+int xb_preload_and_set_bit(struct xb *xb, unsigned long bit, gfp_t gfp);
+bool xb_test_bit(struct xb *xb, unsigned long bit);
+void xb_clear_bit(struct xb *xb, unsigned long bit);
+unsigned long xb_find_next_set_bit(struct xb *xb, unsigned long start,
+  unsigned long end);
+unsigned long xb_find_next_zero_bit(struct xb *xb, unsigned long start,
+   unsigned long end);
+void xb_clear_bit_range(struct xb *xb, unsigned long start, unsigned long end);
+
+/* Check if the xb tree is empty */
+static inline bool xb_is_empty(const struct xb *xb)
+{
+   return radix_tree_empty(>xbrt);
+}
+
+bool xb_preload(gfp_t gfp);
+
+/**
+ * xb_preload_end - end preload section started with xb_preload()
+ *
+ * Each xb_preload() should be matched with an invocation of this
+ * function. See xb_preload() for details.
+ */
+static inline void xb_preload_end(void)
+{
+   preempt_enable();
+}
+
+#endif
diff --git a/lib/Makefile b/lib/Makefile
index dafa796..082361b 100644
--- a/lib/Makefile
+++ b/lib/Makef

[PATCH v17 0/6] Virtio-balloon Enhancement

2017-11-03 Thread Wei Wang
 Split the two new features, VIRTIO_BALLOON_F_BALLOON_CHUNKS and
VIRTIO_BALLOON_F_MISC_VQ, which were mixed together in the previous
implementation;
2) Simpler function to get the free page block.

v7->v8:
1) Use only one chunk format, instead of two.
2) re-write the virtio-balloon implementation patch.
3) commit changes
4) patch re-org

Matthew Wilcox (2):
  lib/xbitmap: Introduce xbitmap
  radix tree test suite: add tests for xbitmap

Michael S. Tsirkin (1):
  mm/balloon_compaction.c: split balloon page allocation and enqueue

Wei Wang (3):
  virtio-balloon: VIRTIO_BALLOON_F_SG
  mm: support reporting free page blocks
  virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

 drivers/virtio/virtio_balloon.c | 489 +---
 include/linux/balloon_compaction.h  |  34 ++-
 include/linux/mm.h  |   6 +
 include/linux/radix-tree.h  |   2 +
 include/linux/xbitmap.h |  67 +
 include/uapi/linux/virtio_balloon.h |  12 +
 lib/Makefile|   2 +-
 lib/radix-tree.c|  51 +++-
 lib/xbitmap.c   | 283 ++
 mm/balloon_compaction.c |  28 +-
 mm/page_alloc.c |  91 ++
 tools/include/linux/bitmap.h|  34 +++
 tools/include/linux/kernel.h|   2 +
 tools/testing/radix-tree/Makefile   |   7 +-
 tools/testing/radix-tree/linux/kernel.h |   2 -
 tools/testing/radix-tree/main.c |   5 +
 tools/testing/radix-tree/test.h |   1 +
 tools/testing/radix-tree/xbitmap.c  | 278 ++
 18 files changed, 1335 insertions(+), 59 deletions(-)
 create mode 100644 include/linux/xbitmap.h
 create mode 100644 lib/xbitmap.c
 create mode 100644 tools/testing/radix-tree/xbitmap.c

-- 
2.7.4



Re: [PATCH] virtio_balloon: fix deadlock on OOM

2017-10-30 Thread Wei Wang

On 10/13/2017 09:21 PM, Michael S. Tsirkin wrote:

fill_balloon doing memory allocations under balloon_lock
can cause a deadlock when leak_balloon is called from
virtballoon_oom_notify and tries to take same lock.

To fix, split page allocation and enqueue and do allocations outside the lock.

Here's a detailed analysis of the deadlock by Tetsuo Handa:

In leak_balloon(), mutex_lock(&vb->balloon_lock) is called in order to
serialize against fill_balloon(). But in fill_balloon(),
alloc_page(GFP_HIGHUSER[_MOVABLE] | __GFP_NOMEMALLOC | __GFP_NORETRY) is
called with vb->balloon_lock mutex held. Since GFP_HIGHUSER[_MOVABLE]
implies __GFP_DIRECT_RECLAIM | __GFP_IO | __GFP_FS, despite __GFP_NORETRY
is specified, this allocation attempt might indirectly depend on somebody
else's __GFP_DIRECT_RECLAIM memory allocation. And such indirect
__GFP_DIRECT_RECLAIM memory allocation might call leak_balloon() via
virtballoon_oom_notify() via blocking_notifier_call_chain() callback via
out_of_memory() when it reached __alloc_pages_may_oom() and held oom_lock
mutex. Since vb->balloon_lock mutex is already held by fill_balloon(), it
will cause OOM lockup. Thus, do not wait for vb->balloon_lock mutex if
leak_balloon() is called from out_of_memory().

   Thread1                                       Thread2
 fill_balloon()
   takes a balloon_lock
   balloon_page_enqueue()
     alloc_page(GFP_HIGHUSER_MOVABLE)
       direct reclaim (__GFP_FS context)        takes a fs lock
         waits for that fs lock                 alloc_page(GFP_NOFS)
                                                  __alloc_pages_may_oom()
                                                    takes the oom_lock
                                                    out_of_memory()
                                                      blocking_notifier_call_chain()
                                                        leak_balloon()
                                                          tries to take that
                                                          balloon_lock and deadlocks

Reported-by: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
Cc: Michal Hocko <mho...@suse.com>
Cc: Wei Wang <wei.w.w...@intel.com>
---


The "virtio-balloon enhancement" series has a dependency on this patch.
Could you send out a new version soon? Or I can include it in the series 
if you want.
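
For readers following the thread, here is a rough sketch of the "split
allocation and enqueue" idea described in the commit message above. The
helper names (e.g. balloon_page_alloc()) follow the allocation/enqueue
split proposed in this series; the structure below is only an
illustration under that assumption, not the actual patch:

/*
 * Sketch only: allocate pages with no locks held, so the OOM notifier
 * can still take balloon_lock and deflate; then take balloon_lock just
 * long enough to enqueue the pages and tell the host.
 */
static unsigned fill_balloon_sketch(struct virtio_balloon *vb, size_t num)
{
	unsigned int num_pfns;
	struct page *page;
	LIST_HEAD(pages);		/* pages allocated outside the lock */

	/* 1) Allocation phase: no balloon_lock held, may sleep/reclaim. */
	for (num_pfns = 0; num_pfns < num;
	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
		page = balloon_page_alloc();	/* assumed helper from this series */
		if (!page)
			break;
		list_add(&page->lru, &pages);
	}

	/* 2) Enqueue phase: short critical section under balloon_lock. */
	mutex_lock(&vb->balloon_lock);
	/* ... balloon_page_enqueue() each page, then tell_host() ... */
	mutex_unlock(&vb->balloon_lock);

	return num_pfns;
}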



Best,
Wei



Re: [PATCH v1 3/3] virtio-balloon: stop inflating when OOM occurs

2017-10-23 Thread Wei Wang

On 10/23/2017 01:13 AM, Michael S. Tsirkin wrote:

On Fri, Oct 20, 2017 at 07:54:26PM +0800, Wei Wang wrote:

This patch forces the cease of the inflating work when OOM occurs.
The fundamental idea of memory ballooning is to take out some guest
pages when the guest has low memory utilization, so it is sensible to
inflate nothing when the guest is already under memory pressure.

On the other hand, the policy is determined by the admin or the
orchestration layer from the host. That is, the host is expected to
re-start the memory inflating request at a proper time later when
the guest has enough memory to inflate, for example, by checking
the memory stats reported by the balloon.

Is there any other way to do it? And if so can't we just have guest do
it automatically? Maybe the issue is really that fill attempts to
allocate memory aggressively instead of checking availability.
Maybe with deflate on oom it should check availability?



I think it might not be easy to do in the guest in practice.
For example, the host asks for 4G from the guest, and the guest checks
that it has 4G that can be inflated at that point. While it is inflating
and 2G is done, a new task on the guest starts up and takes the remaining
2G for itself. Now the guest has nothing left to inflate.

This raises two questions:
1) What is the point of checking the availability?
Maybe we could just let the guest inflate as much as it can, that is, till
balloon_page_enqueue() returns NULL, then stop inflating.

2) How long would the host have to wait for this guest to give back the
remaining 2G?
If I understand "guest do it automatically" correctly: the guest is now
responsible for giving another 2G, which it owes to the host in this case,
rather than giving up inflating whenever there is some free memory. Maybe
in the next hour it wouldn't have any memory available to give to the
host. The time seems non-deterministic.

If we leave it to the host to define the policy, I think it would be easier.
Once the host finds that the guest can only offer 2G, it can just give up
asking for memory from this guest, and continue to check other guests to
see if it can get some memory there to satisfy its needs.


Best,
Wei


Re: [PATCH v1 1/3] virtio-balloon: replace the coarse-grained balloon_lock

2017-10-23 Thread Wei Wang

On 10/22/2017 07:50 PM, Tetsuo Handa wrote:

Wei Wang wrote:

@@ -162,20 +160,20 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
msleep(200);
break;
}
-   set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
-   vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
+   set_page_pfns(vb, pfns + num_pfns, page);
if (!virtio_has_feature(vb->vdev,
VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
adjust_managed_page_count(page, -1);
}
   
-	num_allocated_pages = vb->num_pfns;

+   mutex_lock(&vb->inflate_lock);
/* Did we get any? */
-   if (vb->num_pfns != 0)
-   tell_host(vb, vb->inflate_vq);
-   mutex_unlock(&vb->balloon_lock);
+   if (num_pfns != 0)
+   tell_host(vb, vb->inflate_vq, pfns, num_pfns);
+   mutex_unlock(&vb->inflate_lock);
+   atomic64_add(num_pfns, &vb->num_pages);

Isn't this addition too late? If leak_balloon() is called due to
out_of_memory(), it will fail to find up to dated vb->num_pages value.

Not really. I think the old way of implementation above:
"vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE"
isn't quite accurate, because "vb->num_pages" should reflect the number of
pages that have already been inflated, which means those pages have
already been given to the host via "tell_host()".

If we update "vb->num_pages" earlier, before tell_host(), then it will
include the pages that haven't been given to the host, which I think
shouldn't be counted as inflated pages.

On the other hand, OOM will use leak_balloon() to release the pages that
should
have already been inflated.

But leak_balloon() finds max inflated pages from vb->num_pages, doesn't it?

   
   	/* We can only do one array worth at a time. */

-   num = min(num, ARRAY_SIZE(vb->pfns));
+   num = min_t(size_t, num, VIRTIO_BALLOON_ARRAY_PFNS_MAX);
   
-	mutex_lock(&vb->balloon_lock);

/* We can't release more pages than taken */
-   num = min(num, (size_t)vb->num_pages);
-   for (vb->num_pfns = 0; vb->num_pfns < num;
-        vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
+   num = min_t(size_t, num, atomic64_read(&vb->num_pages));
+   for (num_pfns = 0; num_pfns < num;
+        num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
page = balloon_page_dequeue(vb_dev_info);

If balloon_page_dequeue() can be concurrently called by both host's request
and guest's OOM event, is (!dequeued_page) test in balloon_page_dequeue() safe?


I'm not sure about the question. The "dequeued_page" is a local variable
in the function, so why would it be unsafe for two invocations (the shared
b_dev_info->pages list is operated on under a lock)?

I'm not MM person nor virtio person. I'm commenting from point of view of
safe programming. My question is, isn't there possibility of hitting

if (unlikely(list_empty(&b_dev_info->pages) &&
 !b_dev_info->isolated_pages))
BUG();

when things run concurrently.


Thanks for the comments. I'm not 100% confident about all the possible
corner cases here at present (e.g. why the b_dev_info->page_lock is
released and re-taken in balloon_page_dequeue()), and Michael has expressed
a preference for his solution, so I plan not to stick with this one.


Best,
Wei




Re: [PATCH v1 2/3] virtio-balloon: deflate up to oom_pages on OOM

2017-10-22 Thread Wei Wang

On 10/22/2017 12:11 PM, Tetsuo Handa wrote:

Michael S. Tsirkin wrote:

-   num_freed_pages = leak_balloon(vb, oom_pages);
+
+   /* Don't deflate more than the number of inflated pages */
+   while (npages && atomic64_read(&vb->num_pages))
+   npages -= leak_balloon(vb, npages);

don't we need to abort if leak_balloon() returned 0 for some reason?


I don't think so. Returning 0 should be a normal case when the host tries
to give back some pages to the guest, but no pages have ever been
inflated. For example, right after booting the guest, the host sends a
deflating request to give the guest 1G of memory; leak_balloon should
return 0, and the guest wouldn't get 1 more G of memory.


Best,
Wei


Re: [PATCH v1 1/3] virtio-balloon: replace the coarse-grained balloon_lock

2017-10-22 Thread Wei Wang

On 10/22/2017 01:20 PM, Tetsuo Handa wrote:

Wei Wang wrote:

The balloon_lock was used to synchronize the access demand to elements
of struct virtio_balloon and its queue operations (please see commit
e22504296d). This prevents the concurrent run of the leak_balloon and
fill_balloon functions, thereby resulting in a deadlock issue on OOM:

fill_balloon: take balloon_lock and wait for OOM to get some memory;
oom_notify: release some inflated memory via leak_balloon();
leak_balloon: wait for balloon_lock to be released by fill_balloon.

This patch breaks the lock into two fine-grained inflate_lock and
deflate_lock, and eliminates the unnecessary use of the shared data
(i.e. vb->pnfs, vb->num_pfns). This enables leak_balloon and
fill_balloon to run concurrently and solves the deadlock issue.

@@ -162,20 +160,20 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
msleep(200);
break;
}
-   set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
-   vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
+   set_page_pfns(vb, pfns + num_pfns, page);
if (!virtio_has_feature(vb->vdev,
VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
adjust_managed_page_count(page, -1);
}
  
-	num_allocated_pages = vb->num_pfns;

+   mutex_lock(&vb->inflate_lock);
/* Did we get any? */
-   if (vb->num_pfns != 0)
-   tell_host(vb, vb->inflate_vq);
-   mutex_unlock(&vb->balloon_lock);
+   if (num_pfns != 0)
+   tell_host(vb, vb->inflate_vq, pfns, num_pfns);
+   mutex_unlock(&vb->inflate_lock);
+   atomic64_add(num_pfns, &vb->num_pages);

Isn't this addition too late? If leak_balloon() is called due to
out_of_memory(), it will fail to find up to dated vb->num_pages value.


Not really. I think the old way of implementation above:
"vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE"
isn't quite accurate, because "vb->num_pages" should reflect the number of
pages that have already been inflated, which means those pages have
already been given to the host via "tell_host()".

If we update "vb->num_pages" earlier, before tell_host(), then it will
include the pages that haven't been given to the host, which I think
shouldn't be counted as inflated pages.


On the other hand, OOM will use leak_balloon() to release the pages that
should have already been inflated.

In addition, I think we would also need to move balloon_page_insert(),
which puts the page onto the inflated page list, after tell_host().



  
-	return num_allocated_pages;

+   return num_pfns;
  }
  
  static void release_pages_balloon(struct virtio_balloon *vb,

@@ -194,38 +192,39 @@ static void release_pages_balloon(struct virtio_balloon *vb,
  
  static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)

  {
-   unsigned num_freed_pages;
struct page *page;
struct balloon_dev_info *vb_dev_info = >vb_dev_info;
LIST_HEAD(pages);
+   unsigned int num_pfns;
+   __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];

This array consumes 1024 bytes of kernel stack, doesn't it?
leak_balloon() might be called from out_of_memory() where kernel stack
is already largely consumed before entering __alloc_pages_nodemask().
For reducing possibility of stack overflow, since out_of_memory() is
serialized by oom_lock, I suggest using static (maybe kmalloc()ed as
vb->oom_pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX]) buffer when called from
out_of_memory().


In that case, we might as well use
vb->inflate_pfns = kmalloc(VIRTIO_BALLOON_ARRAY_PFNS_MAX..);
vb->deflate_pfns = kmalloc(VIRTIO_BALLOON_ARRAY_PFNS_MAX..);
which are allocated in probe(), along the lines of the sketch below.
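
A rough sketch of that probe-time allocation, purely for illustration
(the field names and error handling here are assumptions, not from an
actual patch):

static int virtballoon_alloc_pfn_bufs(struct virtio_balloon *vb)
{
	/* One array's worth of pfns for each direction, allocated once. */
	vb->inflate_pfns = kmalloc_array(VIRTIO_BALLOON_ARRAY_PFNS_MAX,
					 sizeof(__virtio32), GFP_KERNEL);
	vb->deflate_pfns = kmalloc_array(VIRTIO_BALLOON_ARRAY_PFNS_MAX,
					 sizeof(__virtio32), GFP_KERNEL);
	if (!vb->inflate_pfns || !vb->deflate_pfns) {
		kfree(vb->inflate_pfns);
		kfree(vb->deflate_pfns);
		return -ENOMEM;
	}
	return 0;
}

This keeps the OOM path free of both large stack buffers and memory
allocations, at the cost of two small long-lived buffers per device.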

  
  	/* We can only do one array worth at a time. */

-   num = min(num, ARRAY_SIZE(vb->pfns));
+   num = min_t(size_t, num, VIRTIO_BALLOON_ARRAY_PFNS_MAX);
  
-	mutex_lock(&vb->balloon_lock);

/* We can't release more pages than taken */
-   num = min(num, (size_t)vb->num_pages);
-   for (vb->num_pfns = 0; vb->num_pfns < num;
-        vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
+   num = min_t(size_t, num, atomic64_read(&vb->num_pages));
+   for (num_pfns = 0; num_pfns < num;
+        num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
page = balloon_page_dequeue(vb_dev_info);

If balloon_page_dequeue() can be concurrently called by both host's request
and guest's OOM event, is (!dequeued_page) test in balloon_page_dequeue() safe?



I'm not sure about the question. The "dequeued_page" is a local variable
in the function, so why would it be unsafe for two invocations (the shared
b_dev_info->pages list is operated on under a lock)?




Is such concurrency needed?


Thanks for this question, it triggers another optimization

Re: [PATCH v1 0/3] Virtio-balloon Improvement

2017-10-22 Thread Wei Wang

On 10/22/2017 11:19 AM, Michael S. Tsirkin wrote:

On Fri, Oct 20, 2017 at 07:54:23PM +0800, Wei Wang wrote:

This patch series intends to summarize the recent contributions made by
Michael S. Tsirkin, Tetsuo Handa, Michal Hocko etc. via reporting and
discussing the related deadlock issues on the mailinglist. Please check
each patch for details.

From a high-level point of view, this patch series achieves:
1) eliminate the deadlock issue fundamentally caused by the inability
to run leak_balloon and fill_balloon concurrently;

We need to think about this carefully. Is it an issue that
leak can now bypass fill? It seems that we can now
try to leak a page before fill was seen by host,
but I did not look into it deeply.

I really like my patch for this better at least for
current kernel. I agree we need to work more on 2+3.


Yes, we can check more. But from the original intention
(copied from commit e22504296d):
balloon_lock (mutex): synchronizes the access demand to elements of
struct virtio_balloon and its queue operations;

This implementation has covered what balloon_lock achieves. We have
inflating and deflating decoupled and use a small lock for each vq
respectively.


I also tested inflating 20G and, before it was done, requested to deflate
20G; all worked fine.






2) enable OOM to release more than 256 inflated pages; and

Does just this help enough? How about my patch + 2?
Tetsuo, what do you think?


3) stop inflating when the guest is under severe memory pressure
(i.e. OOM).

But when do we finally inflate?  Question is how does host know it needs
to resend an interrupt, and when should it do it?


I think "when to inflate again" should be a policy defined by the 
orchestration
layer software on the host. A reasonable inflating request should be 
sent to a

guest on the condition that this guest has enough free memory to inflate
(virtio-balloon memory stats has already supported to report that info).

If the policy defines to inflate guest memory without considering 
whether the guest
is even under memory pressure. The mechanism we provide here is to offer 
no pages

to the host in that case. I think this should be reasonable.


Best,
Wei


[PATCH v1 2/3] virtio-balloon: deflate up to oom_pages on OOM

2017-10-20 Thread Wei Wang
The current implementation only deflates 256 pages even when a user
specifies more than that via the oom_pages module param. This patch
enables the deflating of up to oom_pages pages if there are enough
inflated pages.

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Michal Hocko <mho...@kernel.org>
Cc: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
---
 drivers/virtio/virtio_balloon.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 1ecd15a..ab55cf8 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -43,8 +43,8 @@
 #define OOM_VBALLOON_DEFAULT_PAGES 256
 #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
 
-static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
-module_param(oom_pages, int, S_IRUSR | S_IWUSR);
+static unsigned int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
+module_param(oom_pages, uint, 0600);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 
 #ifdef CONFIG_BALLOON_COMPACTION
@@ -359,16 +359,20 @@ static int virtballoon_oom_notify(struct notifier_block *self,
 {
struct virtio_balloon *vb;
unsigned long *freed;
-   unsigned num_freed_pages;
+   unsigned int npages = oom_pages;
 
vb = container_of(self, struct virtio_balloon, nb);
if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
return NOTIFY_OK;
 
freed = parm;
-   num_freed_pages = leak_balloon(vb, oom_pages);
+
+   /* Don't deflate more than the number of inflated pages */
+   while (npages && atomic64_read(&vb->num_pages))
+   npages -= leak_balloon(vb, npages);
+
update_balloon_size(vb);
-   *freed += num_freed_pages;
+   *freed += oom_pages - npages;
 
return NOTIFY_OK;
 }
-- 
2.7.4



[PATCH v1 0/3] Virtio-balloon Improvement

2017-10-20 Thread Wei Wang
This patch series intends to summarize the recent contributions made by
Michael S. Tsirkin, Tetsuo Handa, Michal Hocko etc. via reporting and
discussing the related deadlock issues on the mailinglist. Please check
each patch for details.

From a high-level point of view, this patch series achieves:
1) eliminate the deadlock issue fundamentally caused by the inability
to run leak_balloon and fill_balloon concurrently;
2) enable OOM to release more than 256 inflated pages; and
3) stop inflating when the guest is under severe memory pressure
(i.e. OOM).

Here is an example of the benefit brought by this patch series:
The guest sets virtio_balloon.oom_pages=100000. When the host requests
to inflate 7.9G of an 8G idle guest, the guest can still run normally
since OOM can guarantee at least 100000 pages (400MB) for the guest.
Without the above patches, the guest will kill all the killable
processes and finally fall into a kernel panic.

Wei Wang (3):
  virtio-balloon: replace the coarse-grained balloon_lock
  virtio-balloon: deflate up to oom_pages on OOM
  virtio-balloon: stop inflating when OOM occurs

 drivers/virtio/virtio_balloon.c | 149 
 1 file changed, 91 insertions(+), 58 deletions(-)

-- 
2.7.4



[PATCH v1 1/3] virtio-balloon: replace the coarse-grained balloon_lock

2017-10-20 Thread Wei Wang
The balloon_lock was used to synchronize the access demand to elements
of struct virtio_balloon and its queue operations (please see commit
e22504296d). This prevents the concurrent run of the leak_balloon and
fill_balloon functions, thereby resulting in a deadlock issue on OOM:

fill_balloon: take balloon_lock and wait for OOM to get some memory;
oom_notify: release some inflated memory via leak_balloon();
leak_balloon: wait for balloon_lock to be released by fill_balloon.

This patch breaks the lock into two fine-grained inflate_lock and
deflate_lock, and eliminates the unnecessary use of the shared data
(i.e. vb->pnfs, vb->num_pfns). This enables leak_balloon and
fill_balloon to run concurrently and solves the deadlock issue.

Reported-by: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
Cc: Michal Hocko <mho...@kernel.org>
---
 drivers/virtio/virtio_balloon.c | 102 +---
 1 file changed, 53 insertions(+), 49 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index f0b3a0b..1ecd15a 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -67,7 +67,7 @@ struct virtio_balloon {
wait_queue_head_t acked;
 
/* Number of balloon pages we've told the Host we're not using. */
-   unsigned int num_pages;
+   atomic64_t num_pages;
/*
 * The pages we've told the Host we're not using are enqueued
 * at vb_dev_info->pages list.
@@ -76,12 +76,9 @@ struct virtio_balloon {
 */
struct balloon_dev_info vb_dev_info;
 
-   /* Synchronize access/update to this struct virtio_balloon elements */
-   struct mutex balloon_lock;
-
-   /* The array of pfns we tell the Host about. */
-   unsigned int num_pfns;
-   __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
+   /* Synchronize access to inflate_vq and deflate_vq respectively */
+   struct mutex inflate_lock;
+   struct mutex deflate_lock;
 
/* Memory statistics */
struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
@@ -111,12 +108,13 @@ static void balloon_ack(struct virtqueue *vq)
wake_up(&vb->acked);
 }
 
-static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
+static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq,
+ __virtio32 pfns[], unsigned int num_pfns)
 {
struct scatterlist sg;
unsigned int len;
 
-   sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+   sg_init_one(&sg, pfns, sizeof(pfns[0]) * num_pfns);

/* We should always be able to add one buffer to an empty queue. */
virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
@@ -144,14 +142,14 @@ static void set_page_pfns(struct virtio_balloon *vb,
 static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 {
struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
-   unsigned num_allocated_pages;
+   unsigned int num_pfns;
+   __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
 
/* We can only do one array worth at a time. */
-   num = min(num, ARRAY_SIZE(vb->pfns));
+   num = min_t(size_t, num, VIRTIO_BALLOON_ARRAY_PFNS_MAX);
 
-   mutex_lock(&vb->balloon_lock);
-   for (vb->num_pfns = 0; vb->num_pfns < num;
-vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
+   for (num_pfns = 0; num_pfns < num;
+num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
struct page *page = balloon_page_enqueue(vb_dev_info);
 
if (!page) {
@@ -162,20 +160,20 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
msleep(200);
break;
}
-   set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
-   vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
+   set_page_pfns(vb, pfns + num_pfns, page);
if (!virtio_has_feature(vb->vdev,
VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
adjust_managed_page_count(page, -1);
}
 
-   num_allocated_pages = vb->num_pfns;
+   mutex_lock(&vb->inflate_lock);
/* Did we get any? */
-   if (vb->num_pfns != 0)
-   tell_host(vb, vb->inflate_vq);
-   mutex_unlock(&vb->balloon_lock);
+   if (num_pfns != 0)
+   tell_host(vb, vb->inflate_vq, pfns, num_pfns);
+   mutex_unlock(&vb->inflate_lock);
+   atomic64_add(num_pfns, &vb->num_pages);
 
-   return num_allocated_pages;
+   return num_pfns;
 }
 
 static void release_pages_balloon(struct virtio_balloon *vb,
@@ -194,38 +192,39 @@ static void release_pages_balloon(struct virtio_balloon *vb,
[PATCH v1 3/3] virtio-balloon: stop inflating when OOM occurs

2017-10-20 Thread Wei Wang
This patch forces the cease of the inflating work when OOM occurs.
The fundamental idea of memory ballooning is to take out some guest
pages when the guest has low memory utilization, so it is sensible to
inflate nothing when the guest is already under memory pressure.

On the other hand, the policy is determined by the admin or the
orchestration layer from the host. That is, the host is expected to
re-start the memory inflating request at a proper time later when
the guest has enough memory to inflate, for example, by checking
the memory stats reported by the balloon. If another inflating
requests is sent to guest when the guest is still under memory
pressure, still no pages will be inflated.

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
Cc: Michal Hocko <mho...@kernel.org>
---
 drivers/virtio/virtio_balloon.c | 33 +
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index ab55cf8..cf29663 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -63,6 +63,15 @@ struct virtio_balloon {
spinlock_t stop_update_lock;
bool stop_update;
 
+   /*
+* The balloon driver enters the oom mode if the oom notifier is
+* invoked. Entering the oom mode will force the exit of current
+* inflating work. When a later inflating request is received from
+* the host, the success of memory allocation via balloon_page_enqueue
+* will turn off the mode.
+*/
+   bool oom_mode;
+
/* Waiting for host to ack the pages we released. */
wait_queue_head_t acked;
 
@@ -142,22 +151,22 @@ static void set_page_pfns(struct virtio_balloon *vb,
 static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 {
struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
+   struct page *page;
+   size_t orig_num;
unsigned int num_pfns;
__virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
 
+   orig_num = num;
/* We can only do one array worth at a time. */
num = min_t(size_t, num, VIRTIO_BALLOON_ARRAY_PFNS_MAX);
 
for (num_pfns = 0; num_pfns < num;
 num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-   struct page *page = balloon_page_enqueue(vb_dev_info);
-
+   page = balloon_page_enqueue(vb_dev_info);
if (!page) {
dev_info_ratelimited(&vb->vdev->dev,
 "Out of puff! Can't get %u pages\n",
 VIRTIO_BALLOON_PAGES_PER_PAGE);
-   /* Sleep for at least 1/5 of a second before retry. */
-   msleep(200);
break;
}
set_page_pfns(vb, pfns + num_pfns, page);
@@ -166,6 +175,13 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
adjust_managed_page_count(page, -1);
}
 
+   /*
+* The oom_mode is set, but we've already been able to get some
+* pages, so it is time to turn it off here.
+*/
+   if (unlikely(READ_ONCE(vb->oom_mode) && page))
+   WRITE_ONCE(vb->oom_mode, false);
+
mutex_lock(&vb->inflate_lock);
/* Did we get any? */
if (num_pfns != 0)
@@ -173,6 +189,13 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
mutex_unlock(&vb->inflate_lock);
atomic64_add(num_pfns, &vb->num_pages);
 
+   /*
+* If oom_mode is on, return the original @num passed by
+* update_balloon_size_func to stop the inflating.
+*/
+   if (READ_ONCE(vb->oom_mode))
+   return orig_num;
+
return num_pfns;
 }
 
@@ -365,6 +388,7 @@ static int virtballoon_oom_notify(struct notifier_block *self,
if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
return NOTIFY_OK;
 
+   WRITE_ONCE(vb->oom_mode, true);
freed = parm;
 
/* Don't deflate more than the number of inflated pages */
@@ -549,6 +573,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
INIT_WORK(&vb->update_balloon_size_work, update_balloon_size_func);
spin_lock_init(&vb->stop_update_lock);
vb->stop_update = false;
+   vb->oom_mode = false;
atomic64_set(&vb->num_pages, 0);
mutex_init(&vb->inflate_lock);
mutex_init(&vb->deflate_lock);
-- 
2.7.4



Re: [PATCH v16 5/5] virtio-balloon: VIRTIO_BALLOON_F_CTRL_VQ

2017-10-19 Thread Wei Wang

On 10/13/2017 09:38 PM, Michael S. Tsirkin wrote:

On Thu, Oct 12, 2017 at 11:54:56AM +0800, Wei Wang wrote:

But I think flushing is very fragile. You will easily run into races
if one of the actors gets out of sync and keeps adding data.
I think adding an ID in the free vq stream is a more robust
approach.


Adding ID to the free vq would need the device to distinguish whether it
receives an ID or a free page hint,

Not really.  It's pretty simple: a 64 bit buffer is an ID. A 4K and bigger one
is a page.


I think we can also use the previous method, free page via in_buf, and 
id via out_buf.
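
As a rough illustration of that in_buf/out_buf convention (the helper
names here are made up for the example; only the buffer direction and
sizes reflect the discussion above):

/* An 8-byte out_buf carries the cmd id given by the host. */
static void demo_add_cmd_id(struct virtqueue *vq, u64 *cmd_id)
{
	struct scatterlist sg;

	sg_init_one(&sg, cmd_id, sizeof(*cmd_id));
	virtqueue_add_outbuf(vq, &sg, 1, cmd_id, GFP_KERNEL);
}

/* A page-sized in_buf carries one free page hint. */
static void demo_add_free_page(struct virtqueue *vq, struct page *page)
{
	struct scatterlist sg;

	sg_init_one(&sg, page_address(page), PAGE_SIZE);
	virtqueue_add_inbuf(vq, &sg, 1, page, GFP_KERNEL);
}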


Best,
Wei


Re: [PATCH] virtio_balloon: fix deadlock on OOM

2017-10-19 Thread Wei Wang

On 10/19/2017 01:19 AM, Michael S. Tsirkin wrote:

On Fri, Oct 13, 2017 at 11:06:23PM +0900, Tetsuo Handa wrote:

Michael S. Tsirkin wrote:

This is a replacement for
[PATCH] virtio: avoid possible OOM lockup at virtballoon_oom_notify()
but unlike that patch it actually deflates on oom even in presence of
lock contention.

But Wei Wang is proposing VIRTIO_BALLOON_F_SG which will try to allocate
memory, isn't he?

Hopefully that can be fixed by allocating outside the lock.



I think that would still have an issue even without the lock, because we 
can't do

any memory allocation in the OOM code path.

Probably, we could write a separate function, leak_balloon_oom() for the 
oom notifier,
which puts the oom deflating pages to the vq one by one, and kick when 
the vq is full.


In this case, we would need to stop the normal leak_balloon while OOM
deflating starts. However, a better optimization, I think, would be some
kind of consolidation: since leak_balloon is already deflating,
leak_balloon_oom can just count the number of pages that have been
deflated by leak_balloon and return when it reaches oom_pages.
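
A minimal sketch of such a leak_balloon_oom() (hypothetical and untested;
send_one_page() stands in for whatever no-allocation descriptor helper is
used, kick_and_wait() waits for the host to consume the buffers, and page
accounting is omitted):

static unsigned int leak_balloon_oom(struct virtio_balloon *vb, size_t num)
{
	struct virtqueue *vq = vb->deflate_vq;
	struct page *page;
	unsigned int released = 0;

	while (released < num &&
	       (page = balloon_page_dequeue(&vb->vb_dev_info))) {
		/* one page per descriptor, no sg/bitmap allocation here */
		send_one_page(vb, vq, page);
		released++;
		/* only kick (and wait for used buffers) when the vq is full */
		if (!vq->num_free)
			kick_and_wait(vq, vb->acked);
	}
	kick_and_wait(vq, vb->acked);
	return released;
}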



Best,
Wei


Re: [lkp-robot] [ipv6] 2b760fcf5c: WARNING:suspicious_RCU_usage

2017-10-12 Thread Wei Wang
On Thu, Oct 12, 2017 at 7:03 PM, kernel test robot
 wrote:
>
> FYI, we noticed the following commit (built with gcc-6):
>
> commit: 2b760fcf5cfb34e8610df56d83745b2b74ae1379 ("ipv6: hook up exception 
> table to store dst cache")
> https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git master
>
> in testcase: boot
>
> on test machine: qemu-system-x86_64 -enable-kvm -m 420M
>
> caused below changes (please refer to attached dmesg/kmsg for entire 
> log/backtrace):
>
>
> +---+++
> |   | 38fbcc | 
> 2b760fcf5c |
> +---+++
> | boot_successes| 6  | 2  
> |
> | boot_failures | 2  | 32 
> |
> | BUG:kernel_hang_in_test_stage | 2  |
> |
> | WARNING:suspicious_RCU_usage  | 0  | 32 
> |
> | net/ipv6/route.c:#suspicious_rcu_dereference_check()usage | 0  | 32 
> |
> | WARNING:at_net/ipv6/route.c:#__rt6_find_exception_rcu | 0  | 32 
> |
> +---+++
>
>

This warning should be fixed by later commit 66f5d6ce53e6 ("ipv6:
replace rwlock with rcu and spinlock in fib6_table").
But by this commit, rcu is not yet used in ip6_pol_route(). (Sorry
that I missed this earlier.) Not sure what to do here to fix this
particular warning for this commit.

> [   19.842463] WARNING: suspicious RCU usage
> [   19.843540] 4.14.0-rc3-00907-g2b760fc #58 Not tainted
> [   19.844776] -
> [   19.845854] net/ipv6/route.c:1355 suspicious rcu_dereference_check() usage!
> [   19.847961]
> [   19.847961] other info that might help us debug this:
> [   19.847961]
> [   19.850409]
> [   19.850409] rcu_scheduler_active = 2, debug_locks = 1
> [   19.852220] 2 locks held by odhcpd/3695:
> [   19.853285]  #0:  (sk_lock-AF_INET6){+.+.}, at: [] 
> ip6_datagram_connect+0x1d/0x3f
> [   19.855480]  #1:  (>tb6_lock){++--}, at: [] 
> ip6_pol_route+0x51/0x80c
> [   19.857583]
> [   19.857583] stack backtrace:
> [   19.859115] CPU: 0 PID: 3695 Comm: odhcpd Not tainted 
> 4.14.0-rc3-00907-g2b760fc #58
> [   19.861087] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> 1.10.2-1 04/01/2014
> [   19.863123] Call Trace:
> [   19.863963]  dump_stack+0x86/0xc0
> [   19.864926]  lockdep_rcu_suspicious+0xea/0xf3
> [   19.866031]  rt6_find_cached_rt+0x51/0x84
> [   19.867173]  ip6_pol_route+0x21c/0x80c
> [   19.868199]  ip6_pol_route_output+0x16/0x18
> [   19.869287]  fib6_rule_lookup+0x1e/0x55
> [   19.870307]  ip6_route_output_flags+0xb6/0xc2
> [   19.871410]  ip6_dst_lookup_tail+0x4f/0x194
> [   19.872500]  ip6_dst_lookup_flow+0x38/0x78
> [   19.873562]  ip6_datagram_dst_update+0x254/0x482
> [   19.874690]  ? save_stack_trace+0x1b/0x1d
> [   19.875760]  __ip6_datagram_connect+0x20e/0x299
> [   19.876881]  ? __ip6_datagram_connect+0x20e/0x299
> [   19.878054]  ip6_datagram_connect+0x2b/0x3f
> [   19.879160]  ip6_datagram_connect_v6_only+0x14/0x1c
> [   19.880348]  inet_dgram_connect+0x49/0x68
> [   19.881424]  SyS_connect+0x74/0xa1
> [   19.882424]  ? __might_fault+0x7e/0x84
> [   19.883495]  ? _copy_from_user+0x61/0x82
> [   19.884531]  compat_SyS_socketcall+0xfb/0x1fd
> [   19.885644]  ? trace_hardirqs_on_caller+0x17b/0x197
> [   19.886868]  do_int80_syscall_32+0x66/0x15a
> [   19.887979]  entry_INT80_compat+0x32/0x50
> [   19.889037] RIP: 0023:0xf7f5e384
> [   19.889985] RSP: 002b:ffc94db8 EFLAGS: 0296 ORIG_RAX: 
> 0066
> [   19.891958] RAX: ffda RBX: 0003 RCX: 
> ffc94dc8
> [   19.893475] RDX: f7fa4000 RSI: ffc94dc8 RDI: 
> ffc94ee0
> [   19.894946] RBP: ffc94e28 R08:  R09: 
> 
> [   19.896454] R10:  R11:  R12: 
> 
> [   19.897968] R13:  R14:  R15: 
> 
> [   19.899594] [ cut here ]
> [   19.900775] WARNING: CPU: 0 PID: 3695 at net/ipv6/route.c:1208 
> __rt6_find_exception_rcu+0x1b/0x7b
> [   19.903220] Modules linked in:
> [   19.904172] CPU: 0 PID: 3695 Comm: odhcpd Not tainted 
> 4.14.0-rc3-00907-g2b760fc #58
> [   19.906147] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> 1.10.2-1 04/01/2014
> [   19.908263] task: a024549f0040 task.stack: aee58081
> [   19.909644] RIP: 0010:__rt6_find_exception_rcu+0x1b/0x7b
> [   19.910973] RSP: :aee580813af0 EFLAGS: 00010246
> [   19.912231] RAX:  RBX: aee580813cf0 RCX: 
> 0001
> [   19.913783] RDX:  RSI: 9b47f900 RDI: 
> 

Re: [PATCH v16 5/5] virtio-balloon: VIRTIO_BALLOON_F_CTRL_VQ

2017-10-11 Thread Wei Wang

On 10/11/2017 09:49 PM, Michael S. Tsirkin wrote:

On Wed, Oct 11, 2017 at 02:03:20PM +0800, Wei Wang wrote:

On 10/10/2017 11:15 PM, Michael S. Tsirkin wrote:

On Mon, Oct 02, 2017 at 04:38:01PM +, Wang, Wei W wrote:

On Sunday, October 1, 2017 11:19 AM, Michael S. Tsirkin wrote:

On Sat, Sep 30, 2017 at 12:05:54PM +0800, Wei Wang wrote:

+static void ctrlq_send_cmd(struct virtio_balloon *vb,
+ struct virtio_balloon_ctrlq_cmd *cmd,
+ bool inbuf)
+{
+   struct virtqueue *vq = vb->ctrl_vq;
+
+   ctrlq_add_cmd(vq, cmd, inbuf);
+   if (!inbuf) {
+   /*
+* All the input cmd buffers are replenished here.
+* This is necessary because the input cmd buffers are lost
+* after live migration. The device needs to rewind all of
+* them from the ctrl_vq.

Confused. Live migration somehow loses state? Why is that and why is it a good
idea? And how do you know this is migration even?
Looks like all you know is you got free page end. Could be any reason for this.

I think this would be something that the current live migration lacks - what the
device read from the vq is not transferred during live migration, an example is 
the
stat_vq_elem:
Line 476 at https://github.com/qemu/qemu/blob/master/hw/virtio/virtio-balloon.c

This does not touch guest memory though it just manipulates
internal state to make it easier to migrate.
It's transparent to guest as migration should be.


For all the things that are added to the vq and need to be held by the device
to use later need to consider the situation that live migration might happen at 
any
time and they need to be re-taken from the vq by the device on the destination
machine.

So, even without this live migration optimization feature, I think all the 
things that are
added to the vq for the device to hold, need a way for the device to rewind 
back from
the vq - re-adding all the elements to the vq is a trick to keep a record of 
all of them
on the vq so that the device side rewinding can work.

Please let me know if anything is missed or if you have other suggestions.

IMO migration should pass enough data source to destination for
destination to continue where source left off without guest help.


I'm afraid it would be difficult to pass the entire VirtQueueElement to the
destination. I think
that would also be the reason that stats_vq_elem chose to rewind from the
guest vq, which re-do the
virtqueue_pop() --> virtqueue_map_desc() steps (the QEMU virtual address to
the guest physical
address relationship may be changed on the destination).

Yes but note how that rewind does not involve modifying the ring.
It just rolls back some indices.


Yes, it rolls back the indices, then the following 
virtio_balloon_receive_stats()

can re-pop out the previous entry given by the guest.

Recall how stats_vq_elem works: there is only one stats buffer, which is 
used by the
guest to report stats, and also used by the host to ask the guest for 
stats report.


So the host can roll back one previous entry and what it gets will 
always be stat_vq_elem.



Our case is a little more complex than that - we have both free_page_cmd_in
(for host to guest command) and free_page_cmd_out (for guest to host 
command) buffer
passed via ctrl_vq. When the host rolls back one entry, it may get the 
free_page_cmd_out
buffer which can't be used as the host to guest buffer (i.e. 
free_page_elem held by the device).


So a trick in the driver is to refill the free_page_cmd_in buffer every
time after the free_page_cmd_out buffer is sent to the host, so that when
the host rewinds one previous entry, it always gets the free_page_cmd_in
buffer (maybe not a very nice method).
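
A rough sketch of that trick (hypothetical; the two cmd buffer fields are
assumed, and ctrlq_add_cmd() is the helper quoted above):

static void ctrlq_send_free_page_stop(struct virtio_balloon *vb)
{
	/* guest -> host: tell the device that the reporting is done */
	ctrlq_add_cmd(vb->ctrl_vq, &vb->free_page_cmd_out, false);

	/*
	 * Refill the host -> guest buffer right away, so that if the device
	 * has to rewind the ctrl_vq after migration, the entry it pops is
	 * always a usable free_page_cmd_in buffer.
	 */
	ctrlq_add_cmd(vb->ctrl_vq, &vb->free_page_cmd_in, true);
	virtqueue_kick(vb->ctrl_vq);
}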






How about another direction which would be easier - using two 32-bit device
specific configuration registers,
Host2Guest and Guest2Host command registers, to replace the ctrlq for
command exchange:

The flow can be as follows:

1) Before Host sending a StartCMD, it flushes the free_page_vq in case any
old free page hint is left there;
2) Host writes StartCMD to the Host2Guest register, and notifies the guest;

3) Upon receiving a configuration notification, Guest reads the Host2Guest
register, and detaches all the used buffers from free_page_vq;
(then for each StartCMD, the free_page_vq will always have no obsolete free
page hints, right? )

4) Guest starts reporting free pages:
 4.1) Host may actively write StopCMD to the Host2Guest register before
the guest finishes; or
 4.2) Guest finishes reporting and writes StopCMD to the Guest2Host register,
which traps to QEMU, to stop.


Best,
Wei

I am not sure it matters whether a VQ or the config are used to start/stop.



It doesn't matter in terms of the flushing issue. The config method could
avoid the above rewind issue.




But I think flushing is very fragile. You will easily run into races
if one of the actors gets out of sync and keeps adding d

Re: [PATCH v16 5/5] virtio-balloon: VIRTIO_BALLOON_F_CTRL_VQ

2017-10-11 Thread Wei Wang

On 10/10/2017 11:15 PM, Michael S. Tsirkin wrote:

On Mon, Oct 02, 2017 at 04:38:01PM +, Wang, Wei W wrote:

On Sunday, October 1, 2017 11:19 AM, Michael S. Tsirkin wrote:

On Sat, Sep 30, 2017 at 12:05:54PM +0800, Wei Wang wrote:

+static void ctrlq_send_cmd(struct virtio_balloon *vb,
+ struct virtio_balloon_ctrlq_cmd *cmd,
+ bool inbuf)
+{
+   struct virtqueue *vq = vb->ctrl_vq;
+
+   ctrlq_add_cmd(vq, cmd, inbuf);
+   if (!inbuf) {
+   /*
+* All the input cmd buffers are replenished here.
+* This is necessary because the input cmd buffers are lost
+* after live migration. The device needs to rewind all of
+* them from the ctrl_vq.

Confused. Live migration somehow loses state? Why is that and why is it a good
idea? And how do you know this is migration even?
Looks like all you know is you got free page end. Could be any reason for this.


I think this would be something that the current live migration lacks - what the
device read from the vq is not transferred during live migration, an example is 
the
stat_vq_elem:
Line 476 at https://github.com/qemu/qemu/blob/master/hw/virtio/virtio-balloon.c

This does not touch guest memory though it just manipulates
internal state to make it easier to migrate.
It's transparent to guest as migration should be.


For all the things that are added to the vq and need to be held by the device
to use later need to consider the situation that live migration might happen at 
any
time and they need to be re-taken from the vq by the device on the destination
machine.

So, even without this live migration optimization feature, I think all the 
things that are
added to the vq for the device to hold, need a way for the device to rewind 
back from
the vq - re-adding all the elements to the vq is a trick to keep a record of 
all of them
on the vq so that the device side rewinding can work.

Please let me know if anything is missed or if you have other suggestions.

IMO migration should pass enough data source to destination for
destination to continue where source left off without guest help.



I'm afraid it would be difficult to pass the entire VirtQueueElement to 
the destination. I think
that would also be the reason that stats_vq_elem chose to rewind from 
the guest vq, which re-do the
virtqueue_pop() --> virtqueue_map_desc() steps (the QEMU virtual address 
to the guest physical

address relationship may be changed on the destination).


How about another direction which would be easier - using two 32-bit 
device specific configuration registers,
Host2Guest and Guest2Host command registers, to replace the ctrlq for 
command exchange:


The flow can be as follows (a rough guest-side sketch follows the list):

1) Before Host sending a StartCMD, it flushes the free_page_vq in case 
any old free page hint is left there;


2) Host writes StartCMD to the Host2Guest register, and notifies the guest;

3) Upon receiving a configuration notification, Guest reads the 
Host2Guest register, and detaches all the used buffers from free_page_vq;
(then for each StartCMD, the free_page_vq will always have no obsolete 
free page hints, right? )


4) Guest starts reporting free pages:
4.1) Host may actively write StopCMD to the Host2Guest register
before the guest finishes; or
4.2) Guest finishes reporting and writes StopCMD to the Guest2Host
register, which traps to QEMU, to stop.
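
A rough guest-side sketch of steps 2)-4) (everything here is hypothetical:
the Host2Guest register, VIRTIO_BALLOON_CMD_START and read_host2guest_cmd()
are made-up names, and the reporting work item is assumed):

static void virtballoon_changed(struct virtio_device *vdev)
{
	struct virtio_balloon *vb = vdev->priv;
	unsigned int len;
	u32 cmd = read_host2guest_cmd(vdev);	/* step 3: read the register */

	if (cmd == VIRTIO_BALLOON_CMD_START) {
		/* detach obsolete free page hints left from an earlier round */
		while (virtqueue_get_buf(vb->free_page_vq, &len))
			;
		/* step 4: kick off the reporting work */
		queue_work(system_freezable_wq, &vb->report_free_page_work);
	}
}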



Best,
Wei






Re: [PATCH v16 3/5] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-10-10 Thread Wei Wang
On 10/11/2017 10:26 AM, Tetsuo Handa wrote:
> Wei Wang wrote:
>> On 10/10/2017 09:09 PM, Tetsuo Handa wrote:
>>> Wei Wang wrote:
>>>>> And even if we could remove balloon_lock, you still cannot use
>>>>> __GFP_DIRECT_RECLAIM at xb_set_page(). I think you will need to use
>>>>> "whether it is safe to wait" flag from
>>>>> "[PATCH] virtio: avoid possible OOM lockup at virtballoon_oom_notify()" .
>>>> Without the lock being held, why couldn't we use __GFP_DIRECT_RECLAIM at
>>>> xb_set_page()?
>>> Because of dependency shown below.
>>>
>>> leak_balloon()
>>>xb_set_page()
>>>  xb_preload(GFP_KERNEL)
>>>kmalloc(GFP_KERNEL)
>>>  __alloc_pages_may_oom()
>>>Takes oom_lock
>>>out_of_memory()
>>>  blocking_notifier_call_chain()
>>>leak_balloon()
>>>  xb_set_page()
>>>xb_preload(GFP_KERNEL)
>>>  kmalloc(GFP_KERNEL)
>>>__alloc_pages_may_oom()
>>>  Fails to take oom_lock and loop forever
>> __alloc_pages_may_oom() uses mutex_trylock(&oom_lock).
> Yes. But this mutex_trylock(&oom_lock) is semantically mutex_lock(&oom_lock)
> because __alloc_pages_slowpath() will continue looping until
> mutex_trylock(&oom_lock) succeeds (or somebody releases memory).
>
>> I think the second __alloc_pages_may_oom() will not continue since the
>> first one is in progress.
> The second __alloc_pages_may_oom() will be called repeatedly because
> __alloc_pages_slowpath() will continue looping (unless somebody releases
> memory).
>

OK, I see, thanks. So, the point is that the OOM code path should not
do any memory allocation, and the old leak_balloon (without the F_SG
feature) doesn't need xb_preload(). I think one solution would be to let
the OOM path use the old leak_balloon() code path, and we can add one
more parameter to leak_balloon to control that:

leak_balloon(struct virtio_balloon *vb, size_t num, bool oom)
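
As a sketch, the OOM notifier would then be roughly the following
(hypothetical; apart from the added bool this roughly mirrors the existing
notifier):

static int virtballoon_oom_notify(struct notifier_block *self,
				  unsigned long dummy, void *parm)
{
	struct virtio_balloon *vb = container_of(self, struct virtio_balloon,
						 nb);
	unsigned long *freed = parm;

	if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
		return NOTIFY_OK;

	/* oom=true: take the old pfn-array path, which never allocates */
	*freed += leak_balloon(vb, oom_pages, true);
	update_balloon_size(vb);

	return NOTIFY_OK;
}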



>>> By the way, is xb_set_page() safe?
>>> Sleeping in the kernel with preemption disabled is a bug, isn't it?
>>> __radix_tree_preload() returns 0 with preemption disabled upon success.
>>> xb_preload() disables preemption if __radix_tree_preload() fails.
>>> Then, kmalloc() is called with preemption disabled, isn't it?
>>> But xb_set_page() calls xb_preload(GFP_KERNEL) which might sleep with
>>> preemption disabled.
>> Yes, I think that should not be expected, thanks.
>>
>> I plan to change it like this:
>>
>> bool xb_preload(gfp_t gfp)
>> {
>> if (!this_cpu_read(ida_bitmap)) {
>> struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp);
>>
>> if (!bitmap)
>> return false;
>> bitmap = this_cpu_cmpxchg(ida_bitmap, NULL, bitmap);
>> kfree(bitmap);
>> }
> Excuse me, but you are allocating per-CPU memory when running CPU might
> change at this line? What happens if running CPU has changed at this line?
> Will it work even with new CPU's ida_bitmap == NULL ?
>


Yes, it will be detected in xb_set_bit(): when ida_bitmap = NULL on the
new CPU, xb_set_bit() will
return -EAGAIN to the caller, and the caller should restart from
xb_preload().
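
A possible caller-side pattern for that restart (a sketch; xb_set_page_pfn()
is a made-up helper, and the xb_* calls are the API proposed in this series):

static int xb_set_page_pfn(struct virtio_balloon *vb, unsigned long pfn)
{
	int ret;

	do {
		if (!xb_preload(GFP_KERNEL))
			return -ENOMEM;
		ret = xb_set_bit(&vb->page_xb, pfn);
		xb_preload_end();
		/*
		 * -EAGAIN: we migrated to a CPU whose ida_bitmap is still
		 * NULL, so preload again and retry.
		 */
	} while (ret == -EAGAIN);

	return ret;
}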

Best,
Wei





Re: [PATCH v16 3/5] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-10-10 Thread Wei Wang

On 10/10/2017 09:09 PM, Tetsuo Handa wrote:

Wei Wang wrote:

And even if we could remove balloon_lock, you still cannot use
__GFP_DIRECT_RECLAIM at xb_set_page(). I think you will need to use
"whether it is safe to wait" flag from
"[PATCH] virtio: avoid possible OOM lockup at virtballoon_oom_notify()" .

Without the lock being held, why couldn't we use __GFP_DIRECT_RECLAIM at
xb_set_page()?

Because of dependency shown below.

leak_balloon()
   xb_set_page()
 xb_preload(GFP_KERNEL)
   kmalloc(GFP_KERNEL)
 __alloc_pages_may_oom()
   Takes oom_lock
   out_of_memory()
 blocking_notifier_call_chain()
   leak_balloon()
 xb_set_page()
   xb_preload(GFP_KERNEL)
 kmalloc(GFP_KERNEL)
   __alloc_pages_may_oom()
 Fails to take oom_lock and loop forever


__alloc_pages_may_oom() uses mutex_trylock(&oom_lock).

I think the second __alloc_pages_may_oom() will not continue since the
first one is in progress.



By the way, is xb_set_page() safe?
Sleeping in the kernel with preemption disabled is a bug, isn't it?
__radix_tree_preload() returns 0 with preemption disabled upon success.
xb_preload() disables preemption if __radix_tree_preload() fails.
Then, kmalloc() is called with preemption disabled, isn't it?
But xb_set_page() calls xb_preload(GFP_KERNEL) which might sleep with
preemption disabled.


Yes, I think that should not be expected, thanks.

I plan to change it like this:

bool xb_preload(gfp_t gfp)
{
	if (!this_cpu_read(ida_bitmap)) {
		struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp);

		if (!bitmap)
			return false;
		bitmap = this_cpu_cmpxchg(ida_bitmap, NULL, bitmap);
		kfree(bitmap);
	}

	if (__radix_tree_preload(gfp, XB_PRELOAD_SIZE) < 0)
		return false;

	return true;
}


Best,
Wei



Re: [PATCH][V2] ipv6: fix incorrect bitwise operator used on rt6i_flags

2017-10-10 Thread Wei Wang
On Tue, Oct 10, 2017 at 11:10 AM, Colin King <colin.k...@canonical.com> wrote:
> From: Colin Ian King <colin.k...@canonical.com>
>
> The use of the | operator always leads to true which looks rather
> suspect to me. Fix this by using & instead to just check the
> RTF_CACHE entry bit.
>
> Detected by CoverityScan, CID#1457734, #1457747 ("Wrong operator used")
>
> Fixes: 35732d01fe31 ("ipv6: introduce a hash table to store dst cache")
> Signed-off-by: Colin Ian King <colin.k...@canonical.com>
> ---

Acked-by: Wei Wang <wei...@google.com>

>  net/ipv6/route.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 6db1541eaa7b..dd9ba1192dbc 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -1425,7 +1425,7 @@ int rt6_remove_exception_rt(struct rt6_info *rt)
> int err;
>
> if (!from ||
> -   !(rt->rt6i_flags | RTF_CACHE))
> +   !(rt->rt6i_flags & RTF_CACHE))
> return -EINVAL;
>
> if (!rcu_access_pointer(from->rt6i_exception_bucket))
> @@ -1469,7 +1469,7 @@ static void rt6_update_exception_stamp_rt(struct 
> rt6_info *rt)
> struct rt6_exception *rt6_ex;
>
> if (!from ||
> -   !(rt->rt6i_flags | RTF_CACHE))
> +   !(rt->rt6i_flags & RTF_CACHE))
> return;
>
> rcu_read_lock();
> --
> 2.14.1
>


Re: [PATCH][net-next] ipv6: fix incorrect bitwise operator used on rt6i_flags

2017-10-10 Thread Wei Wang
On Tue, Oct 10, 2017 at 11:10 AM, Martin KaFai Lau <ka...@fb.com> wrote:
> On Tue, Oct 10, 2017 at 05:55:27PM +, Colin King wrote:
>> From: Colin Ian King <colin.k...@canonical.com>
>>
>> The use of the | operator always leads to true on the expression
>> (rt->rt6i_flags | RTF_CACHE) which looks rather suspect to me. I
>> believe this is fixed by using & instead to just check the
>> RTF_CACHE entry bit.
> Good catch. LGTM. If rt does not have RTF_CACHE set, it should not be in the
> exception table.
>
> Acked-by: Martin KaFai Lau <ka...@fb.com>
>

Thanks a lot for catching this. Yes. It should have been '&' instead of '|'.

Acked-by: Wei Wang <wei...@google.com>

>>
>> Detected by CoverityScan, CID#1457747 ("Wrong operator used")
>>
>> Fixes: 35732d01fe31 ("ipv6: introduce a hash table to store dst cache")
>> Signed-off-by: Colin Ian King <colin.k...@canonical.com>
>> ---
>>  net/ipv6/route.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>> index 6db1541eaa7b..0556d1ee189c 100644
>> --- a/net/ipv6/route.c
>> +++ b/net/ipv6/route.c
>> @@ -1425,7 +1425,7 @@ int rt6_remove_exception_rt(struct rt6_info *rt)
>>   int err;
>>
>>   if (!from ||
>> - !(rt->rt6i_flags | RTF_CACHE))
>> + !(rt->rt6i_flags & RTF_CACHE))
>>   return -EINVAL;
>>
>>   if (!rcu_access_pointer(from->rt6i_exception_bucket))
>> --
>> 2.14.1
>>


Re: [PATCH v16 3/5] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-10-10 Thread Wei Wang

On 10/10/2017 07:08 PM, Tetsuo Handa wrote:

Wei Wang wrote:

On 10/09/2017 11:20 PM, Michael S. Tsirkin wrote:

On Sat, Sep 30, 2017 at 12:05:52PM +0800, Wei Wang wrote:

+static inline void xb_set_page(struct virtio_balloon *vb,
+  struct page *page,
+  unsigned long *pfn_min,
+  unsigned long *pfn_max)
+{
+   unsigned long pfn = page_to_pfn(page);
+
+   *pfn_min = min(pfn, *pfn_min);
+   *pfn_max = max(pfn, *pfn_max);
+   xb_preload(GFP_KERNEL);
+   xb_set_bit(&vb->page_xb, pfn);
+   xb_preload_end();
+}
+

So, this will allocate memory

...


@@ -198,9 +327,12 @@ static unsigned leak_balloon(struct virtio_balloon *vb, 
size_t num)
struct page *page;
struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
LIST_HEAD(pages);
+   bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG);
+   unsigned long pfn_max = 0, pfn_min = ULONG_MAX;
   
-	/* We can only do one array worth at a time. */

-   num = min(num, ARRAY_SIZE(vb->pfns));
+   /* Traditionally, we can only do one array worth at a time. */
+   if (!use_sg)
+   num = min(num, ARRAY_SIZE(vb->pfns));
   
   	mutex_lock(&vb->balloon_lock);

/* We can't release more pages than taken */

And is sometimes called on OOM.


I suspect we need to

1. keep around some memory for leak on oom

2. for non oom allocate outside locks



I think maybe we can optimize the existing balloon logic, which could
remove the big balloon lock:

It would not be necessary to have the inflating and deflating run at the
same time.
For example, 1st request to inflate 7G RAM, when 1GB has been given to
the host (so 6G left), the
2nd request to deflate 5G is received. Instead of waiting for the 1st
request to inflate 6G and then
continuing with the 2nd request to deflate 5G, we can do a diff (6G to
inflate - 5G to deflate) immediately,
and got 1G to inflate. In this way, all that driver will do is to simply
inflate another 1G.

Same for the OOM case: when OOM asks for 1G, while inflating 5G is in
progress, then the driver can
deduct 1G from the amount that needs to inflate, and as a result, it
will inflate 4G.

In this case, we will never have the inflating and deflating task run at
the same time, so I think it is
possible to remove the lock, and therefore, we will not have that
deadlock issue.

What would you guys think?

What is balloon_lock at virtballoon_migratepage() for?

   e22504296d4f64fb "virtio_balloon: introduce migration primitives to balloon 
pages"
   f68b992bbb474641 "virtio_balloon: fix race by fill and leak"


I think that's the part we need to improve for the existing 
implementation when going with the above direction.


As also stated in the commit log, the lock was proposed to synchronize
accesses to elements of struct virtio_balloon and its queue operations. To
be more precise, fill_balloon/leak_balloon/migratepage share vb->pfns[] and
vb->num_pfns, which could instead each use their own local variables.


For example, for migratepage:
+   __virtio32 pfn;
...
-   vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
-   set_page_pfns(vb, vb->pfns, newpage);
-   tell_host(vb, vb->inflate_vq);
+   set_page_pfns(vb, &pfn, newpage);
+   tell_host(vb, vb->inflate_vq, &pfn, VIRTIO_BALLOON_PAGES_PER_PAGE);

For the queue access, a small per-queue lock could be used, which I think
won't cause the issue.





And even if we could remove balloon_lock, you still cannot use
__GFP_DIRECT_RECLAIM at xb_set_page(). I think you will need to use
"whether it is safe to wait" flag from
"[PATCH] virtio: avoid possible OOM lockup at virtballoon_oom_notify()" .


Without the lock being held, why couldn't we use __GFP_DIRECT_RECLAIM at 
xb_set_page()?



Best,
Wei




Re: [PATCH v16 3/5] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-10-10 Thread Wei Wang

On 10/09/2017 11:20 PM, Michael S. Tsirkin wrote:

On Sat, Sep 30, 2017 at 12:05:52PM +0800, Wei Wang wrote:

+static inline void xb_set_page(struct virtio_balloon *vb,
+  struct page *page,
+  unsigned long *pfn_min,
+  unsigned long *pfn_max)
+{
+   unsigned long pfn = page_to_pfn(page);
+
+   *pfn_min = min(pfn, *pfn_min);
+   *pfn_max = max(pfn, *pfn_max);
+   xb_preload(GFP_KERNEL);
+   xb_set_bit(&vb->page_xb, pfn);
+   xb_preload_end();
+}
+

So, this will allocate memory

...


@@ -198,9 +327,12 @@ static unsigned leak_balloon(struct virtio_balloon *vb, 
size_t num)
struct page *page;
struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
LIST_HEAD(pages);
+   bool use_sg = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_SG);
+   unsigned long pfn_max = 0, pfn_min = ULONG_MAX;
  
-	/* We can only do one array worth at a time. */

-   num = min(num, ARRAY_SIZE(vb->pfns));
+   /* Traditionally, we can only do one array worth at a time. */
+   if (!use_sg)
+   num = min(num, ARRAY_SIZE(vb->pfns));
  
  	mutex_lock(&vb->balloon_lock);

/* We can't release more pages than taken */

And is sometimes called on OOM.


I suspect we need to

1. keep around some memory for leak on oom

2. for non oom allocate outside locks




I think maybe we can optimize the existing balloon logic, which could 
remove the big balloon lock:


It would not be necessary to have the inflating and deflating run at the
same time. For example, suppose the 1st request is to inflate 7G of RAM;
when 1G has been given to the host (so 6G is left), a 2nd request to
deflate 5G is received. Instead of waiting for the 1st request to inflate
the remaining 6G and then continuing with the 2nd request to deflate 5G,
we can do a diff (6G to inflate - 5G to deflate) immediately and get 1G to
inflate. In this way, all the driver will do is simply inflate another 1G.


Same for the OOM case: when OOM asks for 1G, while inflating 5G is in 
progress, then the driver can
deduct 1G from the amount that needs to inflate, and as a result, it 
will inflate 4G.


In this case, we will never have the inflating and deflating task run at 
the same time, so I think it is
possible to remove the lock, and therefore, we will not have that 
deadlock issue.
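
A rough sketch of that single-worker "diff" approach (a hypothetical
simplification of the existing worker; towards_target() returns the target
minus the currently inflated pages):

static void update_balloon_size_func(struct work_struct *work)
{
	struct virtio_balloon *vb = container_of(work, struct virtio_balloon,
						 update_balloon_size_work);
	s64 diff = towards_target(vb);

	/*
	 * Only the remaining diff is acted on, so inflating and deflating
	 * never run at the same time and no big balloon_lock is needed.
	 */
	if (diff > 0)
		fill_balloon(vb, diff);
	else if (diff < 0)
		leak_balloon(vb, -diff);
	update_balloon_size(vb);
}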


What would you guys think?

Best,
Wei


Re: [PATCH v16 0/5] Virtio-balloon Enhancement

2017-10-09 Thread Wei Wang

On 10/01/2017 09:25 PM, Damian Tometzki wrote:

Hello,

where i can found the patch in git.kernel.org ?



We don't have patches there. If you want to try this feature, you can 
get the qemu side draft code here: https://github.com/wei-w-wang/qemu-lm


Best,
Wei


[PATCH v16 2/5] radix tree test suite: add tests for xbitmap

2017-09-29 Thread Wei Wang
From: Matthew Wilcox <mawil...@microsoft.com>

Add the following tests for xbitmap:
1) single bit test: single bit set/clear/find;
2) bit range test: set/clear a range of bits and find a 0 or 1 bit in
the range.

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Cc: Matthew Wilcox <mawil...@microsoft.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Michael S. Tsirkin <m...@redhat.com>
---
 tools/include/linux/bitmap.h|  34 
 tools/include/linux/kernel.h|   2 +
 tools/testing/radix-tree/Makefile   |   7 +-
 tools/testing/radix-tree/linux/kernel.h |   2 -
 tools/testing/radix-tree/main.c |   5 +
 tools/testing/radix-tree/test.h |   1 +
 tools/testing/radix-tree/xbitmap.c  | 269 
 7 files changed, 317 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/radix-tree/xbitmap.c

diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index e8b9f51..890dab2 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -36,6 +36,40 @@ static inline void bitmap_zero(unsigned long *dst, int nbits)
}
 }
 
+static inline void __bitmap_clear(unsigned long *map, unsigned int start,
+ int len)
+{
+   unsigned long *p = map + BIT_WORD(start);
+   const unsigned int size = start + len;
+   int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+   unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+   while (len - bits_to_clear >= 0) {
+   *p &= ~mask_to_clear;
+   len -= bits_to_clear;
+   bits_to_clear = BITS_PER_LONG;
+   mask_to_clear = ~0UL;
+   p++;
+   }
+   if (len) {
+   mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+   *p &= ~mask_to_clear;
+   }
+}
+
+static inline __always_inline void bitmap_clear(unsigned long *map,
+   unsigned int start,
+   unsigned int nbits)
+{
+   if (__builtin_constant_p(nbits) && nbits == 1)
+   __clear_bit(start, map);
+   else if (__builtin_constant_p(start & 7) && IS_ALIGNED(start, 8) &&
+__builtin_constant_p(nbits & 7) && IS_ALIGNED(nbits, 8))
+   memset((char *)map + start / 8, 0, nbits / 8);
+   else
+   __bitmap_clear(map, start, nbits);
+}
+
 static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
 {
unsigned int nlongs = BITS_TO_LONGS(nbits);
diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h
index 77d2e94..21e90ee 100644
--- a/tools/include/linux/kernel.h
+++ b/tools/include/linux/kernel.h
@@ -12,6 +12,8 @@
 #define UINT_MAX   (~0U)
 #endif
 
+#define IS_ALIGNED(x, a)   (((x) & ((typeof(x))(a) - 1)) == 0)
+
 #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
 
 #define PERF_ALIGN(x, a)   __PERF_ALIGN_MASK(x, (typeof(x))(a)-1)
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
index 6a9480c..fc7cb422 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -5,7 +5,8 @@ LDLIBS+= -lpthread -lurcu
 TARGETS = main idr-test multiorder
 CORE_OFILES := radix-tree.o idr.o linux.o test.o find_bit.o
 OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \
-tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o
+tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o \
+xbitmap.o
 
 ifndef SHIFT
SHIFT=3
@@ -24,6 +25,9 @@ idr-test: idr-test.o $(CORE_OFILES)
 
 multiorder: multiorder.o $(CORE_OFILES)
 
+xbitmap: xbitmap.o $(CORE_OFILES)
+   $(CC) $(CFLAGS) $(LDFLAGS) $^ -o xbitmap
+
 clean:
$(RM) $(TARGETS) *.o radix-tree.c idr.c generated/map-shift.h
 
@@ -33,6 +37,7 @@ $(OFILES): Makefile *.h */*.h generated/map-shift.h \
../../include/linux/*.h \
../../include/asm/*.h \
../../../include/linux/radix-tree.h \
+   ../../../include/linux/xbitmap.h \
../../../include/linux/idr.h
 
 radix-tree.c: ../../../lib/radix-tree.c
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index b21a77f..c1e6088 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -16,6 +16,4 @@
 #define pr_debug printk
 #define pr_cont printk
 
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-
 #endif /* _KERNEL_H */
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c
index bc9a784..6f4774e 100644
--- a/tools/testing/radix-tree/main.c
+++ b/tools/testing/radix-tree/main.c
@@ -337,6 +337,11 @@ static void single_thread_tests(bool long_run)
rcu_barrier();
printv(2, "after copy_tag_check: %d allocated, preempt %d\n",
nr_allocated, preempt_count);
+
+   xbitmap_checks();
+   rcu_barrier()


[PATCH v16 4/5] mm: support reporting free page blocks

2017-09-29 Thread Wei Wang
This patch adds support to walk through the free page blocks in the
system and report them via a callback function. Some page blocks may
leave the free list after zone->lock is released, so it is the caller's
responsibility to either detect or prevent the use of such pages.

One example use of this patch is to accelerate live migration by skipping
the transfer of free pages reported by the guest. A popular method used
by the hypervisor to track which part of memory is written during live
migration is to write-protect all the guest memory. So, those pages that
are reported as free pages but are written after the report function
returns will be captured by the hypervisor, and they will be added to the
next round of memory transfer.
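
For illustration, a minimal sketch of a caller of this interface; only the
walk_free_mem_block() prototype below comes from this patch, the callback and
context names are hypothetical:

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/printk.h>

struct free_page_ctx {
	unsigned long reported;		/* pages reported so far */
	unsigned long limit;		/* stop after this many pages */
};

static bool record_free_range(void *opaque, unsigned long pfn, unsigned long num)
{
	struct free_page_ctx *ctx = opaque;

	/* zone->lock is held by the walker: no sleeping, no allocations here */
	ctx->reported += num;
	pr_debug("free range: pfn %lu, %lu pages\n", pfn, num);

	/* returning false asks walk_free_mem_block() to stop early */
	return ctx->reported < ctx->limit;
}

static void report_free_pages_example(void)
{
	struct free_page_ctx ctx = { .reported = 0, .limit = 1UL << 20 };

	/* query large blocks only: low orders are too volatile to be useful */
	walk_free_mem_block(&ctx, MAX_ORDER - 1, record_free_range);
}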

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
Cc: Michal Hocko <mho...@kernel.org>
Cc: Michael S. Tsirkin <m...@redhat.com>
---
 include/linux/mm.h |  6 
 mm/page_alloc.c| 91 ++
 2 files changed, 97 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46b9ac5..d9652c2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1835,6 +1835,12 @@ extern void free_area_init_node(int nid, unsigned long * zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
 extern void free_initmem(void);
 
+extern void walk_free_mem_block(void *opaque,
+   int min_order,
+   bool (*report_pfn_range)(void *opaque,
+unsigned long pfn,
+unsigned long num));
+
 /*
  * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
  * into the buddy system. The freed pages will be poisoned with pattern
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d00f74..c6bb874 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4762,6 +4762,97 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
show_swap_cache_info();
 }
 
+/*
+ * Walk through a free page list and report the found pfn range via the
+ * callback.
+ *
+ * Return false if the callback requests to stop reporting. Otherwise,
+ * return true.
+ */
+static bool walk_free_page_list(void *opaque,
+   struct zone *zone,
+   int order,
+   enum migratetype mt,
+   bool (*report_pfn_range)(void *,
+unsigned long,
+unsigned long))
+{
+   struct page *page;
+   struct list_head *list;
+   unsigned long pfn, flags;
+   bool ret;
+
+   spin_lock_irqsave(&zone->lock, flags);
+   list = &zone->free_area[order].free_list[mt];
+   list_for_each_entry(page, list, lru) {
+   pfn = page_to_pfn(page);
+   ret = report_pfn_range(opaque, pfn, 1 << order);
+   if (!ret)
+   break;
+   }
+   spin_unlock_irqrestore(&zone->lock, flags);
+
+   return ret;
+}
+
+/**
+ * walk_free_mem_block - Walk through the free page blocks in the system
+ * @opaque: the context passed from the caller
+ * @min_order: the minimum order of free lists to check
+ * @report_pfn_range: the callback to report the pfn range of the free pages
+ *
+ * If the callback returns false, stop iterating the list of free page blocks.
+ * Otherwise, continue to report.
+ *
+ * Please note that there are no locking guarantees for the callback and
+ * that the reported pfn range might be freed or disappear after the
+ * callback returns so the caller has to be very careful how it is used.
+ *
+ * The callback itself must not sleep or perform any operations which would
+ * require any memory allocations directly (not even GFP_NOWAIT/GFP_ATOMIC)
+ * or via any lock dependency. It is generally advisable to implement
+ * the callback as simple as possible and defer any heavy lifting to a
+ * different context.
+ *
+ * There is no guarantee that each free range will be reported only once
+ * during one walk_free_mem_block invocation.
+ *
+ * pfn_to_page on the given range is strongly discouraged and if there is
+ * an absolute need for that make sure to contact MM people to discuss
+ * potential problems.
+ *
+ * The function itself might sleep so it cannot be called from atomic
+ * contexts.
+ *
+ * In general low orders tend to be very volatile and so it makes more
+ * sense to query larger ones first for various optimizations which like
+ * ballooning etc... This will reduce the overhead as well.
+ */
+void walk_free_mem_block(void *opaque,
+int min_order,
+bool (*report_pfn_range)(void *opaque,
+

[PATCH v16 3/5] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-09-29 Thread Wei Wang
Add a new feature, VIRTIO_BALLOON_F_SG, which enables the transfer
of balloon (i.e. inflated/deflated) pages using scatter-gather lists
to the host.

The implementation of the previous virtio-balloon is not very
efficient, because the balloon pages are transferred to the
host one by one. Here is the breakdown of the time in percentage
spent on each step of the balloon inflating process (inflating
7GB of an 8GB idle guest).

1) allocating pages (6.5%)
2) sending PFNs to host (68.3%)
3) address translation (6.1%)
4) madvise (19%)

It takes about 4126ms for the inflating process to complete.
The above profiling shows that the bottlenecks are stage 2)
and stage 4).

This patch optimizes stage 2) by transferring pages to the host in
sgs. An sg describes a chunk of physically contiguous guest pages.
With this mechanism, stage 4) can also be optimized by doing address
translation and madvise() in chunks rather than page by page.

With this new feature, the above ballooning process takes ~492ms
resulting in an improvement of ~88%.

TODO: optimize stage 1) by allocating/freeing a chunk of pages
instead of a single page each time.
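
As a rough illustration of the chunking idea (not the driver's actual code
path, which tracks the pages in an xbitmap rather than an array), physically
contiguous PFNs can be coalesced into one sg entry each:

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/scatterlist.h>

/*
 * Illustrative only: coalesce a sorted array of balloon PFNs into maximal
 * physically contiguous runs, one scatterlist entry per run, instead of
 * one entry per 4K page.  The caller is assumed to have initialized @sgs
 * with sg_init_table().
 */
static unsigned int pfns_to_sgs(const unsigned long *pfns, unsigned int npfns,
				struct scatterlist *sgs, unsigned int max_sgs)
{
	unsigned int i = 0, nsgs = 0;

	while (i < npfns && nsgs < max_sgs) {
		unsigned long start = pfns[i];
		unsigned int run = 1;

		/* extend the run while the next pfn is physically adjacent */
		while (i + run < npfns && pfns[i + run] == start + run)
			run++;

		/* keep each sg length within its 32-bit field */
		if (run > (UINT_MAX >> PAGE_SHIFT))
			run = UINT_MAX >> PAGE_SHIFT;

		sg_set_page(&sgs[nsgs++], pfn_to_page(start),
			    run << PAGE_SHIFT, 0);
		i += run;
	}

	return nsgs;
}

The actual tell_host_sgs() in the diff below does the equivalent by scanning
the xbitmap for runs of set bits and similarly caps each sg below UINT_MAX.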

Signed-off-by: Wei Wang <wei.w.w...@intel.com>
Signed-off-by: Liang Li <liang.z...@intel.com>
Suggested-by: Michael S. Tsirkin <m...@redhat.com>
---
 drivers/virtio/virtio_balloon.c | 188 
 include/uapi/linux/virtio_balloon.h |   1 +
 2 files changed, 172 insertions(+), 17 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index f0b3a0b..6952e19 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -32,6 +32,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 /*
  * Balloon device works in 4K page units.  So each page is pointed to by
@@ -79,6 +81,9 @@ struct virtio_balloon {
/* Synchronize access/update to this struct virtio_balloon elements */
struct mutex balloon_lock;
 
+   /* The xbitmap used to record balloon pages */
+   struct xb page_xb;
+
/* The array of pfns we tell the Host about. */
unsigned int num_pfns;
__virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
@@ -141,13 +146,128 @@ static void set_page_pfns(struct virtio_balloon *vb,
  page_to_balloon_pfn(page) + i);
 }
 
+
+static void kick_and_wait(struct virtqueue *vq, wait_queue_head_t wq_head)
+{
+   unsigned int len;
+
+   virtqueue_kick(vq);
+   wait_event(wq_head, virtqueue_get_buf(vq, &len));
+}
+
+static int add_one_sg(struct virtqueue *vq, void *addr, uint32_t size)
+{
+   struct scatterlist sg;
+   unsigned int len;
+
+   sg_init_one(&sg, addr, size);
+
+   /* Detach all the used buffers from the vq */
+   while (virtqueue_get_buf(vq, &len))
+   ;
+
+   return virtqueue_add_inbuf(vq, &sg, 1, vq, GFP_KERNEL);
+}
+
+static int send_balloon_page_sg(struct virtio_balloon *vb,
+struct virtqueue *vq,
+void *addr,
+uint32_t size,
+bool batch)
+{
+   int err;
+
+   err = add_one_sg(vq, addr, size);
+
+   /* If batching is requested, we batch till the vq is full */
+   if (!batch || !vq->num_free)
+   kick_and_wait(vq, vb->acked);
+
+   return err;
+}
+
+/*
+ * Send balloon pages in sgs to host. The balloon pages are recorded in the
+ * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
+ * The page xbitmap is searched for contiguous "1" bits, which correspond
+ * to contiguous pages, to chunk into sgs.
+ *
+ * @page_xb_start and @page_xb_end form the range of bits in the xbitmap that
+ * need to be searched.
+ */
+static void tell_host_sgs(struct virtio_balloon *vb,
+ struct virtqueue *vq,
+ unsigned long page_xb_start,
+ unsigned long page_xb_end)
+{
+   unsigned long sg_pfn_start, sg_pfn_end;
+   void *sg_addr;
+   uint32_t sg_len, sg_max_len = round_down(UINT_MAX, PAGE_SIZE);
+   int err = 0;
+
+   sg_pfn_start = page_xb_start;
+   while (sg_pfn_start < page_xb_end) {
+   sg_pfn_start = xb_find_next_set_bit(&vb->page_xb, sg_pfn_start,
+   page_xb_end);
+   if (sg_pfn_start == page_xb_end + 1)
+   break;
+   sg_pfn_end = xb_find_next_zero_bit(&vb->page_xb,
+  sg_pfn_start + 1,
+  page_xb_end);
+   sg_addr = (void *)pfn_to_kaddr(sg_pfn_start);
+   sg_len = (sg_pfn_end - sg_pfn_start) << PAGE_SHIFT;
+   while (sg_len > sg_max_len) {
+   err = send_balloon_page_sg(vb, vq, sg_addr, sg_max_len,
+  
