Re: [PATCH v14 3/5] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-08-21 Thread Michael S. Tsirkin
On Fri, Aug 18, 2017 at 03:39:27PM +0800, Wei Wang wrote:
> On 08/18/2017 10:22 AM, Michael S. Tsirkin wrote:
> > +static void send_balloon_page_sg(struct virtio_balloon *vb,
> > +struct virtqueue *vq,
> > +void *addr,
> > +uint32_t size)
> > +{
> > +   unsigned int len;
> > +   int ret;
> > +
> > +   do {
> > +   ret = add_one_sg(vq, addr, size);
> > +   virtqueue_kick(vq);
> > +   wait_event(vb->acked, virtqueue_get_buf(vq, &len));
> > +   /*
> > +* It is uncommon to see the vq is full, because the sg is sent
> > +* one by one and the device is able to handle it in time. But
> > +* if that happens, we go back to retry after an entry gets
> > +* released.
> > +*/
> > Why send one by one though? Why not batch some s/gs and wait for all
> > of them to be completed? If memory is fragmented, waiting every time is
> > worse than what we have now (VIRTIO_BALLOON_ARRAY_PFNS_MAX at a time).
> > 
> 
> OK, I'll do batching in some fashion.
> 
> 
> Best,
> Wei
> 
> 

btw you need to address the build errors that the kbuild bot has found.

-- 
MST
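
For reference, a minimal sketch of the batching asked for above, reusing the
patch's add_one_sg() helper and the vb->acked wait queue. The function name,
the caller-supplied addr/size arrays, and the completion accounting are
illustrative assumptions, not code from this series:

/*
 * Sketch only: queue several sgs, kick once, then wait until the device
 * has consumed all of them.  A real implementation would also resubmit
 * whatever did not fit once the vq has drained.
 */
static void send_balloon_sgs_batched(struct virtio_balloon *vb,
				     struct virtqueue *vq,
				     void **addrs, uint32_t *sizes,
				     unsigned int nr)
{
	unsigned int len, i, queued = 0;

	for (i = 0; i < nr; i++) {
		if (add_one_sg(vq, addrs[i], sizes[i]) < 0)
			break;		/* vq full (or error): send what we have */
		queued++;
	}
	virtqueue_kick(vq);

	while (queued) {
		/* each wait consumes exactly one used buffer */
		wait_event(vb->acked, virtqueue_get_buf(vq, &len));
		queued--;
	}
}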


Re: [PATCH v14 3/5] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-08-19 Thread kbuild test robot
Hi Wei,

[auto build test ERROR on linus/master]
[also build test ERROR on v4.13-rc5 next-20170817]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Wei-Wang/lib-xbitmap-Introduce-xbitmap/20170820-035516
config: xtensa-allmodconfig (attached as .config)
compiler: xtensa-linux-gcc (GCC) 4.9.0
reproduce:
wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=xtensa 

All errors (new ones prefixed by >>):

   drivers/virtio/virtio_balloon.c: In function 'tell_host_sgs':
>> drivers/virtio/virtio_balloon.c:203:3: error: implicit declaration of function 'pfn_to_kaddr' [-Werror=implicit-function-declaration]
  sg_addr = (void *)pfn_to_kaddr(sg_pfn_start);
  ^
   cc1: some warnings being treated as errors

vim +/pfn_to_kaddr +203 drivers/virtio/virtio_balloon.c

   176  
   177  /*
   178   * Send balloon pages in sgs to host. The balloon pages are recorded in the
   179   * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
   180   * The page xbitmap is searched for continuous "1" bits, which correspond
   181   * to continuous pages, to chunk into sgs.
   182   *
   183   * @page_xb_start and @page_xb_end form the range of bits in the xbitmap that
   184   * need to be searched.
   185   */
   186  static void tell_host_sgs(struct virtio_balloon *vb,
   187                            struct virtqueue *vq,
   188                            unsigned long page_xb_start,
   189                            unsigned long page_xb_end)
   190  {
   191          unsigned long sg_pfn_start, sg_pfn_end;
   192          void *sg_addr;
   193          uint32_t sg_len, sg_max_len = round_down(UINT_MAX, PAGE_SIZE);
   194  
   195          sg_pfn_start = page_xb_start;
   196          while (sg_pfn_start < page_xb_end) {
   197                  sg_pfn_start = xb_find_next_bit(&vb->page_xb, sg_pfn_start,
   198                                                  page_xb_end, 1);
   199                  if (sg_pfn_start == page_xb_end + 1)
   200                          break;
   201                  sg_pfn_end = xb_find_next_bit(&vb->page_xb, sg_pfn_start + 1,
   202                                                page_xb_end, 0);
 > 203                  sg_addr = (void *)pfn_to_kaddr(sg_pfn_start);
   204                  sg_len = (sg_pfn_end - sg_pfn_start) << PAGE_SHIFT;
   205                  while (sg_len > sg_max_len) {
   206                          send_balloon_page_sg(vb, vq, sg_addr, sg_max_len);
   207                          sg_addr += sg_max_len;
   208                          sg_len -= sg_max_len;
   209                  }
   210                  send_balloon_page_sg(vb, vq, sg_addr, sg_len);
   211                  xb_zero(&vb->page_xb, sg_pfn_start, sg_pfn_end);
   212                  sg_pfn_start = sg_pfn_end + 1;
   213          }
   214  }
   215  

---
0-DAY kernel test infrastructure            Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip
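
The failure above is that pfn_to_kaddr() is an arch-provided macro which
xtensa (among other architectures at the time) does not define, so the driver
cannot use it directly. A hedged sketch of one possible workaround is to go
through struct page instead; balloon_pfn_to_addr() is a hypothetical helper
name, and the sketch assumes the PFN lies in the kernel direct mapping
(page_address() returns NULL for unmapped highmem pages):

#include <linux/mm.h>

/* Sketch: arch-neutral PFN -> kernel virtual address for direct-mapped pages */
static inline void *balloon_pfn_to_addr(unsigned long pfn)
{
	return page_address(pfn_to_page(pfn));
}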


Re: [PATCH v14 3/5] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-08-18 Thread Wei Wang

On 08/18/2017 10:22 AM, Michael S. Tsirkin wrote:

+static void send_balloon_page_sg(struct virtio_balloon *vb,
+struct virtqueue *vq,
+void *addr,
+uint32_t size)
+{
+   unsigned int len;
+   int ret;
+
+   do {
+   ret = add_one_sg(vq, addr, size);
+   virtqueue_kick(vq);
+   wait_event(vb->acked, virtqueue_get_buf(vq, &len));
+   /*
+* It is uncommon to see the vq is full, because the sg is sent
+* one by one and the device is able to handle it in time. But
+* if that happens, we go back to retry after an entry gets
+* released.
+*/
Why send one by one though? Why not batch some s/gs and wait for all
of them to be completed? If memory is fragmented, waiting every time is
worse than what we have now (VIRTIO_BALLOON_ARRAY_PFNS_MAX at a time).



OK, I'll do batching in some fashion.


Best,
Wei





Re: [PATCH v14 3/5] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-08-17 Thread Michael S. Tsirkin
On Thu, Aug 17, 2017 at 11:26:54AM +0800, Wei Wang wrote:
> Add a new feature, VIRTIO_BALLOON_F_SG, which enables the transfer
> of balloon (i.e. inflated/deflated) pages using scatter-gather lists
> to the host.
> 
> The implementation of the previous virtio-balloon is not very
> efficient, because the balloon pages are transferred to the
> host one by one. Here is the breakdown of the time in percentage
> spent on each step of the balloon inflating process (inflating
> 7GB of an 8GB idle guest).
> 
> 1) allocating pages (6.5%)
> 2) sending PFNs to host (68.3%)
> 3) address translation (6.1%)
> 4) madvise (19%)
> 
> It takes about 4126ms for the inflating process to complete.
> The above profiling shows that the bottlenecks are stage 2)
> and stage 4).
> 
> This patch optimizes step 2) by transferring pages to the host in
> sgs. An sg describes a chunk of guest physically continuous pages.
> With this mechanism, step 4) can also be optimized by doing address
> translation and madvise() in chunks rather than page by page.
> 
> With this new feature, the above ballooning process takes ~541ms
> resulting in an improvement of ~87%.
> 
> TODO: optimize stage 1) by allocating/freeing a chunk of pages
> instead of a single page each time.
> 
> Signed-off-by: Wei Wang 
> Signed-off-by: Liang Li 
> Suggested-by: Michael S. Tsirkin 
> ---
>  drivers/virtio/virtio_balloon.c | 157 
> 
>  include/uapi/linux/virtio_balloon.h |   1 +
>  2 files changed, 141 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index f0b3a0b..72041b4 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -32,6 +32,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  /*
>   * Balloon device works in 4K page units.  So each page is pointed to by
> @@ -79,6 +80,9 @@ struct virtio_balloon {
>   /* Synchronize access/update to this struct virtio_balloon elements */
>   struct mutex balloon_lock;
>  
> + /* The xbitmap used to record ballooned pages */
> + struct xb page_xb;
> +
>   /* The array of pfns we tell the Host about. */
>   unsigned int num_pfns;
>   __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
> @@ -141,13 +145,98 @@ static void set_page_pfns(struct virtio_balloon *vb,
> page_to_balloon_pfn(page) + i);
>  }
>  
> +static int add_one_sg(struct virtqueue *vq, void *addr, uint32_t size)
> +{
> + struct scatterlist sg;
> +
> > + sg_init_one(&sg, addr, size);
> > + return virtqueue_add_inbuf(vq, &sg, 1, vq, GFP_KERNEL);
> +}
> +
> +static void send_balloon_page_sg(struct virtio_balloon *vb,
> +  struct virtqueue *vq,
> +  void *addr,
> +  uint32_t size)
> +{
> + unsigned int len;
> + int ret;
> +
> + do {
> + ret = add_one_sg(vq, addr, size);
> + virtqueue_kick(vq);
> > + wait_event(vb->acked, virtqueue_get_buf(vq, &len));
> + /*
> +  * It is uncommon to see the vq is full, because the sg is sent
> +  * one by one and the device is able to handle it in time. But
> +  * if that happens, we go back to retry after an entry gets
> +  * released.
> +  */

Why send one by one though? Why not batch some s/gs and wait for all
of them to be completed? If memory is fragmented, waiting every time is
worse than what we have now (VIRTIO_BALLOON_ARRAY_PFNS_MAX at a time).

> + } while (unlikely(ret == -ENOSPC));
> +}
> +
> +/*
> + * Send balloon pages in sgs to host. The balloon pages are recorded in the
> + * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
> + * The page xbitmap is searched for continuous "1" bits, which correspond
> + * to continuous pages, to chunk into sgs.
> + *
> + * @page_xb_start and @page_xb_end form the range of bits in the xbitmap that
> + * need to be searched.
> + */
> +static void tell_host_sgs(struct virtio_balloon *vb,
> +   struct virtqueue *vq,
> +   unsigned long page_xb_start,
> +   unsigned long page_xb_end)
> +{
> + unsigned long sg_pfn_start, sg_pfn_end;
> + void *sg_addr;
> + uint32_t sg_len, sg_max_len = round_down(UINT_MAX, PAGE_SIZE);
> +
> + sg_pfn_start = page_xb_start;
> + while (sg_pfn_start < page_xb_end) {
> > + sg_pfn_start = xb_find_next_bit(&vb->page_xb, sg_pfn_start,
> + page_xb_end, 1);
> + if (sg_pfn_start == page_xb_end + 1)
> + break;
> > + sg_pfn_end = xb_find_next_bit(&vb->page_xb, sg_pfn_start + 1,
> +   page_xb_end, 0);
> + sg_addr = (void *)pfn_to_kaddr(sg_pfn_start);
> 
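
A quick check of the quoted figures: inflate time drops from ~4126 ms to
~541 ms, and (4126 - 541) / 4126 ≈ 0.87, which matches the ~87% improvement
claimed in the commit message.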

[PATCH v14 3/5] virtio-balloon: VIRTIO_BALLOON_F_SG

2017-08-16 Thread Wei Wang
Add a new feature, VIRTIO_BALLOON_F_SG, which enables the transfer
of balloon (i.e. inflated/deflated) pages using scatter-gather lists
to the host.

The implementation of the previous virtio-balloon is not very
efficient, because the balloon pages are transferred to the
host one by one. Here is the breakdown of the time in percentage
spent on each step of the balloon inflating process (inflating
7GB of an 8GB idle guest).

1) allocating pages (6.5%)
2) sending PFNs to host (68.3%)
3) address translation (6.1%)
4) madvise (19%)

It takes about 4126ms for the inflating process to complete.
The above profiling shows that the bottlenecks are stage 2)
and stage 4).

This patch optimizes step 2) by transferring pages to the host in
sgs. An sg describes a chunk of guest physically continuous pages.
With this mechanism, step 4) can also be optimized by doing address
translation and madvise() in chunks rather than page by page.

With this new feature, the above ballooning process takes ~541ms
resulting in an improvement of ~87%.

TODO: optimize stage 1) by allocating/freeing a chunk of pages
instead of a single page each time.

Signed-off-by: Wei Wang 
Signed-off-by: Liang Li 
Suggested-by: Michael S. Tsirkin 
---
 drivers/virtio/virtio_balloon.c | 157 
 include/uapi/linux/virtio_balloon.h |   1 +
 2 files changed, 141 insertions(+), 17 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index f0b3a0b..72041b4 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Balloon device works in 4K page units.  So each page is pointed to by
@@ -79,6 +80,9 @@ struct virtio_balloon {
/* Synchronize access/update to this struct virtio_balloon elements */
struct mutex balloon_lock;
 
+   /* The xbitmap used to record ballooned pages */
+   struct xb page_xb;
+
/* The array of pfns we tell the Host about. */
unsigned int num_pfns;
__virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
@@ -141,13 +145,98 @@ static void set_page_pfns(struct virtio_balloon *vb,
  page_to_balloon_pfn(page) + i);
 }
 
+static int add_one_sg(struct virtqueue *vq, void *addr, uint32_t size)
+{
+   struct scatterlist sg;
+
+   sg_init_one(&sg, addr, size);
+   return virtqueue_add_inbuf(vq, &sg, 1, vq, GFP_KERNEL);
+}
+
+static void send_balloon_page_sg(struct virtio_balloon *vb,
+struct virtqueue *vq,
+void *addr,
+uint32_t size)
+{
+   unsigned int len;
+   int ret;
+
+   do {
+   ret = add_one_sg(vq, addr, size);
+   virtqueue_kick(vq);
+   wait_event(vb->acked, virtqueue_get_buf(vq, &len));
+   /*
+* It is uncommon to see the vq is full, because the sg is sent
+* one by one and the device is able to handle it in time. But
+* if that happens, we go back to retry after an entry gets
+* released.
+*/
+   } while (unlikely(ret == -ENOSPC));
+}
+
+/*
+ * Send balloon pages in sgs to host. The balloon pages are recorded in the
+ * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
+ * The page xbitmap is searched for continuous "1" bits, which correspond
+ * to continuous pages, to chunk into sgs.
+ *
+ * @page_xb_start and @page_xb_end form the range of bits in the xbitmap that
+ * need to be searched.
+ */
+static void tell_host_sgs(struct virtio_balloon *vb,
+ struct virtqueue *vq,
+ unsigned long page_xb_start,
+ unsigned long page_xb_end)
+{
+   unsigned long sg_pfn_start, sg_pfn_end;
+   void *sg_addr;
+   uint32_t sg_len, sg_max_len = round_down(UINT_MAX, PAGE_SIZE);
+
+   sg_pfn_start = page_xb_start;
+   while (sg_pfn_start < page_xb_end) {
+   sg_pfn_start = xb_find_next_bit(&vb->page_xb, sg_pfn_start,
+   page_xb_end, 1);
+   if (sg_pfn_start == page_xb_end + 1)
+   break;
+   sg_pfn_end = xb_find_next_bit(&vb->page_xb, sg_pfn_start + 1,
+ page_xb_end, 0);
+   sg_addr = (void *)pfn_to_kaddr(sg_pfn_start);
+   sg_len = (sg_pfn_end - sg_pfn_start) << PAGE_SHIFT;
+   while (sg_len > sg_max_len) {
+   send_balloon_page_sg(vb, vq, sg_addr, sg_max_len);
+   sg_addr += sg_max_len;
+   sg_len -= sg_max_len;
+   }
+   send_balloon_page_sg(vb, vq, sg_addr, sg_len);
+   xb_zero(&vb->page_xb, sg_pfn_start, sg_pfn_end);
+   sg_pfn_start = sg_pfn_end + 1;
+   }
+}
+
+static
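
As an aside, the coalescing that tell_host_sgs() performs above can be read as
a plain run-length walk over set bits. Below is a sketch of the same loop
written against an ordinary kernel bitmap with find_next_bit() and
find_next_zero_bit(), with an emit() callback standing in for
send_balloon_page_sg(); the function and callback names are assumptions made
for illustration, and the extra split of runs larger than
round_down(UINT_MAX, PAGE_SIZE) bytes is omitted:

#include <linux/bitops.h>

static void for_each_pfn_run(const unsigned long *bitmap,
			     unsigned long start, unsigned long end,
			     void (*emit)(unsigned long pfn, unsigned long nr))
{
	unsigned long run_start = start, run_end;

	while (run_start < end) {
		run_start = find_next_bit(bitmap, end, run_start);
		if (run_start >= end)
			break;
		run_end = find_next_zero_bit(bitmap, end, run_start + 1);
		/* [run_start, run_end) is one contiguous run of ballooned PFNs */
		emit(run_start, run_end - run_start);
		run_start = run_end + 1;
	}
}

Calling this with the balloon's PFN bitmap and a small adapter around
send_balloon_page_sg() would reproduce the chunking behaviour of the patch.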