[RESEND PATCH v3 kernel 4/7] virtio-balloon: speed up inflate/deflate process

2016-10-20 Thread Liang Li
The current virtio-balloon implementation is not very efficient.
The time spent on the different stages of inflating the balloon
to 7GB of an 8GB idle guest breaks down as follows:

a. allocating pages (6.5%)
b. sending PFNs to host (68.3%)
c. address translation (6.1%)
d. madvise (19%)

It takes about 4126ms for the inflating process to complete.
Debugging shows that the bottlenecks are stages b and d.

Using a bitmap to send the page information instead of an array of
PFNs greatly reduces the overhead of stage b. Furthermore, by doing
the address translation and calling madvise() on bulk ranges of RAM
pages instead of page by page, the overhead of stages c and d can
also be reduced considerably.
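
As a back-of-the-envelope illustration (not part of the patch), this
standalone sketch compares the bytes transferred by the two encodings
for the 7GB case above, assuming 4KB pages and the 32-bit PFNs the
balloon interface uses:

#include <stdio.h>

int main(void)
{
        unsigned long nr_pages = (7UL << 30) >> 12;  /* 7GB of 4KB pages */
        unsigned long pfn_bytes = nr_pages * 4;      /* one __virtio32 per page */
        unsigned long bmap_bytes = nr_pages / 8;     /* one bit per page */

        printf("pfn array: %lu KB, bitmap: %lu KB (%lux smaller)\n",
               pfn_bytes >> 10, bmap_bytes >> 10, pfn_bytes / bmap_bytes);
        return 0;
}

For a dense range the bitmap needs about 1/32 of the traffic, which is
where most of the stage b saving comes from.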

This patch is the kernel-side implementation, which speeds up the
inflate/deflate process by adding a new feature to the virtio-balloon
device. With this feature, inflating the balloon to 7GB of an 8GB
idle guest takes only 590ms, a performance improvement of about 85%.

TODO: optimize stage a by allocating/freeing a chunk of pages
instead of a single page at a time.

Signed-off-by: Liang Li 
Suggested-by: Michael S. Tsirkin 
Cc: Michael S. Tsirkin 
Cc: Paolo Bonzini 
Cc: Cornelia Huck 
Cc: Amit Shah 
---
 drivers/virtio/virtio_balloon.c | 233 +++-
 1 file changed, 209 insertions(+), 24 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 59ffe5a..c31839c 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -42,6 +42,10 @@
 #define OOM_VBALLOON_DEFAULT_PAGES 256
 #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
 
+#define BALLOON_BMAP_SIZE  (8 * PAGE_SIZE)
+#define PFNS_PER_BMAP  (BALLOON_BMAP_SIZE * BITS_PER_BYTE)
+#define BALLOON_BMAP_COUNT 32
+
 static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
 module_param(oom_pages, int, S_IRUSR | S_IWUSR);
 MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
@@ -67,6 +71,13 @@ struct virtio_balloon {
 
/* Number of balloon pages we've told the Host we're not using. */
unsigned int num_pages;
+   /* Pointer to the bitmap header. */
+   void *bmap_hdr;
+   /* Bitmaps and bitmap count used to tell the host about the pages */
+   unsigned long *page_bitmap[BALLOON_BMAP_COUNT];
+   unsigned int nr_page_bmap;
+   /* Used to record the processed pfn range */
+   unsigned long min_pfn, max_pfn, start_pfn, end_pfn;
/*
 * The pages we've told the Host we're not using are enqueued
 * at vb_dev_info->pages list.
@@ -110,16 +121,66 @@ static void balloon_ack(struct virtqueue *vq)
wake_up(&vb->acked);
 }
 
+static inline void init_pfn_range(struct virtio_balloon *vb)
+{
+   vb->min_pfn = ULONG_MAX;
+   vb->max_pfn = 0;
+}
+
+static inline void update_pfn_range(struct virtio_balloon *vb,
+struct page *page)
+{
+   unsigned long balloon_pfn = page_to_balloon_pfn(page);
+
+   if (balloon_pfn < vb->min_pfn)
+   vb->min_pfn = balloon_pfn;
+   if (balloon_pfn > vb->max_pfn)
+   vb->max_pfn = balloon_pfn;
+}
+
 static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
 {
-   struct scatterlist sg;
-   unsigned int len;
+   struct scatterlist sg, sg2[BALLOON_BMAP_COUNT + 1];
+   unsigned int len, i;
+
+   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP)) {
+   struct balloon_bmap_hdr *hdr = vb->bmap_hdr;
+   unsigned long bmap_len;
+   int nr_pfn, nr_used_bmap, nr_buf;
+
+   nr_pfn = vb->end_pfn - vb->start_pfn + 1;
+   nr_pfn = roundup(nr_pfn, BITS_PER_LONG);
+   nr_used_bmap = nr_pfn / PFNS_PER_BMAP;
+   bmap_len = nr_pfn / BITS_PER_BYTE;
+   nr_buf = nr_used_bmap + 1;
+
+   /* cmd, reserved and req_id are initialized to 0 and unused here */
+   hdr->page_shift = cpu_to_virtio16(vb->vdev, PAGE_SHIFT);
+   hdr->start_pfn = cpu_to_virtio64(vb->vdev, vb->start_pfn);
+   hdr->bmap_len = cpu_to_virtio64(vb->vdev, bmap_len);
+   sg_init_table(sg2, nr_buf);
+   sg_set_buf(&sg2[0], hdr, sizeof(struct balloon_bmap_hdr));
+   for (i = 0; i < nr_used_bmap; i++) {
+   unsigned int  buf_len = BALLOON_BMAP_SIZE;
+
+   if (i + 1 == nr_used_bmap)
+   buf_len = bmap_len - BALLOON_BMAP_SIZE * i;
+   sg_set_buf(&sg2[i + 1], vb->page_bitmap[i], buf_len);
+   }
 
-   sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
+   while (vq->num_free < nr_buf)
+   msleep(2);
+   if (virtqueue_add_outbuf(vq, sg2, nr_buf, vb, GFP_KERNEL) == 0)
+   virtqueue_kick(vq);
 
-   /* We should always be able to add one buffer to an empty queue. */

[RESEND PATCH v3 kernel 7/7] virtio-balloon: tell host vm's unused page info

2016-10-20 Thread Liang Li
Support the host's request for the VM's unused page information and
respond with a page bitmap. QEMU can combine this bitmap with the
dirty page logging mechanism to skip transferring these unused pages,
which helps to speed up the live migration process significantly.
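
For illustration only (not part of the patch), the guest-side dispatch
for such a request boils down to something like this sketch; the
handler name is hypothetical, the real wiring goes through the misc
virtqueue callback:

static void misc_handle_request(struct virtio_balloon *vb)
{
        struct balloon_req_hdr *hdr = &vb->req_hdr;
        u16 cmd = virtio16_to_cpu(vb->vdev, hdr->cmd);
        u64 req_id = virtio64_to_cpu(vb->vdev, hdr->param);

        switch (cmd) {
        case BALLOON_GET_UNUSED_PAGES:
                send_unused_pages_info(vb, req_id);
                break;
        default:
                break;          /* unknown command: ignore */
        }
}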

Signed-off-by: Liang Li 
Cc: Michael S. Tsirkin 
Cc: Paolo Bonzini 
Cc: Cornelia Huck 
Cc: Amit Shah 
---
 drivers/virtio/virtio_balloon.c | 143 +---
 1 file changed, 134 insertions(+), 9 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index c31839c..f10bb8b 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -56,7 +56,7 @@
 
 struct virtio_balloon {
struct virtio_device *vdev;
-   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *misc_vq;
 
/* The balloon servicing is delegated to a freezable workqueue. */
struct work_struct update_balloon_stats_work;
@@ -78,6 +78,8 @@ struct virtio_balloon {
unsigned int nr_page_bmap;
/* Used to record the processed pfn range */
unsigned long min_pfn, max_pfn, start_pfn, end_pfn;
+   /* Request header */
+   struct balloon_req_hdr req_hdr;
/*
 * The pages we've told the Host we're not using are enqueued
 * at vb_dev_info->pages list.
@@ -423,6 +425,78 @@ static void update_balloon_stats(struct virtio_balloon *vb)
pages_to_bytes(available));
 }
 
+static void send_unused_pages_info(struct virtio_balloon *vb,
+   unsigned long req_id)
+{
+   struct scatterlist sg_in, sg_out[BALLOON_BMAP_COUNT + 1];
+   unsigned long pfn = 0, bmap_len, pfn_limit, last_pfn, nr_pfn;
+   struct virtqueue *vq = vb->misc_vq;
+   struct balloon_bmap_hdr *hdr = vb->bmap_hdr;
+   int ret = 1, nr_buf, used_nr_bmap = 0, i;
+
+   if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP) &&
+   vb->nr_page_bmap == 1)
+   extend_page_bitmap(vb);
+
+   pfn_limit = PFNS_PER_BMAP * vb->nr_page_bmap;
+   mutex_lock(&vb->balloon_lock);
+   last_pfn = get_max_pfn();
+
+   while (ret) {
+   clear_page_bitmap(vb);
+   ret = get_unused_pages(pfn, pfn + pfn_limit, vb->page_bitmap,
+PFNS_PER_BMAP, vb->nr_page_bmap);
+   if (ret < 0)
+   break;
+   hdr->cmd = cpu_to_virtio16(vb->vdev, BALLOON_GET_UNUSED_PAGES);
+   hdr->page_shift = cpu_to_virtio16(vb->vdev, PAGE_SHIFT);
+   hdr->req_id = cpu_to_virtio64(vb->vdev, req_id);
+   hdr->start_pfn = cpu_to_virtio64(vb->vdev, pfn);
+   bmap_len = BALLOON_BMAP_SIZE * vb->nr_page_bmap;
+
+   if (!ret) {
+   hdr->flag = cpu_to_virtio16(vb->vdev,
+BALLOON_FLAG_DONE);
+   nr_pfn = last_pfn - pfn;
+   used_nr_bmap = nr_pfn / PFNS_PER_BMAP;
+   if (nr_pfn % PFNS_PER_BMAP)
+   used_nr_bmap++;
+   bmap_len = nr_pfn / BITS_PER_BYTE;
+   } else {
+   hdr->flag = cpu_to_virtio16(vb->vdev,
+   BALLOON_FLAG_CONT);
+   used_nr_bmap = vb->nr_page_bmap;
+   }
+   hdr->bmap_len = cpu_to_virtio64(vb->vdev, bmap_len);
+   nr_buf = used_nr_bmap + 1;
+   sg_init_table(sg_out, nr_buf);
+   sg_set_buf(&sg_out[0], hdr, sizeof(struct balloon_bmap_hdr));
+   for (i = 0; i < used_nr_bmap; i++) {
+   unsigned int buf_len = BALLOON_BMAP_SIZE;
+
+   if (i + 1 == used_nr_bmap)
+   buf_len = bmap_len - BALLOON_BMAP_SIZE * i;
+   sg_set_buf(&sg_out[i + 1], vb->page_bitmap[i], buf_len);
+   }
+
+   while (vq->num_free < nr_buf)
+   msleep(2);
+   if (virtqueue_add_outbuf(vq, sg_out, nr_buf, vb,
+GFP_KERNEL) == 0) {
+   virtqueue_kick(vq);
+   while (!virtqueue_get_buf(vq, &i)
+   && !virtqueue_is_broken(vq))
+   cpu_relax();
+   }
+   pfn += pfn_limit;
+   }
+
+   mutex_unlock(&vb->balloon_lock);
+   sg_init_one(&sg_in, &vb->req_hdr, sizeof(vb->req_hdr));
+   virtqueue_add_inbuf(vq, &sg_in, 1, &vb->req_hdr, GFP_KERNEL);
+   virtqueue_kick(vq);
+}
+
 /*
  * While most virtqueues communicate guest-initiated requests to the 
hypervisor,
  * the stats queue operates in reverse.  The driver initializes the virtqueue
@@ -563,18 +637,56 @@ static void update_bal

[RESEND PATCH v3 kernel 5/7] mm: add the related functions to get unused page

2016-10-20 Thread Liang Li
Save the unused page info into a page bitmap. The virtio-balloon
driver calls this new API to get the unused page bitmap and sends
the bitmap to the hypervisor (QEMU) to speed up live migration.
While the bitmap is being sent, some of the pages may be modified
and are no longer free; this inaccuracy can be corrected by the
dirty page logging mechanism.
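
A minimal kernel-context sketch of a caller (illustrative only; the
real caller is the virtio-balloon driver in patch 7/7, and the sizes
mirror its BALLOON_BMAP_SIZE/PFNS_PER_BMAP choices):

        #define NR_BMAP         32
        #define BMAP_BITS       (8 * PAGE_SIZE * BITS_PER_BYTE)

        unsigned long *bmaps[NR_BMAP], start = 0;
        int i, ret;

        for (i = 0; i < NR_BMAP; i++)
                bmaps[i] = kzalloc(8 * PAGE_SIZE, GFP_KERNEL);

        do {
                /* clear the bitmaps before each window */
                for (i = 0; i < NR_BMAP; i++)
                        memset(bmaps[i], 0, 8 * PAGE_SIZE);
                ret = get_unused_pages(start, start + NR_BMAP * BMAP_BITS,
                                       bmaps, BMAP_BITS, NR_BMAP);
                if (ret < 0)
                        break;
                /* ... send the bitmaps for this pfn window to the host ... */
                start += NR_BMAP * BMAP_BITS;
        } while (ret == 1);     /* 1 means more PFNs remain past end_pfn */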

Signed-off-by: Liang Li 
Cc: Andrew Morton 
Cc: Mel Gorman 
Cc: Michael S. Tsirkin 
Cc: Paolo Bonzini 
Cc: Cornelia Huck 
Cc: Amit Shah 
---
 include/linux/mm.h |  2 ++
 mm/page_alloc.c| 84 ++
 2 files changed, 86 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2a89da0e..84f56ec 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1777,6 +1777,8 @@ extern void free_area_init_node(int nid, unsigned long * 
zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
 extern void free_initmem(void);
 extern unsigned long get_max_pfn(void);
+extern int get_unused_pages(unsigned long start_pfn, unsigned long end_pfn,
+   unsigned long *bitmap[], unsigned long len, unsigned int nr_bmap);
 
 /*
  * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e5f63a9..848bb85 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4436,6 +4436,90 @@ unsigned long get_max_pfn(void)
 }
 EXPORT_SYMBOL(get_max_pfn);
 
+static void mark_unused_pages_bitmap(struct zone *zone,
+   unsigned long start_pfn, unsigned long end_pfn,
+   unsigned long *bitmap[], unsigned long bits,
+   unsigned int nr_bmap)
+{
+   unsigned long pfn, flags, nr_pg, pos, *bmap;
+   unsigned int order, i, t, bmap_idx;
+   struct list_head *curr;
+
+   if (zone_is_empty(zone))
+   return;
+
+   end_pfn = min(start_pfn + nr_bmap * bits, end_pfn);
+   spin_lock_irqsave(&zone->lock, flags);
+
+   for_each_migratetype_order(order, t) {
+   list_for_each(curr, &zone->free_area[order].free_list[t]) {
+   pfn = page_to_pfn(list_entry(curr, struct page, lru));
+   if (pfn < start_pfn || pfn >= end_pfn)
+   continue;
+   nr_pg = 1UL << order;
+   if (pfn + nr_pg > end_pfn)
+   nr_pg = end_pfn - pfn;
+   bmap_idx = (pfn - start_pfn) / bits;
+   if (bmap_idx == (pfn + nr_pg - start_pfn) / bits) {
+   bmap = bitmap[bmap_idx];
+   pos = (pfn - start_pfn) % bits;
+   bitmap_set(bmap, pos, nr_pg);
+   } else {
+   /* The block spans two bitmaps; set its bits one by one. */
+   for (i = 0; i < nr_pg; i++) {
+   pos = pfn + i - start_pfn;
+   bmap_idx = pos / bits;
+   bmap = bitmap[bmap_idx];
+   bitmap_set(bmap, pos % bits, 1);
+   }
+   }
+   }
+   }
+
+   spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * During live migration, a page is always discardable unless its
+ * content is needed by the system.
+ * get_unused_pages provides an API to get the unused pages; these
+ * unused pages can be discarded if there is no modification since
+ * the request. Some other mechanism, like dirty page logging, can
+ * be used to track the modifications.
+ *
+ * This function scans the free page lists for pages whose PFNs lie
+ * in the range from start_pfn to end_pfn, and sets the corresponding
+ * bit in the bitmap for each unused page it finds.
+ *
+ * Allocating one large bitmap may fail because of fragmentation, so
+ * instead of a single bitmap we use a scatter/gather bitmap:
+ * 'bitmap' is the start address of an array which contains 'nr_bmap'
+ * separate small bitmaps, each of which holds 'bits' bits.
+ *
+ * return -1 if parameters are invalid
+ * return 0 when end_pfn >= max_pfn
+ * return 1 when end_pfn < max_pfn
+ */
+int get_unused_pages(unsigned long start_pfn, unsigned long end_pfn,
+   unsigned long *bitmap[], unsigned long bits, unsigned int nr_bmap)
+{
+   struct zone *zone;
+   int ret = 0;
+
+   if (bitmap == NULL || *bitmap == NULL || nr_bmap == 0 ||
+bits == 0 || start_pfn > end_pfn)
+   return -1;
+   if (end_pfn < max_pfn)
+   ret = 1;
+
+   for_each_populated_zone(zone)
+   mark_unused_pages_bitmap(zone, start_pfn, end_pfn, bitmap,
+bits, nr_bmap);
+
+   return ret;
+}
+EXPORT_SYMBOL(get_unused_pages);
+
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
zoneref->zone = zone;
-- 
1.8.3.1


[RESEND PATCH v3 kernel 6/7] virtio-balloon: define feature bit and head for misc virt queue

2016-10-20 Thread Liang Li
Define a new feature bit which advertises a new virtqueue. This
new virtqueue is for information exchange between the hypervisor
and the guest. The hypervisor can use it to request that the guest
perform some operations, e.g. drop the page cache or sync the file
system, and it can retrieve some of the guest's runtime information
through it, e.g. the guest's unused page information, which can be
used for live migration optimization.
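
For illustration only, the host side would queue a small request
header on the new queue; a rough sketch (pseudo-host code; the
request id helper and the byte-order handling are assumptions, not
part of this patch):

        struct balloon_req_hdr req_hdr = {0};

        req_hdr.cmd = BALLOON_GET_UNUSED_PAGES;
        req_hdr.param = req_id;         /* e.g. a migration pass id (assumed) */
        /* queue &req_hdr on the misc virtqueue, then collect one or more
         * balloon_bmap_hdr + bitmap responses until a response arrives
         * with flag == BALLOON_FLAG_DONE */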

Signed-off-by: Liang Li 
Cc: Michael S. Tsirkin 
Cc: Paolo Bonzini 
Cc: Cornelia Huck 
Cc: Amit Shah 
---
 include/uapi/linux/virtio_balloon.h | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/include/uapi/linux/virtio_balloon.h 
b/include/uapi/linux/virtio_balloon.h
index d3b182a..3a9d633 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -35,6 +35,7 @@
 #define VIRTIO_BALLOON_F_STATS_VQ  1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_PAGE_BITMAP   3 /* Send page info with bitmap */
+#define VIRTIO_BALLOON_F_MISC_VQ   4 /* Misc info virtqueue */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
@@ -101,4 +102,25 @@ struct balloon_bmap_hdr {
__virtio64 bmap_len;
 };
 
+enum balloon_req_id {
+   /* Get unused pages information */
+   BALLOON_GET_UNUSED_PAGES,
+};
+
+enum balloon_flag {
+   /* Have more data for a request */
+   BALLOON_FLAG_CONT,
+   /* No more data for a request */
+   BALLOON_FLAG_DONE,
+};
+
+struct balloon_req_hdr {
+   /* Used to distinguish different requests */
+   __virtio16 cmd;
+   /* Reserved */
+   __virtio16 reserved[3];
+   /* Request parameter */
+   __virtio64 param;
+};
+
 #endif /* _LINUX_VIRTIO_BALLOON_H */
-- 
1.8.3.1



[RESEND PATCH v3 kernel 0/7] Extend virtio-balloon for fast (de)inflating & fast live migration

2016-10-20 Thread Liang Li
This patch set contains two parts of changes to the virtio-balloon.

One is the change for speeding up the inflating & deflating process.
The main idea of this optimization is to use a bitmap instead of an
array of PFNs to send the page information to the host, which reduces
the overhead of virtio data transmission, address translation and
madvise(). This improves performance by about 85%.
 
The other change speeds up live migration. By skipping the guest's
free pages in the first round of data copy, needless data processing
is avoided, which saves quite a lot of CPU cycles and network
bandwidth. We put the guest's free page information in a bitmap and
send it to the host via a virtio-balloon virtqueue. For an idle 8GB
guest, this shortens the total live migration time from about 2s to
about 500ms on a 10Gbps network.
 
Dave Hansen suggested a new scheme for encoding the data structure;
because of the additional complexity, it is not implemented in v3.
 
Changes from v2 to v3:
* Change the name of 'free page' to 'unused page'.
* Use the scatter & gather bitmap instead of a 1MB page bitmap.
* Fix overwriting the page bitmap after kicking.
* Address some of MST's comments on v2.
 
Changes from v1 to v2:
* Abandon the patch for dropping page cache.
* Put some structures to uapi head file.
* Use a new way to determine the page bitmap size.
* Use a unified way to send the free page information with the bitmap.
* Address the issues referred to in MST's comments.

Liang Li (7):
  virtio-balloon: rework deflate to add page to a list
  virtio-balloon: define new feature bit and page bitmap head
  mm: add a function to get the max pfn
  virtio-balloon: speed up inflate/deflate process
  mm: add the related functions to get unused page
  virtio-balloon: define feature bit and head for misc virt queue
  virtio-balloon: tell host vm's unused page info

 drivers/virtio/virtio_balloon.c | 390 
 include/linux/mm.h  |   3 +
 include/uapi/linux/virtio_balloon.h |  41 
 mm/page_alloc.c |  94 +
 4 files changed, 485 insertions(+), 43 deletions(-)

-- 
1.8.3.1



[RESEND PATCH v3 kernel 3/7] mm: add a function to get the max pfn

2016-10-20 Thread Liang Li
Expose a function to get the max PFN, so it can be used by the
virtio-balloon device driver. Simply including 'linux/bootmem.h' is
not enough: if the device driver is built as a module, referring to
max_pfn directly leads to a build failure.
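
A minimal kernel-context sketch of the intended use (illustrative
only; since max_pfn is just a hint, bounds must be re-checked when
the structure is actually used):

        unsigned long max_pfn = get_max_pfn();
        unsigned long *bmap;

        /* size a one-bit-per-page bitmap from the current max pfn */
        bmap = vzalloc(BITS_TO_LONGS(max_pfn) * sizeof(long));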

Signed-off-by: Liang Li 
Cc: Andrew Morton 
Cc: Mel Gorman 
Cc: Michael S. Tsirkin 
Cc: Paolo Bonzini 
Cc: Cornelia Huck 
Cc: Amit Shah 
---
 include/linux/mm.h |  1 +
 mm/page_alloc.c| 10 ++
 2 files changed, 11 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ffbd729..2a89da0e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1776,6 +1776,7 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, 
pmd_t *pmd)
 extern void free_area_init_node(int nid, unsigned long * zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
 extern void free_initmem(void);
+extern unsigned long get_max_pfn(void);
 
 /*
  * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2b3bf67..e5f63a9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4426,6 +4426,16 @@ void show_free_areas(unsigned int filter)
show_swap_cache_info();
 }
 
+/*
+ * The max_pfn can change because of memory hotplug, so it's only good
+ * as a hint, e.g. for sizing data structures.
+ */
+unsigned long get_max_pfn(void)
+{
+   return max_pfn;
+}
+EXPORT_SYMBOL(get_max_pfn);
+
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
zoneref->zone = zone;
-- 
1.8.3.1



[RESEND PATCH v3 kernel 1/7] virtio-balloon: rework deflate to add page to a list

2016-10-20 Thread Liang Li
This will allow faster notifications using a bitmap down the road.
balloon_pfn_to_page() can be removed because it is no longer used.

Signed-off-by: Liang Li 
Signed-off-by: Michael S. Tsirkin 
Cc: Paolo Bonzini 
Cc: Cornelia Huck 
Cc: Amit Shah 
---
 drivers/virtio/virtio_balloon.c | 22 --
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 4e7003d..59ffe5a 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -103,12 +103,6 @@ static u32 page_to_balloon_pfn(struct page *page)
return pfn * VIRTIO_BALLOON_PAGES_PER_PAGE;
 }
 
-static struct page *balloon_pfn_to_page(u32 pfn)
-{
-   BUG_ON(pfn % VIRTIO_BALLOON_PAGES_PER_PAGE);
-   return pfn_to_page(pfn / VIRTIO_BALLOON_PAGES_PER_PAGE);
-}
-
 static void balloon_ack(struct virtqueue *vq)
 {
struct virtio_balloon *vb = vq->vdev->priv;
@@ -181,18 +175,16 @@ static unsigned fill_balloon(struct virtio_balloon *vb, 
size_t num)
return num_allocated_pages;
 }
 
-static void release_pages_balloon(struct virtio_balloon *vb)
+static void release_pages_balloon(struct virtio_balloon *vb,
+struct list_head *pages)
 {
-   unsigned int i;
-   struct page *page;
+   struct page *page, *next;
 
-   /* Find pfns pointing at start of each page, get pages and free them. */
-   for (i = 0; i < vb->num_pfns; i += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-   page = balloon_pfn_to_page(virtio32_to_cpu(vb->vdev,
-  vb->pfns[i]));
+   list_for_each_entry_safe(page, next, pages, lru) {
if (!virtio_has_feature(vb->vdev,
VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
adjust_managed_page_count(page, 1);
+   list_del(&page->lru);
put_page(page); /* balloon reference */
}
 }
@@ -202,6 +194,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, 
size_t num)
unsigned num_freed_pages;
struct page *page;
struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
+   LIST_HEAD(pages);
 
/* We can only do one array worth at a time. */
num = min(num, ARRAY_SIZE(vb->pfns));
@@ -215,6 +208,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, 
size_t num)
if (!page)
break;
set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
+   list_add(&page->lru, &pages);
vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
}
 
@@ -226,7 +220,7 @@ static unsigned leak_balloon(struct virtio_balloon *vb, 
size_t num)
 */
if (vb->num_pfns != 0)
tell_host(vb, vb->deflate_vq);
-   release_pages_balloon(vb);
+   release_pages_balloon(vb, &pages);
mutex_unlock(&vb->balloon_lock);
return num_freed_pages;
 }
-- 
1.8.3.1



[RESEND PATCH v3 kernel 2/7] virtio-balloon: define new feature bit and page bitmap head

2016-10-20 Thread Liang Li
Add a new feature which supports sending the page information with
a bitmap. The current implementation uses a PFN array, which is not
very efficient. Using a bitmap can improve the performance of
inflating/deflating significantly.

The page bitmap header is used to tell the host some information
about the page bitmap, e.g. the page size, the bitmap length and
the start PFN.
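
The header is a fixed 32-byte layout. A standalone sanity sketch (the
typedefs stand in for the __virtioNN types, which are fixed-size
integers):

#include <stdint.h>

typedef uint16_t __virtio16;
typedef uint64_t __virtio64;

struct balloon_bmap_hdr {
        __virtio16 cmd, page_shift, flag, reserved;
        __virtio64 req_id, start_pfn, bmap_len;
};

_Static_assert(sizeof(struct balloon_bmap_hdr) == 32,
               "header must stay 32 bytes for the device ABI");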

Signed-off-by: Liang Li 
Cc: Michael S. Tsirkin 
Cc: Paolo Bonzini 
Cc: Cornelia Huck 
Cc: Amit Shah 
---
 include/uapi/linux/virtio_balloon.h | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/include/uapi/linux/virtio_balloon.h 
b/include/uapi/linux/virtio_balloon.h
index 343d7dd..d3b182a 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -34,6 +34,7 @@
 #define VIRTIO_BALLOON_F_MUST_TELL_HOST0 /* Tell before reclaiming 
pages */
 #define VIRTIO_BALLOON_F_STATS_VQ  1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM2 /* Deflate balloon on OOM */
+#define VIRTIO_BALLOON_F_PAGE_BITMAP   3 /* Send page info with bitmap */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
@@ -82,4 +83,22 @@ struct virtio_balloon_stat {
__virtio64 val;
 } __attribute__((packed));
 
+/* Page bitmap header structure */
+struct balloon_bmap_hdr {
+   /* Used to distinguish different requests */
+   __virtio16 cmd;
+   /* Shift width of page in the bitmap */
+   __virtio16 page_shift;
+   /* Flag used to identify different statuses */
+   __virtio16 flag;
+   /* Reserved */
+   __virtio16 reserved;
+   /* ID of the request */
+   __virtio64 req_id;
+   /* The PFN corresponding to bit 0 of the bitmap */
+   __virtio64 start_pfn;
+   /* The length of the bitmap, in bytes */
+   __virtio64 bmap_len;
+};
+
 #endif /* _LINUX_VIRTIO_BALLOON_H */
-- 
1.8.3.1



Re: [PATCH v5 0/9] implement vcpu preempted check

2016-10-20 Thread Peter Zijlstra
On Thu, Oct 20, 2016 at 05:27:45PM -0400, Pan Xinhui wrote:

> 
> This patch set aims to fix lock holder preemption issues.

Thanks, this looks very good. I'll wait for ACKs from at least the KVM
people, since that was I think the most contentious patch.


Re: [PATCH v5 7/9] x86, xen: support vcpu preempted check

2016-10-20 Thread Juergen Gross
Corrected xen-devel mailing list address, added other Xen maintainers

On 20/10/16 23:27, Pan Xinhui wrote:
> From: Juergen Gross 
> 
> Support the vcpu_is_preempted() functionality under Xen. This will
> enhance lock performance on overcommitted hosts (more runnable vcpus
> than physical cpus in the system) as doing busy waits for preempted
> vcpus will hurt system performance far worse than early yielding.
> 
> A quick test (4 vcpus on 1 physical cpu doing a parallel build job
> with "make -j 8") reduced system time by about 5% with this patch.
> 
> Signed-off-by: Juergen Gross 
> Signed-off-by: Pan Xinhui 
> ---
>  arch/x86/xen/spinlock.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
> index 3d6e006..74756bb 100644
> --- a/arch/x86/xen/spinlock.c
> +++ b/arch/x86/xen/spinlock.c
> @@ -114,7 +114,6 @@ void xen_uninit_lock_cpu(int cpu)
>   per_cpu(irq_name, cpu) = NULL;
>  }
>  
> -
>  /*
>   * Our init of PV spinlocks is split in two init functions due to us
>   * using paravirt patching and jump labels patching and having to do
> @@ -137,6 +136,8 @@ void __init xen_init_spinlocks(void)
>   pv_lock_ops.queued_spin_unlock = 
> PV_CALLEE_SAVE(__pv_queued_spin_unlock);
>   pv_lock_ops.wait = xen_qlock_wait;
>   pv_lock_ops.kick = xen_qlock_kick;
> +
> + pv_lock_ops.vcpu_is_preempted = xen_vcpu_stolen;
>  }
>  
>  /*
> 



Re: [PATCH net-next v2 6/9] net: use core MTU range checking in virt drivers

2016-10-20 Thread Michael S. Tsirkin
On Thu, Oct 20, 2016 at 10:37:20PM -0400, Jarod Wilson wrote:
> On Thu, Oct 20, 2016 at 11:23:54PM +0300, Michael S. Tsirkin wrote:
> > On Thu, Oct 20, 2016 at 01:55:21PM -0400, Jarod Wilson wrote:
> ...
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index fad84f3..720809f 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -1419,17 +1419,6 @@ static const struct ethtool_ops 
> > > virtnet_ethtool_ops = {
> > >   .set_settings = virtnet_set_settings,
> > >  };
> > >  
> > > -#define MIN_MTU 68
> > > -#define MAX_MTU 65535
> > > -
> > > -static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
> > > -{
> > > - if (new_mtu < MIN_MTU || new_mtu > MAX_MTU)
> > > - return -EINVAL;
> > > - dev->mtu = new_mtu;
> > > - return 0;
> > > -}
> > > -
> > >  static const struct net_device_ops virtnet_netdev = {
> > >   .ndo_open= virtnet_open,
> > >   .ndo_stop= virtnet_close,
> > > @@ -1437,7 +1426,6 @@ static const struct net_device_ops virtnet_netdev = 
> > > {
> > >   .ndo_validate_addr   = eth_validate_addr,
> > >   .ndo_set_mac_address = virtnet_set_mac_address,
> > >   .ndo_set_rx_mode = virtnet_set_rx_mode,
> > > - .ndo_change_mtu  = virtnet_change_mtu,
> > >   .ndo_get_stats64 = virtnet_stats,
> > >   .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
> > >   .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
> > > @@ -1748,6 +1736,9 @@ static bool virtnet_validate_features(struct 
> > > virtio_device *vdev)
> > >   return true;
> > >  }
> > >  
> > > +#define MIN_MTU ETH_MIN_MTU
> > > +#define MAX_MTU ETH_MAX_MTU
> > > +
> > 
> > Can we drop these btw?
> 
> Bah. Yeah. Should have just used them directly. I didn't add ETH_MAX_MTU
> until after doing the virtio_net changes, so I missed that.
> 
> > >  static int virtnet_probe(struct virtio_device *vdev)
> > >  {
> > >   int i, err;
> > > @@ -1821,6 +1812,10 @@ static int virtnet_probe(struct virtio_device 
> > > *vdev)
> > >  
> > >   dev->vlan_features = dev->features;
> > >  
> > > + /* MTU range: 68 - 65535 */
> > > + dev->min_mtu = MIN_MTU;
> > > + dev->max_mtu = MAX_MTU;
> > > +
> > >   /* Configuration may specify what MAC to use.  Otherwise random. */
> > >   if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
> > >   virtio_cread_bytes(vdev,
> > > @@ -1875,8 +1870,10 @@ static int virtnet_probe(struct virtio_device 
> > > *vdev)
> > >   mtu = virtio_cread16(vdev,
> > >offsetof(struct virtio_net_config,
> > > mtu));
> > > - if (virtnet_change_mtu(dev, mtu))
> > > + if (mtu < dev->min_mtu || mtu > dev->max_mtu)
> > 
> > In fact the > max_mtu branch does not make sense since a 16 bit
> > value can't exceed MAX_MTU.
> 
> Hm. mtu is declared as an int, not sure if there's any sort of type
> promotion to be worried about (not an area I know much/anything about).

Not by design, that's for sure.

> Certainly something that could be looked into as a minor optimization,
> though it's only in a probe path and shouldn't hurt anything, so ... meh?

Right. Aaron said he's working on a patch that essentially does
dev->max_mtu = mtu after validation, so this part will look
a bit silly there.

> -- 
> Jarod Wilson
> ja...@redhat.com


Re: [PATCH net-next v2 6/9] net: use core MTU range checking in virt drivers

2016-10-20 Thread Jarod Wilson
On Thu, Oct 20, 2016 at 11:23:54PM +0300, Michael S. Tsirkin wrote:
> On Thu, Oct 20, 2016 at 01:55:21PM -0400, Jarod Wilson wrote:
...
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index fad84f3..720809f 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -1419,17 +1419,6 @@ static const struct ethtool_ops virtnet_ethtool_ops 
> > = {
> > .set_settings = virtnet_set_settings,
> >  };
> >  
> > -#define MIN_MTU 68
> > -#define MAX_MTU 65535
> > -
> > -static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
> > -{
> > -   if (new_mtu < MIN_MTU || new_mtu > MAX_MTU)
> > -   return -EINVAL;
> > -   dev->mtu = new_mtu;
> > -   return 0;
> > -}
> > -
> >  static const struct net_device_ops virtnet_netdev = {
> > .ndo_open= virtnet_open,
> > .ndo_stop= virtnet_close,
> > @@ -1437,7 +1426,6 @@ static const struct net_device_ops virtnet_netdev = {
> > .ndo_validate_addr   = eth_validate_addr,
> > .ndo_set_mac_address = virtnet_set_mac_address,
> > .ndo_set_rx_mode = virtnet_set_rx_mode,
> > -   .ndo_change_mtu  = virtnet_change_mtu,
> > .ndo_get_stats64 = virtnet_stats,
> > .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
> > .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
> > @@ -1748,6 +1736,9 @@ static bool virtnet_validate_features(struct 
> > virtio_device *vdev)
> > return true;
> >  }
> >  
> > +#define MIN_MTU ETH_MIN_MTU
> > +#define MAX_MTU ETH_MAX_MTU
> > +
> 
> Can we drop these btw?

Bah. Yeah. Should have just used them directly. I didn't add ETH_MAX_MTU
until after doing the virtio_net changes, so I missed that.

> >  static int virtnet_probe(struct virtio_device *vdev)
> >  {
> > int i, err;
> > @@ -1821,6 +1812,10 @@ static int virtnet_probe(struct virtio_device *vdev)
> >  
> > dev->vlan_features = dev->features;
> >  
> > +   /* MTU range: 68 - 65535 */
> > +   dev->min_mtu = MIN_MTU;
> > +   dev->max_mtu = MAX_MTU;
> > +
> > /* Configuration may specify what MAC to use.  Otherwise random. */
> > if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
> > virtio_cread_bytes(vdev,
> > @@ -1875,8 +1870,10 @@ static int virtnet_probe(struct virtio_device *vdev)
> > mtu = virtio_cread16(vdev,
> >  offsetof(struct virtio_net_config,
> >   mtu));
> > -   if (virtnet_change_mtu(dev, mtu))
> > +   if (mtu < dev->min_mtu || mtu > dev->max_mtu)
> 
> In fact the > max_mtu branch does not make sense since a 16 bit
> value can't exceed MAX_MTU.

Hm. mtu is declared as an int, not sure if there's any sort of type
promotion to be worried about (not an area I know much/anything about).
Certainly something that could be looked into as a minor optimization,
though it's only in a probe path and shouldn't hurt anything, so ... meh?

-- 
Jarod Wilson
ja...@redhat.com



Re: [PATCH v5 9/9] Documentation: virtual: kvm: Support vcpu preempted check

2016-10-20 Thread Pan Xinhui



On 2016/10/21 09:23, Boqun Feng wrote:

On Thu, Oct 20, 2016 at 05:27:54PM -0400, Pan Xinhui wrote:

Commit ("x86, kvm: support vcpu preempted check") add one field "__u8
preempted" into struct kvm_steal_time. This field tells if one vcpu is
running or not.

It is zero if 1) some old KVM deos not support this filed. 2) the vcpu is
preempted. Other values means the vcpu has been preempted.

  ^
s/preempted/not preempted


Yes, the loss of *not* definitely should be avoided.

And better to fix other typos in the commit log ;-)
Maybe you can try aspell? That works for me.


I will try it. :)


Regards,
Boqun



Signed-off-by: Pan Xinhui 
---
 Documentation/virtual/kvm/msr.txt | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/msr.txt 
b/Documentation/virtual/kvm/msr.txt
index 2a71c8f..3376f13 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -208,7 +208,8 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
__u64 steal;
__u32 version;
__u32 flags;
-   __u32 pad[12];
+   __u8  preempted;
+   __u32 pad[11];
}

whose data will be filled in by the hypervisor periodically. Only one
@@ -232,6 +233,11 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
nanoseconds. Time during which the vcpu is idle, will not be
reported as steal time.

+   preempted: indicate the VCPU who owns this struct is running or
+   not. Non-zero values mean the VCPU has been preempted. Zero
+   means the VCPU is not preempted. NOTE, it is always zero if the
+   the hypervisor doesn't support this field.
+
 MSR_KVM_EOI_EN: 0x4b564d04
data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0
when disabled.  Bit 1 is reserved and must be zero.  When PV end of
--
2.4.11




Re: [PATCH v5 9/9] Documentation: virtual: kvm: Support vcpu preempted check

2016-10-20 Thread Boqun Feng
On Thu, Oct 20, 2016 at 05:27:54PM -0400, Pan Xinhui wrote:
> Commit ("x86, kvm: support vcpu preempted check") add one field "__u8
> preempted" into struct kvm_steal_time. This field tells if one vcpu is
> running or not.
> 
> It is zero if 1) some old KVM deos not support this filed. 2) the vcpu is
> preempted. Other values means the vcpu has been preempted.
  ^
s/preempted/not preempted

And better to fix other typos in the commit log ;-)
Maybe you can try aspell? That works for me.

Regards,
Boqun

> 
> Signed-off-by: Pan Xinhui 
> ---
>  Documentation/virtual/kvm/msr.txt | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/virtual/kvm/msr.txt 
> b/Documentation/virtual/kvm/msr.txt
> index 2a71c8f..3376f13 100644
> --- a/Documentation/virtual/kvm/msr.txt
> +++ b/Documentation/virtual/kvm/msr.txt
> @@ -208,7 +208,8 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
>   __u64 steal;
>   __u32 version;
>   __u32 flags;
> - __u32 pad[12];
> + __u8  preempted;
> + __u32 pad[11];
>   }
>  
>   whose data will be filled in by the hypervisor periodically. Only one
> @@ -232,6 +233,11 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
>   nanoseconds. Time during which the vcpu is idle, will not be
>   reported as steal time.
>  
> + preempted: indicate the VCPU who owns this struct is running or
> + not. Non-zero values mean the VCPU has been preempted. Zero
> + means the VCPU is not preempted. NOTE, it is always zero if the
> + the hypervisor doesn't support this field.
> +
>  MSR_KVM_EOI_EN: 0x4b564d04
>   data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0
>   when disabled.  Bit 1 is reserved and must be zero.  When PV end of
> -- 
> 2.4.11
> 



Re: [PATCH net-next v2 6/9] net: use core MTU range checking in virt drivers

2016-10-20 Thread Michael S. Tsirkin
On Thu, Oct 20, 2016 at 01:55:21PM -0400, Jarod Wilson wrote:
> hyperv_net:
> - set min/max_mtu, per Haiyang, after rndis_filter_device_add
> 
> virtio_net:
> - set min/max_mtu
> - remove virtnet_change_mtu
> vmxnet3:
> - set min/max_mtu
> 
> xen-netback:
> - min_mtu = 0, max_mtu = 65517
> 
> xen-netfront:
> - min_mtu = 0, max_mtu = 65535
> 
> unisys/visor:
> - clean up defines a little to not clash with network core or add
>   redundant definitions
> 
> CC: net...@vger.kernel.org
> CC: virtualization@lists.linux-foundation.org
> CC: "K. Y. Srinivasan" 
> CC: Haiyang Zhang 
> CC: "Michael S. Tsirkin" 
> CC: Shrikrishna Khare 
> CC: "VMware, Inc." 
> CC: Wei Liu 
> CC: Paul Durrant 
> CC: David Kershner 
> Signed-off-by: Jarod Wilson 
> ---
>  drivers/net/hyperv/hyperv_net.h |  4 ++--
>  drivers/net/hyperv/netvsc_drv.c | 14 +++---
>  drivers/net/virtio_net.c| 23 ++-
>  drivers/net/vmxnet3/vmxnet3_drv.c   |  7 ---
>  drivers/net/xen-netback/interface.c |  5 -
>  drivers/net/xen-netfront.c  |  2 ++
>  drivers/staging/unisys/include/iochannel.h  | 10 --
>  drivers/staging/unisys/visornic/visornic_main.c |  4 ++--
>  8 files changed, 35 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
> index f4fbcb5..3958ada 100644
> --- a/drivers/net/hyperv/hyperv_net.h
> +++ b/drivers/net/hyperv/hyperv_net.h
> @@ -606,8 +606,8 @@ struct nvsp_message {
>  } __packed;
>  
>  
> -#define NETVSC_MTU 65536
> -#define NETVSC_MTU_MIN 68
> +#define NETVSC_MTU 65535
> +#define NETVSC_MTU_MIN ETH_MIN_MTU
>  
>  #define NETVSC_RECEIVE_BUFFER_SIZE   (1024*1024*16)  /* 16MB */
>  #define NETVSC_RECEIVE_BUFFER_SIZE_LEGACY(1024*1024*15)  /* 15MB */
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index f0919bd..3b28cf1 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -872,19 +872,12 @@ static int netvsc_change_mtu(struct net_device *ndev, 
> int mtu)
>   struct netvsc_device *nvdev = ndevctx->nvdev;
>   struct hv_device *hdev = ndevctx->device_ctx;
>   struct netvsc_device_info device_info;
> - int limit = ETH_DATA_LEN;
>   u32 num_chn;
>   int ret = 0;
>  
>   if (ndevctx->start_remove || !nvdev || nvdev->destroy)
>   return -ENODEV;
>  
> - if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
> - limit = NETVSC_MTU - ETH_HLEN;
> -
> - if (mtu < NETVSC_MTU_MIN || mtu > limit)
> - return -EINVAL;
> -
>   ret = netvsc_close(ndev);
>   if (ret)
>   goto out;
> @@ -1402,6 +1395,13 @@ static int netvsc_probe(struct hv_device *dev,
>   netif_set_real_num_tx_queues(net, nvdev->num_chn);
>   netif_set_real_num_rx_queues(net, nvdev->num_chn);
>  
> + /* MTU range: 68 - 1500 or 65521 */
> + net->min_mtu = NETVSC_MTU_MIN;
> + if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
> + net->max_mtu = NETVSC_MTU - ETH_HLEN;
> + else
> + net->max_mtu = ETH_DATA_LEN;
> +
>   ret = register_netdev(net);
>   if (ret != 0) {
>   pr_err("Unable to register netdev.\n");
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index fad84f3..720809f 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -1419,17 +1419,6 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
>   .set_settings = virtnet_set_settings,
>  };
>  
> -#define MIN_MTU 68
> -#define MAX_MTU 65535
> -
> -static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
> -{
> - if (new_mtu < MIN_MTU || new_mtu > MAX_MTU)
> - return -EINVAL;
> - dev->mtu = new_mtu;
> - return 0;
> -}
> -
>  static const struct net_device_ops virtnet_netdev = {
>   .ndo_open= virtnet_open,
>   .ndo_stop= virtnet_close,
> @@ -1437,7 +1426,6 @@ static const struct net_device_ops virtnet_netdev = {
>   .ndo_validate_addr   = eth_validate_addr,
>   .ndo_set_mac_address = virtnet_set_mac_address,
>   .ndo_set_rx_mode = virtnet_set_rx_mode,
> - .ndo_change_mtu  = virtnet_change_mtu,
>   .ndo_get_stats64 = virtnet_stats,
>   .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
>   .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
> @@ -1748,6 +1736,9 @@ static bool virtnet_validate_features(struct 
> virtio_device *vdev)
>   return true;
>  }
>  
> +#define MIN_MTU ETH_MIN_MTU
> +#define MAX_MTU ETH_MAX_MTU
> +

Can we drop these btw?

>  static int virtnet_probe(struct virtio_device *vdev)
>  {
>   int i, err;
> @@ -1821,6 +1812,10 @@ static int virtnet_probe(struct virtio_device *vdev)
>  
>   dev->vlan_features = dev->features;
>  
> + /* MTU range: 68 - 65535 */
> + dev->min_mtu = MIN_MTU;
> + dev->max_

Re: [PATCH net-next 5/6] net: use core MTU range checking in virt drivers

2016-10-20 Thread Shrikrishna Khare


On Wed, 19 Oct 2016, Jarod Wilson wrote:

> hyperv_net:
> - set min/max_mtu
> 
> virtio_net:
> - set min/max_mtu
> - remove virtnet_change_mtu
> 
> vmxnet3:
> - set min/max_mtu
> 
> CC: net...@vger.kernel.org
> CC: virtualization@lists.linux-foundation.org
> CC: "K. Y. Srinivasan" 
> CC: Haiyang Zhang 
> CC: "Michael S. Tsirkin" 
> CC: Shrikrishna Khare 
> CC: "VMware, Inc." 
> Signed-off-by: Jarod Wilson 
> ---

The vmxnet3 part of the change looks good to me.

Thanks,
Shri


RE: [PATCH net-next v2 6/9] net: use core MTU range checking in virt drivers

2016-10-20 Thread Haiyang Zhang via Virtualization


> -Original Message-
> From: Jarod Wilson [mailto:ja...@redhat.com]
> Sent: Thursday, October 20, 2016 1:55 PM
> To: linux-ker...@vger.kernel.org
> Cc: Jarod Wilson ; net...@vger.kernel.org;
> virtualization@lists.linux-foundation.org; KY Srinivasan
> ; Haiyang Zhang ; Michael S.
> Tsirkin ; Shrikrishna Khare ; VMware,
> Inc. ; Wei Liu ; Paul
> Durrant ; David Kershner
> 
> Subject: [PATCH net-next v2 6/9] net: use core MTU range checking in
> virt drivers
> 
> hyperv_net:
> - set min/max_mtu, per Haiyang, after rndis_filter_device_add
> 
> virtio_net:
> - set min/max_mtu
> - remove virtnet_change_mtu
> 
> vmxnet3:
> - set min/max_mtu
> 
> xen-netback:
> - min_mtu = 0, max_mtu = 65517
> 
> xen-netfront:
> - min_mtu = 0, max_mtu = 65535
> 
> unisys/visor:
> - clean up defines a little to not clash with network core or add
>   redundant definitions
> 
> CC: net...@vger.kernel.org
> CC: virtualization@lists.linux-foundation.org
> CC: "K. Y. Srinivasan" 
> CC: Haiyang Zhang 
> CC: "Michael S. Tsirkin" 
> CC: Shrikrishna Khare 
> CC: "VMware, Inc." 
> CC: Wei Liu 
> CC: Paul Durrant 
> CC: David Kershner 
> Signed-off-by: Jarod Wilson 
> ---

The hv_netvsc changes look fine. Thanks.

Reviewed-by: Haiyang Zhang 




[PATCH net-next v2 6/9] net: use core MTU range checking in virt drivers

2016-10-20 Thread Jarod Wilson
hyperv_net:
- set min/max_mtu, per Haiyang, after rndis_filter_device_add

virtio_net:
- set min/max_mtu
- remove virtnet_change_mtu

vmxnet3:
- set min/max_mtu

xen-netback:
- min_mtu = 0, max_mtu = 65517

xen-netfront:
- min_mtu = 0, max_mtu = 65535

unisys/visor:
- clean up defines a little to not clash with network core or add
  redundant definitions
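
For reference, the core range check these drivers now rely on is
roughly the following (a simplified sketch of the net core behaviour,
not the exact dev_set_mtu() code):

        /* simplified sketch: what the core now does for every driver */
        if (new_mtu < dev->min_mtu || new_mtu > dev->max_mtu)
                return -EINVAL;
        /* then dev->mtu is set, via ndo_change_mtu() if one is provided */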

CC: net...@vger.kernel.org
CC: virtualization@lists.linux-foundation.org
CC: "K. Y. Srinivasan" 
CC: Haiyang Zhang 
CC: "Michael S. Tsirkin" 
CC: Shrikrishna Khare 
CC: "VMware, Inc." 
CC: Wei Liu 
CC: Paul Durrant 
CC: David Kershner 
Signed-off-by: Jarod Wilson 
---
 drivers/net/hyperv/hyperv_net.h |  4 ++--
 drivers/net/hyperv/netvsc_drv.c | 14 +++---
 drivers/net/virtio_net.c| 23 ++-
 drivers/net/vmxnet3/vmxnet3_drv.c   |  7 ---
 drivers/net/xen-netback/interface.c |  5 -
 drivers/net/xen-netfront.c  |  2 ++
 drivers/staging/unisys/include/iochannel.h  | 10 --
 drivers/staging/unisys/visornic/visornic_main.c |  4 ++--
 8 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index f4fbcb5..3958ada 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -606,8 +606,8 @@ struct nvsp_message {
 } __packed;
 
 
-#define NETVSC_MTU 65536
-#define NETVSC_MTU_MIN 68
+#define NETVSC_MTU 65535
+#define NETVSC_MTU_MIN ETH_MIN_MTU
 
 #define NETVSC_RECEIVE_BUFFER_SIZE (1024*1024*16)  /* 16MB */
 #define NETVSC_RECEIVE_BUFFER_SIZE_LEGACY  (1024*1024*15)  /* 15MB */
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index f0919bd..3b28cf1 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -872,19 +872,12 @@ static int netvsc_change_mtu(struct net_device *ndev, int 
mtu)
struct netvsc_device *nvdev = ndevctx->nvdev;
struct hv_device *hdev = ndevctx->device_ctx;
struct netvsc_device_info device_info;
-   int limit = ETH_DATA_LEN;
u32 num_chn;
int ret = 0;
 
if (ndevctx->start_remove || !nvdev || nvdev->destroy)
return -ENODEV;
 
-   if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
-   limit = NETVSC_MTU - ETH_HLEN;
-
-   if (mtu < NETVSC_MTU_MIN || mtu > limit)
-   return -EINVAL;
-
ret = netvsc_close(ndev);
if (ret)
goto out;
@@ -1402,6 +1395,13 @@ static int netvsc_probe(struct hv_device *dev,
netif_set_real_num_tx_queues(net, nvdev->num_chn);
netif_set_real_num_rx_queues(net, nvdev->num_chn);
 
+   /* MTU range: 68 - 1500 or 65521 */
+   net->min_mtu = NETVSC_MTU_MIN;
+   if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
+   net->max_mtu = NETVSC_MTU - ETH_HLEN;
+   else
+   net->max_mtu = ETH_DATA_LEN;
+
ret = register_netdev(net);
if (ret != 0) {
pr_err("Unable to register netdev.\n");
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fad84f3..720809f 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1419,17 +1419,6 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
.set_settings = virtnet_set_settings,
 };
 
-#define MIN_MTU 68
-#define MAX_MTU 65535
-
-static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
-{
-   if (new_mtu < MIN_MTU || new_mtu > MAX_MTU)
-   return -EINVAL;
-   dev->mtu = new_mtu;
-   return 0;
-}
-
 static const struct net_device_ops virtnet_netdev = {
.ndo_open= virtnet_open,
.ndo_stop= virtnet_close,
@@ -1437,7 +1426,6 @@ static const struct net_device_ops virtnet_netdev = {
.ndo_validate_addr   = eth_validate_addr,
.ndo_set_mac_address = virtnet_set_mac_address,
.ndo_set_rx_mode = virtnet_set_rx_mode,
-   .ndo_change_mtu  = virtnet_change_mtu,
.ndo_get_stats64 = virtnet_stats,
.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
@@ -1748,6 +1736,9 @@ static bool virtnet_validate_features(struct 
virtio_device *vdev)
return true;
 }
 
+#define MIN_MTU ETH_MIN_MTU
+#define MAX_MTU ETH_MAX_MTU
+
 static int virtnet_probe(struct virtio_device *vdev)
 {
int i, err;
@@ -1821,6 +1812,10 @@ static int virtnet_probe(struct virtio_device *vdev)
 
dev->vlan_features = dev->features;
 
+   /* MTU range: 68 - 65535 */
+   dev->min_mtu = MIN_MTU;
+   dev->max_mtu = MAX_MTU;
+
/* Configuration may specify what MAC to use.  Otherwise random. */
if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
virtio_cread_bytes(vdev,
@@ -1875,8 +1870,10 @@ static int virtnet_probe(struct virtio_device *vdev)
  

[PATCH v5 9/9] Documentation: virtual: kvm: Support vcpu preempted check

2016-10-20 Thread Pan Xinhui
Commit ("x86, kvm: support vcpu preempted check") add one field "__u8
preempted" into struct kvm_steal_time. This field tells if one vcpu is
running or not.

It is zero if 1) some old KVM deos not support this filed. 2) the vcpu is
preempted. Other values means the vcpu has been preempted.

Signed-off-by: Pan Xinhui 
---
 Documentation/virtual/kvm/msr.txt | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/msr.txt 
b/Documentation/virtual/kvm/msr.txt
index 2a71c8f..3376f13 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -208,7 +208,8 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
__u64 steal;
__u32 version;
__u32 flags;
-   __u32 pad[12];
+   __u8  preempted;
+   __u32 pad[11];
}
 
whose data will be filled in by the hypervisor periodically. Only one
@@ -232,6 +233,11 @@ MSR_KVM_STEAL_TIME: 0x4b564d03
nanoseconds. Time during which the vcpu is idle, will not be
reported as steal time.
 
+   preempted: indicates whether the VCPU that owns this struct is
+   running or not. A non-zero value means the VCPU has been
+   preempted. Zero means the VCPU is not preempted. NOTE: it is
+   always zero if the hypervisor doesn't support this field.
+
 MSR_KVM_EOI_EN: 0x4b564d04
data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0
when disabled.  Bit 1 is reserved and must be zero.  When PV end of
-- 
2.4.11



[PATCH v5 7/9] x86, xen: support vcpu preempted check

2016-10-20 Thread Pan Xinhui
From: Juergen Gross 

Support the vcpu_is_preempted() functionality under Xen. This will
enhance lock performance on overcommitted hosts (more runnable vcpus
than physical cpus in the system) as doing busy waits for preempted
vcpus will hurt system performance far worse than early yielding.

A quick test (4 vcpus on 1 physical cpu doing a parallel build job
with "make -j 8") reduced system time by about 5% with this patch.

Signed-off-by: Juergen Gross 
Signed-off-by: Pan Xinhui 
---
 arch/x86/xen/spinlock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 3d6e006..74756bb 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -114,7 +114,6 @@ void xen_uninit_lock_cpu(int cpu)
per_cpu(irq_name, cpu) = NULL;
 }
 
-
 /*
  * Our init of PV spinlocks is split in two init functions due to us
  * using paravirt patching and jump labels patching and having to do
@@ -137,6 +136,8 @@ void __init xen_init_spinlocks(void)
pv_lock_ops.queued_spin_unlock = 
PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = xen_qlock_wait;
pv_lock_ops.kick = xen_qlock_kick;
+
+   pv_lock_ops.vcpu_is_preempted = xen_vcpu_stolen;
 }
 
 /*
-- 
2.4.11



[PATCH v5 8/9] s390/spinlock: Provide vcpu_is_preempted

2016-10-20 Thread Pan Xinhui
From: Christian Borntraeger 

This implements the s390 backend for commit
"kernel/sched: introduce vcpu preempted check interface"
by reworking the existing smp_vcpu_scheduled into
arch_vcpu_is_preempted. We can then also get rid of the
local cpu_is_preempted function by moving the
CIF_ENABLED_WAIT test into arch_vcpu_is_preempted.

Signed-off-by: Christian Borntraeger 
Acked-by: Heiko Carstens 
---
 arch/s390/include/asm/spinlock.h |  8 
 arch/s390/kernel/smp.c   |  9 +++--
 arch/s390/lib/spinlock.c | 25 -
 3 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 7e9e09f..7ecd890 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -23,6 +23,14 @@ _raw_compare_and_swap(unsigned int *lock, unsigned int old, 
unsigned int new)
return __sync_bool_compare_and_swap(lock, old, new);
 }
 
+#ifndef CONFIG_SMP
+static inline bool arch_vcpu_is_preempted(int cpu) { return false; }
+#else
+bool arch_vcpu_is_preempted(int cpu);
+#endif
+
+#define vcpu_is_preempted arch_vcpu_is_preempted
+
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
  * on the local processor, one does not.
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 35531fe..b988ed1 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -368,10 +368,15 @@ int smp_find_processor_id(u16 address)
return -1;
 }
 
-int smp_vcpu_scheduled(int cpu)
+bool arch_vcpu_is_preempted(int cpu)
 {
-   return pcpu_running(pcpu_devices + cpu);
+   if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
+   return false;
+   if (pcpu_running(pcpu_devices + cpu))
+   return false;
+   return true;
 }
+EXPORT_SYMBOL(arch_vcpu_is_preempted);
 
 void smp_yield_cpu(int cpu)
 {
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index e5f50a7..e48a48e 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -37,15 +37,6 @@ static inline void _raw_compare_and_delay(unsigned int 
*lock, unsigned int old)
asm(".insn rsy,0xeb22,%0,0,%1" : : "d" (old), "Q" (*lock));
 }
 
-static inline int cpu_is_preempted(int cpu)
-{
-   if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
-   return 0;
-   if (smp_vcpu_scheduled(cpu))
-   return 0;
-   return 1;
-}
-
 void arch_spin_lock_wait(arch_spinlock_t *lp)
 {
unsigned int cpu = SPINLOCK_LOCKVAL;
@@ -62,7 +53,7 @@ void arch_spin_lock_wait(arch_spinlock_t *lp)
continue;
}
/* First iteration: check if the lock owner is running. */
-   if (first_diag && cpu_is_preempted(~owner)) {
+   if (first_diag && arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
continue;
@@ -81,7 +72,7 @@ void arch_spin_lock_wait(arch_spinlock_t *lp)
 * yield the CPU unconditionally. For LPAR rely on the
 * sense running status.
 */
-   if (!MACHINE_IS_LPAR || cpu_is_preempted(~owner)) {
+   if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
}
@@ -108,7 +99,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned 
long flags)
continue;
}
/* Check if the lock owner is running. */
-   if (first_diag && cpu_is_preempted(~owner)) {
+   if (first_diag && arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
continue;
@@ -127,7 +118,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, 
unsigned long flags)
 * yield the CPU unconditionally. For LPAR rely on the
 * sense running status.
 */
-   if (!MACHINE_IS_LPAR || cpu_is_preempted(~owner)) {
+   if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) {
smp_yield_cpu(~owner);
first_diag = 0;
}
@@ -165,7 +156,7 @@ void _raw_read_lock_wait(arch_rwlock_t *rw)
owner = 0;
while (1) {
if (count-- <= 0) {
-   if (owner && cpu_is_preempted(~owner))
+   if (owner && arch_vcpu_is_preempted(~owner))
smp_yield_cpu(~owner);
count = spin_retry;
}
@@ -211,7 +202,7 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, unsigned int 
prev)
owner = 0;
while (1) {
if (count-- <= 0) {
-   if (owner && cpu_is_preempted(~owner))
+   if

[PATCH v5 6/9] x86, kvm: support vcpu preempted check

2016-10-20 Thread Pan Xinhui
Support the vcpu_is_preempted() functionality under KVM. This will
enhance lock performance on overcommitted hosts (more runnable vcpus
than physical cpus in the system) as doing busy waits for preempted
vcpus will hurt system performance far worse than early yielding.

Use one field of struct kvm_steal_time to indicate whether a vcpu
is running or not.

unix benchmark result:
host:  kernel 4.8.1, i5-4570, 4 cpus
guest: kernel 4.8.1, 8 vcpus

test-case   after-patch   before-patch
Execl Throughput   |18307.9 lps  |11701.6 lps
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks|   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput| 11872208.7 lps  | 11855628.9 lps
Pipe-based Context Switching   |  1495126.5 lps  |  1490533.9 lps
Process Creation   |29881.2 lps  |28572.8 lps
Shell Scripts (1 concurrent)   |23224.3 lpm  |22607.4 lpm
Shell Scripts (8 concurrent)   | 3531.4 lpm  | 3211.9 lpm
System Call Overhead   | 10385653.0 lps  | 10419979.0 lps

Signed-off-by: Pan Xinhui 
---
 arch/x86/include/uapi/asm/kvm_para.h |  3 ++-
 arch/x86/kernel/kvm.c| 12 
 arch/x86/kvm/x86.c   | 18 ++
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/uapi/asm/kvm_para.h 
b/arch/x86/include/uapi/asm/kvm_para.h
index 94dc8ca..b3fec56 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -45,7 +45,8 @@ struct kvm_steal_time {
__u64 steal;
__u32 version;
__u32 flags;
-   __u32 pad[12];
+   __u8 preempted;
+   __u32 pad[11];
 };
 
 #define KVM_STEAL_ALIGNMENT_BITS 5
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index edbbfc8..0b48dd2 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -415,6 +415,15 @@ void kvm_disable_steal_time(void)
wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
 }
 
+static bool kvm_vcpu_is_preempted(int cpu)
+{
+   struct kvm_steal_time *src;
+
+   src = &per_cpu(steal_time, cpu);
+
+   return !!src->preempted;
+}
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
@@ -471,6 +480,9 @@ void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1;
pv_time_ops.steal_clock = kvm_steal_clock;
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+   pv_lock_ops.vcpu_is_preempted = kvm_vcpu_is_preempted;
+#endif
}
 
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6c633de..a627537 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2057,6 +2057,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
return;
 
+   vcpu->arch.st.steal.preempted = 0;
+
if (vcpu->arch.st.steal.version & 1)
vcpu->arch.st.steal.version += 1;  /* first time write, random junk */
 
@@ -2810,8 +2812,24 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 }
 
+static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
+{
+   if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+   return;
+
+   if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+   &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+   return;
+
+   vcpu->arch.st.steal.preempted = 1;
+
+   kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+   &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+}
+
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+   kvm_steal_time_set_preempted(vcpu);
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
-- 
2.4.11



[PATCH v5 5/9] x86, paravirt: Add interface to support kvm/xen vcpu preempted check

2016-10-20 Thread Pan Xinhui
This is to fix some lock holder preemption issues. Several lock
implementations do a spin loop before acquiring the lock itself. The
kernel now has an interface, bool vcpu_is_preempted(int cpu), which
takes a cpu number and returns true if that vcpu is preempted. The
kernel can then break out of such spin loops based on the return value
of vcpu_is_preempted().

As the kernel already uses this interface, let's support it on x86.

To bridge the kernel and kvm/xen, add vcpu_is_preempted to struct
pv_lock_ops.

kvm or xen can then provide their own implementation of
vcpu_is_preempted.
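
A hypervisor backend then only has to fill in the hook during guest
init. A minimal sketch with assumed names (my_hv_* is hypothetical;
the real KVM wiring follows in the next patch of this series):

static bool my_hv_vcpu_is_preempted(int cpu)
{
	/* Query hypervisor-shared per-vcpu state; the details are per backend. */
	return my_hv_read_runstate(cpu) != MY_HV_RUNSTATE_RUNNING;
}

void __init my_hv_guest_init(void)
{
	pv_lock_ops.vcpu_is_preempted = my_hv_vcpu_is_preempted;
}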

Signed-off-by: Pan Xinhui 
---
 arch/x86/include/asm/paravirt_types.h | 2 ++
 arch/x86/include/asm/spinlock.h       | 8 ++++++++
 arch/x86/kernel/paravirt-spinlocks.c  | 6 ++++++
 3 files changed, 16 insertions(+)

diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 0f400c0..38c3bb7 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -310,6 +310,8 @@ struct pv_lock_ops {
 
void (*wait)(u8 *ptr, u8 val);
void (*kick)(int cpu);
+
+   bool (*vcpu_is_preempted)(int cpu);
 };
 
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 921bea7..0526f59 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -26,6 +26,14 @@
 extern struct static_key paravirt_ticketlocks_enabled;
 static __always_inline bool static_key_false(struct static_key *key);
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+   return pv_lock_ops.vcpu_is_preempted(cpu);
+}
+#endif
+
 #include 
 
 /*
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 2c55a00..2f204dd 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -21,12 +21,18 @@ bool pv_is_native_spin_unlock(void)
__raw_callee_save___native_queued_spin_unlock;
 }
 
+static bool native_vcpu_is_preempted(int cpu)
+{
+   return 0;
+}
+
 struct pv_lock_ops pv_lock_ops = {
 #ifdef CONFIG_SMP
.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
.wait = paravirt_nop,
.kick = paravirt_nop,
+   .vcpu_is_preempted = native_vcpu_is_preempted,
 #endif /* SMP */
 };
 EXPORT_SYMBOL(pv_lock_ops);
-- 
2.4.11



[PATCH v5 3/9] kernel/locking: Drop the overload of {mutex, rwsem}_spin_on_owner

2016-10-20 Thread Pan Xinhui
An over-committed guest with more vCPUs than pCPUs spends heavily in
the two spin_on_owner functions. This is caused by the lock holder
preemption issue.

The kernel has an interface, bool vcpu_is_preempted(int cpu), to check
whether a vCPU is currently running. So break the spin loops when it
returns true.
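
Both spinners converge on the same shape; a condensed sketch of the
loop after this patch (names as in kernel/locking/mutex.c):

/* Spin only while the lock holder can actually make progress. */
while (lock->owner == owner) {
	barrier();
	if (!owner->on_cpu || need_resched() ||
	    vcpu_is_preempted(task_cpu(owner)))
		break;	/* holder off-cpu or preempted: stop burning cycles */
	cpu_relax_lowlatency();
}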

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

before patch:
20.68%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner
 8.45%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 4.12%  sched-messaging  [kernel.vmlinux]  [k] system_call
 3.01%  sched-messaging  [kernel.vmlinux]  [k] system_call_common
 2.83%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 2.64%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 2.00%  sched-messaging  [kernel.vmlinux]  [k] osq_lock

after patch:
 9.99%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 5.28%  sched-messaging  [unknown] [H] 0xc00768e0
 4.27%  sched-messaging  [kernel.vmlinux]  [k] __copy_tofrom_user_power7
 3.77%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 3.24%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.02%  sched-messaging  [kernel.vmlinux]  [k] system_call
 2.69%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task

Signed-off-by: Pan Xinhui 
Acked-by: Christian Borntraeger 
Tested-by: Juergen Gross 
---
 kernel/locking/mutex.c      | 15 +++++++++++++--
 kernel/locking/rwsem-xadd.c | 16 +++++++++++++---
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index a70b90d..82108f5 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -236,7 +236,13 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
 */
barrier();
 
-   if (!owner->on_cpu || need_resched()) {
+   /*
+* Use vcpu_is_preempted() to detect the lock holder preemption
+* issue and break out. vcpu_is_preempted() is a macro defined
+* as false if the arch does not support vcpu preempted check.
+*/
+   if (!owner->on_cpu || need_resched() ||
+   vcpu_is_preempted(task_cpu(owner))) {
ret = false;
break;
}
@@ -261,8 +267,13 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
 
rcu_read_lock();
owner = READ_ONCE(lock->owner);
+
+   /*
+* Because of the lock holder preemption issue, we also skip
+* spinning if the task is not on a cpu or its cpu is preempted.
+*/
if (owner)
-   retval = owner->on_cpu;
+   retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
rcu_read_unlock();
/*
 * if lock->owner is not set, the mutex owner may have just acquired
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2337b4b..0897179 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -336,7 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
goto done;
}
 
-   ret = owner->on_cpu;
+   /*
+* Because of the lock holder preemption issue, we also skip
+* spinning if the task is not on a cpu or its cpu is preempted.
+*/
+   ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
 done:
rcu_read_unlock();
return ret;
@@ -362,8 +366,14 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
 */
barrier();
 
-   /* abort spinning when need_resched or owner is not running */
-   if (!owner->on_cpu || need_resched()) {
+   /*
+* Abort spinning when need_resched() is set, the owner is not
+* running, or the owner's cpu is preempted. vcpu_is_preempted()
+* is a macro defined as false if the arch does not support
+* vcpu preempted check.
+*/
+   if (!owner->on_cpu || need_resched() ||
+   vcpu_is_preempted(task_cpu(owner))) {
rcu_read_unlock();
return false;
}
-- 
2.4.11



[PATCH v5 4/9] powerpc/spinlock: support vcpu preempted check

2016-10-20 Thread Pan Xinhui
This is to fix some lock holder preemption issues. Several lock
implementations do a spin loop before acquiring the lock itself. The
kernel now has an interface, bool vcpu_is_preempted(int cpu), which
takes a cpu number and returns true if that vcpu is preempted. The
kernel can then break out of such spin loops based on the return value
of vcpu_is_preempted().

As the kernel already uses this interface, let's support it on powerpc.

Only pSeries needs to support it, but powerNV is built into the same
kernel image as pSeries, so we would have to return false when running
on powerNV. However, lppaca->yield_count stays zero on powerNV, so we
can simply skip the machine type check.
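
For reference, the yield_count convention relied on here (assuming the
usual shared-processor semantics, the same ones __spin_yield() uses):

/*
 * yield_count transitions (sketch):
 *
 *   running   -> preempted : hypervisor increments yield_count (now odd)
 *   preempted -> running   : hypervisor increments yield_count (now even)
 *
 * So (be32_to_cpu(lppaca_of(cpu).yield_count) & 1) is nonzero exactly
 * while the vcpu is preempted out, which is what the hunk below tests.
 */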

Suggested-by: Boqun Feng 
Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Pan Xinhui 
---
 arch/powerpc/include/asm/spinlock.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index abb6b0f..f4a9524 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -52,6 +52,14 @@
 #define SYNC_IO
 #endif
 
+#ifdef CONFIG_PPC_PSERIES
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+   return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1);
+}
+#endif
+
 #if defined(CONFIG_PPC_SPLPAR)
 /* We only yield to the hypervisor if we are in shared processor mode */
 #define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr))
-- 
2.4.11



[PATCH v5 2/9] locking/osq: Drop the overload of osq_lock()

2016-10-20 Thread Pan Xinhui
An over-committed guest with more vCPUs than pCPUs spends heavily in
osq_lock().

This is because vCPU A holds the osq lock and yields out, while vCPU B
waits for the per-cpu node->locked to be set. IOW, vCPU B waits for
vCPU A to run and unlock the osq lock.

The kernel has an interface, bool vcpu_is_preempted(int cpu), to check
whether a vCPU is currently running. So break the spin loops when it
returns true.
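
The new node_cpu() helper just undoes the +1 bias that encode_cpu()
applies (0 is reserved to mean "no cpu"), so the previous node's real
cpu number can be handed to vcpu_is_preempted(). The resulting wait
loop, condensed:

/* Wait for the previous lock holder to hand the osq lock over. */
while (!READ_ONCE(node->locked)) {
	if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
		goto unqueue;	/* bail out and block instead of spinning */
	cpu_relax_lowlatency();
}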

test case:
perf record -a perf bench sched messaging -g 400 -p && perf report

before patch:
18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

after patch:
20.68%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner
 8.45%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 4.12%  sched-messaging  [kernel.vmlinux]  [k] system_call
 3.01%  sched-messaging  [kernel.vmlinux]  [k] system_call_common
 2.83%  sched-messaging  [kernel.vmlinux]  [k] copypage_power7
 2.64%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 2.00%  sched-messaging  [kernel.vmlinux]  [k] osq_lock

Suggested-by: Boqun Feng 
Signed-off-by: Pan Xinhui 
Acked-by: Christian Borntraeger 
Tested-by: Juergen Gross 
---
 kernel/locking/osq_lock.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 05a3785..39d1385 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr)
return cpu_nr + 1;
 }
 
+static inline int node_cpu(struct optimistic_spin_node *node)
+{
+   return node->cpu - 1;
+}
+
 static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
 {
int cpu_nr = encoded_cpu_val - 1;
@@ -118,8 +123,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
while (!READ_ONCE(node->locked)) {
/*
 * If we need to reschedule bail... so we can block.
+* Use vcpu_is_preempted() to detect the lock holder preemption
+* issue and break out. vcpu_is_preempted() is a macro defined
+* as false if the arch does not support vcpu preempted check.
 */
-   if (need_resched())
+   if (need_resched() || vcpu_is_preempted(node_cpu(node->prev)))
goto unqueue;
 
cpu_relax_lowlatency();
-- 
2.4.11



[PATCH v5 1/9] kernel/sched: introduce vcpu preempted check interface

2016-10-20 Thread Pan Xinhui
This patch supports fixing the lock holder preemption issue.

Kernel users can call bool vcpu_is_preempted(int cpu) to detect whether
a vcpu is preempted or not.

The default implementation is a macro defined as false, so the compiler
can optimize it out if the arch does not support such a vcpu preempted
check.
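
An arch opts in by defining the macro to its own inline helper, which
makes the generic #ifndef fallback stand down; a minimal sketch
(arch_query_preempted() is a hypothetical arch facility):

/* arch/<arch>/include/asm/spinlock.h (sketch) */
#define vcpu_is_preempted vcpu_is_preempted
static inline bool vcpu_is_preempted(int cpu)
{
	return arch_query_preempted(cpu);
}

Without such a definition, vcpu_is_preempted(cpu) evaluates to the
constant false and the surrounding branch is compiled away.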

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Pan Xinhui 
Acked-by: Christian Borntraeger 
Tested-by: Juergen Gross 
---
 include/linux/sched.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 348f51b..44c1ce7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3506,6 +3506,18 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+/*
+ * In order to deal with a various lock holder preemption issues provide an
+ * interface to see if a vCPU is currently running or not.
+ *
+ * This allows us to terminate optimistic spin loops and block, analogous to
+ * the native optimistic spin heuristic of testing if the lock owner task is
+ * running or not.
+ */
+#ifndef vcpu_is_preempted
+#define vcpu_is_preempted(cpu) false
+#endif
+
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
-- 
2.4.11



[PATCH v5 0/9] implement vcpu preempted check

2016-10-20 Thread Pan Xinhui
change from v4:
split x86 kvm vcpu preempted check into two patches.
add documentation patch.
add x86 vcpu preempted check patch under xen.
add s390 vcpu preempted check patch.
change from v3:
add x86 vcpu preempted check patch.
change from v2:
no code change, fix typos, update some comments.
change from v1:
a simpler definition of the default vcpu_is_preempted.
skip machine type check on ppc, and add config. remove dedicated macro.
add one patch to drop the overload of rwsem_spin_on_owner and
mutex_spin_on_owner.
add more comments.
thanks to Boqun's and Peter's suggestions.

This patch set aims to fix lock holder preemption issues.

test-case:
perf record -a perf bench sched messaging -g 400 -p && perf report

18.09%  sched-messaging  [kernel.vmlinux]  [k] osq_lock
12.28%  sched-messaging  [kernel.vmlinux]  [k] rwsem_spin_on_owner
 5.27%  sched-messaging  [kernel.vmlinux]  [k] mutex_unlock
 3.89%  sched-messaging  [kernel.vmlinux]  [k] wait_consider_task
 3.64%  sched-messaging  [kernel.vmlinux]  [k] _raw_write_lock_irq
 3.41%  sched-messaging  [kernel.vmlinux]  [k] mutex_spin_on_owner.is
 2.49%  sched-messaging  [kernel.vmlinux]  [k] system_call

We introduce the interface bool vcpu_is_preempted(int cpu) and use it in
the spin loops of osq_lock, rwsem_spin_on_owner and mutex_spin_on_owner.
These spin_on_owner variants also caused RCU stalls before this patch
set was applied.

We also observed some performance improvements in unix benchmark tests.

PPC test result:
1 copy - 0.94%
2 copy - 7.17%
4 copy - 11.9%
8 copy -  3.04%
16 copy - 15.11%

details below:
Without patch:

1 copy - File Write 4096 bufsize 8000 maxblocks  2188223.0 KBps  (30.0 s, 1 samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1804433.0 KBps  (30.0 s, 1 samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1237257.0 KBps  (30.0 s, 1 samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1032658.0 KBps  (30.0 s, 1 samples)
16 copy - File Write 4096 bufsize 8000 maxblocks  768000.0 KBps  (30.1 s, 1 samples)

With patch:

1 copy - File Write 4096 bufsize 8000 maxblocks  2209189.0 KBps  (30.0 s, 1 samples)
2 copy - File Write 4096 bufsize 8000 maxblocks  1943816.0 KBps  (30.0 s, 1 samples)
4 copy - File Write 4096 bufsize 8000 maxblocks  1405591.0 KBps  (30.0 s, 1 samples)
8 copy - File Write 4096 bufsize 8000 maxblocks  1065080.0 KBps  (30.0 s, 1 samples)
16 copy - File Write 4096 bufsize 8000 maxblocks  904762.0 KBps  (30.0 s, 1 samples)

X86 test result:
test-case   after-patch   before-patch
Execl Throughput   |18307.9 lps  |11701.6 lps 
File Copy 1024 bufsize 2000 maxblocks  |  1352407.3 KBps |   790418.9 KBps
File Copy 256 bufsize 500 maxblocks|   367555.6 KBps |   222867.7 KBps
File Copy 4096 bufsize 8000 maxblocks  |  3675649.7 KBps |  1780614.4 KBps
Pipe Throughput| 11872208.7 lps  | 11855628.9 lps 
Pipe-based Context Switching   |  1495126.5 lps  |  1490533.9 lps 
Process Creation   |29881.2 lps  |28572.8 lps 
Shell Scripts (1 concurrent)   |23224.3 lpm  |22607.4 lpm 
Shell Scripts (8 concurrent)   | 3531.4 lpm  | 3211.9 lpm 
System Call Overhead   | 10385653.0 lps  | 10419979.0 lps 

Christian Borntraeger (1):
  s390/spinlock: Provide vcpu_is_preempted

Juergen Gross (1):
  x86, xen: support vcpu preempted check

Pan Xinhui (7):
  kernel/sched: introduce vcpu preempted check interface
  locking/osq: Drop the overload of osq_lock()
  kernel/locking: Drop the overload of {mutex,rwsem}_spin_on_owner
  powerpc/spinlock: support vcpu preempted check
  x86, paravirt: Add interface to support kvm/xen vcpu preempted check
  x86, kvm: support vcpu preempted check
  Documentation: virtual: kvm: Support vcpu preempted check

 Documentation/virtual/kvm/msr.txt     |  8 +++++++-
 arch/powerpc/include/asm/spinlock.h   |  8 ++++++++
 arch/s390/include/asm/spinlock.h      |  8 ++++++++
 arch/s390/kernel/smp.c                |  9 +++++++--
 arch/s390/lib/spinlock.c              | 25 ++++++++-----------------
 arch/x86/include/asm/paravirt_types.h |  2 ++
 arch/x86/include/asm/spinlock.h       |  8 ++++++++
 arch/x86/include/uapi/asm/kvm_para.h  |  3 ++-
 arch/x86/kernel/kvm.c                 | 12 ++++++++++++
 arch/x86/kernel/paravirt-spinlocks.c  |  6 ++++++
 arch/x86/kvm/x86.c                    | 18 ++++++++++++++++++
 arch/x86/xen/spinlock.c               |  3 ++-
 include/linux/sched.h                 | 12 ++++++++++++
 kernel/locking/mutex.c                | 15 +++++++++++++--
 kernel/locking/osq_lock.c             | 10 +++++++++-
 kernel/locking/rwsem-xadd.c           | 16 +++++++++++++---
 16 files changed, 135 insertions(+), 28 deletions(-)

-- 
2.4.11
