On Tue, Feb 05, 2019 at 04:54:03PM -0500, Nitesh Narayan Lal wrote:
> 
> On 2/5/19 3:45 PM, Michael S. Tsirkin wrote:
> > On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
> >> This patch enables the kernel to scan the per cpu array and
> >> compress it by removing the repetitive/re-allocated pages.
> >> Once the per cpu array is completely filled with pages in the
> >> buddy it wakes up the kernel per cpu thread which re-scans the
> >> entire per cpu array by acquiring a zone lock corresponding to
> >> the page which is being scanned. If the page is still free and
> >> present in the buddy it tries to isolate the page and adds it
> >> to another per cpu array.
> >>
> >> Once this scanning process is complete and if there are any
> >> isolated pages added to the new per cpu array kernel thread
> >> invokes hyperlist_ready().
> >>
> >> In hyperlist_ready() a hypercall is made to report these pages to
> >> the host using the virtio-balloon framework. In order to do so
> >> another virtqueue 'hinting_vq' is added to the balloon framework.
> >> As the host frees all the reported pages, the kernel thread returns
> >> them back to the buddy.
> >>
> >> Signed-off-by: Nitesh Narayan Lal <nit...@redhat.com>
> >
> > This looks kind of like what early iterations of Wei's patches did.
> >
> > But this has lots of issues, for example you might end up with
> > a hypercall per a 4K page.
> > So in the end, he switched over to just reporting only
> > MAX_ORDER - 1 pages.
> You mean that I should only capture/attempt to isolate pages with order
> MAX_ORDER - 1?
> >
> > Would that be a good idea for you too?
> Will it help if we have a threshold value based on the amount of memory
> captured instead of the number of entries/pages in the array?

This is what Wei's patches do at least.

> >
> > An alternative would be a different much lighter weight
> > way to report these pages and to free them on the host.
> >
> >> ---
> >>  drivers/virtio/virtio_balloon.c     |  56 +++++++-
> >>  include/linux/page_hinting.h        |  18 ++-
> >>  include/uapi/linux/virtio_balloon.h |   1 +
> >>  mm/page_alloc.c                     |   2 +-
> >>  virt/kvm/page_hinting.c             | 202 +++++++++++++++++++++++++++-
> >>  5 files changed, 269 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/drivers/virtio/virtio_balloon.c 
> >> b/drivers/virtio/virtio_balloon.c
> >> index 728ecd1eea30..8af34e0b9a32 100644
> >> --- a/drivers/virtio/virtio_balloon.c
> >> +++ b/drivers/virtio/virtio_balloon.c
> >> @@ -57,13 +57,15 @@ enum virtio_balloon_vq {
> >>    VIRTIO_BALLOON_VQ_INFLATE,
> >>    VIRTIO_BALLOON_VQ_DEFLATE,
> >>    VIRTIO_BALLOON_VQ_STATS,
> >> +  VIRTIO_BALLOON_VQ_HINTING,
> >>    VIRTIO_BALLOON_VQ_FREE_PAGE,
> >>    VIRTIO_BALLOON_VQ_MAX
> >>  };
> >>  
> >>  struct virtio_balloon {
> >>    struct virtio_device *vdev;
> >> -  struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> >> +  struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq,
> >> +                                                          *hinting_vq;
> >>  
> >>    /* Balloon's own wq for cpu-intensive work items */
> >>    struct workqueue_struct *balloon_wq;
> >> @@ -122,6 +124,40 @@ static struct virtio_device_id id_table[] = {
> >>    { 0 },
> >>  };
> >>  
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> +void virtballoon_page_hinting(struct virtio_balloon *vb, u64 gvaddr,
> >> +                        int hyper_entries)
> >> +{
> >> +  u64 gpaddr = virt_to_phys((void *)gvaddr);
> >> +
> >> +  virtqueue_add_desc(vb->hinting_vq, gpaddr, hyper_entries, 0);
> >> +  virtqueue_kick_sync(vb->hinting_vq);
> >> +}
> >> +
> >> +static void hinting_ack(struct virtqueue *vq)
> >> +{
> >> +  struct virtio_balloon *vb = vq->vdev->priv;
> >> +
> >> +  wake_up(&vb->acked);
> >> +}
> >> +
> >> +static void enable_hinting(struct virtio_balloon *vb)
> >> +{
> >> +  guest_page_hinting_flag = 1;
> >> +  static_branch_enable(&guest_page_hinting_key);
> >> +  request_hypercall = (void *)&virtballoon_page_hinting;
> >> +  balloon_ptr = vb;
> >> +  WARN_ON(smpboot_register_percpu_thread(&hinting_threads));
> >> +}
> >> +
> >> +static void disable_hinting(void)
> >> +{
> >> +  guest_page_hinting_flag = 0;
> >> +  static_branch_enable(&guest_page_hinting_key);
> >> +  balloon_ptr = NULL;
> >> +}
> >> +#endif
> >> +
> >>  static u32 page_to_balloon_pfn(struct page *page)
> >>  {
> >>    unsigned long pfn = page_to_pfn(page);
> >> @@ -481,6 +517,7 @@ static int init_vqs(struct virtio_balloon *vb)
> >>    names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
> >>    names[VIRTIO_BALLOON_VQ_STATS] = NULL;
> >>    names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> >> +  names[VIRTIO_BALLOON_VQ_HINTING] = NULL;
> >>  
> >>    if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> >>            names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> >> @@ -492,11 +529,18 @@ static int init_vqs(struct virtio_balloon *vb)
> >>            callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> >>    }
> >>  
> >> +  if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) {
> >> +          names[VIRTIO_BALLOON_VQ_HINTING] = "hinting_vq";
> >> +          callbacks[VIRTIO_BALLOON_VQ_HINTING] = hinting_ack;
> >> +  }
> >>    err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
> >>                                     vqs, callbacks, names, NULL, NULL);
> >>    if (err)
> >>            return err;
> >>  
> >> +  if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> +          vb->hinting_vq = vqs[VIRTIO_BALLOON_VQ_HINTING];
> >> +
> >>    vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
> >>    vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
> >>    if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> >> @@ -908,6 +952,11 @@ static int virtballoon_probe(struct virtio_device 
> >> *vdev)
> >>            if (err)
> >>                    goto out_del_balloon_wq;
> >>    }
> >> +
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> +  if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> +          enable_hinting(vb);
> >> +#endif
> >>    virtio_device_ready(vdev);
> >>  
> >>    if (towards_target(vb))
> >> @@ -950,6 +999,10 @@ static void virtballoon_remove(struct virtio_device 
> >> *vdev)
> >>    cancel_work_sync(&vb->update_balloon_size_work);
> >>    cancel_work_sync(&vb->update_balloon_stats_work);
> >>  
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> +  if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> +          disable_hinting();
> >> +#endif
> >>    if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> >>            cancel_work_sync(&vb->report_free_page_work);
> >>            destroy_workqueue(vb->balloon_wq);
> >> @@ -1009,6 +1062,7 @@ static unsigned int features[] = {
> >>    VIRTIO_BALLOON_F_MUST_TELL_HOST,
> >>    VIRTIO_BALLOON_F_STATS_VQ,
> >>    VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> >> +  VIRTIO_BALLOON_F_HINTING,
> >>    VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> >>    VIRTIO_BALLOON_F_PAGE_POISON,
> >>  };
> >> diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h
> >> index e800c6b07561..3ba8c1f3b4a4 100644
> >> --- a/include/linux/page_hinting.h
> >> +++ b/include/linux/page_hinting.h
> >> @@ -1,15 +1,12 @@
> >>  #include <linux/smpboot.h>
> >>  
> >> -/*
> >> - * Size of the array which is used to store the freed pages is defined by
> >> - * MAX_FGPT_ENTRIES. If possible, we have to find a better way using which
> >> - * we can get rid of the hardcoded array size.
> >> - */
> >>  #define MAX_FGPT_ENTRIES  1000
> >>  /*
> >>   * hypervisor_pages - It is a dummy structure passed with the hypercall.
> >> - * @pfn: page frame number for the page which needs to be sent to the 
> >> host.
> >> - * @order: order of the page needs to be reported to the host.
> >> + * @pfn - page frame number for the page which is to be freed.
> >> + * @pages - number of pages which are supposed to be freed.
> >> + * A global array object is used to to hold the list of pfn and pages and 
> >> is
> >> + * passed as part of the hypercall.
> >>   */
> >>  struct hypervisor_pages {
> >>    unsigned long pfn;
> >> @@ -19,11 +16,18 @@ struct hypervisor_pages {
> >>  extern int guest_page_hinting_flag;
> >>  extern struct static_key_false guest_page_hinting_key;
> >>  extern struct smp_hotplug_thread hinting_threads;
> >> +extern void (*request_hypercall)(void *, u64, int);
> >> +extern void *balloon_ptr;
> >>  extern bool want_page_poisoning;
> >>  
> >>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >>                          void __user *buffer, size_t *lenp, loff_t *ppos);
> >>  void guest_free_page(struct page *page, int order);
> >> +extern int __isolate_free_page(struct page *page, unsigned int order);
> >> +extern void free_one_page(struct zone *zone,
> >> +                    struct page *page, unsigned long pfn,
> >> +                    unsigned int order,
> >> +                    int migratetype);
> >>  
> >>  static inline void disable_page_poisoning(void)
> >>  {
> > I guess you will want to put this in some other header.  Function
> > declarations belong close to where they are implemented, not used.
> I will find a better place.
> >
> >> diff --git a/include/uapi/linux/virtio_balloon.h 
> >> b/include/uapi/linux/virtio_balloon.h
> >> index a1966cd7b677..2b0f62814e22 100644
> >> --- a/include/uapi/linux/virtio_balloon.h
> >> +++ b/include/uapi/linux/virtio_balloon.h
> >> @@ -36,6 +36,7 @@
> >>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM   2 /* Deflate balloon on OOM */
> >>  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT   3 /* VQ to report free pages */
> >>  #define VIRTIO_BALLOON_F_PAGE_POISON      4 /* Guest is using page 
> >> poisoning */
> >> +#define VIRTIO_BALLOON_F_HINTING  5 /* Page hinting virtqueue */
> >>  
> >>  /* Size of a PFN in the balloon interface. */
> >>  #define VIRTIO_BALLOON_PFN_SHIFT 12
> >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >> index d295c9bc01a8..93224cba9243 100644
> >> --- a/mm/page_alloc.c
> >> +++ b/mm/page_alloc.c
> >> @@ -1199,7 +1199,7 @@ static void free_pcppages_bulk(struct zone *zone, 
> >> int count,
> >>    spin_unlock(&zone->lock);
> >>  }
> >>  
> >> -static void free_one_page(struct zone *zone,
> >> +void free_one_page(struct zone *zone,
> >>                            struct page *page, unsigned long pfn,
> >>                            unsigned int order,
> >>                            int migratetype)
> >> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
> >> index be529f6f2bc0..315099fcda43 100644
> >> --- a/virt/kvm/page_hinting.c
> >> +++ b/virt/kvm/page_hinting.c
> >> @@ -1,6 +1,8 @@
> >>  #include <linux/gfp.h>
> >>  #include <linux/mm.h>
> >> +#include <linux/page_ref.h>
> >>  #include <linux/kvm_host.h>
> >> +#include <linux/sort.h>
> >>  #include <linux/kernel.h>
> >>  
> >>  /*
> >> @@ -39,6 +41,11 @@ int guest_page_hinting_flag;
> >>  EXPORT_SYMBOL(guest_page_hinting_flag);
> >>  static DEFINE_PER_CPU(struct task_struct *, hinting_task);
> >>  
> >> +void (*request_hypercall)(void *, u64, int);
> >> +EXPORT_SYMBOL(request_hypercall);
> >> +void *balloon_ptr;
> >> +EXPORT_SYMBOL(balloon_ptr);
> >> +
> >>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >>                          void __user *buffer, size_t *lenp,
> >>                          loff_t *ppos)
> >> @@ -55,18 +62,201 @@ int guest_page_hinting_sysctl(struct ctl_table 
> >> *table, int write,
> >>    return ret;
> >>  }
> >>  
> >> +void hyperlist_ready(struct hypervisor_pages *guest_isolated_pages, int 
> >> entries)
> >> +{
> >> +  int i = 0;
> >> +  int mt = 0;
> >> +
> >> +  if (balloon_ptr)
> >> +          request_hypercall(balloon_ptr, (u64)&guest_isolated_pages[0],
> >> +                            entries);
> >> +
> >> +  while (i < entries) {
> >> +          struct page *page = pfn_to_page(guest_isolated_pages[i].pfn);
> >> +
> >> +          mt = get_pageblock_migratetype(page);
> >> +          free_one_page(page_zone(page), page, page_to_pfn(page),
> >> +                        guest_isolated_pages[i].order, mt);
> >> +          i++;
> >> +  }
> >> +}
> >> +
> >> +struct page *get_buddy_page(struct page *page)
> >> +{
> >> +  unsigned long pfn = page_to_pfn(page);
> >> +  unsigned int order;
> >> +
> >> +  for (order = 0; order < MAX_ORDER; order++) {
> >> +          struct page *page_head = page - (pfn & ((1 << order) - 1));
> >> +
> >> +          if (PageBuddy(page_head) && page_private(page_head) >= order)
> >> +                  return page_head;
> >> +  }
> >> +  return NULL;
> >> +}
> >> +
> >>  static void hinting_fn(unsigned int cpu)
> >>  {
> >>    struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +  int idx = 0, ret = 0;
> >> +  struct zone *zone_cur;
> >> +  unsigned long flags = 0;
> >> +
> >> +  while (idx < MAX_FGPT_ENTRIES) {
> >> +          unsigned long pfn = page_hinting_obj->kvm_pt[idx].pfn;
> >> +          unsigned long pfn_end = page_hinting_obj->kvm_pt[idx].pfn +
> >> +                  (1 << page_hinting_obj->kvm_pt[idx].order) - 1;
> >> +
> >> +          while (pfn <= pfn_end) {
> >> +                  struct page *page = pfn_to_page(pfn);
> >> +                  struct page *buddy_page = NULL;
> >> +
> >> +                  zone_cur = page_zone(page);
> >> +                  spin_lock_irqsave(&zone_cur->lock, flags);
> >> +
> >> +                  if (PageCompound(page)) {
> >> +                          struct page *head_page = compound_head(page);
> >> +                          unsigned long head_pfn = page_to_pfn(head_page);
> >> +                          unsigned int alloc_pages =
> >> +                                  1 << compound_order(head_page);
> >> +
> >> +                          pfn = head_pfn + alloc_pages;
> >> +                          spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +                          continue;
> >> +                  }
> >> +
> >> +                  if (page_ref_count(page)) {
> >> +                          pfn++;
> >> +                          spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +                          continue;
> >> +                  }
> >> +
> >> +                  if (PageBuddy(page)) {
> >> +                          int buddy_order = page_private(page);
> >>  
> >> +                          ret = __isolate_free_page(page, buddy_order);
> >> +                          if (!ret) {
> >> +                          } else {
> >> +                                  int l_idx = page_hinting_obj->hyp_idx;
> >> +                                  struct hypervisor_pages *l_obj =
> >> +                                  page_hinting_obj->hypervisor_pagelist;
> >> +
> >> +                                  l_obj[l_idx].pfn = pfn;
> >> +                                  l_obj[l_idx].order = buddy_order;
> >> +                                  page_hinting_obj->hyp_idx += 1;
> >> +                          }
> >> +                          pfn = pfn + (1 << buddy_order);
> >> +                          spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +                          continue;
> >> +                  }
> >> +
> >> +                  buddy_page = get_buddy_page(page);
> >> +                  if (buddy_page) {
> >> +                          int buddy_order = page_private(buddy_page);
> >> +
> >> +                          ret = __isolate_free_page(buddy_page,
> >> +                                                    buddy_order);
> >> +                          if (!ret) {
> >> +                          } else {
> >> +                                  int l_idx = page_hinting_obj->hyp_idx;
> >> +                                  struct hypervisor_pages *l_obj =
> >> +                                  page_hinting_obj->hypervisor_pagelist;
> >> +                                  unsigned long buddy_pfn =
> >> +                                          page_to_pfn(buddy_page);
> >> +
> >> +                                  l_obj[l_idx].pfn = buddy_pfn;
> >> +                                  l_obj[l_idx].order = buddy_order;
> >> +                                  page_hinting_obj->hyp_idx += 1;
> >> +                          }
> >> +                          pfn = page_to_pfn(buddy_page) +
> >> +                                  (1 << buddy_order);
> >> +                          spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +                          continue;
> >> +                  }
> >> +                  spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +                  pfn++;
> >> +          }
> >> +          page_hinting_obj->kvm_pt[idx].pfn = 0;
> >> +          page_hinting_obj->kvm_pt[idx].order = -1;
> >> +          page_hinting_obj->kvm_pt[idx].zonenum = -1;
> >> +          idx++;
> >> +  }
> >> +  if (page_hinting_obj->hyp_idx > 0) {
> >> +          hyperlist_ready(page_hinting_obj->hypervisor_pagelist,
> >> +                          page_hinting_obj->hyp_idx);
> >> +          page_hinting_obj->hyp_idx = 0;
> >> +  }
> >>    page_hinting_obj->kvm_pt_idx = 0;
> >>    put_cpu_var(hinting_obj);
> >>  }
> >>  
> >> +int if_exist(struct page *page)
> >> +{
> >> +  int i = 0;
> >> +  struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +
> >> +  while (i < MAX_FGPT_ENTRIES) {
> >> +          if (page_to_pfn(page) == page_hinting_obj->kvm_pt[i].pfn)
> >> +                  return 1;
> >> +          i++;
> >> +  }
> >> +  return 0;
> >> +}
> >> +
> >> +void pack_array(void)
> >> +{
> >> +  int i = 0, j = 0;
> >> +  struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +
> >> +  while (i < MAX_FGPT_ENTRIES) {
> >> +          if (page_hinting_obj->kvm_pt[i].pfn != 0) {
> >> +                  if (i != j) {
> >> +                          page_hinting_obj->kvm_pt[j].pfn =
> >> +                                  page_hinting_obj->kvm_pt[i].pfn;
> >> +                          page_hinting_obj->kvm_pt[j].order =
> >> +                                  page_hinting_obj->kvm_pt[i].order;
> >> +                          page_hinting_obj->kvm_pt[j].zonenum =
> >> +                                  page_hinting_obj->kvm_pt[i].zonenum;
> >> +                  }
> >> +                  j++;
> >> +          }
> >> +          i++;
> >> +  }
> >> +  i = j;
> >> +  page_hinting_obj->kvm_pt_idx = j;
> >> +  while (j < MAX_FGPT_ENTRIES) {
> >> +          page_hinting_obj->kvm_pt[j].pfn = 0;
> >> +          page_hinting_obj->kvm_pt[j].order = -1;
> >> +          page_hinting_obj->kvm_pt[j].zonenum = -1;
> >> +          j++;
> >> +  }
> >> +}
> >> +
> >>  void scan_array(void)
> >>  {
> >>    struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +  int i = 0;
> >>  
> >> +  while (i < MAX_FGPT_ENTRIES) {
> >> +          struct page *page =
> >> +                  pfn_to_page(page_hinting_obj->kvm_pt[i].pfn);
> >> +          struct page *buddy_page = get_buddy_page(page);
> >> +
> >> +          if (!PageBuddy(page) && buddy_page) {
> >> +                  if (if_exist(buddy_page)) {
> >> +                          page_hinting_obj->kvm_pt[i].pfn = 0;
> >> +                          page_hinting_obj->kvm_pt[i].order = -1;
> >> +                          page_hinting_obj->kvm_pt[i].zonenum = -1;
> >> +                  } else {
> >> +                          page_hinting_obj->kvm_pt[i].pfn =
> >> +                                  page_to_pfn(buddy_page);
> >> +                          page_hinting_obj->kvm_pt[i].order =
> >> +                                  page_private(buddy_page);
> >> +                  }
> >> +          }
> >> +          i++;
> >> +  }
> >> +  pack_array();
> >>    if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> >>            wake_up_process(__this_cpu_read(hinting_task));
> >>  }
> >> @@ -111,8 +301,18 @@ void guest_free_page(struct page *page, int order)
> >>            page_hinting_obj->kvm_pt[page_hinting_obj->kvm_pt_idx].order =
> >>                                                    order;
> >>            page_hinting_obj->kvm_pt_idx += 1;
> >> -          if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> >> +          if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) {
> >> +                  /*
> >> +                   * We are depending on the buddy free-list to identify
> >> +                   * if a page is free or not. Hence, we are dumping all
> >> +                   * the per-cpu pages back into the buddy allocator. This
> >> +                   * will ensure less failures when we try to isolate free
> >> +                   * captured pages and hence more memory reporting to the
> >> +                   * host.
> >> +                   */
> >> +                  drain_local_pages(NULL);
> >>                    scan_array();
> >> +          }
> >>    }
> >>    local_irq_restore(flags);
> >>  }
> >> -- 
> >> 2.17.2
> -- 
> Regards
> Nitesh
> 



Reply via email to