On Tue, Feb 05, 2019 at 04:54:03PM -0500, Nitesh Narayan Lal wrote:
> 
> On 2/5/19 3:45 PM, Michael S. Tsirkin wrote:
> > On Mon, Feb 04, 2019 at 03:18:53PM -0500, Nitesh Narayan Lal wrote:
> >> This patch enables the kernel to scan the per-cpu array and
> >> compress it by removing the duplicate/re-allocated pages. Once the
> >> per-cpu array is completely filled with pages that are in the
> >> buddy, it wakes up the per-cpu kernel thread, which re-scans the
> >> entire array, acquiring the zone lock corresponding to each page
> >> being scanned. If a page is still free and present in the buddy,
> >> the thread tries to isolate it and adds it to another per-cpu
> >> array.
> >>
> >> Once this scanning process is complete, if any isolated pages were
> >> added to the new per-cpu array, the kernel thread invokes
> >> hyperlist_ready().
> >>
> >> In hyperlist_ready() a hypercall is made to report these pages to
> >> the host using the virtio-balloon framework. In order to do so,
> >> another virtqueue 'hinting_vq' is added to the balloon framework.
> >> Once the host has freed all the reported pages, the kernel thread
> >> returns them back to the buddy.
> >>
> >> Signed-off-by: Nitesh Narayan Lal <nit...@redhat.com>
> >
> > This looks kind of like what early iterations of Wei's patches did.
> >
> > But this has lots of issues, for example you might end up with
> > a hypercall per 4K page.
> > So in the end, he switched over to just reporting only
> > MAX_ORDER - 1 pages.
> You mean that I should only capture/attempt to isolate pages with order
> MAX_ORDER - 1?
> >
> > Would that be a good idea for you too?
> Will it help if we have a threshold value based on the amount of memory
> captured instead of the number of entries/pages in the array?
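
Concretely, the two capture policies under discussion would look roughly
like the sketch below. guest_capture_page() and HINT_THRESHOLD_PAGES are
made-up names; kvm_pt, hinting_obj, hinting_task and MAX_FGPT_ENTRIES are
from the patch:

	static void guest_capture_page(struct page *page, int order)
	{
		struct page_hinting *obj = this_cpu_ptr(&hinting_obj);

		/*
		 * Order-based policy: never queue anything smaller than
		 * MAX_ORDER - 1, so no hypercall is ever spent on a lone
		 * 4K page.
		 */
		if (order < MAX_ORDER - 1)
			return;

		obj->kvm_pt[obj->kvm_pt_idx].pfn = page_to_pfn(page);
		obj->kvm_pt[obj->kvm_pt_idx].order = order;
		obj->kvm_pt_idx += 1;

		/*
		 * The threshold-based policy would instead accumulate the
		 * amount of memory captured and flush once it crosses a
		 * limit (the field and constant here are hypothetical):
		 *
		 *	obj->captured += 1UL << order;
		 *	if (obj->captured >= HINT_THRESHOLD_PAGES)
		 *		wake_up_process(__this_cpu_read(hinting_task));
		 */
		if (obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
			wake_up_process(__this_cpu_read(hinting_task));
	}
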
This is what Wei's patches do at least.

> >
> > An alternative would be a different, much lighter-weight
> > way to report these pages and to free them on the host.
> >
> >> ---
> >>  drivers/virtio/virtio_balloon.c     |  56 +++++++-
> >>  include/linux/page_hinting.h        |  18 ++-
> >>  include/uapi/linux/virtio_balloon.h |   1 +
> >>  mm/page_alloc.c                     |   2 +-
> >>  virt/kvm/page_hinting.c             | 202 +++++++++++++++++++++++++++-
> >>  5 files changed, 269 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> >> index 728ecd1eea30..8af34e0b9a32 100644
> >> --- a/drivers/virtio/virtio_balloon.c
> >> +++ b/drivers/virtio/virtio_balloon.c
> >> @@ -57,13 +57,15 @@ enum virtio_balloon_vq {
> >>  	VIRTIO_BALLOON_VQ_INFLATE,
> >>  	VIRTIO_BALLOON_VQ_DEFLATE,
> >>  	VIRTIO_BALLOON_VQ_STATS,
> >> +	VIRTIO_BALLOON_VQ_HINTING,
> >>  	VIRTIO_BALLOON_VQ_FREE_PAGE,
> >>  	VIRTIO_BALLOON_VQ_MAX
> >>  };
> >>
> >>  struct virtio_balloon {
> >>  	struct virtio_device *vdev;
> >> -	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> >> +	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq,
> >> +			 *hinting_vq;
> >>
> >>  	/* Balloon's own wq for cpu-intensive work items */
> >>  	struct workqueue_struct *balloon_wq;
> >> @@ -122,6 +124,40 @@ static struct virtio_device_id id_table[] = {
> >>  	{ 0 },
> >>  };
> >>
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> +void virtballoon_page_hinting(struct virtio_balloon *vb, u64 gvaddr,
> >> +			      int hyper_entries)
> >> +{
> >> +	u64 gpaddr = virt_to_phys((void *)gvaddr);
> >> +
> >> +	virtqueue_add_desc(vb->hinting_vq, gpaddr, hyper_entries, 0);
> >> +	virtqueue_kick_sync(vb->hinting_vq);
> >> +}
> >> +
> >> +static void hinting_ack(struct virtqueue *vq)
> >> +{
> >> +	struct virtio_balloon *vb = vq->vdev->priv;
> >> +
> >> +	wake_up(&vb->acked);
> >> +}
> >> +
> >> +static void enable_hinting(struct virtio_balloon *vb)
> >> +{
> >> +	guest_page_hinting_flag = 1;
> >> +	static_branch_enable(&guest_page_hinting_key);
> >> +	request_hypercall = (void *)&virtballoon_page_hinting;
> >> +	balloon_ptr = vb;
> >> +	WARN_ON(smpboot_register_percpu_thread(&hinting_threads));
> >> +}
> >> +
> >> +static void disable_hinting(void)
> >> +{
> >> +	guest_page_hinting_flag = 0;
> >> +	static_branch_disable(&guest_page_hinting_key);
> >> +	balloon_ptr = NULL;
> >> +}
> >> +#endif
> >> +
> >>  static u32 page_to_balloon_pfn(struct page *page)
> >>  {
> >>  	unsigned long pfn = page_to_pfn(page);
> >> @@ -481,6 +517,7 @@ static int init_vqs(struct virtio_balloon *vb)
> >>  	names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate";
> >>  	names[VIRTIO_BALLOON_VQ_STATS] = NULL;
> >>  	names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> >> +	names[VIRTIO_BALLOON_VQ_HINTING] = NULL;
> >>
> >>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> >>  		names[VIRTIO_BALLOON_VQ_STATS] = "stats";
> >> @@ -492,11 +529,18 @@ static int init_vqs(struct virtio_balloon *vb)
> >>  		callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL;
> >>  	}
> >>
> >> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING)) {
> >> +		names[VIRTIO_BALLOON_VQ_HINTING] = "hinting_vq";
> >> +		callbacks[VIRTIO_BALLOON_VQ_HINTING] = hinting_ack;
> >> +	}
> >>  	err = vb->vdev->config->find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX,
> >>  					 vqs, callbacks, names, NULL, NULL);
> >>  	if (err)
> >>  		return err;
> >>
> >> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> +		vb->hinting_vq = vqs[VIRTIO_BALLOON_VQ_HINTING];
> >> +
> >>  	vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE];
> >>  	vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE];
> >>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> >> @@ -908,6 +952,11 @@ static int virtballoon_probe(struct virtio_device *vdev)
> >>  		if (err)
> >>  			goto out_del_balloon_wq;
> >>  	}
> >> +
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> +		enable_hinting(vb);
> >> +#endif
> >>  	virtio_device_ready(vdev);
> >>
> >>  	if (towards_target(vb))
> >> @@ -950,6 +999,10 @@ static void virtballoon_remove(struct virtio_device *vdev)
> >>  	cancel_work_sync(&vb->update_balloon_size_work);
> >>  	cancel_work_sync(&vb->update_balloon_stats_work);
> >>
> >> +#ifdef CONFIG_KVM_FREE_PAGE_HINTING
> >> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HINTING))
> >> +		disable_hinting();
> >> +#endif
> >>  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> >>  		cancel_work_sync(&vb->report_free_page_work);
> >>  		destroy_workqueue(vb->balloon_wq);
> >> @@ -1009,6 +1062,7 @@ static unsigned int features[] = {
> >>  	VIRTIO_BALLOON_F_MUST_TELL_HOST,
> >>  	VIRTIO_BALLOON_F_STATS_VQ,
> >>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> >> +	VIRTIO_BALLOON_F_HINTING,
> >>  	VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> >>  	VIRTIO_BALLOON_F_PAGE_POISON,
> >>  };
> >> diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h
> >> index e800c6b07561..3ba8c1f3b4a4 100644
> >> --- a/include/linux/page_hinting.h
> >> +++ b/include/linux/page_hinting.h
> >> @@ -1,15 +1,12 @@
> >>  #include <linux/smpboot.h>
> >>
> >> -/*
> >> - * Size of the array which is used to store the freed pages is defined by
> >> - * MAX_FGPT_ENTRIES. If possible, we have to find a better way using which
> >> - * we can get rid of the hardcoded array size.
> >> - */
> >>  #define MAX_FGPT_ENTRIES	1000
> >>  /*
> >>   * hypervisor_pages - It is a dummy structure passed with the hypercall.
> >> - * @pfn: page frame number for the page which needs to be sent to the host.
> >> - * @order: order of the page needs to be reported to the host.
> >> + * @pfn - page frame number for the page which is to be freed.
> >> + * @pages - number of pages which are supposed to be freed.
> >> + * A global array object is used to hold the list of pfn and pages and is
> >> + * passed as part of the hypercall.
> >>   */
> >>  struct hypervisor_pages {
> >>  	unsigned long pfn;
> >> @@ -19,11 +16,18 @@ struct hypervisor_pages {
> >>  extern int guest_page_hinting_flag;
> >>  extern struct static_key_false guest_page_hinting_key;
> >>  extern struct smp_hotplug_thread hinting_threads;
> >> +extern void (*request_hypercall)(void *, u64, int);
> >> +extern void *balloon_ptr;
> >>  extern bool want_page_poisoning;
> >>
> >>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >>  			      void __user *buffer, size_t *lenp, loff_t *ppos);
> >>  void guest_free_page(struct page *page, int order);
> >> +extern int __isolate_free_page(struct page *page, unsigned int order);
> >> +extern void free_one_page(struct zone *zone,
> >> +			  struct page *page, unsigned long pfn,
> >> +			  unsigned int order,
> >> +			  int migratetype);
> >>
> >>  static inline void disable_page_poisoning(void)
> >>  {
> >
> > I guess you will want to put this in some other header. Function
> > declarations belong close to where they are implemented, not used.
> I will find a better place.
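
FWIW, __isolate_free_page() already has a declaration in mm/internal.h,
next to its implementation in mm/page_alloc.c, but virt/kvm can't include
mm/internal.h. One option is a small mm-owned header; a minimal sketch
(the header name and location are hypothetical):

	/* include/linux/page_hinting_mm.h -- hypothetical location.
	 * Both functions live in mm/page_alloc.c, so mm owns the
	 * prototypes and virt/kvm/page_hinting.c just includes this.
	 */
	#ifndef _LINUX_PAGE_HINTING_MM_H
	#define _LINUX_PAGE_HINTING_MM_H

	struct page;
	struct zone;

	int __isolate_free_page(struct page *page, unsigned int order);
	void free_one_page(struct zone *zone, struct page *page,
			   unsigned long pfn, unsigned int order,
			   int migratetype);

	#endif /* _LINUX_PAGE_HINTING_MM_H */
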
> >
> >> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> >> index a1966cd7b677..2b0f62814e22 100644
> >> --- a/include/uapi/linux/virtio_balloon.h
> >> +++ b/include/uapi/linux/virtio_balloon.h
> >> @@ -36,6 +36,7 @@
> >>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
> >>  #define VIRTIO_BALLOON_F_FREE_PAGE_HINT	3 /* VQ to report free pages */
> >>  #define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
> >> +#define VIRTIO_BALLOON_F_HINTING	5 /* Page hinting virtqueue */
> >>
> >>  /* Size of a PFN in the balloon interface. */
> >>  #define VIRTIO_BALLOON_PFN_SHIFT 12
> >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >> index d295c9bc01a8..93224cba9243 100644
> >> --- a/mm/page_alloc.c
> >> +++ b/mm/page_alloc.c
> >> @@ -1199,7 +1199,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
> >>  	spin_unlock(&zone->lock);
> >>  }
> >>
> >> -static void free_one_page(struct zone *zone,
> >> +void free_one_page(struct zone *zone,
> >>  				struct page *page, unsigned long pfn,
> >>  				unsigned int order,
> >>  				int migratetype)
> >> diff --git a/virt/kvm/page_hinting.c b/virt/kvm/page_hinting.c
> >> index be529f6f2bc0..315099fcda43 100644
> >> --- a/virt/kvm/page_hinting.c
> >> +++ b/virt/kvm/page_hinting.c
> >> @@ -1,6 +1,8 @@
> >>  #include <linux/gfp.h>
> >>  #include <linux/mm.h>
> >> +#include <linux/page_ref.h>
> >>  #include <linux/kvm_host.h>
> >> +#include <linux/sort.h>
> >>  #include <linux/kernel.h>
> >>
> >>  /*
> >> @@ -39,6 +41,11 @@ int guest_page_hinting_flag;
> >>  EXPORT_SYMBOL(guest_page_hinting_flag);
> >>  static DEFINE_PER_CPU(struct task_struct *, hinting_task);
> >>
> >> +void (*request_hypercall)(void *, u64, int);
> >> +EXPORT_SYMBOL(request_hypercall);
> >> +void *balloon_ptr;
> >> +EXPORT_SYMBOL(balloon_ptr);
> >> +
> >>  int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >>  			      void __user *buffer, size_t *lenp,
> >>  			      loff_t *ppos)
> >> @@ -55,18 +62,201 @@ int guest_page_hinting_sysctl(struct ctl_table *table, int write,
> >>  	return ret;
> >>  }
> >>
> >> +void hyperlist_ready(struct hypervisor_pages *guest_isolated_pages, int entries)
> >> +{
> >> +	int i = 0;
> >> +	int mt = 0;
> >> +
> >> +	if (balloon_ptr)
> >> +		request_hypercall(balloon_ptr, (u64)&guest_isolated_pages[0],
> >> +				  entries);
> >> +
> >> +	while (i < entries) {
> >> +		struct page *page = pfn_to_page(guest_isolated_pages[i].pfn);
> >> +
> >> +		mt = get_pageblock_migratetype(page);
> >> +		free_one_page(page_zone(page), page, page_to_pfn(page),
> >> +			      guest_isolated_pages[i].order, mt);
> >> +		i++;
> >> +	}
> >> +}
> >> +
> >> +struct page *get_buddy_page(struct page *page)
> >> +{
> >> +	unsigned long pfn = page_to_pfn(page);
> >> +	unsigned int order;
> >> +
> >> +	for (order = 0; order < MAX_ORDER; order++) {
> >> +		struct page *page_head = page - (pfn & ((1 << order) - 1));
> >> +
> >> +		if (PageBuddy(page_head) && page_private(page_head) >= order)
> >> +			return page_head;
> >> +	}
> >> +	return NULL;
> >> +}
> >> +
> >>  static void hinting_fn(unsigned int cpu)
> >>  {
> >>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +	int idx = 0, ret = 0;
> >> +	struct zone *zone_cur;
> >> +	unsigned long flags = 0;
> >> +
> >> +	while (idx < MAX_FGPT_ENTRIES) {
> >> +		unsigned long pfn = page_hinting_obj->kvm_pt[idx].pfn;
> >> +		unsigned long pfn_end = page_hinting_obj->kvm_pt[idx].pfn +
> >> +			(1 << page_hinting_obj->kvm_pt[idx].order) - 1;
> >> +
> >> +		while (pfn <= pfn_end) {
> >> +			struct page *page = pfn_to_page(pfn);
> >> +			struct page *buddy_page = NULL;
> >> +
> >> +			zone_cur = page_zone(page);
> >> +			spin_lock_irqsave(&zone_cur->lock, flags);
> >> +
> >> +			if (PageCompound(page)) {
> >> +				struct page *head_page = compound_head(page);
> >> +				unsigned long head_pfn = page_to_pfn(head_page);
> >> +				unsigned int alloc_pages =
> >> +					1 << compound_order(head_page);
> >> +
> >> +				pfn = head_pfn + alloc_pages;
> >> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +				continue;
> >> +			}
> >> +
> >> +			if (page_ref_count(page)) {
> >> +				pfn++;
> >> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +				continue;
> >> +			}
> >> +
> >> +			if (PageBuddy(page)) {
> >> +				int buddy_order = page_private(page);
> >>
> >> +				ret = __isolate_free_page(page, buddy_order);
> >> +				if (!ret) {
> >> +				} else {
> >> +					int l_idx = page_hinting_obj->hyp_idx;
> >> +					struct hypervisor_pages *l_obj =
> >> +					page_hinting_obj->hypervisor_pagelist;
> >> +
> >> +					l_obj[l_idx].pfn = pfn;
> >> +					l_obj[l_idx].order = buddy_order;
> >> +					page_hinting_obj->hyp_idx += 1;
> >> +				}
> >> +				pfn = pfn + (1 << buddy_order);
> >> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +				continue;
> >> +			}
> >> +
> >> +			buddy_page = get_buddy_page(page);
> >> +			if (buddy_page) {
> >> +				int buddy_order = page_private(buddy_page);
> >> +
> >> +				ret = __isolate_free_page(buddy_page,
> >> +							  buddy_order);
> >> +				if (!ret) {
> >> +				} else {
> >> +					int l_idx = page_hinting_obj->hyp_idx;
> >> +					struct hypervisor_pages *l_obj =
> >> +					page_hinting_obj->hypervisor_pagelist;
> >> +					unsigned long buddy_pfn =
> >> +						page_to_pfn(buddy_page);
> >> +
> >> +					l_obj[l_idx].pfn = buddy_pfn;
> >> +					l_obj[l_idx].order = buddy_order;
> >> +					page_hinting_obj->hyp_idx += 1;
> >> +				}
> >> +				pfn = page_to_pfn(buddy_page) +
> >> +					(1 << buddy_order);
> >> +				spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +				continue;
> >> +			}
> >> +			spin_unlock_irqrestore(&zone_cur->lock, flags);
> >> +			pfn++;
> >> +		}
> >> +		page_hinting_obj->kvm_pt[idx].pfn = 0;
> >> +		page_hinting_obj->kvm_pt[idx].order = -1;
> >> +		page_hinting_obj->kvm_pt[idx].zonenum = -1;
> >> +		idx++;
> >> +	}
> >> +	if (page_hinting_obj->hyp_idx > 0) {
> >> +		hyperlist_ready(page_hinting_obj->hypervisor_pagelist,
> >> +				page_hinting_obj->hyp_idx);
> >> +		page_hinting_obj->hyp_idx = 0;
> >> +	}
> >>  	page_hinting_obj->kvm_pt_idx = 0;
> >>  	put_cpu_var(hinting_obj);
> >>  }
> >>
> >> +int if_exist(struct page *page)
> >> +{
> >> +	int i = 0;
> >> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +
> >> +	while (i < MAX_FGPT_ENTRIES) {
> >> +		if (page_to_pfn(page) == page_hinting_obj->kvm_pt[i].pfn)
> >> +			return 1;
> >> +		i++;
> >> +	}
> >> +	return 0;
> >> +}
> >> +
> >> +void pack_array(void)
> >> +{
> >> +	int i = 0, j = 0;
> >> +	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +
> >> +	while (i < MAX_FGPT_ENTRIES) {
> >> +		if (page_hinting_obj->kvm_pt[i].pfn != 0) {
> >> +			if (i != j) {
> >> +				page_hinting_obj->kvm_pt[j].pfn =
> >> +					page_hinting_obj->kvm_pt[i].pfn;
> >> +				page_hinting_obj->kvm_pt[j].order =
> >> +					page_hinting_obj->kvm_pt[i].order;
> >> +				page_hinting_obj->kvm_pt[j].zonenum =
> >> +					page_hinting_obj->kvm_pt[i].zonenum;
> >> +			}
> >> +			j++;
> >> +		}
> >> +		i++;
> >> +	}
> >> +	i = j;
> >> +	page_hinting_obj->kvm_pt_idx = j;
> >> +	while (j < MAX_FGPT_ENTRIES) {
> >> +		page_hinting_obj->kvm_pt[j].pfn = 0;
> >> +		page_hinting_obj->kvm_pt[j].order = -1;
> >> +		page_hinting_obj->kvm_pt[j].zonenum = -1;
> >> +		j++;
> >> +	}
> >> +}
> >> +
> >>  void scan_array(void)
> >>  {
> >>  	struct page_hinting *page_hinting_obj = this_cpu_ptr(&hinting_obj);
> >> +	int i = 0;
> >>
> >> +	while (i < MAX_FGPT_ENTRIES) {
> >> +		struct page *page =
> >> +			pfn_to_page(page_hinting_obj->kvm_pt[i].pfn);
> >> +		struct page *buddy_page = get_buddy_page(page);
> >> +
> >> +		if (!PageBuddy(page) && buddy_page) {
> >> +			if (if_exist(buddy_page)) {
> >> +				page_hinting_obj->kvm_pt[i].pfn = 0;
> >> +				page_hinting_obj->kvm_pt[i].order = -1;
> >> +				page_hinting_obj->kvm_pt[i].zonenum = -1;
> >> +			} else {
> >> +				page_hinting_obj->kvm_pt[i].pfn =
> >> +					page_to_pfn(buddy_page);
> >> +				page_hinting_obj->kvm_pt[i].order =
> >> +					page_private(buddy_page);
> >> +			}
> >> +		}
> >> +		i++;
> >> +	}
> >> +	pack_array();
> >>  	if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> >>  		wake_up_process(__this_cpu_read(hinting_task));
> >>  }
> >> @@ -111,8 +301,18 @@ void guest_free_page(struct page *page, int order)
> >>  		page_hinting_obj->kvm_pt[page_hinting_obj->kvm_pt_idx].order =
> >>  							order;
> >>  		page_hinting_obj->kvm_pt_idx += 1;
> >> -		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES)
> >> +		if (page_hinting_obj->kvm_pt_idx == MAX_FGPT_ENTRIES) {
> >> +			/*
> >> +			 * We are depending on the buddy free-list to identify
> >> +			 * whether a page is free or not. Hence, we dump all
> >> +			 * the per-cpu pages back into the buddy allocator.
> >> +			 * This ensures fewer failures when we try to isolate
> >> +			 * captured free pages, and hence more memory is
> >> +			 * reported to the host.
> >> +			 */
> >> +			drain_local_pages(NULL);
> >>  			scan_array();
> >> +		}
> >>  	}
> >>  	local_irq_restore(flags);
> >>  }
> >> --
> >> 2.17.2
> --
> Regards
> Nitesh
>
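
To recap the round trip this series implements, the reporting path
condenses to the sketch below (nothing new here -- it is essentially
hyperlist_ready() from the patch, taking the per-cpu object directly and
with the loop counters simplified):

	static void hint_batch(struct page_hinting *obj)
	{
		int i;

		/* One hypercall/virtqueue kick covers the whole batch of
		 * isolated pages, not one call per 4K page. */
		if (balloon_ptr)
			request_hypercall(balloon_ptr,
					  (u64)&obj->hypervisor_pagelist[0],
					  obj->hyp_idx);

		/* The host has dropped the backing memory; hand the guest
		 * pages back to the buddy so they can be allocated again. */
		for (i = 0; i < obj->hyp_idx; i++) {
			struct page *page =
				pfn_to_page(obj->hypervisor_pagelist[i].pfn);

			free_one_page(page_zone(page), page,
				      page_to_pfn(page),
				      obj->hypervisor_pagelist[i].order,
				      get_pageblock_migratetype(page));
		}
		obj->hyp_idx = 0;
	}
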