When a guest reports free pages to the hypervisor via the page reporting framework (used by virtio-balloon and hv_balloon), the host typically zeroes those pages when reclaiming their backing memory. However, when those pages are later allocated in the guest, post_alloc_hook() zeroes them again whenever __GFP_ZERO is set, regardless of whether the host has already done so. This double zeroing is wasteful, especially for large pages.
Avoid redundant zeroing by propagating the "host already zeroed this" information through the allocation path: 1. Add a host_zeroes_pages flag to page_reporting_dev_info, allowing drivers to declare that their host zeroes reported pages on reclaim. A static key (page_reporting_host_zeroes) gates the fast path. 2. In page_del_and_expand(), when the page was reported and the static key is enabled, stash a sentinel value (MAGIC_PAGE_ZEROED) in page->private. 3. In post_alloc_hook(), check page->private for the sentinel. If present and zeroing was requested (but not tag zeroing), skip kernel_init_pages(). This matters in particular because the x86 arch override of vma_alloc_zeroed_movable_folio() allocates with __GFP_ZERO, so allocations made through it can skip the redundant zeroing. No driver sets host_zeroes_pages yet; a follow-up patch to virtio_balloon is needed to opt in. Signed-off-by: Michael S. Tsirkin <[email protected]> Assisted-by: Claude:claude-opus-4-6 --- include/linux/mm.h | 6 ++++++ include/linux/page_reporting.h | 3 +++ mm/page_alloc.c | 21 +++++++++++++++++++++ mm/page_reporting.c | 9 +++++++++ mm/page_reporting.h | 2 ++ 5 files changed, 41 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5be3d8a8f806..59fc77c4c90e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4814,6 +4814,12 @@ static inline bool user_alloc_needs_zeroing(void) &init_on_alloc); } +/* + * Sentinel stored in page->private to indicate the page was pre-zeroed + * by the hypervisor (via free page reporting). 
+ */ +#define MAGIC_PAGE_ZEROED 0x5A45524FU /* ZERO */ + int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index fe648dfa3a7c..10faadfeb4fb 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -13,6 +13,9 @@ struct page_reporting_dev_info { int (*report)(struct page_reporting_dev_info *prdev, struct scatterlist *sg, unsigned int nents); + /* If true, host zeros reported pages on reclaim */ + bool host_zeroes_pages; + /* work struct for processing reports */ struct delayed_work work; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index edbb1edf463d..efb65eee826b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1774,8 +1774,20 @@ static __always_inline void page_del_and_expand(struct zone *zone, bool was_reported = page_reported(page); __del_page_from_free_list(page, zone, high, migratetype); + + was_reported = was_reported && + static_branch_unlikely(&page_reporting_host_zeroes); + nr_pages -= expand(zone, page, low, high, migratetype, was_reported); account_freepages(zone, -nr_pages, migratetype); + + /* + * If the page was reported and the host is known to zero reported + * pages, mark it zeroed via page->private so that + * post_alloc_hook() can skip redundant zeroing. 
+ */ + if (was_reported) + set_page_private(page, MAGIC_PAGE_ZEROED); } static void check_new_page_bad(struct page *page) @@ -1851,11 +1863,20 @@ inline void post_alloc_hook(struct page *page, unsigned int order, { bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); + bool prezeroed = page_private(page) == MAGIC_PAGE_ZEROED; bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); int i; set_page_private(page, 0); + /* + * If the page is pre-zeroed, skip memory initialization. + * We still need to handle tag zeroing separately since the host + * does not know about memory tags. + */ + if (prezeroed && init && !zero_tags) + init = false; + arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); diff --git a/mm/page_reporting.c b/mm/page_reporting.c index f0042d5743af..cb24832bdf4e 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -50,6 +50,8 @@ EXPORT_SYMBOL_GPL(page_reporting_order); #define PAGE_REPORTING_DELAY (2 * HZ) static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly; +DEFINE_STATIC_KEY_FALSE(page_reporting_host_zeroes); + enum { PAGE_REPORTING_IDLE = 0, PAGE_REPORTING_REQUESTED, @@ -386,6 +388,10 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) /* Assign device to allow notifications */ rcu_assign_pointer(pr_dev_info, prdev); + /* enable zeroed page optimization if host zeroes reported pages */ + if (prdev->host_zeroes_pages) + static_branch_enable(&page_reporting_host_zeroes); + /* enable page reporting notification */ if (!static_key_enabled(&page_reporting_enabled)) { static_branch_enable(&page_reporting_enabled); @@ -410,6 +416,9 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev) /* Flush any existing work, and lock it out */ cancel_delayed_work_sync(&prdev->work); + + if (prdev->host_zeroes_pages) + static_branch_disable(&page_reporting_host_zeroes); } mutex_unlock(&page_reporting_mutex); diff --git 
a/mm/page_reporting.h b/mm/page_reporting.h index c51dbc228b94..2bbf99f456f5 100644 --- a/mm/page_reporting.h +++ b/mm/page_reporting.h @@ -15,6 +15,8 @@ DECLARE_STATIC_KEY_FALSE(page_reporting_enabled); extern unsigned int page_reporting_order; void __page_reporting_notify(void); +DECLARE_STATIC_KEY_FALSE(page_reporting_host_zeroes); + static inline bool page_reported(struct page *page) { return static_branch_unlikely(&page_reporting_enabled) && -- MST

