The current logic splits the update of the amount of available memory in
the system (total_avail_pages) and of the pending claims into two
separately locked regions.  This leaves a window between the two counter
adjustments where the result of total_avail_pages - outstanding_claims
doesn't reflect the real amount of free memory available, and can even be
negative because total_avail_pages is updated ahead of outstanding_claims.

Fix by adjusting outstanding_claims and d->outstanding_pages in the same
place where total_avail_pages is updated.  Note that accesses to
d->outstanding_pages are protected by the global heap_lock, just like
total_avail_pages and outstanding_claims.  Add a comment to the field
declaration, and also adjust the comment at the top of
domain_set_outstanding_pages() to make that clearer.

Finally, since claims are now adjusted ahead of the pages being assigned
to the domain, add logic to regain the claim in case assign_page() fails.
Otherwise the pages would be freed and the claimed amount lost.
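
For illustration only, below is a standalone toy model of the race (plain
pthreads, not Xen code; the counter and lock names are merely picked to
mirror total_avail_pages, outstanding_claims and the heap_lock), where an
observer can see avail - claims go transiently negative:

  /*
   * Toy model of the race described above -- not Xen code.  The counter
   * names mirror total_avail_pages and outstanding_claims, and a single
   * pthread mutex stands in for the heap_lock.  Build with: gcc -pthread
   */
  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdio.h>

  static long avail = 8, claims = 8;   /* every free page is claimed */
  static atomic_bool stop;
  static pthread_mutex_t heap_lock = PTHREAD_MUTEX_INITIALIZER;

  /* Allocate and give back one page, updating the two counters in two
   * separately locked regions, as the pre-patch code does. */
  static void *allocator(void *arg)
  {
      (void)arg;
      while ( !stop )
      {
          pthread_mutex_lock(&heap_lock);
          avail -= 1;                      /* page taken from the heap */
          pthread_mutex_unlock(&heap_lock);

          /* <-- window: avail already updated, claims not yet */

          pthread_mutex_lock(&heap_lock);
          claims -= 1;                     /* claim consumed */
          avail += 1;                      /* give the page back ... */
          claims += 1;                     /* ... and re-claim it */
          pthread_mutex_unlock(&heap_lock);
      }
      return NULL;
  }

  int main(void)
  {
      pthread_t t;

      pthread_create(&t, NULL, allocator, NULL);
      for ( unsigned int i = 0; i < 10000000; i++ )
      {
          pthread_mutex_lock(&heap_lock);
          long unclaimed_free = avail - claims;
          pthread_mutex_unlock(&heap_lock);
          if ( unclaimed_free < 0 )
          {
              printf("negative unclaimed free memory: %ld\n", unclaimed_free);
              break;
          }
      }
      stop = true;
      pthread_join(t, NULL);
      return 0;
  }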

Fixes: 65c9792df600 ("mmu: Introduce XENMEM_claim_pages (subop of memory ops)")
Signed-off-by: Roger Pau MonnĂ© <[email protected]>
---
Changes since v1:
 - Regain the claim if the allocated page cannot be assigned to the domain.
 - Adjust comments regarding d->outstanding_pages being protected by the
   heap_lock (instead of the d->page_alloc_lock).
---
 xen/common/page_alloc.c | 88 ++++++++++++++++++++++-------------------
 xen/include/xen/sched.h |  3 +-
 2 files changed, 50 insertions(+), 41 deletions(-)

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 1f67b88a8933..b73f6e264514 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -515,30 +515,6 @@ unsigned long domain_adjust_tot_pages(struct domain *d, long pages)
     ASSERT(rspin_is_locked(&d->page_alloc_lock));
     d->tot_pages += pages;
 
-    /*
-     * can test d->outstanding_pages race-free because it can only change
-     * if d->page_alloc_lock and heap_lock are both held, see also
-     * domain_set_outstanding_pages below
-     */
-    if ( !d->outstanding_pages || pages <= 0 )
-        goto out;
-
-    spin_lock(&heap_lock);
-    BUG_ON(outstanding_claims < d->outstanding_pages);
-    if ( d->outstanding_pages < pages )
-    {
-        /* `pages` exceeds the domain's outstanding count. Zero it out. */
-        outstanding_claims -= d->outstanding_pages;
-        d->outstanding_pages = 0;
-    }
-    else
-    {
-        outstanding_claims -= pages;
-        d->outstanding_pages -= pages;
-    }
-    spin_unlock(&heap_lock);
-
-out:
     return d->tot_pages;
 }
 
@@ -548,9 +524,10 @@ int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
     unsigned long claim, avail_pages;
 
     /*
-     * take the domain's page_alloc_lock, else all d->tot_page adjustments
-     * must always take the global heap_lock rather than only in the much
-     * rarer case that d->outstanding_pages is non-zero
+     * Two locks are needed here:
+     *  - d->page_alloc_lock: protects accesses to d->{tot,max,extra}_pages.
+     *  - heap_lock: protects accesses to d->outstanding_pages, total_avail_pages
+     *    and outstanding_claims.
      */
     nrspin_lock(&d->page_alloc_lock);
     spin_lock(&heap_lock);
@@ -995,11 +972,11 @@ static void init_free_page_fields(struct page_info *pg)
 static struct page_info *alloc_heap_pages(
     unsigned int zone_lo, unsigned int zone_hi,
     unsigned int order, unsigned int memflags,
-    struct domain *d)
+    struct domain *d, unsigned int *claimed)
 {
     nodeid_t node;
     unsigned int i, buddy_order, zone, first_dirty;
-    unsigned long request = 1UL << order;
+    unsigned int request = 1UL << order;
     struct page_info *pg;
     bool need_tlbflush = false;
     uint32_t tlbflush_timestamp = 0;
@@ -1012,6 +989,7 @@ static struct page_info *alloc_heap_pages(
     ASSERT(zone_lo <= zone_hi);
     ASSERT(zone_hi < NR_ZONES);
 
+    BUILD_BUG_ON(MAX_ORDER >= sizeof(request) * 8);
     if ( unlikely(order > MAX_ORDER) )
         return NULL;
 
@@ -1071,6 +1049,28 @@ static struct page_info *alloc_heap_pages(
     total_avail_pages -= request;
     ASSERT(total_avail_pages >= 0);
 
+    if ( d && d->outstanding_pages && !(memflags & MEMF_no_refcount) )
+    {
+        /*
+         * Adjust claims in the same locked region where total_avail_pages is
+         * adjusted; not doing so would lead to a window where the amount of
+         * free memory (avail - claimed) would be incorrect.
+         *
+         * Note that by adjusting the claimed amount here it's possible for
+         * pages to fail to be assigned to the claiming domain while already
+         * having been subtracted from d->outstanding_pages.  Callers that
+         * fail to assign the pages must pass the claimed amount back to
+         * free_heap_pages(), or the claim would otherwise be lost.
+         */
+        unsigned int outstanding = min(d->outstanding_pages, request);
+
+        BUG_ON(outstanding > outstanding_claims);
+        outstanding_claims -= outstanding;
+        d->outstanding_pages -= outstanding;
+        ASSERT(claimed);
+        *claimed = outstanding;
+    }
+
     check_low_mem_virq();
 
     if ( d != NULL )
@@ -1518,7 +1518,8 @@ static void free_color_heap_page(struct page_info *pg, bool need_scrub);
 
 /* Free 2^@order set of pages. */
 static void free_heap_pages(
-    struct page_info *pg, unsigned int order, bool need_scrub)
+    struct domain *d, struct page_info *pg, unsigned int order, bool need_scrub,
+    unsigned int claim)
 {
     unsigned long mask;
     mfn_t mfn = page_to_mfn(pg);
@@ -1553,6 +1554,12 @@ static void free_heap_pages(
 
     avail[node][zone] += 1 << order;
     total_avail_pages += 1 << order;
+    if ( d && claim )
+    {
+        outstanding_claims += claim;
+        d->outstanding_pages += claim;
+    }
+
     if ( need_scrub )
     {
         node_need_scrub[node] += 1 << order;
@@ -1842,7 +1849,7 @@ int online_page(mfn_t mfn, uint32_t *status)
     spin_unlock(&heap_lock);
 
     if ( (y & PGC_state) == PGC_state_offlined )
-        free_heap_pages(pg, 0, false);
+        free_heap_pages(NULL, pg, 0, false, 0);
 
     return ret;
 }
@@ -1918,7 +1925,7 @@ static void _init_heap_pages(const struct page_info *pg,
 
         if ( s )
             inc_order = min(inc_order, ffsl(s) - 1U);
-        free_heap_pages(mfn_to_page(_mfn(s)), inc_order, need_scrub);
+        free_heap_pages(NULL, mfn_to_page(_mfn(s)), inc_order, need_scrub, 0);
         s += (1UL << inc_order);
     }
 }
@@ -2452,7 +2459,7 @@ void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
     ASSERT_ALLOC_CONTEXT();
 
     pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
-                          order, memflags | MEMF_no_scrub, NULL);
+                          order, memflags | MEMF_no_scrub, NULL, NULL);
     if ( unlikely(pg == NULL) )
         return NULL;
 
@@ -2467,7 +2474,7 @@ void free_xenheap_pages(void *v, unsigned int order)
     if ( v == NULL )
         return;
 
-    free_heap_pages(virt_to_page(v), order, false);
+    free_heap_pages(NULL, virt_to_page(v), order, false, 0);
 }
 
 #else  /* !CONFIG_SEPARATE_XENHEAP */
@@ -2523,7 +2530,7 @@ void free_xenheap_pages(void *v, unsigned int order)
     for ( i = 0; i < (1u << order); i++ )
         pg[i].count_info &= ~PGC_xen_heap;
 
-    free_heap_pages(pg, order, true);
+    free_heap_pages(NULL, pg, order, true, 0);
 }
 
 #endif  /* CONFIG_SEPARATE_XENHEAP */
@@ -2656,7 +2663,7 @@ struct page_info *alloc_domheap_pages(
 {
     struct page_info *pg = NULL;
     unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
-    unsigned int dma_zone;
+    unsigned int dma_zone, claimed = 0;
 
     ASSERT_ALLOC_CONTEXT();
 
@@ -2679,12 +2686,13 @@ struct page_info *alloc_domheap_pages(
     else if ( !dma_bitsize )
         memflags &= ~MEMF_no_dma;
     else if ( (dma_zone = bits_to_zone(dma_bitsize)) < zone_hi )
-        pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
+        pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d,
+                              &claimed);
 
     if ( (pg == NULL) &&
          ((memflags & MEMF_no_dma) ||
           ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
-                                  memflags, d)) == NULL)) )
+                                  memflags, d, &claimed)) == NULL)) )
          return NULL;
 
     if ( d && !(memflags & MEMF_no_owner) )
@@ -2701,7 +2709,7 @@ struct page_info *alloc_domheap_pages(
         }
         if ( assign_page(pg, order, d, memflags) )
         {
-            free_heap_pages(pg, order, memflags & MEMF_no_scrub);
+            free_heap_pages(d, pg, order, memflags & MEMF_no_scrub, claimed);
             return NULL;
         }
     }
@@ -2786,7 +2794,7 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
             scrub = 1;
         }
 
-        free_heap_pages(pg, order, scrub);
+        free_heap_pages(NULL, pg, order, scrub, 0);
     }
 
     if ( drop_dom_ref )
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 1f77e0869b5d..d922c908c29f 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -413,7 +413,8 @@ struct domain
     unsigned int     tot_pages;
 
     unsigned int     xenheap_pages;     /* pages allocated from Xen heap */
-    unsigned int     outstanding_pages; /* pages claimed but not possessed */
+    /* Pages claimed but not possessed, protected by global heap_lock. */
+    unsigned int     outstanding_pages;
     unsigned int     max_pages;         /* maximum value for domain_tot_pages() */
     unsigned int     extra_pages;       /* pages not included in domain_tot_pages() */
 
-- 
2.51.0

