The current logic splits the update of the amount of available memory in
the system (total_avail_pages) and pending claims into two separately
locked regions. This leads to a window between the two counter
adjustments where the result of total_avail_pages - outstanding_claims
doesn't reflect the real amount of free memory available, and can even
yield a negative value because total_avail_pages is updated ahead of
outstanding_claims.
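
For illustration only, here is a toy, single-threaded model (not Xen
code) of the old ordering, showing the transiently negative "free"
value an observer could compute between the two locked updates:

    /* toy model of the old counter update ordering */
    #include <stdio.h>

    static long total_avail_pages = 100;   /* pages free in the system  */
    static long outstanding_claims = 100;  /* pages promised to domains */

    int main(void)
    {
        long request = 32;

        /* First locked region: the allocation path updates avail. */
        total_avail_pages -= request;

        /* Window: an observer now computes a bogus (negative) value. */
        printf("free = %ld\n", total_avail_pages - outstanding_claims);

        /* Second locked region: only now are the claims reduced. */
        outstanding_claims -= request;
        printf("free = %ld\n", total_avail_pages - outstanding_claims);

        return 0;
    }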
Fix by adjusting outstanding_claims and d->outstanding_pages in the same
place where total_avail_pages is updated. Note that accesses to
d->outstanding_pages are protected by the global heap_lock, just like
total_avail_pages and outstanding_claims. Add a comment to the field
declaration, and also adjust the comment at the top of
domain_set_outstanding_pages() to be clearer in that regard.
Finally, since the claim is now adjusted ahead of the pages being
assigned to the domain, add logic to regain the claim in case
assign_page() fails. Otherwise the pages would be freed and the claimed
amount lost.
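
For reference, the resulting error-path flow in alloc_domheap_pages(),
as a simplified (non-compilable on its own) extract of the hunks below:

    unsigned int claimed = 0;

    /* alloc_heap_pages() now reports how much of the claim it consumed. */
    pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d, &claimed);
    ...
    if ( assign_page(pg, order, d, memflags) )
    {
        /* Assignment failed: free the pages and regain the consumed claim. */
        free_heap_pages(d, pg, order, memflags & MEMF_no_scrub, claimed);
        return NULL;
    }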
Fixes: 65c9792df600 ("mmu: Introduce XENMEM_claim_pages (subop of memory ops)")
Signed-off-by: Roger Pau Monné <[email protected]>
---
Changes since v1:
- Regain the claim if allocated page cannot be assigned to the domain.
- Adjust comments regarding d->outstanding_pages being protected by the
heap_lock (instead of the d->page_alloc_lock).
---
xen/common/page_alloc.c | 88 ++++++++++++++++++++++-------------------
xen/include/xen/sched.h | 3 +-
2 files changed, 50 insertions(+), 41 deletions(-)
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 1f67b88a8933..b73f6e264514 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -515,30 +515,6 @@ unsigned long domain_adjust_tot_pages(struct domain *d, long pages)
ASSERT(rspin_is_locked(&d->page_alloc_lock));
d->tot_pages += pages;
- /*
- * can test d->outstanding_pages race-free because it can only change
- * if d->page_alloc_lock and heap_lock are both held, see also
- * domain_set_outstanding_pages below
- */
- if ( !d->outstanding_pages || pages <= 0 )
- goto out;
-
- spin_lock(&heap_lock);
- BUG_ON(outstanding_claims < d->outstanding_pages);
- if ( d->outstanding_pages < pages )
- {
- /* `pages` exceeds the domain's outstanding count. Zero it out. */
- outstanding_claims -= d->outstanding_pages;
- d->outstanding_pages = 0;
- }
- else
- {
- outstanding_claims -= pages;
- d->outstanding_pages -= pages;
- }
- spin_unlock(&heap_lock);
-
-out:
return d->tot_pages;
}
@@ -548,9 +524,10 @@ int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
unsigned long claim, avail_pages;
/*
- * take the domain's page_alloc_lock, else all d->tot_page adjustments
- * must always take the global heap_lock rather than only in the much
- * rarer case that d->outstanding_pages is non-zero
+ * Two locks are needed here:
+ * - d->page_alloc_lock: protects accesses to d->{tot,max,extra}_pages.
+ * - heap_lock: protects accesses to d->outstanding_pages, total_avail_pages
+ *   and outstanding_claims.
*/
nrspin_lock(&d->page_alloc_lock);
spin_lock(&heap_lock);
@@ -995,11 +972,11 @@ static void init_free_page_fields(struct page_info *pg)
static struct page_info *alloc_heap_pages(
unsigned int zone_lo, unsigned int zone_hi,
unsigned int order, unsigned int memflags,
- struct domain *d)
+ struct domain *d, unsigned int *claimed)
{
nodeid_t node;
unsigned int i, buddy_order, zone, first_dirty;
- unsigned long request = 1UL << order;
+ unsigned int request = 1UL << order;
struct page_info *pg;
bool need_tlbflush = false;
uint32_t tlbflush_timestamp = 0;
@@ -1012,6 +989,7 @@ static struct page_info *alloc_heap_pages(
ASSERT(zone_lo <= zone_hi);
ASSERT(zone_hi < NR_ZONES);
+ BUILD_BUG_ON(MAX_ORDER >= sizeof(request) * 8);
if ( unlikely(order > MAX_ORDER) )
return NULL;
@@ -1071,6 +1049,28 @@ static struct page_info *alloc_heap_pages(
total_avail_pages -= request;
ASSERT(total_avail_pages >= 0);
+ if ( d && d->outstanding_pages && !(memflags & MEMF_no_refcount) )
+ {
+ /*
+ * Adjust claims in the same locked region where total_avail_pages is
+ * adjusted; not doing so would lead to a window where the amount of
+ * free memory (avail - claimed) would be incorrect.
+ *
+ * Note that by adjusting the claimed amount here it's possible for
+ * pages to fail to be assigned to the claiming domain while already
+ * having been subtracted from d->outstanding_pages. Such claimed
+ * amount is then lost, as the pages that fail to be assigned to the
+ * domain are freed without replenishing the claim.
+ */
+ unsigned int outstanding = min(d->outstanding_pages, request);
+
+ BUG_ON(outstanding > outstanding_claims);
+ outstanding_claims -= outstanding;
+ d->outstanding_pages -= outstanding;
+ ASSERT(claimed);
+ *claimed = outstanding;
+ }
+
check_low_mem_virq();
if ( d != NULL )
@@ -1518,7 +1518,8 @@ static void free_color_heap_page(struct page_info *pg, bool need_scrub);
/* Free 2^@order set of pages. */
static void free_heap_pages(
- struct page_info *pg, unsigned int order, bool need_scrub)
+ struct domain *d, struct page_info *pg, unsigned int order, bool need_scrub,
+ unsigned int claim)
{
unsigned long mask;
mfn_t mfn = page_to_mfn(pg);
@@ -1553,6 +1554,12 @@ static void free_heap_pages(
avail[node][zone] += 1 << order;
total_avail_pages += 1 << order;
+ if ( d && claim )
+ {
+ outstanding_claims += claim;
+ d->outstanding_pages += claim;
+ }
+
if ( need_scrub )
{
node_need_scrub[node] += 1 << order;
@@ -1842,7 +1849,7 @@ int online_page(mfn_t mfn, uint32_t *status)
spin_unlock(&heap_lock);
if ( (y & PGC_state) == PGC_state_offlined )
- free_heap_pages(pg, 0, false);
+ free_heap_pages(NULL, pg, 0, false, 0);
return ret;
}
@@ -1918,7 +1925,7 @@ static void _init_heap_pages(const struct page_info *pg,
if ( s )
inc_order = min(inc_order, ffsl(s) - 1U);
- free_heap_pages(mfn_to_page(_mfn(s)), inc_order, need_scrub);
+ free_heap_pages(NULL, mfn_to_page(_mfn(s)), inc_order, need_scrub, 0);
s += (1UL << inc_order);
}
}
@@ -2452,7 +2459,7 @@ void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
ASSERT_ALLOC_CONTEXT();
pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
- order, memflags | MEMF_no_scrub, NULL);
+ order, memflags | MEMF_no_scrub, NULL, NULL);
if ( unlikely(pg == NULL) )
return NULL;
@@ -2467,7 +2474,7 @@ void free_xenheap_pages(void *v, unsigned int order)
if ( v == NULL )
return;
- free_heap_pages(virt_to_page(v), order, false);
+ free_heap_pages(NULL, virt_to_page(v), order, false, 0);
}
#else /* !CONFIG_SEPARATE_XENHEAP */
@@ -2523,7 +2530,7 @@ void free_xenheap_pages(void *v, unsigned int order)
for ( i = 0; i < (1u << order); i++ )
pg[i].count_info &= ~PGC_xen_heap;
- free_heap_pages(pg, order, true);
+ free_heap_pages(NULL, pg, order, true, 0);
}
#endif /* CONFIG_SEPARATE_XENHEAP */
@@ -2656,7 +2663,7 @@ struct page_info *alloc_domheap_pages(
{
struct page_info *pg = NULL;
unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
- unsigned int dma_zone;
+ unsigned int dma_zone, claimed = 0;
ASSERT_ALLOC_CONTEXT();
@@ -2679,12 +2686,13 @@ struct page_info *alloc_domheap_pages(
else if ( !dma_bitsize )
memflags &= ~MEMF_no_dma;
else if ( (dma_zone = bits_to_zone(dma_bitsize)) < zone_hi )
- pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
+ pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d,
+ &claimed);
if ( (pg == NULL) &&
((memflags & MEMF_no_dma) ||
((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
- memflags, d)) == NULL)) )
+ memflags, d, &claimed)) == NULL)) )
return NULL;
if ( d && !(memflags & MEMF_no_owner) )
@@ -2701,7 +2709,7 @@ struct page_info *alloc_domheap_pages(
}
if ( assign_page(pg, order, d, memflags) )
{
- free_heap_pages(pg, order, memflags & MEMF_no_scrub);
+ free_heap_pages(d, pg, order, memflags & MEMF_no_scrub, claimed);
return NULL;
}
}
@@ -2786,7 +2794,7 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
scrub = 1;
}
- free_heap_pages(pg, order, scrub);
+ free_heap_pages(NULL, pg, order, scrub, 0);
}
if ( drop_dom_ref )
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 1f77e0869b5d..d922c908c29f 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -413,7 +413,8 @@ struct domain
unsigned int tot_pages;
unsigned int xenheap_pages; /* pages allocated from Xen heap */
- unsigned int outstanding_pages; /* pages claimed but not possessed */
+ /* Pages claimed but not possessed, protected by global heap_lock. */
+ unsigned int outstanding_pages;
unsigned int max_pages; /* maximum value for domain_tot_pages() */
unsigned int extra_pages; /* pages not included in domain_tot_pages() */
--
2.51.0