The pin_user_pages() + page_maybe_dma_pinned() infrastructure is a
framework for tackling the kernel's struggles with gup+DMA.
DAX presents a unique flavor of the gup+DMA problem since pinned pages
are identical to physical filesystem blocks. Unlike the page-cache case,
a mapping of a file can not be truncated while DMA is in-flight because
the DMA must complete before the filesystem block is reclaimed.
DAX has a homegrown solution to this problem based on watching the
page->_refcount go idle. Beyond being awkward to catch that idle transition
in put_page(), it is overkill when only the page_maybe_dma_pinned()
transition needs to be captured.
Move the wakeup of filesystem-DAX truncate paths
({ext4,xfs,fuse_dax}_break_layouts()) to unpin_user_pages() with a new
wakeup_fsdax_pin_waiters() helper, and use !page_maybe_dma_pinned() as
the wake condition.
Cc: Jan Kara <[email protected]>
Cc: "Darrick J. Wong" <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: John Hubbard <[email protected]>
Reported-by: Jason Gunthorpe <[email protected]>
Reported-by: Matthew Wilcox <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
---
fs/dax.c | 4 ++--
fs/ext4/inode.c | 7 +++----
fs/fuse/dax.c | 6 +++---
fs/xfs/xfs_file.c | 6 +++---
include/linux/mm.h | 28 ++++++++++++++++++++++++++++
mm/gup.c | 6 ++++--
6 files changed, 43 insertions(+), 14 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 0f22f7b46de0..aceb587bc27e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -395,7 +395,7 @@ static void dax_disassociate_entry(void *entry, struct
address_space *mapping,
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
- WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
+ WARN_ON_ONCE(trunc && page_maybe_dma_pinned(page));
if (dax_mapping_is_cow(page->mapping)) {
/* keep the CoW flag if this page is still shared */
if (page->index-- > 0)
@@ -414,7 +414,7 @@ static struct page *dax_pinned_page(void *entry)
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
- if (page_ref_count(page) > 1)
+ if (page_maybe_dma_pinned(page))
return page;
}
return NULL;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bf49bf506965..5e68e64f155a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3961,10 +3961,9 @@ int ext4_break_layouts(struct inode *inode)
if (!page)
return 0;
- error = ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1,
- TASK_INTERRUPTIBLE, 0, 0,
- ext4_wait_dax_page(inode));
+ error = ___wait_var_event(page, !page_maybe_dma_pinned(page),
+ TASK_INTERRUPTIBLE, 0, 0,
+ ext4_wait_dax_page(inode));
} while (error == 0);
return error;
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index e0b846f16bc5..6419ca420c42 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -676,9 +676,9 @@ static int __fuse_dax_break_layouts(struct inode *inode,
bool *retry,
return 0;
*retry = true;
- return ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
- 0, 0, fuse_wait_dax_page(inode));
+ return ___wait_var_event(page, !page_maybe_dma_pinned(page),
+ TASK_INTERRUPTIBLE, 0, 0,
+ fuse_wait_dax_page(inode));
}
/* dmap_end == 0 leads to unmapping of whole file */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 954bb6e83796..dbffb9481b71 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -827,9 +827,9 @@ xfs_break_dax_layouts(
return 0;
*retry = true;
- return ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
- 0, 0, xfs_wait_dax_page(inode));
+ return ___wait_var_event(page, !page_maybe_dma_pinned(page),
+ TASK_INTERRUPTIBLE, 0, 0,
+ xfs_wait_dax_page(inode));
}
int
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3bedc449c14d..557d5447ebec 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1517,6 +1517,34 @@ static inline bool page_maybe_dma_pinned(struct page
*page)
return folio_maybe_dma_pinned(page_folio(page));
}
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
+/*
+ * Unlike typical file backed pages that support truncating a page from
+ * a file while it is under active DMA, DAX pages need to hold off
+ * truncate operations until transient page pins are released.
+ *
+ * The filesystem (via dax_layout_pinned_page()) takes steps to make
+ * sure that any observation of the !page_maybe_dma_pinned() state is
+ * stable until the truncation completes.
+ */
+static inline void wakeup_fsdax_pin_waiters(struct folio *folio)
+{
+ struct page *page = &folio->page;
+
+ if (!folio_is_zone_device(folio))
+ return;
+ if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
+ return;
+ if (folio_maybe_dma_pinned(folio))
+ return;
+ wake_up_var(page);
+}
+#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
+static inline void wakeup_fsdax_pin_waiters(struct folio *folio)
+{
+}
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
+
/*
* This should most likely only be called during fork() to see whether we
* should break the cow immediately for an anon page on the src mm.
diff --git a/mm/gup.c b/mm/gup.c
index 732825157430..499c46296fda 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -177,8 +177,10 @@ static void gup_put_folio(struct folio *folio, int refs,
unsigned int flags)
refs *= GUP_PIN_COUNTING_BIAS;
}
- if (!put_devmap_managed_page_refs(&folio->page, refs))
- folio_put_refs(folio, refs);
+ folio_put_refs(folio, refs);
+
+ if (flags & FOLL_PIN)
+ wakeup_fsdax_pin_waiters(folio);
}
/**