Several of the subtleties and assumptions of the DAX fsync/msync
implementation are not immediately obvious, so document them with comments.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reported-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/dax.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/fs/dax.c b/fs/dax.c
index d589113..ab2faa9 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -335,6 +335,7 @@ static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
        int type, error = 0;
        void *entry;
 
+       WARN_ON_ONCE(pmd_entry && !dirty);
        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
        spin_lock_irq(&mapping->tree_lock);
@@ -350,6 +351,13 @@ static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
 
                if (!pmd_entry || type == RADIX_DAX_PMD)
                        goto dirty;
+
+               /*
+                * We only insert dirty PMD entries into the radix tree.  This
+                * means we don't need to worry about removing a dirty PTE
+                * entry and inserting a clean PMD entry, thus reducing the
+                * range we would flush with a follow-up fsync/msync call.
+                */
                radix_tree_delete(&mapping->page_tree, index);
                mapping->nrexceptional--;
        }
@@ -912,6 +920,21 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
                }
                dax_unmap_atomic(bdev, &dax);
 
+               /*
+                * For PTE faults we insert a radix tree entry for reads, and
+                * leave it clean.  Then on the first write we dirty the radix
+                * tree entry via the dax_pfn_mkwrite() path.  This sequence
+                * allows the dax_pfn_mkwrite() call to be simpler and avoid a
+                * call into get_block() to translate the pgoff to a sector in
+                * order to be able to create a new radix tree entry.
+                *
+                * The PMD path doesn't have an equivalent to
+                * dax_pfn_mkwrite(), though, so for a read followed by a
+                * write we traverse all the way through __dax_pmd_fault()
+                * twice.  This means we can just skip inserting a radix tree
+                * entry completely on the initial read and just wait until
+                * the write to insert a dirty entry.
+                */
                if (write) {
                        error = dax_radix_entry(mapping, pgoff, dax.sector,
                                        true, true);
@@ -985,6 +1008,14 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct file *file = vma->vm_file;
 
+       /*
+        * We pass NO_SECTOR to dax_radix_entry() because we expect that a
+        * RADIX_DAX_PTE entry already exists in the radix tree from a
+        * previous call to __dax_fault().  We just want to look up that PTE
+        * entry using vmf->pgoff and make sure the dirty tag is set.  This
+        * saves us from having to make a call to get_block() here to look
+        * up the sector.
+        */
        dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
        return VM_FAULT_NOPAGE;
 }
-- 
2.5.0

Reply via email to