bcachefs currently populates fiemap data from the extents btree.
This works correctly when the fiemap sync flag is provided, but if
not, it skips all delalloc extents that have not yet been flushed.
This is because delalloc extents from buffered writes are first
stored as a reservation in the pagecache, and only become resident
in the extents btree after writeback completes.

Update the fiemap implementation to process holes between extents by
scanning pagecache for data, via seek data/hole. If a valid data
range is found over a hole in the extent btree, fake up an extent
key and flag the extent as delalloc for reporting to userspace.
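
The pagecache scan uses the bch2_seek_pagecache_data() and
bch2_seek_pagecache_hole() helpers, which follow the familiar
SEEK_DATA/SEEK_HOLE bracketing pattern: seek to the next data offset,
then to the next hole, and treat everything in between as a data
range. As a rough userspace analogy only (not the kernel code in this
patch):

  #define _GNU_SOURCE             /* SEEK_DATA/SEEK_HOLE */
  #include <stdio.h>
  #include <unistd.h>

  /* Walk [start, end) and print each data segment found in the file. */
  static void walk_data_ranges(int fd, off_t start, off_t end)
  {
          while (start < end) {
                  off_t dstart, dend;

                  dstart = lseek(fd, start, SEEK_DATA);
                  if (dstart < 0 || dstart >= end)
                          break;                  /* no more data in range */

                  dend = lseek(fd, dstart, SEEK_HOLE);
                  if (dend < 0 || dend > end)
                          dend = end;

                  printf("data: %lld..%lld\n",
                         (long long)dstart, (long long)dend);
                  start = dend;
          }
  }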

Note that this does not necessarily change behavior for the case
where there is dirty pagecache over already written extents: when in
COW mode, writeback will allocate new blocks for the underlying
ranges. The existing behavior is consistent with btrfs, and it is
recommended to use the sync flag for the most up to date extent
state from fiemap.

Signed-off-by: Brian Foster <bfos...@redhat.com>
---
 fs/bcachefs/fs.c | 87 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 79 insertions(+), 8 deletions(-)

diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 9f14eb256c09..77a8c826b637 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -993,6 +993,51 @@ static int bch2_fiemap_extent(struct btree_trans *trans,
        return 0;
 }
 
+/*
+ * Scan a range of pagecache that corresponds to a file mapping hole in the
+ * extent btree. If found, fake up an extent key so it looks like a delalloc
+ * extent to the rest of the fiemap processing code.
+ *
+ * Returns 0 if cached data was found, -ENOENT if not, or -EAGAIN for folio
+ * trylock failure.
+ */
+static int
+bch2_fiemap_hole(struct inode *vinode, u64 start, u64 end,
+                struct bch_fiemap_extent *cur)
+{
+       struct bch_fs           *c = vinode->i_sb->s_fs_info;
+       struct bch_inode_info   *ei = to_bch_ei(vinode);
+       struct bkey_i_extent    *delextent;
+       struct bch_extent_ptr   ptr = {};
+       loff_t                  dstart, dend;
+
+       /*
+        * We hold btree node locks here so we cannot block on folio locks.
+        * Propagate -EAGAIN errors so the caller can retry.
+        */
+       dstart = bch2_seek_pagecache_data(vinode, start, end, 0, true);
+       if (dstart < 0)
+               return dstart;
+       if (dstart >= end)
+               return -ENOENT;
+       dend = bch2_seek_pagecache_hole(vinode, dstart, end, 0, true);
+
+       /*
+        * Create a fake extent key in the buffer. We have to add a dummy extent
+        * pointer for the fill code to add an extent entry. It's explicitly
+        * zeroed to reflect delayed allocation (i.e. phys offset 0).
+        */
+       bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
+       delextent = bkey_extent_init(cur->kbuf.k);
+       delextent->k.p = POS(ei->v.i_ino, dstart >> 9);
+       bch2_key_resize(&delextent->k, (dend - dstart) >> 9);
+       bch2_bkey_append_ptr(&delextent->k_i, ptr);
+
+       cur->flags = FIEMAP_EXTENT_DELALLOC;
+
+       return 0;
+}
+
 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
                       u64 start, u64 len)
 {
@@ -1032,16 +1077,41 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        while (!(ret = btree_trans_too_many_iters(trans)) &&
               (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
               !(ret = bkey_err(k))) {
+               bool have_delalloc = false;
 
-               if (!bkey_extent_is_data(k.k) &&
-                   k.k->type != KEY_TYPE_reservation) {
-                       bch2_btree_iter_advance(&iter);
-                       continue;
+               /*
+                * If a hole exists before the start of the extent key, scan the
+                * range for pagecache data that might be pending writeback and
+                * thus not yet exist in the extent tree. Note that this can
+                * return -EAGAIN to request to retry folio locks.
+                */
+               if (iter.pos.offset > start) {
+                       ret = bch2_fiemap_hole(vinode, start << 9,
+                                              iter.pos.offset << 9, &cur);
+                       if (!ret)
+                               have_delalloc = true;
+                       else if (ret != -ENOENT)
+                               break;
                }
 
-               ret = bch2_fiemap_extent(trans, &iter, k, &cur);
-               if (ret)
-                       break;
+                /* process the current key if there's no delalloc to report */
+               if (!have_delalloc) {
+                       if (!bkey_extent_is_data(k.k) &&
+                           k.k->type != KEY_TYPE_reservation) {
+                               start = bkey_start_offset(k.k) + k.k->size;
+                               bch2_btree_iter_advance(&iter);
+                               continue;
+                       }
+
+                       ret = bch2_fiemap_extent(trans, &iter, k, &cur);
+                       if (ret)
+                               break;
+               }
+
+               /*
+                * Store the current extent in prev so we can flag the last
+                * extent on the way out.
+                */
                bch2_bkey_buf_realloc(&prev.kbuf, c, cur.kbuf.k->k.u64s);
                start = cur.kbuf.k->k.p.offset;
 
@@ -1061,7 +1131,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        start = iter.pos.offset;
        bch2_trans_iter_exit(trans, &iter);
 err:
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+       if (ret == -EAGAIN ||
+           bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        if (!ret && have_extent) {
-- 
2.44.0