bcachefs currently populates fiemap data from the extents btree. This works correctly when the fiemap sync flag is provided, but if it is not, all delalloc extents that have not yet been flushed are skipped. This is because delalloc extents from buffered writes are first stored as a reservation in the pagecache, and only become resident in the extents btree after writeback completes.
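
A minimal reproducer sketch of the current behavior (the file path is
made up, error handling is omitted, and the expected extent counts in
the comments are assumptions about before/after behavior):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

static void report(int fd, __u32 flags)
{
	struct fiemap *fm;

	fm = calloc(1, sizeof(*fm) + 16 * sizeof(struct fiemap_extent));
	fm->fm_length = FIEMAP_MAX_OFFSET;
	fm->fm_flags = flags;
	fm->fm_extent_count = 16;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
		printf("flags=0x%x mapped=%u first_flags=0x%x\n",
		       flags, fm->fm_mapped_extents,
		       fm->fm_mapped_extents ? fm->fm_extents[0].fe_flags : 0);
	free(fm);
}

int main(void)
{
	char buf[65536];
	/* hypothetical file on a bcachefs mount */
	int fd = open("/mnt/test/file", O_CREAT | O_TRUNC | O_RDWR, 0644);

	memset(buf, 0xab, sizeof(buf));
	write(fd, buf, sizeof(buf));	/* buffered write -> delalloc */
	report(fd, 0);			/* 0 extents before this patch; 1 extent
					 * with FIEMAP_EXTENT_DELALLOC after */
	report(fd, FIEMAP_FLAG_SYNC);	/* flushes first; 1 allocated extent */
	close(fd);
	return 0;
}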
Update the fiemap implementation to process holes between extents by scanning the pagecache for data via seek data/hole. If a valid data range is found over a hole in the extents btree, fake up an extent key and flag the extent as delalloc for reporting to userspace.

Note that this does not necessarily change behavior for the case where there is dirty pagecache over already-written extents, where writeback in COW mode will allocate new blocks for the underlying ranges. The existing behavior is consistent with btrfs, and it is recommended to use the sync flag for the most up-to-date extent state from fiemap.

Signed-off-by: Brian Foster <bfos...@redhat.com>
---
 fs/bcachefs/fs.c | 64 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 57 insertions(+), 7 deletions(-)

diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 1364f491af87..2e525735dd73 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -923,6 +923,41 @@ static int bch2_fiemap_extent(struct btree_trans *trans,
 	return 0;
 }
 
+/*
+ * Scan a range of pagecache that corresponds to a file mapping hole in the
+ * extent btree. If found, fake up an extent key so it looks like a delalloc
+ * extent to the rest of the fiemap processing code.
+ */
+static bool
+bch2_fiemap_hole(struct inode *vinode, u64 start, u64 end,
+		 struct bch_fiemap_extent *cur)
+{
+	struct bch_fs *c = vinode->i_sb->s_fs_info;
+	struct bch_inode_info *ei = to_bch_ei(vinode);
+	struct bkey_i_extent *delextent;
+	struct bch_extent_ptr ptr = {};
+
+	start = bch2_seek_pagecache_data(vinode, start, end, 0, false);
+	if (start >= end)
+		return false;
+	end = bch2_seek_pagecache_hole(vinode, start, end, 0, false);
+
+	/*
+	 * Create a fake extent key in the buffer. We have to add a dummy extent
+	 * pointer for the fill code to add an extent entry. It's explicitly
+	 * zeroed to reflect delayed allocation (i.e. phys offset 0).
+	 */
+	bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
+	delextent = bkey_extent_init(cur->kbuf.k);
+	delextent->k.p = POS(ei->v.i_ino, start >> 9);
+	bch2_key_resize(&delextent->k, (end - start) >> 9);
+	bch2_bkey_append_ptr(&delextent->k_i, ptr);
+
+	cur->flags = FIEMAP_EXTENT_DELALLOC;
+
+	return true;
+}
+
 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 		       u64 start, u64 len)
 {
@@ -962,16 +997,31 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 	while (!(ret = btree_trans_too_many_iters(trans)) &&
 	       (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
 	       !(ret = bkey_err(k))) {
+		/*
+		 * If a hole exists before the start of the next key, scan that
+		 * range in pagecache for delalloc data that might be pending
+		 * writeback and report it as delalloc. Otherwise, process the
+		 * key.
+		 */
+		if (iter.pos.offset <= start ||
+		    !bch2_fiemap_hole(vinode, start << 9, iter.pos.offset << 9,
+				      &cur)) {
+			if (!bkey_extent_is_data(k.k) &&
+			    k.k->type != KEY_TYPE_reservation) {
+				start = bkey_start_offset(k.k) + k.k->size;
+				bch2_btree_iter_advance(&iter);
+				continue;
+			}
 
-		if (!bkey_extent_is_data(k.k) &&
-		    k.k->type != KEY_TYPE_reservation) {
-			bch2_btree_iter_advance(&iter);
-			continue;
+			ret = bch2_fiemap_extent(trans, &iter, k, &cur);
+			if (ret)
+				break;
 		}
 
-		ret = bch2_fiemap_extent(trans, &iter, k, &cur);
-		if (ret)
-			break;
+		/*
+		 * Store the current extent in prev so we can flag the last
+		 * extent on the way out.
+		 */
 		bch2_bkey_buf_realloc(&prev.kbuf, c, cur.kbuf.k->k.u64s);
 		start = cur.kbuf.k->k.p.offset;
-- 
2.42.0