In the subpagesize-blocksize scenario (block size smaller than PAGE_SIZE), the following command sequence (with a 4k PAGE_SIZE and a 2k block size) can cause incorrect accounting of the blocks of an ordered extent that have been written to disk:
$ xfs_io -f -c "pwrite 0 10240" \ -c "sync_range 0 4096" \ -c "sync_range 8192 2048" \ -c "pwrite 10240 2048" \ -c "sync_range 10240 2048" \ /mnt/btrfs/file.bin To fix this, we would have to explicitly track the blocks of an ordered extent that have already been submitted for write I/O. Signed-off-by: Chandan Rajendra <chan...@linux.vnet.ibm.com> --- fs/btrfs/extent_io.c | 24 ++++++++++++++++++++++-- fs/btrfs/ordered-data.c | 4 +++- fs/btrfs/ordered-data.h | 4 ++++ 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 168252e..3649c5d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3201,6 +3201,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, u64 extent_offset; u64 extent_end; u64 iosize; + u64 blk, nr_blks; + u64 blk_submitted; sector_t sector; struct extent_state *cached_state = NULL; struct block_device *bdev; @@ -3267,11 +3269,26 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, iosize = min(extent_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); + blk = extent_offset >> inode->i_sb->s_blocksize_bits; + nr_blks = iosize >> inode->i_sb->s_blocksize_bits; + + blk_submitted = find_next_bit(ordered->blocks_submitted, + ordered->len >> inode->i_sb->s_blocksize_bits, + blk); + if (blk_submitted < blk + nr_blks) { + if (blk_submitted == blk) { + cur += blocksize; + btrfs_put_ordered_extent(ordered); + continue; + } + iosize = (blk_submitted - blk) + << inode->i_sb->s_blocksize_bits; + nr_blks = iosize >> inode->i_sb->s_blocksize_bits; + } + sector = (ordered->start + extent_offset) >> 9; bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; compressed = test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags); - btrfs_put_ordered_extent(ordered); - ordered = NULL; /* * compressed and inline extents are written through other @@ -3284,6 +3301,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, */ 
nr++; cur += iosize; + btrfs_put_ordered_extent(ordered); continue; } @@ -3298,6 +3316,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, } else { unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1; + bitmap_set(ordered->blocks_submitted, blk, nr_blks); + btrfs_put_ordered_extent(ordered); set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { btrfs_err(BTRFS_I(inode)->root->fs_info, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 4d9832f..59b2544 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -199,13 +199,15 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, nr_longs = BITS_TO_LONGS(len >> inode->i_sb->s_blocksize_bits); if (nr_longs == 1) { entry->blocks_done = &entry->blocks_bitmap; + entry->blocks_submitted = &entry->blocks_submitted_bitmap; } else { - entry->blocks_done = kzalloc(nr_longs * sizeof(unsigned long), + entry->blocks_done = kzalloc(2 * nr_longs * sizeof(unsigned long), GFP_NOFS); if (!entry->blocks_done) { kmem_cache_free(btrfs_ordered_extent_cache, entry); return -ENOMEM; } + entry->blocks_submitted = entry->blocks_done + nr_longs; } entry->file_offset = file_offset; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 7de3b1e..851914c 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -139,6 +139,10 @@ struct btrfs_ordered_extent { /* bitmap to track the blocks that have been written to disk */ unsigned long *blocks_done; unsigned long blocks_bitmap; + + /* bitmap to track the blocks that have been submitted for write i/o */ + unsigned long *blocks_submitted; + unsigned long blocks_submitted_bitmap; }; /* -- 2.1.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html