Re: [PATCH RFC ver.B] btrfs: scrub: Don't use inode pages for device replace

Qu Wenruo Mon, 04 Jun 2018 22:34:45 -0700

Hi David,

It would be pretty nice if we could get this fix (or previous RFC patch)
to get into current release cycle.


As it's a unrecoverable data corruption, it would be better to get it
fixed as soon as possible.

Thanks,
Qu

On 2018年06月05日 12:36, Qu Wenruo wrote:
> [BUG]
> Btrfs can easily create compressed extent without checksum (even
> though it shouldn't), and if we then try to replace device containing
> such extent, the result device will contain all the uncompressed data
> instead of the compressed one.
> 
> Test case already submitted to fstests:
> https://patchwork.kernel.org/patch/10442353/
> 
> [CAUSE]
> When handling compressed extent without checksum, device replace will
> goes into copy_nocow_pages() function.
> 
> In that function, btrfs will get all inodes referring to this data
> extents and then use find_or_create_page() to get pages direct from that
> inode.
> 
> The problem here is, pages directly from inode are always uncompressed.
> And for compressed data extent, they mismatch with on-disk data.
> Thus this leads to corrupted compressed data extent written to replace
> device.
> 
> [FIX]
> In this attempt, we could just remove the "optimization" branch, and let
> unified scrub_pages() to handle it.
> 
> Although scrub_pages() won't bother reusing page cache, it will be a
> little slower, but it does the correct csum checking and won't cause
> such data corruption cause by "optimization".
> 
> Reported-by: James Harvey <jamespharve...@gmail.com>
> Signed-off-by: Qu Wenruo <w...@suse.com>
> ---
> Changelog:
> ver.B:
>     Instead of checking extent type in copy_nocow_pages() branch, just
>     remove them all, since existing scrub_pages() can handle nodatasum
>     case pretty well and it saves a lot of codes.
> ---
>  fs/btrfs/scrub.c | 341 -----------------------------------------------
>  1 file changed, 341 deletions(-)
> 
> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> index 52b39a0924e9..799e4c6b2551 100644
> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -277,13 +277,6 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx 
> *sctx,
>  static void scrub_wr_submit(struct scrub_ctx *sctx);
>  static void scrub_wr_bio_end_io(struct bio *bio);
>  static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
> -static int write_page_nocow(struct scrub_ctx *sctx,
> -                         u64 physical_for_dev_replace, struct page *page);
> -static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
> -                                   struct scrub_copy_nocow_ctx *ctx);
> -static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
> -                         int mirror_num, u64 physical_for_dev_replace);
> -static void copy_nocow_pages_worker(struct btrfs_work *work);
>  static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
>  static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
>  static void scrub_put_ctx(struct scrub_ctx *sctx);
> @@ -2799,17 +2792,10 @@ static int scrub_extent(struct scrub_ctx *sctx, 
> struct map_lookup *map,
>                       have_csum = scrub_find_csum(sctx, logical, csum);
>                       if (have_csum == 0)
>                               ++sctx->stat.no_csum;
> -                     if (sctx->is_dev_replace && !have_csum) {
> -                             ret = copy_nocow_pages(sctx, logical, l,
> -                                                    mirror_num,
> -                                                   physical_for_dev_replace);
> -                             goto behind_scrub_pages;
> -                     }
>               }
>               ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
>                                 mirror_num, have_csum ? csum : NULL, 0,
>                                 physical_for_dev_replace);
> -behind_scrub_pages:
>               if (ret)
>                       return ret;
>               len -= l;
> @@ -4357,330 +4343,3 @@ static void scrub_remap_extent(struct btrfs_fs_info 
> *fs_info,
>       *extent_dev = bbio->stripes[0].dev;
>       btrfs_put_bbio(bbio);
>  }
> -
> -static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
> -                         int mirror_num, u64 physical_for_dev_replace)
> -{
> -     struct scrub_copy_nocow_ctx *nocow_ctx;
> -     struct btrfs_fs_info *fs_info = sctx->fs_info;
> -
> -     nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
> -     if (!nocow_ctx) {
> -             spin_lock(&sctx->stat_lock);
> -             sctx->stat.malloc_errors++;
> -             spin_unlock(&sctx->stat_lock);
> -             return -ENOMEM;
> -     }
> -
> -     scrub_pending_trans_workers_inc(sctx);
> -
> -     nocow_ctx->sctx = sctx;
> -     nocow_ctx->logical = logical;
> -     nocow_ctx->len = len;
> -     nocow_ctx->mirror_num = mirror_num;
> -     nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
> -     btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
> -                     copy_nocow_pages_worker, NULL, NULL);
> -     INIT_LIST_HEAD(&nocow_ctx->inodes);
> -     btrfs_queue_work(fs_info->scrub_nocow_workers,
> -                      &nocow_ctx->work);
> -
> -     return 0;
> -}
> -
> -static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
> -{
> -     struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
> -     struct scrub_nocow_inode *nocow_inode;
> -
> -     nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
> -     if (!nocow_inode)
> -             return -ENOMEM;
> -     nocow_inode->inum = inum;
> -     nocow_inode->offset = offset;
> -     nocow_inode->root = root;
> -     list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
> -     return 0;
> -}
> -
> -#define COPY_COMPLETE 1
> -
> -static void copy_nocow_pages_worker(struct btrfs_work *work)
> -{
> -     struct scrub_copy_nocow_ctx *nocow_ctx =
> -             container_of(work, struct scrub_copy_nocow_ctx, work);
> -     struct scrub_ctx *sctx = nocow_ctx->sctx;
> -     struct btrfs_fs_info *fs_info = sctx->fs_info;
> -     struct btrfs_root *root = fs_info->extent_root;
> -     u64 logical = nocow_ctx->logical;
> -     u64 len = nocow_ctx->len;
> -     int mirror_num = nocow_ctx->mirror_num;
> -     u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
> -     int ret;
> -     struct btrfs_trans_handle *trans = NULL;
> -     struct btrfs_path *path;
> -     int not_written = 0;
> -
> -     path = btrfs_alloc_path();
> -     if (!path) {
> -             spin_lock(&sctx->stat_lock);
> -             sctx->stat.malloc_errors++;
> -             spin_unlock(&sctx->stat_lock);
> -             not_written = 1;
> -             goto out;
> -     }
> -
> -     trans = btrfs_join_transaction(root);
> -     if (IS_ERR(trans)) {
> -             not_written = 1;
> -             goto out;
> -     }
> -
> -     ret = iterate_inodes_from_logical(logical, fs_info, path,
> -                     record_inode_for_nocow, nocow_ctx, false);
> -     if (ret != 0 && ret != -ENOENT) {
> -             btrfs_warn(fs_info,
> -                        "iterate_inodes_from_logical() failed: log %llu, 
> phys %llu, len %llu, mir %u, ret %d",
> -                        logical, physical_for_dev_replace, len, mirror_num,
> -                        ret);
> -             not_written = 1;
> -             goto out;
> -     }
> -
> -     btrfs_end_transaction(trans);
> -     trans = NULL;
> -     while (!list_empty(&nocow_ctx->inodes)) {
> -             struct scrub_nocow_inode *entry;
> -             entry = list_first_entry(&nocow_ctx->inodes,
> -                                      struct scrub_nocow_inode,
> -                                      list);
> -             list_del_init(&entry->list);
> -             ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
> -                                              entry->root, nocow_ctx);
> -             kfree(entry);
> -             if (ret == COPY_COMPLETE) {
> -                     ret = 0;
> -                     break;
> -             } else if (ret) {
> -                     break;
> -             }
> -     }
> -out:
> -     while (!list_empty(&nocow_ctx->inodes)) {
> -             struct scrub_nocow_inode *entry;
> -             entry = list_first_entry(&nocow_ctx->inodes,
> -                                      struct scrub_nocow_inode,
> -                                      list);
> -             list_del_init(&entry->list);
> -             kfree(entry);
> -     }
> -     if (trans && !IS_ERR(trans))
> -             btrfs_end_transaction(trans);
> -     if (not_written)
> -             btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
> -                                         num_uncorrectable_read_errors);
> -
> -     btrfs_free_path(path);
> -     kfree(nocow_ctx);
> -
> -     scrub_pending_trans_workers_dec(sctx);
> -}
> -
> -static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 
> len,
> -                              u64 logical)
> -{
> -     struct extent_state *cached_state = NULL;
> -     struct btrfs_ordered_extent *ordered;
> -     struct extent_io_tree *io_tree;
> -     struct extent_map *em;
> -     u64 lockstart = start, lockend = start + len - 1;
> -     int ret = 0;
> -
> -     io_tree = &inode->io_tree;
> -
> -     lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
> -     ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
> -     if (ordered) {
> -             btrfs_put_ordered_extent(ordered);
> -             ret = 1;
> -             goto out_unlock;
> -     }
> -
> -     em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
> -     if (IS_ERR(em)) {
> -             ret = PTR_ERR(em);
> -             goto out_unlock;
> -     }
> -
> -     /*
> -      * This extent does not actually cover the logical extent anymore,
> -      * move on to the next inode.
> -      */
> -     if (em->block_start > logical ||
> -         em->block_start + em->block_len < logical + len ||
> -         test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> -             free_extent_map(em);
> -             ret = 1;
> -             goto out_unlock;
> -     }
> -     free_extent_map(em);
> -
> -out_unlock:
> -     unlock_extent_cached(io_tree, lockstart, lockend, &cached_state);
> -     return ret;
> -}
> -
> -static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
> -                                   struct scrub_copy_nocow_ctx *nocow_ctx)
> -{
> -     struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info;
> -     struct btrfs_key key;
> -     struct inode *inode;
> -     struct page *page;
> -     struct btrfs_root *local_root;
> -     struct extent_io_tree *io_tree;
> -     u64 physical_for_dev_replace;
> -     u64 nocow_ctx_logical;
> -     u64 len = nocow_ctx->len;
> -     unsigned long index;
> -     int srcu_index;
> -     int ret = 0;
> -     int err = 0;
> -
> -     key.objectid = root;
> -     key.type = BTRFS_ROOT_ITEM_KEY;
> -     key.offset = (u64)-1;
> -
> -     srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
> -
> -     local_root = btrfs_read_fs_root_no_name(fs_info, &key);
> -     if (IS_ERR(local_root)) {
> -             srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
> -             return PTR_ERR(local_root);
> -     }
> -
> -     key.type = BTRFS_INODE_ITEM_KEY;
> -     key.objectid = inum;
> -     key.offset = 0;
> -     inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
> -     srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
> -     if (IS_ERR(inode))
> -             return PTR_ERR(inode);
> -
> -     /* Avoid truncate/dio/punch hole.. */
> -     inode_lock(inode);
> -     inode_dio_wait(inode);
> -
> -     physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
> -     io_tree = &BTRFS_I(inode)->io_tree;
> -     nocow_ctx_logical = nocow_ctx->logical;
> -
> -     ret = check_extent_to_block(BTRFS_I(inode), offset, len,
> -                     nocow_ctx_logical);
> -     if (ret) {
> -             ret = ret > 0 ? 0 : ret;
> -             goto out;
> -     }
> -
> -     while (len >= PAGE_SIZE) {
> -             index = offset >> PAGE_SHIFT;
> -again:
> -             page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
> -             if (!page) {
> -                     btrfs_err(fs_info, "find_or_create_page() failed");
> -                     ret = -ENOMEM;
> -                     goto out;
> -             }
> -
> -             if (PageUptodate(page)) {
> -                     if (PageDirty(page))
> -                             goto next_page;
> -             } else {
> -                     ClearPageError(page);
> -                     err = extent_read_full_page(io_tree, page,
> -                                                        btrfs_get_extent,
> -                                                        
> nocow_ctx->mirror_num);
> -                     if (err) {
> -                             ret = err;
> -                             goto next_page;
> -                     }
> -
> -                     lock_page(page);
> -                     /*
> -                      * If the page has been remove from the page cache,
> -                      * the data on it is meaningless, because it may be
> -                      * old one, the new data may be written into the new
> -                      * page in the page cache.
> -                      */
> -                     if (page->mapping != inode->i_mapping) {
> -                             unlock_page(page);
> -                             put_page(page);
> -                             goto again;
> -                     }
> -                     if (!PageUptodate(page)) {
> -                             ret = -EIO;
> -                             goto next_page;
> -                     }
> -             }
> -
> -             ret = check_extent_to_block(BTRFS_I(inode), offset, len,
> -                                         nocow_ctx_logical);
> -             if (ret) {
> -                     ret = ret > 0 ? 0 : ret;
> -                     goto next_page;
> -             }
> -
> -             err = write_page_nocow(nocow_ctx->sctx,
> -                                    physical_for_dev_replace, page);
> -             if (err)
> -                     ret = err;
> -next_page:
> -             unlock_page(page);
> -             put_page(page);
> -
> -             if (ret)
> -                     break;
> -
> -             offset += PAGE_SIZE;
> -             physical_for_dev_replace += PAGE_SIZE;
> -             nocow_ctx_logical += PAGE_SIZE;
> -             len -= PAGE_SIZE;
> -     }
> -     ret = COPY_COMPLETE;
> -out:
> -     inode_unlock(inode);
> -     iput(inode);
> -     return ret;
> -}
> -
> -static int write_page_nocow(struct scrub_ctx *sctx,
> -                         u64 physical_for_dev_replace, struct page *page)
> -{
> -     struct bio *bio;
> -     struct btrfs_device *dev;
> -
> -     dev = sctx->wr_tgtdev;
> -     if (!dev)
> -             return -EIO;
> -     if (!dev->bdev) {
> -             btrfs_warn_rl(dev->fs_info,
> -                     "scrub write_page_nocow(bdev == NULL) is unexpected");
> -             return -EIO;
> -     }
> -     bio = btrfs_io_bio_alloc(1);
> -     bio->bi_iter.bi_size = 0;
> -     bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
> -     bio_set_dev(bio, dev->bdev);
> -     bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
> -     /* bio_add_page won't fail on a freshly allocated bio */
> -     bio_add_page(bio, page, PAGE_SIZE, 0);
> -
> -     if (btrfsic_submit_bio_wait(bio)) {
> -             bio_put(bio);
> -             btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
> -             return -EIO;
> -     }
> -
> -     bio_put(bio);
> -     return 0;
> -}
>

signature.asc
Description: OpenPGP digital signature

Re: [PATCH RFC ver.B] btrfs: scrub: Don't use inode pages for device replace

Reply via email to