[PATCH RFC] btrfs: fix read corrpution from disks of different generation

Anand Jain Tue, 19 Mar 2019 04:35:36 -0700

   In case of file regular extent (non inline), the metadata and data are 
 read from two different IO operations. When we read the metadata using 
 the btree each extent block gets verified with the expected  transid as 
 per its parent. So suppose if any of the block is stale it gets reported 
 and corrected by reading from the mirror (consider raid1).
 
   This also means the data extent is not yet read or verified and this 
 happens when application tries to read the file.
 When application tries to read the file btrfs_readpage(), it shall first 
 read file extent map as provided by the metadata in the above operation 
 (which is assured to be consistent).
 And when file data has to be read using the above filemap as it again 
 has the choice of using either of the mirrors, suppose if the IO is 
 routed to the disk on which we haven't verified the metadata on it and 
 if there is write hole on that disk, then we shall be reading the junk 
 data without being reported anywhere.


   This is reproducible with the test case [1] in this email thread.
     [1]
     fstests: btrfs test read from disks of different generation

   As we have data csum most of these get caught and reported as csum 
 error. But csum verification is a point in verification and its not a 
 tree based transid verification. Which means if there is a stale data 
 with matching csum we may return a junk data silently. This problem is 
 easily reproducible when csum is disabled but not impossible to achieve 
 when csum is not disabled as well. A tree based integrity verification 
 is important for all data, which is missing.
 
 Fix:
   In this RFC patch it proposes to use same disk from with the metadata 
 is read to read the data. Which means the feature such as readmirror is 
 less effective (which is fine). But if the good data fails (media error) 
 as usual we shall read the mirrored data without being aware that this 
 data may be stale. To fix that, we have to invalidate the corresponding 
 metadata and re-read the metadata from other disk (this part is not 
 implemented in this patch).
 
    The other better solution is - a tree based verification, which means 
 we have to verify the data-extent's metadata in the extent tree when its 
 read - but there is a problem we don't update the generation number in 
 the extent tree for all types of write, for example consider overwrite 
 with notrunc option on a filesystem mounted with nodatacow option [1], 
 which I wonder if its a bug or if its like that for a reason? as because 
 there is no cow?
 
 [1] The EXTENT_DATA generation is same before and after dd with notrunc
 
   mkfs.btrfs -fq /dev/sdb && mount -o notreelog,nodatacow /dev/sdb /btrfs
   dd if=/dev/urandom of=/btrfs/tf1 bs=4096 count=1 conv=fsync,notrunc && 
 sync
   btrfs in dump-tree /dev/sdb | grep -A4 "257 EXTENT_DATA 0"
   btrfs in dump-tree /dev/sdb | egrep -A7 "257 INODE_ITEM 0\) itemoff"
   dd if=/dev/urandom of=/btrfs/tf1 bs=4096 count=1 conv=fsync,notrunc && 
 sync
   btrfs in dump-tree /dev/sdb | grep -A4 "257 EXTENT_DATA 0"
   btrfs in dump-tree /dev/sdb | egrep -A7 "257 INODE_ITEM 0\) itemoff"
 
 ---------
      item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
          generation 6 type 1 (regular)
          extent data disk byte 13631488 nr 4096
          extent data offset 0 nr 4096 ram 4096
          extent compression 0 (none)
      item 4 key (257 INODE_ITEM 0) itemoff 15885 itemsize 160
          generation 6 transid 6 size 4096 nbytes 4096
          block group 0 mode 100644 links 1 uid 0 gid 0 rdev 0
          sequence 66785 flags 0x3(NODATASUM|NODATACOW)
          atime 1552993569.276079763 (2019-03-19 19:06:09)
          ctime 1552993569.276079763 (2019-03-19 19:06:09)
          mtime 1552993569.276079763 (2019-03-19 19:06:09)
          otime 1552993569.276079763 (2019-03-19 19:06:09)
 1+0 records in
 1+0 records out
 4096 bytes (4.1 kB) copied, 0.00302717 s, 1.4 MB/s
      item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
          generation 6 type 1 (regular)
          extent data disk byte 13631488 nr 4096
          extent data offset 0 nr 4096 ram 4096
          extent compression 0 (none)
      item 4 key (257 INODE_ITEM 0) itemoff 15885 itemsize 160
          generation 6 transid 7 size 4096 nbytes 4096
          block group 0 mode 100644 links 1 uid 0 gid 0 rdev 0
          sequence 66786 flags 0x3(NODATASUM|NODATACOW)
          atime 1552993569.276079763 (2019-03-19 19:06:09)
          ctime 1552993569.302079763 (2019-03-19 19:06:09)
          mtime 1552993569.302079763 (2019-03-19 19:06:09)
          otime 1552993569.276079763 (2019-03-19 19:06:09)
 -------
 
   Comments appreciated.


Signed-off-by: Anand Jain <anand.j...@oracle.com>
---
 fs/btrfs/extent_io.c | 41 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 31892052a24f..f1cc21e8dff7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3079,7 +3079,7 @@ static inline void __do_contiguous_readpages(struct 
extent_io_tree *tree,
                                             struct extent_map **em_cached,
                                             struct bio **bio,
                                             unsigned long *bio_flags,
-                                            u64 *prev_em_start)
+                                            u64 *prev_em_start, int mirror)
 {
        struct inode *inode;
        struct btrfs_ordered_extent *ordered;
@@ -3099,7 +3099,7 @@ static inline void __do_contiguous_readpages(struct 
extent_io_tree *tree,
 
        for (index = 0; index < nr_pages; index++) {
                __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
-                               bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
+                               bio, mirror, bio_flags, REQ_RAHEAD, 
prev_em_start);
                put_page(pages[index]);
        }
 }
@@ -3109,7 +3109,7 @@ static void __extent_readpages(struct extent_io_tree 
*tree,
                               int nr_pages,
                               struct extent_map **em_cached,
                               struct bio **bio, unsigned long *bio_flags,
-                              u64 *prev_em_start)
+                              u64 *prev_em_start, int mirror)
 {
        u64 start = 0;
        u64 end = 0;
@@ -3130,7 +3130,7 @@ static void __extent_readpages(struct extent_io_tree 
*tree,
                                                  index - first_index, start,
                                                  end, em_cached,
                                                  bio, bio_flags,
-                                                 prev_em_start);
+                                                 prev_em_start, mirror);
                        start = page_start;
                        end = start + PAGE_SIZE - 1;
                        first_index = index;
@@ -3141,7 +3141,7 @@ static void __extent_readpages(struct extent_io_tree 
*tree,
                __do_contiguous_readpages(tree, &pages[first_index],
                                          index - first_index, start,
                                          end, em_cached, bio,
-                                         bio_flags, prev_em_start);
+                                         bio_flags, prev_em_start, mirror);
 }
 
 static int __extent_read_full_page(struct extent_io_tree *tree,
@@ -4093,6 +4093,27 @@ int extent_writepages(struct address_space *mapping,
        return ret;
 }
 
+static int btrfs_get_read_mirror(struct inode *inode)
+{
+       int ret;
+       struct btrfs_path *path;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, path,
+                                      btrfs_ino(BTRFS_I(inode)), 0, 0);
+       if (!ret) {
+               struct extent_buffer *eb = path->nodes[0];
+
+               ret = eb->read_mirror;
+       }
+
+       btrfs_free_path(path);
+       return ret;
+}
+
 int extent_readpages(struct address_space *mapping, struct list_head *pages,
                     unsigned nr_pages)
 {
@@ -4103,6 +4124,11 @@ int extent_readpages(struct address_space *mapping, 
struct list_head *pages,
        struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
        int nr = 0;
        u64 prev_em_start = (u64)-1;
+       int mirror;
+
+       mirror = btrfs_get_read_mirror(mapping->host);
+       if (mirror < 0)
+               return mirror;
 
        while (!list_empty(pages)) {
                for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
@@ -4120,14 +4146,15 @@ int extent_readpages(struct address_space *mapping, 
struct list_head *pages,
                }
 
                __extent_readpages(tree, pagepool, nr, &em_cached, &bio,
-                                  &bio_flags, &prev_em_start);
+                                  &bio_flags, &prev_em_start, mirror);
+
        }
 
        if (em_cached)
                free_extent_map(em_cached);
 
        if (bio)
-               return submit_one_bio(bio, 0, bio_flags);
+               return submit_one_bio(bio, mirror, bio_flags);
        return 0;
 }
 
-- 
1.8.3.1

[PATCH RFC] btrfs: fix read corrpution from disks of different generation

Reply via email to