Hi,

This sounds to me like we are doing things in the wrong order. We
shouldn't need to undo things that have been done, otherwise we'll just
land up in a tangle,

Steve.

On Tue, 2013-05-14 at 11:21 -0500, Bob Peterson wrote:
> In pass1, it traverses the metadata free, processing each dinode and
> marking which blocks are used by that dinode. If a dinode is found
> to have unrecoverable errors, it does a bunch of work to "undo" the
> things it did. This is especially important for the processing of
> duplicate block references. Suppose dinode X references block 1234.
> Later in pass1, suppose a different dinode, Y, also references block
> 1234. This is flagged as a duplicate block reference. Still later,
> suppose pass1 determines dinode Y is bad. Now it has to undo the
> work it did. It needs to properly unmark the data and metadata
> blocks it marked as no longer "free" so that valid references that
> follow aren't flagged as duplicate references. At the same time,
> it needs to unflag block 1234 as a duplicate reference as well, so
> that dinode X's original reference is still considered valid.
> 
> Before this patch, fsck.gfs2 was trying to traverse the entire
> metadata tree for the bad dinode, trying to "undo" the designations.
> That becomes a huge problem if the damage was discovered in the
> middle of the metadata, in which case it may never have flagged any
> of the data blocks as "in use as data" in its blockmap. The result
> of "undoing" the designations sometimes resulted in blocks improperly
> being marked as "free" when they were, in fact, referenced by other
> valid dinodes.
> 
> For example, suppose corrupt dinode Y references metadata blocks
> 1234, 1235, and 1236. Now suppose a serious problem is found as part
> of its processing of block 1234, and so it stopped its metadata tree
> traversal there. Metadata blocks 1235 and 1236 are still listed as
> metadata for the bad dinode, but if we traverse the entire tree,
> those two blocks may be improperly processed. If another dinode
> actually uses blocks 1235 or 1236, the improper "undo" processing
> of those two blocks can screw up the valid references.
> 
> This patch reworks the "undo" functions so that the "undo" functions
> don't get called on the entire metadata and data of the defective
> dinode. Instead, only the metadata and data blocks queued onto the
> metadata list are processed. This should ensure that the "undo"
> functions only operate on blocks that were processed in the first
> place.
> 
> rhbz#902920
> ---
>  gfs2/fsck/metawalk.c | 109 ++++++++++++++++++++++----------
>  gfs2/fsck/metawalk.h |   4 ++
>  gfs2/fsck/pass1.c    | 172 
> ++++++++++++++++-----------------------------------
>  3 files changed, 135 insertions(+), 150 deletions(-)
> 
> diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c
> index d1b12f1..b9d9f89 100644
> --- a/gfs2/fsck/metawalk.c
> +++ b/gfs2/fsck/metawalk.c
> @@ -1259,7 +1259,7 @@ static int build_and_check_metalist(struct gfs2_inode 
> *ip, osi_list_t *mlp,
>                               if (err < 0) {
>                                       stack;
>                                       error = err;
> -                                     goto fail;
> +                                     return error;
>                               }
>                               if (err > 0) {
>                                       if (!error)
> @@ -1278,14 +1278,11 @@ static int build_and_check_metalist(struct gfs2_inode 
> *ip, osi_list_t *mlp,
>                               }
>                               if (!nbh)
>                                       nbh = bread(ip->i_sbd, block);
> -                             osi_list_add(&nbh->b_altlist, cur_list);
> +                             osi_list_add_prev(&nbh->b_altlist, cur_list);
>                       } /* for all data on the indirect block */
>               } /* for blocks at that height */
>       } /* for height */
> -     return error;
> -fail:
> -     free_metalist(ip, mlp);
> -     return error;
> +     return 0;
>  }
>  
>  /**
> @@ -1331,6 +1328,27 @@ static int check_data(struct gfs2_inode *ip, struct 
> metawalk_fxns *pass,
>       return error;
>  }
>  
> +static int undo_check_data(struct gfs2_inode *ip, struct metawalk_fxns *pass,
> +                        uint64_t *ptr_start, char *ptr_end)
> +{
> +     int rc = 0;
> +     uint64_t block, *ptr;
> +
> +     /* If there isn't much pointer corruption check the pointers */
> +     for (ptr = ptr_start ; (char *)ptr < ptr_end && !fsck_abort; ptr++) {
> +             if (!*ptr)
> +                     continue;
> +
> +             if (skip_this_pass || fsck_abort)
> +                     return 1;
> +             block =  be64_to_cpu(*ptr);
> +             rc = pass->undo_check_data(ip, block, pass->private);
> +             if (rc < 0)
> +                     return rc;
> +     }
> +     return 0;
> +}
> +
>  static int hdr_size(struct gfs2_buffer_head *bh, int height)
>  {
>       if (height > 1) {
> @@ -1363,6 +1381,7 @@ int check_metatree(struct gfs2_inode *ip, struct 
> metawalk_fxns *pass)
>       int  i, head_size;
>       uint64_t blks_checked = 0;
>       int error, rc;
> +     int metadata_clean = 0;
>  
>       if (!height && !is_dir(&ip->i_di, ip->i_sbd->gfs1))
>               return 0;
> @@ -1374,35 +1393,21 @@ int check_metatree(struct gfs2_inode *ip, struct 
> metawalk_fxns *pass)
>       error = build_and_check_metalist(ip, &metalist[0], pass);
>       if (error) {
>               stack;
> -             free_metalist(ip, &metalist[0]);
> -             return error;
> +             goto undo_metalist;
>       }
>  
> +     metadata_clean = 1;
>       /* For directories, we've already checked the "data" blocks which
>        * comprise the directory hash table, so we perform the directory
>        * checks and exit. */
>          if (is_dir(&ip->i_di, ip->i_sbd->gfs1)) {
> -             free_metalist(ip, &metalist[0]);
>               if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH))
> -                     return 0;
> +                     goto out;
>               /* check validity of leaf blocks and leaf chains */
>               error = check_leaf_blks(ip, pass);
> -             return error;
> -     }
> -
> -     /* Free the metalist buffers from heights we don't need to check.
> -        For the rest we'll free as we check them to save time.
> -        metalist[0] will only have the dinode bh, so we can skip it. */
> -     for (i = 1; i < height - 1; i++) {
> -             list = &metalist[i];
> -             while (!osi_list_empty(list)) {
> -                     bh = osi_list_entry(list->next,
> -                                         struct gfs2_buffer_head, b_altlist);
> -                     if (bh == ip->i_bh)
> -                             osi_list_del(&bh->b_altlist);
> -                     else
> -                             brelse(bh);
> -             }
> +             if (error)
> +                     goto undo_metalist;
> +             goto out;
>       }
>  
>       /* check data blocks */
> @@ -1435,14 +1440,12 @@ int check_metatree(struct gfs2_inode *ip, struct 
> metawalk_fxns *pass)
>               else
>                       rc = 0;
>  
> -             if (rc && (!error || rc < 0))
> +             if (rc && (!error || rc < 0)) {
>                       error = rc;
> +                     break;
> +             }
>               if (pass->big_file_msg && ip->i_di.di_blocks > COMFORTABLE_BLKS)
>                       pass->big_file_msg(ip, blks_checked);
> -             if (bh == ip->i_bh)
> -                     osi_list_del(&bh->b_altlist);
> -             else
> -                     brelse(bh);
>       }
>       if (pass->big_file_msg && ip->i_di.di_blocks > COMFORTABLE_BLKS) {
>               log_notice( _("\rLarge file at %lld (0x%llx) - 100 percent "
> @@ -1452,6 +1455,50 @@ int check_metatree(struct gfs2_inode *ip, struct 
> metawalk_fxns *pass)
>                           (unsigned long long)ip->i_di.di_num.no_addr);
>               fflush(stdout);
>       }
> +undo_metalist:
> +     if (!error)
> +             goto out;
> +     log_err( _("Error: inode %llu (0x%llx) had unrecoverable errors.\n"),
> +              (unsigned long long)ip->i_di.di_num.no_addr,
> +              (unsigned long long)ip->i_di.di_num.no_addr);
> +     if (!query( _("Remove the invalid inode? (y/n) "))) {
> +             free_metalist(ip, &metalist[0]);
> +             log_err(_("Invalid inode not deleted.\n"));
> +             return error;
> +     }
> +     for (i = 0; pass->undo_check_meta && i < height; i++) {
> +             while (!osi_list_empty(&metalist[i])) {
> +                     list = &metalist[i];
> +                     bh = osi_list_entry(list->next,
> +                                         struct gfs2_buffer_head,
> +                                         b_altlist);
> +                     log_err(_("Undoing metadata work for block %llu "
> +                               "(0x%llx)\n"),
> +                             (unsigned long long)bh->b_blocknr,
> +                             (unsigned long long)bh->b_blocknr);
> +                     if (i)
> +                             rc = pass->undo_check_meta(ip, bh->b_blocknr,
> +                                                        i, pass->private);
> +                     else
> +                             rc = 0;
> +                     if (metadata_clean && rc == 0 && i == height - 1) {
> +                             head_size = hdr_size(bh, height);
> +                             if (head_size)
> +                                     undo_check_data(ip, pass, (uint64_t *)
> +                                           (bh->b_data + head_size),
> +                                           (bh->b_data + ip->i_sbd->bsize));
> +                     }
> +                     if (bh == ip->i_bh)
> +                             osi_list_del(&bh->b_altlist);
> +                     else
> +                             brelse(bh);
> +             }
> +     }
> +     /* Set the dinode as "bad" so it gets deleted */
> +     fsck_blockmap_set(ip, ip->i_di.di_num.no_addr,
> +                       _("corrupt"), gfs2_block_free);
> +     log_err(_("The corrupt inode was invalidated.\n"));
> +out:
>       free_metalist(ip, &metalist[0]);
>       return error;
>  }
> diff --git a/gfs2/fsck/metawalk.h b/gfs2/fsck/metawalk.h
> index 486c6eb..f5e71e1 100644
> --- a/gfs2/fsck/metawalk.h
> +++ b/gfs2/fsck/metawalk.h
> @@ -108,6 +108,10 @@ struct metawalk_fxns {
>       int (*repair_leaf) (struct gfs2_inode *ip, uint64_t *leaf_no,
>                           int lindex, int ref_count, const char *msg,
>                           void *private);
> +     int (*undo_check_meta) (struct gfs2_inode *ip, uint64_t block,
> +                             int h, void *private);
> +     int (*undo_check_data) (struct gfs2_inode *ip, uint64_t block,
> +                             void *private);
>  };
>  
>  #endif /* _METAWALK_H */
> diff --git a/gfs2/fsck/pass1.c b/gfs2/fsck/pass1.c
> index 04e5289..a88895f 100644
> --- a/gfs2/fsck/pass1.c
> +++ b/gfs2/fsck/pass1.c
> @@ -39,8 +39,7 @@ static int check_leaf(struct gfs2_inode *ip, uint64_t 
> block, void *private);
>  static int check_metalist(struct gfs2_inode *ip, uint64_t block,
>                         struct gfs2_buffer_head **bh, int h, void *private);
>  static int undo_check_metalist(struct gfs2_inode *ip, uint64_t block,
> -                            struct gfs2_buffer_head **bh, int h,
> -                            void *private);
> +                            int h, void *private);
>  static int check_data(struct gfs2_inode *ip, uint64_t block, void *private);
>  static int undo_check_data(struct gfs2_inode *ip, uint64_t block,
>                          void *private);
> @@ -104,12 +103,8 @@ struct metawalk_fxns pass1_fxns = {
>       .finish_eattr_indir = finish_eattr_indir,
>       .big_file_msg = big_file_comfort,
>       .repair_leaf = pass1_repair_leaf,
> -};
> -
> -struct metawalk_fxns undo_fxns = {
> -     .private = NULL,
> -     .check_metalist = undo_check_metalist,
> -     .check_data = undo_check_data,
> +     .undo_check_meta = undo_check_metalist,
> +     .undo_check_data = undo_check_data,
>  };
>  
>  struct metawalk_fxns invalidate_fxns = {
> @@ -326,53 +321,67 @@ static int check_metalist(struct gfs2_inode *ip, 
> uint64_t block,
>       return 0;
>  }
>  
> -static int undo_check_metalist(struct gfs2_inode *ip, uint64_t block,
> -                            struct gfs2_buffer_head **bh, int h,
> -                            void *private)
> +/* undo_reference - undo previously processed data or metadata
> + * We've treated the metadata for this dinode as good so far, but not we
> + * realize it's bad. So we need to undo what we've done.
> + *
> + * Returns: 0 - We need to process the block as metadata. In other words,
> + *              we need to undo any blocks it refers to.
> + *          1 - We can't process the block as metadata.
> + */
> +
> +static int undo_reference(struct gfs2_inode *ip, uint64_t block, int meta,
> +                       void *private)
>  {
> -     int found_dup = 0, iblk_type;
> -     struct gfs2_buffer_head *nbh;
>       struct block_count *bc = (struct block_count *)private;
> -
> -     *bh = NULL;
> +     struct duptree *dt;
> +     struct inode_with_dups *id;
>  
>       if (!valid_block(ip->i_sbd, block)) { /* blk outside of FS */
>               fsck_blockmap_set(ip, ip->i_di.di_num.no_addr,
>                                 _("bad block referencing"), gfs2_block_free);
>               return 1;
>       }
> -     if (is_dir(&ip->i_di, ip->i_sbd->gfs1) && h == ip->i_di.di_height)
> -             iblk_type = GFS2_METATYPE_JD;
> -     else
> -             iblk_type = GFS2_METATYPE_IN;
>  
> -     found_dup = find_remove_dup(ip, block, _("Metadata"));
> -     nbh = bread(ip->i_sbd, block);
> +     if (meta)
> +             bc->indir_count--;
> +     dt = dupfind(block);
> +     if (dt) {
> +             /* remove all duplicate reference structures from this inode */
> +             do {
> +                     id = find_dup_ref_inode(dt, ip);
> +                     if (!id)
> +                             break;
>  
> -     if (gfs2_check_meta(nbh, iblk_type)) {
> -             if (!found_dup) {
> -                     fsck_blockmap_set(ip, block, _("bad indirect"),
> -                                       gfs2_block_free);
> -                     brelse(nbh);
> +                     dup_listent_delete(id);
> +             } while (id);
> +
> +             if (dt->refs) {
> +                     log_err(_("Block %llu (0x%llx) is still referenced "
> +                               "from another inode; not freeing.\n"),
> +                             (unsigned long long)block,
> +                             (unsigned long long)block);
>                       return 1;
>               }
> -             brelse(nbh);
> -             nbh = NULL;
> -     } else /* blk check ok */
> -             *bh = nbh;
> -
> -     bc->indir_count--;
> -     if (found_dup) {
> -             if (nbh)
> -                     brelse(nbh);
> -             *bh = NULL;
> -             return 1; /* don't process the metadata again */
> -     } else
> -             fsck_blockmap_set(ip, block, _("bad indirect"),
> -                               gfs2_block_free);
> +     }
> +     fsck_blockmap_set(ip, block,
> +                       meta ? _("bad indirect") : _("referenced data"),
> +                       gfs2_block_free);
>       return 0;
>  }
>  
> +static int undo_check_metalist(struct gfs2_inode *ip, uint64_t block,
> +                            int h, void *private)
> +{
> +     return undo_reference(ip, block, 1, private);
> +}
> +
> +static int undo_check_data(struct gfs2_inode *ip, uint64_t block,
> +                        void *private)
> +{
> +     return undo_reference(ip, block, 0, private);
> +}
> +
>  static int check_data(struct gfs2_inode *ip, uint64_t block, void *private)
>  {
>       uint8_t q;
> @@ -438,71 +447,9 @@ static int check_data(struct gfs2_inode *ip, uint64_t 
> block, void *private)
>       return 0;
>  }
>  
> -static int undo_check_data(struct gfs2_inode *ip, uint64_t block,
> -                        void *private)
> -{
> -     struct block_count *bc = (struct block_count *) private;
> -
> -     if (!valid_block(ip->i_sbd, block)) {
> -             /* Mark the owner of this block with the bad_block
> -              * designator so we know to check it for out of range
> -              * blocks later */
> -             fsck_blockmap_set(ip, ip->i_di.di_num.no_addr,
> -                               _("bad (invalid or out of range) data"),
> -                               gfs2_block_free);
> -             return 1;
> -     }
> -     bc->data_count--;
> -     return free_block_if_notdup(ip, block, _("data"));
> -}
> -
>  static int remove_inode_eattr(struct gfs2_inode *ip, struct block_count *bc)
>  {
> -     struct duptree *dt;
> -     struct inode_with_dups *id;
> -     osi_list_t *ref;
> -     int moved = 0;
> -
> -     /* If it's a duplicate reference to the block, we need to check
> -        if the reference is on the valid or invalid inodes list.
> -        If it's on the valid inode's list, move it to the invalid
> -        inodes list.  The reason is simple: This inode, although
> -        valid, has an now-invalid reference, so we should not give
> -        this reference preferential treatment over others. */
> -     dt = dupfind(ip->i_di.di_eattr);
> -     if (dt) {
> -             osi_list_foreach(ref, &dt->ref_inode_list) {
> -                     id = osi_list_entry(ref, struct inode_with_dups, list);
> -                     if (id->block_no == ip->i_di.di_num.no_addr) {
> -                             log_debug( _("Moving inode %lld (0x%llx)'s "
> -                                          "duplicate reference to %lld "
> -                                          "(0x%llx) from the valid to the "
> -                                          "invalid reference list.\n"),
> -                                        (unsigned long long)
> -                                        ip->i_di.di_num.no_addr,
> -                                        (unsigned long long)
> -                                        ip->i_di.di_num.no_addr,
> -                                        (unsigned long long)
> -                                        ip->i_di.di_eattr,
> -                                        (unsigned long long)
> -                                        ip->i_di.di_eattr);
> -                             /* Move from the normal to the invalid list */
> -                             osi_list_del(&id->list);
> -                             osi_list_add_prev(&id->list,
> -                                               &dt->ref_invinode_list);
> -                             moved = 1;
> -                             break;
> -                     }
> -             }
> -             if (!moved)
> -                     log_debug( _("Duplicate reference to %lld "
> -                                  "(0x%llx) not moved.\n"),
> -                                (unsigned long long)ip->i_di.di_eattr,
> -                                (unsigned long long)ip->i_di.di_eattr);
> -     } else {
> -             delete_block(ip, ip->i_di.di_eattr, NULL,
> -                          "extended attribute", NULL);
> -     }
> +     undo_reference(ip, ip->i_di.di_eattr, 0, bc);
>       ip->i_di.di_eattr = 0;
>       bc->ea_count = 0;
>       ip->i_di.di_blocks = 1 + bc->indir_count + bc->data_count;
> @@ -1080,23 +1027,10 @@ static int handle_ip(struct gfs2_sbd *sdp, struct 
> gfs2_inode *ip)
>       if (lf_dip && lf_dip->i_di.di_blocks != lf_blks)
>               reprocess_inode(lf_dip, "lost+found");
>  
> -     if (fsck_abort || error < 0)
> +     /* We there was an error, we return 0 because we want fsck to continue
> +        and analyze the other dinodes as well. */
> +     if (fsck_abort || error != 0)
>               return 0;
> -     if (error > 0) {
> -             log_err( _("Error: inode %llu (0x%llx) has unrecoverable "
> -                        "errors; invalidating.\n"),
> -                      (unsigned long long)ip->i_di.di_num.no_addr,
> -                      (unsigned long long)ip->i_di.di_num.no_addr);
> -             undo_fxns.private = &bc;
> -             check_metatree(ip, &undo_fxns);
> -             /* If we undo the metadata accounting, including metadatas
> -                duplicate block status, we need to make sure later passes
> -                don't try to free up the metadata referenced by this inode.
> -                Therefore we mark the inode as free space. */
> -             fsck_blockmap_set(ip, ip->i_di.di_num.no_addr,
> -                               _("corrupt"), gfs2_block_free);
> -             return 0;
> -     }
>  
>       error = check_inode_eattr(ip, &pass1_fxns);
>  


Reply via email to