Hi, This sounds to me like we are doing things in the wrong order. We shouldn't need to undo things that have been done, otherwise we'll just land up in a tangle,
Steve. On Tue, 2013-05-14 at 11:21 -0500, Bob Peterson wrote: > In pass1, it traverses the metadata free, processing each dinode and > marking which blocks are used by that dinode. If a dinode is found > to have unrecoverable errors, it does a bunch of work to "undo" the > things it did. This is especially important for the processing of > duplicate block references. Suppose dinode X references block 1234. > Later in pass1, suppose a different dinode, Y, also references block > 1234. This is flagged as a duplicate block reference. Still later, > suppose pass1 determines dinode Y is bad. Now it has to undo the > work it did. It needs to properly unmark the data and metadata > blocks it marked as no longer "free" so that valid references that > follow aren't flagged as duplicate references. At the same time, > it needs to unflag block 1234 as a duplicate reference as well, so > that dinode X's original reference is still considered valid. > > Before this patch, fsck.gfs2 was trying to traverse the entire > metadata tree for the bad dinode, trying to "undo" the designations. > That becomes a huge problem if the damage was discovered in the > middle of the metadata, in which case it may never have flagged any > of the data blocks as "in use as data" in its blockmap. The result > of "undoing" the designations sometimes resulted in blocks improperly > being marked as "free" when they were, in fact, referenced by other > valid dinodes. > > For example, suppose corrupt dinode Y references metadata blocks > 1234, 1235, and 1236. Now suppose a serious problem is found as part > of its processing of block 1234, and so it stopped its metadata tree > traversal there. Metadata blocks 1235 and 1236 are still listed as > metadata for the bad dinode, but if we traverse the entire tree, > those two blocks may be improperly processed. If another dinode > actually uses blocks 1235 or 1236, the improper "undo" processing > of those two blocks can screw up the valid references. > > This patch reworks the "undo" functions so that the "undo" functions > don't get called on the entire metadata and data of the defective > dinode. Instead, only the metadata and data blocks queued onto the > metadata list are processed. This should ensure that the "undo" > functions only operate on blocks that were processed in the first > place. > > rhbz#902920 > --- > gfs2/fsck/metawalk.c | 109 ++++++++++++++++++++++---------- > gfs2/fsck/metawalk.h | 4 ++ > gfs2/fsck/pass1.c | 172 > ++++++++++++++++----------------------------------- > 3 files changed, 135 insertions(+), 150 deletions(-) > > diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c > index d1b12f1..b9d9f89 100644 > --- a/gfs2/fsck/metawalk.c > +++ b/gfs2/fsck/metawalk.c > @@ -1259,7 +1259,7 @@ static int build_and_check_metalist(struct gfs2_inode > *ip, osi_list_t *mlp, > if (err < 0) { > stack; > error = err; > - goto fail; > + return error; > } > if (err > 0) { > if (!error) > @@ -1278,14 +1278,11 @@ static int build_and_check_metalist(struct gfs2_inode > *ip, osi_list_t *mlp, > } > if (!nbh) > nbh = bread(ip->i_sbd, block); > - osi_list_add(&nbh->b_altlist, cur_list); > + osi_list_add_prev(&nbh->b_altlist, cur_list); > } /* for all data on the indirect block */ > } /* for blocks at that height */ > } /* for height */ > - return error; > -fail: > - free_metalist(ip, mlp); > - return error; > + return 0; > } > > /** > @@ -1331,6 +1328,27 @@ static int check_data(struct gfs2_inode *ip, struct > metawalk_fxns *pass, > return error; > } > > +static int undo_check_data(struct gfs2_inode *ip, struct metawalk_fxns *pass, > + uint64_t *ptr_start, char *ptr_end) > +{ > + int rc = 0; > + uint64_t block, *ptr; > + > + /* If there isn't much pointer corruption check the pointers */ > + for (ptr = ptr_start ; (char *)ptr < ptr_end && !fsck_abort; ptr++) { > + if (!*ptr) > + continue; > + > + if (skip_this_pass || fsck_abort) > + return 1; > + block = be64_to_cpu(*ptr); > + rc = pass->undo_check_data(ip, block, pass->private); > + if (rc < 0) > + return rc; > + } > + return 0; > +} > + > static int hdr_size(struct gfs2_buffer_head *bh, int height) > { > if (height > 1) { > @@ -1363,6 +1381,7 @@ int check_metatree(struct gfs2_inode *ip, struct > metawalk_fxns *pass) > int i, head_size; > uint64_t blks_checked = 0; > int error, rc; > + int metadata_clean = 0; > > if (!height && !is_dir(&ip->i_di, ip->i_sbd->gfs1)) > return 0; > @@ -1374,35 +1393,21 @@ int check_metatree(struct gfs2_inode *ip, struct > metawalk_fxns *pass) > error = build_and_check_metalist(ip, &metalist[0], pass); > if (error) { > stack; > - free_metalist(ip, &metalist[0]); > - return error; > + goto undo_metalist; > } > > + metadata_clean = 1; > /* For directories, we've already checked the "data" blocks which > * comprise the directory hash table, so we perform the directory > * checks and exit. */ > if (is_dir(&ip->i_di, ip->i_sbd->gfs1)) { > - free_metalist(ip, &metalist[0]); > if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) > - return 0; > + goto out; > /* check validity of leaf blocks and leaf chains */ > error = check_leaf_blks(ip, pass); > - return error; > - } > - > - /* Free the metalist buffers from heights we don't need to check. > - For the rest we'll free as we check them to save time. > - metalist[0] will only have the dinode bh, so we can skip it. */ > - for (i = 1; i < height - 1; i++) { > - list = &metalist[i]; > - while (!osi_list_empty(list)) { > - bh = osi_list_entry(list->next, > - struct gfs2_buffer_head, b_altlist); > - if (bh == ip->i_bh) > - osi_list_del(&bh->b_altlist); > - else > - brelse(bh); > - } > + if (error) > + goto undo_metalist; > + goto out; > } > > /* check data blocks */ > @@ -1435,14 +1440,12 @@ int check_metatree(struct gfs2_inode *ip, struct > metawalk_fxns *pass) > else > rc = 0; > > - if (rc && (!error || rc < 0)) > + if (rc && (!error || rc < 0)) { > error = rc; > + break; > + } > if (pass->big_file_msg && ip->i_di.di_blocks > COMFORTABLE_BLKS) > pass->big_file_msg(ip, blks_checked); > - if (bh == ip->i_bh) > - osi_list_del(&bh->b_altlist); > - else > - brelse(bh); > } > if (pass->big_file_msg && ip->i_di.di_blocks > COMFORTABLE_BLKS) { > log_notice( _("\rLarge file at %lld (0x%llx) - 100 percent " > @@ -1452,6 +1455,50 @@ int check_metatree(struct gfs2_inode *ip, struct > metawalk_fxns *pass) > (unsigned long long)ip->i_di.di_num.no_addr); > fflush(stdout); > } > +undo_metalist: > + if (!error) > + goto out; > + log_err( _("Error: inode %llu (0x%llx) had unrecoverable errors.\n"), > + (unsigned long long)ip->i_di.di_num.no_addr, > + (unsigned long long)ip->i_di.di_num.no_addr); > + if (!query( _("Remove the invalid inode? (y/n) "))) { > + free_metalist(ip, &metalist[0]); > + log_err(_("Invalid inode not deleted.\n")); > + return error; > + } > + for (i = 0; pass->undo_check_meta && i < height; i++) { > + while (!osi_list_empty(&metalist[i])) { > + list = &metalist[i]; > + bh = osi_list_entry(list->next, > + struct gfs2_buffer_head, > + b_altlist); > + log_err(_("Undoing metadata work for block %llu " > + "(0x%llx)\n"), > + (unsigned long long)bh->b_blocknr, > + (unsigned long long)bh->b_blocknr); > + if (i) > + rc = pass->undo_check_meta(ip, bh->b_blocknr, > + i, pass->private); > + else > + rc = 0; > + if (metadata_clean && rc == 0 && i == height - 1) { > + head_size = hdr_size(bh, height); > + if (head_size) > + undo_check_data(ip, pass, (uint64_t *) > + (bh->b_data + head_size), > + (bh->b_data + ip->i_sbd->bsize)); > + } > + if (bh == ip->i_bh) > + osi_list_del(&bh->b_altlist); > + else > + brelse(bh); > + } > + } > + /* Set the dinode as "bad" so it gets deleted */ > + fsck_blockmap_set(ip, ip->i_di.di_num.no_addr, > + _("corrupt"), gfs2_block_free); > + log_err(_("The corrupt inode was invalidated.\n")); > +out: > free_metalist(ip, &metalist[0]); > return error; > } > diff --git a/gfs2/fsck/metawalk.h b/gfs2/fsck/metawalk.h > index 486c6eb..f5e71e1 100644 > --- a/gfs2/fsck/metawalk.h > +++ b/gfs2/fsck/metawalk.h > @@ -108,6 +108,10 @@ struct metawalk_fxns { > int (*repair_leaf) (struct gfs2_inode *ip, uint64_t *leaf_no, > int lindex, int ref_count, const char *msg, > void *private); > + int (*undo_check_meta) (struct gfs2_inode *ip, uint64_t block, > + int h, void *private); > + int (*undo_check_data) (struct gfs2_inode *ip, uint64_t block, > + void *private); > }; > > #endif /* _METAWALK_H */ > diff --git a/gfs2/fsck/pass1.c b/gfs2/fsck/pass1.c > index 04e5289..a88895f 100644 > --- a/gfs2/fsck/pass1.c > +++ b/gfs2/fsck/pass1.c > @@ -39,8 +39,7 @@ static int check_leaf(struct gfs2_inode *ip, uint64_t > block, void *private); > static int check_metalist(struct gfs2_inode *ip, uint64_t block, > struct gfs2_buffer_head **bh, int h, void *private); > static int undo_check_metalist(struct gfs2_inode *ip, uint64_t block, > - struct gfs2_buffer_head **bh, int h, > - void *private); > + int h, void *private); > static int check_data(struct gfs2_inode *ip, uint64_t block, void *private); > static int undo_check_data(struct gfs2_inode *ip, uint64_t block, > void *private); > @@ -104,12 +103,8 @@ struct metawalk_fxns pass1_fxns = { > .finish_eattr_indir = finish_eattr_indir, > .big_file_msg = big_file_comfort, > .repair_leaf = pass1_repair_leaf, > -}; > - > -struct metawalk_fxns undo_fxns = { > - .private = NULL, > - .check_metalist = undo_check_metalist, > - .check_data = undo_check_data, > + .undo_check_meta = undo_check_metalist, > + .undo_check_data = undo_check_data, > }; > > struct metawalk_fxns invalidate_fxns = { > @@ -326,53 +321,67 @@ static int check_metalist(struct gfs2_inode *ip, > uint64_t block, > return 0; > } > > -static int undo_check_metalist(struct gfs2_inode *ip, uint64_t block, > - struct gfs2_buffer_head **bh, int h, > - void *private) > +/* undo_reference - undo previously processed data or metadata > + * We've treated the metadata for this dinode as good so far, but not we > + * realize it's bad. So we need to undo what we've done. > + * > + * Returns: 0 - We need to process the block as metadata. In other words, > + * we need to undo any blocks it refers to. > + * 1 - We can't process the block as metadata. > + */ > + > +static int undo_reference(struct gfs2_inode *ip, uint64_t block, int meta, > + void *private) > { > - int found_dup = 0, iblk_type; > - struct gfs2_buffer_head *nbh; > struct block_count *bc = (struct block_count *)private; > - > - *bh = NULL; > + struct duptree *dt; > + struct inode_with_dups *id; > > if (!valid_block(ip->i_sbd, block)) { /* blk outside of FS */ > fsck_blockmap_set(ip, ip->i_di.di_num.no_addr, > _("bad block referencing"), gfs2_block_free); > return 1; > } > - if (is_dir(&ip->i_di, ip->i_sbd->gfs1) && h == ip->i_di.di_height) > - iblk_type = GFS2_METATYPE_JD; > - else > - iblk_type = GFS2_METATYPE_IN; > > - found_dup = find_remove_dup(ip, block, _("Metadata")); > - nbh = bread(ip->i_sbd, block); > + if (meta) > + bc->indir_count--; > + dt = dupfind(block); > + if (dt) { > + /* remove all duplicate reference structures from this inode */ > + do { > + id = find_dup_ref_inode(dt, ip); > + if (!id) > + break; > > - if (gfs2_check_meta(nbh, iblk_type)) { > - if (!found_dup) { > - fsck_blockmap_set(ip, block, _("bad indirect"), > - gfs2_block_free); > - brelse(nbh); > + dup_listent_delete(id); > + } while (id); > + > + if (dt->refs) { > + log_err(_("Block %llu (0x%llx) is still referenced " > + "from another inode; not freeing.\n"), > + (unsigned long long)block, > + (unsigned long long)block); > return 1; > } > - brelse(nbh); > - nbh = NULL; > - } else /* blk check ok */ > - *bh = nbh; > - > - bc->indir_count--; > - if (found_dup) { > - if (nbh) > - brelse(nbh); > - *bh = NULL; > - return 1; /* don't process the metadata again */ > - } else > - fsck_blockmap_set(ip, block, _("bad indirect"), > - gfs2_block_free); > + } > + fsck_blockmap_set(ip, block, > + meta ? _("bad indirect") : _("referenced data"), > + gfs2_block_free); > return 0; > } > > +static int undo_check_metalist(struct gfs2_inode *ip, uint64_t block, > + int h, void *private) > +{ > + return undo_reference(ip, block, 1, private); > +} > + > +static int undo_check_data(struct gfs2_inode *ip, uint64_t block, > + void *private) > +{ > + return undo_reference(ip, block, 0, private); > +} > + > static int check_data(struct gfs2_inode *ip, uint64_t block, void *private) > { > uint8_t q; > @@ -438,71 +447,9 @@ static int check_data(struct gfs2_inode *ip, uint64_t > block, void *private) > return 0; > } > > -static int undo_check_data(struct gfs2_inode *ip, uint64_t block, > - void *private) > -{ > - struct block_count *bc = (struct block_count *) private; > - > - if (!valid_block(ip->i_sbd, block)) { > - /* Mark the owner of this block with the bad_block > - * designator so we know to check it for out of range > - * blocks later */ > - fsck_blockmap_set(ip, ip->i_di.di_num.no_addr, > - _("bad (invalid or out of range) data"), > - gfs2_block_free); > - return 1; > - } > - bc->data_count--; > - return free_block_if_notdup(ip, block, _("data")); > -} > - > static int remove_inode_eattr(struct gfs2_inode *ip, struct block_count *bc) > { > - struct duptree *dt; > - struct inode_with_dups *id; > - osi_list_t *ref; > - int moved = 0; > - > - /* If it's a duplicate reference to the block, we need to check > - if the reference is on the valid or invalid inodes list. > - If it's on the valid inode's list, move it to the invalid > - inodes list. The reason is simple: This inode, although > - valid, has an now-invalid reference, so we should not give > - this reference preferential treatment over others. */ > - dt = dupfind(ip->i_di.di_eattr); > - if (dt) { > - osi_list_foreach(ref, &dt->ref_inode_list) { > - id = osi_list_entry(ref, struct inode_with_dups, list); > - if (id->block_no == ip->i_di.di_num.no_addr) { > - log_debug( _("Moving inode %lld (0x%llx)'s " > - "duplicate reference to %lld " > - "(0x%llx) from the valid to the " > - "invalid reference list.\n"), > - (unsigned long long) > - ip->i_di.di_num.no_addr, > - (unsigned long long) > - ip->i_di.di_num.no_addr, > - (unsigned long long) > - ip->i_di.di_eattr, > - (unsigned long long) > - ip->i_di.di_eattr); > - /* Move from the normal to the invalid list */ > - osi_list_del(&id->list); > - osi_list_add_prev(&id->list, > - &dt->ref_invinode_list); > - moved = 1; > - break; > - } > - } > - if (!moved) > - log_debug( _("Duplicate reference to %lld " > - "(0x%llx) not moved.\n"), > - (unsigned long long)ip->i_di.di_eattr, > - (unsigned long long)ip->i_di.di_eattr); > - } else { > - delete_block(ip, ip->i_di.di_eattr, NULL, > - "extended attribute", NULL); > - } > + undo_reference(ip, ip->i_di.di_eattr, 0, bc); > ip->i_di.di_eattr = 0; > bc->ea_count = 0; > ip->i_di.di_blocks = 1 + bc->indir_count + bc->data_count; > @@ -1080,23 +1027,10 @@ static int handle_ip(struct gfs2_sbd *sdp, struct > gfs2_inode *ip) > if (lf_dip && lf_dip->i_di.di_blocks != lf_blks) > reprocess_inode(lf_dip, "lost+found"); > > - if (fsck_abort || error < 0) > + /* We there was an error, we return 0 because we want fsck to continue > + and analyze the other dinodes as well. */ > + if (fsck_abort || error != 0) > return 0; > - if (error > 0) { > - log_err( _("Error: inode %llu (0x%llx) has unrecoverable " > - "errors; invalidating.\n"), > - (unsigned long long)ip->i_di.di_num.no_addr, > - (unsigned long long)ip->i_di.di_num.no_addr); > - undo_fxns.private = &bc; > - check_metatree(ip, &undo_fxns); > - /* If we undo the metadata accounting, including metadatas > - duplicate block status, we need to make sure later passes > - don't try to free up the metadata referenced by this inode. > - Therefore we mark the inode as free space. */ > - fsck_blockmap_set(ip, ip->i_di.di_num.no_addr, > - _("corrupt"), gfs2_block_free); > - return 0; > - } > > error = check_inode_eattr(ip, &pass1_fxns); >