Hi Qu, On Thu, Oct 30, 2014 at 4:54 AM, Qu Wenruo <quwen...@cn.fujitsu.com> wrote: > Before the patch, chunk will be considered bad if the corresponding > block group is missing, even the only uncertain data is the 'used' > member of the block group. > > This patch will try to recalculate the 'used' value of the block group > and rebuild it. > So even only chunk item and dev extent item is found, the chunk can be > recovered. > Although if extent tree is damanged and needed extent item can't be > read, the block group's 'used' value will be the block group length, to > prevent any later write/block reserve damaging the block group. > In that case, we will prompt user and recommend them to use > '--init-extent-tree' to rebuild extent tree if possible. > > Signed-off-by: Qu Wenruo <quwen...@cn.fujitsu.com> > --- > btrfsck.h | 3 +- > chunk-recover.c | 242 > +++++++++++++++++++++++++++++++++++++++++++++++++------- > cmds-check.c | 29 ++++--- > 3 files changed, 234 insertions(+), 40 deletions(-) > > diff --git a/btrfsck.h b/btrfsck.h > index 356c767..7a50648 100644 > --- a/btrfsck.h > +++ b/btrfsck.h > @@ -179,5 +179,6 @@ btrfs_new_device_extent_record(struct extent_buffer *leaf, > int check_chunks(struct cache_tree *chunk_cache, > struct block_group_tree *block_group_cache, > struct device_extent_tree *dev_extent_cache, > - struct list_head *good, struct list_head *bad, int silent); > + struct list_head *good, struct list_head *bad, > + struct list_head *rebuild, int silent); > #endif > diff --git a/chunk-recover.c b/chunk-recover.c > index 6f43066..dbf98b5 100644 > --- a/chunk-recover.c > +++ b/chunk-recover.c > @@ -61,6 +61,7 @@ struct recover_control { > > struct list_head good_chunks; > struct list_head bad_chunks; > + struct list_head rebuild_chunks; > struct list_head unrepaired_chunks; > pthread_mutex_t rc_lock; > }; > @@ -203,6 +204,7 @@ static void init_recover_control(struct recover_control > *rc, int verbose, > > INIT_LIST_HEAD(&rc->good_chunks); > 
INIT_LIST_HEAD(&rc->bad_chunks); > + INIT_LIST_HEAD(&rc->rebuild_chunks); > INIT_LIST_HEAD(&rc->unrepaired_chunks); > > rc->verbose = verbose; > @@ -529,22 +531,32 @@ static void print_check_result(struct recover_control > *rc) > return; > > printf("CHECK RESULT:\n"); > - printf("Healthy Chunks:\n"); > + printf("Recoverable Chunks:\n"); > list_for_each_entry(chunk, &rc->good_chunks, list) { > print_chunk_info(chunk, " "); > good++; > total++; > } > - printf("Bad Chunks:\n"); > + list_for_each_entry(chunk, &rc->rebuild_chunks, list) { > + print_chunk_info(chunk, " "); > + good++; > + total++; > + } > + list_for_each_entry(chunk, &rc->unrepaired_chunks, list) { > + print_chunk_info(chunk, " "); > + good++; > + total++; > + } > + printf("Unrecoverable Chunks:\n"); > list_for_each_entry(chunk, &rc->bad_chunks, list) { > print_chunk_info(chunk, " "); > bad++; > total++; > } > printf("\n"); > - printf("Total Chunks:\t%d\n", total); > - printf(" Heathy:\t%d\n", good); > - printf(" Bad:\t%d\n", bad); > + printf("Total Chunks:\t\t%d\n", total); > + printf(" Recoverable:\t\t%d\n", good); > + printf(" Unrecoverable:\t%d\n", bad); > > printf("\n"); > printf("Orphan Block Groups:\n"); > @@ -555,6 +567,7 @@ static void print_check_result(struct recover_control *rc) > printf("Orphan Device Extents:\n"); > list_for_each_entry(devext, &rc->devext.no_chunk_orphans, chunk_list) > print_device_extent_info(devext, " "); > + printf("\n"); > } > > static int check_chunk_by_metadata(struct recover_control *rc, > @@ -938,6 +951,11 @@ static int build_device_maps_by_chunk_records(struct > recover_control *rc, > if (ret) > return ret; > } > + list_for_each_entry(chunk, &rc->rebuild_chunks, list) { > + ret = build_device_map_by_chunk_record(root, chunk); > + if (ret) > + return ret; > + } > return ret; > } > > @@ -1168,12 +1186,31 @@ static int __rebuild_device_items(struct > btrfs_trans_handle *trans, > return ret; > } > > +static int __insert_chunk_item(struct btrfs_trans_handle *trans, > + 
struct chunk_record *chunk_rec, > + struct btrfs_root *chunk_root) > +{ > + struct btrfs_key key; > + struct btrfs_chunk *chunk = NULL; > + int ret = 0; > + > + chunk = create_chunk_item(chunk_rec); > + if (!chunk) > + return -ENOMEM; > + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; > + key.type = BTRFS_CHUNK_ITEM_KEY; > + key.offset = chunk_rec->offset; > + > + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, > + btrfs_chunk_item_size(chunk->num_stripes)); > + free(chunk); > + return ret; > +} > + > static int __rebuild_chunk_items(struct btrfs_trans_handle *trans, > struct recover_control *rc, > struct btrfs_root *root) > { > - struct btrfs_key key; > - struct btrfs_chunk *chunk = NULL; > struct btrfs_root *chunk_root; > struct chunk_record *chunk_rec; > int ret; > @@ -1181,17 +1218,12 @@ static int __rebuild_chunk_items(struct > btrfs_trans_handle *trans, > chunk_root = root->fs_info->chunk_root; > > list_for_each_entry(chunk_rec, &rc->good_chunks, list) { > - chunk = create_chunk_item(chunk_rec); > - if (!chunk) > - return -ENOMEM; > - > - key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; > - key.type = BTRFS_CHUNK_ITEM_KEY; > - key.offset = chunk_rec->offset; > - > - ret = btrfs_insert_item(trans, chunk_root, &key, chunk, > - btrfs_chunk_item_size(chunk->num_stripes)); > - free(chunk); > + ret = __insert_chunk_item(trans, chunk_rec, chunk_root); > + if (ret) > + return ret; > + } > + list_for_each_entry(chunk_rec, &rc->rebuild_chunks, list) { > + ret = __insert_chunk_item(trans, chunk_rec, chunk_root); > if (ret) > return ret; > } > @@ -1255,6 +1287,131 @@ static int rebuild_sys_array(struct recover_control > *rc, > > } > > +static int calculate_bg_used(struct btrfs_root *extent_root, > + struct chunk_record *chunk_rec, > + struct btrfs_path *path, > + u64 *used) > +{ > + struct extent_buffer *node; > + struct btrfs_key found_key; > + int slot; > + int ret = 0; > + u64 used_ret = 0; > + > + while (1) { > + node = path->nodes[0]; > + slot = 
path->slots[0]; > + btrfs_item_key_to_cpu(node, &found_key, slot); > + if (found_key.objectid >= chunk_rec->offset + > chunk_rec->length) > + break; > + if (found_key.type != BTRFS_METADATA_ITEM_KEY && > + found_key.type != BTRFS_EXTENT_DATA_KEY) > + goto next; > + if (found_key.type == BTRFS_METADATA_ITEM_KEY) > + used_ret += extent_root->nodesize; > + else > + used_ret += found_key.offset; > +next: > + if (slot + 1 < btrfs_header_nritems(node)) > + slot++; > + else { > + ret = btrfs_next_leaf(extent_root, path); > + if (ret > 0) { > + ret = 0; > + break; > + } > + if (ret < 0) > + break; > + } > + } > + if (!ret) > + *used = used_ret; > + return ret; > +} > + > +static int __insert_block_group(struct btrfs_trans_handle *trans, > + struct chunk_record *chunk_rec, > + struct btrfs_root *extent_root, > + u64 used) > +{ > + struct btrfs_block_group_item bg_item; > + struct btrfs_key key; > + int ret = 0; > + > + btrfs_set_block_group_used(&bg_item, used); > + btrfs_set_block_group_chunk_objectid(&bg_item, used); This looks like a bug: the block group's chunk_objectid is being set to "used". I think the second argument to btrfs_set_block_group_chunk_objectid() should be "BTRFS_FIRST_CHUNK_TREE_OBJECTID" instead.
> + btrfs_set_block_group_flags(&bg_item, chunk_rec->type_flags); > + key.objectid = chunk_rec->offset; > + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; > + key.offset = chunk_rec->length; > + > + ret = btrfs_insert_item(trans, extent_root, &key, &bg_item, > + sizeof(bg_item)); > + return ret; > +} > + > +/* > + * Search through the extent tree to rebuild the 'used' member of the block > + * group. > + * However, since block group and extent item shares the extent tree, > + * the extent item may also missing. > + * In that case, we fill the 'used' with the length of the block group to > + * ensure no write into the block group. > + * Btrfsck will hate it but we will inform user to call '--init-extent-tree' > + * if possible, or just salvage as much data as possible from the fs. > + */ > +static int rebuild_block_group(struct btrfs_trans_handle *trans, > + struct recover_control *rc, > + struct btrfs_root *root) > +{ > + struct chunk_record *chunk_rec; > + struct btrfs_key search_key; > + struct btrfs_path *path; > + u64 used = 0; > + int ret = 0; > + > + if (list_empty(&rc->rebuild_chunks)) > + return 0; > + > + path = btrfs_alloc_path(); > + if (!path) > + return -ENOMEM; > + list_for_each_entry(chunk_rec, &rc->rebuild_chunks, list) { > + search_key.objectid = chunk_rec->offset; > + search_key.type = BTRFS_EXTENT_ITEM_KEY; > + search_key.offset = 0; > + ret = btrfs_search_slot(NULL, root->fs_info->extent_root, > + &search_key, path, 0, 0); > + if (ret < 0) > + goto out; > + ret = calculate_bg_used(root->fs_info->extent_root, > + chunk_rec, path, &used); > + /* > + * Extent tree is damaged, better to rebuild the whole extent > + * tree. Currently, change the used to chunk's len to prevent > + * write/block reserve happening in that block group. 
> + */ > + if (ret < 0) { > + fprintf(stderr, > + "Fail to search extent tree for block group: > [%llu,%llu]\n", > + chunk_rec->offset, > + chunk_rec->offset + chunk_rec->length); > + fprintf(stderr, > + "Mark the block group full to prevent block > rsv problems\n"); > + used = chunk_rec->length; > + } > + btrfs_release_path(path); > + ret = __insert_block_group(trans, chunk_rec, > + root->fs_info->extent_root, > + used); > + if (ret < 0) > + goto out; > + } > +out: > + btrfs_free_path(path); > + return ret; > +} > + > static struct btrfs_root * > open_ctree_with_broken_chunk(struct recover_control *rc) > { > @@ -2063,6 +2220,7 @@ static int btrfs_recover_chunks(struct recover_control > *rc) > ret = insert_cache_extent(&rc->chunk, &chunk->cache); > BUG_ON(ret); > > + list_del_init(&bg->list); > if (!nstripes) { > list_add_tail(&chunk->list, &rc->bad_chunks); > continue; > @@ -2093,6 +2251,33 @@ static int btrfs_recover_chunks(struct recover_control > *rc) > return 0; > } > > +static inline int is_chunk_overlap(struct chunk_record *chunk1, > + struct chunk_record *chunk2) > +{ > + if (chunk1->offset >= chunk2->offset + chunk2->length || > + chunk1->offset + chunk1->length <= chunk2->offset) > + return 0; > + return 1; > +} > + > +/* Move invalid(overlap with good chunks) rebuild chunks to bad chunk list */ > +static void validate_rebuild_chunks(struct recover_control *rc) > +{ > + struct chunk_record *good; > + struct chunk_record *rebuild; > + struct chunk_record *tmp; > + > + list_for_each_entry_safe(rebuild, tmp, &rc->rebuild_chunks, list) { > + list_for_each_entry(good, &rc->good_chunks, list) { > + if (is_chunk_overlap(rebuild, good)) { > + list_move_tail(&rebuild->list, > + &rc->bad_chunks); > + break; > + } > + } > + } > +} > + > /* > * Return 0 when succesful, < 0 on error and > 0 if aborted by user > */ > @@ -2127,8 +2312,7 @@ int btrfs_recover_chunk_tree(char *path, int verbose, > int yes) > print_scan_result(&rc); > > ret = check_chunks(&rc.chunk, &rc.bg, 
&rc.devext, &rc.good_chunks, > - &rc.bad_chunks, 1); > - print_check_result(&rc); > + &rc.bad_chunks, &rc.rebuild_chunks, 1); > if (ret) { > if (!list_empty(&rc.bg.block_groups) || > !list_empty(&rc.devext.no_chunk_orphans)) { > @@ -2136,17 +2320,13 @@ int btrfs_recover_chunk_tree(char *path, int verbose, > int yes) > if (ret) > goto fail_rc; > } > - /* > - * If the chunk is healthy, its block group item and device > - * extent item should be written on the disks. So, it is very > - * likely that the bad chunk is a old one that has been > - * droppped from the fs. Don't deal with them now, we will > - * check it after the fs is opened. > - */ > } else { > - fprintf(stderr, "Check chunks successfully with no > orphans\n"); > + print_check_result(&rc); > + printf("Check chunks successfully with no orphans\n"); > goto fail_rc; > } > + validate_rebuild_chunks(&rc); > + print_check_result(&rc); > > root = open_ctree_with_broken_chunk(&rc); > if (IS_ERR(root)) { > @@ -2185,6 +2365,12 @@ int btrfs_recover_chunk_tree(char *path, int verbose, > int yes) > ret = rebuild_sys_array(&rc, root); > BUG_ON(ret); > > + ret = rebuild_block_group(trans, &rc, root); > + if (ret) { > + printf("Fail to rebuild block groups.\n"); > + printf("Recommend to run 'btrfs check --init-extent-tree > <dev>' after recovery\n"); > + } > + > btrfs_commit_transaction(trans, root); > fail_close_ctree: > close_ctree(root); > diff --git a/cmds-check.c b/cmds-check.c > index 2a5f823..2795ccf 100644 > --- a/cmds-check.c > +++ b/cmds-check.c > @@ -6133,6 +6133,13 @@ u64 calc_stripe_length(u64 type, u64 length, int > num_stripes) > return stripe_size; > } > > +/* > + * Check the chunk with its block group/dev list ref: > + * Return 0 if all refs seems valid. > + * Return 1 if part of refs seems valid, need later check for rebuild ref > + * like missing block group and needs to search extent tree to rebuild them. > + * Return -1 if essential refs are missing and unable to rebuild. 
> + */ > static int check_chunk_refs(struct chunk_record *chunk_rec, > struct block_group_tree *block_group_cache, > struct device_extent_tree *dev_extent_cache, > @@ -6188,7 +6195,7 @@ static int check_chunk_refs(struct chunk_record > *chunk_rec, > chunk_rec->length, > chunk_rec->offset, > chunk_rec->type_flags); > - ret = -1; > + ret = 1; > } > > length = calc_stripe_length(chunk_rec->type_flags, chunk_rec->length, > @@ -6241,7 +6248,8 @@ static int check_chunk_refs(struct chunk_record > *chunk_rec, > int check_chunks(struct cache_tree *chunk_cache, > struct block_group_tree *block_group_cache, > struct device_extent_tree *dev_extent_cache, > - struct list_head *good, struct list_head *bad, int silent) > + struct list_head *good, struct list_head *bad, > + struct list_head *rebuild, int silent) > { > struct cache_extent *chunk_item; > struct chunk_record *chunk_rec; > @@ -6256,15 +6264,14 @@ int check_chunks(struct cache_tree *chunk_cache, > cache); > err = check_chunk_refs(chunk_rec, block_group_cache, > dev_extent_cache, silent); > - if (err) { > + if (err) > ret = err; > - if (bad) > - list_add_tail(&chunk_rec->list, bad); > - } else { > - if (good) > - list_add_tail(&chunk_rec->list, good); > - } > - > + if (err == 0 && good) > + list_add_tail(&chunk_rec->list, good); > + if (err > 0 && rebuild) > + list_add_tail(&chunk_rec->list, rebuild); > + if (err < 0 && bad) > + list_add_tail(&chunk_rec->list, bad); > chunk_item = next_cache_extent(chunk_item); > } > > @@ -6548,7 +6555,7 @@ again: > } > > err = check_chunks(&chunk_cache, &block_group_cache, > - &dev_extent_cache, NULL, NULL, 0); > + &dev_extent_cache, NULL, NULL, NULL, 0); > if (err && !ret) > ret = err; > > -- > 2.1.2 A couple of questions: # In remove_chunk_extent_item, should we also consider "rebuild" chunks now? It can happen that a "rebuild" chunk is a SYSTEM chunk. Should we try to handle that case as well? # Same question for "rebuild_sys_array": should we also consider "rebuild" chunks there? Thanks, Alex. 
> > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html