)sorry, forgot to CC btrfs Maillist)

On 08/11/2012 10:36 AM, Liu Bo wrote:
> On 08/10/2012 09:38 PM, Stefan Behrens wrote:
>> So far the return code of barrier_all_devices() is ignored, which
>> means that errors are ignored. The result can be a corrupt
>> filesystem which is not consistent.
>> This commit adds code to evaluate the return code of
>> barrier_all_devices(). The normal btrfs_error() mechanism is used to
>> switch the filesystem into read-only mode when errors are detected.
>>
>> In order to decide whether barrier_all_devices() should return
>> error or success, the number of disks that are allowed to fail the
>> barrier submission is calculated. This calculation accounts for the
>> worst RAID level of metadata, system and data. If single, dup or
>> RAID0 is in use, a single disk error is already considered to be
>> fatal. Otherwise a single disk error is tolerated.
>>
>> The calculation of the number of disks that are tolerated to fail
>> the barrier operation is performed when the filesystem gets mounted,
>> when a balance operation is started and finished, and when devices
>> are added or removed.
>>
>> Signed-off-by: Stefan Behrens <sbehr...@giantdisaster.de>
>> ---
>>  fs/btrfs/ctree.h    |   5 +++
>>  fs/btrfs/disk-io.c  | 109 
>> +++++++++++++++++++++++++++++++++++++++++++++-------
>>  fs/btrfs/disk-io.h  |   2 +
>>  fs/btrfs/ioctl.c    |   8 ++--
>>  fs/btrfs/tree-log.c |   7 +++-
>>  fs/btrfs/volumes.c  |  30 +++++++++++++++
>>  6 files changed, 142 insertions(+), 19 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index adb1cd7..af38d6d 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -1442,6 +1442,8 @@ struct btrfs_fs_info {
>>  
>>      /* next backup root to be overwritten */
>>      int backup_root_index;
>> +
>> +    int num_tolerated_disk_barrier_failures;
> [...]  
>> +int btrfs_calc_num_tolerated_disk_barrier_failures(
>> +    struct btrfs_fs_info *fs_info)
>> +{
>> +    struct btrfs_ioctl_space_info space;
>> +    struct btrfs_space_info *sinfo;
>> +    u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
>> +                   BTRFS_BLOCK_GROUP_SYSTEM,
>> +                   BTRFS_BLOCK_GROUP_METADATA,
>> +                   BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
>> +    int num_types = 4;
>> +    int i;
>> +    int c;
>> +    int num_tolerated_disk_barrier_failures =
>> +            (int)fs_info->fs_devices->num_devices;
>> +
>> +    for (i = 0; i < num_types; i++) {
>> +            struct btrfs_space_info *tmp;
>> +
>> +            sinfo = NULL;
>> +            rcu_read_lock();
>> +            list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
>> +                    if (tmp->flags == types[i]) {
>> +                            sinfo = tmp;
>> +                            break;
>> +                    }
>> +            }
>> +            rcu_read_unlock();
>> +
>> +            if (!sinfo)
>> +                    continue;
>> +
>> +            down_read(&sinfo->groups_sem);
>> +            for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
>> +                    if (!list_empty(&sinfo->block_groups[c])) {
>> +                            u64 flags;
>> +
>> +                            btrfs_get_block_group_info(
>> +                                    &sinfo->block_groups[c], &space);
>> +                            if (space.total_bytes == 0 ||
>> +                                space.used_bytes == 0)
>> +                                    continue;
>> +                            flags = space.flags;
>> +                            /*
>> +                             * return
>> +                             * 0: if dup, single or RAID0 is configured for
>> +                             *    any of metadata, system or data, else
>> +                             * 1: if RAID5 is configured, or if RAID1 or
>> +                             *    RAID10 is configured and only two mirrors
>> +                             *    are used, else
>> +                             * 2: if RAID6 is configured, else
>> +                             * num_mirrors - 1: if RAID1 or RAID10 is
>> +                             *                  configured and more than
>> +                             *                  2 mirrors are used.
>> +                             */
>> +                            if (num_tolerated_disk_barrier_failures > 0 &&
>> +                                ((flags & (BTRFS_BLOCK_GROUP_DUP |
>> +                                           BTRFS_BLOCK_GROUP_RAID0)) ||
>> +                                 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
>> +                                  == 0)))
> 
> Good work.
> 
> We already have __get_block_group_index(), for dup, single, raid0 we can do 
> the same thing like
> this:
> __get_block_group_index(flags) > 1
> 
> the only problem is to make the function public :)
> 
> 
>> +                                    num_tolerated_disk_barrier_failures = 0;
>> +                            else if (num_tolerated_disk_barrier_failures > 1
>> +                                     &&
>> +                                     (flags & (BTRFS_BLOCK_GROUP_RAID1 |
>> +                                               BTRFS_BLOCK_GROUP_RAID10)))
>> +                                    num_tolerated_disk_barrier_failures = 1;
>> +                    }
>> +            }
>> +            up_read(&sinfo->groups_sem);
>> +    }
>> +
>> +    return num_tolerated_disk_barrier_failures;
>> +}
>> +
>>  int write_all_supers(struct btrfs_root *root, int max_mirrors)
>>  {
>>      struct list_head *head;
>> @@ -2979,8 +3054,16 @@ int write_all_supers(struct btrfs_root *root, int 
>> max_mirrors)
>>      mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
>>      head = &root->fs_info->fs_devices->devices;
>>  
>> -    if (do_barriers)
>> -            barrier_all_devices(root->fs_info);
>> +    if (do_barriers) {
>> +            ret = barrier_all_devices(root->fs_info);
>> +            if (ret) {
>> +                    mutex_unlock(
>> +                            &root->fs_info->fs_devices->device_list_mutex);
>> +                    btrfs_error(root->fs_info, ret,
>> +                                "errors while submitting device barriers.");
>> +                    return ret;
>> +            }
>> +    }
>>  
>>      list_for_each_entry_rcu(dev, head, dev_list) {
>>              if (!dev->bdev) {
>> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
>> index c5b00a7..2025a91 100644
>> --- a/fs/btrfs/disk-io.h
>> +++ b/fs/btrfs/disk-io.h
>> @@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct 
>> btrfs_trans_handle *trans,
>>                                   u64 objectid);
>>  int btree_lock_page_hook(struct page *page, void *data,
>>                              void (*flush_fn)(void *));
>> +int btrfs_calc_num_tolerated_disk_barrier_failures(
>> +    struct btrfs_fs_info *fs_info);
>>  
>>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>>  void btrfs_init_lockdep(void);
>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
>> index 43f0012..6cd4125 100644
>> --- a/fs/btrfs/ioctl.c
>> +++ b/fs/btrfs/ioctl.c
>> @@ -2852,8 +2852,8 @@ static long btrfs_ioctl_default_subvol(struct file 
>> *file, void __user *argp)
>>      return 0;
>>  }
>>  
>> -static void get_block_group_info(struct list_head *groups_list,
>> -                             struct btrfs_ioctl_space_info *space)
>> +void btrfs_get_block_group_info(struct list_head *groups_list,
>> +                            struct btrfs_ioctl_space_info *space)
>>  {
>>      struct btrfs_block_group_cache *block_group;
>>  
>> @@ -2961,8 +2961,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, 
>> void __user *arg)
>>              down_read(&info->groups_sem);
>>              for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
>>                      if (!list_empty(&info->block_groups[c])) {
>> -                            get_block_group_info(&info->block_groups[c],
>> -                                                 &space);
>> +                            btrfs_get_block_group_info(
>> +                                    &info->block_groups[c], &space);
>>                              memcpy(dest, &space, sizeof(space));
>>                              dest++;
>>                              space_args.total_spaces++;
>> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
>> index c86670f..c30e1c0 100644
>> --- a/fs/btrfs/tree-log.c
>> +++ b/fs/btrfs/tree-log.c
>> @@ -2171,9 +2171,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
>>       * in and cause problems either.
>>       */
>>      btrfs_scrub_pause_super(root);
>> -    write_ctree_super(trans, root->fs_info->tree_root, 1);
>> +    ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
>>      btrfs_scrub_continue_super(root);
>> -    ret = 0;
>> +    if (ret) {
>> +            btrfs_abort_transaction(trans, root, ret);
>> +            goto out_wake_log_root;
>> +    }
>>  
>>      mutex_lock(&root->log_mutex);
>>      if (root->last_log_commit < log_transid)
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index b8708f9..48ccaa4 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -1474,6 +1474,9 @@ int btrfs_rm_device(struct btrfs_root *root, char 
>> *device_path)
>>              free_fs_devices(cur_devices);
>>      }
>>  
>> +    root->fs_info->num_tolerated_disk_barrier_failures =
>> +            btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
>> +
>>      /*
>>       * at this point, the device is zero sized.  We want to
>>       * remove it from the devices list and zero out the old super
>> @@ -1796,6 +1799,8 @@ int btrfs_init_new_device(struct btrfs_root *root, 
>> char *device_path)
>>      btrfs_clear_space_info_full(root->fs_info);
>>  
>>      unlock_chunks(root);
>> +    root->fs_info->num_tolerated_disk_barrier_failures =
>> +            btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
>>      ret = btrfs_commit_transaction(trans, root);
>>  
>>      if (seeding_dev) {
>> @@ -2807,6 +2812,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
>>              }
>>      }
>>  
>> +    if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
>> +            int num_tolerated_disk_barrier_failures;
>> +            u64 target = bctl->sys.target;
>> +
>> +            num_tolerated_disk_barrier_failures =
>> +                    btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
>> +            if (num_tolerated_disk_barrier_failures > 0 &&
>> +                (target &
>> +                 (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
>> +                  BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
> 
> ditto.
> 
> thanks,
> liubo
> 
>> +                    num_tolerated_disk_barrier_failures = 0;
>> +            else if (num_tolerated_disk_barrier_failures > 1 &&
>> +                     (target &
>> +                      (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
>> +                    num_tolerated_disk_barrier_failures = 1;
>> +
>> +            fs_info->num_tolerated_disk_barrier_failures =
>> +                    num_tolerated_disk_barrier_failures;
>> +    }
>> +
>>      ret = insert_balance_item(fs_info->tree_root, bctl);
>>      if (ret && ret != -EEXIST)
>>              goto out;
>> @@ -2839,6 +2864,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
>>              __cancel_balance(fs_info);
>>      }
>>  
>> +    if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
>> +            fs_info->num_tolerated_disk_barrier_failures =
>> +                    btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
>> +    }
>> +
>>      wake_up(&fs_info->balance_wait_q);
>>  
>>      return ret;
>>
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to