On Wed, Jun 26, 2019 at 04:34:02PM +0800, Anand Jain wrote:
> Introduces devid readmirror property, to direct read IO to the specified
> device(s).
> 
> The readmirror property is stored as extended attribute on the root
> inode.

So this is permanently stored on the filesystem, is the policy applied
at mount time?

> The readmirror input format is devid1,2,3.. etc. And for the
> each devid provided, a new flag BTRFS_DEV_STATE_READ_OPTIMISED is set.

I'd say it should be called PREFERRED, and the format specifier could
add ":" between devid and the numbers, like "devid:1,2,3", we'll
probably want more types in the future.

This is the first whole-filesystem property so we can't follow a naming
scheme, so we'll have to decide if we go with flat or hierarchical
naming with more generic categories like policy.mirror.read and similar.

> As of now readmirror by devid supports only raid1s. Raid10 support has
> to leverage device grouping feature, which is yet to be implemented.
> 
> Signed-off-by: Anand Jain <anand.j...@oracle.com>
> ---
>  fs/btrfs/props.c   | 69 +++++++++++++++++++++++++++++++++++++++++++++-
>  fs/btrfs/volumes.c | 16 +++++++++++
>  fs/btrfs/volumes.h |  2 ++
>  3 files changed, 86 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
> index 0dc26a154a98..6a789e1c150c 100644
> --- a/fs/btrfs/props.c
> +++ b/fs/btrfs/props.c
> @@ -316,7 +316,10 @@ static const char *prop_compression_extract(struct inode 
> *inode)
>  static int prop_readmirror_validate(struct inode *inode, const char *value,
>                                   size_t len)
>  {
> +     char *value_dup;
> +     char *devid_str;
>       struct btrfs_root *root = BTRFS_I(inode)->root;
> +     struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
>  
>       if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID)
>               return -EINVAL;
> @@ -324,16 +327,80 @@ static int prop_readmirror_validate(struct inode 
> *inode, const char *value,
>       if (!len)
>               return 0;
>  
> -     return -EINVAL;
> +     if (len <= 5 || strncmp("devid", value, 5))
> +             return -EINVAL;
> +
> +     value_dup = kstrndup(value + 5, len - 5, GFP_KERNEL);
> +     if (!value_dup)
> +             return -ENOMEM;
> +
> +     while ((devid_str = strsep(&value_dup, ",")) != NULL) {
> +             u64 devid;
> +             struct btrfs_device *device;
> +
> +             if (kstrtoull(devid_str, 10, &devid)) {
> +                     kfree(value_dup);
> +                     return -EINVAL;
> +             }
> +
> +             device = btrfs_find_device(fs_devices, devid, NULL, NULL, 
> false);

Doesn't this need locking?

> +             if (!device) {
> +                     kfree(value_dup);
> +                     return -EINVAL;
> +             }
> +     }
> +
> +     return 0;
>  }
>  
>  static int prop_readmirror_apply(struct inode *inode, const char *value,
>                                size_t len)
>  {
> +     char *value_dup;
> +     char *devid_str;
> +     struct btrfs_device *device;
>       struct btrfs_fs_devices *fs_devices = btrfs_sb(inode->i_sb)->fs_devices;
>  
> +     if (value) {
> +             value_dup = kstrndup(value + 5, len - 5, GFP_KERNEL);
> +             if (!value_dup)
> +                     return -ENOMEM;
> +     }
> +
> +     /* Both set and reset has to clear the exisiting values */
> +     list_for_each_entry(device, &fs_devices->devices, dev_list) {
> +             if (test_bit(BTRFS_DEV_STATE_READ_OPTIMISED,
> +                          &device->dev_state)) {
> +                     clear_bit(BTRFS_DEV_STATE_READ_OPTIMISED,
> +                               &device->dev_state);
> +             }
> +     }

And this one too.

>       fs_devices->readmirror_policy = BTRFS_READMIRROR_DEFAULT;
>  
> +     /* Its only reset so just return */
> +     if (!value)
> +             return 0;
> +
> +     while ((devid_str = strsep(&value_dup, ",")) != NULL) {
> +             u64 devid;
> +
> +             /* Has been verified in validate() this will not fail */
> +             if (kstrtoull(devid_str, 10, &devid)) {
> +                     kfree(value_dup);
> +                     return -EINVAL;
> +             }
> +
> +             device = btrfs_find_device(fs_devices, devid, NULL, NULL, 
> false);
> +             if (!device) {
> +                     kfree(value_dup);
> +                     return -EINVAL;
> +             }
> +
> +             set_bit(BTRFS_DEV_STATE_READ_OPTIMISED, &device->dev_state);
> +             fs_devices->readmirror_policy = BTRFS_READMIRROR_DEVID;
> +     }
> +
> +     kfree(value_dup);
>       return 0;
>  }
>  
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index d72850ed4f88..993645f4b5f6 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -5481,6 +5481,7 @@ static int find_live_mirror(struct btrfs_fs_info 
> *fs_info,
>       int preferred_mirror;
>       int tolerance;
>       struct btrfs_device *srcdev;
> +     bool found = false;
>  
>       ASSERT((map->type &
>                (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
> @@ -5491,6 +5492,21 @@ static int find_live_mirror(struct btrfs_fs_info 
> *fs_info,
>               num_stripes = map->num_stripes;
>  
>       switch(fs_info->fs_devices->readmirror_policy) {
> +     case BTRFS_READMIRROR_DEVID:
> +             /* skip raid10 for now */
> +             if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
> +                     for (i = first; i < first + num_stripes; i++) {
> +                             if (test_bit(BTRFS_DEV_STATE_READ_OPTIMISED,
> +                                          &map->stripes[i].dev->dev_state)) {
> +                                     preferred_mirror = i;
> +                                     found = true;
> +                                     break;
> +                             }
> +                     }
> +                     if (found)
> +                             break;
> +             }
> +             /* fall through */
>       case BTRFS_READMIRROR_DEFAULT:
>               /* fall through */
>       default:
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index e985d2133c0a..55b0f3b28595 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -56,6 +56,7 @@ struct btrfs_io_geometry {
>  #define BTRFS_DEV_STATE_MISSING              (2)
>  #define BTRFS_DEV_STATE_REPLACE_TGT  (3)
>  #define BTRFS_DEV_STATE_FLUSH_SENT   (4)
> +#define BTRFS_DEV_STATE_READ_OPTIMISED       (5)
>  
>  struct btrfs_device {
>       struct list_head dev_list; /* device_list_mutex */
> @@ -221,6 +222,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
>  
>  enum btrfs_readmirror_policy {
>       BTRFS_READMIRROR_DEFAULT,
> +     BTRFS_READMIRROR_DEVID,
>  };
>  
>  struct btrfs_fs_devices {
> -- 
> 2.20.1 (Apple Git-117)

Reply via email to