On Wed, Jun 26, 2019 at 04:34:02PM +0800, Anand Jain wrote: > Introduces devid readmirror property, to direct read IO to the specified > device(s). > > The readmirror property is stored as extended attribute on the root > inode.
So this is permanently stored on the filesystem, is the policy applied at mount time? > The readmirror input format is devid1,2,3.. etc. And for the > each devid provided, a new flag BTRFS_DEV_STATE_READ_OPTIMISED is set. I'd say it should be called PREFERRED, and the format specifier could add ":" between devid and the numbers, like "devid:1,2,3", we'll probably want more types in the future. This is the first whole-filesystem property so we can't follow a naming scheme, so we'll have to decide if we go with flat or hierarchical naming with more generic categories like policy.mirror.read and similar. > As of now readmirror by devid supports only raid1s. Raid10 support has > to leverage device grouping feature, which is yet to be implemented. > > Signed-off-by: Anand Jain <anand.j...@oracle.com> > --- > fs/btrfs/props.c | 69 +++++++++++++++++++++++++++++++++++++++++++++- > fs/btrfs/volumes.c | 16 +++++++++++ > fs/btrfs/volumes.h | 2 ++ > 3 files changed, 86 insertions(+), 1 deletion(-) > > diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c > index 0dc26a154a98..6a789e1c150c 100644 > --- a/fs/btrfs/props.c > +++ b/fs/btrfs/props.c > @@ -316,7 +316,10 @@ static const char *prop_compression_extract(struct inode > *inode) > static int prop_readmirror_validate(struct inode *inode, const char *value, > size_t len) > { > + char *value_dup; > + char *devid_str; > struct btrfs_root *root = BTRFS_I(inode)->root; > + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; > > if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID) > return -EINVAL; > @@ -324,16 +327,80 @@ static int prop_readmirror_validate(struct inode > *inode, const char *value, > if (!len) > return 0; > > - return -EINVAL; > + if (len <= 5 || strncmp("devid", value, 5)) > + return -EINVAL; > + > + value_dup = kstrndup(value + 5, len - 5, GFP_KERNEL); > + if (!value_dup) > + return -ENOMEM; > + > + while ((devid_str = strsep(&value_dup, ",")) != NULL) { > + u64 devid; > + struct btrfs_device *device; > + > + if (kstrtoull(devid_str, 10, &devid)) { > + kfree(value_dup); > + return -EINVAL; > + } > + > + device = btrfs_find_device(fs_devices, devid, NULL, NULL, > false); Doesn't this need locking? > + if (!device) { > + kfree(value_dup); > + return -EINVAL; > + } > + } > + > + return 0; > } > > static int prop_readmirror_apply(struct inode *inode, const char *value, > size_t len) > { > + char *value_dup; > + char *devid_str; > + struct btrfs_device *device; > struct btrfs_fs_devices *fs_devices = btrfs_sb(inode->i_sb)->fs_devices; > > + if (value) { > + value_dup = kstrndup(value + 5, len - 5, GFP_KERNEL); > + if (!value_dup) > + return -ENOMEM; > + } > + > + /* Both set and reset has to clear the exisiting values */ > + list_for_each_entry(device, &fs_devices->devices, dev_list) { > + if (test_bit(BTRFS_DEV_STATE_READ_OPTIMISED, > + &device->dev_state)) { > + clear_bit(BTRFS_DEV_STATE_READ_OPTIMISED, > + &device->dev_state); > + } > + } And this one too. > fs_devices->readmirror_policy = BTRFS_READMIRROR_DEFAULT; > > + /* Its only reset so just return */ > + if (!value) > + return 0; > + > + while ((devid_str = strsep(&value_dup, ",")) != NULL) { > + u64 devid; > + > + /* Has been verified in validate() this will not fail */ > + if (kstrtoull(devid_str, 10, &devid)) { > + kfree(value_dup); > + return -EINVAL; > + } > + > + device = btrfs_find_device(fs_devices, devid, NULL, NULL, > false); > + if (!device) { > + kfree(value_dup); > + return -EINVAL; > + } > + > + set_bit(BTRFS_DEV_STATE_READ_OPTIMISED, &device->dev_state); > + fs_devices->readmirror_policy = BTRFS_READMIRROR_DEVID; > + } > + > + kfree(value_dup); > return 0; > } > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index d72850ed4f88..993645f4b5f6 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -5481,6 +5481,7 @@ static int find_live_mirror(struct btrfs_fs_info > *fs_info, > int preferred_mirror; > int tolerance; > struct btrfs_device *srcdev; > + bool found = false; > > ASSERT((map->type & > (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); > @@ -5491,6 +5492,21 @@ static int find_live_mirror(struct btrfs_fs_info > *fs_info, > num_stripes = map->num_stripes; > > switch(fs_info->fs_devices->readmirror_policy) { > + case BTRFS_READMIRROR_DEVID: > + /* skip raid10 for now */ > + if (map->type & BTRFS_BLOCK_GROUP_RAID1) { > + for (i = first; i < first + num_stripes; i++) { > + if (test_bit(BTRFS_DEV_STATE_READ_OPTIMISED, > + &map->stripes[i].dev->dev_state)) { > + preferred_mirror = i; > + found = true; > + break; > + } > + } > + if (found) > + break; > + } > + /* fall through */ > case BTRFS_READMIRROR_DEFAULT: > /* fall through */ > default: > diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h > index e985d2133c0a..55b0f3b28595 100644 > --- a/fs/btrfs/volumes.h > +++ b/fs/btrfs/volumes.h > @@ -56,6 +56,7 @@ struct btrfs_io_geometry { > #define BTRFS_DEV_STATE_MISSING (2) > #define BTRFS_DEV_STATE_REPLACE_TGT (3) > #define BTRFS_DEV_STATE_FLUSH_SENT (4) > +#define BTRFS_DEV_STATE_READ_OPTIMISED (5) > > struct btrfs_device { > struct list_head dev_list; /* device_list_mutex */ > @@ -221,6 +222,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used); > > enum btrfs_readmirror_policy { > BTRFS_READMIRROR_DEFAULT, > + BTRFS_READMIRROR_DEVID, > }; > > struct btrfs_fs_devices { > -- > 2.20.1 (Apple Git-117)