The function call chain __btrfs_map_block()->find_live_mirror() uses
the thread %pid to determine the %mirror_num for a read when
mirror_num=0 is passed in the argument.

This pid-based mirror_num extrapolation has the following disadvantages:
 - A large read IO from a single process will read from only one disk.
 - In the worst case, all processes reading from the FS could have
   only odd or only even pids, so the read IO gets skewed.
 - There is no deterministic way of knowing/controlling which copy will
   be used for reading.
 - Performance may vary for a given multi-process workload run at
   different times.

So we need other types of readmirror policies.

This patch introduces a framework so that we can add more policies, and
converts the existing %pid-based selection into a configurable parameter
that is set using the property interface.

 For example:
  btrfs property set /btrfs readmirror pid
  btrfs property set /btrfs readmirror ""

Signed-off-by: Anand Jain <anand.j...@oracle.com>
---
 fs/btrfs/props.c   | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.c | 11 ++++++++++-
 fs/btrfs/volumes.h |  7 +++++++
 3 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 1a13f10a6ef5..776cdf099f93 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -11,6 +11,7 @@
 #include "ctree.h"
 #include "xattr.h"
 #include "compression.h"
+#include "volumes.h"
 
 #define BTRFS_PROP_HANDLERS_HT_BITS 8
 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
@@ -326,6 +327,45 @@ static const char *prop_compression_extract(struct inode *inode)
        return NULL;
 }
 
+static int prop_readmirror_validate(struct inode *inode, const char *value,
+                                   size_t len)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+
+       if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID)
+               return -EINVAL;
+
+       if (!len)
+               return 0;
+
+       if (!strncmp("pid", value, 3))
+               return 0;
+
+       return -EINVAL;
+}
+
+static int prop_readmirror_apply(struct inode *inode, const char *value,
+                                size_t len)
+{
+       struct btrfs_fs_devices *fs_devices = btrfs_sb(inode->i_sb)->fs_devices;
+
+       if (!value)
+               fs_devices->readmirror_policy = BTRFS_READMIRROR_DEFAULT;
+       else if (!strncmp("pid", value, 3))
+               fs_devices->readmirror_policy = BTRFS_READMIRROR_PID;
+
+       return 0;
+}
+
+static const char *prop_readmirror_extract(struct inode *inode)
+{
+       /*
+        * readmirror policy is applied for the whole FS, inheritance is not
+        * applicable.
+        */
+       return NULL;
+}
+
 static struct prop_handler prop_handlers[] = {
        {
                .xattr_name = XATTR_BTRFS_PREFIX "compression",
@@ -334,6 +374,13 @@ static const char *prop_compression_extract(struct inode *inode)
                .extract = prop_compression_extract,
                .inheritable = 1
        },
+       {
+               .xattr_name = XATTR_BTRFS_PREFIX "readmirror",
+               .validate = prop_readmirror_validate,
+               .apply = prop_readmirror_apply,
+               .extract = prop_readmirror_extract,
+               .inheritable = 0
+       },
 };
 
 static int inherit_props(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9024eee889b9..e5072d46e181 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5562,7 +5562,16 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
        else
                num_stripes = map->num_stripes;
 
-       preferred_mirror = first + current->pid % num_stripes;
+       switch(fs_info->fs_devices->readmirror_policy) {
+       case BTRFS_READMIRROR_PID:
+               /* fall through */
+       case BTRFS_READMIRROR_DEFAULT:
+               /* fall through */
+       default:
+               /* readmirror as per thread pid */
+               preferred_mirror = first + current->pid % num_stripes;
+               break;
+       }
 
        if (dev_replace_is_ongoing &&
            fs_info->dev_replace.cont_reading_from_srcdev_mode ==
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3ad9d58d1b66..27dce9242b55 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -208,6 +208,11 @@ struct btrfs_device {
 BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
 BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
 
+enum btrfs_readmirror_policy {
+       BTRFS_READMIRROR_DEFAULT,
+       BTRFS_READMIRROR_PID,
+};
+
 struct btrfs_fs_devices {
        u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
        u8 metadata_uuid[BTRFS_FSID_SIZE];
@@ -254,6 +259,8 @@ struct btrfs_fs_devices {
        struct kobject fsid_kobj;
        struct kobject *device_dir_kobj;
        struct completion kobj_unregister;
+
+       int readmirror_policy;
 };
 
 #define BTRFS_BIO_INLINE_CSUM_SIZE     64
-- 
1.8.3.1

Reply via email to