__send_empty_flush() sends empty flush bios to every target in the
dm_table. However, if the num_targets exceeds the number of block
devices in the dm_table's device list, it could lead to multiple
invocations of __send_duplicate_bios() for the same block device.
Typically, a single thread sending numerous empty flush bios to one
block device is redundant, as these bios are likely to be merged by the
flush state machine. In scenarios where num_targets significantly
outweighs the number of block devices, such behavior may result in a
noteworthy decrease in performance.

This issue can be reproduced using this command line:
  for i in {0..1023}; do
    echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i))
  done | dmsetup create example

With this fix, a random write with fsync workload executed with the
following fio command:

  fio --group_reporting --name=benchmark --filename=/dev/mapper/example \
      --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \
      --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1

results in an increase from 857 KB/s to 30.8 MB/s of the write
throughput (3580% increase).

Signed-off-by: Yang Yang <[email protected]>
---
 drivers/md/dm-core.h          |  1 +
 drivers/md/dm-table.c         |  7 +++++
 drivers/md/dm.c               | 59 +++++++++++++++++++++++++++++++++++
 include/linux/device-mapper.h |  6 ++++
 4 files changed, 73 insertions(+)

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index e6757a30dcca..7e3f2168289f 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -217,6 +217,7 @@ struct dm_table {
        /* a list of devices used by this table */
        struct list_head devices;
        struct rw_semaphore devices_lock;
+       unsigned short num_devices;
 
        /* events get handed up using this callback */
        void (*event_fn)(void *data);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 41f1d731ae5a..ddc60e498afb 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t)
 
 int dm_table_resume_targets(struct dm_table *t)
 {
+       struct list_head *devices = dm_table_get_devices(t);
+       struct dm_dev_internal *dd;
        unsigned int i;
        int r = 0;
 
@@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t)
                        ti->type->resume(ti);
        }
 
+       t->num_devices = 0;
+
+       list_for_each_entry(dd, devices, list)
+               dd->dm_dev->index = ++(t->num_devices);
+
        return 0;
 }
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 56aa2a8b9d71..7297235291f6 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -48,6 +48,8 @@
  */
 #define REQ_DM_POLL_LIST       REQ_DRV
 
+#define DM_MAX_TABLE_DEVICES   1024
+
 static const char *_name = DM_NAME;
 
 static unsigned int major;
@@ -1543,10 +1545,38 @@ static unsigned int __send_duplicate_bios(struct 
clone_info *ci, struct dm_targe
        return ret;
 }
 
+static inline bool has_redundant_flush(struct dm_table *t,
+               unsigned long **bitmap)
+{
+       if (t->num_devices < t->num_targets) {
+               /* Add a limit here to prevent excessive memory usage for 
bitmaps */
+               if (t->num_devices >= DM_MAX_TABLE_DEVICES)
+                       return false;
+
+               /* dm_dev's index starts from 1, so need plus 1 here */
+               *bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL);
+               if (*bitmap)
+                       return true;
+       }
+
+       return false;
+}
+
+static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev,
+                                    sector_t start, sector_t len, void *data)
+{
+       unsigned short *index = data;
+       *index = dev->index;
+       return 0;
+}
+
 static void __send_empty_flush(struct clone_info *ci)
 {
        struct dm_table *t = ci->map;
        struct bio flush_bio;
+       unsigned long *handled_map;
+       unsigned int nr_handled = 0;
+       bool check = has_redundant_flush(t, &handled_map);
 
        /*
         * Use an on-stack bio for this, it's safe since we don't
@@ -1562,17 +1592,46 @@ static void __send_empty_flush(struct clone_info *ci)
 
        for (unsigned int i = 0; i < t->num_targets; i++) {
                unsigned int bios;
+               unsigned short index = 0;
                struct dm_target *ti = dm_table_get_target(t, i);
 
                if (unlikely(ti->num_flush_bios == 0))
                        continue;
 
+               /*
+                * If the num_targets is greater than the number of block 
devices
+                * in the dm_table's devices list, __send_empty_flush() might
+                * invoke __send_duplicate_bios() multiple times for the same
+                * block device. This could lead to a substantial decrease in
+                * performance when num_targets significantly exceeds the number
+                * of block devices.
+                * Ensure that __send_duplicate_bios() is only called once for
+                * each block device.
+                */
+               if (check) {
+                       if (nr_handled == t->num_devices)
+                               break;
+
+                       if (ti->type->iterate_devices)
+                               ti->type->iterate_devices(ti, dm_get_dev_index, 
&index);
+
+                       if (index > 0) {
+                               if (__test_and_set_bit(index, handled_map))
+                                       continue;
+                               else
+                                       nr_handled++;
+                       }
+               }
+
                atomic_add(ti->num_flush_bios, &ci->io->io_count);
                bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios,
                                             NULL, GFP_NOWAIT);
                atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
        }
 
+       if (check)
+               bitmap_free(handled_map);
+
        /*
         * alloc_io() takes one extra reference for submission, so the
         * reference won't reach 0 without the following subtraction
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 82b2195efaca..4a54b4f0a609 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -169,6 +169,12 @@ struct dm_dev {
        struct dax_device *dax_dev;
        blk_mode_t mode;
        char name[16];
+
+       /*
+        * sequential number for each dm_dev in dm_table's devices list,
+        * start from 1
+        */
+       unsigned short index;
 };
 
 /*
-- 
2.34.1


Reply via email to