This commit implements a zoned chunk/dev_extent allocator. The zoned
allocator aligns the device extents to zone boundaries, so that a zone
reset affects only that device extent and does not change the state of
blocks in the neighboring device extents.

Also, it checks that a region allocation is not overlapping any of the
super block zones, and ensures the region is empty.

Reviewed-by: Josef Bacik <jo...@toxicpanda.com>
Signed-off-by: Naohiro Aota <naohiro.a...@wdc.com>
---
 fs/btrfs/volumes.c | 169 ++++++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/volumes.h |   1 +
 fs/btrfs/zoned.c   | 144 ++++++++++++++++++++++++++++++++++++++
 fs/btrfs/zoned.h   |  25 +++++++
 4 files changed, 323 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bb3f341f6a22..27208139d6e2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1414,11 +1414,62 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
                 * make sure to start at an offset of at least 1MB.
                 */
                return max_t(u64, start, SZ_1M);
+       case BTRFS_CHUNK_ALLOC_ZONED:
+               /*
+                * We don't care about the starting region like regular
+                * allocator, because we anyway use/reserve the first two
+                * zones for superblock logging.
+                */
+               return ALIGN(start, device->zone_info->zone_size);
        default:
                BUG();
        }
 }
 
+/*
+ * Zoned-policy hole check: move and/or shrink the hole described by
+ * @hole_start and @hole_size until the @num_bytes at its start are accepted
+ * by btrfs_find_allocatable_zones() and btrfs_ensure_empty_zones(), or until
+ * the hole is too small or used up.
+ *
+ * @device:     device owning the hole; device->zone_info is dereferenced
+ * @hole_start: in/out, start of the hole; asserted to be zone aligned
+ * @hole_size:  in/out, size of the hole
+ * @num_bytes:  size of the region the caller wants to allocate
+ *
+ * Return: true if *hole_start or *hole_size was modified, false otherwise.
+ */
+static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
+                                       u64 *hole_start, u64 *hole_size,
+                                       u64 num_bytes)
+{
+       u64 zone_size = device->zone_info->zone_size;
+       u64 pos;
+       int ret;
+       bool changed = false;
+
+       ASSERT(IS_ALIGNED(*hole_start, zone_size));
+
+       while (*hole_size > 0) {
+               pos = btrfs_find_allocatable_zones(device, *hole_start,
+                                                  *hole_start + *hole_size,
+                                                  num_bytes);
+               if (pos != *hole_start) {
+                       /* Drop the rejected head of the hole */
+                       *hole_size = *hole_start + *hole_size - pos;
+                       *hole_start = pos;
+                       changed = true;
+                       if (*hole_size < num_bytes)
+                               break;
+               }
+
+               ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
+
+               /* Range is ensured to be empty */
+               if (!ret)
+                       return changed;
+
+               /* Given hole range was invalid (outside of device) */
+               if (ret == -ERANGE) {
+                       *hole_start += *hole_size;
+                       *hole_size = 0;
+                       return true;
+               }
+
+               /*
+                * Any other error: skip one zone and try again.
+                * NOTE(review): the subtraction wraps if *hole_size is
+                * smaller than zone_size here - confirm callers rule it out.
+                */
+               *hole_start += zone_size;
+               *hole_size -= zone_size;
+               changed = true;
+       }
+
+       return changed;
+}
+
 /**
  * dev_extent_hole_check - check if specified hole is suitable for allocation
  * @device:    the device which we have the hole
@@ -1435,24 +1486,39 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
        bool changed = false;
        u64 hole_end = *hole_start + *hole_size;
 
-       /*
-        * Check before we set max_hole_start, otherwise we could end up
-        * sending back this offset anyway.
-        */
-       if (contains_pending_extent(device, hole_start, *hole_size)) {
-               if (hole_end >= *hole_start)
-                       *hole_size = hole_end - *hole_start;
-               else
-                       *hole_size = 0;
-               changed = true;
-       }
+       for (;;) {
+               /*
+                * Check before we set max_hole_start, otherwise we could end up
+                * sending back this offset anyway.
+                */
+               if (contains_pending_extent(device, hole_start, *hole_size)) {
+                       if (hole_end >= *hole_start)
+                               *hole_size = hole_end - *hole_start;
+                       else
+                               *hole_size = 0;
+                       changed = true;
+               }
+
+               switch (device->fs_devices->chunk_alloc_policy) {
+               case BTRFS_CHUNK_ALLOC_REGULAR:
+                       /* No extra check */
+                       break;
+               case BTRFS_CHUNK_ALLOC_ZONED:
+                       if (dev_extent_hole_check_zoned(device, hole_start,
+                                                       hole_size, num_bytes)) {
+                               changed = true;
+                               /*
+                                * The changed hole can contain pending
+                                * extent. Loop again to check that.
+                                */
+                               continue;
+                       }
+                       break;
+               default:
+                       BUG();
+               }
 
-       switch (device->fs_devices->chunk_alloc_policy) {
-       case BTRFS_CHUNK_ALLOC_REGULAR:
-               /* No extra check */
                break;
-       default:
-               BUG();
        }
 
        return changed;
@@ -1505,6 +1571,9 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
 
        search_start = dev_extent_search_start(device, search_start);
 
+       WARN_ON(device->zone_info &&
+               !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
+
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -4899,6 +4968,37 @@ static void init_alloc_chunk_ctl_policy_regular(
        ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
 }
 
+/*
+ * Initialize the chunk allocation limits in @ctl for the zoned policy.
+ *
+ * The maximum stripe size is one zone. The maximum chunk size is picked by
+ * the block group type in ctl->type and is then capped by a limit derived
+ * from fs_devices->total_rw_bytes; that limit itself is never smaller than
+ * the chunk size implied by ctl->devs_min.
+ */
+static void init_alloc_chunk_ctl_policy_zoned(
+                                     struct btrfs_fs_devices *fs_devices,
+                                     struct alloc_chunk_ctl *ctl)
+{
+       u64 zone_size = fs_devices->fs_info->zone_size;
+       u64 flags = ctl->type;
+       int stripes_min = ctl->devs_min * ctl->dev_stripes;
+       int data_stripes_min = (stripes_min - ctl->nparity) / ctl->ncopies;
+       u64 chunk_size_min = data_stripes_min * zone_size;
+       u64 rw_share;
+       u64 cap;
+
+       /* Stripes are one zone; a device extent holds dev_stripes of them */
+       ctl->max_stripe_size = zone_size;
+       ctl->dev_extent_min = zone_size * ctl->dev_stripes;
+
+       if (flags & BTRFS_BLOCK_GROUP_DATA) {
+               ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
+                                                zone_size);
+       } else if (flags & BTRFS_BLOCK_GROUP_METADATA) {
+               ctl->max_chunk_size = ctl->max_stripe_size;
+       } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+               ctl->max_chunk_size = 2 * ctl->max_stripe_size;
+               ctl->devs_max = min_t(int, ctl->devs_max,
+                                     BTRFS_MAX_DEVS_SYS_CHUNK);
+       }
+
+       /* We don't want a chunk larger than 10% of writable space */
+       rw_share = round_down(div_factor(fs_devices->total_rw_bytes, 1),
+                             zone_size);
+       cap = max(rw_share, chunk_size_min);
+       ctl->max_chunk_size = min(cap, ctl->max_chunk_size);
+}
+
 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
                                 struct alloc_chunk_ctl *ctl)
 {
@@ -4919,6 +5019,9 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
        case BTRFS_CHUNK_ALLOC_REGULAR:
                init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
                break;
+       case BTRFS_CHUNK_ALLOC_ZONED:
+               init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
+               break;
        default:
                BUG();
        }
@@ -5045,6 +5148,38 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
        return 0;
 }
 
+/*
+ * Decide the stripe and chunk sizes in @ctl for the zoned policy.
+ *
+ * The stripe size is fixed to the zone size of the first device in
+ * @devices_info. When the resulting chunk would exceed ctl->max_chunk_size,
+ * the number of devices is reduced instead of the stripe size.
+ *
+ * Return: always 0.
+ */
+static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
+                                   struct btrfs_device_info *devices_info)
+{
+       u64 zone_size = devices_info[0].dev->zone_info->zone_size;
+       /* Stripes that contribute to the block group size */
+       int nr_data;
+       u64 stripes;
+
+       /*
+        * Expected to hold, since:
+        *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
+        */
+       ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
+
+       ctl->stripe_size = zone_size;
+       ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+       nr_data = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+
+       /* stripe_size is fixed in ZONED. Reduce ndevs instead. */
+       if (ctl->stripe_size * nr_data > ctl->max_chunk_size) {
+               stripes = div_u64(ctl->max_chunk_size * ctl->ncopies,
+                                 ctl->stripe_size) + ctl->nparity;
+               ctl->ndevs = div_u64(stripes, ctl->dev_stripes);
+               ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
+               nr_data = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
+               ASSERT(ctl->stripe_size * nr_data <= ctl->max_chunk_size);
+       }
+
+       ctl->chunk_size = ctl->stripe_size * nr_data;
+
+       return 0;
+}
+
 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
                              struct alloc_chunk_ctl *ctl,
                              struct btrfs_device_info *devices_info)
@@ -5072,6 +5207,8 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
        switch (fs_devices->chunk_alloc_policy) {
        case BTRFS_CHUNK_ALLOC_REGULAR:
                return decide_stripe_size_regular(ctl, devices_info);
+       case BTRFS_CHUNK_ALLOC_ZONED:
+               return decide_stripe_size_zoned(ctl, devices_info);
        default:
                BUG();
        }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1997a4649a66..98a447badd6a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -213,6 +213,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
 
 enum btrfs_chunk_allocation_policy {
        BTRFS_CHUNK_ALLOC_REGULAR,
+       /* Set by btrfs_check_zoned_mode(); selects the zoned allocator paths */
+       BTRFS_CHUNK_ALLOC_ZONED,
 };
 
 /*
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index f0af88d497c7..e829fa2df8ac 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1,11 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/bitops.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include "ctree.h"
 #include "volumes.h"
 #include "zoned.h"
 #include "rcu-string.h"
+#include "disk-io.h"
 
 /* Maximum number of zones to report per blkdev_report_zones() call */
 #define BTRFS_REPORT_NR_ZONES   4096
@@ -557,6 +559,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 
        fs_info->zone_size = zone_size;
        fs_info->max_zone_append_size = max_zone_append_size;
+       fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
 
        /*
         * Check mount options here, because we might change fs_info->zoned
@@ -779,3 +782,144 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
                                sb_zone << zone_sectors_shift,
                                zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
 }
+
+/*
+ * btrfs_find_allocatable_zones - find allocatable zones within given region
+ * @device:    the device to allocate a region
+ * @hole_start: the position of the hole to allocate the region
+ * @hole_end:  the end position of the hole
+ * @num_bytes: the size of wanted region
+ * @return:    position of allocatable zones
+ *
+ * Allocatable region should not contain any superblock locations.
+ *
+ * @hole_start and @num_bytes are asserted to be zone aligned. The search
+ * starts at @hole_start and stops once the candidate position is no longer
+ * below @hole_end, so the returned position can be at or beyond @hole_end
+ * when nothing suitable was found. @hole_end itself is returned when the
+ * candidate region would extend past zinfo->nr_zones.
+ */
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+                                u64 hole_end, u64 num_bytes)
+{
+       struct btrfs_zoned_device_info *zinfo = device->zone_info;
+       u8 shift = zinfo->zone_size_shift;
+       u64 nzones = num_bytes >> shift;
+       u64 pos = hole_start;
+       u64 begin, end;
+       bool have_sb;
+       int i;
+
+       ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
+       ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+
+       while (pos < hole_end) {
+               begin = pos >> shift;
+               end = begin + nzones;
+
+               if (end > zinfo->nr_zones)
+                       return hole_end;
+
+               /*
+                * Check if zones in the region are all empty. Only done when
+                * the zone at pos is sequential; on failure move on by one
+                * zone.
+                */
+               if (btrfs_dev_is_sequential(device, pos) &&
+                   find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
+                       pos += zinfo->zone_size;
+                       continue;
+               }
+
+               have_sb = false;
+               for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+                       u32 sb_zone;
+                       u64 sb_pos;
+
+                       sb_zone = sb_zone_number(shift, i);
+                       if (!(end <= sb_zone ||
+                             sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
+                               have_sb = true;
+                               pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
+                               break;
+                       }
+
+                       /*
+                        * We also need to exclude regular superblock
+                        * positions. On overlap, restart from the first zone
+                        * boundary at or after the end of that superblock.
+                        */
+                       sb_pos = btrfs_sb_offset(i);
+                       if (!(pos + num_bytes <= sb_pos ||
+                             sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
+                               have_sb = true;
+                               pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
+                                           zinfo->zone_size);
+                               break;
+                       }
+               }
+               if (!have_sb)
+                       break;
+       }
+
+       return pos;
+}
+
+/*
+ * Issue a zone reset for [@physical, @physical + @length) on @device and
+ * mark every zone in that range empty via btrfs_dev_set_zone_empty().
+ *
+ * @bytes is set to 0 up front and to @length once the reset has succeeded.
+ *
+ * Return: 0 on success, otherwise the error from blkdev_zone_mgmt().
+ */
+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+                           u64 length, u64 *bytes)
+{
+       u64 left;
+       int ret;
+
+       *bytes = 0;
+       ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
+                              physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
+                              GFP_NOFS);
+       if (ret)
+               return ret;
+
+       *bytes = length;
+
+       /* Walk the range one zone at a time */
+       for (left = length; left; left -= device->zone_info->zone_size) {
+               btrfs_dev_set_zone_empty(device, physical);
+               physical += device->zone_info->zone_size;
+       }
+
+       return 0;
+}
+
+/*
+ * Make sure every sequential zone in [@start, @start + @size) is empty,
+ * resetting (with a warning) each sequential zone that is not marked empty.
+ *
+ * @start and @size are asserted to be zone aligned.
+ *
+ * Return: 0 on success, -ERANGE if the range ends beyond zinfo->nr_zones,
+ * or the error returned by btrfs_reset_device_zone().
+ */
+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
+{
+       struct btrfs_zoned_device_info *zinfo = device->zone_info;
+       u8 shift = zinfo->zone_size_shift;
+       unsigned long begin = start >> shift;
+       unsigned long end = (start + size) >> shift;
+       u64 pos;
+       int ret;
+
+       ASSERT(IS_ALIGNED(start, zinfo->zone_size));
+       ASSERT(IS_ALIGNED(size, zinfo->zone_size));
+
+       if (end > zinfo->nr_zones)
+               return -ERANGE;
+
+       /*
+        * find_next_bit() and find_next_zero_bit() take (addr, size, offset),
+        * so zones [begin, end) are searched by passing end as the bitmap
+        * size and begin as the starting offset.
+        */
+
+       /* All the zones are conventional */
+       if (find_next_bit(zinfo->seq_zones, end, begin) == end)
+               return 0;
+
+       /* All the zones are sequential and empty */
+       if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
+           find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
+               return 0;
+
+       /* Mixed or non-empty range: check and reset zone by zone */
+       for (pos = start; pos < start + size; pos += zinfo->zone_size) {
+               u64 reset_bytes;
+
+               if (!btrfs_dev_is_sequential(device, pos) ||
+                   btrfs_dev_is_empty_zone(device, pos))
+                       continue;
+
+               /* Free regions should be empty */
+               btrfs_warn_in_rcu(
+                       device->fs_info,
+                       "zoned: resetting device %s (devid %llu) zone %llu for allocation",
+                       rcu_str_deref(device->name), device->devid,
+                       pos >> shift);
+               WARN_ON_ONCE(1);
+
+               ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
+                                             &reset_bytes);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 058a57317c05..de5901f5ae66 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -36,6 +36,11 @@ int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
                          u64 *bytenr_ret);
 void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
 int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
+u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
+                                u64 hole_end, u64 num_bytes);
+int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
+                           u64 length, u64 *bytes);
+int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
                                     struct blk_zone *zone)
@@ -92,6 +97,26 @@ static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror
        return 0;
 }
 
+/*
+ * Stub for builds without CONFIG_BLK_DEV_ZONED: returns @hole_start
+ * unchanged.
+ */
+static inline u64 btrfs_find_allocatable_zones(struct btrfs_device *device,
+                                              u64 hole_start, u64 hole_end,
+                                              u64 num_bytes)
+{
+       return hole_start;
+}
+
+/*
+ * Stub for builds without CONFIG_BLK_DEV_ZONED: resets nothing, sets
+ * *bytes to 0 and returns 0.
+ */
+static inline int btrfs_reset_device_zone(struct btrfs_device *device,
+                                         u64 physical, u64 length, u64 *bytes)
+{
+       *bytes = 0;
+       return 0;
+}
+
+/*
+ * Stub for builds without CONFIG_BLK_DEV_ZONED: performs no check and
+ * returns 0.
+ */
+static inline int btrfs_ensure_empty_zones(struct btrfs_device *device,
+                                          u64 start, u64 size)
+{
+       return 0;
+}
+
 #endif
 
 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
-- 
2.27.0

Reply via email to