In HMZONED mode, align the device extents to zone boundaries so that a zone
reset affects only the device extent and does not change the state of
blocks in the neighboring device extents. Also, check that a region
allocation always covers only empty zones of the same type.

Signed-off-by: Naohiro Aota <naohiro.a...@wdc.com>
---
 fs/btrfs/extent-tree.c |   6 +++
 fs/btrfs/volumes.c     | 100 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 103 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1aee51a9f3bf..363db58f56b8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9884,6 +9884,12 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, 
u64 bytenr)
                min_free = div64_u64(min_free, dev_min);
        }
 
+       /* We cannot allocate size less than zone_size anyway */
+       if (index == BTRFS_RAID_DUP)
+               min_free = max_t(u64, min_free, 2 * fs_info->zone_size);
+       else
+               min_free = max_t(u64, min_free, fs_info->zone_size);
+
        mutex_lock(&fs_info->chunk_mutex);
        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
                u64 dev_offset;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b6f367d19dc9..c1ed3b6e3cfd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1737,6 +1737,46 @@ static bool contains_pending_extent(struct btrfs_device 
*device, u64 *start,
        return false;
 }
 
+/*
+ * Align @pos up to a multiple of the zone size of @device. A device whose
+ * zone_size is zero has no zone constraint, so @pos is returned unchanged.
+ */
+static u64 dev_zone_align(struct btrfs_device *device, u64 pos)
+{
+       if (!device->zone_size)
+               return pos;
+
+       return ALIGN(pos, device->zone_size);
+}
+
+/*
+ * is_allocatable_region - check if the specified region is suitable for
+ *                         allocation
+ * @device:    the device to allocate a region
+ * @pos:       the position of the region
+ * @num_bytes: the size of the region
+ *
+ * In non-ZONED device, anywhere is suitable for allocation. In ZONED
+ * device, check if the region is not on non-empty zones. Also, check if
+ * all zones in the region have the same zone type.
+ *
+ * Return: true if the region can be used for an allocation, false if any
+ * zone in it is non-empty or differs in type from the first zone.
+ */
+static bool is_allocatable_region(struct btrfs_device *device, u64 pos,
+                                 u64 num_bytes)
+{
+       u64 end = pos + num_bytes;
+       int is_sequential;
+
+       if (device->zone_size == 0)
+               return true;
+
+       WARN_ON(!IS_ALIGNED(pos, device->zone_size));
+       WARN_ON(!IS_ALIGNED(num_bytes, device->zone_size));
+
+       /* The type of the first zone is the reference for all the others. */
+       is_sequential = btrfs_dev_is_sequential(device, pos);
+
+       /*
+        * Iterate against the end offset instead of counting num_bytes
+        * down: if num_bytes is not a multiple of zone_size (only warned
+        * about above), an unsigned countdown would wrap around and walk
+        * far beyond the requested region.
+        */
+       for (; pos < end; pos += device->zone_size) {
+               if (!btrfs_dev_is_empty_zone(device, pos) ||
+                   (is_sequential != btrfs_dev_is_sequential(device, pos)))
+                       return false;
+       }
+
+       return true;
+}
 
 /*
  * find_free_dev_extent_start - find free space in the specified device
@@ -1779,9 +1819,14 @@ int find_free_dev_extent_start(struct btrfs_device 
*device, u64 num_bytes,
        /*
         * We don't want to overwrite the superblock on the drive nor any area
         * used by the boot loader (grub for example), so we make sure to start
-        * at an offset of at least 1MB.
+        * at an offset of at least 1MB on a regular disk. For a zoned block
+        * device, skip the first zone of the device entirely.
         */
-       search_start = max_t(u64, search_start, SZ_1M);
+       if (device->zone_size)
+               search_start = max_t(u64, dev_zone_align(device, search_start),
+                                    device->zone_size);
+       else
+               search_start = max_t(u64, search_start, SZ_1M);
 
        path = btrfs_alloc_path();
        if (!path)
@@ -1846,12 +1891,22 @@ int find_free_dev_extent_start(struct btrfs_device 
*device, u64 num_bytes,
                         */
                        if (contains_pending_extent(device, &search_start,
                                                    hole_size)) {
+                               search_start = dev_zone_align(device,
+                                                             search_start);
                                if (key.offset >= search_start)
                                        hole_size = key.offset - search_start;
                                else
                                        hole_size = 0;
                        }
 
+                       if (!is_allocatable_region(device, search_start,
+                                                  num_bytes)) {
+                               search_start = dev_zone_align(device,
+                                                             search_start+1);
+                               btrfs_release_path(path);
+                               goto again;
+                       }
+
                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
@@ -1876,7 +1931,7 @@ int find_free_dev_extent_start(struct btrfs_device 
*device, u64 num_bytes,
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
-                       search_start = extent_end;
+                       search_start = dev_zone_align(device, extent_end);
 next:
                path->slots[0]++;
                cond_resched();
@@ -1891,6 +1946,14 @@ int find_free_dev_extent_start(struct btrfs_device 
*device, u64 num_bytes,
                hole_size = search_end - search_start;
 
                if (contains_pending_extent(device, &search_start, hole_size)) {
+                       search_start = dev_zone_align(device,
+                                                     search_start);
+                       btrfs_release_path(path);
+                       goto again;
+               }
+
+               if (!is_allocatable_region(device, search_start, num_bytes)) {
+                       search_start = dev_zone_align(device, search_start+1);
                        btrfs_release_path(path);
                        goto again;
                }
@@ -5177,6 +5240,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle 
*trans,
        int i;
        int j;
        int index;
+       int hmzoned = btrfs_fs_incompat(info, HMZONED);
 
        BUG_ON(!alloc_profile_is_valid(type, 0));
 
@@ -5221,10 +5285,20 @@ static int __btrfs_alloc_chunk(struct 
btrfs_trans_handle *trans,
                BUG();
        }
 
+       if (hmzoned) {
+               max_stripe_size = info->zone_size;
+               max_chunk_size = round_down(max_chunk_size, info->zone_size);
+       }
+
        /* We don't want a chunk larger than 10% of writable space */
        max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
                             max_chunk_size);
 
+       if (hmzoned)
+               max_chunk_size = max(round_down(max_chunk_size,
+                                               info->zone_size),
+                                    info->zone_size);
+
        devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
                               GFP_NOFS);
        if (!devices_info)
@@ -5259,6 +5333,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle 
*trans,
                if (total_avail == 0)
                        continue;
 
+               if (hmzoned && total_avail < max_stripe_size * dev_stripes)
+                       continue;
+
                ret = find_free_dev_extent(device,
                                           max_stripe_size * dev_stripes,
                                           &dev_offset, &max_avail);
@@ -5277,6 +5354,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle 
*trans,
                        continue;
                }
 
+               if (hmzoned && max_avail < max_stripe_size * dev_stripes)
+                       continue;
+
                if (ndevs == fs_devices->rw_devices) {
                        WARN(1, "%s: found more than %llu devices\n",
                             __func__, fs_devices->rw_devices);
@@ -5310,6 +5390,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle 
*trans,
 
        ndevs = min(ndevs, devs_max);
 
+again:
        /*
         * The primary goal is to maximize the number of stripes, so use as
         * many devices as possible, even if the stripes are not maximum sized.
@@ -5333,6 +5414,17 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle 
*trans,
         * we try to reduce stripe_size.
         */
        if (stripe_size * data_stripes > max_chunk_size) {
+               if (hmzoned) {
+                       /*
+                        * stripe_size is fixed in HMZONED. Reduce ndevs
+                        * instead.
+                        */
+                       WARN_ON(nparity != 0);
+                       ndevs = div_u64(max_chunk_size * ncopies,
+                                       stripe_size * dev_stripes);
+                       goto again;
+               }
+
                /*
                 * Reduce stripe_size, round it up to a 16MB boundary again and
                 * then use it, unless it ends up being even bigger than the
@@ -5346,6 +5438,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle 
*trans,
        /* align to BTRFS_STRIPE_LEN */
        stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
 
+       WARN_ON(hmzoned && stripe_size != info->zone_size);
+
        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
        if (!map) {
                ret = -ENOMEM;
-- 
2.21.0

Reply via email to