Trimming is completely transactionless. It operates by hiding free space
entries from a block group, performing the trim/discard, and then making
the free space entries visible again.
Therefore, while a free space entry is being trimmed, we can have free space
cache writing running in parallel (as part of a transaction commit), which
will miss that free space entry. This means that if we unmount (or
crash/reboot) after that transaction commit and mount again before another
transaction starts/commits, we will have some free space that won't be used
again unless the free space cache is rebuilt. After the unmount, fsck
(btrfsck, btrfs check) reports the issue as in the following example:

        *** fsck.btrfs output ***
        checking extents
        checking free space cache
        There is no free space entry for 521764864-521781248
        There is no free space entry for 521764864-1103101952
        cache appears valid but isnt 29360128
        Checking filesystem on /dev/sdc
        UUID: b4789e27-4774-4626-98e9-ae8dfbfb0fb5
        found 1235681286 bytes used err is -22
        (...)

Signed-off-by: Filipe Manana <fdman...@suse.com>
---
 fs/btrfs/free-space-cache.c | 59 ++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/free-space-cache.h |  2 ++
 fs/btrfs/inode-map.c        |  2 ++
 3 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 16c2d39..6380863 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -31,6 +31,12 @@
 #define BITS_PER_BITMAP                (PAGE_CACHE_SIZE * 8)
 #define MAX_CACHE_BYTES_PER_GIG        (32 * 1024)
 
+struct btrfs_trim_range {
+       u64 start;
+       u64 bytes;
+       struct list_head list;
+};
+
 static int link_free_space(struct btrfs_free_space_ctl *ctl,
                           struct btrfs_free_space *info);
 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
@@ -881,6 +887,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
        int ret;
        struct btrfs_free_cluster *cluster = NULL;
        struct rb_node *node = rb_first(&ctl->free_space_offset);
+       struct btrfs_trim_range *trim_entry;
 
        /* Get the cluster for this block_group if it exists */
        if (block_group && !list_empty(&block_group->cluster_list)) {
@@ -916,6 +923,21 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
                        cluster = NULL;
                }
        }
+
+       /*
+        * Make sure we don't miss any range that was removed from our rbtree
+        * because trimming is running. Otherwise after a umount+mount (or crash
+        * after committing the transaction) we would leak free space and get
+        * an inconsistent free space cache report from fsck.
+        */
+       list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
+               ret = io_ctl_add_entry(io_ctl, trim_entry->start,
+                                      trim_entry->bytes, NULL);
+               if (ret)
+                       goto fail;
+               *entries += 1;
+       }
+
        return 0;
 fail:
        return -ENOSPC;
@@ -1135,10 +1157,12 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
        io_ctl_set_generation(&io_ctl, trans->transid);
 
+       mutex_lock(&ctl->cache_writeout_mutex);
        /* Write out the extent entries in the free space cache */
        ret = write_cache_extent_entries(&io_ctl, ctl,
                                         block_group, &entries, &bitmaps,
                                         &bitmap_list);
+       mutex_unlock(&ctl->cache_writeout_mutex);
        if (ret)
                goto out_nospc;
 
@@ -2295,6 +2319,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
        ctl->start = block_group->key.objectid;
        ctl->private = block_group;
        ctl->op = &free_space_op;
+       INIT_LIST_HEAD(&ctl->trimming_ranges);
+       mutex_init(&ctl->cache_writeout_mutex);
 
        /*
         * we only want to have 32k of ram per block group for keeping
@@ -2911,10 +2937,12 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
 
 static int do_trimming(struct btrfs_block_group_cache *block_group,
                       u64 *total_trimmed, u64 start, u64 bytes,
-                      u64 reserved_start, u64 reserved_bytes)
+                      u64 reserved_start, u64 reserved_bytes,
+                      struct btrfs_trim_range *trim_entry)
 {
        struct btrfs_space_info *space_info = block_group->space_info;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
+       struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        int ret;
        int update = 0;
        u64 trimmed = 0;
@@ -2934,7 +2962,10 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
        if (!ret)
                *total_trimmed += trimmed;
 
+       mutex_lock(&ctl->cache_writeout_mutex);
        btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+       list_del(&trim_entry->list);
+       mutex_unlock(&ctl->cache_writeout_mutex);
 
        if (update) {
                spin_lock(&space_info->lock);
@@ -2962,16 +2993,21 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
        u64 bytes;
 
        while (start < end) {
+               struct btrfs_trim_range trim_entry;
+
+               mutex_lock(&ctl->cache_writeout_mutex);
                spin_lock(&ctl->tree_lock);
 
                if (ctl->free_space < minlen) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        break;
                }
 
                entry = tree_search_offset(ctl, start, 0, 1);
                if (!entry) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        break;
                }
 
@@ -2980,6 +3016,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
                        node = rb_next(&entry->offset_index);
                        if (!node) {
                                spin_unlock(&ctl->tree_lock);
+                               mutex_unlock(&ctl->cache_writeout_mutex);
                                goto out;
                        }
                        entry = rb_entry(node, struct btrfs_free_space,
@@ -2988,6 +3025,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
 
                if (entry->offset >= end) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        break;
                }
 
@@ -2997,6 +3035,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
                bytes = min(extent_start + extent_bytes, end) - start;
                if (bytes < minlen) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        goto next;
                }
 
@@ -3004,9 +3043,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
                kmem_cache_free(btrfs_free_space_cachep, entry);
 
                spin_unlock(&ctl->tree_lock);
+               trim_entry.start = extent_start;
+               trim_entry.bytes = extent_bytes;
+               list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+               mutex_unlock(&ctl->cache_writeout_mutex);
 
                ret = do_trimming(block_group, total_trimmed, start, bytes,
-                                 extent_start, extent_bytes);
+                                 extent_start, extent_bytes, &trim_entry);
                if (ret)
                        break;
 next:
@@ -3035,17 +3078,21 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
 
        while (offset < end) {
                bool next_bitmap = false;
+               struct btrfs_trim_range trim_entry;
 
+               mutex_lock(&ctl->cache_writeout_mutex);
                spin_lock(&ctl->tree_lock);
 
                if (ctl->free_space < minlen) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        break;
                }
 
                entry = tree_search_offset(ctl, offset, 1, 0);
                if (!entry) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        next_bitmap = true;
                        goto next;
                }
@@ -3054,6 +3101,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
                ret2 = search_bitmap(ctl, entry, &start, &bytes);
                if (ret2 || start >= end) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        next_bitmap = true;
                        goto next;
                }
@@ -3061,6 +3109,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
                bytes = min(bytes, end - start);
                if (bytes < minlen) {
                        spin_unlock(&ctl->tree_lock);
+                       mutex_unlock(&ctl->cache_writeout_mutex);
                        goto next;
                }
 
@@ -3069,9 +3118,13 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
                        free_bitmap(ctl, entry);
 
                spin_unlock(&ctl->tree_lock);
+               trim_entry.start = start;
+               trim_entry.bytes = bytes;
+               list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
+               mutex_unlock(&ctl->cache_writeout_mutex);
 
                ret = do_trimming(block_group, total_trimmed, start, bytes,
-                                 start, bytes);
+                                 start, bytes, &trim_entry);
                if (ret)
                        break;
 next:
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 0cf4977..88b2238 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -38,6 +38,8 @@ struct btrfs_free_space_ctl {
        u64 start;
        struct btrfs_free_space_op *op;
        void *private;
+       struct mutex cache_writeout_mutex;
+       struct list_head trimming_ranges;
 };
 
 struct btrfs_free_space_op {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 83d646b..81efd83 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -364,6 +364,8 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root)
        ctl->start = 0;
        ctl->private = NULL;
        ctl->op = &free_ino_op;
+       INIT_LIST_HEAD(&ctl->trimming_ranges);
+       mutex_init(&ctl->cache_writeout_mutex);
 
        /*
         * Initially we allow to use 16K of ram to cache chunks of
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to