Core implementation of in-band de-duplication.
It reuses the async_cow_start() facility to calculate the dedupe hash,
and uses that hash to do in-band de-duplication at extent level.

The workflow is as follows:
1) Run delalloc range for an inode
2) Calculate the hash for the delalloc range in units of dedupe_bs
3) For the hash match (duplicated) case, just increase the source extent
   ref and insert the file extent.
   For the hash mismatch case, go through the normal cow_file_range()
   fallback, and add the hash into dedupe_tree.
   Compression for the hash miss case is not supported yet.

The current implementation stores all dedupe hashes in an in-memory
rb-tree, with LRU behavior to enforce the limit.

Signed-off-by: Wang Xiaoguang <wangxg.f...@cn.fujitsu.com>
Signed-off-by: Qu Wenruo <quwen...@cn.fujitsu.com>
---
 fs/btrfs/extent-tree.c |  19 ++++
 fs/btrfs/inode.c       | 259 ++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/relocation.c  |  15 +++
 3 files changed, 258 insertions(+), 35 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 54105ba2f429..ad34a69a77f2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -38,6 +38,7 @@
 #include "math.h"
 #include "sysfs.h"
 #include "qgroup.h"
+#include "dedupe.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
@@ -2391,6 +2392,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle 
*trans,
 
        if (btrfs_delayed_ref_is_head(node)) {
                struct btrfs_delayed_ref_head *head;
+
                /*
                 * we've hit the end of the chain and we were supposed
                 * to insert this extent into the tree.  But, it got
@@ -2405,6 +2407,18 @@ static int run_one_delayed_ref(struct btrfs_trans_handle 
*trans,
                        btrfs_pin_extent(fs_info, node->bytenr,
                                         node->num_bytes, 1);
                        if (head->is_data) {
+                               /*
+                                * If insert_reserved is given, it means
+                                * a new extent is reserved, then deleted in one
+                                * transaction, and inc/dec get merged to 0.
+                                *
+                                * In this case, we need to remove its dedupe
+                                * hash.
+                                */
+                               ret = btrfs_dedupe_del(trans, fs_info,
+                                                      node->bytenr);
+                               if (ret < 0)
+                                       return ret;
                                ret = btrfs_del_csums(trans, fs_info,
                                                      node->bytenr,
                                                      node->num_bytes);
@@ -7074,6 +7088,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle 
*trans,
                btrfs_release_path(path);
 
                if (is_data) {
+                       ret = btrfs_dedupe_del(trans, info, bytenr);
+                       if (ret < 0) {
+                               btrfs_abort_transaction(trans, ret);
+                               goto out;
+                       }
                        ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1afa03d3dc5f..6289582e7caa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -339,6 +339,7 @@ struct async_extent {
        struct page **pages;
        unsigned long nr_pages;
        int compress_type;
+       struct btrfs_dedupe_hash *hash;
        struct list_head list;
 };
 
@@ -350,6 +351,7 @@ struct async_cow {
        u64 end;
        struct list_head extents;
        struct btrfs_work work;
+       enum btrfs_metadata_reserve_type reserve_type;
 };
 
 static noinline int add_async_extent(struct async_cow *cow,
@@ -357,7 +359,8 @@ static noinline int add_async_extent(struct async_cow *cow,
                                     u64 compressed_size,
                                     struct page **pages,
                                     unsigned long nr_pages,
-                                    int compress_type)
+                                    int compress_type,
+                                    struct btrfs_dedupe_hash *hash)
 {
        struct async_extent *async_extent;
 
@@ -369,6 +372,7 @@ static noinline int add_async_extent(struct async_cow *cow,
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
        async_extent->compress_type = compress_type;
+       async_extent->hash = hash;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
 }
@@ -598,7 +602,7 @@ static noinline void compress_file_range(struct inode 
*inode,
                         */
                        add_async_extent(async_cow, start, num_bytes,
                                        total_compressed, pages, nr_pages,
-                                       compress_type);
+                                       compress_type, NULL);
 
                        if (start + num_bytes < end) {
                                start += num_bytes;
@@ -644,7 +648,7 @@ static noinline void compress_file_range(struct inode 
*inode,
        if (redirty)
                extent_range_redirty_for_io(inode, start, end);
        add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
-                        BTRFS_COMPRESS_NONE);
+                        BTRFS_COMPRESS_NONE, NULL);
        *num_added += 1;
 
        return;
@@ -673,6 +677,38 @@ static void free_async_extent_pages(struct async_extent 
*async_extent)
        async_extent->pages = NULL;
 }
 
+static void end_dedupe_extent(struct inode *inode, u64 start,
+                             u32 len, unsigned long page_ops)
+{
+       int i;
+       unsigned int nr_pages = len / PAGE_SIZE;
+       struct page *page;
+
+       for (i = 0; i < nr_pages; i++) {
+               page = find_get_page(inode->i_mapping,
+                                    start >> PAGE_SHIFT);
+               /* page should be already locked by caller */
+               if (WARN_ON(!page))
+                       continue;
+
+               /* We need to do this by ourselves as we skipped IO */
+               if (page_ops & PAGE_CLEAR_DIRTY)
+                       clear_page_dirty_for_io(page);
+               if (page_ops & PAGE_SET_WRITEBACK)
+                       set_page_writeback(page);
+
+               end_extent_writepage(page, 0, start,
+                                    start + PAGE_SIZE - 1);
+               if (page_ops & PAGE_END_WRITEBACK)
+                       end_page_writeback(page);
+               if (page_ops & PAGE_UNLOCK)
+                       unlock_page(page);
+
+               start += PAGE_SIZE;
+               put_page(page);
+       }
+}
+
 /*
  * phase two of compressed writeback.  This is the ordered portion
  * of the code, which only gets called in the order the work was
@@ -689,6 +725,7 @@ static noinline void submit_compressed_extents(struct inode 
*inode,
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree;
+       struct btrfs_dedupe_hash *hash;
        int ret = 0;
 
 again:
@@ -698,6 +735,7 @@ static noinline void submit_compressed_extents(struct inode 
*inode,
                list_del(&async_extent->list);
 
                io_tree = &BTRFS_I(inode)->io_tree;
+               hash = async_extent->hash;
 
 retry:
                /* did the compression code fall back to uncompressed IO? */
@@ -728,7 +766,7 @@ static noinline void submit_compressed_extents(struct inode 
*inode,
                                             async_extent->start +
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0,
-                                            NULL);
+                                            hash);
 
                        /* JDM XXX */
 
@@ -738,15 +776,26 @@ static noinline void submit_compressed_extents(struct 
inode *inode,
                         * and IO for us.  Otherwise, we need to submit
                         * all those pages down to the drive.
                         */
-                       if (!page_started && !ret)
-                               extent_write_locked_range(io_tree,
-                                                 inode, async_extent->start,
-                                                 async_extent->start +
-                                                 async_extent->ram_size - 1,
-                                                 btrfs_get_extent,
-                                                 WB_SYNC_ALL);
-                       else if (ret)
+                       if (!page_started && !ret) {
+                               /* Skip IO for dedupe async_extent */
+                               if (btrfs_dedupe_hash_hit(hash))
+                                       end_dedupe_extent(inode,
+                                               async_extent->start,
+                                               async_extent->ram_size,
+                                               PAGE_CLEAR_DIRTY |
+                                               PAGE_SET_WRITEBACK |
+                                               PAGE_END_WRITEBACK |
+                                               PAGE_UNLOCK);
+                               else
+                                       extent_write_locked_range(io_tree,
+                                               inode, async_extent->start,
+                                               async_extent->start +
+                                               async_extent->ram_size - 1,
+                                               btrfs_get_extent,
+                                               WB_SYNC_ALL);
+                       } else if (ret)
                                unlock_page(async_cow->locked_page);
+                       kfree(hash);
                        kfree(async_extent);
                        cond_resched();
                        continue;
@@ -850,6 +899,7 @@ static noinline void submit_compressed_extents(struct inode 
*inode,
                        free_async_extent_pages(async_extent);
                }
                alloc_hint = ins.objectid + ins.offset;
+               kfree(hash);
                kfree(async_extent);
                cond_resched();
        }
@@ -869,6 +919,7 @@ static noinline void submit_compressed_extents(struct inode 
*inode,
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
                                     PAGE_SET_ERROR);
        free_async_extent_pages(async_extent);
+       kfree(hash);
        kfree(async_extent);
        goto again;
 }
@@ -981,11 +1032,17 @@ static noinline int cow_file_range(struct inode *inode,
                unsigned long op;
 
                cur_alloc_size = disk_num_bytes;
-               ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
+               if (btrfs_dedupe_hash_hit(hash)) {
+                       ins.objectid = hash->bytenr;
+                       ins.offset = hash->num_bytes;
+               } else {
+                       ret = btrfs_reserve_extent(root, cur_alloc_size,
+                                          cur_alloc_size,
                                           fs_info->sectorsize, 0, alloc_hint,
                                           &ins, 1, 1);
-               if (ret < 0)
-                       goto out_unlock;
+                       if (ret < 0)
+                               goto out_unlock;
+               }
 
                ram_size = ins.offset;
                em = create_io_em(inode, start, ins.offset, /* len */
@@ -1001,8 +1058,9 @@ static noinline int cow_file_range(struct inode *inode,
                free_extent_map(em);
 
                cur_alloc_size = ins.offset;
-               ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
-                                              ram_size, cur_alloc_size, 0);
+               ret = btrfs_add_ordered_extent_dedupe(inode, start,
+                               ins.objectid, cur_alloc_size, ins.offset,
+                               0, hash);
                if (ret)
                        goto out_drop_extent_cache;
 
@@ -1014,7 +1072,14 @@ static noinline int cow_file_range(struct inode *inode,
                                goto out_drop_extent_cache;
                }
 
-               btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+               /*
+                * A hash hit doesn't allocate an extent, so there is no need
+                * to dec the bg reservation.
+                * Otherwise we will underflow reservations and block balance.
+                */
+               if (!btrfs_dedupe_hash_hit(hash))
+                       btrfs_dec_block_group_reservations(fs_info,
+                                                          ins.objectid);
 
                if (disk_num_bytes < cur_alloc_size)
                        break;
@@ -1057,6 +1122,79 @@ static noinline int cow_file_range(struct inode *inode,
        goto out;
 }
 
+static int hash_file_ranges(struct inode *inode, u64 start, u64 end,
+                           struct async_cow *async_cow, int *num_added)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+       struct page *locked_page = async_cow->locked_page;
+       u16 hash_algo;
+       u64 dedupe_bs;
+       u64 cur_offset = start;
+       int ret = 0;
+
+       /* If dedupe is not enabled, don't split extent into dedupe_bs */
+       if (fs_info->dedupe_enabled && dedupe_info) {
+               dedupe_bs = dedupe_info->blocksize;
+               hash_algo = dedupe_info->hash_algo;
+       } else {
+               dedupe_bs = SZ_128M;
+               /* Just a dummy, to avoid accessing a NULL pointer */
+               hash_algo = BTRFS_DEDUPE_HASH_SHA256;
+       }
+
+       while (cur_offset < end) {
+               struct btrfs_dedupe_hash *hash = NULL;
+               u64 len;
+
+               len = min(end + 1 - cur_offset, dedupe_bs);
+               if (len < dedupe_bs)
+                       goto next;
+
+               hash = btrfs_dedupe_alloc_hash(hash_algo);
+               if (!hash) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               ret = btrfs_dedupe_calc_hash(fs_info, inode, cur_offset, hash);
+               if (ret < 0) {
+                       kfree(hash);
+                       goto out;
+               }
+
+               ret = btrfs_dedupe_search(fs_info, inode, cur_offset, hash);
+               if (ret < 0) {
+                       kfree(hash);
+                       goto out;
+               }
+               ret = 0;
+
+next:
+               /* Redirty the locked page if it corresponds to our extent */
+               if (page_offset(locked_page) >= start &&
+                   page_offset(locked_page) <= end)
+                       __set_page_dirty_nobuffers(locked_page);
+
+               add_async_extent(async_cow, cur_offset, len, 0, NULL, 0,
+                                BTRFS_COMPRESS_NONE, hash);
+               cur_offset += len;
+               (*num_added)++;
+       }
+out:
+       /*
+        * Caller won't unlock pages, so if error happens, we must unlock
+        * pages by ourselves.
+        */
+       if (ret)
+               extent_clear_unlock_delalloc(inode, cur_offset,
+                       end, end, NULL, EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+                       EXTENT_DELALLOC | EXTENT_DEFRAG, PAGE_UNLOCK |
+                       PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
+                       PAGE_END_WRITEBACK | PAGE_SET_ERROR);
+       return ret;
+}
+
 /*
  * work queue call back to started compression on a file and pages
  */
@@ -1064,11 +1202,17 @@ static noinline void async_cow_start(struct btrfs_work 
*work)
 {
        struct async_cow *async_cow;
        int num_added = 0;
+       int ret = 0;
        async_cow = container_of(work, struct async_cow, work);
 
-       compress_file_range(async_cow->inode, async_cow->locked_page,
-                           async_cow->start, async_cow->end, async_cow,
-                           &num_added);
+       if (async_cow->reserve_type == BTRFS_RESERVE_COMPRESS)
+               compress_file_range(async_cow->inode, async_cow->locked_page,
+                                   async_cow->start, async_cow->end, async_cow,
+                                   &num_added);
+       else
+               ret = hash_file_ranges(async_cow->inode, async_cow->start,
+                                      async_cow->end, async_cow, &num_added);
+
        if (num_added == 0) {
                btrfs_add_delayed_iput(async_cow->inode);
                async_cow->inode = NULL;
@@ -1121,6 +1265,7 @@ static int cow_file_range_async(struct inode *inode, 
struct page *locked_page,
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct async_cow *async_cow;
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
        unsigned long nr_pages;
        u64 cur_end;
 
@@ -1133,10 +1278,15 @@ static int cow_file_range_async(struct inode *inode, 
struct page *locked_page,
                async_cow->root = root;
                async_cow->locked_page = locked_page;
                async_cow->start = start;
+               async_cow->reserve_type = reserve_type;
 
                cur_end = end;
                if (reserve_type == BTRFS_RESERVE_COMPRESS)
                        cur_end = min(end, start + SZ_512K - 1);
+               else if (fs_info->dedupe_enabled && dedupe_info) {
+                       u64 len = max_t(u64, SZ_512K, dedupe_info->blocksize);
+                       cur_end = min(end, start + len - 1);
+               }
 
                async_cow->end = cur_end;
                INIT_LIST_HEAD(&async_cow->extents);
@@ -1507,6 +1657,8 @@ static int run_delalloc_range(struct inode *inode, struct 
page *locked_page,
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        int need_compress;
        enum btrfs_metadata_reserve_type reserve_type = BTRFS_RESERVE_NORMAL;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
 
        need_compress = test_range_bit(io_tree, start, end,
                                       EXTENT_COMPRESS, 1, NULL);
@@ -1527,7 +1679,7 @@ static int run_delalloc_range(struct inode *inode, struct 
page *locked_page,
 
                ret = run_delalloc_nocow(inode, locked_page, start, end,
                                         page_started, 0, nr_written);
-       } else if (!need_compress) {
+       } else if (!need_compress && !fs_info->dedupe_enabled) {
                ret = cow_file_range(inode, locked_page, start, end, end,
                                      page_started, nr_written, 1, NULL);
        } else {
@@ -2167,7 +2319,8 @@ static int insert_reserved_file_extent(struct 
btrfs_trans_handle *trans,
                                       u64 disk_bytenr, u64 disk_num_bytes,
                                       u64 num_bytes, u64 ram_bytes,
                                       u8 compression, u8 encryption,
-                                      u16 other_encoding, int extent_type)
+                                      u16 other_encoding, int extent_type,
+                                      struct btrfs_dedupe_hash *hash)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_file_extent_item *fi;
@@ -2229,13 +2382,43 @@ static int insert_reserved_file_extent(struct 
btrfs_trans_handle *trans,
        ins.objectid = disk_bytenr;
        ins.offset = disk_num_bytes;
        ins.type = BTRFS_EXTENT_ITEM_KEY;
-       ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
-                       btrfs_ino(BTRFS_I(inode)), file_pos, ram_bytes, &ins);
-       /*
-        * Release the reserved range from inode dirty range map, as it is
-        * already moved into delayed_ref_head
-        */
-       btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+
+       if (btrfs_dedupe_hash_hit(hash)) {
+               /*
+                * Hash hit won't create a new data extent, so its reserved
+                * space won't be freed by new delayed_ref_head.
+                * Manually free it.
+                */
+               btrfs_free_reserved_data_space(inode, file_pos, ram_bytes);
+       } else {
+               /*
+                * Hash miss or non-dedupe write, will create a new data
+                * extent, we need to release the qgroup reserved data space.
+                */
+               ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
+               if (ret < 0)
+                       goto out;
+               ret = btrfs_alloc_reserved_file_extent(trans,
+                               root->root_key.objectid,
+                               btrfs_ino(BTRFS_I(inode)), file_pos, ram_bytes,
+                               &ins);
+               if (ret < 0)
+                       goto out;
+       }
+
+       /* Add missed hash into dedupe tree */
+       if (hash && hash->bytenr == 0) {
+               hash->bytenr = ins.objectid;
+               hash->num_bytes = ins.offset;
+
+               /*
+                * Here we ignore dedupe_add error, as even if it fails,
+                * it won't corrupt the filesystem. It will only slightly
+                * reduce the dedupe rate.
+                */
+               btrfs_dedupe_add(trans, root->fs_info, hash);
+       }
+
 out:
        btrfs_free_path(path);
 
@@ -2922,6 +3105,7 @@ static int btrfs_finish_ordered_io(struct 
btrfs_ordered_extent *ordered_extent)
        bool nolock;
        bool truncated = false;
        enum btrfs_metadata_reserve_type reserve_type = BTRFS_RESERVE_NORMAL;
+       int hash_hit = btrfs_dedupe_hash_hit(ordered_extent->hash);
 
        nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
 
@@ -3019,8 +3203,10 @@ static int btrfs_finish_ordered_io(struct 
btrfs_ordered_extent *ordered_extent)
                                                ordered_extent->disk_len,
                                                logical_len, logical_len,
                                                compress_type, 0, 0,
-                                               BTRFS_FILE_EXTENT_REG);
-               if (!ret)
+                                               BTRFS_FILE_EXTENT_REG,
+                                               ordered_extent->hash);
+               /* Hash hit case doesn't reserve delalloc bytes */
+               if (!ret && !hash_hit)
                        btrfs_release_delalloc_bytes(fs_info,
                                                     ordered_extent->start,
                                                     ordered_extent->disk_len);
@@ -3071,8 +3257,11 @@ static int btrfs_finish_ordered_io(struct 
btrfs_ordered_extent *ordered_extent)
                 * wrong we need to return the space for this ordered extent
                 * back to the allocator.  We only free the extent in the
                 * truncated case if we didn't write out the extent at all.
+                *
+                * For hash hit case, never free that extent, as it's being used
+                * by others.
                 */
-               if ((ret || !logical_len) &&
+               if ((ret || !logical_len) && !hash_hit &&
                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
                    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
                        btrfs_free_reserved_extent(fs_info,
@@ -3080,7 +3269,6 @@ static int btrfs_finish_ordered_io(struct 
btrfs_ordered_extent *ordered_extent)
                                                   ordered_extent->disk_len, 1);
        }
 
-
        /*
         * This needs to be done to make sure anybody waiting knows we are done
         * updating everything for this ordered extent.
@@ -10467,7 +10655,8 @@ static int __btrfs_prealloc_file_range(struct inode 
*inode, int mode,
                                                  cur_offset, ins.objectid,
                                                  ins.offset, ins.offset,
                                                  ins.offset, 0, 0, 0,
-                                                 BTRFS_FILE_EXTENT_PREALLOC);
+                                                 BTRFS_FILE_EXTENT_PREALLOC,
+                                                 NULL);
                if (ret) {
                        btrfs_free_reserved_extent(fs_info, ins.objectid,
                                                   ins.offset, 0);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f70f4df422d8..32ba88053939 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -32,6 +32,7 @@
 #include "free-space-cache.h"
 #include "inode-map.h"
 #include "qgroup.h"
+#include "dedupe.h"
 
 /*
  * backref_node, mapping_node and tree_block start with this
@@ -4116,6 +4117,20 @@ static noinline_for_stack int 
relocate_block_group(struct reloc_control *rc)
                                rc->search_start = key.objectid;
                        }
                }
+               /*
+                * This data extent will be replaced, but normal dedupe_del()
+                * will only happen at run_delayed_ref() time, which is too
+                * late, so delete dedupe_hash early to prevent its ref from
+                * being increased during relocation.
+                */
+               if (rc->stage == MOVE_DATA_EXTENTS &&
+                   (flags & BTRFS_EXTENT_FLAG_DATA)) {
+                       ret = btrfs_dedupe_del(trans, fs_info, key.objectid);
+                       if (ret < 0) {
+                               err = ret;
+                               break;
+                       }
+               }
 
                btrfs_end_transaction_throttle(trans);
                btrfs_btree_balance_dirty(fs_info);
-- 
2.12.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to