The following deadlock may happen when doing reservation for metadata:

Task0                           Flush thread            Task1
start_transaction()
  shrink_delalloc()
    writeback_inodes_sb_nr()
      wait for flush thread
      to end
                                btrfs_writepages()
                                  cow_file_range()
                                                        btrfs_commit_transaction
                                                          wait until num_writer == 1
                                                          (wait for Task0 to end
                                                           its transaction)
                                  start_transaction()
                                    wait for trans commit
                                    to end

Wait cycle: Task0 -> Flush thread -> Task1 -> Task0 (each waits for the next)

Fix the above deadlock by doing the reservation before the trans handle
joins the transaction.

Signed-off-by: Miao Xie <mi...@cn.fujitsu.com>
---
 fs/btrfs/extent-tree.c |   25 +++++++++++++----------
 fs/btrfs/transaction.c |   51 +++++++++++++++++++++++++++++++++--------------
 2 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b42efc2..eefa432 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3290,9 +3290,11 @@ again:
 
 /*
  * shrink metadata reservation for delalloc
+ *
+ * NOTE: This function cannot run in a transaction context, or a deadlock
+ *       will happen.
  */
-static int shrink_delalloc(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root, u64 to_reclaim, int sync)
+static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim)
 {
        struct btrfs_block_rsv *block_rsv;
        struct btrfs_space_info *space_info;
@@ -3338,9 +3340,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
                if (reserved == 0 || reclaimed >= max_reclaim)
                        break;
 
-               if (trans && trans->transaction->blocked)
-                       return -EAGAIN;
-
                time_left = schedule_timeout_interruptible(1);
 
                /* We were interrupted, exit */
@@ -3449,12 +3448,16 @@ again:
        /*
         * We do synchronous shrinking since we don't actually unreserve
         * metadata until after the IO is completed.
+        *
+        * shrink_delalloc() cannot run in a transaction context.
         */
-       ret = shrink_delalloc(trans, root, num_bytes, 1);
-       if (ret > 0)
-               return 0;
-       else if (ret < 0)
-               goto out;
+       if (!trans || !trans->transaction) {
+               ret = shrink_delalloc(root, num_bytes);
+               if (ret > 0)
+                       return 0;
+               else if (ret < 0)
+                       goto out;
+       }
 
        /*
         * So if we were overcommitted it's possible that somebody else flushed
@@ -3989,7 +3992,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
        if (block_rsv->size > 512 * 1024 * 1024)
-               shrink_delalloc(NULL, root, to_reserve, 0);
+               shrink_delalloc(root, to_reserve);
 
        return 0;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2b3590b..173b15d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -222,9 +222,31 @@ again:
        if (!h)
                return ERR_PTR(-ENOMEM);
 
+       h->transid = 0;
+       h->transaction = NULL;
+       h->blocks_used = 0;
+       h->bytes_reserved = 0;
+       h->delayed_ref_updates = 0;
+       h->use_count = 1;
+       h->block_rsv = NULL;
+       h->orig_rsv = NULL;
+
        if (may_wait_transaction(root, type))
                wait_current_trans(root);
 
+       if (num_items > 0) {
+               /*
+                * Now the handle has not been joined into the transaction,
+                * so btrfs will shrink metadata reservation for delalloc if
+                * there is not enough free space to reserve.
+                */
+               ret = btrfs_trans_reserve_metadata(h, root, num_items);
+               if (ret < 0 && ret != -EAGAIN) {
+                       kmem_cache_free(btrfs_trans_handle_cachep, h);
+                       return ERR_PTR(ret);
+               }
+       }
+
        do {
                ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
                if (ret == -EBUSY)
@@ -232,6 +254,7 @@ again:
        } while (ret == -EBUSY);
 
        if (ret < 0) {
+               btrfs_trans_release_metadata(h, root);
                kmem_cache_free(btrfs_trans_handle_cachep, h);
                return ERR_PTR(ret);
        }
@@ -240,12 +263,6 @@ again:
 
        h->transid = cur_trans->transid;
        h->transaction = cur_trans;
-       h->blocks_used = 0;
-       h->bytes_reserved = 0;
-       h->delayed_ref_updates = 0;
-       h->use_count = 1;
-       h->block_rsv = NULL;
-       h->orig_rsv = NULL;
 
        smp_mb();
        if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -253,21 +270,25 @@ again:
                goto again;
        }
 
-       if (num_items > 0) {
-               ret = btrfs_trans_reserve_metadata(h, root, num_items);
-               if (ret == -EAGAIN && !retries) {
+       /*
+        * Though we shrink metadata reservation for delalloc, we might still
+        * not get enough free space, so we will commit the transaction and try
+        * to reclaim the reservation.
+        *
+        * NOTE: In the transaction context, we must not shrink metadata
+        *       reservation for delalloc, or a deadlock will happen.
+        */
+       if (num_items > 0 && !h->bytes_reserved) {
+               if (!retries) {
                        retries++;
                        btrfs_commit_transaction(h, root);
                        goto again;
-               } else if (ret == -EAGAIN) {
+               } else {
                        /*
-                        * We have already retried and got EAGAIN, so really we
-                        * don't have space, so set ret to -ENOSPC.
+                        * We have already retried, so really we don't have
+                        * space, so set ret to -ENOSPC.
                         */
                        ret = -ENOSPC;
-               }
-
-               if (ret < 0) {
                        btrfs_end_transaction(h, root);
                        return ERR_PTR(ret);
                }
-- 
1.7.4
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to