On 11.05.2018 10:56, Omar Sandoval wrote:
> From: Omar Sandoval <osan...@fb.com>
> 
> Currently, we keep space reserved for all inode orphan items until the
> inode is evicted (i.e., all references to it are dropped). We hit an
> issue where an application would keep a bunch of deleted files open (by
> design) and thus keep a large amount of space reserved, causing ENOSPC
> errors when other operations tried to reserve space. This long-standing
> reservation isn't absolutely necessary for a couple of reasons:
> 
> - We can almost always make the reservation we need or steal from the
>   global reserve for the orphan item
> - If we can't, it's not the end of the world if we drop the orphan item
>   on the floor and let the next mount clean it up
> 
> So, get rid of the persistent reservation and just reserve space in
> btrfs_evict_inode().
> 
> Signed-off-by: Omar Sandoval <osan...@fb.com>

Reviewed-by: Nikolay Borisov <nbori...@suse.com>

> ---
>  fs/btrfs/btrfs_inode.h |  17 +++--
>  fs/btrfs/inode.c       | 158 ++++++++++-------------------------------
>  2 files changed, 46 insertions(+), 129 deletions(-)
> 
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index a81112706cd5..bbbe7f308d68 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -20,15 +20,14 @@
>   * new data the application may have written before commit.
>   */
>  #define BTRFS_INODE_ORDERED_DATA_CLOSE               0
> -#define BTRFS_INODE_ORPHAN_META_RESERVED     1
> -#define BTRFS_INODE_DUMMY                    2
> -#define BTRFS_INODE_IN_DEFRAG                        3
> -#define BTRFS_INODE_HAS_ASYNC_EXTENT         4
> -#define BTRFS_INODE_NEEDS_FULL_SYNC          5
> -#define BTRFS_INODE_COPY_EVERYTHING          6
> -#define BTRFS_INODE_IN_DELALLOC_LIST         7
> -#define BTRFS_INODE_READDIO_NEED_LOCK                8
> -#define BTRFS_INODE_HAS_PROPS                        9
> +#define BTRFS_INODE_DUMMY                    1
> +#define BTRFS_INODE_IN_DEFRAG                        2
> +#define BTRFS_INODE_HAS_ASYNC_EXTENT         3
> +#define BTRFS_INODE_NEEDS_FULL_SYNC          4
> +#define BTRFS_INODE_COPY_EVERYTHING          5
> +#define BTRFS_INODE_IN_DELALLOC_LIST         6
> +#define BTRFS_INODE_READDIO_NEED_LOCK                7
> +#define BTRFS_INODE_HAS_PROPS                        8
>  
>  /* in memory btrfs inode */
>  struct btrfs_inode {
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 7ca55af8aa17..b64c4189e2c0 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -3331,77 +3331,16 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
>  /*
>   * This creates an orphan entry for the given inode in case something goes wrong
>   * in the middle of an unlink.
> - *
> - * NOTE: caller of this function should reserve 5 units of metadata for
> - *    this function.
>   */
>  int btrfs_orphan_add(struct btrfs_trans_handle *trans,
> -             struct btrfs_inode *inode)
> +                  struct btrfs_inode *inode)
>  {
> -     struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
> -     struct btrfs_root *root = inode->root;
> -     struct btrfs_block_rsv *block_rsv = NULL;
> -     int reserve = 0;
>       int ret;
>  
> -     if (!root->orphan_block_rsv) {
> -             block_rsv = btrfs_alloc_block_rsv(fs_info,
> -                                               BTRFS_BLOCK_RSV_TEMP);
> -             if (!block_rsv)
> -                     return -ENOMEM;
> -     }
> -
> -     if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> -                           &inode->runtime_flags))
> -             reserve = 1;
> -
> -     spin_lock(&root->orphan_lock);
> -     /* If someone has created ->orphan_block_rsv, be happy to use it. */
> -     if (!root->orphan_block_rsv) {
> -             root->orphan_block_rsv = block_rsv;
> -     } else if (block_rsv) {
> -             btrfs_free_block_rsv(fs_info, block_rsv);
> -             block_rsv = NULL;
> -     }
> -
> -     atomic_inc(&root->orphan_inodes);
> -     spin_unlock(&root->orphan_lock);
> -
> -     /* grab metadata reservation from transaction handle */
> -     if (reserve) {
> -             ret = btrfs_orphan_reserve_metadata(trans, inode);
> -             ASSERT(!ret);
> -             if (ret) {
> -                     /*
> -                      * dec doesn't need spin_lock as ->orphan_block_rsv
> -                      * would be released only if ->orphan_inodes is
> -                      * zero.
> -                      */
> -                     atomic_dec(&root->orphan_inodes);
> -                     clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> -                               &inode->runtime_flags);
> -                     return ret;
> -             }
> -     }
> -
> -     /* insert an orphan item to track this unlinked file */
> -     ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
> -     if (ret) {
> -             if (reserve) {
> -                     clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> -                               &inode->runtime_flags);
> -                     btrfs_orphan_release_metadata(inode);
> -             }
> -             /*
> -              * btrfs_orphan_commit_root may race with us and set
> -              * ->orphan_block_rsv to zero, in order to avoid that,
> -              * decrease ->orphan_inodes after everything is done.
> -              */
> -             atomic_dec(&root->orphan_inodes);
> -             if (ret != -EEXIST) {
> -                     btrfs_abort_transaction(trans, ret);
> -                     return ret;
> -             }
> +     ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
> +     if (ret && ret != -EEXIST) {
> +             btrfs_abort_transaction(trans, ret);
> +             return ret;
>       }
>  
>       return 0;
> @@ -3414,24 +3353,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
>  static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
>                           struct btrfs_inode *inode)
>  {
> -     struct btrfs_root *root = inode->root;
> -     int ret = 0;
> -
> -     if (trans)
> -             ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
> -
> -     if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> -                            &inode->runtime_flags))
> -             btrfs_orphan_release_metadata(inode);
> -
> -     /*
> -      * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv
> -      * to zero, in order to avoid that, decrease ->orphan_inodes after
> -      * everything is done.
> -      */
> -     atomic_dec(&root->orphan_inodes);
> -
> -     return ret;
> +     return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
>  }
>  
>  /*
> @@ -3587,8 +3509,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
>                       continue;
>               }
>  
> -             atomic_inc(&root->orphan_inodes);
> -
>               nr_unlink++;
>  
>               /* this will do delete_inode and everything for us */
> @@ -5255,10 +5175,8 @@ void btrfs_evict_inode(struct inode *inode)
>            btrfs_is_free_space_inode(BTRFS_I(inode))))
>               goto no_delete;
>  
> -     if (is_bad_inode(inode)) {
> -             btrfs_orphan_del(NULL, BTRFS_I(inode));
> +     if (is_bad_inode(inode))
>               goto no_delete;
> -     }
>       /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
>       if (!special_file(inode->i_mode))
>               btrfs_wait_ordered_range(inode, 0, (u64)-1);
> @@ -5275,16 +5193,12 @@ void btrfs_evict_inode(struct inode *inode)
>       }
>  
>       ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
> -     if (ret) {
> -             btrfs_orphan_del(NULL, BTRFS_I(inode));
> +     if (ret)
>               goto no_delete;
> -     }
>  
>       rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
> -     if (!rsv) {
> -             btrfs_orphan_del(NULL, BTRFS_I(inode));
> +     if (!rsv)
>               goto no_delete;
> -     }
>       rsv->size = min_size;
>       rsv->failfast = 1;
>  
> @@ -5292,46 +5206,50 @@ void btrfs_evict_inode(struct inode *inode)
>  
>       while (1) {
>               trans = evict_refill_and_join(root, rsv, min_size);
> -             if (IS_ERR(trans)) {
> -                     btrfs_orphan_del(NULL, BTRFS_I(inode));
> -                     btrfs_free_block_rsv(fs_info, rsv);
> -                     goto no_delete;
> -             }
> +             if (IS_ERR(trans))
> +                     goto free_rsv;
>  
>               trans->block_rsv = rsv;
>  
>               ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
> -             if (ret) {
> -                     trans->block_rsv = &fs_info->trans_block_rsv;
> -                     btrfs_end_transaction(trans);
> -                     btrfs_btree_balance_dirty(fs_info);
> -                     if (ret != -ENOSPC && ret != -EAGAIN) {
> -                             btrfs_orphan_del(NULL, BTRFS_I(inode));
> -                             btrfs_free_block_rsv(fs_info, rsv);
> -                             goto no_delete;
> -                     }
> -             } else {
> +             trans->block_rsv = &fs_info->trans_block_rsv;
> +             btrfs_end_transaction(trans);
> +             btrfs_btree_balance_dirty(fs_info);
> +             if (ret && ret != -ENOSPC && ret != -EAGAIN)
> +                     goto free_rsv;
> +             else if (!ret)
>                       break;
> -             }
>       }
>  
> -     btrfs_free_block_rsv(fs_info, rsv);
> -
>       /*
> -      * Errors here aren't a big deal, it just means we leave orphan items
> -      * in the tree.  They will be cleaned up on the next mount.
> +      * Errors here aren't a big deal, it just means we leave orphan items in
> +      * the tree. They will be cleaned up on the next mount. If the inode
> +      * number gets reused, cleanup deletes the orphan item without doing
> +      * anything, and unlink reuses the existing orphan item.
> +      *
> +      * If it turns out that we are dropping too many of these, we might want
> +      * to add a mechanism for retrying these after a commit.
>        */
> -     trans->block_rsv = root->orphan_block_rsv;
> -     btrfs_orphan_del(trans, BTRFS_I(inode));
> +     trans = evict_refill_and_join(root, rsv, min_size);
> +     if (!IS_ERR(trans)) {
> +             trans->block_rsv = rsv;
> +             btrfs_orphan_del(trans, BTRFS_I(inode));
> +             trans->block_rsv = &fs_info->trans_block_rsv;
> +             btrfs_end_transaction(trans);
> +     }
>  
> -     trans->block_rsv = &fs_info->trans_block_rsv;
>       if (!(root == fs_info->tree_root ||
>             root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
>               btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
>  
> -     btrfs_end_transaction(trans);
> -     btrfs_btree_balance_dirty(fs_info);
> +free_rsv:
> +     btrfs_free_block_rsv(fs_info, rsv);
>  no_delete:
> +     /*
> +      * If we didn't successfully delete, the orphan item will still be in
> +      * the tree and we'll retry on the next mount. Again, we might also want
> +      * to retry these periodically in the future.
> +      */
>       btrfs_remove_delayed_node(BTRFS_I(inode));
>       clear_inode(inode);
>  }
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to