This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |->
btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allcate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction inorder to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT,
RESERVE_ALLOC
and RESERVE_FREE.
For inode marked as NODATACOW or extent marked as PREALLOC, we can
directly call btrfs_free_reserved_data_space() to adjust bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both sucessful and
failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.f...@cn.fujitsu.com>
---
fs/btrfs/ctree.h | 2 +-
fs/btrfs/extent-tree.c | 56
+++++++++++++++++---------------------------------
fs/btrfs/file.c | 26 +++++++++++++----------
fs/btrfs/inode-map.c | 3 +--
fs/btrfs/inode.c | 37 ++++++++++++++++++++++++---------
fs/btrfs/relocation.c | 11 ++++++++--
6 files changed, 72 insertions(+), 63 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4274a7b..7eb2913 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2556,7 +2556,7 @@ int btrfs_alloc_logged_file_extent(struct
btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner, u64 offset,
struct btrfs_key *ins);
-int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
+int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64
num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct
btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8eaac39..5447973 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -60,21 +60,6 @@ enum {
CHUNK_ALLOC_FORCE = 2,
};
-/*
- * Control how reservations are dealt with.
- *
- * RESERVE_FREE - freeing a reservation.
- * RESERVE_ALLOC - allocating space and we need to update
bytes_may_use for
- * ENOSPC accounting
- * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
- * bytes_may_use as the ENOSPC accounting is done elsewhere
- */
-enum {
- RESERVE_FREE = 0,
- RESERVE_ALLOC = 1,
- RESERVE_ALLOC_NO_ACCOUNT = 2,
-};
-
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
u64 num_bytes, int alloc);
@@ -105,7 +90,7 @@ static int find_next_key(struct btrfs_path *path,
int level,
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache
*cache,
- u64 num_bytes, int reserve, int delalloc);
+ u64 ram_bytes, u64 num_bytes, int delalloc);
static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache
*cache,
u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
@@ -3491,7 +3476,6 @@ again:
dcs = BTRFS_DC_SETUP;
else if (ret == -ENOSPC)
set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
- btrfs_free_reserved_data_space(inode, 0, num_pages);
out_put:
iput(inode);
@@ -6300,8 +6284,9 @@ void btrfs_wait_block_group_reservations(struct
btrfs_block_group_cache *bg)
/**
* btrfs_add_reserved_bytes - update the block_group and space info
counters
* @cache: The cache we are manipulating
+ * @ram_bytes: The number of bytes of file content, and will be
same to
+ * @num_bytes except for the compress path.
* @num_bytes: The number of bytes in question
- * @reserve: One of the reservation enums
* @delalloc: The blocks are allocated for the delalloc write
*
* This is called by the allocator when it reserves space. Metadata
@@ -6316,7 +6301,7 @@ void btrfs_wait_block_group_reservations(struct
btrfs_block_group_cache *bg)
* succeeds.
*/
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache
*cache,
- u64 num_bytes, int reserve, int delalloc)
+ u64 ram_bytes, u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
@@ -6328,13 +6313,11 @@ static int btrfs_add_reserved_bytes(struct
btrfs_block_group_cache *cache,
} else {
cache->reserved += num_bytes;
space_info->bytes_reserved += num_bytes;
- if (reserve == RESERVE_ALLOC) {
- trace_btrfs_space_reservation(cache->fs_info,
- "space_info", space_info->flags,
- num_bytes, 0);
- space_info->bytes_may_use -= num_bytes;
- }
+ trace_btrfs_space_reservation(cache->fs_info,
+ "space_info", space_info->flags,
+ num_bytes, 0);