[PATCH 4/6] Btrfs: fix race between fs trimming and block group remove/allocation

2014-11-26 Thread Filipe Manana
Our fs trim operation, which is completely transactionless (doesn't start
or joins an existing transaction) consists of visiting all block groups
and then for each one to iterate its free space entries and perform a
discard operation against the space range represented by the free space
entries. However before performing a discard, the corresponding free space
entry is removed from the free space rbtree, and when the discard completes
it is added back to the free space rbtree.

If a block group remove operation happens while the discard is ongoing (or
before it starts and after a free space entry is hidden), we end up not
waiting for the discard to complete, remove the extent map that maps
logical address to physical addresses and the corresponding chunk metadata
from the the chunk and device trees. After that and before the discard
completes, the current running transaction can finish and a new one start,
allowing for new block groups that map to the same physical addresses to
be allocated and written to.

So fix this by keeping the extent map in memory until the discard completes
so that the same physical addresses aren't reused before it completes.

If the physical locations that are under a discard operation end up being
used for a new metadata block group for example, and dirty metadata extents
are written before the discard finishes (the VM might call writepages() of
our btree inode's i_mapping for example, or an fsync log commit happens) we
end up overwriting metadata with zeroes, which leads to errors from fsck
like the following:

checking extents
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
read block failed check_tree_block
owner ref check failed [833912832 16384]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
read block failed check_tree_block
root 5 root dir 256 error
root 5 inode 260 errors 2001, no inode item, link count wrong
unresolved ref dir 256 index 0 namelen 8 name foobar_3 filetype 
1 errors 6, no dir index, no inode ref
root 5 inode 262 errors 2001, no inode item, link count wrong
unresolved ref dir 256 index 0 namelen 8 name foobar_5 filetype 
1 errors 6, no dir index, no inode ref
root 5 inode 263 errors 2001, no inode item, link count wrong
(...)

Signed-off-by: Filipe Manana fdman...@suse.com
---
 fs/btrfs/ctree.h| 13 -
 fs/btrfs/disk-io.c  | 14 ++
 fs/btrfs/extent-tree.c  | 24 +++-
 fs/btrfs/free-space-cache.c | 26 +-
 fs/btrfs/volumes.c  | 33 ++---
 5 files changed, 100 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7f40a65..51056c7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1278,6 +1278,7 @@ struct btrfs_block_group_cache {
unsigned int dirty:1;
unsigned int iref:1;
unsigned int has_caching_ctl:1;
+   unsigned int removed:1;
 
int disk_cache_state;
 
@@ -1307,6 +1308,8 @@ struct btrfs_block_group_cache {
 
/* For delayed block group creation or deletion of empty block groups */
struct list_head bg_list;
+
+   atomic_t trimming;
 };
 
 /* delayed seq elem */
@@ -1731,6 +1734,13 @@ struct btrfs_fs_info {
 
/* For btrfs to record security options */
struct security_mnt_opts security_opts;
+
+   /*
+* Chunks that can't be freed yet (under a trim/discard operation)
+* and will be latter freed.
+*/
+   rwlock_t pinned_chunks_lock;
+   struct list_head pinned_chunks;
 };
 
 struct btrfs_subvolume_writers {
@@ -3353,7 +3363,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle 
*trans,
   u64 type, u64 chunk_objectid, u64 chunk_offset,
   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, u64 group_start);
+struct btrfs_root *root, u64 group_start,
+bool *remove_em);
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
   struct btrfs_root *root);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 

Re: [PATCH 4/6] Btrfs: fix race between fs trimming and block group remove/allocation

2014-11-26 Thread Josef Bacik

On 11/26/2014 10:28 AM, Filipe Manana wrote:

Our fs trim operation, which is completely transactionless (doesn't start
or joins an existing transaction) consists of visiting all block groups
and then for each one to iterate its free space entries and perform a
discard operation against the space range represented by the free space
entries. However before performing a discard, the corresponding free space
entry is removed from the free space rbtree, and when the discard completes
it is added back to the free space rbtree.

If a block group remove operation happens while the discard is ongoing (or
before it starts and after a free space entry is hidden), we end up not
waiting for the discard to complete, remove the extent map that maps
logical address to physical addresses and the corresponding chunk metadata
from the the chunk and device trees. After that and before the discard
completes, the current running transaction can finish and a new one start,
allowing for new block groups that map to the same physical addresses to
be allocated and written to.

So fix this by keeping the extent map in memory until the discard completes
so that the same physical addresses aren't reused before it completes.

If the physical locations that are under a discard operation end up being
used for a new metadata block group for example, and dirty metadata extents
are written before the discard finishes (the VM might call writepages() of
our btree inode's i_mapping for example, or an fsync log commit happens) we
end up overwriting metadata with zeroes, which leads to errors from fsck
like the following:

 checking extents
 Check tree block failed, want=833912832, have=0
 Check tree block failed, want=833912832, have=0
 Check tree block failed, want=833912832, have=0
 Check tree block failed, want=833912832, have=0
 Check tree block failed, want=833912832, have=0
 read block failed check_tree_block
 owner ref check failed [833912832 16384]
 Errors found in extent allocation tree or chunk allocation
 checking free space cache
 checking fs roots
 Check tree block failed, want=833912832, have=0
 Check tree block failed, want=833912832, have=0
 Check tree block failed, want=833912832, have=0
 Check tree block failed, want=833912832, have=0
 Check tree block failed, want=833912832, have=0
 read block failed check_tree_block
 root 5 root dir 256 error
 root 5 inode 260 errors 2001, no inode item, link count wrong
 unresolved ref dir 256 index 0 namelen 8 name foobar_3 
filetype 1 errors 6, no dir index, no inode ref
 root 5 inode 262 errors 2001, no inode item, link count wrong
 unresolved ref dir 256 index 0 namelen 8 name foobar_5 
filetype 1 errors 6, no dir index, no inode ref
 root 5 inode 263 errors 2001, no inode item, link count wrong
 (...)

Signed-off-by: Filipe Manana fdman...@suse.com
---
  fs/btrfs/ctree.h| 13 -
  fs/btrfs/disk-io.c  | 14 ++
  fs/btrfs/extent-tree.c  | 24 +++-
  fs/btrfs/free-space-cache.c | 26 +-
  fs/btrfs/volumes.c  | 33 ++---
  5 files changed, 100 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7f40a65..51056c7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1278,6 +1278,7 @@ struct btrfs_block_group_cache {
unsigned int dirty:1;
unsigned int iref:1;
unsigned int has_caching_ctl:1;
+   unsigned int removed:1;

int disk_cache_state;

@@ -1307,6 +1308,8 @@ struct btrfs_block_group_cache {

/* For delayed block group creation or deletion of empty block groups */
struct list_head bg_list;
+
+   atomic_t trimming;
  };

  /* delayed seq elem */
@@ -1731,6 +1734,13 @@ struct btrfs_fs_info {

/* For btrfs to record security options */
struct security_mnt_opts security_opts;
+
+   /*
+* Chunks that can't be freed yet (under a trim/discard operation)
+* and will be latter freed.
+*/
+   rwlock_t pinned_chunks_lock;
+   struct list_head pinned_chunks;
  };

  struct btrfs_subvolume_writers {
@@ -3353,7 +3363,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle 
*trans,
   u64 type, u64 chunk_objectid, u64 chunk_offset,
   u64 size);
  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, u64 group_start);
+struct btrfs_root *root, u64 group_start,
+bool *remove_em);
  void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
  void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
   struct 

Re: [PATCH 4/6] Btrfs: fix race between fs trimming and block group remove/allocation

2014-11-26 Thread Filipe David Manana
On Wed, Nov 26, 2014 at 4:15 PM, Josef Bacik jba...@fb.com wrote:
 On 11/26/2014 10:28 AM, Filipe Manana wrote:

 Our fs trim operation, which is completely transactionless (doesn't start
 or joins an existing transaction) consists of visiting all block groups
 and then for each one to iterate its free space entries and perform a
 discard operation against the space range represented by the free space
 entries. However before performing a discard, the corresponding free space
 entry is removed from the free space rbtree, and when the discard
 completes
 it is added back to the free space rbtree.

 If a block group remove operation happens while the discard is ongoing (or
 before it starts and after a free space entry is hidden), we end up not
 waiting for the discard to complete, remove the extent map that maps
 logical address to physical addresses and the corresponding chunk metadata
 from the the chunk and device trees. After that and before the discard
 completes, the current running transaction can finish and a new one start,
 allowing for new block groups that map to the same physical addresses to
 be allocated and written to.

 So fix this by keeping the extent map in memory until the discard
 completes
 so that the same physical addresses aren't reused before it completes.

 If the physical locations that are under a discard operation end up being
 used for a new metadata block group for example, and dirty metadata
 extents
 are written before the discard finishes (the VM might call writepages() of
 our btree inode's i_mapping for example, or an fsync log commit happens)
 we
 end up overwriting metadata with zeroes, which leads to errors from fsck
 like the following:

  checking extents
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  read block failed check_tree_block
  owner ref check failed [833912832 16384]
  Errors found in extent allocation tree or chunk allocation
  checking free space cache
  checking fs roots
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  read block failed check_tree_block
  root 5 root dir 256 error
  root 5 inode 260 errors 2001, no inode item, link count wrong
  unresolved ref dir 256 index 0 namelen 8 name foobar_3
 filetype 1 errors 6, no dir index, no inode ref
  root 5 inode 262 errors 2001, no inode item, link count wrong
  unresolved ref dir 256 index 0 namelen 8 name foobar_5
 filetype 1 errors 6, no dir index, no inode ref
  root 5 inode 263 errors 2001, no inode item, link count wrong
  (...)

 Signed-off-by: Filipe Manana fdman...@suse.com
 ---
   fs/btrfs/ctree.h| 13 -
   fs/btrfs/disk-io.c  | 14 ++
   fs/btrfs/extent-tree.c  | 24 +++-
   fs/btrfs/free-space-cache.c | 26 +-
   fs/btrfs/volumes.c  | 33 ++---
   5 files changed, 100 insertions(+), 10 deletions(-)

 diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
 index 7f40a65..51056c7 100644
 --- a/fs/btrfs/ctree.h
 +++ b/fs/btrfs/ctree.h
 @@ -1278,6 +1278,7 @@ struct btrfs_block_group_cache {
 unsigned int dirty:1;
 unsigned int iref:1;
 unsigned int has_caching_ctl:1;
 +   unsigned int removed:1;

 int disk_cache_state;

 @@ -1307,6 +1308,8 @@ struct btrfs_block_group_cache {

 /* For delayed block group creation or deletion of empty block
 groups */
 struct list_head bg_list;
 +
 +   atomic_t trimming;
   };

   /* delayed seq elem */
 @@ -1731,6 +1734,13 @@ struct btrfs_fs_info {

 /* For btrfs to record security options */
 struct security_mnt_opts security_opts;
 +
 +   /*
 +* Chunks that can't be freed yet (under a trim/discard operation)
 +* and will be latter freed.
 +*/
 +   rwlock_t pinned_chunks_lock;
 +   struct list_head pinned_chunks;
   };

   struct btrfs_subvolume_writers {
 @@ -3353,7 +3363,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle
 *trans,
u64 type, u64 chunk_objectid, u64 chunk_offset,
u64 size);
   int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 -struct btrfs_root *root, u64 group_start);
 +struct btrfs_root *root, u64 group_start,
 +bool *remove_em);
   void 

Re: [PATCH 4/6] Btrfs: fix race between fs trimming and block group remove/allocation

2014-11-26 Thread Josef Bacik

On 11/26/2014 11:25 AM, Filipe David Manana wrote:

On Wed, Nov 26, 2014 at 4:15 PM, Josef Bacik jba...@fb.com wrote:

On 11/26/2014 10:28 AM, Filipe Manana wrote:


Our fs trim operation, which is completely transactionless (doesn't start
or joins an existing transaction) consists of visiting all block groups
and then for each one to iterate its free space entries and perform a
discard operation against the space range represented by the free space
entries. However before performing a discard, the corresponding free space
entry is removed from the free space rbtree, and when the discard
completes
it is added back to the free space rbtree.

If a block group remove operation happens while the discard is ongoing (or
before it starts and after a free space entry is hidden), we end up not
waiting for the discard to complete, remove the extent map that maps
logical address to physical addresses and the corresponding chunk metadata
from the the chunk and device trees. After that and before the discard
completes, the current running transaction can finish and a new one start,
allowing for new block groups that map to the same physical addresses to
be allocated and written to.

So fix this by keeping the extent map in memory until the discard
completes
so that the same physical addresses aren't reused before it completes.

If the physical locations that are under a discard operation end up being
used for a new metadata block group for example, and dirty metadata
extents
are written before the discard finishes (the VM might call writepages() of
our btree inode's i_mapping for example, or an fsync log commit happens)
we
end up overwriting metadata with zeroes, which leads to errors from fsck
like the following:

  checking extents
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  read block failed check_tree_block
  owner ref check failed [833912832 16384]
  Errors found in extent allocation tree or chunk allocation
  checking free space cache
  checking fs roots
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  Check tree block failed, want=833912832, have=0
  read block failed check_tree_block
  root 5 root dir 256 error
  root 5 inode 260 errors 2001, no inode item, link count wrong
  unresolved ref dir 256 index 0 namelen 8 name foobar_3
filetype 1 errors 6, no dir index, no inode ref
  root 5 inode 262 errors 2001, no inode item, link count wrong
  unresolved ref dir 256 index 0 namelen 8 name foobar_5
filetype 1 errors 6, no dir index, no inode ref
  root 5 inode 263 errors 2001, no inode item, link count wrong
  (...)

Signed-off-by: Filipe Manana fdman...@suse.com
---
   fs/btrfs/ctree.h| 13 -
   fs/btrfs/disk-io.c  | 14 ++
   fs/btrfs/extent-tree.c  | 24 +++-
   fs/btrfs/free-space-cache.c | 26 +-
   fs/btrfs/volumes.c  | 33 ++---
   5 files changed, 100 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7f40a65..51056c7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1278,6 +1278,7 @@ struct btrfs_block_group_cache {
 unsigned int dirty:1;
 unsigned int iref:1;
 unsigned int has_caching_ctl:1;
+   unsigned int removed:1;

 int disk_cache_state;

@@ -1307,6 +1308,8 @@ struct btrfs_block_group_cache {

 /* For delayed block group creation or deletion of empty block
groups */
 struct list_head bg_list;
+
+   atomic_t trimming;
   };

   /* delayed seq elem */
@@ -1731,6 +1734,13 @@ struct btrfs_fs_info {

 /* For btrfs to record security options */
 struct security_mnt_opts security_opts;
+
+   /*
+* Chunks that can't be freed yet (under a trim/discard operation)
+* and will be latter freed.
+*/
+   rwlock_t pinned_chunks_lock;
+   struct list_head pinned_chunks;
   };

   struct btrfs_subvolume_writers {
@@ -3353,7 +3363,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle
*trans,
u64 type, u64 chunk_objectid, u64 chunk_offset,
u64 size);
   int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-struct btrfs_root *root, u64 group_start);
+struct btrfs_root *root, u64 group_start,
+bool *remove_em);
   void