[PATCH 02/35] fscrypt: add per-extent encryption support

2023-09-26 Thread Josef Bacik
This adds the code necessary for per-extent encryption.  We will store a
nonce for every extent we create, and then use the inode's policy and
the extent's nonce to derive a per-extent key.

This is meant to be flexible, if we choose to expand the on-disk extent
information in the future we have a version number we can use to change
what exists on disk.

The file system indicates it wants to use per-extent encryption by
setting s_cop->set_extent_context.  This also requires the use of inline
block encryption.

The support is relatively straightforward, the only "extra" bit is we're
deriving a per-extent key to use for the encryption, the inode still
controls the policy and access to the master key.

Signed-off-by: Josef Bacik 
---
 fs/crypto/crypto.c  |  10 ++-
 fs/crypto/fscrypt_private.h |  45 +++
 fs/crypto/inline_crypt.c|  84 
 fs/crypto/keysetup.c| 152 
 fs/crypto/policy.c  |  47 +++
 include/linux/fscrypt.h |  66 
 6 files changed, 403 insertions(+), 1 deletion(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 2b2bb5740322..e3ba06cb4082 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -40,6 +40,7 @@ static struct workqueue_struct *fscrypt_read_workqueue;
 static DEFINE_MUTEX(fscrypt_init_mutex);
 
 struct kmem_cache *fscrypt_inode_info_cachep;
+struct kmem_cache *fscrypt_extent_info_cachep;
 
 void fscrypt_enqueue_decrypt_work(struct work_struct *work)
 {
@@ -396,12 +397,19 @@ static int __init fscrypt_init(void)
if (!fscrypt_inode_info_cachep)
goto fail_free_queue;
 
+   fscrypt_extent_info_cachep = KMEM_CACHE(fscrypt_extent_info,
+   SLAB_RECLAIM_ACCOUNT);
+   if (!fscrypt_extent_info_cachep)
+   goto fail_free_info;
+
err = fscrypt_init_keyring();
if (err)
-   goto fail_free_info;
+   goto fail_free_extent_info;
 
return 0;
 
+fail_free_extent_info:
+   kmem_cache_destroy(fscrypt_extent_info_cachep);
 fail_free_info:
kmem_cache_destroy(fscrypt_inode_info_cachep);
 fail_free_queue:
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index b982073d1fe2..a7e902813cde 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -30,6 +30,8 @@
 #define FSCRYPT_CONTEXT_V1 1
 #define FSCRYPT_CONTEXT_V2 2
 
+#define FSCRYPT_EXTENT_CONTEXT_V1  1
+
 /* Keep this in sync with include/uapi/linux/fscrypt.h */
 #define FSCRYPT_MODE_MAX   FSCRYPT_MODE_AES_256_HCTR2
 
@@ -52,6 +54,28 @@ struct fscrypt_context_v2 {
u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
 };
 
+/*
+ * fscrypt_extent_context - the encryption context of an extent
+ *
+ * This is the on-disk information stored for an extent.  The policy and
+ * relevant information is stored in the inode, the per-extent information is
+ * simply the nonce that's used as KDF input in conjunction with the inode
+ * context to derive a per-extent key for encryption.
+ *
+ * At this point the master_key_identifier exists only for possible future
+ * expansion.  This will allow for an inode to have extents with multiple 
master
+ * keys, although sharing the same encryption mode.  This would be for 
re-keying
+ * or for reflinking between two differently encrypted inodes.  For now the
+ * master_key_descriptor must match the inode's, and we'll be using the inode's
+ * for all key derivation.
+ */
+struct fscrypt_extent_context {
+   u8 version; /* FSCRYPT_EXTENT_CONTEXT_V1 */
+   u8 encryption_mode;
+   u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
+   u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
+};
+
 /*
  * fscrypt_context - the encryption context of an inode
  *
@@ -260,6 +284,25 @@ struct fscrypt_inode_info {
u32 ci_hashed_ino;
 };
 
+/*
+ * fscrypt_extent_info - the "encryption key" for an extent.
+ *
+ * This contains the derived key for the given extent and the nonce for the
+ * extent.
+ */
+struct fscrypt_extent_info {
+   refcount_t refs;
+
+   /* The derived key for this extent. */
+   struct fscrypt_prepared_key prep_key;
+
+   /* The super block that this extent belongs to. */
+   struct super_block *sb;
+
+   /* This is the extent's nonce, loaded from the fscrypt_extent_context */
+   u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
+};
+
 typedef enum {
FS_DECRYPT = 0,
FS_ENCRYPT,
@@ -267,6 +310,7 @@ typedef enum {
 
 /* crypto.c */
 extern struct kmem_cache *fscrypt_inode_info_cachep;
+extern struct kmem_cache *fscrypt_extent_info_cachep;
 int fscrypt_initialize(struct super_block *sb);
 int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
u64 lblk_num, struct page *src_page,
@@ -326,6 +370,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 
*master_k

[PATCH 01/35] fscrypt: rename fscrypt_info => fscrypt_inode_info

2023-09-26 Thread Josef Bacik
We are going to track per-extent information, so it'll be necessary to
distinguish between inode infos and extent infos.  Rename fscrypt_info
to fscrypt_inode_info, adjusting any lines that now exceed 80
characters.

Signed-off-by: Josef Bacik 
---
 fs/crypto/crypto.c  | 13 ++--
 fs/crypto/fname.c   |  6 +++---
 fs/crypto/fscrypt_private.h | 42 -
 fs/crypto/hooks.c   |  2 +-
 fs/crypto/inline_crypt.c| 13 ++--
 fs/crypto/keyring.c |  4 ++--
 fs/crypto/keysetup.c| 38 +
 fs/crypto/keysetup_v1.c | 14 +++--
 fs/crypto/policy.c  | 11 +-
 include/linux/fs.h  |  4 ++--
 include/linux/fscrypt.h |  6 +++---
 11 files changed, 82 insertions(+), 71 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 6a837e4b80dc..2b2bb5740322 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -39,7 +39,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL;
 static struct workqueue_struct *fscrypt_read_workqueue;
 static DEFINE_MUTEX(fscrypt_init_mutex);
 
-struct kmem_cache *fscrypt_info_cachep;
+struct kmem_cache *fscrypt_inode_info_cachep;
 
 void fscrypt_enqueue_decrypt_work(struct work_struct *work)
 {
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(fscrypt_free_bounce_page);
  * simply contain the lblk_num (e.g., IV_INO_LBLK_32).
  */
 void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
-const struct fscrypt_info *ci)
+const struct fscrypt_inode_info *ci)
 {
u8 flags = fscrypt_policy_flags(&ci->ci_policy);
 
@@ -107,7 +107,7 @@ int fscrypt_crypt_block(const struct inode *inode, 
fscrypt_direction_t rw,
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist dst, src;
-   struct fscrypt_info *ci = inode->i_crypt_info;
+   struct fscrypt_inode_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
int res = 0;
 
@@ -391,8 +391,9 @@ static int __init fscrypt_init(void)
if (!fscrypt_read_workqueue)
goto fail;
 
-   fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
-   if (!fscrypt_info_cachep)
+   fscrypt_inode_info_cachep = KMEM_CACHE(fscrypt_inode_info,
+  SLAB_RECLAIM_ACCOUNT);
+   if (!fscrypt_inode_info_cachep)
goto fail_free_queue;
 
err = fscrypt_init_keyring();
@@ -402,7 +403,7 @@ static int __init fscrypt_init(void)
return 0;
 
 fail_free_info:
-   kmem_cache_destroy(fscrypt_info_cachep);
+   kmem_cache_destroy(fscrypt_inode_info_cachep);
 fail_free_queue:
destroy_workqueue(fscrypt_read_workqueue);
 fail:
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 6eae3f12ad50..7b3fc189593a 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -100,7 +100,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const 
struct qstr *iname,
 {
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
-   const struct fscrypt_info *ci = inode->i_crypt_info;
+   const struct fscrypt_inode_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
struct scatterlist sg;
@@ -157,7 +157,7 @@ static int fname_decrypt(const struct inode *inode,
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
-   const struct fscrypt_info *ci = inode->i_crypt_info;
+   const struct fscrypt_inode_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
int res;
@@ -568,7 +568,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name);
  */
 u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
 {
-   const struct fscrypt_info *ci = dir->i_crypt_info;
+   const struct fscrypt_inode_info *ci = dir->i_crypt_info;
 
WARN_ON_ONCE(!ci->ci_dirhash_key_initialized);
 
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 2d63da48635a..b982073d1fe2 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -189,18 +189,21 @@ struct fscrypt_prepared_key {
 };
 
 /*
- * fscrypt_info - the "encryption key" for an inode
+ * fscrypt_inode_info - the "encryption key" for an inode
  *
  * When an encrypted file's key is made available, an instance of this struct 
is
  * allocated and stored in ->i_crypt_info.  Once created, it remains until the
  * inode is evicted.
  */
-struct fscrypt_info {
+struct fscrypt_inode_info {
 
/* The key in a form prepared for actual encryption/decryption */
struct fscrypt_prepared_key ci_enc_key;
 
-   /* True if ci_enc_key should be freed

[PATCH 00/35] btrfs: add fscrypt support

2023-09-26 Thread Josef Bacik
Hello,

This is the newly reworked fscrypt support for btrfs.  There have been a few
things changed since Sweet Tea's last post[1], and my RFC[2].  The changes from
Sweet Tea's patchset are mostly related to the fscrypt changes, but I'll detail
them here

- We have a fscrypt_extent_info struct that simply has the blk key in it and a
  nonce.
- We have a stripped down on disk context that just has what we need for
  extents.  At this time we only care about the nonce, everything else is
  supposed to match the owning inode.
- I've disabled everything except bog standard v2 policies to limit the
  complexity.
- Added the necessary hooks we needed for checksumming the encrypted bios.
- Reworked the on-disk stuff to be better described and accessed through
  helpers.
- Plumbed through the fscrypt_extent_info through everything to simplify the
  API calls we need from fscrypt.
- Instead of handling async key free'ing in fscrypt, handle the case where we're
  freeing extent_maps under the lock in a safe way.  This is cleaner than
  pushing this into fscrypt.
- Fixed a few things that fsstress uncovered in testing.

Changes to the fscrypt code since my RFC

- Took Eric's advice and added the policy and key to the extent context, this
  way if we want to in the future we could handle key changing.
- Added a helper to give us the fscrypt extent info context size.  We need the
  size ahead of time to setup the item properly.
- Fixed the blk crypto fallback not actually working with our process_bio
  callback.  Added a policy flag to make sure the checks work properly.
- Added some documentation.

Things left to do

- I still have to update fstests to deal with v2 only policies.  I haven't
  touched fstests at all yet, I've merely done my own rough testing with
  fsstress.
- Update the btrfs-progs patches.  This needs to be done to get the fstests
  stuff to work as well.
- fsverity still isn't encrypted.  I'm going to hit that next, it should be
  straightforward enough.

This is based on for-next from Dave's tree [3], but in case that moves between
now and then you can see my current branch here [4].  Thanks,

Josef

[1] 
https://lore.kernel.org/linux-fscrypt/cover.1693630890.git.sweettea-ker...@dorminy.me/
[2] 
https://lore.kernel.org/linux-btrfs/cover.1694738282.git.jo...@toxicpanda.com/
[3] https://github.com/kdave/btrfs-devel/tree/for-next
[4] https://github.com/josefbacik/linux/tree/fscrypt

Josef Bacik (20):
  fscrypt: rename fscrypt_info => fscrypt_inode_info
  fscrypt: add per-extent encryption support
  fscrypt: disable all but standard v2 policies for extent encryption
  blk-crypto: add a process bio callback
  fscrypt: add documentation about extent encryption
  btrfs: add infrastructure for safe em freeing
  btrfs: add fscrypt_info and encryption_type to ordered_extent
  btrfs: plumb through setting the fscrypt_info for ordered extents
  btrfs: populate the ordered_extent with the fscrypt context
  btrfs: keep track of fscrypt info and orig_start for dio reads
  btrfs: add an optional encryption context to the end of file extents
  btrfs: pass through fscrypt_extent_info to the file extent helpers
  btrfs: pass the fscrypt_info through the replace extent infrastructure
  btrfs: implement the fscrypt extent encryption hooks
  btrfs: setup fscrypt_extent_info for new extents
  btrfs: populate ordered_extent with the orig offset
  btrfs: set the bio fscrypt context when applicable
  btrfs: add a bio argument to btrfs_csum_one_bio
  btrfs: add orig_logical to btrfs_bio
  btrfs: implement process_bio cb for fscrypt

Omar Sandoval (7):
  fscrypt: expose fscrypt_nokey_name
  btrfs: disable various operations on encrypted inodes
  btrfs: start using fscrypt hooks
  btrfs: add inode encryption contexts
  btrfs: add new FEATURE_INCOMPAT_ENCRYPT flag
  btrfs: adapt readdir for encrypted and nokey names
  btrfs: implement fscrypt ioctls

Sweet Tea Dorminy (8):
  btrfs: disable verity on encrypted inodes
  btrfs: handle nokey names.
  btrfs: add encryption to CONFIG_BTRFS_DEBUG
  btrfs: add get_devices hook for fscrypt
  btrfs: turn on inlinecrypt mount option for encrypt
  btrfs: set file extent encryption explicitly
  btrfs: add fscrypt_info and encryption_type to extent_map
  btrfs: explicitly track file extent length for replace and drop

 Documentation/filesystems/fscrypt.rst |  36 ++
 block/blk-crypto-fallback.c   |  28 ++
 block/blk-crypto-profile.c|   2 +
 block/blk-crypto.c|   6 +-
 fs/btrfs/Makefile |   1 +
 fs/btrfs/accessors.h  |  50 +++
 fs/btrfs/bio.c|  45 ++-
 fs/btrfs/bio.h|   6 +
 fs/btrfs/btrfs_inode.h|   3 +-
 fs/btrfs/compression.c|   6 +
 fs/btrfs/ctree.h  |   4 +
 fs/btrfs/defrag.c |  10 +-
 fs/btrfs/delayed-inode.c  |  29 +

Re: [RFC PATCH 0/4] fscrypt: add support for per-extent encryption

2023-09-15 Thread Josef Bacik
On Thu, Sep 14, 2023 at 11:48:16PM -0700, Eric Biggers wrote:
> On Thu, Sep 14, 2023 at 08:47:41PM -0400, Josef Bacik wrote:
> > Hello,
> > 
> > This is meant as a replacement for the last set of patches Sweet Tea sent 
> > [1].
> > This is an attempt to find a different path forward.  Strip down everything 
> > to
> > the basics.  Essentially all we appear to need is a nonce, and then we can 
> > use
> > the inode context to derive per-extent keys.
> > 
> > I'm sending this as an RFC to see if this is a better direction to try and 
> > make
> > some headway on this project.  The btrfs side doesn't change too much, the 
> > code
> > just needs to be adjusted to use the new helpers for the extent contexts.  I
> > have this work mostly complete, but I'm afraid I won't have it ready for 
> > another
> > day or two and I want to get feedback on this ASAP before I burn too much 
> > time
> > on it.
> > 
> > Additionally there is a callback I've put in the inline block crypto stuff 
> > that
> > we need in order to handle the checksumming.  I made my best guess here as 
> > to
> > what would be the easiest and simplest way to achieve what we need, but I'm 
> > open
> > to suggestions here.
> > 
> > The other note is I've disabled all of the policy variations other than 
> > default
> > v2 policies if you enable extent encryption.  This is for simplicity sake.  
> > We
> > could probably make most of it work, but reflink is basically impossible 
> > for v1
> > with direct key, and is problematic for the lblk related options.  It 
> > appears
> > this is fine, as those other modes are for specific use cases and the vast
> > majority of normal users are encouraged to use normal v2 policies anyway.
> > 
> > This stripped down version gives us most of what we want, we can reflink 
> > between
> > different inodes that have the same policy.  We lose the ability to mix
> > differently encrypted extents in the same inode, but this is an acceptable
> > limitation for now.
> > 
> > This has only been compile tested, and as I've said I haven't wired it
> > completely up into btrfs yet.  But this is based on a rough wire up and 
> > appears
> > to give us everything we need.  The btrfs portion of Sweet Teas patches are
> > basically untouched except where we use these helpers to deal with the 
> > extent
> > contexts.  Thanks,
> > 
> > Josef
> > 
> > [1] 
> > https://lore.kernel.org/linux-fscrypt/cover.1693630890.git.sweettea-ker...@dorminy.me/
> > 
> > Josef Bacik (4):
> >   fscrypt: rename fscrypt_info => fscrypt_inode_info
> >   fscrypt: add per-extent encryption support
> >   fscrypt: disable all but standard v2 policies for extent encryption
> >   blk-crypto: add a process bio callback
> > 
> >  block/blk-crypto-fallback.c |  18 
> >  block/blk-crypto-profile.c  |   2 +
> >  block/blk-crypto.c  |   6 +-
> >  fs/crypto/crypto.c  |  23 +++--
> >  fs/crypto/fname.c   |   6 +-
> >  fs/crypto/fscrypt_private.h |  78 
> >  fs/crypto/hooks.c   |   2 +-
> >  fs/crypto/inline_crypt.c|  50 +--
> >  fs/crypto/keyring.c |   4 +-
> >  fs/crypto/keysetup.c| 174 
> >  fs/crypto/keysetup_v1.c |  14 +--
> >  fs/crypto/policy.c  |  45 --
> >  include/linux/blk-crypto.h  |   9 +-
> >  include/linux/fs.h  |   4 +-
> >  include/linux/fscrypt.h |  41 -
> >  15 files changed, 400 insertions(+), 76 deletions(-)
> 
> Thanks Josef!  At a high level this looks good to me.  It's much simpler.  I
> guess my main question is "what is missing" (besides the obvious things like
> updating the documentation and polishing code comments).  I see you got rid 
> of a
> lot of the complexity in Sweet Tea's patchset, which is great as I think a lot
> of it was unnecessary as I've mentioned.  But maybe something got overlooked?
> I'm mainly wondering about the patches like "fscrypt: allow asynchronous info
> freeing" that were a bit puzzling but have now gone away.
> 

I'm going to fix this in a different way internally in btrfs.  There's only one
place where we can't drop the lock, so I plan to just collate free'd EM's and
free them in bulk after we drop the lock.  This *may* not fix the problem, I
have to wait for lockdep to tell me I'm stupid and I missed some other
dependen

[PATCH 3/4] fscrypt: disable all but standard v2 policies for extent encryption

2023-09-14 Thread Josef Bacik
The different encryption related options for fscrypt are too numerous to
support for extent based encryption.  Support for a few of these options
could possibly be added, but since they're niche options simply reject
them for file systems using extent based encryption.

Signed-off-by: Josef Bacik 
---
 fs/crypto/policy.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index cb944338271d..3c8665c21ee8 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -198,6 +198,12 @@ static bool fscrypt_supported_v1_policy(const struct 
fscrypt_policy_v1 *policy,
return false;
}
 
+   if (inode->i_sb->s_cop->flags & FS_CFLG_EXTENT_ENCRYPTION) {
+   fscrypt_warn(inode,
+"v1 policies can't be used on file systems that 
use extent encryption");
+   return false;
+   }
+
return true;
 }
 
@@ -233,6 +239,12 @@ static bool fscrypt_supported_v2_policy(const struct 
fscrypt_policy_v2 *policy,
return false;
}
 
+   if ((inode->i_sb->s_cop->flags & FS_CFLG_EXTENT_ENCRYPTION) && count) {
+   fscrypt_warn(inode,
+"Encryption flags aren't supported on file systems 
that use extent encryption");
+   return false;
+   }
+
if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) &&
!supported_direct_key_modes(inode, policy->contents_encryption_mode,
policy->filenames_encryption_mode))
-- 
2.41.0



[PATCH 4/4] blk-crypto: add a process bio callback

2023-09-14 Thread Josef Bacik
Btrfs does checksumming, and the checksums need to match the bytes on
disk.  In order to facilitate this add a process bio callback for the
blk-crypto layer.  This allows the file system to specify a callback and
then can process the encrypted bio as necessary.

For btrfs, writes will have the checksums calculated and saved into our
relevant data structures for storage once the write completes.  For
reads we will validate the checksums match what is on disk and error out
if there is a mismatch.

This is incompatible with native encryption obviously, so make sure we
don't use native encryption if this callback is set.

Signed-off-by: Josef Bacik 
---
 block/blk-crypto-fallback.c | 18 ++
 block/blk-crypto-profile.c  |  2 ++
 block/blk-crypto.c  |  6 +-
 fs/crypto/inline_crypt.c|  3 ++-
 include/linux/blk-crypto.h  |  9 +++--
 include/linux/fscrypt.h | 14 ++
 6 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index e6468eab2681..91e5f70ef6b1 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -346,6 +346,15 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio 
**bio_ptr)
}
}
 
+   /* Process the encrypted bio before we submit it. */
+   if (bc->bc_key->crypto_cfg.process_bio) {
+   blk_st = bc->bc_key->crypto_cfg.process_bio(src_bio, enc_bio);
+   if (blk_st != BLK_STS_OK) {
+   src_bio->bi_status = blk_st;
+   goto out_free_bounce_pages;
+   }
+   }
+
enc_bio->bi_private = src_bio;
enc_bio->bi_end_io = blk_crypto_fallback_encrypt_endio;
*bio_ptr = enc_bio;
@@ -391,6 +400,15 @@ static void blk_crypto_fallback_decrypt_bio(struct 
work_struct *work)
unsigned int i;
blk_status_t blk_st;
 
+   /* Process the bio first before trying to decrypt. */
+   if (bc->bc_key->crypto_cfg.process_bio) {
+   blk_st = bc->bc_key->crypto_cfg.process_bio(bio, bio);
+   if (blk_st != BLK_STS_OK) {
+   bio->bi_status = blk_st;
+   goto out_no_keyslot;
+   }
+   }
+
/*
 * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
 * this bio's algorithm and key.
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
index 7fabc883e39f..1d55b952e52e 100644
--- a/block/blk-crypto-profile.c
+++ b/block/blk-crypto-profile.c
@@ -352,6 +352,8 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile 
*profile,
return false;
if (profile->max_dun_bytes_supported < cfg->dun_bytes)
return false;
+   if (cfg->process_bio)
+   return false;
return true;
 }
 
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 4d760b092deb..50556952df19 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -321,6 +321,8 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio 
*bio,
  * @dun_bytes: number of bytes that will be used to specify the DUN when this
  *key is used
  * @data_unit_size: the data unit size to use for en/decryption
+ * @process_bio: the call back if the upper layer needs to process the 
encrypted
+ *  bio
  *
  * Return: 0 on success, -errno on failure.  The caller is responsible for
  *zeroizing both blk_key and raw_key when done with them.
@@ -328,7 +330,8 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio 
*bio,
 int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
enum blk_crypto_mode_num crypto_mode,
unsigned int dun_bytes,
-   unsigned int data_unit_size)
+   unsigned int data_unit_size,
+   blk_crypto_process_bio_t process_bio)
 {
const struct blk_crypto_mode *mode;
 
@@ -350,6 +353,7 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, 
const u8 *raw_key,
blk_key->crypto_cfg.crypto_mode = crypto_mode;
blk_key->crypto_cfg.dun_bytes = dun_bytes;
blk_key->crypto_cfg.data_unit_size = data_unit_size;
+   blk_key->crypto_cfg.process_bio = process_bio;
blk_key->data_unit_size_bits = ilog2(data_unit_size);
blk_key->size = mode->keysize;
memcpy(blk_key->raw, raw_key, mode->keysize);
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 69bb6ac9a7f7..0558fe73e82f 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -168,7 +168,8 @@ int fscrypt_prepare_inline_crypt_key(struct 
fscrypt_prepared_key *prep_key,
return -ENOMEM;
 
err = blk_crypto_init_key(blk_key, raw_key, crypto_mode,
- fscrypt_get_dun_bytes(c

[PATCH 2/4] fscrypt: add per-extent encryption support

2023-09-14 Thread Josef Bacik
This adds the code necessary for per-extent encryption.  We will store a
nonce for every extent we create, and then use the inode's policy and
the extent's nonce to derive a per-extent key.

This is meant to be flexible, if we choose to expand the on-disk extent
information in the future we have a version number we can use to change
what exists on disk.

The file system indicates it wants to use per-extent encryption by
setting s_cop->set_extent_context.  This also requires the use of inline
block encryption.

The support is relatively straightforward, the only "extra" bit is we're
deriving a per-extent key to use for the encryption, the inode still
controls the policy and access to the master key.

Signed-off-by: Josef Bacik 
---
 fs/crypto/crypto.c  |  10 ++-
 fs/crypto/fscrypt_private.h |  36 ++
 fs/crypto/inline_crypt.c|  34 +
 fs/crypto/keysetup.c| 136 
 fs/crypto/policy.c  |  22 ++
 include/linux/fscrypt.h |  21 ++
 6 files changed, 258 insertions(+), 1 deletion(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 2b2bb5740322..e3ba06cb4082 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -40,6 +40,7 @@ static struct workqueue_struct *fscrypt_read_workqueue;
 static DEFINE_MUTEX(fscrypt_init_mutex);
 
 struct kmem_cache *fscrypt_inode_info_cachep;
+struct kmem_cache *fscrypt_extent_info_cachep;
 
 void fscrypt_enqueue_decrypt_work(struct work_struct *work)
 {
@@ -396,12 +397,19 @@ static int __init fscrypt_init(void)
if (!fscrypt_inode_info_cachep)
goto fail_free_queue;
 
+   fscrypt_extent_info_cachep = KMEM_CACHE(fscrypt_extent_info,
+   SLAB_RECLAIM_ACCOUNT);
+   if (!fscrypt_extent_info_cachep)
+   goto fail_free_info;
+
err = fscrypt_init_keyring();
if (err)
-   goto fail_free_info;
+   goto fail_free_extent_info;
 
return 0;
 
+fail_free_extent_info:
+   kmem_cache_destroy(fscrypt_extent_info_cachep);
 fail_free_info:
kmem_cache_destroy(fscrypt_inode_info_cachep);
 fail_free_queue:
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index b982073d1fe2..7e5ab3695023 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -30,6 +30,8 @@
 #define FSCRYPT_CONTEXT_V1 1
 #define FSCRYPT_CONTEXT_V2 2
 
+#define FSCRYPT_EXTENT_CONTEXT_V1  1
+
 /* Keep this in sync with include/uapi/linux/fscrypt.h */
 #define FSCRYPT_MODE_MAX   FSCRYPT_MODE_AES_256_HCTR2
 
@@ -52,6 +54,19 @@ struct fscrypt_context_v2 {
u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
 };
 
+/*
+ * fscrypt_extent_context - the encryption context of an extent
+ *
+ * This is the on-disk information stored for an extent.  The policy and
+ * relevant information is stored in the inode, the per-extent information is
+ * simply the nonce that's used as KDF input in conjunction with the inode
+ * context to derive a per-extent key for encryption.
+ */
+struct fscrypt_extent_context {
+   u8 version; /* FSCRYPT_EXTENT_CONTEXT_V1 */
+   u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
+};
+
 /*
  * fscrypt_context - the encryption context of an inode
  *
@@ -260,6 +275,25 @@ struct fscrypt_inode_info {
u32 ci_hashed_ino;
 };
 
+/*
+ * fscrypt_extent_info - the "encryption key" for an extent.
+ *
+ * This contains the derived key for the given extent and the nonce for the
+ * extent.
+ */
+struct fscrypt_extent_info {
+   refcount_t refs;
+
+   /* The derived key for this extent. */
+   struct fscrypt_prepared_key prep_key;
+
+   /* The super block that this extent belongs to. */
+   struct super_block *sb;
+
+   /* This is the extent's nonce, loaded from the fscrypt_extent_context */
+   u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
+};
+
 typedef enum {
FS_DECRYPT = 0,
FS_ENCRYPT,
@@ -267,6 +301,7 @@ typedef enum {
 
 /* crypto.c */
 extern struct kmem_cache *fscrypt_inode_info_cachep;
+extern struct kmem_cache *fscrypt_extent_info_cachep;
 int fscrypt_initialize(struct super_block *sb);
 int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
u64 lblk_num, struct page *src_page,
@@ -326,6 +361,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 
*master_key,
 #define HKDF_CONTEXT_DIRHASH_KEY   5 /* info=file_nonce*/
 #define HKDF_CONTEXT_IV_INO_LBLK_32_KEY6 /* info=mode_num||fs_uuid 
*/
 #define HKDF_CONTEXT_INODE_HASH_KEY7 /* info=   */
+#define HKDF_CONTEXT_PER_EXTENT_ENC_KEY8 /* info=extent_nonce  
*/
 
 int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
const u8 *info, unsigned int infolen,
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 3cfb04b8b64f..69bb6ac9a7f7 100644
--- a/fs

[PATCH 1/4] fscrypt: rename fscrypt_info => fscrypt_inode_info

2023-09-14 Thread Josef Bacik
We are going to track per-extent information, so it'll be necessary to
distinguish between inode infos and extent infos.  Rename fscrypt_info
to fscrypt_inode_info, adjusting any lines that now exceed 80
characters.

Signed-off-by: Josef Bacik 
---
 fs/crypto/crypto.c  | 13 ++--
 fs/crypto/fname.c   |  6 +++---
 fs/crypto/fscrypt_private.h | 42 -
 fs/crypto/hooks.c   |  2 +-
 fs/crypto/inline_crypt.c| 13 ++--
 fs/crypto/keyring.c |  4 ++--
 fs/crypto/keysetup.c| 38 +
 fs/crypto/keysetup_v1.c | 14 +++--
 fs/crypto/policy.c  | 11 +-
 include/linux/fs.h  |  4 ++--
 include/linux/fscrypt.h |  6 +++---
 11 files changed, 82 insertions(+), 71 deletions(-)

diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 6a837e4b80dc..2b2bb5740322 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -39,7 +39,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL;
 static struct workqueue_struct *fscrypt_read_workqueue;
 static DEFINE_MUTEX(fscrypt_init_mutex);
 
-struct kmem_cache *fscrypt_info_cachep;
+struct kmem_cache *fscrypt_inode_info_cachep;
 
 void fscrypt_enqueue_decrypt_work(struct work_struct *work)
 {
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(fscrypt_free_bounce_page);
  * simply contain the lblk_num (e.g., IV_INO_LBLK_32).
  */
 void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
-const struct fscrypt_info *ci)
+const struct fscrypt_inode_info *ci)
 {
u8 flags = fscrypt_policy_flags(&ci->ci_policy);
 
@@ -107,7 +107,7 @@ int fscrypt_crypt_block(const struct inode *inode, 
fscrypt_direction_t rw,
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist dst, src;
-   struct fscrypt_info *ci = inode->i_crypt_info;
+   struct fscrypt_inode_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
int res = 0;
 
@@ -391,8 +391,9 @@ static int __init fscrypt_init(void)
if (!fscrypt_read_workqueue)
goto fail;
 
-   fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
-   if (!fscrypt_info_cachep)
+   fscrypt_inode_info_cachep = KMEM_CACHE(fscrypt_inode_info,
+  SLAB_RECLAIM_ACCOUNT);
+   if (!fscrypt_inode_info_cachep)
goto fail_free_queue;
 
err = fscrypt_init_keyring();
@@ -402,7 +403,7 @@ static int __init fscrypt_init(void)
return 0;
 
 fail_free_info:
-   kmem_cache_destroy(fscrypt_info_cachep);
+   kmem_cache_destroy(fscrypt_inode_info_cachep);
 fail_free_queue:
destroy_workqueue(fscrypt_read_workqueue);
 fail:
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 6eae3f12ad50..7b3fc189593a 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -100,7 +100,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const 
struct qstr *iname,
 {
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
-   const struct fscrypt_info *ci = inode->i_crypt_info;
+   const struct fscrypt_inode_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
struct scatterlist sg;
@@ -157,7 +157,7 @@ static int fname_decrypt(const struct inode *inode,
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
-   const struct fscrypt_info *ci = inode->i_crypt_info;
+   const struct fscrypt_inode_info *ci = inode->i_crypt_info;
struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
union fscrypt_iv iv;
int res;
@@ -568,7 +568,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name);
  */
 u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
 {
-   const struct fscrypt_info *ci = dir->i_crypt_info;
+   const struct fscrypt_inode_info *ci = dir->i_crypt_info;
 
WARN_ON_ONCE(!ci->ci_dirhash_key_initialized);
 
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 2d63da48635a..b982073d1fe2 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -189,18 +189,21 @@ struct fscrypt_prepared_key {
 };
 
 /*
- * fscrypt_info - the "encryption key" for an inode
+ * fscrypt_inode_info - the "encryption key" for an inode
  *
  * When an encrypted file's key is made available, an instance of this struct 
is
  * allocated and stored in ->i_crypt_info.  Once created, it remains until the
  * inode is evicted.
  */
-struct fscrypt_info {
+struct fscrypt_inode_info {
 
/* The key in a form prepared for actual encryption/decryption */
struct fscrypt_prepared_key ci_enc_key;
 
-   /* True if ci_enc_key should be freed

[RFC PATCH 0/4] fscrypt: add support for per-extent encryption

2023-09-14 Thread Josef Bacik
Hello,

This is meant as a replacement for the last set of patches Sweet Tea sent [1].
This is an attempt to find a different path forward.  Strip down everything to
the basics.  Essentially all we appear to need is a nonce, and then we can use
the inode context to derive per-extent keys.

I'm sending this as an RFC to see if this is a better direction to try and make
some headway on this project.  The btrfs side doesn't change too much, the code
just needs to be adjusted to use the new helpers for the extent contexts.  I
have this work mostly complete, but I'm afraid I won't have it ready for another
day or two and I want to get feedback on this ASAP before I burn too much time
on it.

Additionally there is a callback I've put in the inline block crypto stuff that
we need in order to handle the checksumming.  I made my best guess here as to
what would be the easiest and simplest way to achieve what we need, but I'm open
to suggestions here.

The other note is I've disabled all of the policy variations other than default
v2 policies if you enable extent encryption.  This is for simplicity's sake.  We
could probably make most of it work, but reflink is basically impossible for v1
with direct key, and is problematic for the lblk related options.  It appears
this is fine, as those other modes are for specific use cases and the vast
majority of normal users are encouraged to use normal v2 policies anyway.

This stripped down version gives us most of what we want, we can reflink between
different inodes that have the same policy.  We lose the ability to mix
differently encrypted extents in the same inode, but this is an acceptable
limitation for now.

This has only been compile tested, and as I've said I haven't wired it
completely up into btrfs yet.  But this is based on a rough wire up and appears
to give us everything we need.  The btrfs portion of Sweet Teas patches are
basically untouched except where we use these helpers to deal with the extent
contexts.  Thanks,

Josef

[1] 
https://lore.kernel.org/linux-fscrypt/cover.1693630890.git.sweettea-ker...@dorminy.me/

Josef Bacik (4):
  fscrypt: rename fscrypt_info => fscrypt_inode_info
  fscrypt: add per-extent encryption support
  fscrypt: disable all but standard v2 policies for extent encryption
  blk-crypto: add a process bio callback

 block/blk-crypto-fallback.c |  18 
 block/blk-crypto-profile.c  |   2 +
 block/blk-crypto.c  |   6 +-
 fs/crypto/crypto.c  |  23 +++--
 fs/crypto/fname.c   |   6 +-
 fs/crypto/fscrypt_private.h |  78 
 fs/crypto/hooks.c   |   2 +-
 fs/crypto/inline_crypt.c|  50 +--
 fs/crypto/keyring.c |   4 +-
 fs/crypto/keysetup.c| 174 
 fs/crypto/keysetup_v1.c |  14 +--
 fs/crypto/policy.c  |  45 --
 include/linux/blk-crypto.h  |   9 +-
 include/linux/fs.h  |   4 +-
 include/linux/fscrypt.h |  41 -
 15 files changed, 400 insertions(+), 76 deletions(-)

-- 
2.41.0



Re: [PATCH 21/42] btrfs: make process_one_page() to handle subpage locking

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

Introduce a new data inodes specific subpage member, writers, to record
how many sectors are under page lock for delalloc writing.

This member acts pretty much the same as readers, except it's only for
delalloc writes.

This is important for delalloc code to trace which page can really be
freed, as we have cases like run_delalloc_nocow() where we may exit
processing nocow range inside a page, but need to exit to do cow half
way.
In that case, we need a way to determine if we can really unlock a full
page.

With the new btrfs_subpage::writers, there is a new requirement:
- Page locked by process_one_page() must be unlocked by
   process_one_page()
   There are still tons of call sites manually lock and unlock a page,
   without updating btrfs_subpage::writers.
   So if we lock a page through process_one_page() then it must be
   unlocked by process_one_page() to keep btrfs_subpage::writers
   consistent.

   This will be handled in next patch.

Signed-off-by: Qu Wenruo 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 20/42] btrfs: make end_bio_extent_writepage() to be subpage compatible

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

Now in end_bio_extent_writepage(), the only subpage incompatible code is
the end_page_writeback().

Just call the subpage helpers.

Signed-off-by: Qu Wenruo 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 19/42] btrfs: make __process_pages_contig() to handle subpage dirty/error/writeback status

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

For __process_pages_contig() and process_one_page(), to handle subpage
we only need to pass bytenr in and call subpage helpers to handle
dirty/error/writeback status.

Signed-off-by: Qu Wenruo 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 18/42] btrfs: make btrfs_dirty_pages() to be subpage compatible

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

Since the extent io tree operations in btrfs_dirty_pages() are already
subpage compatible, we only need to make the page status update to use
subpage helpers.

Signed-off-by: Qu Wenruo 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 17/42] btrfs: only require sector size alignment for end_bio_extent_writepage()

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

Just like read page, for subpage support we only require sector size
alignment.

So change the error message condition to only require sector alignment.

This should not affect existing code, as for regular sectorsize ==
PAGE_SIZE case, we are still requiring page alignment.

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/extent_io.c | 29 -
  1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 53ac22e3560f..94f8b3ffe6a7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2779,25 +2779,20 @@ static void end_bio_extent_writepage(struct bio *bio)
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+   const u32 sectorsize = fs_info->sectorsize;
  
-		/* We always issue full-page reads, but if some block

-* in a page fails to read, blk_update_request() will
-* advance bv_offset and adjust bv_len to compensate.
-* Print a warning for nonzero offsets, and an error
-* if they don't add up to a full page.  */
-   if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
-   if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
-   btrfs_err(fs_info,
-  "partial page write in btrfs with offset %u and 
length %u",
-   bvec->bv_offset, bvec->bv_len);
-   else
-   btrfs_info(fs_info,
-  "incomplete page write in btrfs with offset %u 
and length %u",
-   bvec->bv_offset, bvec->bv_len);
-   }
+   /* Btrfs read write should always be sector aligned. */
+   if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+   btrfs_err(fs_info,
+   "partial page write in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+   else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
+   btrfs_info(fs_info,
+   "incomplete page write with offset %u and length %u",
+  bvec->bv_offset, bvec->bv_len);
  
-		start = page_offset(page);

-   end = start + bvec->bv_offset + bvec->bv_len - 1;
+   start = page_offset(page) + bvec->bv_offset;
+   end = start + bvec->bv_len - 1;


Does this bit work out for you now?  Because before start was just the page 
offset.  Clearly the way it was before is a bug (I think?), because it gets used 
in btrfs_writepage_endio_finish_ordered() with the start+len, so you really do 
want start = page_offset(page) + bv_offset.  But this is a behavior change that 
warrants a patch of its own as it's unrelated to the sectorsize change.  (Yes I 
realize I'm asking for more patches in an already huge series, yes I'm insane.) 
Thanks,


Josef


Re: [PATCH 16/42] btrfs: provide btrfs_page_clamp_*() helpers

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

In the coming subpage RW supports, there are a lot of page status update
calls which need to be converted to subpage compatible version, which
needs @start and @len.

Some call sites already have such @start/@len and are already in
page range, like various endio functions.

But there are also call sites which need to clamp the range for subpage
case, like btrfs_dirty_pages() and __process_pages_contig().

Here we introduce new helpers, btrfs_page_clamp_*(), to do and only do the
clamp for subpage version.

Although in theory all existing btrfs_page_*() calls could be converted to
use btrfs_page_clamp_*() directly, that would make us do
unnecessary clamp operations.

Signed-off-by: Qu Wenruo 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 15/42] btrfs: refactor the page status update into process_one_page()

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

In __process_pages_contig() we update page status according to page_ops.

That update process is a bunch of if () {} branches, which lies inside
two loops, this makes it pretty hard to expand for later subpage
operations.

So this patch will extract this operations into its own function,
process_one_page().

Also since we're refactoring __process_pages_contig(), also move the new
helper and __process_pages_contig() before the first caller of them, to
remove the forward declaration.

Signed-off-by: Qu Wenruo 


The refactor is fine, I still have questions about the pages_processed thing, 
but that can be addressed in the other patch and then will trickle down to here, 
so you can add


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 14/42] btrfs: pass bytenr directly to __process_pages_contig()

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

As a preparation for incoming subpage support, we need bytenr passed to
__process_pages_contig() directly, not the current page index.

So change the parameter and all callers to pass bytenr in.

With the modification, here we need to replace the old @index_ret with
@processed_end for __process_pages_contig(), but this brings a small
problem.

Normally we follow the inclusive return value, meaning @processed_end
should be the last byte we processed.

If parameter @start is 0, and we failed to lock any page, then we would
return @processed_end as -1, causing more problems for
__unlock_for_delalloc().

So here for @processed_end, we use two different return value patterns.
If we have locked any page, @processed_end will be the last byte of
locked page.
Or it will be @start otherwise.

This change will impact lock_delalloc_pages(), so it needs to check
@processed_end to only unlock the range if we have locked any.

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/extent_io.c | 57 
  1 file changed, 37 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ac01f29b00c9..ff24db8513b4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1807,8 +1807,8 @@ bool btrfs_find_delalloc_range(struct extent_io_tree 
*tree, u64 *start,
  
  static int __process_pages_contig(struct address_space *mapping,

  struct page *locked_page,
- pgoff_t start_index, pgoff_t end_index,
- unsigned long page_ops, pgoff_t *index_ret);
+ u64 start, u64 end, unsigned long page_ops,
+ u64 *processed_end);
  
  static noinline void __unlock_for_delalloc(struct inode *inode,

   struct page *locked_page,
@@ -1821,7 +1821,7 @@ static noinline void __unlock_for_delalloc(struct inode 
*inode,
if (index == locked_page->index && end_index == index)
return;
  
-	__process_pages_contig(inode->i_mapping, locked_page, index, end_index,

+   __process_pages_contig(inode->i_mapping, locked_page, start, end,
   PAGE_UNLOCK, NULL);
  }
  
@@ -1831,19 +1831,19 @@ static noinline int lock_delalloc_pages(struct inode *inode,

u64 delalloc_end)
  {
unsigned long index = delalloc_start >> PAGE_SHIFT;
-   unsigned long index_ret = index;
unsigned long end_index = delalloc_end >> PAGE_SHIFT;
+   u64 processed_end = delalloc_start;
int ret;
  
  	ASSERT(locked_page);

if (index == locked_page->index && index == end_index)
return 0;
  
-	ret = __process_pages_contig(inode->i_mapping, locked_page, index,

-end_index, PAGE_LOCK, &index_ret);
-   if (ret == -EAGAIN)
+   ret = __process_pages_contig(inode->i_mapping, locked_page, 
delalloc_start,
+delalloc_end, PAGE_LOCK, &processed_end);
+   if (ret == -EAGAIN && processed_end > delalloc_start)
__unlock_for_delalloc(inode, locked_page, delalloc_start,
- (u64)index_ret << PAGE_SHIFT);
+ processed_end);
return ret;
  }
  
@@ -1938,12 +1938,14 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
  
  static int __process_pages_contig(struct address_space *mapping,

  struct page *locked_page,
- pgoff_t start_index, pgoff_t end_index,
- unsigned long page_ops, pgoff_t *index_ret)
+ u64 start, u64 end, unsigned long page_ops,
+ u64 *processed_end)
  {
+   pgoff_t start_index = start >> PAGE_SHIFT;
+   pgoff_t end_index = end >> PAGE_SHIFT;
+   pgoff_t index = start_index;
unsigned long nr_pages = end_index - start_index + 1;
unsigned long pages_processed = 0;
-   pgoff_t index = start_index;
struct page *pages[16];
unsigned ret;
int err = 0;
@@ -1951,17 +1953,19 @@ static int __process_pages_contig(struct address_space 
*mapping,
  
  	if (page_ops & PAGE_LOCK) {

ASSERT(page_ops == PAGE_LOCK);
-   ASSERT(index_ret && *index_ret == start_index);
+   ASSERT(processed_end && *processed_end == start);
}
  
  	if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)

mapping_set_error(mapping, -EIO);
  
  	while (nr_pages > 0) {

-   ret = find_get_pages_contig(mapping, index,
+   int found_pages;
+
+   found_pages = find_get_pages_contig(mapping, index,
 min_t(unsigned long,
 nr_pages,

Re: [PATCH 13/42] btrfs: rename PagePrivate2 to PageOrdered inside btrfs

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

Inside btrfs, we use Private2 page status to indicate we have ordered
extent with pending IO for the sector.

But the page status name, Private2, tells us nothing about the bit
itself, so this patch will rename it to Ordered.
And with extra comment about the bit added, so reader who is still
uncertain about the page Ordered status, will find the comment pretty
easily.

Signed-off-by: Qu Wenruo 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 12/42] btrfs: make Private2 lifespan more consistent

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

Currently btrfs uses page Private2 bit to indicate if we have ordered
extent for the page range.

But the lifespan of it is not consistent, during regular writeback path,
there are two locations to clear the same PagePrivate2:

 T - Page marked Dirty
 |
 + - Page marked Private2, through btrfs_run_dealloc_range()
 |
 + - Page cleared Private2, through btrfs_writepage_cow_fixup()
 |   in __extent_writepage_io()
 |   ^^^ Private2 cleared for the first time
 |
 + - Page marked Writeback, through btrfs_set_range_writeback()
 |   in __extent_writepage_io().
 |
 + - Page cleared Private2, through
 |   btrfs_writepage_endio_finish_ordered()
 |   ^^^ Private2 cleared for the second time.
 |
 + - Page cleared Writeback, through
 btrfs_writepage_endio_finish_ordered()

Currently PagePrivate2 is mostly to prevent ordered extent accounting
being executed for both endio and invalidatepage.
Thus only the one who cleared page Private2 is responsible for ordered
extent accounting.

But the fact is, in btrfs_writepage_endio_finish_ordered(), page
Private2 is cleared and ordered extent accounting is executed
unconditionally.

The race prevention only happens through btrfs_invalidatepage(), where
we wait the page writeback first, before checking the Private2 bit.

This means, Private2 is also protected by Writeback bit, and there is no
need for btrfs_writepage_cow_fixup() to clear Private2.

This patch will change btrfs_writepage_cow_fixup() to just
check PagePrivate2, not to clear it.
The clear will happen either in btrfs_invalidatepage() or
btrfs_writepage_endio_finish_ordered().

This makes the Private2 bit easier to understand, just meaning the page
has unfinished ordered extent attached to it.

Signed-off-by: Qu Wenruo 


Reviewed-by: Josef Bacik 

Thanks,

Josef



Re: [PATCH 11/42] btrfs: refactor btrfs_invalidatepage()

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

This patch will refactor btrfs_invalidatepage() for the incoming subpage
support.

The involved modifications are:
- Use while() loop instead of "goto again;"
- Use single variable to determine whether to delete extent states
   Each branch will also have comments why we can or cannot delete the
   extent states
- Do qgroup free and extent states deletion per-loop
   Current code can only work for PAGE_SIZE == sectorsize case.

This refactor also makes it clear what we do for different sectors:
- Sectors without ordered extent
   We're completely safe to remove all extent states for the sector(s)

- Sectors with ordered extent, but no Private2 bit
   This means the endio has already been executed, we can't remove all
   extent states for the sector(s).

- Sectors with ordered extent, still has Private2 bit
   This means we need to decrease the ordered extent accounting.
   And then it comes to two different variants:
   * We have finished and removed the ordered extent
 Then it's the same as "sectors without ordered extent"
   * We didn't finish the ordered extent
 We can remove some extent states, but not all.

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/inode.c | 173 +--
  1 file changed, 94 insertions(+), 79 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4c894de2e813..93bb7c0482ba 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8320,15 +8320,12 @@ static void btrfs_invalidatepage(struct page *page, 
unsigned int offset,
  {
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
struct extent_io_tree *tree = &inode->io_tree;
-   struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_SIZE - 1;
-   u64 start;
-   u64 end;
+   u64 cur;
+   u32 sectorsize = inode->root->fs_info->sectorsize;
int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
-   bool found_ordered = false;
-   bool completed_ordered = false;
  
  	/*

 * We have page locked so no new ordered extent can be created on
@@ -8352,96 +8349,114 @@ static void btrfs_invalidatepage(struct page *page, 
unsigned int offset,
if (!inode_evicting)
lock_extent_bits(tree, page_start, page_end, &cached_state);
  
-	start = page_start;

-again:
-   ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 
1);
-   if (ordered) {
-   found_ordered = true;
-   end = min(page_end,
- ordered->file_offset + ordered->num_bytes - 1);
+   cur = page_start;
+   while (cur < page_end) {
+   struct btrfs_ordered_extent *ordered;
+   bool delete_states = false;
+   u64 range_end;
+
+   /*
+* Here we can't pass "file_offset = cur" and
+* "len = page_end + 1 - cur", as btrfs_lookup_ordered_range()
+* may not return the first ordered extent after @file_offset.
+*
+* Here we want to iterate through the range in byte order.
+* This is slower but definitely correct.
+*
+* TODO: Make btrfs_lookup_ordered_range() to return the
+* first ordered extent in the range to reduce the number
+* of loops.
+*/
+   ordered = btrfs_lookup_ordered_range(inode, cur, sectorsize);


How does it not find the first ordered extent after file_offset?  Looking at the 
code it just loops through and returns the first thing it finds that overlaps 
our range.  Is there a bug in btrfs_lookup_ordered_range()?


We should add some self tests to make sure these helpers are doing the right 
thing if there is in fact a bug.



+   if (!ordered) {
+   range_end = cur + sectorsize - 1;
+   /*
+* No ordered extent covering this sector, we are safe
+* to delete all extent states in the range.
+*/
+   delete_states = true;
+   goto next;
+   }
+
+   range_end = min(ordered->file_offset + ordered->num_bytes - 1,
+   page_end);
+   if (!PagePrivate2(page)) {
+   /*
+* If Private2 is cleared, it means endio has already
+* been executed for the range.
+* We can't delete the extent states as
+* btrfs_finish_ordered_io() may still use some of them.
+*/
+   delete_states = false;


delete_states is already false.


+   goto next;
+   }
+   ClearPagePrivate2(page);
+

Re: [PATCH 10/42] btrfs: update the comments in btrfs_invalidatepage()

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

The existing comments in btrfs_invalidatepage() don't really get to the
point, especially for what Private2 is really representing and how the
race avoidance is done.

The truth is, there are only three entrances to do ordered extent
accounting:
- btrfs_writepage_endio_finish_ordered()
- __endio_write_update_ordered()
   Those two entrance are just endio functions for dio and buffered
   write.

- btrfs_invalidatepage()

But there is a pitfall, in endio functions there is no check on whether
the ordered extent is already accounted.
They just blindly clear the Private2 bit and do the accounting.

So it's all btrfs_invalidatepage()'s responsibility to make sure we
won't do double account on the same sector.

That's why in btrfs_invalidatepage() we have to wait page writeback,
this will ensure all submitted bios has finished, thus their endio
functions have finished the accounting on the ordered extent.

Then we also check page Private2 to ensure that, we only run ordered
extent accounting on pages who has no bio submitted.

This patch will rework related comments to make it more clear on the
race and how we use wait_on_page_writeback() and Private2 to prevent
double accounting on ordered extent.

Signed-off-by: Qu Wenruo 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 09/42] btrfs: refactor how we finish ordered extent io for endio functions

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

Btrfs has two endio functions to mark certain io range finished for
ordered extents:
- __endio_write_update_ordered()
   This is for direct IO

- btrfs_writepage_endio_finish_ordered()
   This for buffered IO.

However they go different routines to handle ordered extent io:
- Whether to iterate through all ordered extents
   __endio_write_update_ordered() will but
   btrfs_writepage_endio_finish_ordered() will not.

   In fact, iterating through all ordered extents will benefit later
   subpage support, while for current PAGE_SIZE == sectorsize requirement
   those behavior makes no difference.

- Whether to update page Private2 flag
   __endio_write_update_ordered() will no update page Private2 flag as
   for iomap direct IO, the page can be not even mapped.
   While btrfs_writepage_endio_finish_ordered() will clear Private2 to
   prevent double accounting against btrfs_invalidatepage().

Those differences are pretty small, and the ordered extent iterations
codes in callers makes code much harder to read.

So this patch will introduce a new function,
btrfs_mark_ordered_io_finished(), to do the heavy lifting work:
- Iterate through all ordered extents in the range
- Do the ordered extent accounting
- Queue the work for finished ordered extent

This function has two new feature:
- Proper underflow detection and recover
   The old underflow detection will only detect the problem, then
   continue.
   No proper info like root/inode/ordered extent info, nor noisy enough
   to be caught by fstests.

   Furthermore when underflow happens, the ordered extent will never
   finish.

   New error detection will reset the bytes_left to 0, do proper
   kernel warning, and output extra info including root, ino, ordered
   extent range, the underflow value.

- Prevent double accounting based on Private2 flag
   Now if we find a range without Private2 flag, we will skip to next
   range.
   As that means someone else has already finished the accounting of
   ordered extent.
   This makes no difference for current code, but will be a critical part
   for incoming subpage support.

Now both endio functions only need to call that new function.

And since the only caller of btrfs_dec_test_first_ordered_pending() is
removed, also remove btrfs_dec_test_first_ordered_pending() completely.

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/inode.c|  55 +---
  fs/btrfs/ordered-data.c | 179 +++-
  fs/btrfs/ordered-data.h |   8 +-
  3 files changed, 129 insertions(+), 113 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 752f0c78e1df..645097bff5a0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3063,25 +3063,11 @@ void btrfs_writepage_endio_finish_ordered(struct 
btrfs_inode *inode,
  struct page *page, u64 start,
  u64 end, int uptodate)
  {
-   struct btrfs_fs_info *fs_info = inode->root->fs_info;
-   struct btrfs_ordered_extent *ordered_extent = NULL;
-   struct btrfs_workqueue *wq;
-
ASSERT(end + 1 - start < U32_MAX);
trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
  
-	ClearPagePrivate2(page);

-   if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
-   end - start + 1, uptodate))
-   return;
-
-   if (btrfs_is_free_space_inode(inode))
-   wq = fs_info->endio_freespace_worker;
-   else
-   wq = fs_info->endio_write_workers;
-
-   btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
-   btrfs_queue_work(wq, &ordered_extent->work);
+   btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
+  finish_ordered_fn, uptodate);
  }
  
  /*

@@ -7959,42 +7945,9 @@ static void __endio_write_update_ordered(struct 
btrfs_inode *inode,
 const u64 offset, const u64 bytes,
 const bool uptodate)
  {
-   struct btrfs_fs_info *fs_info = inode->root->fs_info;
-   struct btrfs_ordered_extent *ordered = NULL;
-   struct btrfs_workqueue *wq;
-   u64 ordered_offset = offset;
-   u64 ordered_bytes = bytes;
-   u64 last_offset;
-
-   if (btrfs_is_free_space_inode(inode))
-   wq = fs_info->endio_freespace_worker;
-   else
-   wq = fs_info->endio_write_workers;
-
ASSERT(bytes < U32_MAX);
-   while (ordered_offset < offset + bytes) {
-   last_offset = ordered_offset;
-   if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
-&ordered_offset,
-ordered_bytes,
-uptodate)) {
-   btrfs_init_wor

Re: [PATCH 08/42] btrfs: pass btrfs_inode into btrfs_writepage_endio_finish_ordered()

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

There is a pretty bad abuse of btrfs_writepage_endio_finish_ordered() in
end_compressed_bio_write().

It passes compressed pages to btrfs_writepage_endio_finish_ordered(),
which is only supposed to accept inode pages.

Thankfully the important info here is the inode, so let's pass
btrfs_inode directly into btrfs_writepage_endio_finish_ordered(), and
make @page parameter optional.

By this, end_compressed_bio_write() can happily pass page=NULL while
still get everything done properly.

Also, to cooperate with such modification, replace @page parameter for
trace_btrfs_writepage_end_io_hook() with btrfs_inode.
Although this removes page_index info, the existing start/len should be
enough for most usage.

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/compression.c   |  4 +---
  fs/btrfs/ctree.h |  3 ++-
  fs/btrfs/extent_io.c | 16 ++--
  fs/btrfs/inode.c |  9 +
  include/trace/events/btrfs.h | 19 ---
  5 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 2600703fab83..4fbe3e12be71 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -343,11 +343,9 @@ static void end_compressed_bio_write(struct bio *bio)
 * call back into the FS and do all the end_io operations
 */
inode = cb->inode;
-   cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
-   btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
+   btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
bio->bi_status == BLK_STS_OK);
-   cb->compressed_pages[0]->mapping = NULL;
  
  	end_compressed_writeback(inode, cb);

/* note, our inode could be gone now */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c858d5349c8..505bc6674bcc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3175,7 +3175,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, 
struct page *locked_page
u64 start, u64 end, int *page_started, unsigned long 
*nr_written,
struct writeback_control *wbc);
  int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
-void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+ struct page *page, u64 start,
  u64 end, int uptodate);
  extern const struct dentry_operations btrfs_dentry_operations;
  extern const struct iomap_ops btrfs_dio_iomap_ops;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7d1fca9b87f0..6d712418b67b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2711,10 +2711,13 @@ blk_status_t btrfs_submit_read_repair(struct inode 
*inode,
  
  void end_extent_writepage(struct page *page, int err, u64 start, u64 end)

  {
+   struct btrfs_inode *inode;
int uptodate = (err == 0);
int ret = 0;
  
-	btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);

+   ASSERT(page && page->mapping);
+   inode = BTRFS_I(page->mapping->host);
+   btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
  
  	if (!uptodate) {

ClearPageUptodate(page);
@@ -3739,7 +3742,8 @@ static noinline_for_stack int 
__extent_writepage_io(struct btrfs_inode *inode,
u32 iosize;
  
  		if (cur >= i_size) {

-   btrfs_writepage_endio_finish_ordered(page, cur, end, 1);
+   btrfs_writepage_endio_finish_ordered(inode, page, cur,
+end, 1);
break;
}
em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
@@ -3777,8 +3781,8 @@ static noinline_for_stack int 
__extent_writepage_io(struct btrfs_inode *inode,
if (compressed)
nr++;
else
-   btrfs_writepage_endio_finish_ordered(page, cur,
-   cur + iosize - 1, 1);
+   btrfs_writepage_endio_finish_ordered(inode,
+   page, cur, cur + iosize - 1, 1);
cur += iosize;
continue;
}
@@ -4842,8 +4846,8 @@ int extent_write_locked_range(struct inode *inode, u64 
start, u64 end,
if (clear_page_dirty_for_io(page))
ret = __extent_writepage(page, &wbc_writepages, &epd);
else {
-   btrfs_writepage_endio_finish_ordered(page, start,
-   start + PAGE_SIZE - 1, 1);
+			btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
+					page, start, start + PAGE_SIZE - 1, 1);

Re: [PATCH 07/42] btrfs: use u32 for length related members of btrfs_ordered_extent

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

Unlike btrfs_file_extent_item, btrfs_ordered_extent has its length
limit (BTRFS_MAX_EXTENT_SIZE), which is far smaller than U32_MAX.

Using u64 for those length related members are just a waste of memory.

This patch will make the following members u32:
- num_bytes
- disk_num_bytes
- bytes_left
- truncated_len

This will save 16 bytes for btrfs_ordered_extent structure.

For btrfs_add_ordered_extent*() call sites, they are mostly deeply
inside other functions passing u64.
Thus this patch will keep those u64, but do internal ASSERT() to ensure
the correct length values are passed in.

For btrfs_dec_test_.*_ordered_extent() call sites, length related
parameters are converted to u32, with extra ASSERT() added to ensure we
get correct values passed in.

There is a special conversion needed in btrfs_remove_ordered_extent(), which
needs s64, as using "-entry->num_bytes" from u32 directly will cause
underflow.

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/inode.c| 11 ---
  fs/btrfs/ordered-data.c | 21 ++---
  fs/btrfs/ordered-data.h | 25 ++---
  3 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 74ee34fc820d..554effbf307e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3066,6 +3066,7 @@ void btrfs_writepage_endio_finish_ordered(struct page 
*page, u64 start,
struct btrfs_ordered_extent *ordered_extent = NULL;
struct btrfs_workqueue *wq;
  
+	ASSERT(end + 1 - start < U32_MAX);

trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
  
  	ClearPagePrivate2(page);

@@ -7969,6 +7970,7 @@ static void __endio_write_update_ordered(struct 
btrfs_inode *inode,
else
wq = fs_info->endio_write_workers;
  
+	ASSERT(bytes < U32_MAX);

while (ordered_offset < offset + bytes) {
last_offset = ordered_offset;
if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
@@ -8415,10 +8417,13 @@ static void btrfs_invalidatepage(struct page *page, 
unsigned int offset,
if (TestClearPagePrivate2(page)) {
spin_lock_irq(&inode->ordered_tree.lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
-   ordered->truncated_len = min(ordered->truncated_len,
-start - 
ordered->file_offset);
+   ASSERT(start - ordered->file_offset < U32_MAX);
+   ordered->truncated_len = min_t(u32,
+   ordered->truncated_len,
+   start - ordered->file_offset);
spin_unlock_irq(&inode->ordered_tree.lock);
  
+			ASSERT(end - start + 1 < U32_MAX);

if (btrfs_dec_test_ordered_pending(inode, &ordered,
   start,
   end - start + 1, 1)) 
{
@@ -8937,7 +8942,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
break;
else {
btrfs_err(root->fs_info,
- "found ordered extent %llu %llu on inode 
cleanup",
+ "found ordered extent %llu %u on inode 
cleanup",
  ordered->file_offset, ordered->num_bytes);
btrfs_remove_ordered_extent(inode, ordered);
btrfs_put_ordered_extent(ordered);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 07b0b4218791..8e6d9d906bdd 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -160,6 +160,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode 
*inode, u64 file_offset
struct btrfs_ordered_extent *entry;
int ret;
  
+	/*

+* Basic size check, all length related members should be smaller
+* than U32_MAX.
+*/
+   ASSERT(num_bytes < U32_MAX && disk_num_bytes < U32_MAX);
+
if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
/* For nocow write, we can release the qgroup rsv right now */
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, 
num_bytes);
@@ -186,7 +192,7 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode 
*inode, u64 file_offset
entry->bytes_left = num_bytes;
entry->inode = igrab(&inode->vfs_inode);
entry->compress_type = compress_type;
-   entry->truncated_len = (u64)-1;
+   entry->truncated_len = (u32)-1;
entry->qgroup_rsv = ret;
entry->physical = (u64)-1;
entry->disk = NULL;
@@ -320,7 +326,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent 
*entry,
   */
  bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
   stru

Re: [PATCH 06/42] btrfs: allow btrfs_bio_fits_in_stripe() to accept bio without any page

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

Function btrfs_bio_fits_in_stripe() now requires a bio with at least one
page added.
Or btrfs_get_chunk_map() will fail with -ENOENT.

But in fact this requirement is not needed at all, as we can just pass
sectorsize for btrfs_get_chunk_map().

This tiny behavior change is important for later subpage refactor on
submit_extent_page().

As for 64K page size, we can have a page range with pgoff=0 and
size=64K.
If the logical bytenr is just 16K before the stripe boundary, we have to
split the page range into two bios.

This means we must check the page range against the stripe boundary, even when
adding the range to an empty bio.

This tiny refactor is for the incoming change, but on its own, regular
sectorsize == PAGE_SIZE is not affected anyway.

Signed-off-by: Qu Wenruo 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 05/42] btrfs: remove the unused parameter @len for btrfs_bio_fits_in_stripe()

2021-04-16 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

The parameter @len is not really used in btrfs_bio_fits_in_stripe(),
just remove it.

Signed-off-by: Qu Wenruo 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 02/42] btrfs: introduce write_one_subpage_eb() function

2021-04-16 Thread Josef Bacik

On 4/15/21 7:25 PM, Qu Wenruo wrote:



On 2021/4/16 上午3:03, Josef Bacik wrote:

On 4/15/21 1:04 AM, Qu Wenruo wrote:

The new function, write_one_subpage_eb(), as a subroutine for subpage
metadata write, will handle the extent buffer bio submission.

The major differences between the new write_one_subpage_eb() and
write_one_eb() is:
- No page locking
   When entering write_one_subpage_eb() the page is no longer locked.
   We only lock the page for its status update, and unlock immediately.
   Now we completely rely on extent io tree locking.

- Extra bitmap update along with page status update
   Now page dirty and writeback is controlled by
   btrfs_subpage::dirty_bitmap and btrfs_subpage::writeback_bitmap.
   They both follow the schema that any sector is dirty/writeback, then
   the full page get dirty/writeback.

- When to update the nr_written number
   Now we take a short cut, if we have cleared the last dirty bit of the
   page, we update nr_written.
   This is not completely perfect, but should emulate the old behavior
   good enough.

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/extent_io.c | 55 
  1 file changed, 55 insertions(+)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 21a14b1cb065..f32163a465ec 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4196,6 +4196,58 @@ static void
end_bio_extent_buffer_writepage(struct bio *bio)
  bio_put(bio);
  }
+/*
+ * Unlike the work in write_one_eb(), we rely completely on extent
+ * locking.
+ * Page locking is only utilized minimally to keep the VM code happy.
+ *
+ * Callers should still use write_one_eb() rather than calling this
+ * function directly, as write_one_eb() has extra preparation before
+ * submitting the extent buffer.
+ */
+static int write_one_subpage_eb(struct extent_buffer *eb,
+    struct writeback_control *wbc,
+    struct extent_page_data *epd)
+{
+    struct btrfs_fs_info *fs_info = eb->fs_info;
+    struct page *page = eb->pages[0];
+    unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+    bool no_dirty_ebs = false;
+    int ret;
+
+    /* clear_page_dirty_for_io() in subpage helper need page locked. */
+    lock_page(page);
+    btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
+
+    /* If we're the last dirty bit to update nr_written */
+    no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
+  eb->start, eb->len);
+    if (no_dirty_ebs)
+    clear_page_dirty_for_io(page);
+
+    ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page,
+    eb->start, eb->len, eb->start - page_offset(page),
+    &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0,
+    false);
+    if (ret) {
+    btrfs_subpage_clear_writeback(fs_info, page, eb->start,
+  eb->len);
+    set_btree_ioerr(page, eb);
+    unlock_page(page);
+
+    if (atomic_dec_and_test(&eb->io_pages))
+    end_extent_buffer_writeback(eb);
+    return -EIO;
+    }
+    unlock_page(page);
+    /*
+ * Submission finishes without problem, if no range of the page is
+ * dirty anymore, we have submitted a page.
+ * Update the nr_written in wbc.
+ */
+    if (no_dirty_ebs)
+    update_nr_written(wbc, 1);
+    return ret;
+}
+
  static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
  struct writeback_control *wbc,
  struct extent_page_data *epd)
@@ -4227,6 +4279,9 @@ static noinline_for_stack int
write_one_eb(struct extent_buffer *eb,
  memzero_extent_buffer(eb, start, end - start);
  }
+    if (eb->fs_info->sectorsize < PAGE_SIZE)
+    return write_one_subpage_eb(eb, wbc, epd);
+


Same comment here, again you're calling write_one_eb() which expects to
do the eb thing, but then later have an entirely different path for the
subpage stuff, and thus could just call your write_one_subpage_eb()
helper from there instead of stuffing it into write_one_eb().


But there are some common code before calling the subpage routine.

I don't think it's a good idea to have duplicated code between subpage
and regular routine.



Ah I missed the part at the top for zero'ing out the buffer.  In that case turn 
that into a helper function and then keep the paths separate.  Thanks,


Josef


Re: [PATCH 04/42] btrfs: introduce submit_eb_subpage() to submit a subpage metadata page

2021-04-16 Thread Josef Bacik

On 4/15/21 7:28 PM, Qu Wenruo wrote:



On 2021/4/16 上午3:27, Josef Bacik wrote:

On 4/15/21 1:04 AM, Qu Wenruo wrote:

The new function, submit_eb_subpage(), will submit all the dirty extent
buffers in the page.

The major difference between submit_eb_page() and submit_eb_subpage()
is:
- How to grab extent buffer
   Now we use find_extent_buffer_nospinlock() instead of using
   page::private.

All other different handling is already done in functions like
lock_extent_buffer_for_io() and write_one_eb().

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/extent_io.c | 95 
  1 file changed, 95 insertions(+)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c068c2fcba09..7d1fca9b87f0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4323,6 +4323,98 @@ static noinline_for_stack int
write_one_eb(struct extent_buffer *eb,
  return ret;
  }
+/*
+ * Submit one subpage btree page.
+ *
+ * The main difference from submit_eb_page() is:
+ * - Page locking
+ *   For subpage, we don't rely on page locking at all.
+ *
+ * - Flush write bio
+ *   We only flush bio if we may be unable to fit current extent
buffers into
+ *   current bio.
+ *
+ * Return >=0 for the number of submitted extent buffers.
+ * Return <0 for fatal error.
+ */
+static int submit_eb_subpage(struct page *page,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+    struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+    int submitted = 0;
+    u64 page_start = page_offset(page);
+    int bit_start = 0;
+    int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
+    int sectors_per_node = fs_info->nodesize >>
fs_info->sectorsize_bits;
+    int ret;
+
+    /* Lock and write each dirty extent buffers in the range */
+    while (bit_start < nbits) {
+    struct btrfs_subpage *subpage = (struct btrfs_subpage
*)page->private;
+    struct extent_buffer *eb;
+    unsigned long flags;
+    u64 start;
+
+    /*
+ * Take private lock to ensure the subpage won't be detached
+ * halfway.
+ */
+    spin_lock(&page->mapping->private_lock);
+    if (!PagePrivate(page)) {
+    spin_unlock(&page->mapping->private_lock);
+    break;
+    }
+    spin_lock_irqsave(&subpage->lock, flags);


writepages doesn't get called with irq context, so you can just do
spin_lock_irq()/spin_unlock_irq().


But this spinlock is used in endio function.
If we don't use irqsave variant here, won't an endio interruption call
sneak in and screw up everything?



No, you use irqsave if the function can be called under irq.  So in the endio 
call you do irqsave.  This function can't be called by an interrupt handler, so 
just _irq() is fine because you just need to disable irq's.  Thanks,


Josef


Re: [PATCH 04/42] btrfs: introduce submit_eb_subpage() to submit a subpage metadata page

2021-04-15 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

The new function, submit_eb_subpage(), will submit all the dirty extent
buffers in the page.

The major difference between submit_eb_page() and submit_eb_subpage()
is:
- How to grab extent buffer
   Now we use find_extent_buffer_nospinlock() instead of using
   page::private.

All other different handling is already done in functions like
lock_extent_buffer_for_io() and write_one_eb().

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/extent_io.c | 95 
  1 file changed, 95 insertions(+)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c068c2fcba09..7d1fca9b87f0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4323,6 +4323,98 @@ static noinline_for_stack int write_one_eb(struct 
extent_buffer *eb,
return ret;
  }
  
+/*

+ * Submit one subpage btree page.
+ *
+ * The main difference from submit_eb_page() is:
+ * - Page locking
+ *   For subpage, we don't rely on page locking at all.
+ *
+ * - Flush write bio
+ *   We only flush bio if we may be unable to fit current extent buffers into
+ *   current bio.
+ *
+ * Return >=0 for the number of submitted extent buffers.
+ * Return <0 for fatal error.
+ */
+static int submit_eb_subpage(struct page *page,
+struct writeback_control *wbc,
+struct extent_page_data *epd)
+{
+   struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+   int submitted = 0;
+   u64 page_start = page_offset(page);
+   int bit_start = 0;
+   int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
+   int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
+   int ret;
+
+   /* Lock and write each dirty extent buffers in the range */
+   while (bit_start < nbits) {
+   struct btrfs_subpage *subpage = (struct btrfs_subpage 
*)page->private;
+   struct extent_buffer *eb;
+   unsigned long flags;
+   u64 start;
+
+   /*
+* Take private lock to ensure the subpage won't be detached
+* halfway.
+*/
+   spin_lock(&page->mapping->private_lock);
+   if (!PagePrivate(page)) {
+   spin_unlock(&page->mapping->private_lock);
+   break;
+   }
+   spin_lock_irqsave(&subpage->lock, flags);


writepages doesn't get called with irq context, so you can just do 
spin_lock_irq()/spin_unlock_irq().



+   if (!((1 << bit_start) & subpage->dirty_bitmap)) {


Can we make this a helper so it's more clear what's going on here?  Thanks,

Josef


Re: [PATCH 03/42] btrfs: make lock_extent_buffer_for_io() to be subpage compatible

2021-04-15 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

For subpage metadata, we don't use page locking at all.
So just skip the page locking part for subpage.

All the remaining routine can be reused.

Signed-off-by: Qu Wenruo 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 02/42] btrfs: introduce write_one_subpage_eb() function

2021-04-15 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

The new function, write_one_subpage_eb(), as a subroutine for subpage
metadata write, will handle the extent buffer bio submission.

The major differences between the new write_one_subpage_eb() and
write_one_eb() is:
- No page locking
   When entering write_one_subpage_eb() the page is no longer locked.
   We only lock the page for its status update, and unlock immediately.
   Now we completely rely on extent io tree locking.

- Extra bitmap update along with page status update
   Now page dirty and writeback is controlled by
   btrfs_subpage::dirty_bitmap and btrfs_subpage::writeback_bitmap.
   They both follow the schema that any sector is dirty/writeback, then
   the full page get dirty/writeback.

- When to update the nr_written number
   Now we take a short cut, if we have cleared the last dirty bit of the
   page, we update nr_written.
   This is not completely perfect, but should emulate the old behavior
   good enough.

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/extent_io.c | 55 
  1 file changed, 55 insertions(+)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 21a14b1cb065..f32163a465ec 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4196,6 +4196,58 @@ static void end_bio_extent_buffer_writepage(struct bio 
*bio)
bio_put(bio);
  }
  
+/*

+ * Unlike the work in write_one_eb(), we rely completely on extent locking.
+ * Page locking is only utilized minimally to keep the VM code happy.
+ *
+ * Callers should still use write_one_eb() rather than calling this function
+ * directly, as write_one_eb() has extra preparation before submitting the
+ * extent buffer.
+ */
+static int write_one_subpage_eb(struct extent_buffer *eb,
+   struct writeback_control *wbc,
+   struct extent_page_data *epd)
+{
+   struct btrfs_fs_info *fs_info = eb->fs_info;
+   struct page *page = eb->pages[0];
+   unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+   bool no_dirty_ebs = false;
+   int ret;
+
+   /* clear_page_dirty_for_io() in subpage helper need page locked. */
+   lock_page(page);
+   btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
+
+   /* If we're the last dirty bit to update nr_written */
+   no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
+ eb->start, eb->len);
+   if (no_dirty_ebs)
+   clear_page_dirty_for_io(page);
+
+   ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page,
+   eb->start, eb->len, eb->start - page_offset(page),
+   &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0,
+   false);
+   if (ret) {
+   btrfs_subpage_clear_writeback(fs_info, page, eb->start,
+ eb->len);
+   set_btree_ioerr(page, eb);
+   unlock_page(page);
+
+   if (atomic_dec_and_test(&eb->io_pages))
+   end_extent_buffer_writeback(eb);
+   return -EIO;
+   }
+   unlock_page(page);
+   /*
+* Submission finishes without problem, if no range of the page is
+* dirty anymore, we have submitted a page.
+* Update the nr_written in wbc.
+*/
+   if (no_dirty_ebs)
+   update_nr_written(wbc, 1);
+   return ret;
+}
+
  static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
struct extent_page_data *epd)
@@ -4227,6 +4279,9 @@ static noinline_for_stack int write_one_eb(struct 
extent_buffer *eb,
memzero_extent_buffer(eb, start, end - start);
}
  
+	if (eb->fs_info->sectorsize < PAGE_SIZE)

+   return write_one_subpage_eb(eb, wbc, epd);
+


Same comment here, again you're calling write_one_eb() which expects to do the 
eb thing, but then later have an entirely different path for the subpage stuff, 
and thus could just call your write_one_subpage_eb() helper from there instead 
of stuffing it into write_one_eb().


Also, I generally don't care about ordering of patches as long as they make 
sense generally.


However in this case if you were to bisect to just this patch you would be 
completely screwed, as the normal write path would just fail to write the other 
eb's on the page.  You really need to have the patches that do the 
write_cache_pages part done first, and then have this patch.


Or alternatively, leave the order as it is, and simply don't wire the helper up 
until you implement the subpage writepages further down.  That may be better, 
you won't have to re-order anything and you can maintain these smaller chunks 
for review, which may not be possible if you re-order them.  Thanks,


Josef


Re: [PATCH 01/42] btrfs: introduce end_bio_subpage_eb_writepage() function

2021-04-15 Thread Josef Bacik

On 4/15/21 1:04 AM, Qu Wenruo wrote:

The new function, end_bio_subpage_eb_writepage(), will handle the
metadata writeback endio.

The major differences involved are:
- How to grab extent buffer
   Now page::private is a pointer to btrfs_subpage, we can no longer grab
   extent buffer directly.
   Thus we need to use the bv_offset to locate the extent buffer manually
   and iterate through the whole range.

- Use btrfs_subpage_end_writeback() caller
   This helper will handle the subpage writeback for us.

Since this function is executed under endio context, when grabbing
extent buffers it can't grab eb->refs_lock as that lock is not designed
to be grabbed under hardirq context.

So here introduce a helper, find_extent_buffer_nospinlock(), for such
situation, and convert find_extent_buffer() to use that helper.

Signed-off-by: Qu Wenruo 
---
  fs/btrfs/extent_io.c | 135 +--
  1 file changed, 106 insertions(+), 29 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a50adbd8808d..21a14b1cb065 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4080,13 +4080,97 @@ static void set_btree_ioerr(struct page *page, struct 
extent_buffer *eb)
}
  }
  
+/*

+ * This is the endio specific version which won't touch any unsafe spinlock
+ * in endio context.
+ */
+static struct extent_buffer *find_extent_buffer_nospinlock(
+   struct btrfs_fs_info *fs_info, u64 start)
+{
+   struct extent_buffer *eb;
+
+   rcu_read_lock();
+   eb = radix_tree_lookup(&fs_info->buffer_radix,
+  start >> fs_info->sectorsize_bits);
+   if (eb && atomic_inc_not_zero(&eb->refs)) {
+   rcu_read_unlock();
+   return eb;
+   }
+   rcu_read_unlock();
+   return NULL;
+}
+/*
+ * The endio function for subpage extent buffer write.
+ *
+ * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
+ * after all extent buffers in the page has finished their writeback.
+ */
+static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info,
+struct bio *bio)
+{
+   struct bio_vec *bvec;
+   struct bvec_iter_all iter_all;
+
+   ASSERT(!bio_flagged(bio, BIO_CLONED));
+   bio_for_each_segment_all(bvec, bio, iter_all) {
+   struct page *page = bvec->bv_page;
+   u64 bvec_start = page_offset(page) + bvec->bv_offset;
+   u64 bvec_end = bvec_start + bvec->bv_len - 1;
+   u64 cur_bytenr = bvec_start;
+
+   ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
+
+   /* Iterate through all extent buffers in the range */
+   while (cur_bytenr <= bvec_end) {
+   struct extent_buffer *eb;
+   int done;
+
+   /*
+* Here we can't use find_extent_buffer(), as it may
+* try to lock eb->refs_lock, which is not safe in endio
+* context.
+*/
+   eb = find_extent_buffer_nospinlock(fs_info, cur_bytenr);
+   ASSERT(eb);
+
+   cur_bytenr = eb->start + eb->len;
+
+   ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
+   done = atomic_dec_and_test(&eb->io_pages);
+   ASSERT(done);
+
+   if (bio->bi_status ||
+   test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
+   ClearPageUptodate(page);
+   set_btree_ioerr(page, eb);
+   }
+
+   btrfs_subpage_clear_writeback(fs_info, page, eb->start,
+ eb->len);
+   end_extent_buffer_writeback(eb);
+   /*
+* free_extent_buffer() will grab spinlock which is not
+* safe in endio context. Thus here we manually dec
+* the ref.
+*/
+   atomic_dec(&eb->refs);
+   }
+   }
+   bio_put(bio);
+}
+
  static void end_bio_extent_buffer_writepage(struct bio *bio)
  {
+   struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct extent_buffer *eb;
int done;
struct bvec_iter_all iter_all;
  
+	fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);

+   if (fs_info->sectorsize < PAGE_SIZE)
+   return end_bio_subpage_eb_writepage(fs_info, bio);
+


You replace the write_one_eb() call with one specifically for subpage, why not 
just use your special endio from there without polluting the normal writepage 
helper?  Thanks,


Josef


Re: [PATCH v4 3/3] btrfs: zoned: automatically reclaim zones

2021-04-15 Thread Josef Bacik

On 4/15/21 9:58 AM, Johannes Thumshirn wrote:

When a file gets deleted on a zoned file system, the space freed is not
returned back into the block group's free space, but is migrated to
zone_unusable.

As this zone_unusable space is behind the current write pointer it is not
possible to use it for new allocations. In the current implementation a
zone is reset once all of the block group's space is accounted as zone
unusable.

This behaviour can lead to premature ENOSPC errors on a busy file system.

Instead of only reclaiming the zone once it is completely unusable,
kick off a reclaim job once the amount of unusable bytes exceeds a user
configurable threshold between 51% and 100%. It can be set per mounted
filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75%
per default.

Similar to reclaiming unused block groups, these dirty block groups are
added to a to_reclaim list and then on a transaction commit, the reclaim
process is triggered but after we deleted unused block groups, which will
free space for the relocation process.

Signed-off-by: Johannes Thumshirn 
---
  fs/btrfs/block-group.c   | 96 
  fs/btrfs/block-group.h   |  3 ++
  fs/btrfs/ctree.h |  6 +++
  fs/btrfs/disk-io.c   | 13 +
  fs/btrfs/free-space-cache.c  |  9 +++-
  fs/btrfs/sysfs.c | 35 +
  fs/btrfs/volumes.c   |  2 +-
  fs/btrfs/volumes.h   |  1 +
  include/trace/events/btrfs.h | 12 +
  9 files changed, 175 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index bbb5a6e170c7..3f06ea42c013 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1485,6 +1485,92 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
  }
  
+void btrfs_reclaim_bgs_work(struct work_struct *work)

+{
+   struct btrfs_fs_info *fs_info =
+   container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
+   struct btrfs_block_group *bg;
+   struct btrfs_space_info *space_info;
+   int ret = 0;
+
+   if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+   return;
+
+   if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+   return;
+
+   mutex_lock(&fs_info->reclaim_bgs_lock);
+   spin_lock(&fs_info->unused_bgs_lock);
+   while (!list_empty(&fs_info->reclaim_bgs)) {
+   bg = list_first_entry(&fs_info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+   list_del_init(&bg->bg_list);
+
+   space_info = bg->space_info;
+   spin_unlock(&fs_info->unused_bgs_lock);
+
+   /* Don't want to race with allocators so take the groups_sem */
+   down_write(&space_info->groups_sem);
+
+   spin_lock(&bg->lock);
+   if (bg->reserved || bg->pinned || bg->ro) {
+   /*
+* We want to bail if we made new allocations or have
+* outstanding allocations in this block group.  We do
+* the ro check in case balance is currently acting on
+* this block group.
+*/
+   spin_unlock(&bg->lock);
+   up_write(&space_info->groups_sem);
+   goto next;
+   }
+   spin_unlock(&bg->lock);
+


Here I think we want a

if (btrfs_fs_closing(fs_info))
goto next;

so we don't block out a umount for all eternity.  Thanks,

Josef


Re: [PATCH v4 2/3] btrfs: rename delete_unused_bgs_mutex

2021-04-15 Thread Josef Bacik

On 4/15/21 9:58 AM, Johannes Thumshirn wrote:

As a preparation for another user, rename the unused_bgs_mutex into
reclaim_bgs_lock.

Signed-off-by: Johannes Thumshirn 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH v4 1/3] btrfs: zoned: reset zones of relocated block groups

2021-04-15 Thread Josef Bacik

On 4/15/21 9:58 AM, Johannes Thumshirn wrote:

When relocating a block group the freed up space is not discarded in one
big block, but each extent is discarded on its own with -o discard=sync.

For a zoned filesystem we need to discard the whole block group at once,
so btrfs_discard_extent() will translate the discard into a
REQ_OP_ZONE_RESET operation, which then resets the device's zone.

Link: 
https://lore.kernel.org/linux-btrfs/459e2932c48e12e883dcfd3dda828d9da251d5b5.1617962110.git.johannes.thumsh...@wdc.com
Signed-off-by: Johannes Thumshirn 


What would be cool is if we could disable discard per bg so we don't discard at 
all during the relocation, and then discard the whole block group no matter if 
we have zoned or not.  However not really something you need to do, just 
thinking out loud


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [RFC v3 0/2] vfs / btrfs: add support for ustat()

2021-04-15 Thread Josef Bacik

On 4/15/21 1:53 PM, Luis Chamberlain wrote:

On Wed, Aug 23, 2017 at 3:31 PM Jeff Mahoney  wrote:


On 8/15/14 5:29 AM, Al Viro wrote:

On Thu, Aug 14, 2014 at 07:58:56PM -0700, Luis R. Rodriguez wrote:


Christoph had noted that this seemed associated to the problem
that the btrfs uses different assignments for st_dev than s_dev,
but much as I'd like to see that changed based on discussions so
far its unclear if this is going to be possible unless strong
commitment is reached.


Resurrecting a dead thread since we've been carrying this patch anyway
since then.


Explain, please.  Whose commitment and commitment to what, exactly?
Having different ->st_dev values for different files on the same
fs is a bloody bad idea; why does btrfs do that at all?  If nothing else,
it breaks the usual "are those two files on the same fs?" tests...


It's because btrfs snapshots would have inode number collisions.
Changing the inode numbers for snapshots would negate a big benefit of
btrfs snapshots: the quick creation and lightweight on-disk
representation due to metadata sharing.

The thing is that ustat() used to work.  Your commit 0ee5dc676a5f8
(btrfs: kill magical embedded struct superblock) had a regression:
Since it replaced the superblock with a simple dev_t, it rendered the
device no longer discoverable by user_get_super.  We need a list_head to
attach for searching.

There's an argument that this is hacky.  It's valid.  The only other
feedback I've heard is to use a real superblock for subvolumes to do
this instead.  That doesn't work either, due to things like freeze/thaw
and inode writeback.  Ultimately, what we need is a single file system
with multiple namespaces.  Years ago we just needed different inode
namespaces, but as people have started adopting btrfs for containers, we
need more than that.  I've heard requests for per-subvolume security
contexts.  I'd imagine user namespaces are on someone's wish list.  A
working df can be done with ->d_automount, but the way btrfs handles
having a "canonical" subvolume location has always been a way to avoid
directory loops.  I'd like to just automount subvolumes everywhere
they're referenced.  One solution, for which I have no code yet, is to
have something like a superblock-light that we can hang things like a
security context, a user namespace, and an anonymous dev.  Most file
systems would have just one.  Btrfs would have one per subvolume.

That's a big project with a bunch of discussion.


4 years have gone by and this patch is still being carried around for
btrfs. Other than resolving this ustat() issue for btrfs are there new
reasons to support this effort done to be done properly? Are there
other filesystems that would benefit? I'd like to get an idea of the
stakeholder here before considering taking this on or not.



Not really sure why this needs to be addressed, we have statfs(), and what we 
have has worked forever now.  There's a lot of larger things that need to be 
addressed in general to support the volume approach inside file systems that is 
going to require a lot of work inside of VFS.  If you feel like tackling that 
work and then wiring up btrfs by all means have at it, but I'm not seeing an 
urgent need to address this.  Thanks,


Josef



Re: [PATCH] btrfs: fix race between transaction aborts and fsyncs leading to use-after-free

2021-04-08 Thread Josef Bacik
also setting BTRFS_FS_STATE_ERROR on fs_info->fs_state.

After that, at cleanup_transaction(), it deletes the transaction from
the list of transactions (fs_info->trans_list), sets the transaction
to the state TRANS_STATE_COMMIT_DOING and then waits for the number
of writers to go down to 1, as it's currently 2 (1 for task A and 1
for task B);

6) The transaction kthread is running and sees that BTRFS_FS_STATE_ERROR
is set in fs_info->fs_state, so it calls btrfs_cleanup_transaction().

There it sees the list fs_info->trans_list is empty, and then proceeds
into calling btrfs_drop_all_logs(), which frees the log root tree with
a call to btrfs_free_log_root_tree();

7) Task B calls write_all_supers() and, shortly after, under the label
'out_wake_log_root', it dereferences the pointer stored in
'log_root_tree', which was already freed in the previous step by the
transaction kthread. This results in a use-after-free leading to a
crash.

Fix this by deleting the transaction from the list of transactions at
cleanup_transaction() only after setting the transaction state to
TRANS_STATE_COMMIT_DOING and waiting for all existing tasks that are
attached to the transaction to release their transaction handles.
This makes the transaction kthread wait for all the tasks attached to
the transaction to be done with the transaction before dropping the
log roots and doing other cleanups.

Fixes: ef67963dac255b ("btrfs: drop logs when we've aborted a transaction")
Signed-off-by: Filipe Manana 


Nice catch,

Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH] btrfs-progs: Fix null pointer deref in balance_level

2021-04-08 Thread Josef Bacik

On 4/6/21 9:55 AM, Nikolay Borisov wrote:

In case the right buffer is emptied it's first set to null and
subsequently it's dereferenced to get its size to pass to root_sub_used.
This naturally leads to a null pointer dereference. The correct thing
to do is to pass the stashed right->len in "blocksize".

Fixes #296

Signed-off-by: Nikolay Borisov 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH] btrfs: return whole extents in fiemap

2021-04-08 Thread Josef Bacik

On 4/6/21 6:31 PM, Boris Burkov wrote:

`xfs_io -c 'fiemap  ' `
can give surprising results on btrfs that differ from xfs.

btrfs spits out extents trimmed to fit the user input. If the user's
fiemap request has an offset, then rather than returning each whole
extent which intersects that range, we also trim the start extent to not
have start < off.

Documentation in filesystems/fiemap.txt and the xfs_io man page suggests
that returning the whole extent is expected.

Some cases which all yield the same fiemap in xfs, but not btrfs:
dd if=/dev/zero of=$f bs=4k count=1
sudo xfs_io -c 'fiemap 0 1024' $f
   0: [0..7]: 26624..26631
sudo xfs_io -c 'fiemap 2048 1024' $f
   0: [4..7]: 26628..26631
sudo xfs_io -c 'fiemap 2048 4096' $f
   0: [4..7]: 26628..26631
sudo xfs_io -c 'fiemap 3584 512' $f
   0: [7..7]: 26631..26631
sudo xfs_io -c 'fiemap 4091 5' $f
   0: [7..6]: 26631..26630

I believe this is a consequence of the logic for merging contiguous
extents represented by separate extent items. That logic needs to track
the last offset as it loops through the extent items, which happens to
pick up the start offset on the first iteration, and trim off the
beginning of the full extent. To fix it, start `off` at 0 rather than
`start` so that we keep the iteration/merging intact without cutting off
the start of the extent.

after the fix, all the above commands give:
0: [0..7]: 26624..26631

The merging logic is exercised by xfstest generic/483, and I have
written a new xfstest for checking we don't have backwards or
zero-length fiemaps for cases like those above.

Signed-off-by: Boris Burkov 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH v2] btrfs: do more graceful error/warning for 32bit kernel

2021-04-08 Thread Josef Bacik

On 2/24/21 8:18 PM, Qu Wenruo wrote:

Due to the pagecache limit of 32bit systems, btrfs can't access metadata
at or beyond (ULONG_MAX + 1) << PAGE_SHIFT.
This is 16T for 4K page size while 256T for 64K page size.

And unlike other fses, btrfs uses internally mapped u64 address space for
all of its metadata, this is more tricky than other fses.

Users can have a fs which doesn't have metadata beyond the boundary at
mount time, but later balance can cause btrfs to create metadata beyond
the boundary.

And modification to MM layer is unrealistic just for such minor use
case.

To address such problem, this patch will introduce the following checks,
much like how XFS handles such problem:

- Mount time rejection
   This will reject any fs which has metadata chunk at or beyond the
   boundary.

- Mount time early warning
   If there is any metadata chunk beyond 5/8 of the boundary, we do an
   early warning and hope the end user will see it.

- Runtime extent buffer rejection
   If we're going to allocate an extent buffer at or beyond the boundary,
   reject such request with -EOVERFLOW.
   This is definitely going to cause problems like transaction abort, but
   we have no better ways.

- Runtime extent buffer early warning
   If an extent buffer beyond 5/8 of the max file size is allocated, do
   an early warning.

Above error/warning message will only be outputted once for each fs to
reduce dmesg flood.

Reported-by: Erik Jensen 
Signed-off-by: Qu Wenruo 


This doesn't apply cleanly to misc-next so it needs to be respun, but otherwise

Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH] btrfs: Correct try_lock_extent() usage in read_extent_buffer_subpage()

2021-04-08 Thread Josef Bacik

On 4/8/21 8:40 AM, Goldwyn Rodrigues wrote:

try_lock_extent() returns 1 on success or 0 for failure and not an error
code. If try_lock_extent() fails, read_extent_buffer_subpage() returns
zero indicating subpage extent read success.

Return EAGAIN/EWOULDBLOCK if try_lock_extent() fails in locking the
extent.

Signed-off-by: Goldwyn Rodrigues 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: ENOSPC in btrfs_run_delayed_refs with 5.10.8

2021-04-08 Thread Josef Bacik

On 4/8/21 12:10 PM, Martin Raiber wrote:

On 11.03.2021 18:58 Martin Raiber wrote:

On 01.02.2021 23:08 Martin Raiber wrote:

On 27.01.2021 22:03 Chris Murphy wrote:

On Wed, Jan 27, 2021 at 10:27 AM Martin Raiber  wrote:

Hi,

seems 5.10.8 still has the ENOSPC issue when compression is used 
(compress-force=zstd,space_cache=v2):


Jan 27 11:02:14  kernel: [248571.569840] [ cut here ]
Jan 27 11:02:14  kernel: [248571.569843] BTRFS: Transaction aborted (error 
-28)
Jan 27 11:02:14  kernel: [248571.569845] BTRFS: error (device dm-0) in 
add_to_free_space_tree:1039: errno=-28 No space left
Jan 27 11:02:14  kernel: [248571.569848] BTRFS info (device dm-0): forced 
readonly
Jan 27 11:02:14  kernel: [248571.569851] BTRFS: error (device dm-0) in 
add_to_free_space_tree:1039: errno=-28 No space left
Jan 27 11:02:14  kernel: [248571.569852] BTRFS: error (device dm-0) in 
__btrfs_free_extent:3270: errno=-28 No space left
Jan 27 11:02:14  kernel: [248571.569854] BTRFS: error (device dm-0) in 
btrfs_run_delayed_refs:2191: errno=-28 No space left
Jan 27 11:02:14  kernel: [248571.569898] WARNING: CPU: 3 PID: 21255 at 
fs/btrfs/free-space-tree.c:1039 add_to_free_space_tree+0xe8/0x130
Jan 27 11:02:14  kernel: [248571.569913] BTRFS: error (device dm-0) in 
__btrfs_free_extent:3270: errno=-28 No space left

Jan 27 11:02:14  kernel: [248571.569939] Modules linked in:
Jan 27 11:02:14  kernel: [248571.569966] BTRFS: error (device dm-0) in 
btrfs_run_delayed_refs:2191: errno=-28 No space left
Jan 27 11:02:14  kernel: [248571.569992]  bfq zram bcache crc64 loop 
dm_crypt xfs dm_mod st sr_mod cdrom nf_tables nfnetlink iptable_filter 
bridge stp llc intel_powerclamp coretemp k$
Jan 27 11:02:14  kernel: [248571.570075] CPU: 3 PID: 21255 Comm: 
kworker/u50:22 Tainted: G  I   5.10.8 #1
Jan 27 11:02:14  kernel: [248571.570076] Hardware name: Dell Inc. PowerEdge 
R510/0DPRKF, BIOS 1.13.0 03/02/2018
Jan 27 11:02:14  kernel: [248571.570079] Workqueue: events_unbound 
btrfs_async_reclaim_metadata_space
Jan 27 11:02:14  kernel: [248571.570081] RIP: 
0010:add_to_free_space_tree+0xe8/0x130
Jan 27 11:02:14  kernel: [248571.570082] Code: 55 50 f0 48 0f ba aa 40 0a 
00 00 02 72 22 83 f8 fb 74 4c 83 f8 e2 74 47 89 c6 48 c7 c7 b8 39 49 82 89 
44 24 04 e8 8a 99 4a 00 <0f> 0b 8$
Jan 27 11:02:14  kernel: [248571.570083] RSP: 0018:c90009c57b88 EFLAGS: 
00010282
Jan 27 11:02:14  kernel: [248571.570084] RAX:  RBX: 
4000 RCX: 0027
Jan 27 11:02:14  kernel: [248571.570085] RDX: 0027 RSI: 
0004 RDI: 888617a58b88
Jan 27 11:02:14  kernel: [248571.570086] RBP: 8889ecb874e0 R08: 
888617a58b80 R09: 
Jan 27 11:02:14  kernel: [248571.570087] R10: 0001 R11: 
822372e0 R12: 00574151
Jan 27 11:02:14  kernel: [248571.570087] R13: 8884e05727e0 R14: 
88815ae4fc00 R15: 88815ae4fdd8
Jan 27 11:02:14  kernel: [248571.570088] FS:  () 
GS:888617a4() knlGS:
Jan 27 11:02:14  kernel: [248571.570089] CS:  0010 DS:  ES:  CR0: 
80050033
Jan 27 11:02:14  kernel: [248571.570090] CR2: 7eb4a3a4f00a CR3: 
0260a005 CR4: 000206e0

Jan 27 11:02:14  kernel: [248571.570091] Call Trace:
Jan 27 11:02:14  kernel: [248571.570097]  
__btrfs_free_extent.isra.0+0x56a/0xa10

Jan 27 11:02:14  kernel: [248571.570100]  __btrfs_run_delayed_refs+0x659/0xf20
Jan 27 11:02:14  kernel: [248571.570102]  btrfs_run_delayed_refs+0x73/0x200
Jan 27 11:02:14  kernel: [248571.570103]  flush_space+0x4e8/0x5e0
Jan 27 11:02:14  kernel: [248571.570105]  ? btrfs_get_alloc_profile+0x66/0x1b0
Jan 27 11:02:14  kernel: [248571.570106]  ? btrfs_get_alloc_profile+0x66/0x1b0
Jan 27 11:02:14  kernel: [248571.570107]  
btrfs_async_reclaim_metadata_space+0x107/0x3a0

Jan 27 11:02:14  kernel: [248571.570111]  process_one_work+0x1b6/0x350
Jan 27 11:02:14  kernel: [248571.570112]  worker_thread+0x50/0x3b0
Jan 27 11:02:14  kernel: [248571.570114]  ? process_one_work+0x350/0x350
Jan 27 11:02:14  kernel: [248571.570116]  kthread+0xfe/0x140
Jan 27 11:02:14  kernel: [248571.570117]  ? kthread_park+0x90/0x90
Jan 27 11:02:14  kernel: [248571.570120]  ret_from_fork+0x22/0x30
Jan 27 11:02:14  kernel: [248571.570122] ---[ end trace 568d2f30de65b1c0 ]---
Jan 27 11:02:14  kernel: [248571.570123] BTRFS: error (device dm-0) in 
add_to_free_space_tree:1039: errno=-28 No space left
Jan 27 11:02:14  kernel: [248571.570151] BTRFS: error (device dm-0) in 
__btrfs_free_extent:3270: errno=-28 No space left
Jan 27 11:02:14  kernel: [248571.570178] BTRFS: error (device dm-0) in 
btrfs_run_delayed_refs:2191: errno=-28 No space left



btrfs fi usage:

Overall:
 Device size: 931.49GiB
 Device allocated:    931.49GiB
 Device unallocated:    1.00MiB
 Device missing:  0.00B
 Used:    786.39GiB
 Free (estimated):   

Re: [PATCH v2] btrfs: zoned: move superblock logging zone location

2021-04-08 Thread Josef Bacik

On 4/8/21 4:25 AM, Naohiro Aota wrote:

This commit moves the location of the superblock logging zones. The new
locations of the logging zones are now determined based on fixed block
addresses instead of on fixed zone numbers.

The old placement method based on fixed zone numbers causes problems when
one needs to inspect a file system image without access to the drive zone
information. In such case, the super block locations cannot be reliably
determined as the zone size is unknown. By locating the superblock logging
zones using fixed addresses, we can scan a dumped file system image without
the zone information since a super block copy will always be present at or
after the fixed location.

This commit introduces the following three pairs of zones containing fixed
offset locations, regardless of the device zone size.

   - Primary superblock: zone starting at offset 0 and the following zone
   - First copy: zone containing offset 64GB and the following zone
   - Second copy: zone containing offset 256GB and the following zone

If a logging zone is outside of the disk capacity, we do not record the
superblock copy.

The first copy position is much larger than for a regular btrfs volume
(64M).  This increase is to avoid overlapping with the log zones for the
primary superblock. This higher location is arbitrary but allows supporting
devices with very large zone sizes, up to 32GB. Such large zone size is
unrealistic and very unlikely to ever be seen in real devices. Currently,
SMR disks have a zone size of 256MB, and we are expecting ZNS drives to be
in the 1-4GB range, so this 32GB limit gives us room to breathe. For now,
we only allow zone sizes up to 8GB, below this hard limit of 32GB.

The fixed location addresses are somewhat arbitrary, but with the intent of
maintaining superblock reliability even for smaller devices. For this
reason, the superblock fixed locations do not exceed 1TB.

The superblock logging zones are reserved for superblock logging and never
used for data or metadata blocks. Note that we only reserve the two zones
per primary/copy actually used for superblock logging. We do not reserve
the ranges of zones possibly containing superblocks with the largest
supported zone size (0-16GB, 64G-80GB, 256G-272GB).

The zones containing the fixed location offsets used to store superblocks
in a regular btrfs volume (no zoned case) are also reserved to avoid
confusion.

Signed-off-by: Naohiro Aota 


Thanks Naohiro, this makes it much easier to understand,

Reviewed-by: Josef Bacik 

Dave, I know you're on vacation, this needs to go to Linus for this cycle so we 
don't have two different SB slots for two different kernels.  I don't want to 
disturb your vacation, so I'll put this in a pull request tomorrow to Linus.  If 
you already had plans to do this let me know and I'll hold off.  Thanks,


Josef


Re: [PATCH] btrfs: zoned: move superblock logging zone location

2021-04-07 Thread Josef Bacik

On 4/7/21 2:31 PM, Johannes Thumshirn wrote:

On 07/04/2021 19:54, Josef Bacik wrote:

On 3/15/21 1:53 AM, Naohiro Aota wrote:

This commit moves the location of superblock logging zones. The location of
the logging zones are determined based on fixed block addresses instead of
on fixed zone numbers.

By locating the superblock zones using fixed addresses, we can scan a
dumped file system image without the zone information. And, no drawbacks
exist.

We use the following three pairs of zones containing fixed offset
locations, regardless of the device zone size.

- Primary superblock: zone starting at offset 0 and the following zone
- First copy: zone containing offset 64GB and the following zone
- Second copy: zone containing offset 256GB and the following zone

If the location of the zones are outside of disk, we don't record the
superblock copy.

These addresses are arbitrary, but using addresses that are too large
reduces superblock reliability for smaller devices, so we do not want to
exceed 1T to cover all cases nicely.

Also, LBAs are generally distributed initially across one head (platter
side) up to one or more zones, then go on the next head backward (the other
side of the same platter), and on to the following head/platter. Thus using
non sequential fixed addresses for superblock logging, such as 0/64G/256G,
likely result in each superblock copy being on a different head/platter
which improves chances of recovery in case of superblock read error.

These zones are reserved for superblock logging and never used for data or
metadata blocks. Zones containing the offsets used to store superblocks in
a regular btrfs volume (no zoned case) are also reserved to avoid
confusion.

Note that we only reserve the 2 zones per primary/copy actually used for
superblock logging. We don't reserve the ranges possibly containing
superblock with the largest supported zone size (0-16GB, 64G-80GB,
256G-272GB).

The first copy position is much larger than for a regular btrfs volume
(64M).  This increase is to avoid overlapping with the log zones for the
primary superblock. This higher location is arbitrary but allows supporting
devices with very large zone size, up to 32GB. But we only allow zone sizes
up to 8GB for now.



Ok it took me a few reads to figure out what's going on.

The problem is that with large zone sizes, our current choices put the back up
super blocks way out on the disk, correct?  So instead you've picked
arbitrary byte offsets, hoping that they'll be closer to the front of the disk
and thus actually be useful.

And then you've introduced the 8gib zone size as a way to avoid problems where
we get the same zone for the backup supers.

Are these statements correct?  If so the changelog should be updated to make
this clear up front, because it took me a while to work that out.


No the problem is, we're placing superblocks into specific zones, regardless of
the zone size. This creates a problem when you need to inspect a file system,
but don't have the block device available, because you can't look at the zone
size to calculate where the superblocks are on the device.

With this change we're placing the superblocks not into specific zone numbers,
but into the zones starting at specific offsets. We're taking 8G zone size as
a maximum expected zone size, to make sure we're not overlapping superblock
zones. Currently SMR disks have a zone size of 256MB and we're expecting ZNS
drives to be in the 1-2GB range, so this 8GB gives us room to breathe.

Hope this helps clearing up any confusion.



Ok this makes a lot more sense, and should be the first thing in the changelog, 
because I still got it wrong after reading the thing a few times.


And I think it's worth pointing out in the comments that 8gib represents a zone 
size that doesn't exist currently and is likely to never exist.


That will make this much easier to grok and understand in the future.  Thanks,

Josef


Re: [PATCH] btrfs: zoned: move superblock logging zone location

2021-04-07 Thread Josef Bacik

On 3/15/21 1:53 AM, Naohiro Aota wrote:

This commit moves the location of superblock logging zones. The location of
the logging zones are determined based on fixed block addresses instead of
on fixed zone numbers.

By locating the superblock zones using fixed addresses, we can scan a
dumped file system image without the zone information. And, no drawbacks
exist.

We use the following three pairs of zones containing fixed offset
locations, regardless of the device zone size.

   - Primary superblock: zone starting at offset 0 and the following zone
   - First copy: zone containing offset 64GB and the following zone
   - Second copy: zone containing offset 256GB and the following zone

If the location of the zones are outside of disk, we don't record the
superblock copy.

These addresses are arbitrary, but using addresses that are too large
reduces superblock reliability for smaller devices, so we do not want to
exceed 1T to cover all cases nicely.

Also, LBAs are generally distributed initially across one head (platter
side) up to one or more zones, then go on the next head backward (the other
side of the same platter), and on to the following head/platter. Thus using
non sequential fixed addresses for superblock logging, such as 0/64G/256G,
likely result in each superblock copy being on a different head/platter
which improves chances of recovery in case of superblock read error.

These zones are reserved for superblock logging and never used for data or
metadata blocks. Zones containing the offsets used to store superblocks in
a regular btrfs volume (no zoned case) are also reserved to avoid
confusion.

Note that we only reserve the 2 zones per primary/copy actually used for
superblock logging. We don't reserve the ranges possibly containing
superblock with the largest supported zone size (0-16GB, 64G-80GB,
256G-272GB).

The first copy position is much larger than for a regular btrfs volume
(64M).  This increase is to avoid overlapping with the log zones for the
primary superblock. This higher location is arbitrary but allows supporting
devices with very large zone size, up to 32GB. But we only allow zone sizes
up to 8GB for now.



Ok it took me a few reads to figure out what's going on.

The problem is that with large zone sizes, our current choices put the back up 
super blocks way out on the disk, correct?  So instead you've picked 
arbitrary byte offsets, hoping that they'll be closer to the front of the disk 
and thus actually be useful.


And then you've introduced the 8gib zone size as a way to avoid problems where 
we get the same zone for the backup supers.


Are these statements correct?  If so the changelog should be updated to make 
this clear up front, because it took me a while to work that out.


Something at the beginning like the following

"With larger zone sizes, for example 8gib, the 3rd backup super would be located 
8tib into the device.  However not all zoned block devices are this large.  In 
order to fix this limitation set the zones to a static byte offset, and 
calculate the zone number from there based on the devices zone size."


So that it's clear from the outset why we're making this change.

And this brings up another problem, in that what happens when we _do_ run into 
block devices that have huge zones, like 64gib zones?  We have to change the 
disk format to support these devices.  I'm not against that per-se, but it seems 
like a limitation, even if it's unlikely to ever happen.  With the locations we 
currently have, any arbitrary zone size is going to work in the future, and the 
only drawback is you need a device of a certain size to take advantage of the 
back up super blocks.  I would hope that we don't have 64gib zone size block 
devices that are only 128gib in size in the future.




Signed-off-by: Naohiro Aota 
---
  fs/btrfs/zoned.c | 39 +++
  1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 43948bd40e02..6a72ca1f7988 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -21,9 +21,24 @@
  /* Pseudo write pointer value for conventional zone */
  #define WP_CONVENTIONAL ((u64)-2)
  
+/*

+ * Location of the first zone of superblock logging zone pairs.
+ * - Primary superblock: the zone containing offset 0 (zone 0)
+ * - First superblock copy: the zone containing offset 64G
+ * - Second superblock copy: the zone containing offset 256G
+ */
+#define BTRFS_PRIMARY_SB_LOG_ZONE 0ULL
+#define BTRFS_FIRST_SB_LOG_ZONE (64ULL * SZ_1G)
+#define BTRFS_SECOND_SB_LOG_ZONE (256ULL * SZ_1G)
+#define BTRFS_FIRST_SB_LOG_ZONE_SHIFT const_ilog2(BTRFS_FIRST_SB_LOG_ZONE)
+#define BTRFS_SECOND_SB_LOG_ZONE_SHIFT const_ilog2(BTRFS_SECOND_SB_LOG_ZONE)
+
  /* Number of superblock log zones */
  #define BTRFS_NR_SB_LOG_ZONES 2
  
+/* Max size of supported zone size */

+#define BTRFS_MAX_ZONE_SIZE SZ_8G
+
  static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void 
*data)

Re: Aw: Re: Re: Help needed with filesystem errors: parent transid verify failed

2021-03-29 Thread Josef Bacik

On 3/29/21 4:42 AM, B A wrote:

Gesendet: Montag, 29. März 2021 um 08:09 Uhr
Von: "Chris Murphy" 
An: "B A" , "Btrfs BTRFS" 
Cc: "Qu Wenruo" , "Josef Bacik" 
Betreff: Re: Re: Help needed with filesystem errors: parent transid verify 
failed
[…]

What do you get for:

btrfs insp dump-s -f /dev/dm-0


See attached file "btrfs_insp_dump-s_-f.txt"



I'm on PTO this week so I'll be a little less responsive, but thankfully this is 
just the extent tree.  First thing is to make sure you've backed everything up, 
and then you should be able to just do btrfs check --repair and it should fix it 
for you.


However I've noticed some failure cases where it won't fix transid errors 
sometimes because it errors out trying to read the things.  If that happens just 
let me know, I have a private branch with fsck changes to address this class of 
problems and I can point you at that.  I'd rather wait to make sure the normal 
fsck won't work first tho, just in case.  Thanks,


Josef


[PATCH] btrfs: use percpu_read_positive instead of sum_positive for need_preempt

2021-03-24 Thread Josef Bacik
Looking at perf data for a fio workload I noticed that we were spending
a pretty large chunk of time (around 5%) doing percpu_counter_sum() in
need_preemptive_reclaim.  This is silly, as we only want to know if we
have more ordered than delalloc to see if we should be counting the
delayed items in our threshold calculation.  Change this to
percpu_read_positive() to avoid the overhead.

I ran this through fsperf to validate the changes, obviously the latency
numbers in dbench and fio are quite jittery, so take them as you wish,
but overall the improvements on throughput, iops, and bw are all
positive.  Each test was run two times, the given value is the average
of both runs for their respective column.

btrfs ssd normal test results

bufferedrandwrite16g results
 metric baseline   current  diff
==
write_io_kbytes 16777216   16777216 0.00%
read_clat_ns_p99   0  0 0.00%
write_bw_bytes  1.04e+08   1.05e+08 1.12%
read_iops  0  0 0.00%
write_clat_ns_p50  13888  11840   -14.75%
read_io_kbytes 0  0 0.00%
read_io_bytes  0  0 0.00%
write_clat_ns_p99  35008  29312   -16.27%
read_bw_bytes  0  0 0.00%
elapsed  170167-1.76%
write_lat_ns_min 4221.503762.50   -10.87%
sys_cpu39.65  35.37   -10.79%
write_lat_ns_max2.67e+10   2.50e+10-6.63%
read_lat_ns_min0  0 0.00%
write_iops  25270.10   25553.43 1.12%
read_lat_ns_max0  0 0.00%
read_clat_ns_p50   0  0 0.00%

dbench60 results
  metric baseline   current diff
==
qpathinfo   11.12 12.7314.52%
throughput 416.09445.66 7.11%
flush 3485.63   1887.55   -45.85%
qfileinfo0.70  1.92   173.86%
ntcreatex  992.60695.76   -29.91%
qfsinfo  2.43  3.7152.48%
close1.67  3.1488.09%
sfileinfo   66.54105.2058.10%
rename 809.23619.59   -23.43%
find16.88 15.46-8.41%
unlink 820.54670.86   -18.24%
writex3375.20   2637.91   -21.84%
deltree386.33449.9816.48%
readx3.43  3.41-0.60%
mkdir0.05  0.03   -38.46%
lockx0.26  0.26-0.76%
unlockx  0.81  0.32   -60.33%

dio4kbs16threads results
 metric  baseline   current   diff

write_io_kbytes 5249676   3357150   -36.05%
read_clat_ns_p99  0 0 0.00%
write_bw_bytes  89583501.50   57291192.50   -36.05%
read_iops 0 0 0.00%
write_clat_ns_p50242688263680 8.65%
read_io_kbytes0 0 0.00%
read_io_bytes 0 0 0.00%
write_clat_ns_p99  15826944  36732928   132.09%
read_bw_bytes 0 0 0.00%
elapsed  6161 0.00%
write_lat_ns_min  42704 42095-1.43%
sys_cpu5.27  3.45   -34.52%
write_lat_ns_max   7.43e+08  9.27e+0824.71%
read_lat_ns_min   0 0 0.00%
write_iops 21870.97  13987.11   -36.05%
read_lat_ns_max   0 0 0.00%
read_clat_ns_p50  0 0 0.00%

randwrite2xram results
 metric  baseline   current   diff

write_io_kbytes24831972  2887626216.29%
read_clat_ns_p99  0 0 0.00%
write_bw_bytes  83745273.50   92182192.5010.07%
read_iops 0 0 0.00%
write_clat_ns_p50 13952 11648   -16.51%
read_io_kbytes0 0 0.00%
read_io_bytes 0 0 0.00%
write_clat_ns_p99 50176 52992 5.61%
read_bw_bytes 0 0 0.00%
elapsed 314   332 5.73%
write_lat_ns_min5920.50  5127   -13.40%
sys_cpu7.82  7.35-6.07%
write_lat_ns_max   5.27e+10  3.88e+10   -26.44%
read_lat_ns_min   0 0 0.00%
write_iops 20445.62  22505.4210.07%
read_lat_ns_max   0 0 0.00%
read_clat_ns_p50  0 0 0.00%

untarfirefox results
metricbaseline   currentdiff
==
elapsed  47.41 47.40   -0.03%

Signed-off-by: Josef Bacik 
---
 fs/btrfs/space-info.c | 4 ++--
 1 file changed, 2

Re: [PATCH v8 00/10] fs: interface for directly reading/writing compressed data

2021-03-19 Thread Josef Bacik

On 3/19/21 4:08 PM, Linus Torvalds wrote:

On Fri, Mar 19, 2021 at 11:21 AM Josef Bacik  wrote:


Can we get some movement on this?  Omar is sort of spinning his wheels here
trying to get this stuff merged, no major changes have been done in a few
postings.


I'm not Al, and I absolutely detest the IOCB_ENCODED thing, and want
more explanations of why this should be done that way, and pollute our
iov_iter handling EVEN MORE.

Our iov_iter stuff isn't the most legible, and I don't understand why
anybody would ever think it's a good idea to spread what is clearly a
"struct" inside multiple different iov extents.

Honestly, this sounds way more like an ioctl interface than
read/write. We've done that before.


That's actually the way this started

https://lore.kernel.org/linux-fsdevel/8eae56abb90c0fe87c350322485ce8674e135074.1567623877.git.osan...@fb.com/

it was suggested that Omar make it generic by Dave Chinner, hence this is the 
direction it took.  I'll leave the rest of the comments for Omar to respond to 
himself.  Thanks,


Josef


Re: [PATCH v8 00/10] fs: interface for directly reading/writing compressed data

2021-03-19 Thread Josef Bacik

On 3/16/21 3:42 PM, Omar Sandoval wrote:

From: Omar Sandoval 

This series adds an API for reading compressed data on a filesystem
without decompressing it as well as support for writing compressed data
directly to the filesystem. As with the previous submissions, I've
included a man page patch describing the API. I have test cases
(including fsstress support) and example programs which I'll send up
[1].

The main use-case is Btrfs send/receive: currently, when sending data
from one compressed filesystem to another, the sending side decompresses
the data and the receiving side recompresses it before writing it out.
This is wasteful and can be avoided if we can just send and write
compressed extents. The patches implementing the send/receive support
will be sent shortly.

Patches 1-3 add the VFS support and UAPI. Patch 4 is a fix that this
series depends on; it can be merged independently. Patches 5-8 are Btrfs
prep patches. Patch 9 adds Btrfs encoded read support and patch 10 adds
Btrfs encoded write support.

These patches are based on Dave Sterba's Btrfs misc-next branch [2],
which is in turn currently based on v5.12-rc3.



Al,

Can we get some movement on this?  Omar is sort of spinning his wheels here 
trying to get this stuff merged, no major changes have been done in a few 
postings.  Thanks,


Josef


Re: [PATCH v2 2/2] btrfs: zoned: automatically reclaim zones

2021-03-19 Thread Josef Bacik

On 3/19/21 6:48 AM, Johannes Thumshirn wrote:

When a file gets deleted on a zoned file system, the space freed is not
returned back into the block group's free space, but is migrated to
zone_unusable.

As this zone_unusable space is behind the current write pointer it is not
possible to use it for new allocations. In the current implementation a
zone is reset once all of the block group's space is accounted as zone
unusable.

This behaviour can lead to premature ENOSPC errors on a busy file system.

Instead of only reclaiming the zone once it is completely unusable,
kick off a reclaim job once the amount of unusable bytes exceeds a user
configurable threshold between 51% and 100%. It can be set per mounted
filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75%
per default.

Similar to reclaiming unused block groups, these dirty block groups are
added to a to_reclaim list and then on a transaction commit, the reclaim
process is triggered but after we deleted unused block groups, which will
free space for the relocation process.

Signed-off-by: Johannes Thumshirn 
---

AFAICT sysfs_create_files() does not have the ability to provide a is_visible
callback, so the bg_reclaim_threshold sysfs file is visible for non zoned
filesystems as well, even though only for zoned filesystems we're adding block
groups to the reclaim list. I'm all ears for an approach that is sensible in
this regard.


  fs/btrfs/block-group.c   | 84 
  fs/btrfs/block-group.h   |  2 +
  fs/btrfs/ctree.h |  3 ++
  fs/btrfs/disk-io.c   | 11 +
  fs/btrfs/free-space-cache.c  |  9 +++-
  fs/btrfs/sysfs.c | 35 +++
  fs/btrfs/volumes.c   |  2 +-
  fs/btrfs/volumes.h   |  1 +
  include/trace/events/btrfs.h | 12 ++
  9 files changed, 157 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 9ae3ac96a521..af9026795ddd 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1485,6 +1485,80 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
  }
  
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)

+{
+   struct btrfs_block_group *bg;
+   struct btrfs_space_info *space_info;
+   int ret = 0;
+
+   if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+   return;
+
+   if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+   return;
+
+   mutex_lock(&fs_info->reclaim_bgs_lock);
+   while (!list_empty(&fs_info->reclaim_bgs)) {
+   bg = list_first_entry(&fs_info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+   list_del_init(&bg->bg_list);
+
+   space_info = bg->space_info;
+   mutex_unlock(&fs_info->reclaim_bgs_lock);
+
+   /* Don't want to race with allocators so take the groups_sem */
+   down_write(&space_info->groups_sem);
+
+   spin_lock(&bg->lock);
+   if (bg->reserved || bg->pinned || bg->ro) {


How do we deal with backup supers in zoned again?  Will they show up as 
readonly?  If so we may not want the bg->ro check, but I may be insane.



+   /*
+* We want to bail if we made new allocations or have
+* outstanding allocations in this block group.  We do
+* the ro check in case balance is currently acting on
+* this block group.
+*/
+   spin_unlock(&bg->lock);
+   up_write(&space_info->groups_sem);
+   goto next;
+   }
+   spin_unlock(&bg->lock);
+
+   ret = inc_block_group_ro(bg, 0);
+   up_write(&space_info->groups_sem);
+   if (ret < 0) {
+   ret = 0;
+   goto next;
+   }
+
+   btrfs_info(fs_info, "reclaiming chunk %llu", bg->start);
+   trace_btrfs_reclaim_block_group(bg);
+   ret = btrfs_relocate_chunk(fs_info, bg->start);
+   if (ret)
+   btrfs_err(fs_info, "error relocating chunk %llu",
+ bg->start);
+
+next:
+   btrfs_put_block_group(bg);
+   mutex_lock(&fs_info->reclaim_bgs_lock);
+   }
+   mutex_unlock(&fs_info->reclaim_bgs_lock);
+   btrfs_exclop_finish(fs_info);
+}
+
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
+{
+   struct btrfs_fs_info *fs_info = bg->fs_info;
+
+   mutex_lock(&fs_info->reclaim_bgs_lock);
+   if (list_empty(&bg->bg_list)) {
+   btrfs_get_block_group(bg);
+   trace_btrfs_add_reclaim_block_group(bg);
+   list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs

Re: [PATCH v2 1/2] btrfs: rename delete_unused_bgs_mutex

2021-03-19 Thread Josef Bacik

On 3/19/21 6:48 AM, Johannes Thumshirn wrote:

As a preparation for another user, rename the unused_bgs_mutex into
reclaim_bgs_lock.

Signed-off-by: Johannes Thumshirn 


Reviewed-by: Josef Bacik 

Thanks,

Josef


Re: [PATCH 0/3] Handle bad dev_root properly with rescue=all

2021-03-18 Thread Josef Bacik

On 3/18/21 11:43 AM, David Sterba wrote:

On Thu, Mar 11, 2021 at 11:23:13AM -0500, Josef Bacik wrote:
[...]


rescue=all working without panicing the machine,
and I verified everything by
using btrfs-corrupt-block to corrupt a dev root of a file system.  Thanks,


We need to have such testing part of some suite but it depends on the
utilities that we don't want to ship by default. Last time we discussed
how to make btrfs-corrupt-block or similar available in source form in
fstests it was not pretty, like having half of the btrfs-progs sources
providing code to read and modify the data structures.

So, what if: we have such tests in the btrfs-progs testsuite. As they're
potentially dangerous, not run by default, but available for easy setup
and test in a VM. The testsuite can be exported into a tarball, with the
binaries included. Or simply run it built from git, that works too just
needs more dependencies installed.

I was thinking about collecting all the stress tests into one place,
fstests already has some but my idea is to provide stress testing of
btrfs-specific features, more at once. Or require some 3rd party tools
and data sources to provide the load.

It would be some infrastructure duplication with fstests, but lots of
that is already done in btrfs-progs and I'd rather go with some
duplication instead of development-time-only testing.



I actually had Boris working on this, basically allow us to set 
BTRFS_CORRUPT_PROG inside our fstests config, and then allow us to write these 
tests for fstests.  I'd prefer to keep this inside xfstests for mostly selfish 
reasons, it's easier for me to adapt the existing continual testing to take 
advantage of this and automatically run the xfstests stuff and post the results. 
 If we keep it in btrfs-progs I have to write a bunch of code to run those to 
post the results to our continual testing stuff.  Thanks,


Josef


Re: [PATCH] btrfs: zoned: automatically reclaim zones

2021-03-17 Thread Josef Bacik

On 3/17/21 6:38 AM, Johannes Thumshirn wrote:

When a file gets deleted on a zoned file system, the space freed is not
returned back into the block group's free space, but is migrated to
zone_unusable.

As this zone_unusable space is behind the current write pointer it is not
possible to use it for new allocations. In the current implementation a
zone is reset once all of the block group's space is accounted as zone
unusable.

This behaviour can lead to premature ENOSPC errors on a busy file system.

Instead of only reclaiming the zone once it is completely unusable,
kick off a reclaim job once the amount of unusable bytes exceeds a user
configurable threshold between 51% and 100%.

Similar to reclaiming unused block groups, these dirty block groups are
added to a to_reclaim list and then on a transaction commit, the reclaim
process is triggered but after we deleted unused block groups, which will
free space for the relocation process.

Signed-off-by: Johannes Thumshirn 


I think this is generally useful, but we should indicate to the user that it's 
happening via automatic cleanup and not because of somebody using btrfs balance 
start.


As for doing it on non-zoned, I'm still hesitant to do this simply because I 
haven't been able to finish working on the relocation stuff.  Once my current 
set of patches get merged I'll pick it back up, but there still remains some 
pretty unpleasant side-effects of balance.


Chris and I have been discussing your RAID5/6 plan, and we both really, really 
like the idea of the stripe root from the point of view that it makes relocation 
insanely straightforward and simple.  I think once we have that in place I'd 
feel a lot more comfortable running relocation automatically everywhere, as 
it'll be a much faster and less error prone operation.  Thanks,


Josef


Re: [PATCH 0/3] Handle bad dev_root properly with rescue=all

2021-03-17 Thread Josef Bacik

On 3/17/21 8:27 AM, David Sterba wrote:

On Thu, Mar 11, 2021 at 11:23:13AM -0500, Josef Bacik wrote:

Hello,

My recent debugging session with Neal's broken filesystem uncovered a glaring
hole in my rescue=all patches, they don't deal with a NULL dev_root properly.
In testing I only ever tested corrupting the extent tree or the csum tree, since
those are the most problematic.  The following 3 fixes allowed Neal to get
rescue=all working without panicing the machine, and I verified everything by
using btrfs-corrupt-block to corrupt a dev root of a file system.  Thanks,


When rescue= is set lots of things can't work, I was wondering if we
should add messages once the "if (!dev_root)" cases are hit but I don't
think we need it, it should be clear enough from the other rescue=
related messages.



Yeah I went back and forth on this, and I came to the same conclusion as you. 
If we're doing rescue=all we know things are bad, and we just want our data 
back.  Thanks,


Josef


[PATCH v8 39/39] btrfs: check return value of btrfs_commit_transaction in relocation

2021-03-12 Thread Josef Bacik
There's a few places where we don't check the return value of
btrfs_commit_transaction in relocation.c.  Thankfully all these places
have straightforward error handling, so simply change all of the sites
at once.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 557d4f09ce7f..371cd4476daf 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1915,7 +1915,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
list_splice(&reloc_roots, &rc->reloc_roots);
 
if (!err)
-   btrfs_commit_transaction(trans);
+   err = btrfs_commit_transaction(trans);
else
btrfs_end_transaction(trans);
return err;
@@ -3487,8 +3487,7 @@ int prepare_to_relocate(struct reloc_control *rc)
 */
return PTR_ERR(trans);
}
-   btrfs_commit_transaction(trans);
-   return 0;
+   return btrfs_commit_transaction(trans);
 }
 
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
@@ -3647,7 +3646,9 @@ static noinline_for_stack int relocate_block_group(struct 
reloc_control *rc)
err = PTR_ERR(trans);
goto out_free;
}
-   btrfs_commit_transaction(trans);
+   ret = btrfs_commit_transaction(trans);
+   if (ret && !err)
+   err = ret;
 out_free:
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
-- 
2.26.2



[PATCH v8 38/39] btrfs: do proper error handling in merge_reloc_roots

2021-03-12 Thread Josef Bacik
We have a BUG_ON() if we get an error back from btrfs_get_fs_root().
This honestly should never fail, as at this point we have a solid
coordination of fs root to reloc root, and these roots will all be in
memory.  But in the name of killing BUG_ON()'s remove these and handle
the error condition properly, ASSERT()'ing for developers.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 25 +++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 621c4284d158..557d4f09ce7f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1960,8 +1960,29 @@ void merge_reloc_roots(struct reloc_control *rc)
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
 false);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
-   BUG_ON(IS_ERR(root));
-   BUG_ON(root->reloc_root != reloc_root);
+   if (IS_ERR(root)) {
+   /*
+* For recovery we read the fs roots on mount,
+* and if we didn't find the root then we marked
+* the reloc root as a garbage root.  For normal
+* relocation obviously the root should exist in
+* memory.  However there's no reason we can't
+* handle the error properly here just in case.
+*/
+   ASSERT(0);
+   ret = PTR_ERR(root);
+   goto out;
+   }
+   if (root->reloc_root != reloc_root) {
+   /*
+* This is actually impossible without something
+* going really wrong (like weird race condition
+* or cosmic rays).
+*/
+   ASSERT(0);
+   ret = -EINVAL;
+   goto out;
+   }
ret = merge_reloc_root(rc, root);
btrfs_put_root(root);
if (ret) {
-- 
2.26.2



[PATCH v8 37/39] btrfs: handle extent corruption with select_one_root properly

2021-03-12 Thread Josef Bacik
In corruption cases we could have paths from a block up to no root at
all, and thus we'll BUG_ON(!root) in select_one_root.  Handle this by
adding an ASSERT() for developers, and returning an error for normal
users.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 8f7760f8fcc3..621c4284d158 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2209,7 +2209,13 @@ struct btrfs_root *select_one_root(struct 
btrfs_backref_node *node)
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
-   BUG_ON(!root);
+
+   /*
+* This can occur if we have incomplete extent refs leading all
+* the way up a particular path, in this case return -EUCLEAN.
+*/
+   if (!root)
+   return ERR_PTR(-EUCLEAN);
 
/* No other choice for non-shareable tree */
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
@@ -2599,8 +2605,15 @@ static int relocate_tree_block(struct btrfs_trans_handle 
*trans,
 
BUG_ON(node->processed);
root = select_one_root(node);
-   if (root == ERR_PTR(-ENOENT)) {
-   update_processed_blocks(rc, node);
+   if (IS_ERR(root)) {
+   ret = PTR_ERR(root);
+
+   /* See explanation in select_one_root for the -EUCLEAN case. */
+   ASSERT(ret == -ENOENT);
+   if (ret == -ENOENT) {
+   ret = 0;
+   update_processed_blocks(rc, node);
+   }
goto out;
}
 
-- 
2.26.2



[PATCH v8 36/39] btrfs: cleanup error handling in prepare_to_merge

2021-03-12 Thread Josef Bacik
This probably can't happen even with a corrupt file system, because we
would have failed much earlier on than here.  However there's no reason
we can't just check and bail out as appropriate, so do that and convert
the correctness BUG_ON() to an ASSERT().

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 857da684d415..8f7760f8fcc3 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1880,8 +1880,14 @@ int prepare_to_merge(struct reloc_control *rc, int err)
 
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
-   BUG_ON(IS_ERR(root));
-   BUG_ON(root->reloc_root != reloc_root);
+   if (IS_ERR(root)) {
+   list_add(&reloc_root->root_list, &reloc_roots);
+   btrfs_abort_transaction(trans, (int)PTR_ERR(root));
+   if (!err)
+   err = PTR_ERR(root);
+   break;
+   }
+   ASSERT(root->reloc_root == reloc_root);
 
/*
 * set reference count to 1, so btrfs_recover_relocation
-- 
2.26.2



[PATCH v8 34/39] btrfs: handle __add_reloc_root failures in btrfs_recover_relocation

2021-03-12 Thread Josef Bacik
We can already handle errors appropriately from this function, deal with
an error coming from __add_reloc_root appropriately.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 43037cb302fd..2fc04f96294e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4054,7 +4054,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
}
 
err = __add_reloc_root(reloc_root);
-   BUG_ON(err < 0); /* -ENOMEM or logic error */
+   if (err) {
+   list_add_tail(&reloc_root->root_list, &reloc_roots);
+   btrfs_put_root(fs_root);
+   btrfs_end_transaction(trans);
+   goto out_unset;
+   }
fs_root->reloc_root = btrfs_grab_root(reloc_root);
btrfs_put_root(fs_root);
}
@@ -4269,7 +4274,10 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle 
*trans,
return PTR_ERR(reloc_root);
 
ret = __add_reloc_root(reloc_root);
-   BUG_ON(ret < 0);
+   if (ret) {
+   btrfs_put_root(reloc_root);
+   return ret;
+   }
new_root->reloc_root = btrfs_grab_root(reloc_root);
 
if (rc->create_reloc_tree)
-- 
2.26.2



[PATCH v8 32/39] btrfs: remove the extent item sanity checks in relocate_block_group

2021-03-12 Thread Josef Bacik
These checks are all taken care of for us by the tree checker code.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 29 +
 1 file changed, 1 insertion(+), 28 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 71a3db66c1ab..288d5df39fa0 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3407,20 +3407,6 @@ static void unset_reloc_control(struct reloc_control *rc)
mutex_unlock(&fs_info->reloc_mutex);
 }
 
-static int check_extent_flags(u64 flags)
-{
-   if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
-   (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
-   return 1;
-   if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
-   !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
-   return 1;
-   if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
-   (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
-   return 1;
-   return 0;
-}
-
 static noinline_for_stack
 int prepare_to_relocate(struct reloc_control *rc)
 {
@@ -3472,7 +3458,6 @@ static noinline_for_stack int relocate_block_group(struct 
reloc_control *rc)
struct btrfs_path *path;
struct btrfs_extent_item *ei;
u64 flags;
-   u32 item_size;
int ret;
int err = 0;
int progress = 0;
@@ -3521,19 +3506,7 @@ static noinline_for_stack int 
relocate_block_group(struct reloc_control *rc)
 
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
-   item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
-   if (item_size >= sizeof(*ei)) {
-   flags = btrfs_extent_flags(path->nodes[0], ei);
-   ret = check_extent_flags(flags);
-   BUG_ON(ret);
-   } else if (unlikely(item_size == sizeof(struct 
btrfs_extent_item_v0))) {
-   err = -EINVAL;
-   btrfs_print_v0_err(trans->fs_info);
-   btrfs_abort_transaction(trans, err);
-   break;
-   } else {
-   BUG();
-   }
+   flags = btrfs_extent_flags(path->nodes[0], ei);
 
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = add_tree_block(rc, &key, path, &blocks);
-- 
2.26.2



[PATCH v8 30/39] btrfs: handle extent reference errors in do_relocation

2021-03-12 Thread Josef Bacik
We can already deal with errors appropriately from do_relocation, simply
handle any errors that come from changing the refs at this point
cleanly.  We have to abort the transaction if we fail here as we've
modified metadata at this point.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 1a0e07507796..71a3db66c1ab 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2432,10 +2432,11 @@ static int do_relocation(struct btrfs_trans_handle 
*trans,
btrfs_init_tree_ref(&ref, node->level,
btrfs_header_owner(upper->eb));
ret = btrfs_inc_extent_ref(trans, &ref);
-   BUG_ON(ret);
-
-   ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
-   BUG_ON(ret);
+   if (!ret)
+   ret = btrfs_drop_subtree(trans, root, eb,
+upper->eb);
+   if (ret)
+   btrfs_abort_transaction(trans, ret);
}
 next:
if (!upper->pending)
-- 
2.26.2



[PATCH v8 33/39] btrfs: do proper error handling in create_reloc_inode

2021-03-12 Thread Josef Bacik
We already handle some errors in this function, and the callers do the
correct error handling, so clean up the rest of the function to do the
appropriate error handling.

There's a little extra work that needs to be done here, as we create the
inode item before we create the orphan item.  We could potentially add
the orphan item, but if we failed to create the inode item we would have
to abort the transaction.  Instead add a helper to delete the inode item
we created in the case that we're unable to look up the inode (this
would likely be caused by an ENOMEM), which if it succeeds means we can
avoid a transaction abort in this particular error case.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 39 +--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 288d5df39fa0..43037cb302fd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3648,6 +3648,35 @@ static int __insert_orphan_inode(struct 
btrfs_trans_handle *trans,
return ret;
 }
 
+static void delete_orphan_inode(struct btrfs_trans_handle *trans,
+   struct btrfs_root *root, u64 objectid)
+{
+   struct btrfs_path *path;
+   struct btrfs_key key;
+   int ret = 0;
+
+   path = btrfs_alloc_path();
+   if (!path) {
+   ret = -ENOMEM;
+   goto out;
+   }
+
+   key.objectid = objectid;
+   key.type = BTRFS_INODE_ITEM_KEY;
+   key.offset = 0;
+   ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+   if (ret) {
+   if (ret > 0)
+   ret = -ENOENT;
+   goto out;
+   }
+   ret = btrfs_del_item(trans, root, path);
+out:
+   if (ret)
+   btrfs_abort_transaction(trans, ret);
+   btrfs_free_path(path);
+}
+
 /*
  * helper to create inode for data relocation.
  * the inode is in data relocation tree and its link count is 0
@@ -3674,10 +3703,16 @@ struct inode *create_reloc_inode(struct btrfs_fs_info 
*fs_info,
goto out;
 
err = __insert_orphan_inode(trans, root, objectid);
-   BUG_ON(err);
+   if (err)
+   goto out;
 
inode = btrfs_iget(fs_info->sb, objectid, root);
-   BUG_ON(IS_ERR(inode));
+   if (IS_ERR(inode)) {
+   delete_orphan_inode(trans, root, objectid);
+   err = PTR_ERR(inode);
+   inode = NULL;
+   goto out;
+   }
BTRFS_I(inode)->index_cnt = group->start;
 
err = btrfs_orphan_add(trans, BTRFS_I(inode));
-- 
2.26.2



[PATCH v8 31/39] btrfs: check for BTRFS_BLOCK_FLAG_FULL_BACKREF being set improperly

2021-03-12 Thread Josef Bacik
We need to validate that a data extent item does not have the
FULL_BACKREF flag set on its flags.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/tree-checker.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index f4ade821307d..8e5926232d1e 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1290,6 +1290,11 @@ static int check_extent_item(struct extent_buffer *leaf,
   key->offset, fs_info->sectorsize);
return -EUCLEAN;
}
+   if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+   extent_err(leaf, slot,
+   "invalid extent flag, data has full backref set");
+   return -EUCLEAN;
+   }
}
ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1);
 
-- 
2.26.2



[PATCH v8 35/39] btrfs: do not panic in __add_reloc_root

2021-03-12 Thread Josef Bacik
If we have a duplicate entry for a reloc root then we could have fs
corruption that resulted in a double allocation.  Since this shouldn't
happen unless there is corruption, add an ASSERT(ret != -EEXIST) to all
of the callers of __add_reloc_root() to catch any logic mistakes for
developers, otherwise normal error handling will happen for normal
users.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 2fc04f96294e..857da684d415 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -638,9 +638,10 @@ static int __must_check __add_reloc_root(struct btrfs_root 
*root)
   node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
-   btrfs_panic(fs_info, -EEXIST,
+   btrfs_err(fs_info,
"Duplicate root found for start=%llu while 
inserting into relocation tree",
node->bytenr);
+   return -EEXIST;
}
 
list_add_tail(&root->root_list, &rc->reloc_roots);
@@ -882,6 +883,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
return PTR_ERR(reloc_root);
 
ret = __add_reloc_root(reloc_root);
+   ASSERT(ret != -EEXIST);
if (ret) {
btrfs_put_root(reloc_root);
return ret;
@@ -4054,6 +4056,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
}
 
err = __add_reloc_root(reloc_root);
+   ASSERT(err != -EEXIST);
if (err) {
list_add_tail(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(fs_root);
@@ -4274,6 +4277,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle 
*trans,
return PTR_ERR(reloc_root);
 
ret = __add_reloc_root(reloc_root);
+   ASSERT(ret != -EEXIST);
if (ret) {
btrfs_put_root(reloc_root);
return ret;
-- 
2.26.2



[PATCH v8 28/39] btrfs: handle btrfs_search_slot failure in replace_path

2021-03-12 Thread Josef Bacik
This can fail for any number of reasons, why bring the whole box down
with it?

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 592b2d156626..6e8d89e4733a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1322,7 +1322,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct 
reloc_control *rc,
path->lowest_level = level;
ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
path->lowest_level = 0;
-   BUG_ON(ret);
+   if (ret)
+   break;
 
/*
 * Info qgroup to trace both subtrees.
-- 
2.26.2



[PATCH v8 25/39] btrfs: do proper error handling in btrfs_update_reloc_root

2021-03-12 Thread Josef Bacik
We call btrfs_update_root in btrfs_update_reloc_root, which can fail for
all sorts of reasons, including IO errors.  Instead of panicking the box
let's return the error, now that all callers properly handle those
errors.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b9cc99d4b950..7b242f527bd8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -902,7 +902,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle 
*trans,
int ret;
 
if (!have_reloc_root(root))
-   goto out;
+   return 0;
 
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
@@ -935,10 +935,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle 
*trans,
 
ret = btrfs_update_root(trans, fs_info->tree_root,
&reloc_root->root_key, root_item);
-   BUG_ON(ret);
btrfs_put_root(reloc_root);
-out:
-   return 0;
+   return ret;
 }
 
 /*
-- 
2.26.2



[PATCH v8 24/39] btrfs: handle btrfs_update_reloc_root failure in prepare_to_merge

2021-03-12 Thread Josef Bacik
btrfs_update_reloc_root will return errors in the future, so handle
an error properly in prepare_to_merge.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e6f9304d6a29..b9cc99d4b950 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1868,10 +1868,21 @@ int prepare_to_merge(struct reloc_control *rc, int err)
 */
if (!err)
btrfs_set_root_refs(&reloc_root->root_item, 1);
-   btrfs_update_reloc_root(trans, root);
+   ret = btrfs_update_reloc_root(trans, root);
 
+   /*
+* Even if we have an error we need this reloc root back on our
+* list so we can clean up properly.
+*/
list_add(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(root);
+
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   if (!err)
+   err = ret;
+   break;
+   }
}
 
list_splice(&reloc_roots, &rc->reloc_roots);
-- 
2.26.2



[PATCH v8 29/39] btrfs: handle errors in reference count manipulation in replace_path

2021-03-12 Thread Josef Bacik
If any of the reference count manipulation stuff fails in replace_path
we need to abort the transaction, as we've modified the blocks already.
We can simply break at this point and everything will be cleaned up.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 20 
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 6e8d89e4733a..1a0e07507796 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1363,27 +1363,39 @@ int replace_path(struct btrfs_trans_handle *trans, 
struct reloc_control *rc,
ref.skip_qgroup = true;
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
ret = btrfs_inc_extent_ref(trans, &ref);
-   BUG_ON(ret);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   break;
+   }
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
   blocksize, 0);
ref.skip_qgroup = true;
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
ret = btrfs_inc_extent_ref(trans, &ref);
-   BUG_ON(ret);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   break;
+   }
 
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
   blocksize, path->nodes[level]->start);
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
ref.skip_qgroup = true;
ret = btrfs_free_extent(trans, &ref);
-   BUG_ON(ret);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   break;
+   }
 
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
   blocksize, 0);
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
ref.skip_qgroup = true;
ret = btrfs_free_extent(trans, &ref);
-   BUG_ON(ret);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   break;
+   }
 
btrfs_unlock_up_safe(path, 0);
 
-- 
2.26.2



[PATCH v8 26/39] btrfs: convert logic BUG_ON()'s in replace_path to ASSERT()'s

2021-03-12 Thread Josef Bacik
A few BUG_ON()'s in replace_path are purely to keep us from making
logical mistakes, so replace them with ASSERT()'s.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 7b242f527bd8..dfd58fa9f189 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1210,8 +1210,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct 
reloc_control *rc,
int ret;
int slot;
 
-   BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
-   BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+   ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+   ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
 
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
 again:
@@ -1242,7 +1242,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct 
reloc_control *rc,
parent = eb;
while (1) {
level = btrfs_header_level(parent);
-   BUG_ON(level < lowest_level);
+   ASSERT(level >= lowest_level);
 
ret = btrfs_bin_search(parent, &key, &slot);
if (ret < 0)
-- 
2.26.2



[PATCH v8 20/39] btrfs: validate ->reloc_root after recording root in trans

2021-03-12 Thread Josef Bacik
If we fail to setup a ->reloc_root in a different thread that path will
error out, however it still leaves root->reloc_root NULL but would still
appear set up in the transaction.  Subsequent calls to
btrfs_record_root_in_transaction would succeed without attempting to
create the reloc root, as the transid has already been updated.  Handle
this case by making sure we have a root->reloc_root set after a
btrfs_record_root_in_transaction call so we don't end up deref'ing a
NULL pointer.

Reported-by: Zygo Blaxell 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index df26d6055cc6..4022649e2365 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2081,6 +2081,13 @@ struct btrfs_root *select_reloc_root(struct 
btrfs_trans_handle *trans,
return ERR_PTR(ret);
root = root->reloc_root;
 
+   /*
+* We could have raced with another thread which failed, so
+* ->reloc_root may not be set, return -ENOENT in this case.
+*/
+   if (!root)
+   return ERR_PTR(-ENOENT);
+
if (next->new_bytenr != root->node->start) {
/*
 * We just created the reloc root, so we shouldn't have
@@ -2578,6 +2585,14 @@ static int relocate_tree_block(struct btrfs_trans_handle 
*trans,
ret = btrfs_record_root_in_trans(trans, root);
if (ret)
goto out;
+   /*
+* Another thread could have failed, need to check if we
+* have ->reloc_root actually set.
+*/
+   if (!root->reloc_root) {
+   ret = -ENOENT;
+   goto out;
+   }
root = root->reloc_root;
node->new_bytenr = root->node->start;
btrfs_put_root(node->root);
-- 
2.26.2



[PATCH v8 21/39] btrfs: handle btrfs_update_reloc_root failure in commit_fs_roots

2021-03-12 Thread Josef Bacik
btrfs_update_reloc_root will return errors in the future, so handle
the error properly in commit_fs_roots.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/transaction.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c24ffbf0fb74..9dbbc354e5ad 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1353,7 +1353,9 @@ static noinline int commit_fs_roots(struct 
btrfs_trans_handle *trans)
spin_unlock(&fs_info->fs_roots_radix_lock);
 
btrfs_free_log(trans, root);
-   btrfs_update_reloc_root(trans, root);
+   ret2 = btrfs_update_reloc_root(trans, root);
+   if (ret2)
+   return ret2;
 
/* see comments in should_cow_block() */
clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
-- 
2.26.2



[PATCH v8 19/39] btrfs: do proper error handling in create_reloc_root

2021-03-12 Thread Josef Bacik
We do memory allocations here, read blocks from disk, all sorts of
operations that could easily fail at any given point.  Instead of
panicking the box, simply return the error back up the chain, all callers
at this point have proper error handling.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 34 --
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 9a028376f475..df26d6055cc6 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -733,10 +733,12 @@ static struct btrfs_root *create_reloc_root(struct 
btrfs_trans_handle *trans,
struct extent_buffer *eb;
struct btrfs_root_item *root_item;
struct btrfs_key root_key;
-   int ret;
+   int ret = 0;
+   bool must_abort = false;
 
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
-   BUG_ON(!root_item);
+   if (!root_item)
+   return ERR_PTR(-ENOMEM);
 
root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -748,7 +750,9 @@ static struct btrfs_root *create_reloc_root(struct 
btrfs_trans_handle *trans,
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
  BTRFS_TREE_RELOC_OBJECTID);
-   BUG_ON(ret);
+   if (ret)
+   goto fail;
+
/*
 * Set the last_snapshot field to the generation of the commit
 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
@@ -769,9 +773,16 @@ static struct btrfs_root *create_reloc_root(struct 
btrfs_trans_handle *trans,
 */
ret = btrfs_copy_root(trans, root, root->node, &eb,
  BTRFS_TREE_RELOC_OBJECTID);
-   BUG_ON(ret);
+   if (ret)
+   goto fail;
}
 
+   /*
+* We have changed references at this point, we must abort the
+* transaction if anything fails.
+*/
+   must_abort = true;
+
memcpy(root_item, &root->root_item, sizeof(*root_item));
btrfs_set_root_bytenr(root_item, eb->start);
btrfs_set_root_level(root_item, btrfs_header_level(eb));
@@ -789,14 +800,25 @@ static struct btrfs_root *create_reloc_root(struct 
btrfs_trans_handle *trans,
 
ret = btrfs_insert_root(trans, fs_info->tree_root,
&root_key, root_item);
-   BUG_ON(ret);
+   if (ret)
+   goto fail;
+
kfree(root_item);
 
reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
-   BUG_ON(IS_ERR(reloc_root));
+   if (IS_ERR(reloc_root)) {
+   ret = PTR_ERR(reloc_root);
+   goto abort;
+   }
set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
reloc_root->last_trans = trans->transid;
return reloc_root;
+fail:
+   kfree(root_item);
+abort:
+   if (must_abort)
+   btrfs_abort_transaction(trans, ret);
+   return ERR_PTR(ret);
 }
 
 /*
-- 
2.26.2



[PATCH v8 27/39] btrfs: handle btrfs_cow_block errors in replace_path

2021-03-12 Thread Josef Bacik
If we error out cow'ing the root node when doing a replace_path then we
simply unlock and free the buffer and return the error.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index dfd58fa9f189..592b2d156626 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1230,7 +1230,11 @@ int replace_path(struct btrfs_trans_handle *trans, 
struct reloc_control *rc,
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
  BTRFS_NESTING_COW);
-   BUG_ON(ret);
+   if (ret) {
+   btrfs_tree_unlock(eb);
+   free_extent_buffer(eb);
+   return ret;
+   }
}
 
if (next_key) {
@@ -1290,7 +1294,11 @@ int replace_path(struct btrfs_trans_handle *trans, 
struct reloc_control *rc,
ret = btrfs_cow_block(trans, dest, eb, parent,
  slot, &eb,
  BTRFS_NESTING_COW);
-   BUG_ON(ret);
+   if (ret) {
+   btrfs_tree_unlock(eb);
+   free_extent_buffer(eb);
+   break;
+   }
}
 
btrfs_tree_unlock(parent);
-- 
2.26.2



[PATCH v8 22/39] btrfs: change insert_dirty_subvol to return errors

2021-03-12 Thread Josef Bacik
This will be able to return errors in the future, so change it to return
an error and handle the error appropriately.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4022649e2365..2187015fc412 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1564,9 +1564,9 @@ static int find_next_key(struct btrfs_path *path, int 
level,
 /*
  * Insert current subvolume into reloc_control::dirty_subvol_roots
  */
-static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
-   struct reloc_control *rc,
-   struct btrfs_root *root)
+static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
+  struct reloc_control *rc,
+  struct btrfs_root *root)
 {
struct btrfs_root *reloc_root = root->reloc_root;
struct btrfs_root_item *reloc_root_item;
@@ -1586,6 +1586,7 @@ static void insert_dirty_subvol(struct btrfs_trans_handle 
*trans,
btrfs_grab_root(root);
list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
}
+   return 0;
 }
 
 static int clean_dirty_subvols(struct reloc_control *rc)
@@ -1787,8 +1788,11 @@ static noinline_for_stack int merge_reloc_root(struct 
reloc_control *rc,
 out:
btrfs_free_path(path);
 
-   if (ret == 0)
-   insert_dirty_subvol(trans, rc, root);
+   if (ret == 0) {
+   ret = insert_dirty_subvol(trans, rc, root);
+   if (ret)
+   btrfs_abort_transaction(trans, ret);
+   }
 
if (trans)
btrfs_end_transaction_throttle(trans);
-- 
2.26.2



[PATCH v8 23/39] btrfs: handle btrfs_update_reloc_root failure in insert_dirty_subvol

2021-03-12 Thread Josef Bacik
btrfs_update_reloc_root will return errors in the future, so handle
the error properly in insert_dirty_subvol.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 2187015fc412..e6f9304d6a29 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1570,6 +1570,7 @@ static int insert_dirty_subvol(struct btrfs_trans_handle 
*trans,
 {
struct btrfs_root *reloc_root = root->reloc_root;
struct btrfs_root_item *reloc_root_item;
+   int ret;
 
/* @root must be a subvolume tree root with a valid reloc tree */
ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
@@ -1580,7 +1581,9 @@ static int insert_dirty_subvol(struct btrfs_trans_handle 
*trans,
sizeof(reloc_root_item->drop_progress));
btrfs_set_root_drop_level(reloc_root_item, 0);
btrfs_set_root_refs(reloc_root_item, 0);
-   btrfs_update_reloc_root(trans, root);
+   ret = btrfs_update_reloc_root(trans, root);
+   if (ret)
+   return ret;
 
if (list_empty(&root->reloc_dirty_list)) {
btrfs_grab_root(root);
-- 
2.26.2



[PATCH v8 18/39] btrfs: have proper error handling in btrfs_init_reloc_root

2021-03-12 Thread Josef Bacik
create_reloc_root will return errors in the future, and __add_reloc_root
can return -ENOMEM or -EEXIST, so handle these errors properly.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 7ca7ab3d40e2..9a028376f475 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -856,9 +856,14 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
if (clear_rsv)
trans->block_rsv = rsv;
+   if (IS_ERR(reloc_root))
+   return PTR_ERR(reloc_root);
 
ret = __add_reloc_root(reloc_root);
-   BUG_ON(ret < 0);
+   if (ret) {
+   btrfs_put_root(reloc_root);
+   return ret;
+   }
root->reloc_root = btrfs_grab_root(reloc_root);
return 0;
 }
-- 
2.26.2



[PATCH v8 14/39] btrfs: handle record_root_in_trans failure in qgroup_account_snapshot

2021-03-12 Thread Josef Bacik
record_root_in_trans can fail currently, so handle this failure
properly.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/transaction.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3a3a582063b4..26c91c4eba89 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1444,7 +1444,9 @@ static int qgroup_account_snapshot(struct 
btrfs_trans_handle *trans,
 * recorded root will never be updated again, causing an outdated root
 * item.
 */
-   record_root_in_trans(trans, src, 1);
+   ret = record_root_in_trans(trans, src, 1);
+   if (ret)
+   return ret;
 
/*
 * btrfs_qgroup_inherit relies on a consistent view of the usage for the
@@ -1513,7 +1515,7 @@ static int qgroup_account_snapshot(struct 
btrfs_trans_handle *trans,
 * insert_dir_item()
 */
if (!ret)
-   record_root_in_trans(trans, parent, 1);
+   ret = record_root_in_trans(trans, parent, 1);
return ret;
 }
 
-- 
2.26.2



[PATCH v8 15/39] btrfs: handle record_root_in_trans failure in btrfs_record_root_in_trans

2021-03-12 Thread Josef Bacik
record_root_in_trans can fail currently, handle this failure properly.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/transaction.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 26c91c4eba89..7ee0199fbb95 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -487,6 +487,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle 
*trans,
   struct btrfs_root *root)
 {
struct btrfs_fs_info *fs_info = root->fs_info;
+   int ret;
 
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return 0;
@@ -501,10 +502,10 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle 
*trans,
return 0;
 
mutex_lock(&fs_info->reloc_mutex);
-   record_root_in_trans(trans, root, 0);
+   ret = record_root_in_trans(trans, root, 0);
mutex_unlock(&fs_info->reloc_mutex);
 
-   return 0;
+   return ret;
 }
 
 static inline int is_transaction_blocked(struct btrfs_transaction *trans)
-- 
2.26.2



[PATCH v8 16/39] btrfs: handle record_root_in_trans failure in create_pending_snapshot

2021-03-12 Thread Josef Bacik
record_root_in_trans can currently fail, so handle this failure
properly.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/transaction.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7ee0199fbb95..93464b93ee9d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1593,8 +1593,9 @@ static noinline int create_pending_snapshot(struct 
btrfs_trans_handle *trans,
dentry = pending->dentry;
parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
-   record_root_in_trans(trans, parent_root, 0);
-
+   ret = record_root_in_trans(trans, parent_root, 0);
+   if (ret)
+   goto fail;
cur_time = current_time(parent_inode);
 
/*
@@ -1630,7 +1631,11 @@ static noinline int create_pending_snapshot(struct 
btrfs_trans_handle *trans,
goto fail;
}
 
-   record_root_in_trans(trans, root, 0);
+   ret = record_root_in_trans(trans, root, 0);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   goto fail;
+   }
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
btrfs_check_and_init_root_item(new_root_item);
-- 
2.26.2



[PATCH v8 17/39] btrfs: return an error from btrfs_record_root_in_trans

2021-03-12 Thread Josef Bacik
We can create a reloc root when we record the root in the trans, which
can fail for all sorts of different reasons.  Propagate this error up
the chain of callers.  Future patches will fix the callers of
btrfs_record_root_in_trans() to handle the error.

Reviewed-by: Qu Wenruo 
Reviewed-by: Johannes Thumshirn 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/transaction.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 93464b93ee9d..c24ffbf0fb74 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -408,6 +408,7 @@ static int record_root_in_trans(struct btrfs_trans_handle 
*trans,
   int force)
 {
struct btrfs_fs_info *fs_info = root->fs_info;
+   int ret = 0;
 
if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
root->last_trans < trans->transid) || force) {
@@ -456,11 +457,11 @@ static int record_root_in_trans(struct btrfs_trans_handle 
*trans,
 * lock.  smp_wmb() makes sure that all the writes above are
 * done before we pop in the zero below
 */
-   btrfs_init_reloc_root(trans, root);
+   ret = btrfs_init_reloc_root(trans, root);
smp_mb__before_atomic();
clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
}
-   return 0;
+   return ret;
 }
 
 
-- 
2.26.2



[PATCH v8 13/39] btrfs: handle btrfs_record_root_in_trans failure in start_transaction

2021-03-12 Thread Josef Bacik
btrfs_record_root_in_trans will return errors in the future, so handle
the error properly in start_transaction.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/transaction.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index acff6bb49a97..3a3a582063b4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -741,7 +741,11 @@ start_transaction(struct btrfs_root *root, unsigned int 
num_items,
 * Thus it need to be called after current->journal_info initialized,
 * or we can deadlock.
 */
-   btrfs_record_root_in_trans(h, root);
+   ret = btrfs_record_root_in_trans(h, root);
+   if (ret) {
+   btrfs_end_transaction(h);
+   return ERR_PTR(ret);
+   }
 
return h;
 
-- 
2.26.2



[PATCH v8 12/39] btrfs: handle btrfs_record_root_in_trans failure in relocate_tree_block

2021-03-12 Thread Josef Bacik
btrfs_record_root_in_trans will return errors in the future, so handle
the error properly in relocate_tree_block.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 28d49b14a23a..7ca7ab3d40e2 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2548,7 +2548,9 @@ static int relocate_tree_block(struct btrfs_trans_handle 
*trans,
ret = -EUCLEAN;
goto out;
}
-   btrfs_record_root_in_trans(trans, root);
+   ret = btrfs_record_root_in_trans(trans, root);
+   if (ret)
+   goto out;
root = root->reloc_root;
node->new_bytenr = root->node->start;
btrfs_put_root(node->root);
-- 
2.26.2



[PATCH v8 11/39] btrfs: handle btrfs_record_root_in_trans failure in create_subvol

2021-03-12 Thread Josef Bacik
btrfs_record_root_in_trans will return errors in the future, so handle
the error properly in create_subvol.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/ioctl.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e8d53fea4c61..29dbb1644935 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -721,7 +721,12 @@ static noinline int create_subvol(struct inode *dir,
/* Freeing will be done in btrfs_put_root() of new_root */
anon_dev = 0;
 
-   btrfs_record_root_in_trans(trans, new_root);
+   ret = btrfs_record_root_in_trans(trans, new_root);
+   if (ret) {
+   btrfs_put_root(new_root);
+   btrfs_abort_transaction(trans, ret);
+   goto fail;
+   }
 
ret = btrfs_create_subvol_root(trans, new_root, root);
btrfs_put_root(new_root);
-- 
2.26.2



[PATCH v8 09/39] btrfs: handle btrfs_record_root_in_trans failure in btrfs_delete_subvolume

2021-03-12 Thread Josef Bacik
btrfs_record_root_in_trans will return errors in the future, so handle
the error properly in btrfs_delete_subvolume.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/inode.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c64a5e3eca47..48c953d0dde7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4317,7 +4317,11 @@ int btrfs_delete_subvolume(struct inode *dir, struct 
dentry *dentry)
goto out_end_trans;
}
 
-   btrfs_record_root_in_trans(trans, dest);
+   ret = btrfs_record_root_in_trans(trans, dest);
+   if (ret) {
+   btrfs_abort_transaction(trans, ret);
+   goto out_end_trans;
+   }
 
memset(&dest->root_item.drop_progress, 0,
sizeof(dest->root_item.drop_progress));
-- 
2.26.2



[PATCH v8 06/39] btrfs: do proper error handling in record_reloc_root_in_trans

2021-03-12 Thread Josef Bacik
Generally speaking this shouldn't ever fail, the corresponding fs root
for the reloc root will already be in memory, so we won't get -ENOMEM
here.

However if there is no corresponding root for the reloc root then we
could get -ENOMEM when we try to allocate it or we could get -ENOENT
when we look it up and see that it doesn't exist.

Convert these BUG_ON()'s into ASSERT()'s + proper error handling for the
case of corruption.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 23 +--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b70a12a7..28d49b14a23a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1971,8 +1971,27 @@ static int record_reloc_root_in_trans(struct 
btrfs_trans_handle *trans,
return 0;
 
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
-   BUG_ON(IS_ERR(root));
-   BUG_ON(root->reloc_root != reloc_root);
+
+   /*
+* This should succeed, since we can't have a reloc root without having
+* already looked up the actual root and created the reloc root for this
+* root.
+*
+* However if there's some sort of corruption where we have a ref to a
+* reloc root without a corresponding root this could return -ENOENT.
+*/
+   if (IS_ERR(root)) {
+   ASSERT(0);
+   return PTR_ERR(root);
+   }
+   if (root->reloc_root != reloc_root) {
+   ASSERT(0);
+   btrfs_err(fs_info,
+ "root %llu has two reloc roots associated with it",
+ reloc_root->root_key.offset);
+   btrfs_put_root(root);
+   return -EUCLEAN;
+   }
ret = btrfs_record_root_in_trans(trans, root);
btrfs_put_root(root);
 
-- 
2.26.2



[PATCH v8 10/39] btrfs: handle btrfs_record_root_in_trans failure in btrfs_recover_log_trees

2021-03-12 Thread Josef Bacik
btrfs_record_root_in_trans will return errors in the future, so handle
the error properly in btrfs_recover_log_trees.

This appears tricky, however we have a reference count on the
destination root, so if this fails we need to continue on in the loop to
make sure the properly cleanup is done.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/tree-log.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2f1acc9aea9e..941b4dd96dd0 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -6278,8 +6278,12 @@ int btrfs_recover_log_trees(struct btrfs_root 
*log_root_tree)
}
 
wc.replay_dest->log_root = log;
-   btrfs_record_root_in_trans(trans, wc.replay_dest);
-   ret = walk_log_tree(trans, log, &wc);
+   ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
+   if (ret)
+   btrfs_handle_fs_error(fs_info, ret,
+   "Couldn't record the root in the transaction.");
+   else
+   ret = walk_log_tree(trans, log, &wc);
 
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
-- 
2.26.2



[PATCH v8 08/39] btrfs: handle btrfs_record_root_in_trans failure in btrfs_rename

2021-03-12 Thread Josef Bacik
btrfs_record_root_in_trans will return errors in the future, so handle
the error properly in btrfs_rename.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/inode.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a9fde9a22fff..c64a5e3eca47 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9410,8 +9410,11 @@ static int btrfs_rename(struct inode *old_dir, struct 
dentry *old_dentry,
goto out_notrans;
}
 
-   if (dest != root)
-   btrfs_record_root_in_trans(trans, dest);
+   if (dest != root) {
+   ret = btrfs_record_root_in_trans(trans, dest);
+   if (ret)
+   goto out_fail;
+   }
 
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
if (ret)
-- 
2.26.2



[PATCH v8 05/39] btrfs: check record_root_in_trans related failures in select_reloc_root

2021-03-12 Thread Josef Bacik
We will record the fs root or the reloc root in the trans in
select_reloc_root.  These will actually return errors in the following
patches, so check their return value here and return it up the stack.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0f2ecf61696e..b70a12a7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1988,6 +1988,7 @@ struct btrfs_root *select_reloc_root(struct 
btrfs_trans_handle *trans,
struct btrfs_backref_node *next;
struct btrfs_root *root;
int index = 0;
+   int ret;
 
next = node;
while (1) {
@@ -2023,11 +2024,15 @@ struct btrfs_root *select_reloc_root(struct 
btrfs_trans_handle *trans,
}
 
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
-   record_reloc_root_in_trans(trans, root);
+   ret = record_reloc_root_in_trans(trans, root);
+   if (ret)
+   return ERR_PTR(ret);
break;
}
 
-   btrfs_record_root_in_trans(trans, root);
+   ret = btrfs_record_root_in_trans(trans, root);
+   if (ret)
+   return ERR_PTR(ret);
root = root->reloc_root;
 
if (next->new_bytenr != root->node->start) {
-- 
2.26.2



[PATCH v8 07/39] btrfs: handle btrfs_record_root_in_trans failure in btrfs_rename_exchange

2021-03-12 Thread Josef Bacik
btrfs_record_root_in_trans will return errors in the future, so handle
the error properly in btrfs_rename_exchange.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/inode.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 52dc5f52ea58..a9fde9a22fff 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9102,8 +9102,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
goto out_notrans;
}
 
-   if (dest != root)
-   btrfs_record_root_in_trans(trans, dest);
+   if (dest != root) {
+   ret = btrfs_record_root_in_trans(trans, dest);
+   if (ret)
+   goto out_fail;
+   }
 
/*
 * We need to find a free sequence number both in the source and
-- 
2.26.2



[PATCH v8 02/39] btrfs: convert BUG_ON()'s in relocate_tree_block

2021-03-12 Thread Josef Bacik
We have a couple of BUG_ON()'s in relocate_tree_block() that can be
tripped if we have file system corruption.  Convert these to ASSERT()'s
so developers still get yelled at when they break the backref code, but
error out nicely for users so the whole box doesn't go down.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 24 ++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 63e416ac79ae..1f371a878831 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2454,8 +2454,28 @@ static int relocate_tree_block(struct btrfs_trans_handle 
*trans,
 
if (root) {
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
-   BUG_ON(node->new_bytenr);
-   BUG_ON(!list_empty(&node->list));
+   /*
+* This block was the root block of a root, and this is
+* the first time we're processing the block and thus it
+* should not have had the ->new_bytenr modified and
+* should have not been included on the changed list.
+*
+* However in the case of corruption we could have
+* multiple refs pointing to the same block improperly,
+* and thus we would trip over these checks.  ASSERT()
+* for the developer case, because it could indicate a
+* bug in the backref code, however error out for a
+* normal user in the case of corruption.
+*/
+   ASSERT(node->new_bytenr == 0);
+   ASSERT(list_empty(&node->list));
+   if (node->new_bytenr || !list_empty(&node->list)) {
+   btrfs_err(root->fs_info,
+ "bytenr %llu has improper references to it",
+ node->bytenr);
+   ret = -EUCLEAN;
+   goto out;
+   }
btrfs_record_root_in_trans(trans, root);
root = root->reloc_root;
node->new_bytenr = root->node->start;
-- 
2.26.2



[PATCH v8 03/39] btrfs: handle errors from select_reloc_root()

2021-03-12 Thread Josef Bacik
Currently select_reloc_root() doesn't return an error, but followup
patches will make it possible for it to return an error.  We do have
proper error recovery in do_relocation however, so handle the
possibility of select_reloc_root() having an error properly instead of
BUG_ON(!root).  I've also adjusted select_reloc_root() to return
ERR_PTR(-ENOENT) if we don't find a root, instead of NULL, to make the
error case easier to deal with.  I've replaced the BUG_ON(!root) with an
ASSERT(0) for this case as it indicates we messed up the backref walking
code, but it could also indicate corruption.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 15 ---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 1f371a878831..097fea09e2d2 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2024,8 +2024,14 @@ struct btrfs_root *select_reloc_root(struct 
btrfs_trans_handle *trans,
if (!next || next->level <= node->level)
break;
}
-   if (!root)
-   return NULL;
+   if (!root) {
+   /*
+* This can happen if there's fs corruption or if there's a bug
+* in the backref lookup code.
+*/
+   ASSERT(0);
+   return ERR_PTR(-ENOENT);
+   }
 
next = node;
/* setup backref node path for btrfs_reloc_cow_block */
@@ -2196,7 +2202,10 @@ static int do_relocation(struct btrfs_trans_handle 
*trans,
 
upper = edge->node[UPPER];
root = select_reloc_root(trans, rc, upper, edges);
-   BUG_ON(!root);
+   if (IS_ERR(root)) {
+   ret = PTR_ERR(root);
+   goto next;
+   }
 
if (upper->eb && !upper->locked) {
if (!lowest) {
-- 
2.26.2



[PATCH v8 04/39] btrfs: convert BUG_ON()'s in select_reloc_root() to proper errors

2021-03-12 Thread Josef Bacik
We have several BUG_ON()'s in select_reloc_root() that can be tripped if
you have extent tree corruption.  Convert these to ASSERT()'s, because
if we hit it during testing it really is bad, or could indicate a
problem with the backref walking code.

However if users hit these problems it generally indicates corruption,
I've hit a few machines in the fleet that trip over these with clearly
corrupted extent trees, so be nice and spit out an error message and
return an error instead of bringing the whole box down.

Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 47 +++
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 097fea09e2d2..0f2ecf61696e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1994,8 +1994,33 @@ struct btrfs_root *select_reloc_root(struct 
btrfs_trans_handle *trans,
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
-   BUG_ON(!root);
-   BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state));
+
+   /*
+* If there is no root, then our references for this block are
+* incomplete, as we should be able to walk all the way up to a
+* block that is owned by a root.
+*
+* This path is only for SHAREABLE roots, so if we come upon a
+* non-SHAREABLE root then we have backrefs that resolve
+* improperly.
+*
+* Both of these cases indicate file system corruption, or a bug
+* in the backref walking code.
+*/
+   if (!root) {
+   ASSERT(0);
+   btrfs_err(trans->fs_info,
+   "bytenr %llu doesn't have a backref path ending in a root",
+ node->bytenr);
+   return ERR_PTR(-EUCLEAN);
+   }
+   if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+   ASSERT(0);
+   btrfs_err(trans->fs_info,
+"bytenr %llu has multiple refs with one ending in a non shareable root",
+ node->bytenr);
+   return ERR_PTR(-EUCLEAN);
+   }
 
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
record_reloc_root_in_trans(trans, root);
@@ -2006,8 +2031,22 @@ struct btrfs_root *select_reloc_root(struct 
btrfs_trans_handle *trans,
root = root->reloc_root;
 
if (next->new_bytenr != root->node->start) {
-   BUG_ON(next->new_bytenr);
-   BUG_ON(!list_empty(&next->list));
+   /*
+* We just created the reloc root, so we shouldn't have
+* ->new_bytenr set and this shouldn't be in the changed
+*  list.  If it is then we have multiple roots pointing
+*  at the same bytenr which indicates corruption, or
+*  we've made a mistake in the backref walking code.
+*/
+   ASSERT(next->new_bytenr == 0);
+   ASSERT(list_empty(&next->list));
+   if (next->new_bytenr || !list_empty(&next->list)) {
+   btrfs_err(trans->fs_info,
+"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
+ node->bytenr, next->bytenr);
+   return ERR_PTR(-EUCLEAN);
+   }
+
next->new_bytenr = root->node->start;
btrfs_put_root(next->root);
next->root = btrfs_grab_root(root);
-- 
2.26.2



[PATCH v8 01/39] btrfs: convert some BUG_ON()'s to ASSERT()'s in do_relocation

2021-03-12 Thread Josef Bacik
A few of these are checking for correctness, and won't be triggered by
corrupted file systems, so convert them to ASSERT() instead of BUG_ON()
and add a comment explaining their existence.

Reviewed-by: Qu Wenruo 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/relocation.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 232d5da7b7be..63e416ac79ae 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2181,7 +2181,11 @@ static int do_relocation(struct btrfs_trans_handle 
*trans,
int slot;
int ret = 0;
 
-   BUG_ON(lowest && node->eb);
+   /*
+* If we are lowest then this is the first time we're processing this
+* block, and thus shouldn't have an eb associated with it yet.
+*/
+   ASSERT(!lowest || !node->eb);
 
path->lowest_level = node->level + 1;
rc->backref_cache.path[node->level] = node;
@@ -2266,7 +2270,11 @@ static int do_relocation(struct btrfs_trans_handle 
*trans,
free_extent_buffer(eb);
if (ret < 0)
goto next;
-   BUG_ON(node->eb != eb);
+   /*
+* We've just cow'ed this block, it should have updated
+* the correct backref node entry.
+*/
+   ASSERT(node->eb == eb);
} else {
btrfs_set_node_blockptr(upper->eb, slot,
node->eb->start);
@@ -2302,7 +2310,12 @@ static int do_relocation(struct btrfs_trans_handle 
*trans,
}
 
path->lowest_level = 0;
-   BUG_ON(ret == -ENOSPC);
+
+   /*
+* We should have allocated all of our space in the block rsv and thus
+* shouldn't ENOSPC.
+*/
+   ASSERT(ret != -ENOSPC);
return ret;
 }
 
-- 
2.26.2



[PATCH v8 00/39] Cleanup error handling in relocation

2021-03-12 Thread Josef Bacik
v7->v8:
- Reordered some of the patches, so that the callers of functions that added new
  error cases were fixed first, and then added the new error handler.
- Moved a few of the ASSERT(0) to where they made sense.  Left the others that
  are in functions that can have multiple cases of the same error that indicates
  a logic error.

v6->v7:
- Broke up the series into 3 series, 1 for cosmetic things, 1 for all the major
  issues (including those reported on v6 of this set), and this new set which is
  solely the error handling related patches for relocation.  It's still a lot of
  patches, sorry about that.

v5->v6:
- Reworked "btrfs: handle errors from select_reloc_root()" because Zygo reported
  hitting an ASSERT(ret != -ENOENT) during his testing.  This was because I
  changed select_reloc_root() to return -ENOENT if we happened to race with
  somebody else who failed to init the reloc root, however we had an ASSERT() to
  check for this because it indicated corruption.  I modified that patch to move
  the ASSERT() to where the problem actually is, so select_reloc_root() can
  return whatever error and it'll pass it along.  I also removed Qu's
  reviewed-by for the patch because of the change.

v4->v5:
- Dropped "btrfs: fix error handling in commit_fs_roots" as it was merged.
- Fixed an ASSERT() that happened during relocation recovery that Zygo reported,
  I moved the error condition out of another condition which broke recovery if
  we had deleted subvols pending with relocation.

v3->v4:
- Squashed the __add_reloc_root error handling patches in
  btrfs_recover_relocation as they were small and in the same function.
- Squashed the record_root_in_trans failure handling patches for
  select_reloc_root as they were small and in the same function.
- Added a new patch to address an existing error handling problem with subvol
  creation.
- Fixed up the various cases that Qu noticed where I got things wrong, cleaning
  up a leaked root extent ref, a leaked inode item, and where I accidentally
  stopped dealing with errors from btrfs_drop_subtree.
- Reworked a bunch of the ASSERT()'s to do ASSERT(0) in their respective if
  statements.
- Added reviewed-bys.

v2->v3:
- A lot of extra patches fixing various things that I encountered while
  debugging the corruption problem that was uncovered by these patches.
- Fixed the panic that Zygo was seeing and other issues.
- Fixed up the comments from Nikolay and Filipe.

A slight note, the first set of patches could probably be taken now, and in fact

  btrfs: fix error handling in commit_fs_roots

Was sent earlier this week and is very important and needs to be reviewed and
merged ASAP.  The following are safe and could be merged outside of the rest of
this series

  btrfs: allow error injection for btrfs_search_slot and btrfs_cow_block
  btrfs: fix lockdep splat in btrfs_recover_relocation
  btrfs: keep track of the root owner for relocation reads
  btrfs: noinline btrfs_should_cancel_balance
  btrfs: do not cleanup upper nodes in btrfs_backref_cleanup_node
  btrfs: pass down the tree block level through ref-verify
  btrfs: make sure owner is set in ref-verify
  btrfs: don't clear ret in btrfs_start_dirty_block_groups

The rest obviously are all around the actual error handling.

v1->v2:
- fixed a bug where I accidentally dropped reading flags in relocate_block_group
  when I dropped the extra checks that we handle in the tree checker.

--- Original message ---
Hello,

Relocation is the last place that is not able to handle errors at all, which
results in all sorts of lovely panics if you encounter corruptions or IO errors.
I'm going to start cleaning up relocation, but before I move code around I want
the error handling to be somewhat sane, so I'm not changing behavior and error
handling at the same time.

These patches are purely about error handling, there is no behavior changing
other than returning errors up the chain properly.  There is a lot of room for
follow up cleanups, which will happen next.  However I wanted to get this series
done today and out so we could get it merged ASAP, and then the follow up
cleanups can happen later as they are less important and less critical.

The only exception to the above is the patch to add the error injection sites
for btrfs_cow_block and btrfs_search_slot, and a lockdep fix that I discovered
while running my tests, those are the first two patches in the series.

I tested this with my error injection stress test, where I keep track of all
stack traces that have been tested and only inject errors when we have a new
stack trace, which means I should have covered all of the various error
conditions.  With this patchset I'm no longer panicing while stressing the error
conditions.  Thanks,

Josef

Josef Bacik (39):
  btrfs: convert some BUG_ON()'s to ASSERT()'s in do_relocation
  btrfs: convert BUG_ON()'s in relocate_tree_block

Re: [PATCH v7 03/38] btrfs: handle errors from select_reloc_root()

2021-03-11 Thread Josef Bacik

On 2/26/21 1:30 PM, David Sterba wrote:

On Wed, Dec 16, 2020 at 11:26:19AM -0500, Josef Bacik wrote:

Currently select_reloc_root() doesn't return an error, but followup
patches will make it possible for it to return an error.  We do have
proper error recovery in do_relocation however, so handle the
possibility of select_reloc_root() having an error properly instead of
BUG_ON(!root).  I've also adjusted select_reloc_root() to return
ERR_PTR(-ENOENT) if we don't find a root, instead of NULL, to make the
error case easier to deal with.  I've replaced the BUG_ON(!root) with an
ASSERT(0) for this case as it indicates we messed up the backref walking
code, but it could also indicate corruption.

Signed-off-by: Josef Bacik 
---
  fs/btrfs/relocation.c | 15 ---
  1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 08ffaa93b78f..741068580455 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2024,8 +2024,14 @@ struct btrfs_root *select_reloc_root(struct 
btrfs_trans_handle *trans,
if (!next || next->level <= node->level)
break;
}
-   if (!root)
-   return NULL;
+   if (!root) {
+   /*
+* This can happen if there's fs corruption or if there's a bug
+* in the backref lookup code.
+*/
+   ASSERT(0);


You've added these assert(0) to several patches and I think it's wrong.
The asserts are supposed to verify invariants, you can hardly say that
we're expecting 0 to be true, so the construct serves as "please crash
here", which is no better than BUG().  It's been spreading, there are
like 25 now.


They are much better than a BUG_ON(), as they won't trip in production, they'll 
only trip for developers.  And we need them in many of these cases, and this 
example you've called out is a clear example of where we absolutely want to 
differentiate, because we have 3 different failure modes that will return 
-EUCLEAN.  If I add a ASSERT(ret != -EUCLEAN) to all of the callers then when 
the developer hits a logic bug they'll have to go and manually add in their own 
assert to figure out which error condition failed.  Instead I've added 
explanations in the comments for each assert and then have an assert for every 
error condition so that it's clear where things went wrong.


So these ASSERT()'s are staying.  I could be convinced to change them to match 
their error condition, so in the above case instead I would be fine changing it to


if (!root) {
ASSERT(root);
...

If that's more acceptable let me know, I'll change them to match their error 
condition.  Thanks,


Josef


[PATCH 1/3] btrfs: init devices always

2021-03-11 Thread Josef Bacik
Neal reported a panic trying to use -o rescue=all

BUG: kernel NULL pointer dereference, address: 0030
PGD 0 P4D 0
Oops:  [#1] SMP NOPTI
CPU: 0 PID: 696 Comm: mount Tainted: GW 5.12.0-rc2+ #296
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 
04/01/2014
RIP: 0010:btrfs_device_init_dev_stats+0x1d/0x200
RSP: 0018:afaec1483bb8 EFLAGS: 00010286
RAX:  RBX: 9a5715bcb298 RCX: 0070
RDX: 9a5703248000 RSI: 9a57052ea150 RDI: 9a5715bca400
RBP: 9a57052ea150 R08: 0070 R09: 9a57052ea150
R10: 000130faf0741c10 R11:  R12: 9a570370
R13:  R14: 9a5715bcb278 R15: 9a57052ea150
FS:  7f600d122c40() GS:9a577bc0() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 0030 CR3: 000112a46005 CR4: 00370ef0
Call Trace:
 ? btrfs_init_dev_stats+0x1f/0xf0
 ? kmem_cache_alloc+0xef/0x1f0
 btrfs_init_dev_stats+0x5f/0xf0
 open_ctree+0x10cb/0x1720
 btrfs_mount_root.cold+0x12/0xea
 legacy_get_tree+0x27/0x40
 vfs_get_tree+0x25/0xb0
 vfs_kern_mount.part.0+0x71/0xb0
 btrfs_mount+0x10d/0x380
 legacy_get_tree+0x27/0x40
 vfs_get_tree+0x25/0xb0
 path_mount+0x433/0xa00
 __x64_sys_mount+0xe3/0x120
 do_syscall_64+0x33/0x40
 entry_SYSCALL_64_after_hwframe+0x44/0xae

This happens because when we call btrfs_init_dev_stats we do
device->fs_info->dev_root.  However device->fs_info isn't init'ed
because we were only calling btrfs_init_devices_late() if we properly
read the device root.  However we don't actually need the device root to
init the devices, this function simply assigns the devices their
->fs_info pointer properly, so this needs to be done unconditionally
so that we can properly deref device->fs_info in rescue cases.

Reported-by: Neal Gompa 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/disk-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 41b718cfea40..63656bf23ff2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2387,8 +2387,8 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->dev_root = root;
-   btrfs_init_devices_late(fs_info);
}
+   btrfs_init_devices_late(fs_info);
 
/* If IGNOREDATACSUMS is set don't bother reading the csum root. */
if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
-- 
2.26.2



[PATCH 2/3] btrfs: do not init dev stats if we have no dev_root

2021-03-11 Thread Josef Bacik
Neal reported a panic trying to use -o rescue=all

BUG: kernel NULL pointer dereference, address: 0030
PGD 0 P4D 0
Oops:  [#1] SMP PTI
CPU: 0 PID: 4095 Comm: mount Not tainted 5.11.0-0.rc7.149.fc34.x86_64 #1
RIP: 0010:btrfs_device_init_dev_stats+0x4c/0x1f0
RSP: 0018:a60285fbfb68 EFLAGS: 00010246
RAX:  RBX: 88b88f806498 RCX: 88b82e7a2a10
RDX: a60285fbfb97 RSI: 88b82e7a2a10 RDI: 
RBP: 88b88f806b3c R08:  R09: 
R10: 88b82e7a2a10 R11:  R12: 88b88f806a00
R13: 88b88f806478 R14: 88b88f806a00 R15: 88b82e7a2a10
FS:  7f698be1ec40() GS:88b937e0() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 0030 CR3: 92c9c006 CR4: 003706f0
Call Trace:
? btrfs_init_dev_stats+0x1f/0xf0
btrfs_init_dev_stats+0x62/0xf0
open_ctree+0x1019/0x15ff
btrfs_mount_root.cold+0x13/0xfa
legacy_get_tree+0x27/0x40
vfs_get_tree+0x25/0xb0
vfs_kern_mount.part.0+0x71/0xb0
btrfs_mount+0x131/0x3d0
? legacy_get_tree+0x27/0x40
? btrfs_show_options+0x640/0x640
legacy_get_tree+0x27/0x40
vfs_get_tree+0x25/0xb0
path_mount+0x441/0xa80
__x64_sys_mount+0xf4/0x130
do_syscall_64+0x33/0x40
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f698c04e52e

This happens because we unconditionally attempt to init device stats on
mount, but we may not have been able to read the device root.  Fix this
by skipping init'ing the device stats if we do not have a device root.

Reported-by: Neal Gompa 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/volumes.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 995920fcce9b..d4ca721c1d91 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7448,6 +7448,9 @@ static int btrfs_device_init_dev_stats(struct 
btrfs_device *device,
int item_size;
int i, ret, slot;
 
+   if (!device->fs_info->dev_root)
+   return 0;
+
key.objectid = BTRFS_DEV_STATS_OBJECTID;
key.type = BTRFS_PERSISTENT_ITEM_KEY;
key.offset = device->devid;
-- 
2.26.2



[PATCH 3/3] btrfs: don't init dev replace for bad dev root

2021-03-11 Thread Josef Bacik
While helping Neal fix his broken file system I added a debug patch to
catch if we were calling btrfs_search_slot with a NULL root, and this
stack trace popped

we tried to search with a NULL root
CPU: 0 PID: 1760 Comm: mount Not tainted 5.11.0-155.nealbtrfstest.1.fc34.x86_64 
#1
Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference 
Platform, BIOS 6.00 07/22/2020
Call Trace:
 dump_stack+0x6b/0x83
 btrfs_search_slot.cold+0x11/0x1b
 ? btrfs_init_dev_replace+0x36/0x450
 btrfs_init_dev_replace+0x71/0x450
 open_ctree+0x1054/0x1610
 btrfs_mount_root.cold+0x13/0xfa
 legacy_get_tree+0x27/0x40
 vfs_get_tree+0x25/0xb0
 vfs_kern_mount.part.0+0x71/0xb0
 btrfs_mount+0x131/0x3d0
 ? legacy_get_tree+0x27/0x40
 ? btrfs_show_options+0x640/0x640
 legacy_get_tree+0x27/0x40
 vfs_get_tree+0x25/0xb0
 path_mount+0x441/0xa80
 __x64_sys_mount+0xf4/0x130
 do_syscall_64+0x33/0x40
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f644730352e

Fix this by not starting the device replace stuff if we have a
NULL dev root.

Reported-by: Neal Gompa 
Signed-off-by: Josef Bacik 
---
 fs/btrfs/dev-replace.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 3a9c1e046ebe..d05f73530af7 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -81,6 +81,9 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
struct btrfs_dev_replace_item *ptr;
u64 src_devid;
 
+   if (!dev_root)
+   return 0;
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
-- 
2.26.2



[PATCH 0/3] Handle bad dev_root properly with rescue=all

2021-03-11 Thread Josef Bacik
Hello,

My recent debugging session with Neal's broken filesystem uncovered a glaring
hole in my rescue=all patches, they don't deal with a NULL dev_root properly.
In testing I only ever tested corrupting the extent tree or the csum tree, since
those are the most problematic.  The following 3 fixes allowed Neal to get
rescue=all working without panicing the machine, and I verified everything by
using btrfs-corrupt-block to corrupt a dev root of a file system.  Thanks,

Josef

Josef Bacik (3):
  btrfs: init devices always
  btrfs: do not init dev stats if we have no dev_root
  btrfs: don't init dev replace for bad dev root

 fs/btrfs/dev-replace.c | 3 +++
 fs/btrfs/disk-io.c | 2 +-
 fs/btrfs/volumes.c | 3 +++
 3 files changed, 7 insertions(+), 1 deletion(-)

-- 
2.26.2



Re: Recovering Btrfs from a freak failure of the disk controller

2021-03-09 Thread Josef Bacik

On 3/9/21 4:06 PM, Neal Gompa wrote:

On Tue, Mar 9, 2021 at 2:04 PM Josef Bacik  wrote:


On 3/8/21 8:12 PM, Neal Gompa wrote:

On Mon, Mar 8, 2021 at 5:04 PM Josef Bacik  wrote:


On 3/8/21 3:01 PM, Neal Gompa wrote:

On Mon, Mar 8, 2021 at 1:38 PM Josef Bacik  wrote:


On 3/5/21 8:03 PM, Neal Gompa wrote:

On Fri, Mar 5, 2021 at 5:01 PM Josef Bacik  wrote:


On 3/5/21 9:41 AM, Neal Gompa wrote:

On Fri, Mar 5, 2021 at 9:12 AM Josef Bacik  wrote:


On 3/4/21 6:54 PM, Neal Gompa wrote:

On Thu, Mar 4, 2021 at 3:25 PM Josef Bacik  wrote:


On 3/3/21 2:38 PM, Neal Gompa wrote:

On Wed, Mar 3, 2021 at 1:42 PM Josef Bacik  wrote:


On 2/24/21 10:47 PM, Neal Gompa wrote:

On Wed, Feb 24, 2021 at 10:44 AM Josef Bacik  wrote:


On 2/24/21 9:23 AM, Neal Gompa wrote:

On Tue, Feb 23, 2021 at 10:05 AM Josef Bacik  wrote:


On 2/22/21 11:03 PM, Neal Gompa wrote:

On Mon, Feb 22, 2021 at 2:34 PM Josef Bacik  wrote:


On 2/21/21 1:27 PM, Neal Gompa wrote:

On Wed, Feb 17, 2021 at 11:44 AM Josef Bacik  wrote:


On 2/17/21 11:29 AM, Neal Gompa wrote:

On Wed, Feb 17, 2021 at 9:59 AM Josef Bacik  wrote:


On 2/17/21 9:50 AM, Neal Gompa wrote:

On Wed, Feb 17, 2021 at 9:36 AM Josef Bacik  wrote:


On 2/16/21 9:05 PM, Neal Gompa wrote:

On Tue, Feb 16, 2021 at 4:24 PM Josef Bacik  wrote:


On 2/16/21 3:29 PM, Neal Gompa wrote:

On Tue, Feb 16, 2021 at 1:11 PM Josef Bacik  wrote:


On 2/16/21 11:27 AM, Neal Gompa wrote:

On Tue, Feb 16, 2021 at 10:19 AM Josef Bacik  wrote:


On 2/14/21 3:25 PM, Neal Gompa wrote:

Hey all,

So one of my main computers recently had a disk controller failure
that caused my machine to freeze. After rebooting, Btrfs refuses to
mount. I tried to do a mount and the following errors show up in the
journal:


Feb 14 15:20:49 localhost-live kernel: BTRFS info (device sda3): disk space 
caching is enabled
Feb 14 15:20:49 localhost-live kernel: BTRFS info (device sda3): has skinny 
extents
Feb 14 15:20:49 localhost-live kernel: BTRFS critical (device sda3): corrupt 
leaf: root=401 block=796082176 slot=15 ino=203657, invalid inode transid: has 
96 expect [0, 95]
Feb 14 15:20:49 localhost-live kernel: BTRFS error (device sda3): 
block=796082176 read time tree block corruption detected
Feb 14 15:20:49 localhost-live kernel: BTRFS critical (device sda3): corrupt 
leaf: root=401 block=796082176 slot=15 ino=203657, invalid inode transid: has 
96 expect [0, 95]
Feb 14 15:20:49 localhost-live kernel: BTRFS error (device sda3): 
block=796082176 read time tree block corruption detected
Feb 14 15:20:49 localhost-live kernel: BTRFS warning (device sda3): couldn't 
read tree root
Feb 14 15:20:49 localhost-live kernel: BTRFS error (device sda3): open_ctree 
failed


I've tried to do -o recovery,ro mount and get the same issue. I can't
seem to find any reasonably good information on how to do recovery in
this scenario, even to just recover enough to copy data off.

I'm on Fedora 33, the system was on Linux kernel version 5.9.16 and
the Fedora 33 live ISO I'm using has Linux kernel version 5.10.14. I'm
using btrfs-progs v5.10.

Can anyone help?


Can you try

btrfs check --clear-space-cache v1 /dev/whatever

That should fix the inode generation thing so it's sane, and then the tree
checker will allow the fs to be read, hopefully.  If not we can work out some
other magic.  Thanks,

Josef


I got the same error as I did with btrfs-check --readonly...



Oh lovely, what does btrfs check --readonly --backup do?



No dice...

# btrfs check --readonly --backup /dev/sda3

Opening filesystem to check...
parent transid verify failed on 791281664 wanted 93 found 95
parent transid verify failed on 791281664 wanted 93 found 95
parent transid verify failed on 791281664 wanted 93 found 95


Hey look the block we're looking for, I wrote you some magic, just pull

https://github.com/josefbacik/btrfs-progs/tree/for-neal

build, and then run

btrfs-neal-magic /dev/sda3 791281664 95

This will force us to point at the old root with (hopefully) the right bytenr
and gen, and then hopefully you'll be able to recover from there.  This is kind
of saucy, so yolo, but I can undo it if it makes things worse.  Thanks,



# btrfs check --readonly /dev/sda3

Opening filesystem to check...
ERROR: could not setup extent tree
ERROR: cannot open file system

# btrfs check --clear-space-cache v1 /dev/sda3

Opening filesystem to check...
ERROR: could not setup extent tree
ERROR: cannot open file system


It's better, but still no dice... :(




Hmm it's not telling us what's wrong with the extent tree, which is annoying.
Does mount -o rescue=all,ro work now that the root tree is normal?  Thanks,



Nope, I see this in the journal:


Feb 17 09:49:40 localhost-live kernel: BTRFS info (device sda3): enabling all 
of the rescue options
Feb 17 09:49:40 localhost-live kernel: BTRFS info (device sda3): ignoring data 
csums
Feb 17 09:49:40 localhost-live 

<    1   2   3   4   5   6   7   8   9   10   >