[PATCH 02/35] fscrypt: add per-extent encryption support
This adds the code necessary for per-extent encryption. We will store a nonce for every extent we create, and then use the inode's policy and the extents nonce to derive a per-extent key. This is meant to be flexible, if we choose to expand the on-disk extent information in the future we have a version number we can use to change what exists on disk. The file system indicates it wants to use per-extent encryption by setting s_cop->set_extent_context. This also requires the use of inline block encryption. The support is relatively straightforward, the only "extra" bit is we're deriving a per-extent key to use for the encryption, the inode still controls the policy and access to the master key. Signed-off-by: Josef Bacik --- fs/crypto/crypto.c | 10 ++- fs/crypto/fscrypt_private.h | 45 +++ fs/crypto/inline_crypt.c| 84 fs/crypto/keysetup.c| 152 fs/crypto/policy.c | 47 +++ include/linux/fscrypt.h | 66 6 files changed, 403 insertions(+), 1 deletion(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2b2bb5740322..e3ba06cb4082 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -40,6 +40,7 @@ static struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); struct kmem_cache *fscrypt_inode_info_cachep; +struct kmem_cache *fscrypt_extent_info_cachep; void fscrypt_enqueue_decrypt_work(struct work_struct *work) { @@ -396,12 +397,19 @@ static int __init fscrypt_init(void) if (!fscrypt_inode_info_cachep) goto fail_free_queue; + fscrypt_extent_info_cachep = KMEM_CACHE(fscrypt_extent_info, + SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_extent_info_cachep) + goto fail_free_info; + err = fscrypt_init_keyring(); if (err) - goto fail_free_info; + goto fail_free_extent_info; return 0; +fail_free_extent_info: + kmem_cache_destroy(fscrypt_extent_info_cachep); fail_free_info: kmem_cache_destroy(fscrypt_inode_info_cachep); fail_free_queue: diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index b982073d1fe2..a7e902813cde 
100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -30,6 +30,8 @@ #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 +#define FSCRYPT_EXTENT_CONTEXT_V1 1 + /* Keep this in sync with include/uapi/linux/fscrypt.h */ #define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2 @@ -52,6 +54,28 @@ struct fscrypt_context_v2 { u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; +/* + * fscrypt_extent_context - the encryption context of an extent + * + * This is the on-disk information stored for an extent. The policy and + * relevante information is stored in the inode, the per-extent information is + * simply the nonce that's used in as KDF input in conjunction with the inode + * context to derive a per-extent key for encryption. + * + * At this point the master_key_identifier exists only for possible future + * expansion. This will allow for an inode to have extents with multiple master + * keys, although sharing the same encryption mode. This would be for re-keying + * or for reflinking between two differently encrypted inodes. For now the + * master_key_descriptor must match the inode's, and we'll be using the inode's + * for all key derivation. + */ +struct fscrypt_extent_context { + u8 version; /* FSCRYPT_EXTENT_CONTEXT_V2 */ + u8 encryption_mode; + u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; + u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; +}; + /* * fscrypt_context - the encryption context of an inode * @@ -260,6 +284,25 @@ struct fscrypt_inode_info { u32 ci_hashed_ino; }; +/* + * fscrypt_extent_info - the "encryption key" for an extent. + * + * This contains the dervied key for the given extent and the nonce for the + * extent. + */ +struct fscrypt_extent_info { + refcount_t refs; + + /* The derived key for this extent. */ + struct fscrypt_prepared_key prep_key; + + /* The super block that this extent belongs to. 
*/ + struct super_block *sb; + + /* This is the extents nonce, loaded from the fscrypt_extent_context */ + u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; +}; + typedef enum { FS_DECRYPT = 0, FS_ENCRYPT, @@ -267,6 +310,7 @@ typedef enum { /* crypto.c */ extern struct kmem_cache *fscrypt_inode_info_cachep; +extern struct kmem_cache *fscrypt_extent_info_cachep; int fscrypt_initialize(struct super_block *sb); int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, @@ -326,6 +370,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_k
[PATCH 01/35] fscrypt: rename fscrypt_info => fscrypt_inode_info
We are going to track per-extent information, so it'll be necessary to distinguish between inode infos and extent infos. Rename fscrypt_info to fscrypt_inode_info, adjusting any lines that now exceed 80 characters. Signed-off-by: Josef Bacik --- fs/crypto/crypto.c | 13 ++-- fs/crypto/fname.c | 6 +++--- fs/crypto/fscrypt_private.h | 42 - fs/crypto/hooks.c | 2 +- fs/crypto/inline_crypt.c| 13 ++-- fs/crypto/keyring.c | 4 ++-- fs/crypto/keysetup.c| 38 + fs/crypto/keysetup_v1.c | 14 +++-- fs/crypto/policy.c | 11 +- include/linux/fs.h | 4 ++-- include/linux/fscrypt.h | 6 +++--- 11 files changed, 82 insertions(+), 71 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 6a837e4b80dc..2b2bb5740322 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -39,7 +39,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); -struct kmem_cache *fscrypt_info_cachep; +struct kmem_cache *fscrypt_inode_info_cachep; void fscrypt_enqueue_decrypt_work(struct work_struct *work) { @@ -78,7 +78,7 @@ EXPORT_SYMBOL(fscrypt_free_bounce_page); * simply contain the lblk_num (e.g., IV_INO_LBLK_32). 
*/ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, -const struct fscrypt_info *ci) +const struct fscrypt_inode_info *ci) { u8 flags = fscrypt_policy_flags(&ci->ci_policy); @@ -107,7 +107,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist dst, src; - struct fscrypt_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; int res = 0; @@ -391,8 +391,9 @@ static int __init fscrypt_init(void) if (!fscrypt_read_workqueue) goto fail; - fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT); - if (!fscrypt_info_cachep) + fscrypt_inode_info_cachep = KMEM_CACHE(fscrypt_inode_info, + SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_inode_info_cachep) goto fail_free_queue; err = fscrypt_init_keyring(); @@ -402,7 +403,7 @@ static int __init fscrypt_init(void) return 0; fail_free_info: - kmem_cache_destroy(fscrypt_info_cachep); + kmem_cache_destroy(fscrypt_inode_info_cachep); fail_free_queue: destroy_workqueue(fscrypt_read_workqueue); fail: diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 6eae3f12ad50..7b3fc189593a 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -100,7 +100,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); - const struct fscrypt_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; union fscrypt_iv iv; struct scatterlist sg; @@ -157,7 +157,7 @@ static int fname_decrypt(const struct inode *inode, struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; - const struct fscrypt_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = 
ci->ci_enc_key.tfm; union fscrypt_iv iv; int res; @@ -568,7 +568,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name); */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { - const struct fscrypt_info *ci = dir->i_crypt_info; + const struct fscrypt_inode_info *ci = dir->i_crypt_info; WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 2d63da48635a..b982073d1fe2 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -189,18 +189,21 @@ struct fscrypt_prepared_key { }; /* - * fscrypt_info - the "encryption key" for an inode + * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is * allocated and stored in ->i_crypt_info. Once created, it remains until the * inode is evicted. */ -struct fscrypt_info { +struct fscrypt_inode_info { /* The key in a form prepared for actual encryption/decryption */ struct fscrypt_prepared_key ci_enc_key; - /* True if ci_enc_key should be freed
[PATCH 00/35] btrfs: add fscrypt support
Hello, This is the newly reworked fscrypt support for btrfs. There have been a few things changed since Sweet Tea's last post[1], and my RFC[2]. The changes from Sweet Tea's patchset are mostly related to the fscrypt changes, but I'll detail them here - We have a fscrypt_extent_info struct that simply has the blk key in it and a nonce. - We have a stripped down on disk context that just has what we need for extents. At this time we only care about the nonce, everything else is supposed to match the owning inode. - I've disabled everything except bog standard v2 policies to limit the complexity. - Added the necessary hooks we needed for checksumming the encrypted bios. - Reworked the on-disk stuff to be better described and accessed through helpers. - Plumbed through the fscrypt_extent_info through everything to simplify the API calls we need from fscrypt. - Instead of handling async key free'ing in fscrypt, handle the case where we're freeing extent_maps under the lock in a safe way. This is cleaner than pushing this into fscrypt. - Fixed a few things that fsstress uncovered in testing. Changes to the fscrypt code since my RFC - Took Eric's advice and added the policy and key to the extent context, this way if we want to in the future we could handle key changing. - Added a helper to give us the fscrypt extent info context size. We need the size ahead of time to setup the item properly. - Fixed the blk crypto fallback not actually working with our process_bio callback. Added a policy flag to make sure the checks work properly. - Added some documentation. Things left to do - I still have to update fstests to deal with v2 only policies. I haven't touched fstests at all yet, I've merely done my own rough testing with fsstress. - Update the btrfs-progs patches. This needs to be done to get the fstests stuff to work as well. - fsverity still isn't encrypted. I'm going to hit that next, it should be straightforward enough. 
This is based on for-next from Dave's tree [3], but in case that moves between now and then you can see my current branch here [4]. Thanks, Josef [1] https://lore.kernel.org/linux-fscrypt/cover.1693630890.git.sweettea-ker...@dorminy.me/ [2] https://lore.kernel.org/linux-btrfs/cover.1694738282.git.jo...@toxicpanda.com/ [3] https://github.com/kdave/btrfs-devel/tree/for-next [4] https://github.com/josefbacik/linux/tree/fscrypt Josef Bacik (20): fscrypt: rename fscrypt_info => fscrypt_inode_info fscrypt: add per-extent encryption support fscrypt: disable all but standard v2 policies for extent encryption blk-crypto: add a process bio callback fscrypt: add documentation about extent encryption btrfs: add infrastructure for safe em freeing btrfs: add fscrypt_info and encryption_type to ordered_extent btrfs: plumb through setting the fscrypt_info for ordered extents btrfs: populate the ordered_extent with the fscrypt context btrfs: keep track of fscrypt info and orig_start for dio reads btrfs: add an optional encryption context to the end of file extents btrfs: pass through fscrypt_extent_info to the file extent helpers btrfs: pass the fscrypt_info through the replace extent infrastructure btrfs: implement the fscrypt extent encryption hooks btrfs: setup fscrypt_extent_info for new extents btrfs: populate ordered_extent with the orig offset btrfs: set the bio fscrypt context when applicable btrfs: add a bio argument to btrfs_csum_one_bio btrfs: add orig_logical to btrfs_bio btrfs: implement process_bio cb for fscrypt Omar Sandoval (7): fscrypt: expose fscrypt_nokey_name btrfs: disable various operations on encrypted inodes btrfs: start using fscrypt hooks btrfs: add inode encryption contexts btrfs: add new FEATURE_INCOMPAT_ENCRYPT flag btrfs: adapt readdir for encrypted and nokey names btrfs: implement fscrypt ioctls Sweet Tea Dorminy (8): btrfs: disable verity on encrypted inodes btrfs: handle nokey names. 
btrfs: add encryption to CONFIG_BTRFS_DEBUG btrfs: add get_devices hook for fscrypt btrfs: turn on inlinecrypt mount option for encrypt btrfs: set file extent encryption explicitly btrfs: add fscrypt_info and encryption_type to extent_map btrfs: explicitly track file extent length for replace and drop Documentation/filesystems/fscrypt.rst | 36 ++ block/blk-crypto-fallback.c | 28 ++ block/blk-crypto-profile.c| 2 + block/blk-crypto.c| 6 +- fs/btrfs/Makefile | 1 + fs/btrfs/accessors.h | 50 +++ fs/btrfs/bio.c| 45 ++- fs/btrfs/bio.h| 6 + fs/btrfs/btrfs_inode.h| 3 +- fs/btrfs/compression.c| 6 + fs/btrfs/ctree.h | 4 + fs/btrfs/defrag.c | 10 +- fs/btrfs/delayed-inode.c | 29 +
Re: [RFC PATCH 0/4] fscrypt: add support for per-extent encryption
On Thu, Sep 14, 2023 at 11:48:16PM -0700, Eric Biggers wrote: > On Thu, Sep 14, 2023 at 08:47:41PM -0400, Josef Bacik wrote: > > Hello, > > > > This is meant as a replacement for the last set of patches Sweet Tea sent > > [1]. > > This is an attempt to find a different path forward. Strip down everything > > to > > the basics. Essentially all we appear to need is a nonce, and then we can > > use > > the inode context to derive per-extent keys. > > > > I'm sending this as an RFC to see if this is a better direction to try and > > make > > some headway on this project. The btrfs side doesn't change too much, the > > code > > just needs to be adjusted to use the new helpers for the extent contexts. I > > have this work mostly complete, but I'm afraid I won't have it ready for > > another > > day or two and I want to get feedback on this ASAP before I burn too much > > time > > on it. > > > > Additionally there is a callback I've put in the inline block crypto stuff > > that > > we need in order to handle the checksumming. I made my best guess here as > > to > > what would be the easiest and simplest way to acheive what we need, but I'm > > open > > to suggestions here. > > > > The other note is I've disabled all of the policy variations other than > > default > > v2 policies if you enable extent encryption. This is for simplicity sake. > > We > > could probably make most of it work, but reflink is basically impossible > > for v1 > > with direct key, and is problematic for the lblk related options. It > > appears > > this is fine, as those other modes are for specific use cases and the vast > > majority of normal users are encouraged to use normal v2 policies anyway. > > > > This stripped down version gives us most of what we want, we can reflink > > between > > different inodes that have the same policy. We lose the ability to mix > > differently encrypted extents in the same inode, but this is an acceptable > > limitation for now. 
> > > > This has only been compile tested, and as I've said I haven't wired it > > completely up into btrfs yet. But this is based on a rough wire up and > > appears > > to give us everything we need. The btrfs portion of Sweet Teas patches are > > basically untouched except where we use these helpers to deal with the > > extent > > contexts. Thanks, > > > > Josef > > > > [1] > > https://lore.kernel.org/linux-fscrypt/cover.1693630890.git.sweettea-ker...@dorminy.me/ > > > > Josef Bacik (4): > > fscrypt: rename fscrypt_info => fscrypt_inode_info > > fscrypt: add per-extent encryption support > > fscrypt: disable all but standard v2 policies for extent encryption > > blk-crypto: add a process bio callback > > > > block/blk-crypto-fallback.c | 18 > > block/blk-crypto-profile.c | 2 + > > block/blk-crypto.c | 6 +- > > fs/crypto/crypto.c | 23 +++-- > > fs/crypto/fname.c | 6 +- > > fs/crypto/fscrypt_private.h | 78 > > fs/crypto/hooks.c | 2 +- > > fs/crypto/inline_crypt.c| 50 +-- > > fs/crypto/keyring.c | 4 +- > > fs/crypto/keysetup.c| 174 > > fs/crypto/keysetup_v1.c | 14 +-- > > fs/crypto/policy.c | 45 -- > > include/linux/blk-crypto.h | 9 +- > > include/linux/fs.h | 4 +- > > include/linux/fscrypt.h | 41 - > > 15 files changed, 400 insertions(+), 76 deletions(-) > > Thanks Josef! At a high level this looks good to me. It's much simpler. I > guess my main question is "what is missing" (besides the obvious things like > updating the documentation and polishing code comments). I see you got rid > of a > lot of the complexity in Sweet Tea's patchset, which is great as I think a lot > of it was unnecessary as I've mentioned. But maybe something got overlooked? > I'm mainly wondering about the patches like "fscrypt: allow asynchronous info > freeing" that were a bit puzzling but have now gone away. > I'm going to fix this in a different way internally in btrfs. 
There's only one place where we can't drop the lock, so I plan to just collate free'd EM's and free them in bulk after we drop the lock. This *may* not fix the problem, I have to wait for lockdep to tell me I'm stupid and I missed some other dependency.
[PATCH 3/4] fscrypt: disable all but standard v2 policies for extent encryption
The different encryption related options for fscrypt are too numerous to support for extent based encryption. Support for a few of these options could possibly be added, but since they're niche options simply reject them for file systems using extent based encryption. Signed-off-by: Josef Bacik --- fs/crypto/policy.c | 12 1 file changed, 12 insertions(+) diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index cb944338271d..3c8665c21ee8 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -198,6 +198,12 @@ static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, return false; } + if (inode->i_sb->s_cop->flags & FS_CFLG_EXTENT_ENCRYPTION) { + fscrypt_warn(inode, +"v1 policies can't be used on file systems that use extent encryption"); + return false; + } + return true; } @@ -233,6 +239,12 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, return false; } + if ((inode->i_sb->s_cop->flags & FS_CFLG_EXTENT_ENCRYPTION) && count) { + fscrypt_warn(inode, +"Encryption flags aren't supported on file systems that use extent encryption"); + return false; + } + if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && !supported_direct_key_modes(inode, policy->contents_encryption_mode, policy->filenames_encryption_mode)) -- 2.41.0
[PATCH 4/4] blk-crypto: add a process bio callback
Btrfs does checksumming, and the checksums need to match the bytes on disk. In order to facilitate this add a process bio callback for the blk-crypto layer. This allows the file system to specify a callback and then can process the encrypted bio as necessary. For btrfs, writes will have the checksums calculated and saved into our relevant data structures for storage once the write completes. For reads we will validate the checksums match what is on disk and error out if there is a mismatch. This is incompatible with native encryption obviously, so make sure we don't use native encryption if this callback is set. Signed-off-by: Josef Bacik --- block/blk-crypto-fallback.c | 18 ++ block/blk-crypto-profile.c | 2 ++ block/blk-crypto.c | 6 +- fs/crypto/inline_crypt.c| 3 ++- include/linux/blk-crypto.h | 9 +++-- include/linux/fscrypt.h | 14 ++ 6 files changed, 48 insertions(+), 4 deletions(-) diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index e6468eab2681..91e5f70ef6b1 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -346,6 +346,15 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) } } + /* Process the encrypted bio before we submit it. */ + if (bc->bc_key->crypto_cfg.process_bio) { + blk_st = bc->bc_key->crypto_cfg.process_bio(src_bio, enc_bio); + if (blk_st != BLK_STS_OK) { + src_bio->bi_status = blk_st; + goto out_free_bounce_pages; + } + } + enc_bio->bi_private = src_bio; enc_bio->bi_end_io = blk_crypto_fallback_encrypt_endio; *bio_ptr = enc_bio; @@ -391,6 +400,15 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) unsigned int i; blk_status_t blk_st; + /* Process the bio first before trying to decrypt. 
*/ + if (bc->bc_key->crypto_cfg.process_bio) { + blk_st = bc->bc_key->crypto_cfg.process_bio(bio, bio); + if (blk_st != BLK_STS_OK) { + bio->bi_status = blk_st; + goto out_no_keyslot; + } + } + /* * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for * this bio's algorithm and key. diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 7fabc883e39f..1d55b952e52e 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -352,6 +352,8 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, return false; if (profile->max_dun_bytes_supported < cfg->dun_bytes) return false; + if (cfg->process_bio) + return false; return true; } diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 4d760b092deb..50556952df19 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -321,6 +321,8 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, * @dun_bytes: number of bytes that will be used to specify the DUN when this *key is used * @data_unit_size: the data unit size to use for en/decryption + * @process_bio: the call back if the upper layer needs to process the encrypted + * bio * * Return: 0 on success, -errno on failure. The caller is responsible for *zeroizing both blk_key and raw_key when done with them. 
@@ -328,7 +330,8 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, - unsigned int data_unit_size) + unsigned int data_unit_size, + blk_crypto_process_bio_t process_bio) { const struct blk_crypto_mode *mode; @@ -350,6 +353,7 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, blk_key->crypto_cfg.crypto_mode = crypto_mode; blk_key->crypto_cfg.dun_bytes = dun_bytes; blk_key->crypto_cfg.data_unit_size = data_unit_size; + blk_key->crypto_cfg.process_bio = process_bio; blk_key->data_unit_size_bits = ilog2(data_unit_size); blk_key->size = mode->keysize; memcpy(blk_key->raw, raw_key, mode->keysize); diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 69bb6ac9a7f7..0558fe73e82f 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -168,7 +168,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, return -ENOMEM; err = blk_crypto_init_key(blk_key, raw_key, crypto_mode, - fscrypt_get_dun_bytes(c
[PATCH 2/4] fscrypt: add per-extent encryption support
This adds the code necessary for per-extent encryption. We will store a nonce for every extent we create, and then use the inode's policy and the extents nonce to derive a per-extent key. This is meant to be flexible, if we choose to expand the on-disk extent information in the future we have a version number we can use to change what exists on disk. The file system indicates it wants to use per-extent encryption by setting s_cop->set_extent_context. This also requires the use of inline block encryption. The support is relatively straightforward, the only "extra" bit is we're deriving a per-extent key to use for the encryption, the inode still controls the policy and access to the master key. Signed-off-by: Josef Bacik --- fs/crypto/crypto.c | 10 ++- fs/crypto/fscrypt_private.h | 36 ++ fs/crypto/inline_crypt.c| 34 + fs/crypto/keysetup.c| 136 fs/crypto/policy.c | 22 ++ include/linux/fscrypt.h | 21 ++ 6 files changed, 258 insertions(+), 1 deletion(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2b2bb5740322..e3ba06cb4082 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -40,6 +40,7 @@ static struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); struct kmem_cache *fscrypt_inode_info_cachep; +struct kmem_cache *fscrypt_extent_info_cachep; void fscrypt_enqueue_decrypt_work(struct work_struct *work) { @@ -396,12 +397,19 @@ static int __init fscrypt_init(void) if (!fscrypt_inode_info_cachep) goto fail_free_queue; + fscrypt_extent_info_cachep = KMEM_CACHE(fscrypt_extent_info, + SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_extent_info_cachep) + goto fail_free_info; + err = fscrypt_init_keyring(); if (err) - goto fail_free_info; + goto fail_free_extent_info; return 0; +fail_free_extent_info: + kmem_cache_destroy(fscrypt_extent_info_cachep); fail_free_info: kmem_cache_destroy(fscrypt_inode_info_cachep); fail_free_queue: diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index b982073d1fe2..7e5ab3695023 
100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -30,6 +30,8 @@ #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 +#define FSCRYPT_EXTENT_CONTEXT_V1 1 + /* Keep this in sync with include/uapi/linux/fscrypt.h */ #define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2 @@ -52,6 +54,19 @@ struct fscrypt_context_v2 { u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; +/* + * fscrypt_extent_context - the encryption context of an extent + * + * This is the on-disk information stored for an extent. The policy and + * relevante information is stored in the inode, the per-extent information is + * simply the nonce that's used in as KDF input in conjunction with the inode + * context to derive a per-extent key for encryption. + */ +struct fscrypt_extent_context { + u8 version; /* FSCRYPT_EXTENT_CONTEXT_V2 */ + u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; +}; + /* * fscrypt_context - the encryption context of an inode * @@ -260,6 +275,25 @@ struct fscrypt_inode_info { u32 ci_hashed_ino; }; +/* + * fscrypt_extent_info - the "encryption key" for an extent. + * + * This contains the dervied key for the given extent and the nonce for the + * extent. + */ +struct fscrypt_extent_info { + refcount_t refs; + + /* The derived key for this extent. */ + struct fscrypt_prepared_key prep_key; + + /* The super block that this extent belongs to. 
*/ + struct super_block *sb; + + /* This is the extents nonce, loaded from the fscrypt_extent_context */ + u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; +}; + typedef enum { FS_DECRYPT = 0, FS_ENCRYPT, @@ -267,6 +301,7 @@ typedef enum { /* crypto.c */ extern struct kmem_cache *fscrypt_inode_info_cachep; +extern struct kmem_cache *fscrypt_extent_info_cachep; int fscrypt_initialize(struct super_block *sb); int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, @@ -326,6 +361,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, #define HKDF_CONTEXT_DIRHASH_KEY 5 /* info=file_nonce*/ #define HKDF_CONTEXT_IV_INO_LBLK_32_KEY6 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_INODE_HASH_KEY7 /* info= */ +#define HKDF_CONTEXT_PER_EXTENT_ENC_KEY8 /* info=extent_nonce */ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 3cfb04b8b64f..69bb6ac9a7f7 100644 --- a/fs
[PATCH 1/4] fscrypt: rename fscrypt_info => fscrypt_inode_info
We are going to track per-extent information, so it'll be necessary to distinguish between inode infos and extent infos. Rename fscrypt_info to fscrypt_inode_info, adjusting any lines that now exceed 80 characters. Signed-off-by: Josef Bacik --- fs/crypto/crypto.c | 13 ++-- fs/crypto/fname.c | 6 +++--- fs/crypto/fscrypt_private.h | 42 - fs/crypto/hooks.c | 2 +- fs/crypto/inline_crypt.c| 13 ++-- fs/crypto/keyring.c | 4 ++-- fs/crypto/keysetup.c| 38 + fs/crypto/keysetup_v1.c | 14 +++-- fs/crypto/policy.c | 11 +- include/linux/fs.h | 4 ++-- include/linux/fscrypt.h | 6 +++--- 11 files changed, 82 insertions(+), 71 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 6a837e4b80dc..2b2bb5740322 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -39,7 +39,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); -struct kmem_cache *fscrypt_info_cachep; +struct kmem_cache *fscrypt_inode_info_cachep; void fscrypt_enqueue_decrypt_work(struct work_struct *work) { @@ -78,7 +78,7 @@ EXPORT_SYMBOL(fscrypt_free_bounce_page); * simply contain the lblk_num (e.g., IV_INO_LBLK_32). 
*/ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, -const struct fscrypt_info *ci) +const struct fscrypt_inode_info *ci) { u8 flags = fscrypt_policy_flags(&ci->ci_policy); @@ -107,7 +107,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist dst, src; - struct fscrypt_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; int res = 0; @@ -391,8 +391,9 @@ static int __init fscrypt_init(void) if (!fscrypt_read_workqueue) goto fail; - fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT); - if (!fscrypt_info_cachep) + fscrypt_inode_info_cachep = KMEM_CACHE(fscrypt_inode_info, + SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_inode_info_cachep) goto fail_free_queue; err = fscrypt_init_keyring(); @@ -402,7 +403,7 @@ static int __init fscrypt_init(void) return 0; fail_free_info: - kmem_cache_destroy(fscrypt_info_cachep); + kmem_cache_destroy(fscrypt_inode_info_cachep); fail_free_queue: destroy_workqueue(fscrypt_read_workqueue); fail: diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 6eae3f12ad50..7b3fc189593a 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -100,7 +100,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); - const struct fscrypt_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; union fscrypt_iv iv; struct scatterlist sg; @@ -157,7 +157,7 @@ static int fname_decrypt(const struct inode *inode, struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; - const struct fscrypt_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = 
ci->ci_enc_key.tfm; union fscrypt_iv iv; int res; @@ -568,7 +568,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name); */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { - const struct fscrypt_info *ci = dir->i_crypt_info; + const struct fscrypt_inode_info *ci = dir->i_crypt_info; WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 2d63da48635a..b982073d1fe2 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -189,18 +189,21 @@ struct fscrypt_prepared_key { }; /* - * fscrypt_info - the "encryption key" for an inode + * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is * allocated and stored in ->i_crypt_info. Once created, it remains until the * inode is evicted. */ -struct fscrypt_info { +struct fscrypt_inode_info { /* The key in a form prepared for actual encryption/decryption */ struct fscrypt_prepared_key ci_enc_key; - /* True if ci_enc_key should be freed
[RFC PATCH 0/4] fscrypt: add support for per-extent encryption
Hello, This is meant as a replacement for the last set of patches Sweet Tea sent [1]. This is an attempt to find a different path forward. Strip down everything to the basics. Essentially all we appear to need is a nonce, and then we can use the inode context to derive per-extent keys. I'm sending this as an RFC to see if this is a better direction to try and make some headway on this project. The btrfs side doesn't change too much, the code just needs to be adjusted to use the new helpers for the extent contexts. I have this work mostly complete, but I'm afraid I won't have it ready for another day or two and I want to get feedback on this ASAP before I burn too much time on it. Additionally there is a callback I've put in the inline block crypto stuff that we need in order to handle the checksumming. I made my best guess here as to what would be the easiest and simplest way to achieve what we need, but I'm open to suggestions here. The other note is I've disabled all of the policy variations other than default v2 policies if you enable extent encryption. This is for simplicity's sake. We could probably make most of it work, but reflink is basically impossible for v1 with direct key, and is problematic for the lblk related options. It appears this is fine, as those other modes are for specific use cases and the vast majority of normal users are encouraged to use normal v2 policies anyway. This stripped down version gives us most of what we want, we can reflink between different inodes that have the same policy. We lose the ability to mix differently encrypted extents in the same inode, but this is an acceptable limitation for now. This has only been compile tested, and as I've said I haven't wired it completely up into btrfs yet. But this is based on a rough wire up and appears to give us everything we need. The btrfs portion of Sweet Tea's patches are basically untouched except where we use these helpers to deal with the extent contexts. 
Thanks, Josef [1] https://lore.kernel.org/linux-fscrypt/cover.1693630890.git.sweettea-ker...@dorminy.me/ Josef Bacik (4): fscrypt: rename fscrypt_info => fscrypt_inode_info fscrypt: add per-extent encryption support fscrypt: disable all but standard v2 policies for extent encryption blk-crypto: add a process bio callback block/blk-crypto-fallback.c | 18 block/blk-crypto-profile.c | 2 + block/blk-crypto.c | 6 +- fs/crypto/crypto.c | 23 +++-- fs/crypto/fname.c | 6 +- fs/crypto/fscrypt_private.h | 78 fs/crypto/hooks.c | 2 +- fs/crypto/inline_crypt.c| 50 +-- fs/crypto/keyring.c | 4 +- fs/crypto/keysetup.c| 174 fs/crypto/keysetup_v1.c | 14 +-- fs/crypto/policy.c | 45 -- include/linux/blk-crypto.h | 9 +- include/linux/fs.h | 4 +- include/linux/fscrypt.h | 41 - 15 files changed, 400 insertions(+), 76 deletions(-) -- 2.41.0
Re: [PATCH 21/42] btrfs: make process_one_page() to handle subpage locking
On 4/15/21 1:04 AM, Qu Wenruo wrote: Introduce a new data inodes specific subpage member, writers, to record how many sectors are under page lock for delalloc writing. This member acts pretty much the same as readers, except it's only for delalloc writes. This is important for delalloc code to trace which page can really be freed, as we have cases like run_delalloc_nocow() where we may exit processing nocow range inside a page, but need to exit to do cow half way. In that case, we need a way to determine if we can really unlock a full page. With the new btrfs_subpage::writers, there is a new requirement: - Page locked by process_one_page() must be unlocked by process_one_page() There are still tons of call sites manually lock and unlock a page, without updating btrfs_subpage::writers. So if we lock a page through process_one_page() then it must be unlocked by process_one_page() to keep btrfs_subpage::writers consistent. This will be handled in next patch. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH 20/42] btrfs: make end_bio_extent_writepage() to be subpage compatible
On 4/15/21 1:04 AM, Qu Wenruo wrote: Now in end_bio_extent_writepage(), the only subpage incompatible code is the end_page_writeback(). Just call the subpage helpers. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH 19/42] btrfs: make __process_pages_contig() to handle subpage dirty/error/writeback status
On 4/15/21 1:04 AM, Qu Wenruo wrote: For __process_pages_contig() and process_one_page(), to handle subpage we only need to pass bytenr in and call subpage helpers to handle dirty/error/writeback status. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH 18/42] btrfs: make btrfs_dirty_pages() to be subpage compatible
On 4/15/21 1:04 AM, Qu Wenruo wrote: Since the extent io tree operations in btrfs_dirty_pages() are already subpage compatible, we only need to make the page status update to use subpage helpers. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH 17/42] btrfs: only require sector size alignment for end_bio_extent_writepage()
On 4/15/21 1:04 AM, Qu Wenruo wrote: Just like read page, for subpage support we only require sector size alignment. So change the error message condition to only require sector alignment. This should not affect existing code, as for regular sectorsize == PAGE_SIZE case, we are still requiring page alignment. Signed-off-by: Qu Wenruo --- fs/btrfs/extent_io.c | 29 - 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 53ac22e3560f..94f8b3ffe6a7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2779,25 +2779,20 @@ static void end_bio_extent_writepage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; - /* We always issue full-page reads, but if some block -* in a page fails to read, blk_update_request() will -* advance bv_offset and adjust bv_len to compensate. -* Print a warning for nonzero offsets, and an error -* if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { - if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) - btrfs_err(fs_info, - "partial page write in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else - btrfs_info(fs_info, - "incomplete page write in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - } + /* Btrfs read write should always be sector aligned. 
*/ + if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + btrfs_err(fs_info, + "partial page write in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else if (!IS_ALIGNED(bvec->bv_len, sectorsize)) + btrfs_info(fs_info, + "incomplete page write with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); - start = page_offset(page); - end = start + bvec->bv_offset + bvec->bv_len - 1; + start = page_offset(page) + bvec->bv_offset; + end = start + bvec->bv_len - 1; Does this bit work out for you now? Because before start was just the page offset. Clearly the way it was before is a bug (I think?), because it gets used in btrfs_writepage_endio_finish_ordered() with the start+len, so you really do want start = page_offset(page) + bv_offset. But this is a behavior change that warrants a patch of its own as it's unrelated to the sectorsize change. (Yes I realize I'm asking for more patches in an already huge series, yes I'm insane.) Thanks, Josef
Re: [PATCH 16/42] btrfs: provide btrfs_page_clamp_*() helpers
On 4/15/21 1:04 AM, Qu Wenruo wrote: In the coming subpage RW support, there are a lot of page status update calls which need to be converted to subpage compatible version, which needs @start and @len. Some call sites already have such @start/@len and are already in page range, like various endio functions. But there are also call sites which need to clamp the range for subpage case, like btrfs_dirty_pages() and __process_pages_contig(). Here we introduce new helpers, btrfs_page_clamp_*(), to do and only do the clamp for subpage version. Although in theory all existing btrfs_page_*() calls can be converted to use btrfs_page_clamp_*() directly, that would make us do unnecessary clamp operations. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH 15/42] btrfs: refactor the page status update into process_one_page()
On 4/15/21 1:04 AM, Qu Wenruo wrote: In __process_pages_contig() we update page status according to page_ops. That update process is a bunch of if () {} branches, which lies inside two loops, this makes it pretty hard to expand for later subpage operations. So this patch will extract these operations into its own function, process_one_page(). Also since we're refactoring __process_pages_contig(), also move the new helper and __process_pages_contig() before the first caller of them, to remove the forward declaration. Signed-off-by: Qu Wenruo The refactor is fine, I still have questions about the pages_processed thing, but that can be addressed in the other patch and then will trickle down to here, so you can add Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH 14/42] btrfs: pass bytenr directly to __process_pages_contig()
On 4/15/21 1:04 AM, Qu Wenruo wrote: As a preparation for incoming subpage support, we need bytenr passed to __process_pages_contig() directly, not the current page index. So change the parameter and all callers to pass bytenr in. With the modification, here we need to replace the old @index_ret with @processed_end for __process_pages_contig(), but this brings a small problem. Normally we follow the inclusive return value, meaning @processed_end should be the last byte we processed. If parameter @start is 0, and we failed to lock any page, then we would return @processed_end as -1, causing more problems for __unlock_for_delalloc(). So here for @processed_end, we use two different return value patterns. If we have locked any page, @processed_end will be the last byte of locked page. Or it will be @start otherwise. This change will impact lock_delalloc_pages(), so it needs to check @processed_end to only unlock the range if we have locked any. Signed-off-by: Qu Wenruo --- fs/btrfs/extent_io.c | 57 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ac01f29b00c9..ff24db8513b4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1807,8 +1807,8 @@ bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, static int __process_pages_contig(struct address_space *mapping, struct page *locked_page, - pgoff_t start_index, pgoff_t end_index, - unsigned long page_ops, pgoff_t *index_ret); + u64 start, u64 end, unsigned long page_ops, + u64 *processed_end); static noinline void __unlock_for_delalloc(struct inode *inode, struct page *locked_page, @@ -1821,7 +1821,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode, if (index == locked_page->index && end_index == index) return; - __process_pages_contig(inode->i_mapping, locked_page, index, end_index, + __process_pages_contig(inode->i_mapping, locked_page, start, end, PAGE_UNLOCK, NULL); } @@ -1831,19 +1831,19 @@ static noinline 
int lock_delalloc_pages(struct inode *inode, u64 delalloc_end) { unsigned long index = delalloc_start >> PAGE_SHIFT; - unsigned long index_ret = index; unsigned long end_index = delalloc_end >> PAGE_SHIFT; + u64 processed_end = delalloc_start; int ret; ASSERT(locked_page); if (index == locked_page->index && index == end_index) return 0; - ret = __process_pages_contig(inode->i_mapping, locked_page, index, -end_index, PAGE_LOCK, &index_ret); - if (ret == -EAGAIN) + ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start, +delalloc_end, PAGE_LOCK, &processed_end); + if (ret == -EAGAIN && processed_end > delalloc_start) __unlock_for_delalloc(inode, locked_page, delalloc_start, - (u64)index_ret << PAGE_SHIFT); + processed_end); return ret; } @@ -1938,12 +1938,14 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, static int __process_pages_contig(struct address_space *mapping, struct page *locked_page, - pgoff_t start_index, pgoff_t end_index, - unsigned long page_ops, pgoff_t *index_ret) + u64 start, u64 end, unsigned long page_ops, + u64 *processed_end) { + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + pgoff_t index = start_index; unsigned long nr_pages = end_index - start_index + 1; unsigned long pages_processed = 0; - pgoff_t index = start_index; struct page *pages[16]; unsigned ret; int err = 0; @@ -1951,17 +1953,19 @@ static int __process_pages_contig(struct address_space *mapping, if (page_ops & PAGE_LOCK) { ASSERT(page_ops == PAGE_LOCK); - ASSERT(index_ret && *index_ret == start_index); + ASSERT(processed_end && *processed_end == start); } if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) mapping_set_error(mapping, -EIO); while (nr_pages > 0) { - ret = find_get_pages_contig(mapping, index, + int found_pages; + + found_pages = find_get_pages_contig(mapping, index, min_t(unsigned long, nr_pages,
Re: [PATCH 13/42] btrfs: rename PagePrivate2 to PageOrdered inside btrfs
On 4/15/21 1:04 AM, Qu Wenruo wrote: Inside btrfs, we use Private2 page status to indicate we have ordered extent with pending IO for the sector. But the page status name, Private2, tells us nothing about the bit itself, so this patch will rename it to Ordered. And with extra comment about the bit added, so reader who is still uncertain about the page Ordered status, will find the comment pretty easily. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH 12/42] btrfs: make Private2 lifespan more consistent
On 4/15/21 1:04 AM, Qu Wenruo wrote: Currently btrfs uses page Private2 bit to indicate if we have ordered extent for the page range. But the lifespan of it is not consistent, during regular writeback path, there are two locations to clear the same PagePrivate2: T - Page marked Dirty | + - Page marked Private2, through btrfs_run_dealloc_range() | + - Page cleared Private2, through btrfs_writepage_cow_fixup() | in __extent_writepage_io() | ^^^ Private2 cleared for the first time | + - Page marked Writeback, through btrfs_set_range_writeback() | in __extent_writepage_io(). | + - Page cleared Private2, through | btrfs_writepage_endio_finish_ordered() | ^^^ Private2 cleared for the second time. | + - Page cleared Writeback, through btrfs_writepage_endio_finish_ordered() Currently PagePrivate2 is mostly to prevent ordered extent accounting being executed for both endio and invalidatepage. Thus only the one who cleared page Private2 is responsible for ordered extent accounting. But the fact is, in btrfs_writepage_endio_finish_ordered(), page Private2 is cleared and ordered extent accounting is executed unconditionally. The race prevention only happens through btrfs_invalidatepage(), where we wait the page writeback first, before checking the Private2 bit. This means, Private2 is also protected by Writeback bit, and there is no need for btrfs_writepage_cow_fixup() to clear Private2. This patch will change btrfs_writepage_cow_fixup() to just check PagePrivate2, not to clear it. The clear will happen either in btrfs_invalidatepage() or btrfs_writepage_endio_finish_ordered(). This makes the Private2 bit easier to understand, just meaning the page has unfinished ordered extent attached to it. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH 11/42] btrfs: refactor btrfs_invalidatepage()
On 4/15/21 1:04 AM, Qu Wenruo wrote: This patch will refactor btrfs_invalidatepage() for the incoming subpage support. The involved modifications are: - Use while() loop instead of "goto again;" - Use single variable to determine whether to delete extent states Each branch will also have comments why we can or cannot delete the extent states - Do qgroup free and extent states deletion per-loop Current code can only work for PAGE_SIZE == sectorsize case. This refactor also makes it clear what we do for different sectors: - Sectors without ordered extent We're completely safe to remove all extent states for the sector(s) - Sectors with ordered extent, but no Private2 bit This means the endio has already been executed, we can't remove all extent states for the sector(s). - Sectors with ordered extent, still has Private2 bit This means we need to decrease the ordered extent accounting. And then it comes to two different variants: * We have finished and removed the ordered extent Then it's the same as "sectors without ordered extent" * We didn't finish the ordered extent We can remove some extent states, but not all. 
Signed-off-by: Qu Wenruo --- fs/btrfs/inode.c | 173 +-- 1 file changed, 94 insertions(+), 79 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4c894de2e813..93bb7c0482ba 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8320,15 +8320,12 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); struct extent_io_tree *tree = &inode->io_tree; - struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; - u64 start; - u64 end; + u64 cur; + u32 sectorsize = inode->root->fs_info->sectorsize; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; - bool found_ordered = false; - bool completed_ordered = false; /* * We have page locked so no new ordered extent can be created on @@ -8352,96 +8349,114 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (!inode_evicting) lock_extent_bits(tree, page_start, page_end, &cached_state); - start = page_start; -again: - ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); - if (ordered) { - found_ordered = true; - end = min(page_end, - ordered->file_offset + ordered->num_bytes - 1); + cur = page_start; + while (cur < page_end) { + struct btrfs_ordered_extent *ordered; + bool delete_states = false; + u64 range_end; + + /* +* Here we can't pass "file_offset = cur" and +* "len = page_end + 1 - cur", as btrfs_lookup_ordered_range() +* may not return the first ordered extent after @file_offset. +* +* Here we want to iterate through the range in byte order. +* This is slower but definitely correct. +* +* TODO: Make btrfs_lookup_ordered_range() to return the +* first ordered extent in the range to reduce the number +* of loops. +*/ + ordered = btrfs_lookup_ordered_range(inode, cur, sectorsize); How does it not find the first ordered extent after file_offset? 
Looking at the code it just loops through and returns the first thing it finds that overlaps our range. Is there a bug in btrfs_lookup_ordered_range()? We should add some self tests to make sure these helpers are doing the right thing if there is in fact a bug. + if (!ordered) { + range_end = cur + sectorsize - 1; + /* +* No ordered extent covering this sector, we are safe +* to delete all extent states in the range. +*/ + delete_states = true; + goto next; + } + + range_end = min(ordered->file_offset + ordered->num_bytes - 1, + page_end); + if (!PagePrivate2(page)) { + /* +* If Private2 is cleared, it means endio has already +* been executed for the range. +* We can't delete the extent states as +* btrfs_finish_ordered_io() may still use some of them. +*/ + delete_states = false; delete_states is already false. + goto next; + } + ClearPagePrivate2(page); +
Re: [PATCH 10/42] btrfs: update the comments in btrfs_invalidatepage()
On 4/15/21 1:04 AM, Qu Wenruo wrote: The existing comments in btrfs_invalidatepage() don't really get to the point, especially for what Private2 is really representing and how the race avoidance is done. The truth is, there are only three entrances to do ordered extent accounting: - btrfs_writepage_endio_finish_ordered() - __endio_write_update_ordered() Those two entrance are just endio functions for dio and buffered write. - btrfs_invalidatepage() But there is a pitfall, in endio functions there is no check on whether the ordered extent is already accounted. They just blindly clear the Private2 bit and do the accounting. So it's all btrfs_invalidatepage()'s responsibility to make sure we won't do double account on the same sector. That's why in btrfs_invalidatepage() we have to wait page writeback, this will ensure all submitted bios has finished, thus their endio functions have finished the accounting on the ordered extent. Then we also check page Private2 to ensure that, we only run ordered extent accounting on pages who has no bio submitted. This patch will rework related comments to make it more clear on the race and how we use wait_on_page_writeback() and Private2 to prevent double accounting on ordered extent. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH 09/42] btrfs: refactor how we finish ordered extent io for endio functions
On 4/15/21 1:04 AM, Qu Wenruo wrote: Btrfs has two endio functions to mark certain io range finished for ordered extents: - __endio_write_update_ordered() This is for direct IO - btrfs_writepage_endio_finish_ordered() This for buffered IO. However they go different routines to handle ordered extent io: - Whether to iterate through all ordered extents __endio_write_update_ordered() will but btrfs_writepage_endio_finish_ordered() will not. In fact, iterating through all ordered extents will benefit later subpage support, while for current PAGE_SIZE == sectorsize requirement those behavior makes no difference. - Whether to update page Private2 flag __endio_write_update_ordered() will no update page Private2 flag as for iomap direct IO, the page can be not even mapped. While btrfs_writepage_endio_finish_ordered() will clear Private2 to prevent double accounting against btrfs_invalidatepage(). Those differences are pretty small, and the ordered extent iterations codes in callers makes code much harder to read. So this patch will introduce a new function, btrfs_mark_ordered_io_finished(), to do the heavy lifting work: - Iterate through all ordered extents in the range - Do the ordered extent accounting - Queue the work for finished ordered extent This function has two new feature: - Proper underflow detection and recover The old underflow detection will only detect the problem, then continue. No proper info like root/inode/ordered extent info, nor noisy enough to be caught by fstests. Furthermore when underflow happens, the ordered extent will never finish. New error detection will reset the bytes_left to 0, do proper kernel warning, and output extra info including root, ino, ordered extent range, the underflow value. - Prevent double accounting based on Private2 flag Now if we find a range without Private2 flag, we will skip to next range. As that means someone else has already finished the accounting of ordered extent. 
This makes no difference for current code, but will be a critical part for incoming subpage support. Now both endio functions only need to call that new function. And since the only caller of btrfs_dec_test_first_ordered_pending() is removed, also remove btrfs_dec_test_first_ordered_pending() completely. Signed-off-by: Qu Wenruo --- fs/btrfs/inode.c| 55 +--- fs/btrfs/ordered-data.c | 179 +++- fs/btrfs/ordered-data.h | 8 +- 3 files changed, 129 insertions(+), 113 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 752f0c78e1df..645097bff5a0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3063,25 +3063,11 @@ void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, int uptodate) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workqueue *wq; - ASSERT(end + 1 - start < U32_MAX); trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - ClearPagePrivate2(page); - if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 1, uptodate)) - return; - - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - - btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); - btrfs_queue_work(wq, &ordered_extent->work); + btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, + finish_ordered_fn, uptodate); } /* @@ -7959,42 +7945,9 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered = NULL; - struct btrfs_workqueue *wq; - u64 ordered_offset = offset; - u64 ordered_bytes = bytes; - u64 last_offset; - - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - ASSERT(bytes < 
U32_MAX); - while (ordered_offset < offset + bytes) { - last_offset = ordered_offset; - if (btrfs_dec_test_first_ordered_pending(inode, &ordered, -&ordered_offset, -ordered_bytes, -uptodate)) { - btrfs_init_wor
Re: [PATCH 08/42] btrfs: pass btrfs_inode into btrfs_writepage_endio_finish_ordered()
On 4/15/21 1:04 AM, Qu Wenruo wrote: There is a pretty bad abuse of btrfs_writepage_endio_finish_ordered() in end_compressed_bio_write(). It passes compressed pages to btrfs_writepage_endio_finish_ordered(), which is only supposed to accept inode pages. Thankfully the important info here is the inode, so let's pass btrfs_inode directly into btrfs_writepage_endio_finish_ordered(), and make @page parameter optional. By this, end_compressed_bio_write() can happily pass page=NULL while still get everything done properly. Also, to cooperate with such modification, replace @page parameter for trace_btrfs_writepage_end_io_hook() with btrfs_inode. Although this removes page_index info, the existing start/len should be enough for most usage. Signed-off-by: Qu Wenruo --- fs/btrfs/compression.c | 4 +--- fs/btrfs/ctree.h | 3 ++- fs/btrfs/extent_io.c | 16 ++-- fs/btrfs/inode.c | 9 + include/trace/events/btrfs.h | 19 --- 5 files changed, 26 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2600703fab83..4fbe3e12be71 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -343,11 +343,9 @@ static void end_compressed_bio_write(struct bio *bio) * call back into the FS and do all the end_io operations */ inode = cb->inode; - cb->compressed_pages[0]->mapping = cb->inode->i_mapping; - btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], + btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, cb->start, cb->start + cb->len - 1, bio->bi_status == BLK_STS_OK); - cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); /* note, our inode could be gone now */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2c858d5349c8..505bc6674bcc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3175,7 +3175,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc); int 
btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); -void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; extern const struct iomap_ops btrfs_dio_iomap_ops; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7d1fca9b87f0..6d712418b67b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2711,10 +2711,13 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { + struct btrfs_inode *inode; int uptodate = (err == 0); int ret = 0; - btrfs_writepage_endio_finish_ordered(page, start, end, uptodate); + ASSERT(page && page->mapping); + inode = BTRFS_I(page->mapping->host); + btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); if (!uptodate) { ClearPageUptodate(page); @@ -3739,7 +3742,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u32 iosize; if (cur >= i_size) { - btrfs_writepage_endio_finish_ordered(page, cur, end, 1); + btrfs_writepage_endio_finish_ordered(inode, page, cur, +end, 1); break; } em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); @@ -3777,8 +3781,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (compressed) nr++; else - btrfs_writepage_endio_finish_ordered(page, cur, - cur + iosize - 1, 1); + btrfs_writepage_endio_finish_ordered(inode, + page, cur, cur + iosize - 1, 1); cur += iosize; continue; } @@ -4842,8 +4846,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, if (clear_page_dirty_for_io(page)) ret = __extent_writepage(page, &wbc_writepages, &epd); else { - btrfs_writepage_endio_finish_ordered(page, start, - start + PAGE_SIZE - 1, 1); + btrfs_writepage_endio_fini
Re: [PATCH 07/42] btrfs: use u32 for length related members of btrfs_ordered_extent
On 4/15/21 1:04 AM, Qu Wenruo wrote: Unlike btrfs_file_extent_item, btrfs_ordered_extent has its length limit (BTRFS_MAX_EXTENT_SIZE), which is far smaller than U32_MAX. Using u64 for those length related members are just a waste of memory. This patch will make the following members u32: - num_bytes - disk_num_bytes - bytes_left - truncated_len This will save 16 bytes for btrfs_ordered_extent structure. For btrfs_add_ordered_extent*() call sites, they are mostly deeply inside other functions passing u64. Thus this patch will keep those u64, but do internal ASSERT() to ensure the correct length values are passed in. For btrfs_dec_test_.*_ordered_extent() call sites, length related parameters are converted to u32, with extra ASSERT() added to ensure we get correct values passed in. There is special convert needed in btrfs_remove_ordered_extent(), which needs s64, using "-entry->num_bytes" from u32 directly will cause underflow. Signed-off-by: Qu Wenruo --- fs/btrfs/inode.c| 11 --- fs/btrfs/ordered-data.c | 21 ++--- fs/btrfs/ordered-data.h | 25 ++--- 3 files changed, 36 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 74ee34fc820d..554effbf307e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3066,6 +3066,7 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, struct btrfs_ordered_extent *ordered_extent = NULL; struct btrfs_workqueue *wq; + ASSERT(end + 1 - start < U32_MAX); trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); ClearPagePrivate2(page); @@ -7969,6 +7970,7 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, else wq = fs_info->endio_write_workers; + ASSERT(bytes < U32_MAX); while (ordered_offset < offset + bytes) { last_offset = ordered_offset; if (btrfs_dec_test_first_ordered_pending(inode, &ordered, @@ -8415,10 +8417,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (TestClearPagePrivate2(page)) { 
spin_lock_irq(&inode->ordered_tree.lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); - ordered->truncated_len = min(ordered->truncated_len, -start - ordered->file_offset); + ASSERT(start - ordered->file_offset < U32_MAX); + ordered->truncated_len = min_t(u32, + ordered->truncated_len, + start - ordered->file_offset); spin_unlock_irq(&inode->ordered_tree.lock); + ASSERT(end - start + 1 < U32_MAX); if (btrfs_dec_test_ordered_pending(inode, &ordered, start, end - start + 1, 1)) { @@ -8937,7 +8942,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) break; else { btrfs_err(root->fs_info, - "found ordered extent %llu %llu on inode cleanup", + "found ordered extent %llu %u on inode cleanup", ordered->file_offset, ordered->num_bytes); btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 07b0b4218791..8e6d9d906bdd 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -160,6 +160,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset struct btrfs_ordered_extent *entry; int ret; + /* +* Basic size check, all length related members should be smaller +* than U32_MAX. +*/ + ASSERT(num_bytes < U32_MAX && disk_num_bytes < U32_MAX); + if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) { /* For nocow write, we can release the qgroup rsv right now */ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); @@ -186,7 +192,7 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset entry->bytes_left = num_bytes; entry->inode = igrab(&inode->vfs_inode); entry->compress_type = compress_type; - entry->truncated_len = (u64)-1; + entry->truncated_len = (u32)-1; entry->qgroup_rsv = ret; entry->physical = (u64)-1; entry->disk = NULL; @@ -320,7 +326,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, */ bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, stru
Re: [PATCH 06/42] btrfs: allow btrfs_bio_fits_in_stripe() to accept bio without any page
On 4/15/21 1:04 AM, Qu Wenruo wrote: Function btrfs_bio_fits_in_stripe() now requires a bio with at least one page added. Or btrfs_get_chunk_map() will fail with -ENOENT. But in fact this requirement is not needed at all, as we can just pass sectorsize for btrfs_get_chunk_map(). This tiny behavior change is important for later subpage refactor on submit_extent_page(). As for 64K page size, we can have a page range with pgoff=0 and size=64K. If the logical bytenr is just 16K before the stripe boundary, we have to split the page range into two bios. This means, we must check page range against stripe boundary, even adding the range to an empty bio. This tiny refactor is for the incoming change, but on its own, regular sectorsize == PAGE_SIZE is not affected anyway. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH 05/42] btrfs: remove the unused parameter @len for btrfs_bio_fits_in_stripe()
On 4/15/21 1:04 AM, Qu Wenruo wrote: The parameter @len is not really used in btrfs_bio_fits_in_stripe(), just remove it. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH 02/42] btrfs: introduce write_one_subpage_eb() function
On 4/15/21 7:25 PM, Qu Wenruo wrote: On 2021/4/16 上午3:03, Josef Bacik wrote: On 4/15/21 1:04 AM, Qu Wenruo wrote: The new function, write_one_subpage_eb(), as a subroutine for subpage metadata write, will handle the extent buffer bio submission. The major differences between the new write_one_subpage_eb() and write_one_eb() is: - No page locking When entering write_one_subpage_eb() the page is no longerlocked. We only lock the page for its status update, and unlock immediately. Now we completely rely on extent io tree locking. - Extra bitmap update along with page status update Now page dirty and writeback is controlled by btrfs_subpage::dirty_bitmap and btrfs_subpage::writeback_bitmap. They both follow the schema that any sector is dirty/writeback, then the full page get dirty/writeback. - When to update the nr_written number Now we take a short cut, if we have cleared the last dirtybit of the page, we update nr_written. This is not completely perfect, but should emulate the oldbehavior good enough. Signed-off-by: Qu Wenruo --- fs/btrfs/extent_io.c | 55 1 file changed, 55 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 21a14b1cb065..f32163a465ec 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4196,6 +4196,58 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) bio_put(bio); } +/* + * Unlike the work in write_one_eb(), we rely completely on extent locking. + * Page locking is only utizlied at minimal to keep the VM code happy. + * + * Caller should still call write_one_eb() other than this function directly. + * As write_one_eb() has extra prepration before submitting the extent buffer. 
+ */ +static int write_one_subpage_eb(struct extent_buffer *eb, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct page *page = eb->pages[0]; + unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + bool no_dirty_ebs = false; + int ret; + + /* clear_page_dirty_for_io() in subpage helper needpage locked. */ + lock_page(page); + btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); + + /* If we're the last dirty bit to update nr_written*/ + no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page, + eb->start, eb->len); + if (no_dirty_ebs) + clear_page_dirty_for_io(page); + + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page, + eb->start, eb->len, eb->start - page_offset(page), + &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, + false); + if (ret) { + btrfs_subpage_clear_writeback(fs_info, page, eb->start, + eb->len); + set_btree_ioerr(page, eb); + unlock_page(page); + + if (atomic_dec_and_test(&eb->io_pages)) + end_extent_buffer_writeback(eb); + return -EIO; + } + unlock_page(page); + /* + * Submission finishes without problem, if no range of the page is + * dirty anymore, we have submitted a page. + * Update the nr_written in wbc. + */ + if (no_dirty_ebs) + update_nr_written(wbc, 1); + return ret; +} + static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, struct extent_page_data *epd) @@ -4227,6 +4279,9 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, memzero_extent_buffer(eb, start, end - start); } + if (eb->fs_info->sectorsize < PAGE_SIZE) + return write_one_subpage_eb(eb, wbc, epd); + Same comment here, again you're calling write_one_eb() which expects to do the eb thing, but then later have an entirely different path for the subpage stuff, and thus could just call your write_one_subpage_eb() helper from there instead of stuffing it into write_one_eb(). 
But there is some common code before calling the subpage routine. I don't think it's a good idea to have duplicated code between subpage and regular routine. Ah I missed the part at the top for zero'ing out the buffer. In that case turn that into a helper function and then keep the paths separate. Thanks, Josef
Re: [PATCH 04/42] btrfs: introduce submit_eb_subpage() to submit a subpage metadata page
On 4/15/21 7:28 PM, Qu Wenruo wrote: On 2021/4/16 上午3:27, Josef Bacik wrote: On 4/15/21 1:04 AM, Qu Wenruo wrote: The new function, submit_eb_subpage(), will submit all the dirty extent buffers in the page. The major difference between submit_eb_page() and submit_eb_subpage() is: - How to grab extent buffer Now we use find_extent_buffer_nospinlock() other than using page::private. All other different handling is already done in functions like lock_extent_buffer_for_io() and write_one_eb(). Signed-off-by: Qu Wenruo --- fs/btrfs/extent_io.c | 95 1 file changed, 95 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c068c2fcba09..7d1fca9b87f0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4323,6 +4323,98 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, return ret; } +/* + * Submit one subpage btree page. + * + * The main difference between submit_eb_page() is: + * - Page locking + * For subpage, we don't rely on page locking at all. + * + * - Flush write bio + * We only flush bio if we may be unable to fit current extent buffers into + * current bio. + * + * Return >=0 for the number of submitted extent buffers. + * Return <0 for fatal error. + */ +static int submit_eb_subpage(struct page *page, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + int submitted = 0; + u64 page_start = page_offset(page); + int bit_start = 0; + int nbits = BTRFS_SUBPAGE_BITMAP_SIZE; + int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; + int ret; + + /* Lock and write each dirty extent buffers in the range */ + while (bit_start < nbits) { + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct extent_buffer *eb; + unsigned long flags; + u64 start; + + /* + * Take private lock to ensure the subpage won't be detached + * halfway. 
+ */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + break; + } + spin_lock_irqsave(&subpage->lock, flags); writepages doesn't get called with irq context, so you can just do spin_lock_irq()/spin_unlock_irq(). But this spinlock is used in endio function. If we don't use irqsave variant here, won't an endio interruption call sneak in and screw up everything? No, you use irqsave if the function can be called under irq. So in the endio call you do irqsave. This function can't be called by an interrupt handler, so just _irq() is fine because you just need to disable irq's. Thanks, Josef
Re: [PATCH 04/42] btrfs: introduce submit_eb_subpage() to submit a subpage metadata page
On 4/15/21 1:04 AM, Qu Wenruo wrote: The new function, submit_eb_subpage(), will submit all the dirty extent buffers in the page. The major difference between submit_eb_page() and submit_eb_subpage() is: - How to grab extent buffer Now we use find_extent_buffer_nospinlock() other than using page::private. All other different handling is already done in functions like lock_extent_buffer_for_io() and write_one_eb(). Signed-off-by: Qu Wenruo --- fs/btrfs/extent_io.c | 95 1 file changed, 95 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c068c2fcba09..7d1fca9b87f0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4323,6 +4323,98 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, return ret; } +/* + * Submit one subpage btree page. + * + * The main difference between submit_eb_page() is: + * - Page locking + * For subpage, we don't rely on page locking at all. + * + * - Flush write bio + * We only flush bio if we may be unable to fit current extent buffers into + * current bio. + * + * Return >=0 for the number of submitted extent buffers. + * Return <0 for fatal error. + */ +static int submit_eb_subpage(struct page *page, +struct writeback_control *wbc, +struct extent_page_data *epd) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + int submitted = 0; + u64 page_start = page_offset(page); + int bit_start = 0; + int nbits = BTRFS_SUBPAGE_BITMAP_SIZE; + int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; + int ret; + + /* Lock and write each dirty extent buffers in the range */ + while (bit_start < nbits) { + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct extent_buffer *eb; + unsigned long flags; + u64 start; + + /* +* Take private lock to ensure the subpage won't be detached +* halfway. 
+*/ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + break; + } + spin_lock_irqsave(&subpage->lock, flags); writepages doesn't get called with irq context, so you can just do spin_lock_irq()/spin_unlock_irq(). + if (!((1 << bit_start) & subpage->dirty_bitmap)) { Can we make this a helper so it's more clear what's going on here? Thanks, Josef
Re: [PATCH 03/42] btrfs: make lock_extent_buffer_for_io() to be subpage compatible
On 4/15/21 1:04 AM, Qu Wenruo wrote: For subpage metadata, we don't use page locking at all. So just skip the page locking part for subpage. All the remaining routine can be reused. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH 02/42] btrfs: introduce write_one_subpage_eb() function
On 4/15/21 1:04 AM, Qu Wenruo wrote: The new function, write_one_subpage_eb(), as a subroutine for subpage metadata write, will handle the extent buffer bio submission. The major differences between the new write_one_subpage_eb() and write_one_eb() is: - No page locking When entering write_one_subpage_eb() the page is no longer locked. We only lock the page for its status update, and unlock immediately. Now we completely rely on extent io tree locking. - Extra bitmap update along with page status update Now page dirty and writeback is controlled by btrfs_subpage::dirty_bitmap and btrfs_subpage::writeback_bitmap. They both follow the schema that any sector is dirty/writeback, then the full page get dirty/writeback. - When to update the nr_written number Now we take a short cut, if we have cleared the last dirty bit of the page, we update nr_written. This is not completely perfect, but should emulate the old behavior good enough. Signed-off-by: Qu Wenruo --- fs/btrfs/extent_io.c | 55 1 file changed, 55 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 21a14b1cb065..f32163a465ec 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4196,6 +4196,58 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) bio_put(bio); } +/* + * Unlike the work in write_one_eb(), we rely completely on extent locking. + * Page locking is only utizlied at minimal to keep the VM code happy. + * + * Caller should still call write_one_eb() other than this function directly. + * As write_one_eb() has extra prepration before submitting the extent buffer. + */ +static int write_one_subpage_eb(struct extent_buffer *eb, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct page *page = eb->pages[0]; + unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + bool no_dirty_ebs = false; + int ret; + + /* clear_page_dirty_for_io() in subpage helper need page locked. 
*/ + lock_page(page); + btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); + + /* If we're the last dirty bit to update nr_written */ + no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page, + eb->start, eb->len); + if (no_dirty_ebs) + clear_page_dirty_for_io(page); + + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page, + eb->start, eb->len, eb->start - page_offset(page), + &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, + false); + if (ret) { + btrfs_subpage_clear_writeback(fs_info, page, eb->start, + eb->len); + set_btree_ioerr(page, eb); + unlock_page(page); + + if (atomic_dec_and_test(&eb->io_pages)) + end_extent_buffer_writeback(eb); + return -EIO; + } + unlock_page(page); + /* +* Submission finishes without problem, if no range of the page is +* dirty anymore, we have submitted a page. +* Update the nr_written in wbc. +*/ + if (no_dirty_ebs) + update_nr_written(wbc, 1); + return ret; +} + static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, struct extent_page_data *epd) @@ -4227,6 +4279,9 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, memzero_extent_buffer(eb, start, end - start); } + if (eb->fs_info->sectorsize < PAGE_SIZE) + return write_one_subpage_eb(eb, wbc, epd); + Same comment here, again you're calling write_one_eb() which expects to do the eb thing, but then later have an entirely different path for the subpage stuff, and thus could just call your write_one_subpage_eb() helper from there instead of stuffing it into write_one_eb(). Also, I generally don't care about ordering of patches as long as they make sense generally. However in this case if you were to bisect to just this patch you would be completely screwed, as the normal write path would just fail to write the other eb's on the page. You really need to have the patches that do the write_cache_pages part done first, and then have this patch. 
Or alternatively, leave the order as it is, and simply don't wire the helper up until you implement the subpage writepages further down. That may be better, you won't have to re-order anything and you can maintain these smaller chunks for review, which may not be possible if you re-order them. Thanks, Josef
Re: [PATCH 01/42] btrfs: introduce end_bio_subpage_eb_writepage() function
On 4/15/21 1:04 AM, Qu Wenruo wrote: The new function, end_bio_subpage_eb_writepage(), will handle the metadata writeback endio. The major differences involved are: - How to grab extent buffer Now page::private is a pointer to btrfs_subpage, we can no longer grab extent buffer directly. Thus we need to use the bv_offset to locate the extent buffer manually and iterate through the whole range. - Use btrfs_subpage_end_writeback() caller This helper will handle the subpage writeback for us. Since this function is executed under endio context, when grabbing extent buffers it can't grab eb->refs_lock as that lock is not designed to be grabbed under hardirq context. So here introduce a helper, find_extent_buffer_nospinlock(), for such situation, and convert find_extent_buffer() to use that helper. Signed-off-by: Qu Wenruo --- fs/btrfs/extent_io.c | 135 +-- 1 file changed, 106 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a50adbd8808d..21a14b1cb065 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4080,13 +4080,97 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) } } +/* + * This is the endio specific version which won't touch any unsafe spinlock + * in endio context. + */ +static struct extent_buffer *find_extent_buffer_nospinlock( + struct btrfs_fs_info *fs_info, u64 start) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + return eb; + } + rcu_read_unlock(); + return NULL; +} +/* + * The endio function for subpage extent buffer write. + * + * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() + * after all extent buffers in the page has finished their writeback. 
+ */ +static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info, +struct bio *bio) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + ASSERT(!bio_flagged(bio, BIO_CLONED)); + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + u64 bvec_start = page_offset(page) + bvec->bv_offset; + u64 bvec_end = bvec_start + bvec->bv_len - 1; + u64 cur_bytenr = bvec_start; + + ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize)); + + /* Iterate through all extent buffers in the range */ + while (cur_bytenr <= bvec_end) { + struct extent_buffer *eb; + int done; + + /* +* Here we can't use find_extent_buffer(), as it may +* try to lock eb->refs_lock, which is not safe in endio +* context. +*/ + eb = find_extent_buffer_nospinlock(fs_info, cur_bytenr); + ASSERT(eb); + + cur_bytenr = eb->start + eb->len; + + ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)); + done = atomic_dec_and_test(&eb->io_pages); + ASSERT(done); + + if (bio->bi_status || + test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { + ClearPageUptodate(page); + set_btree_ioerr(page, eb); + } + + btrfs_subpage_clear_writeback(fs_info, page, eb->start, + eb->len); + end_extent_buffer_writeback(eb); + /* +* free_extent_buffer() will grab spinlock which is not +* safe in endio context. Thus here we manually dec +* the ref. +*/ + atomic_dec(&eb->refs); + } + } + bio_put(bio); +} + static void end_bio_extent_buffer_writepage(struct bio *bio) { + struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct extent_buffer *eb; int done; struct bvec_iter_all iter_all; + fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); + if (fs_info->sectorsize < PAGE_SIZE) + return end_bio_subpage_eb_writepage(fs_info, bio); + You replace the write_one_eb() call with one specifically for subpage, why not just use your special endio from there without polluting the normal writepage helper? Thanks, Josef
Re: [PATCH v4 3/3] btrfs: zoned: automatically reclaim zones
On 4/15/21 9:58 AM, Johannes Thumshirn wrote: When a file gets deleted on a zoned file system, the space freed is not returned back into the block group's free space, but is migrated to zone_unusable. As this zone_unusable space is behind the current write pointer it is not possible to use it for new allocations. In the current implementation a zone is reset once all of the block group's space is accounted as zone unusable. This behaviour can lead to premature ENOSPC errors on a busy file system. Instead of only reclaiming the zone once it is completely unusable, kick off a reclaim job once the amount of unusable bytes exceeds a user configurable threshold between 51% and 100%. It can be set per mounted filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75% per default. Similar to reclaiming unused block groups, these dirty block groups are added to a to_reclaim list and then on a transaction commit, the reclaim process is triggered but after we deleted unused block groups, which will free space for the relocation process. 
Signed-off-by: Johannes Thumshirn --- fs/btrfs/block-group.c | 96 fs/btrfs/block-group.h | 3 ++ fs/btrfs/ctree.h | 6 +++ fs/btrfs/disk-io.c | 13 + fs/btrfs/free-space-cache.c | 9 +++- fs/btrfs/sysfs.c | 35 + fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 1 + include/trace/events/btrfs.h | 12 + 9 files changed, 175 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index bbb5a6e170c7..3f06ea42c013 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1485,6 +1485,92 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } +void btrfs_reclaim_bgs_work(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info = + container_of(work, struct btrfs_fs_info, reclaim_bgs_work); + struct btrfs_block_group *bg; + struct btrfs_space_info *space_info; + int ret = 0; + + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return; + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + return; + + mutex_lock(&fs_info->reclaim_bgs_lock); + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->reclaim_bgs)) { + bg = list_first_entry(&fs_info->reclaim_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&bg->bg_list); + + space_info = bg->space_info; + spin_unlock(&fs_info->unused_bgs_lock); + + /* Don't want to race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + + spin_lock(&bg->lock); + if (bg->reserved || bg->pinned || bg->ro) { + /* +* We want to bail if we made new allocations or have +* outstanding allocations in this block group. We do +* the ro check in case balance is currently acting on +* this block group. +*/ + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&bg->lock); + Here I think we want a if (btrfs_fs_closing()) goto next; so we don't block out a umount for all eternity. Thanks, Josef
Re: [PATCH v4 2/3] btrfs: rename delete_unused_bgs_mutex
On 4/15/21 9:58 AM, Johannes Thumshirn wrote: As a preparation for another user, rename the unused_bgs_mutex into reclaim_bgs_lock. Signed-off-by: Johannes Thumshirn Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH v4 1/3] btrfs: zoned: reset zones of relocated block groups
On 4/15/21 9:58 AM, Johannes Thumshirn wrote: When relocating a block group the freed up space is not discarded in one big block, but each extent is discarded on its own with -o discard=sync. For a zoned filesystem we need to discard the whole block group at once, so btrfs_discard_extent() will translate the discard into a REQ_OP_ZONE_RESET operation, which then resets the device's zone. Link: https://lore.kernel.org/linux-btrfs/459e2932c48e12e883dcfd3dda828d9da251d5b5.1617962110.git.johannes.thumsh...@wdc.com Signed-off-by: Johannes Thumshirn What would be cool is if we could disable discard per bg so we don't discard at all during the relocation, and then discard the whole block group no matter if we have zoned or not. However not really something you need to do, just thinking out loud Reviewed-by: Josef Bacik Thanks, Josef
Re: [RFC v3 0/2] vfs / btrfs: add support for ustat()
On 4/15/21 1:53 PM, Luis Chamberlain wrote: On Wed, Aug 23, 2017 at 3:31 PM Jeff Mahoney wrote: On 8/15/14 5:29 AM, Al Viro wrote: On Thu, Aug 14, 2014 at 07:58:56PM -0700, Luis R. Rodriguez wrote: Christoph had noted that this seemed associated to the problem that the btrfs uses different assignments for st_dev than s_dev, but much as I'd like to see that changed based on discussions so far its unclear if this is going to be possible unless strong commitment is reached. Resurrecting a dead thread since we've been carrying this patch anyway since then. Explain, please. Whose commitment and commitment to what, exactly? Having different ->st_dev values for different files on the same fs is a bloody bad idea; why does btrfs do that at all? If nothing else, it breaks the usual "are those two files on the same fs?" tests... It's because btrfs snapshots would have inode number collisions. Changing the inode numbers for snapshots would negate a big benefit of btrfs snapshots: the quick creation and lightweight on-disk representation due to metadata sharing. The thing is that ustat() used to work. Your commit 0ee5dc676a5f8 (btrfs: kill magical embedded struct superblock) had a regression: Since it replaced the superblock with a simple dev_t, it rendered the device no longer discoverable by user_get_super. We need a list_head to attach for searching. There's an argument that this is hacky. It's valid. The only other feedback I've heard is to use a real superblock for subvolumes to do this instead. That doesn't work either, due to things like freeze/thaw and inode writeback. Ultimately, what we need is a single file system with multiple namespaces. Years ago we just needed different inode namespaces, but as people have started adopting btrfs for containers, we need more than that. I've heard requests for per-subvolume security contexts. I'd imagine user namespaces are on someone's wish list. 
A working df can be done with ->d_automount, but the way btrfs handles having a "canonical" subvolume location has always been a way to avoid directory loops. I'd like to just automount subvolumes everywhere they're referenced. One solution, for which I have no code yet, is to have something like a superblock-light that we can hang things like a security context, a user namespace, and an anonymous dev. Most file systems would have just one. Btrfs would have one per subvolume. That's a big project with a bunch of discussion. 4 years have gone by and this patch is still being carried around for btrfs. Other than resolving this ustat() issue for btrfs are there new reasons to support this effort done to be done properly? Are there other filesystems that would benefit? I'd like to get an idea of the stakeholder here before considering taking this on or not. Not really sure why this needs to be addressed, we have statfs(), and what we have has worked forever now. There's a lot of larger things that need to be addressed in general to support the volume approach inside file systems that is going to require a lot of work inside of VFS. If you feel like tackling that work and then wiring up btrfs by all means have at it, but I'm not seeing a urgent need to address this. Thanks, Josef
Re: [PATCH] btrfs: fix race between transaction aborts and fsyncs leading to use-after-free
also setting BTRFS_FS_STATE_ERROR on fs_info->fs_state. After that, at cleanup_transaction(), it deletes the transaction from the list of transactions (fs_info->trans_list), sets the transaction to the state TRANS_STATE_COMMIT_DOING and then waits for the number of writers to go down to 1, as it's currently 2 (1 for task A and 1 for task B); 6) The transaction kthread is running and sees that BTRFS_FS_STATE_ERROR is set in fs_info->fs_state, so it calls btrfs_cleanup_transaction(). There it sees the list fs_info->trans_list is empty, and then proceeds into calling btrfs_drop_all_logs(), which frees the log root tree with a call to btrfs_free_log_root_tree(); 7) Task B calls write_all_supers() and, shortly after, under the label 'out_wake_log_root', it dereferences the pointer stored in 'log_root_tree', which was already freed in the previous step by the transaction kthread. This results in a use-after-free leading to a crash. Fix this by deleting the transaction from the list of transactions at cleanup_transaction() only after setting the transaction state to TRANS_STATE_COMMIT_DOING and waiting for all existing tasks that are attached to the transaction to release their transaction handles. This makes the transaction kthread wait for all the tasks attached to the transaction to be done with the transaction before dropping the log roots and doing other cleanups. Fixes: ef67963dac255b ("btrfs: drop logs when we've aborted a transaction") Signed-off-by: Filipe Manana Nice catch, Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH] btrfs-progs: Fix null pointer deref in balance_level
On 4/6/21 9:55 AM, Nikolay Borisov wrote: In case the right buffer is emptied it's first set to null and subsequently it's dereferenced to get its size to pass to root_sub_used. This naturally leads to a null pointer dereference. The correct thing to do is to pass the stashed right->len in "blocksize". Fixes #296 Signed-off-by: Nikolay Borisov Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH] btrfs: return whole extents in fiemap
On 4/6/21 6:31 PM, Boris Burkov wrote: `xfs_io -c 'fiemap ' ` can give surprising results on btrfs that differ from xfs. btrfs spits out extents trimmed to fit the user input. If the user's fiemap request has an offset, then rather than returning each whole extent which intersects that range, we also trim the start extent to not have start < off. Documentation in filesystems/fiemap.txt and the xfs_io man page suggests that returning the whole extent is expected. Some cases which all yield the same fiemap in xfs, but not btrfs: dd if=/dev/zero of=$f bs=4k count=1 sudo xfs_io -c 'fiemap 0 1024' $f 0: [0..7]: 26624..26631 sudo xfs_io -c 'fiemap 2048 1024' $f 0: [4..7]: 26628..26631 sudo xfs_io -c 'fiemap 2048 4096' $f 0: [4..7]: 26628..26631 sudo xfs_io -c 'fiemap 3584 512' $f 0: [7..7]: 26631..26631 sudo xfs_io -c 'fiemap 4091 5' $f 0: [7..6]: 26631..26630 I believe this is a consequence of the logic for merging contiguous extents represented by separate extent items. That logic needs to track the last offset as it loops through the extent items, which happens to pick up the start offset on the first iteration, and trim off the beginning of the full extent. To fix it, start `off` at 0 rather than `start` so that we keep the iteration/merging intact without cutting off the start of the extent. after the fix, all the above commands give: 0: [0..7]: 26624..26631 The merging logic is exercised by xfstest generic/483, and I have written a new xfstest for checking we don't have backwards or zero-length fiemaps for cases like those above. Signed-off-by: Boris Burkov Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH v2] btrfs: do more graceful error/warning for 32bit kernel
On 2/24/21 8:18 PM, Qu Wenruo wrote: Due to the pagecache limit of 32bit systems, btrfs can't access metadata at or beyond (ULONG_MAX + 1) << PAGE_SHIFT. This is 16T for 4K page size while 256T for 64K page size. And unlike other fses, btrfs uses internally mapped u64 address space for all of its metadata, this is more tricky than other fses. Users can have a fs which doesn't have metadata beyond the boundary at mount time, but later balance can cause btrfs to create metadata beyond the boundary. And modification to MM layer is unrealistic just for such minor use case. To address such problem, this patch will introduce the following checks, much like how XFS handles such problem: - Mount time rejection This will reject any fs which has metadata chunk at or beyond the boundary. - Mount time early warning If there is any metadata chunk beyond 5/8 of the boundary, we do an early warning and hope the end user will see it. - Runtime extent buffer rejection If we're going to allocate an extent buffer at or beyond the boundary, reject such request with -EOVERFLOW. This is definitely going to cause problems like transaction abort, but we have no better ways. - Runtime extent buffer early warning If an extent buffer beyond 5/8 of the max file size is allocated, do an early warning. Above error/warning message will only be outputted once for each fs to reduce dmesg flood. Reported-by: Erik Jensen Signed-off-by: Qu Wenruo This doesn't apply cleanly to misc-next so it needs to be respun, but otherwise Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH] btrfs: Correct try_lock_extent() usage in read_extent_buffer_subpage()
On 4/8/21 8:40 AM, Goldwyn Rodrigues wrote: try_lock_extent() returns 1 on success or 0 for failure and not an error code. If try_lock_extent() fails, read_extent_buffer_subpage() returns zero indicating subpage extent read success. Return EAGAIN/EWOULDBLOCK if try_lock_extent() fails in locking the extent. Signed-off-by: Goldwyn Rodrigues Reviewed-by: Josef Bacik Thanks, Josef
Re: ENOSPC in btrfs_run_delayed_refs with 5.10.8
On 4/8/21 12:10 PM, Martin Raiber wrote: On 11.03.2021 18:58 Martin Raiber wrote: On 01.02.2021 23:08 Martin Raiber wrote: On 27.01.2021 22:03 Chris Murphy wrote: On Wed, Jan 27, 2021 at 10:27 AM Martin Raiber wrote: Hi, seems 5.10.8 still has the ENOSPC issue when compression is used (compress-force=zstd,space_cache=v2): Jan 27 11:02:14 kernel: [248571.569840] [ cut here ] Jan 27 11:02:14 kernel: [248571.569843] BTRFS: Transaction aborted (error -28) Jan 27 11:02:14 kernel: [248571.569845] BTRFS: error (device dm-0) in add_to_free_space_tree:1039: errno=-28 No space left Jan 27 11:02:14 kernel: [248571.569848] BTRFS info (device dm-0): forced readonly Jan 27 11:02:14 kernel: [248571.569851] BTRFS: error (device dm-0) in add_to_free_space_tree:1039: errno=-28 No space left Jan 27 11:02:14 kernel: [248571.569852] BTRFS: error (device dm-0) in __btrfs_free_extent:3270: errno=-28 No space left Jan 27 11:02:14 kernel: [248571.569854] BTRFS: error (device dm-0) in btrfs_run_delayed_refs:2191: errno=-28 No space left Jan 27 11:02:14 kernel: [248571.569898] WARNING: CPU: 3 PID: 21255 at fs/btrfs/free-space-tree.c:1039 add_to_free_space_tree+0xe8/0x130 Jan 27 11:02:14 kernel: [248571.569913] BTRFS: error (device dm-0) in __btrfs_free_extent:3270: errno=-28 No space left Jan 27 11:02:14 kernel: [248571.569939] Modules linked in: Jan 27 11:02:14 kernel: [248571.569966] BTRFS: error (device dm-0) in btrfs_run_delayed_refs:2191: errno=-28 No space left Jan 27 11:02:14 kernel: [248571.569992] bfq zram bcache crc64 loop dm_crypt xfs dm_mod st sr_mod cdrom nf_tables nfnetlink iptable_filter bridge stp llc intel_powerclamp coretemp k$ Jan 27 11:02:14 kernel: [248571.570075] CPU: 3 PID: 21255 Comm: kworker/u50:22 Tainted: G I 5.10.8 #1 Jan 27 11:02:14 kernel: [248571.570076] Hardware name: Dell Inc. 
PowerEdge R510/0DPRKF, BIOS 1.13.0 03/02/2018 Jan 27 11:02:14 kernel: [248571.570079] Workqueue: events_unbound btrfs_async_reclaim_metadata_space Jan 27 11:02:14 kernel: [248571.570081] RIP: 0010:add_to_free_space_tree+0xe8/0x130 Jan 27 11:02:14 kernel: [248571.570082] Code: 55 50 f0 48 0f ba aa 40 0a 00 00 02 72 22 83 f8 fb 74 4c 83 f8 e2 74 47 89 c6 48 c7 c7 b8 39 49 82 89 44 24 04 e8 8a 99 4a 00 <0f> 0b 8$ Jan 27 11:02:14 kernel: [248571.570083] RSP: 0018:c90009c57b88 EFLAGS: 00010282 Jan 27 11:02:14 kernel: [248571.570084] RAX: RBX: 4000 RCX: 0027 Jan 27 11:02:14 kernel: [248571.570085] RDX: 0027 RSI: 0004 RDI: 888617a58b88 Jan 27 11:02:14 kernel: [248571.570086] RBP: 8889ecb874e0 R08: 888617a58b80 R09: Jan 27 11:02:14 kernel: [248571.570087] R10: 0001 R11: 822372e0 R12: 00574151 Jan 27 11:02:14 kernel: [248571.570087] R13: 8884e05727e0 R14: 88815ae4fc00 R15: 88815ae4fdd8 Jan 27 11:02:14 kernel: [248571.570088] FS: () GS:888617a4() knlGS: Jan 27 11:02:14 kernel: [248571.570089] CS: 0010 DS: ES: CR0: 80050033 Jan 27 11:02:14 kernel: [248571.570090] CR2: 7eb4a3a4f00a CR3: 0260a005 CR4: 000206e0 Jan 27 11:02:14 kernel: [248571.570091] Call Trace: Jan 27 11:02:14 kernel: [248571.570097] __btrfs_free_extent.isra.0+0x56a/0xa10 Jan 27 11:02:14 kernel: [248571.570100] __btrfs_run_delayed_refs+0x659/0xf20 Jan 27 11:02:14 kernel: [248571.570102] btrfs_run_delayed_refs+0x73/0x200 Jan 27 11:02:14 kernel: [248571.570103] flush_space+0x4e8/0x5e0 Jan 27 11:02:14 kernel: [248571.570105] ? btrfs_get_alloc_profile+0x66/0x1b0 Jan 27 11:02:14 kernel: [248571.570106] ? btrfs_get_alloc_profile+0x66/0x1b0 Jan 27 11:02:14 kernel: [248571.570107] btrfs_async_reclaim_metadata_space+0x107/0x3a0 Jan 27 11:02:14 kernel: [248571.570111] process_one_work+0x1b6/0x350 Jan 27 11:02:14 kernel: [248571.570112] worker_thread+0x50/0x3b0 Jan 27 11:02:14 kernel: [248571.570114] ? 
process_one_work+0x350/0x350 Jan 27 11:02:14 kernel: [248571.570116] kthread+0xfe/0x140 Jan 27 11:02:14 kernel: [248571.570117] ? kthread_park+0x90/0x90 Jan 27 11:02:14 kernel: [248571.570120] ret_from_fork+0x22/0x30 Jan 27 11:02:14 kernel: [248571.570122] ---[ end trace 568d2f30de65b1c0 ]--- Jan 27 11:02:14 kernel: [248571.570123] BTRFS: error (device dm-0) in add_to_free_space_tree:1039: errno=-28 No space left Jan 27 11:02:14 kernel: [248571.570151] BTRFS: error (device dm-0) in __btrfs_free_extent:3270: errno=-28 No space left Jan 27 11:02:14 kernel: [248571.570178] BTRFS: error (device dm-0) in btrfs_run_delayed_refs:2191: errno=-28 No space left btrfs fi usage: Overall: Device size: 931.49GiB Device allocated: 931.49GiB Device unallocated: 1.00MiB Device missing: 0.00B Used: 786.39GiB Free (estimated):
Re: [PATCH v2] btrfs: zoned: move superblock logging zone location
On 4/8/21 4:25 AM, Naohiro Aota wrote: This commit moves the location of the superblock logging zones. The new locations of the logging zones are now determined based on fixed block addresses instead of on fixed zone numbers. The old placement method based on fixed zone numbers causes problems when one needs to inspect a file system image without access to the drive zone information. In such case, the super block locations cannot be reliably determined as the zone size is unknown. By locating the superblock logging zones using fixed addresses, we can scan a dumped file system image without the zone information since a super block copy will always be present at or after the fixed location. This commit introduces the following three pairs of zones containing fixed offset locations, regardless of the device zone size. - Primary superblock: zone starting at offset 0 and the following zone - First copy: zone containing offset 64GB and the following zone - Second copy: zone containing offset 256GB and the following zone If a logging zone is outside of the disk capacity, we do not record the superblock copy. The first copy position is much larger than for a regular btrfs volume (64M). This increase is to avoid overlapping with the log zones for the primary superblock. This higher location is arbitrary but allows supporting devices with very large zone sizes, up to 32GB. Such large zone size is unrealistic and very unlikely to ever be seen in real devices. Currently, SMR disks have a zone size of 256MB, and we are expecting ZNS drives to be in the 1-4GB range, so this 32GB limit gives us room to breathe. For now, we only allow zone sizes up to 8GB, below this hard limit of 32GB. The fixed location addresses are somewhat arbitrary, but with the intent of maintaining superblock reliability even for smaller devices. For this reason, the superblock fixed locations do not exceed 1TB. 
The superblock logging zones are reserved for superblock logging and never used for data or metadata blocks. Note that we only reserve the two zones per primary/copy actually used for superblock logging. We do not reserve the ranges of zones possibly containing superblocks with the largest supported zone size (0-16GB, 64G-80GB, 256G-272GB). The zones containing the fixed location offsets used to store superblocks in a regular btrfs volume (no zoned case) are also reserved to avoid confusion. Signed-off-by: Naohiro Aota Thanks Naohiro, this makes it much easier to understand, Reviewed-by: Josef Bacik Dave, I know you're on vacation, this needs to go to Linus for this cycle so we don't have two different SB slots for two different kernels. I don't want to disturb your vacation, so I'll put this in a pull request tomorrow to Linus. If you already had plans to do this let me know and I'll hold off. Thanks, Josef
Re: [PATCH] btrfs: zoned: move superblock logging zone location
On 4/7/21 2:31 PM, Johannes Thumshirn wrote: On 07/04/2021 19:54, Josef Bacik wrote: On 3/15/21 1:53 AM, Naohiro Aota wrote: This commit moves the location of superblock logging zones. The location of the logging zones are determined based on fixed block addresses instead of on fixed zone numbers. By locating the superblock zones using fixed addresses, we can scan a dumped file system image without the zone information. And, no drawbacks exist. We use the following three pairs of zones containing fixed offset locations, regardless of the device zone size. - Primary superblock: zone starting at offset 0 and the following zone - First copy: zone containing offset 64GB and the following zone - Second copy: zone containing offset 256GB and the following zone If the location of the zones are outside of disk, we don't record the superblock copy. These addresses are arbitrary, but using addresses that are too large reduces superblock reliability for smaller devices, so we do not want to exceed 1T to cover all case nicely. Also, LBAs are generally distributed initially across one head (platter side) up to one or more zones, then go on the next head backward (the other side of the same platter), and on to the following head/platter. Thus using non sequential fixed addresses for superblock logging, such as 0/64G/256G, likely result in each superblock copy being on a different head/platter which improves chances of recovery in case of superblock read error. These zones are reserved for superblock logging and never used for data or metadata blocks. Zones containing the offsets used to store superblocks in a regular btrfs volume (no zoned case) are also reserved to avoid confusion. Note that we only reserve the 2 zones per primary/copy actually used for superblock logging. We don't reserve the ranges possibly containing superblock with the largest supported zone size (0-16GB, 64G-80GB, 256G-272GB). The first copy position is much larger than for a regular btrfs volume (64M). 
This increase is to avoid overlapping with the log zones for the primary superblock. This higher location is arbitrary but allows supporting devices with very large zone size, up to 32GB. But we only allow zone sizes up to 8GB for now. Ok it took me a few reads to figure out what's going on. The problem is that with large zone sizes, our current choices put the back up super blocks way out on the disk, correct? So instead you've picked arbitrary byte offsets, hoping that they'll be closer to the front of the disk and thus actually be useful. And then you've introduced the 8gib zone size as a way to avoid problems where we get the same zone for the backup supers. Are these statements correct? If so the changelog should be updated to make this clear up front, because it took me a while to work that out. No the problem is, we're placing superblocks into specific zones, regardless of the zone size. This creates a problem when you need to inspect a file system, but don't have the block device available, because you can't look at the zone size to calculate where the superblocks are on the device. With this change we're placing the superblocks not into specific zone numbers, but into the zones starting at specific offsets. We're taking 8G zone size as a maximum expected zone size, to make sure we're not overlapping superblock zones. Currently SMR disks have a zone size of 256MB and we're expecting ZNS drives to be in the 1-2GB range, so this 8GB gives us room to breathe. Hope this helps clear up any confusion. Ok this makes a lot more sense, and should be the first thing in the changelog, because I still got it wrong after reading the thing a few times. And I think it's worth pointing out in the comments that 8gib represents a zone size that doesn't exist currently and is likely to never exist. That will make this much easier to grok and understand in the future. Thanks, Josef
Re: [PATCH] btrfs: zoned: move superblock logging zone location
On 3/15/21 1:53 AM, Naohiro Aota wrote: This commit moves the location of superblock logging zones. The location of the logging zones are determined based on fixed block addresses instead of on fixed zone numbers. By locating the superblock zones using fixed addresses, we can scan a dumped file system image without the zone information. And, no drawbacks exist. We use the following three pairs of zones containing fixed offset locations, regardless of the device zone size. - Primary superblock: zone starting at offset 0 and the following zone - First copy: zone containing offset 64GB and the following zone - Second copy: zone containing offset 256GB and the following zone If the location of the zones are outside of disk, we don't record the superblock copy. These addresses are arbitrary, but using addresses that are too large reduces superblock reliability for smaller devices, so we do not want to exceed 1T to cover all case nicely. Also, LBAs are generally distributed initially across one head (platter side) up to one or more zones, then go on the next head backward (the other side of the same platter), and on to the following head/platter. Thus using non sequential fixed addresses for superblock logging, such as 0/64G/256G, likely result in each superblock copy being on a different head/platter which improves chances of recovery in case of superblock read error. These zones are reserved for superblock logging and never used for data or metadata blocks. Zones containing the offsets used to store superblocks in a regular btrfs volume (no zoned case) are also reserved to avoid confusion. Note that we only reserve the 2 zones per primary/copy actually used for superblock logging. We don't reserve the ranges possibly containing superblock with the largest supported zone size (0-16GB, 64G-80GB, 256G-272GB). The first copy position is much larger than for a regular btrfs volume (64M). This increase is to avoid overlapping with the log zones for the primary superblock. 
This higher location is arbitrary but allows supporting devices with very large zone size, up to 32GB. But we only allow zone sizes up to 8GB for now. Ok it took me a few reads to figure out what's going on. The problem is that with large zone sizes, our current choices put the back up super blocks way out on the disk, correct? So instead you've picked arbitrary byte offsets, hoping that they'll be closer to the front of the disk and thus actually be useful. And then you've introduced the 8gib zone size as a way to avoid problems where we get the same zone for the backup supers. Are these statements correct? If so the changelog should be updated to make this clear up front, because it took me a while to work that out. Something at the beginning like the following "With larger zone sizes, for example 8gib, the 3rd backup super would be located 8tib into the device. However not all zoned block devices are this large. In order to fix this limitation set the zones to a static byte offset, and calculate the zone number from there based on the device's zone size." So that it's clear from the outset why we're making this change. And this brings up another problem, in that what happens when we _do_ run into block devices that have huge zones, like 64gib zones? We have to change the disk format to support these devices. I'm not against that per se, but it seems like a limitation, even if it's unlikely to ever happen. With the locations we currently have, any arbitrary zone size is going to work in the future, and the only drawback is you need a device of a certain size to take advantage of the back up super blocks. I would hope that we don't have 64gib zone size block devices that are only 128gib in size in the future. 
Signed-off-by: Naohiro Aota --- fs/btrfs/zoned.c | 39 +++ 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 43948bd40e02..6a72ca1f7988 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -21,9 +21,24 @@ /* Pseudo write pointer value for conventional zone */ #define WP_CONVENTIONAL ((u64)-2) +/* + * Location of the first zone of superblock logging zone pairs. + * - Primary superblock: the zone containing offset 0 (zone 0) + * - First superblock copy: the zone containing offset 64G + * - Second superblock copy: the zone containing offset 256G + */ +#define BTRFS_PRIMARY_SB_LOG_ZONE 0ULL +#define BTRFS_FIRST_SB_LOG_ZONE (64ULL * SZ_1G) +#define BTRFS_SECOND_SB_LOG_ZONE (256ULL * SZ_1G) +#define BTRFS_FIRST_SB_LOG_ZONE_SHIFT const_ilog2(BTRFS_FIRST_SB_LOG_ZONE) +#define BTRFS_SECOND_SB_LOG_ZONE_SHIFT const_ilog2(BTRFS_SECOND_SB_LOG_ZONE) + /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 +/* Max size of supported zone size */ +#define BTRFS_MAX_ZONE_SIZE SZ_8G + static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
Re: Aw: Re: Re: Help needed with filesystem errors: parent transid verify failed
On 3/29/21 4:42 AM, B A wrote: Gesendet: Montag, 29. März 2021 um 08:09 Uhr Von: "Chris Murphy" An: "B A" , "Btrfs BTRFS" Cc: "Qu Wenruo" , "Josef Bacik" Betreff: Re: Re: Help needed with filesystem errors: parent transid verify failed […] What do you get for: btrfs insp dump-s -f /dev/dm-0 See attached file "btrfs_insp_dump-s_-f.txt" I'm on PTO this week so I'll be a little less responsive, but thankfully this is just the extent tree. First thing is to make sure you've backed everything up, and then you should be able to just do btrfs check --repair and it should fix it for you. However I've noticed some failure cases where it won't fix transid errors sometimes because it errors out trying to read the things. If that happens just let me know, I have a private branch with fsck changes to address this class of problems and I can point you at that. I'd rather wait to make sure the normal fsck won't work first tho, just in case. Thanks, Josef
[PATCH] btrfs: use percpu_read_positive instead of sum_positive for need_preempt
Looking at perf data for a fio workload I noticed that we were spending a pretty large chunk of time (around 5%) doing percpu_counter_sum() in need_preemptive_reclaim. This is silly, as we only want to know if we have more ordered than delalloc to see if we should be counting the delayed items in our threshold calculation. Change this to percpu_read_positive() to avoid the overhead. I ran this through fsperf to validate the changes, obviously the latency numbers in dbench and fio are quite jittery, so take them as you wish, but overall the improvements on throughput, iops, and bw are all positive. Each test was run two times, the given value is the average of both runs for their respective column. btrfs ssd normal test results bufferedrandwrite16g results metric baseline current diff == write_io_kbytes 16777216 16777216 0.00% read_clat_ns_p99 0 0 0.00% write_bw_bytes 1.04e+08 1.05e+08 1.12% read_iops 0 0 0.00% write_clat_ns_p50 13888 11840 -14.75% read_io_kbytes 0 0 0.00% read_io_bytes 0 0 0.00% write_clat_ns_p99 35008 29312 -16.27% read_bw_bytes 0 0 0.00% elapsed 170167-1.76% write_lat_ns_min 4221.503762.50 -10.87% sys_cpu39.65 35.37 -10.79% write_lat_ns_max2.67e+10 2.50e+10-6.63% read_lat_ns_min0 0 0.00% write_iops 25270.10 25553.43 1.12% read_lat_ns_max0 0 0.00% read_clat_ns_p50 0 0 0.00% dbench60 results metric baseline current diff == qpathinfo 11.12 12.7314.52% throughput 416.09445.66 7.11% flush 3485.63 1887.55 -45.85% qfileinfo0.70 1.92 173.86% ntcreatex 992.60695.76 -29.91% qfsinfo 2.43 3.7152.48% close1.67 3.1488.09% sfileinfo 66.54105.2058.10% rename 809.23619.59 -23.43% find16.88 15.46-8.41% unlink 820.54670.86 -18.24% writex3375.20 2637.91 -21.84% deltree386.33449.9816.48% readx3.43 3.41-0.60% mkdir0.05 0.03 -38.46% lockx0.26 0.26-0.76% unlockx 0.81 0.32 -60.33% dio4kbs16threads results metric baseline current diff write_io_kbytes 5249676 3357150 -36.05% read_clat_ns_p99 0 0 0.00% write_bw_bytes 89583501.50 57291192.50 -36.05% read_iops 0 0 0.00% 
write_clat_ns_p50242688263680 8.65% read_io_kbytes0 0 0.00% read_io_bytes 0 0 0.00% write_clat_ns_p99 15826944 36732928 132.09% read_bw_bytes 0 0 0.00% elapsed 6161 0.00% write_lat_ns_min 42704 42095-1.43% sys_cpu5.27 3.45 -34.52% write_lat_ns_max 7.43e+08 9.27e+0824.71% read_lat_ns_min 0 0 0.00% write_iops 21870.97 13987.11 -36.05% read_lat_ns_max 0 0 0.00% read_clat_ns_p50 0 0 0.00% randwrite2xram results metric baseline current diff write_io_kbytes24831972 2887626216.29% read_clat_ns_p99 0 0 0.00% write_bw_bytes 83745273.50 92182192.5010.07% read_iops 0 0 0.00% write_clat_ns_p50 13952 11648 -16.51% read_io_kbytes0 0 0.00% read_io_bytes 0 0 0.00% write_clat_ns_p99 50176 52992 5.61% read_bw_bytes 0 0 0.00% elapsed 314 332 5.73% write_lat_ns_min5920.50 5127 -13.40% sys_cpu7.82 7.35-6.07% write_lat_ns_max 5.27e+10 3.88e+10 -26.44% read_lat_ns_min 0 0 0.00% write_iops 20445.62 22505.4210.07% read_lat_ns_max 0 0 0.00% read_clat_ns_p50 0 0 0.00% untarfirefox results metricbaseline currentdiff == elapsed 47.41 47.40 -0.03% Signed-off-by: Josef Bacik --- fs/btrfs/space-info.c | 4 ++-- 1 file changed, 2
Re: [PATCH v8 00/10] fs: interface for directly reading/writing compressed data
On 3/19/21 4:08 PM, Linus Torvalds wrote: On Fri, Mar 19, 2021 at 11:21 AM Josef Bacik wrote: Can we get some movement on this? Omar is sort of spinning his wheels here trying to get this stuff merged, no major changes have been done in a few postings. I'm not Al, and I absolutely detest the IOCB_ENCODED thing, and want more explanations of why this should be done that way, and pollute our iov_iter handling EVEN MORE. Our iov_iter stuff isn't the most legible, and I don't understand why anybody would ever think it's a good idea to spread what is clearly a "struct" inside multiple different iov extents. Honestly, this sounds way more like an ioctl interface than read/write. We've done that before. That's actually the way this started https://lore.kernel.org/linux-fsdevel/8eae56abb90c0fe87c350322485ce8674e135074.1567623877.git.osan...@fb.com/ it was suggested that Omar make it generic by Dave Chinner, hence this is the direction it took. I'll leave the rest of the comments for Omar to respond to himself. Thanks, Josef
Re: [PATCH v8 00/10] fs: interface for directly reading/writing compressed data
On 3/16/21 3:42 PM, Omar Sandoval wrote: From: Omar Sandoval This series adds an API for reading compressed data on a filesystem without decompressing it as well as support for writing compressed data directly to the filesystem. As with the previous submissions, I've included a man page patch describing the API. I have test cases (including fsstress support) and example programs which I'll send up [1]. The main use-case is Btrfs send/receive: currently, when sending data from one compressed filesystem to another, the sending side decompresses the data and the receiving side recompresses it before writing it out. This is wasteful and can be avoided if we can just send and write compressed extents. The patches implementing the send/receive support will be sent shortly. Patches 1-3 add the VFS support and UAPI. Patch 4 is a fix that this series depends on; it can be merged independently. Patches 5-8 are Btrfs prep patches. Patch 9 adds Btrfs encoded read support and patch 10 adds Btrfs encoded write support. These patches are based on Dave Sterba's Btrfs misc-next branch [2], which is in turn currently based on v5.12-rc3. Al, Can we get some movement on this? Omar is sort of spinning his wheels here trying to get this stuff merged, no major changes have been done in a few postings. Thanks, Josef
Re: [PATCH v2 2/2] btrfs: zoned: automatically reclaim zones
On 3/19/21 6:48 AM, Johannes Thumshirn wrote: When a file gets deleted on a zoned file system, the space freed is not returned back into the block group's free space, but is migrated to zone_unusable. As this zone_unusable space is behind the current write pointer it is not possible to use it for new allocations. In the current implementation a zone is reset once all of the block group's space is accounted as zone unusable. This behaviour can lead to premature ENOSPC errors on a busy file system. Instead of only reclaiming the zone once it is completely unusable, kick off a reclaim job once the amount of unusable bytes exceeds a user configurable threshold between 51% and 100%. It can be set per mounted filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75% per default. Similar to reclaiming unused block groups, these dirty block groups are added to a to_reclaim list and then on a transaction commit, the reclaim process is triggered but after we deleted unused block groups, which will free space for the relocation process. Signed-off-by: Johannes Thumshirn --- AFAICT sysfs_create_files() does not have the ability to provide a is_visible callback, so the bg_reclaim_threshold sysfs file is visible for non zoned filesystems as well, even though only for zoned filesystems we're adding block groups to the reclaim list. I'm all ears for a approach that is sensible in this regard. 
fs/btrfs/block-group.c | 84 fs/btrfs/block-group.h | 2 + fs/btrfs/ctree.h | 3 ++ fs/btrfs/disk-io.c | 11 + fs/btrfs/free-space-cache.c | 9 +++- fs/btrfs/sysfs.c | 35 +++ fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 1 + include/trace/events/btrfs.h | 12 ++ 9 files changed, 157 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9ae3ac96a521..af9026795ddd 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1485,6 +1485,80 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *bg; + struct btrfs_space_info *space_info; + int ret = 0; + + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return; + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + return; + + mutex_lock(&fs_info->reclaim_bgs_lock); + while (!list_empty(&fs_info->reclaim_bgs)) { + bg = list_first_entry(&fs_info->reclaim_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&bg->bg_list); + + space_info = bg->space_info; + mutex_unlock(&fs_info->reclaim_bgs_lock); + + /* Don't want to race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + + spin_lock(&bg->lock); + if (bg->reserved || bg->pinned || bg->ro) { How do we deal with backup supers in zoned again? Will they show up as readonly? If so we may not want the bg->ro check, but I may be insane. + /* +* We want to bail if we made new allocations or have +* outstanding allocations in this block group. We do +* the ro check in case balance is currently acting on +* this block group. 
+*/ + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&bg->lock); + + ret = inc_block_group_ro(bg, 0); + up_write(&space_info->groups_sem); + if (ret < 0) { + ret = 0; + goto next; + } + + btrfs_info(fs_info, "reclaiming chunk %llu", bg->start); + trace_btrfs_reclaim_block_group(bg); + ret = btrfs_relocate_chunk(fs_info, bg->start); + if (ret) + btrfs_err(fs_info, "error relocating chunk %llu", + bg->start); + +next: + btrfs_put_block_group(bg); + mutex_lock(&fs_info->reclaim_bgs_lock); + } + mutex_unlock(&fs_info->reclaim_bgs_lock); + btrfs_exclop_finish(fs_info); +} + +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + mutex_lock(&fs_info->reclaim_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + trace_btrfs_add_reclaim_block_group(bg); + list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs
Re: [PATCH v2 1/2] btrfs: rename delete_unused_bgs_mutex
On 3/19/21 6:48 AM, Johannes Thumshirn wrote: As a preparation for another user, rename the unused_bgs_mutex into reclaim_bgs_lock. Signed-off-by: Johannes Thumshirn Reviewed-by: Josef Bacik Thanks, Josef
Re: [PATCH 0/3] Handle bad dev_root properly with rescue=all
On 3/18/21 11:43 AM, David Sterba wrote: On Thu, Mar 11, 2021 at 11:23:13AM -0500, Josef Bacik wrote: [...] rescue=all working without panicing the machine, and I verified everything by using btrfs-corrupt-block to corrupt a dev root of a file system. Thanks, We need to have such testing part of some suite but it depends on the utilities that we don't want to ship by default. Last time we discussed how to make btrfs-corrupt-block or similar available in source form in fstests it was not pretty, like having half of the btrfs-progs sources providing code to read and modify the data structures. So, what if: we have such tests in the btrfs-progs testsuite. As they're potentially dangerous, not run by default, but available for easy setup and test in a VM. The testsuite can be exported into a tarball, with the binaries included. Or simply run it built from git, that works too just needs more dependencies installed. I was thinking about collecting all the stress tests into one place, fstests already has some but my idea is to provide stress testing of btrfs-specific features, more at once. Or require some 3rd party tools and data sources to provide the load. It would be some infrastructure duplication with fstests, but lots of that is already done in btrfs-progs and I'd rather go with some duplication instead of development-time-only testing. I actually had Boris working on this, basically allow us to set BTRFS_CORRUPT_PROG inside our fstests config, and then allow us to write these tests for fstests. I'd prefer to keep this inside xfstests for mostly selfish reasons, it's easier for me to adapt the existing continual testing to take advantage of this and automatically run the xfstests stuff and post the results. If we keep it in btrfs-progs I have to write a bunch of code to run those to post the results to our continual testing stuff. Thanks, Josef
Re: [PATCH] btrfs: zoned: automatically reclaim zones
On 3/17/21 6:38 AM, Johannes Thumshirn wrote: When a file gets deleted on a zoned file system, the space freed is not returned back into the block group's free space, but is migrated to zone_unusable. As this zone_unusable space is behind the current write pointer it is not possible to use it for new allocations. In the current implementation a zone is reset once all of the block group's space is accounted as zone unusable. This behaviour can lead to premature ENOSPC errors on a busy file system. Instead of only reclaiming the zone once it is completely unusable, kick off a reclaim job once the amount of unusable bytes exceeds a user configurable threshold between 51% and 100%. Similar to reclaiming unused block groups, these dirty block groups are added to a to_reclaim list and then on a transaction commit, the reclaim process is triggered but after we deleted unused block groups, which will free space for the relocation process. Signed-off-by: Johannes Thumshirn I think this is generally useful, but we should indicate to the user that it's happening via automatic cleanup and not because of somebody using btrfs balance start. As for doing it on non-zoned, I'm still hesitant to do this simply because I haven't been able to finish working on the relocation stuff. Once my current set of patches get merged I'll pick it back up, but there still remains some pretty unpleasant side-effects of balance. Chris and I have been discussing your RAID5/6 plan, and we both really, really like the idea of the stripe root from the point of view that it makes relocation insanely straightforward and simple. I think once we have that in place I'd feel a lot more comfortable running relocation automatically everywhere, as it'll be a much faster and less error prone operation. Thanks, Josef
Re: [PATCH 0/3] Handle bad dev_root properly with rescue=all
On 3/17/21 8:27 AM, David Sterba wrote: On Thu, Mar 11, 2021 at 11:23:13AM -0500, Josef Bacik wrote: Hello, My recent debugging session with Neal's broken filesystem uncovered a glaring hole in my rescue=all patches, they don't deal with a NULL dev_root properly. In testing I only ever tested corrupting the extent tree or the csum tree, since those are the most problematic. The following 3 fixes allowed Neal to get rescue=all working without panicing the machine, and I verified everything by using btrfs-corrupt-block to corrupt a dev root of a file system. Thanks, When rescue= is set lots of things can't work, I was wondering if we should add messages once the "if (!dev_root)" cases are hit but I don't think we need it, it should be clear enough from the other rescue= related messages. Yeah I went back and forth on this, and I came to the same conclusion as you. If we're doing rescue=all we know things are bad, and we just want our data back. Thanks, Josef
[PATCH v8 39/39] btrfs: check return value of btrfs_commit_transaction in relocation
There's a few places where we don't check the return value of btrfs_commit_transaction in relocation.c. Thankfully all these places have straightforward error handling, so simply change all of the sites at once. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 557d4f09ce7f..371cd4476daf 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1915,7 +1915,7 @@ int prepare_to_merge(struct reloc_control *rc, int err) list_splice(&reloc_roots, &rc->reloc_roots); if (!err) - btrfs_commit_transaction(trans); + err = btrfs_commit_transaction(trans); else btrfs_end_transaction(trans); return err; @@ -3487,8 +3487,7 @@ int prepare_to_relocate(struct reloc_control *rc) */ return PTR_ERR(trans); } - btrfs_commit_transaction(trans); - return 0; + return btrfs_commit_transaction(trans); } static noinline_for_stack int relocate_block_group(struct reloc_control *rc) @@ -3647,7 +3646,9 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) err = PTR_ERR(trans); goto out_free; } - btrfs_commit_transaction(trans); + ret = btrfs_commit_transaction(trans); + if (ret && !err) + err = ret; out_free: ret = clean_dirty_subvols(rc); if (ret < 0 && !err) -- 2.26.2
[PATCH v8 38/39] btrfs: do proper error handling in merge_reloc_roots
We have a BUG_ON() if we get an error back from btrfs_get_fs_root(). This honestly should never fail, as at this point we have a solid coordination of fs root to reloc root, and these roots will all be in memory. But in the name of killing BUG_ON()'s remove these and handle the error condition properly, ASSERT()'ing for developers. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 25 +++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 621c4284d158..557d4f09ce7f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1960,8 +1960,29 @@ void merge_reloc_roots(struct reloc_control *rc) root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); if (btrfs_root_refs(&reloc_root->root_item) > 0) { - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); + if (IS_ERR(root)) { + /* +* For recovery we read the fs roots on mount, +* and if we didn't find the root then we marked +* the reloc root as a garbage root. For normal +* relocation obviously the root should exist in +* memory. However there's no reason we can't +* handle the error properly here just in case. +*/ + ASSERT(0); + ret = PTR_ERR(root); + goto out; + } + if (root->reloc_root != reloc_root) { + /* +* This is actually impossible without something +* going really wrong (like weird race condition +* or cosmic rays). +*/ + ASSERT(0); + ret = -EINVAL; + goto out; + } ret = merge_reloc_root(rc, root); btrfs_put_root(root); if (ret) { -- 2.26.2
[PATCH v8 37/39] btrfs: handle extent corruption with select_one_root properly
In corruption cases we could have paths from a block up to no root at all, and thus we'll BUG_ON(!root) in select_one_root. Handle this by adding an ASSERT() for developers, and returning an error for normal users. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8f7760f8fcc3..621c4284d158 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2209,7 +2209,13 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; - BUG_ON(!root); + + /* +* This can occur if we have incomplete extent refs leading all +* the way up a particular path, in this case return -EUCLEAN. +*/ + if (!root) + return ERR_PTR(-EUCLEAN); /* No other choice for non-shareable tree */ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) @@ -2599,8 +2605,15 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, BUG_ON(node->processed); root = select_one_root(node); - if (root == ERR_PTR(-ENOENT)) { - update_processed_blocks(rc, node); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + + /* See explanation in select_one_root for the -EUCLEAN case. */ + ASSERT(ret == -ENOENT); + if (ret == -ENOENT) { + ret = 0; + update_processed_blocks(rc, node); + } goto out; } -- 2.26.2
[PATCH v8 36/39] btrfs: cleanup error handling in prepare_to_merge
This probably can't happen even with a corrupt file system, because we would have failed much earlier on than here. However there's no reason we can't just check and bail out as appropriate, so do that and convert the correctness BUG_ON() to an ASSERT(). Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 857da684d415..8f7760f8fcc3 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1880,8 +1880,14 @@ int prepare_to_merge(struct reloc_control *rc, int err) root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); + if (IS_ERR(root)) { + list_add(&reloc_root->root_list, &reloc_roots); + btrfs_abort_transaction(trans, (int)PTR_ERR(root)); + if (!err) + err = PTR_ERR(root); + break; + } + ASSERT(root->reloc_root == reloc_root); /* * set reference count to 1, so btrfs_recover_relocation -- 2.26.2
[PATCH v8 34/39] btrfs: handle __add_reloc_root failures in btrfs_recover_relocation
We can already handle errors appropriately from this function, deal with an error coming from __add_reloc_root appropriately. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 43037cb302fd..2fc04f96294e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4054,7 +4054,12 @@ int btrfs_recover_relocation(struct btrfs_root *root) } err = __add_reloc_root(reloc_root); - BUG_ON(err < 0); /* -ENOMEM or logic error */ + if (err) { + list_add_tail(&reloc_root->root_list, &reloc_roots); + btrfs_put_root(fs_root); + btrfs_end_transaction(trans); + goto out_unset; + } fs_root->reloc_root = btrfs_grab_root(reloc_root); btrfs_put_root(fs_root); } @@ -4269,7 +4274,10 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); - BUG_ON(ret < 0); + if (ret) { + btrfs_put_root(reloc_root); + return ret; + } new_root->reloc_root = btrfs_grab_root(reloc_root); if (rc->create_reloc_tree) -- 2.26.2
[PATCH v8 32/39] btrfs: remove the extent item sanity checks in relocate_block_group
These checks are all taken care of for us by the tree checker code. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 29 + 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 71a3db66c1ab..288d5df39fa0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3407,20 +3407,6 @@ static void unset_reloc_control(struct reloc_control *rc) mutex_unlock(&fs_info->reloc_mutex); } -static int check_extent_flags(u64 flags) -{ - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if (!(flags & BTRFS_EXTENT_FLAG_DATA) && - !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - return 1; - return 0; -} - static noinline_for_stack int prepare_to_relocate(struct reloc_control *rc) { @@ -3472,7 +3458,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) struct btrfs_path *path; struct btrfs_extent_item *ei; u64 flags; - u32 item_size; int ret; int err = 0; int progress = 0; @@ -3521,19 +3506,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - if (item_size >= sizeof(*ei)) { - flags = btrfs_extent_flags(path->nodes[0], ei); - ret = check_extent_flags(flags); - BUG_ON(ret); - } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { - err = -EINVAL; - btrfs_print_v0_err(trans->fs_info); - btrfs_abort_transaction(trans, err); - break; - } else { - BUG(); - } + flags = btrfs_extent_flags(path->nodes[0], ei); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = add_tree_block(rc, &key, path, &blocks); -- 2.26.2
[PATCH v8 30/39] btrfs: handle extent reference errors in do_relocation
We can already deal with errors appropriately from do_relocation, simply handle any errors that come from changing the refs at this point cleanly. We have to abort the transaction if we fail here as we've modified metadata at this point. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 1a0e07507796..71a3db66c1ab 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2432,10 +2432,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, btrfs_init_tree_ref(&ref, node->level, btrfs_header_owner(upper->eb)); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); - - ret = btrfs_drop_subtree(trans, root, eb, upper->eb); - BUG_ON(ret); + if (!ret) + ret = btrfs_drop_subtree(trans, root, eb, +upper->eb); + if (ret) + btrfs_abort_transaction(trans, ret); } next: if (!upper->pending) -- 2.26.2
[PATCH v8 33/39] btrfs: do proper error handling in create_reloc_inode
We already handle some errors in this function, and the callers do the correct error handling, so clean up the rest of the function to do the appropriate error handling. There's a little extra work that needs to be done here, as we create the inode item before we create the orphan item. We could potentially add the orphan item, but if we failed to create the inode item we would have to abort the transaction. Instead add a helper to delete the inode item we created in the case that we're unable to look up the inode (this would likely be caused by an ENOMEM), which if it succeeds means we can avoid a transaction abort in this particular error case. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 39 +-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 288d5df39fa0..43037cb302fd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3648,6 +3648,35 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, return ret; } +static void delete_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto out; + } + ret = btrfs_del_item(trans, root, path); +out: + if (ret) + btrfs_abort_transaction(trans, ret); + btrfs_free_path(path); +} + /* * helper to create inode for data relocation. 
* the inode is in data relocation tree and its link count is 0 @@ -3674,10 +3703,16 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, goto out; err = __insert_orphan_inode(trans, root, objectid); - BUG_ON(err); + if (err) + goto out; inode = btrfs_iget(fs_info->sb, objectid, root); - BUG_ON(IS_ERR(inode)); + if (IS_ERR(inode)) { + delete_orphan_inode(trans, root, objectid); + err = PTR_ERR(inode); + inode = NULL; + goto out; + } BTRFS_I(inode)->index_cnt = group->start; err = btrfs_orphan_add(trans, BTRFS_I(inode)); -- 2.26.2
[PATCH v8 31/39] btrfs: check for BTRFS_BLOCK_FLAG_FULL_BACKREF being set improperly
We need to validate that a data extent item does not have the FULL_BACKREF flag set on its flags. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/tree-checker.c | 5 + 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index f4ade821307d..8e5926232d1e 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1290,6 +1290,11 @@ static int check_extent_item(struct extent_buffer *leaf, key->offset, fs_info->sectorsize); return -EUCLEAN; } + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + extent_err(leaf, slot, + "invalid extent flag, data has full backref set"); + return -EUCLEAN; + } } ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1); -- 2.26.2
[PATCH v8 35/39] btrfs: do not panic in __add_reloc_root
If we have a duplicate entry for a reloc root then we could have fs corruption that resulted in a double allocation. Since this shouldn't happen unless there is corruption, add an ASSERT(ret != -EEXIST) to all of the callers of __add_reloc_root() to catch any logic mistakes for developers, otherwise normal error handling will happen for normal users. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2fc04f96294e..857da684d415 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -638,9 +638,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) { - btrfs_panic(fs_info, -EEXIST, + btrfs_err(fs_info, "Duplicate root found for start=%llu while inserting into relocation tree", node->bytenr); + return -EEXIST; } list_add_tail(&root->root_list, &rc->reloc_roots); @@ -882,6 +883,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); + ASSERT(ret != -EEXIST); if (ret) { btrfs_put_root(reloc_root); return ret; @@ -4054,6 +4056,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) } err = __add_reloc_root(reloc_root); + ASSERT(err != -EEXIST); if (err) { list_add_tail(&reloc_root->root_list, &reloc_roots); btrfs_put_root(fs_root); @@ -4274,6 +4277,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); + ASSERT(ret != -EEXIST); if (ret) { btrfs_put_root(reloc_root); return ret; -- 2.26.2
[PATCH v8 28/39] btrfs: handle btrfs_search_slot failure in replace_path
This can fail for any number of reasons, why bring the whole box down with it? Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 592b2d156626..6e8d89e4733a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1322,7 +1322,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, path->lowest_level = level; ret = btrfs_search_slot(trans, src, &key, path, 0, 1); path->lowest_level = 0; - BUG_ON(ret); + if (ret) + break; /* * Info qgroup to trace both subtrees. -- 2.26.2
[PATCH v8 25/39] btrfs: do proper error handling in btrfs_update_reloc_root
We call btrfs_update_root in btrfs_update_reloc_root, which can fail for all sorts of reasons, including IO errors. Instead of panicking the box let's return the error, now that all callers properly handle those errors. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b9cc99d4b950..7b242f527bd8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -902,7 +902,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, int ret; if (!have_reloc_root(root)) - goto out; + return 0; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -935,10 +935,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, fs_info->tree_root, &reloc_root->root_key, root_item); - BUG_ON(ret); btrfs_put_root(reloc_root); -out: - return 0; + return ret; } /* -- 2.26.2
[PATCH v8 24/39] btrfs: handle btrfs_update_reloc_root failure in prepare_to_merge
btrfs_update_reloc_root will return errors in the future, so handle an error properly in prepare_to_merge. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 13 - 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e6f9304d6a29..b9cc99d4b950 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1868,10 +1868,21 @@ int prepare_to_merge(struct reloc_control *rc, int err) */ if (!err) btrfs_set_root_refs(&reloc_root->root_item, 1); - btrfs_update_reloc_root(trans, root); + ret = btrfs_update_reloc_root(trans, root); + /* +* Even if we have an error we need this reloc root back on our +* list so we can clean up properly. +*/ list_add(&reloc_root->root_list, &reloc_roots); btrfs_put_root(root); + + if (ret) { + btrfs_abort_transaction(trans, ret); + if (!err) + err = ret; + break; + } } list_splice(&reloc_roots, &rc->reloc_roots); -- 2.26.2
[PATCH v8 29/39] btrfs: handle errors in reference count manipulation in replace_path
If any of the reference count manipulation stuff fails in replace_path we need to abort the transaction, as we've modified the blocks already. We can simply break at this point and everything will be cleaned up. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 20 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 6e8d89e4733a..1a0e07507796 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1363,27 +1363,39 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, ref.skip_qgroup = true; btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, blocksize, 0); ref.skip_qgroup = true; btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, blocksize, path->nodes[level]->start); btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); ref.skip_qgroup = true; ret = btrfs_free_extent(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, blocksize, 0); btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); ref.skip_qgroup = true; ret = btrfs_free_extent(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_unlock_up_safe(path, 0); -- 2.26.2
[PATCH v8 26/39] btrfs: convert logic BUG_ON()'s in replace_path to ASSERT()'s
A few BUG_ON()'s in replace_path are purely to keep us from making logical mistakes, so replace them with ASSERT()'s. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7b242f527bd8..dfd58fa9f189 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1210,8 +1210,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, int ret; int slot; - BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); last_snapshot = btrfs_root_last_snapshot(&src->root_item); again: @@ -1242,7 +1242,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, parent = eb; while (1) { level = btrfs_header_level(parent); - BUG_ON(level < lowest_level); + ASSERT(level >= lowest_level); ret = btrfs_bin_search(parent, &key, &slot); if (ret < 0) -- 2.26.2
[PATCH v8 20/39] btrfs: validate ->reloc_root after recording root in trans
If we fail to setup a ->reloc_root in a different thread that path will error out, however it still leaves root->reloc_root NULL but would still appear set up in the transaction. Subsequent calls to btrfs_record_root_in_transaction would succeed without attempting to create the reloc root, as the transid has already been updated. Handle this case by making sure we have a root->reloc_root set after a btrfs_record_root_in_transaction call so we don't end up deref'ing a NULL pointer. Reported-by: Zygo Blaxell Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 15 +++ 1 file changed, 15 insertions(+) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index df26d6055cc6..4022649e2365 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2081,6 +2081,13 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, return ERR_PTR(ret); root = root->reloc_root; + /* +* We could have raced with another thread which failed, so +* ->reloc_root may not be set, return -ENOENT in this case. +*/ + if (!root) + return ERR_PTR(-ENOENT); + if (next->new_bytenr != root->node->start) { /* * We just created the reloc root, so we shouldn't have @@ -2578,6 +2585,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = btrfs_record_root_in_trans(trans, root); if (ret) goto out; + /* +* Another thread could have failed, need to check if we +* have ->reloc_root actually set. +*/ + if (!root->reloc_root) { + ret = -ENOENT; + goto out; + } root = root->reloc_root; node->new_bytenr = root->node->start; btrfs_put_root(node->root); -- 2.26.2
[PATCH v8 21/39] btrfs: handle btrfs_update_reloc_root failure in commit_fs_roots
btrfs_update_reloc_root will return errors in the future, so handle the error properly in commit_fs_roots. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c24ffbf0fb74..9dbbc354e5ad 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1353,7 +1353,9 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); - btrfs_update_reloc_root(trans, root); + ret2 = btrfs_update_reloc_root(trans, root); + if (ret2) + return ret2; /* see comments in should_cow_block() */ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); -- 2.26.2
[PATCH v8 19/39] btrfs: do proper error handling in create_reloc_root
We do memory allocations here, read blocks from disk, all sorts of operations that could easily fail at any given point. Instead of panicking the box, simply return the error back up the chain, all callers at this point have proper error handling. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 34 -- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9a028376f475..df26d6055cc6 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -733,10 +733,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, struct extent_buffer *eb; struct btrfs_root_item *root_item; struct btrfs_key root_key; - int ret; + int ret = 0; + bool must_abort = false; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); - BUG_ON(!root_item); + if (!root_item) + return ERR_PTR(-ENOMEM); root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -748,7 +750,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, /* called by btrfs_init_reloc_root */ ret = btrfs_copy_root(trans, root, root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); + if (ret) + goto fail; + /* * Set the last_snapshot field to the generation of the commit * root - like this ctree.c:btrfs_block_can_be_shared() behaves @@ -769,9 +773,16 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, */ ret = btrfs_copy_root(trans, root, root->node, &eb, BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); + if (ret) + goto fail; } + /* +* We have changed references at this point, we must abort the +* transaction if anything fails. 
+*/ + must_abort = true; + memcpy(root_item, &root->root_item, sizeof(*root_item)); btrfs_set_root_bytenr(root_item, eb->start); btrfs_set_root_level(root_item, btrfs_header_level(eb)); @@ -789,14 +800,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_insert_root(trans, fs_info->tree_root, &root_key, root_item); - BUG_ON(ret); + if (ret) + goto fail; + kfree(root_item); reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); - BUG_ON(IS_ERR(reloc_root)); + if (IS_ERR(reloc_root)) { + ret = PTR_ERR(reloc_root); + goto abort; + } set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); reloc_root->last_trans = trans->transid; return reloc_root; +fail: + kfree(root_item); +abort: + if (must_abort) + btrfs_abort_transaction(trans, ret); + return ERR_PTR(ret); } /* -- 2.26.2
[PATCH v8 27/39] btrfs: handle btrfs_cow_block errors in replace_path
If we error out cow'ing the root node when doing a replace_path then we simply unlock and free the buffer and return the error. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index dfd58fa9f189..592b2d156626 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1230,7 +1230,11 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, if (cow) { ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, BTRFS_NESTING_COW); - BUG_ON(ret); + if (ret) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + return ret; + } } if (next_key) { @@ -1290,7 +1294,11 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, ret = btrfs_cow_block(trans, dest, eb, parent, slot, &eb, BTRFS_NESTING_COW); - BUG_ON(ret); + if (ret) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + break; + } } btrfs_tree_unlock(parent); -- 2.26.2
[PATCH v8 22/39] btrfs: change insert_dirty_subvol to return errors
This will be able to return errors in the future, so change it to return an error and handle the error appropriately. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4022649e2365..2187015fc412 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1564,9 +1564,9 @@ static int find_next_key(struct btrfs_path *path, int level, /* * Insert current subvolume into reloc_control::dirty_subvol_roots */ -static void insert_dirty_subvol(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_root *root) +static int insert_dirty_subvol(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root) { struct btrfs_root *reloc_root = root->reloc_root; struct btrfs_root_item *reloc_root_item; @@ -1586,6 +1586,7 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans, btrfs_grab_root(root); list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); } + return 0; } static int clean_dirty_subvols(struct reloc_control *rc) @@ -1787,8 +1788,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, out: btrfs_free_path(path); - if (ret == 0) - insert_dirty_subvol(trans, rc, root); + if (ret == 0) { + ret = insert_dirty_subvol(trans, rc, root); + if (ret) + btrfs_abort_transaction(trans, ret); + } if (trans) btrfs_end_transaction_throttle(trans); -- 2.26.2
[PATCH v8 23/39] btrfs: handle btrfs_update_reloc_root failure in insert_dirty_subvol
btrfs_update_reloc_root will return errors in the future, so handle the error properly in insert_dirty_subvol. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2187015fc412..e6f9304d6a29 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1570,6 +1570,7 @@ static int insert_dirty_subvol(struct btrfs_trans_handle *trans, { struct btrfs_root *reloc_root = root->reloc_root; struct btrfs_root_item *reloc_root_item; + int ret; /* @root must be a subvolume tree root with a valid reloc tree */ ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); @@ -1580,7 +1581,9 @@ static int insert_dirty_subvol(struct btrfs_trans_handle *trans, sizeof(reloc_root_item->drop_progress)); btrfs_set_root_drop_level(reloc_root_item, 0); btrfs_set_root_refs(reloc_root_item, 0); - btrfs_update_reloc_root(trans, root); + ret = btrfs_update_reloc_root(trans, root); + if (ret) + return ret; if (list_empty(&root->reloc_dirty_list)) { btrfs_grab_root(root); list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); } -- 2.26.2
[PATCH v8 18/39] btrfs: have proper error handling in btrfs_init_reloc_root
create_reloc_root will return errors in the future, and __add_reloc_root can return -ENOMEM or -EEXIST, so handle these errors properly. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7ca7ab3d40e2..9a028376f475 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -856,9 +856,14 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, reloc_root = create_reloc_root(trans, root, root->root_key.objectid); if (clear_rsv) trans->block_rsv = rsv; + if (IS_ERR(reloc_root)) + return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); - BUG_ON(ret < 0); + if (ret) { + btrfs_put_root(reloc_root); + return ret; + } root->reloc_root = btrfs_grab_root(reloc_root); return 0; } -- 2.26.2
[PATCH v8 14/39] btrfs: handle record_root_in_trans failure in qgroup_account_snapshot
record_root_in_trans can fail currently, so handle this failure properly. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3a3a582063b4..26c91c4eba89 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1444,7 +1444,9 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * recorded root will never be updated again, causing an outdated root * item. */ - record_root_in_trans(trans, src, 1); + ret = record_root_in_trans(trans, src, 1); + if (ret) + return ret; /* * btrfs_qgroup_inherit relies on a consistent view of the usage for the @@ -1513,7 +1515,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * insert_dir_item() */ if (!ret) - record_root_in_trans(trans, parent, 1); + ret = record_root_in_trans(trans, parent, 1); return ret; } -- 2.26.2
[PATCH v8 15/39] btrfs: handle record_root_in_trans failure in btrfs_record_root_in_trans
record_root_in_trans can fail currently, handle this failure properly. Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 26c91c4eba89..7ee0199fbb95 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -487,6 +487,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + int ret; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return 0; @@ -501,10 +502,10 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, return 0; mutex_lock(&fs_info->reloc_mutex); - record_root_in_trans(trans, root, 0); + ret = record_root_in_trans(trans, root, 0); mutex_unlock(&fs_info->reloc_mutex); - return 0; + return ret; } static inline int is_transaction_blocked(struct btrfs_transaction *trans) -- 2.26.2
[PATCH v8 16/39] btrfs: handle record_root_in_trans failure in create_pending_snapshot
record_root_in_trans can currently fail, so handle this failure properly. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7ee0199fbb95..93464b93ee9d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1593,8 +1593,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry = pending->dentry; parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; - record_root_in_trans(trans, parent_root, 0); - + ret = record_root_in_trans(trans, parent_root, 0); + if (ret) + goto fail; cur_time = current_time(parent_inode); /* @@ -1630,7 +1631,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - record_root_in_trans(trans, root, 0); + ret = record_root_in_trans(trans, root, 0); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); -- 2.26.2
[PATCH v8 17/39] btrfs: return an error from btrfs_record_root_in_trans
We can create a reloc root when we record the root in the trans, which can fail for all sorts of different reasons. Propagate this error up the chain of callers. Future patches will fix the callers of btrfs_record_root_in_trans() to handle the error. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 93464b93ee9d..c24ffbf0fb74 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -408,6 +408,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, int force) { struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && root->last_trans < trans->transid) || force) { @@ -456,11 +457,11 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, * lock. smp_wmb() makes sure that all the writes above are * done before we pop in the zero below */ - btrfs_init_reloc_root(trans, root); + ret = btrfs_init_reloc_root(trans, root); smp_mb__before_atomic(); clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); } - return 0; + return ret; } -- 2.26.2
[PATCH v8 13/39] btrfs: handle btrfs_record_root_in_trans failure in start_transaction
btrfs_record_root_in_trans will return errors in the future, so handle the error properly in start_transaction. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index acff6bb49a97..3a3a582063b4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -741,7 +741,11 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * Thus it need to be called after current->journal_info initialized, * or we can deadlock. */ - btrfs_record_root_in_trans(h, root); + ret = btrfs_record_root_in_trans(h, root); + if (ret) { + btrfs_end_transaction(h); + return ERR_PTR(ret); + } return h; -- 2.26.2
[PATCH v8 12/39] btrfs: handle btrfs_record_root_in_trans failure in relocate_tree_block
btrfs_record_root_in_trans will return errors in the future, so handle the error properly in relocate_tree_block. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 28d49b14a23a..7ca7ab3d40e2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2548,7 +2548,9 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, ret = -EUCLEAN; goto out; } - btrfs_record_root_in_trans(trans, root); + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + goto out; root = root->reloc_root; node->new_bytenr = root->node->start; btrfs_put_root(node->root); -- 2.26.2
[PATCH v8 11/39] btrfs: handle btrfs_record_root_in_trans failure in create_subvol
btrfs_record_root_in_trans will return errors in the future, so handle the error properly in create_subvol. Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e8d53fea4c61..29dbb1644935 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -721,7 +721,12 @@ static noinline int create_subvol(struct inode *dir, /* Freeing will be done in btrfs_put_root() of new_root */ anon_dev = 0; - btrfs_record_root_in_trans(trans, new_root); + ret = btrfs_record_root_in_trans(trans, new_root); + if (ret) { + btrfs_put_root(new_root); + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_create_subvol_root(trans, new_root, root); btrfs_put_root(new_root); -- 2.26.2
[PATCH v8 09/39] btrfs: handle btrfs_record_root_in_trans failure in btrfs_delete_subvolume
btrfs_record_root_in_trans will return errors in the future, so handle the error properly in btrfs_delete_subvolume. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c64a5e3eca47..48c953d0dde7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4317,7 +4317,11 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) goto out_end_trans; } - btrfs_record_root_in_trans(trans, dest); + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } memset(&dest->root_item.drop_progress, 0, sizeof(dest->root_item.drop_progress)); -- 2.26.2
[PATCH v8 06/39] btrfs: do proper error handling in record_reloc_root_in_trans
Generally speaking this shouldn't ever fail, the corresponding fs root for the reloc root will already be in memory, so we won't get -ENOMEM here. However if there is no corresponding root for the reloc root then we could get -ENOMEM when we try to allocate it or we could get -ENOENT when we look it up and see that it doesn't exist. Convert these BUG_ON()'s into ASSERT()'s + proper error handling for the case of corruption. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 23 +-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b70a12a7..28d49b14a23a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1971,8 +1971,27 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, return 0; root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); + + /* +* This should succeed, since we can't have a reloc root without having +* already looked up the actual root and created the reloc root for this +* root. +* +* However if there's some sort of corruption where we have a ref to a +* reloc root without a corresponding root this could return -ENOENT. +*/ + if (IS_ERR(root)) { + ASSERT(0); + return PTR_ERR(root); + } + if (root->reloc_root != reloc_root) { + ASSERT(0); + btrfs_err(fs_info, + "root %llu has two reloc roots associated with it", + reloc_root->root_key.offset); + btrfs_put_root(root); + return -EUCLEAN; + } ret = btrfs_record_root_in_trans(trans, root); btrfs_put_root(root); -- 2.26.2
[PATCH v8 10/39] btrfs: handle btrfs_record_root_in_trans failure in btrfs_recover_log_trees
btrfs_record_root_in_trans will return errors in the future, so handle the error properly in btrfs_recover_log_trees. This appears tricky, however we have a reference count on the destination root, so if this fails we need to continue on in the loop to make sure the proper cleanup is done. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2f1acc9aea9e..941b4dd96dd0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6278,8 +6278,12 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) } wc.replay_dest->log_root = log; - btrfs_record_root_in_trans(trans, wc.replay_dest); - ret = walk_log_tree(trans, log, &wc); + ret = btrfs_record_root_in_trans(trans, wc.replay_dest); + if (ret) + btrfs_handle_fs_error(fs_info, ret, + "Couldn't record the root in the transaction."); + else + ret = walk_log_tree(trans, log, &wc); if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { ret = fixup_inode_link_counts(trans, wc.replay_dest, -- 2.26.2
[PATCH v8 08/39] btrfs: handle btrfs_record_root_in_trans failure in btrfs_rename
btrfs_record_root_in_trans will return errors in the future, so handle the error properly in btrfs_rename. Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a9fde9a22fff..c64a5e3eca47 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9410,8 +9410,11 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_notrans; } - if (dest != root) - btrfs_record_root_in_trans(trans, dest); + if (dest != root) { + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) + goto out_fail; + } ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); if (ret) -- 2.26.2
[PATCH v8 05/39] btrfs: check record_root_in_trans related failures in select_reloc_root
We will record the fs root or the reloc root in the trans in select_reloc_root. These will actually return errors in the following patches, so check their return value here and return it up the stack. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0f2ecf61696e..b70a12a7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1988,6 +1988,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_backref_node *next; struct btrfs_root *root; int index = 0; + int ret; next = node; while (1) { @@ -2023,11 +2024,15 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, } if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - record_reloc_root_in_trans(trans, root); + ret = record_reloc_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); break; } - btrfs_record_root_in_trans(trans, root); + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); root = root->reloc_root; if (next->new_bytenr != root->node->start) { -- 2.26.2
[PATCH v8 07/39] btrfs: handle btrfs_record_root_in_trans failure in btrfs_rename_exchange
btrfs_record_root_in_trans will return errors in the future, so handle the error properly in btrfs_rename_exchange. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 52dc5f52ea58..a9fde9a22fff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9102,8 +9102,11 @@ static int btrfs_rename_exchange(struct inode *old_dir, goto out_notrans; } - if (dest != root) - btrfs_record_root_in_trans(trans, dest); + if (dest != root) { + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) + goto out_fail; + } /* * We need to find a free sequence number both in the source and -- 2.26.2
[PATCH v8 02/39] btrfs: convert BUG_ON()'s in relocate_tree_block
We have a couple of BUG_ON()'s in relocate_tree_block() that can be tripped if we have file system corruption. Convert these to ASSERT()'s so developers still get yelled at when they break the backref code, but error out nicely for users so the whole box doesn't go down. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 24 ++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 63e416ac79ae..1f371a878831 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2454,8 +2454,28 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, if (root) { if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { - BUG_ON(node->new_bytenr); - BUG_ON(!list_empty(&node->list)); + /* +* This block was the root block of a root, and this is +* the first time we're processing the block and thus it +* should not have had the ->new_bytenr modified and +* should have not been included on the changed list. +* +* However in the case of corruption we could have +* multiple refs pointing to the same block improperly, +* and thus we would trip over these checks. ASSERT() +* for the developer case, because it could indicate a +* bug in the backref code, however error out for a +* normal user in the case of corruption. +*/ + ASSERT(node->new_bytenr == 0); + ASSERT(list_empty(&node->list)); + if (node->new_bytenr || !list_empty(&node->list)) { + btrfs_err(root->fs_info, + "bytenr %llu has improper references to it", + node->bytenr); + ret = -EUCLEAN; + goto out; + } btrfs_record_root_in_trans(trans, root); root = root->reloc_root; node->new_bytenr = root->node->start; -- 2.26.2
[PATCH v8 03/39] btrfs: handle errors from select_reloc_root()
Currently select_reloc_root() doesn't return an error, but followup patches will make it possible for it to return an error. We do have proper error recovery in do_relocation however, so handle the possibility of select_reloc_root() having an error properly instead of BUG_ON(!root). I've also adjusted select_reloc_root() to return ERR_PTR(-ENOENT) if we don't find a root, instead of NULL, to make the error case easier to deal with. I've replaced the BUG_ON(!root) with an ASSERT(0) for this case as it indicates we messed up the backref walking code, but it could also indicate corruption. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 15 --- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 1f371a878831..097fea09e2d2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2024,8 +2024,14 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (!next || next->level <= node->level) break; } - if (!root) - return NULL; + if (!root) { + /* +* This can happen if there's fs corruption or if there's a bug +* in the backref lookup code. +*/ + ASSERT(0); + return ERR_PTR(-ENOENT); + } next = node; /* setup backref node path for btrfs_reloc_cow_block */ @@ -2196,7 +2202,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, upper = edge->node[UPPER]; root = select_reloc_root(trans, rc, upper, edges); - BUG_ON(!root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto next; + } if (upper->eb && !upper->locked) { if (!lowest) { -- 2.26.2
[PATCH v8 04/39] btrfs: convert BUG_ON()'s in select_reloc_root() to proper errors
We have several BUG_ON()'s in select_reloc_root() that can be tripped if you have extent tree corruption. Convert these to ASSERT()'s, because if we hit it during testing it really is bad, or could indicate a problem with the backref walking code. However if users hit these problems it generally indicates corruption, I've hit a few machines in the fleet that trip over these with clearly corrupted extent trees, so be nice and spit out an error message and return an error instead of bringing the whole box down. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 47 +++ 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 097fea09e2d2..0f2ecf61696e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1994,8 +1994,33 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; - BUG_ON(!root); - BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)); + + /* +* If there is no root, then our references for this block are +* incomplete, as we should be able to walk all the way up to a +* block that is owned by a root. +* +* This path is only for SHAREABLE roots, so if we come upon a +* non-SHAREABLE root then we have backrefs that resolve +* improperly. +* +* Both of these cases indicate file system corruption, or a bug +* in the backref walking code. 
+*/ + if (!root) { + ASSERT(0); + btrfs_err(trans->fs_info, + "bytenr %llu doesn't have a backref path ending in a root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { + ASSERT(0); + btrfs_err(trans->fs_info, +"bytenr %llu has multiple refs with one ending in a non shareable root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { record_reloc_root_in_trans(trans, root); @@ -2006,8 +2031,22 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, root = root->reloc_root; if (next->new_bytenr != root->node->start) { - BUG_ON(next->new_bytenr); - BUG_ON(!list_empty(&next->list)); + /* +* We just created the reloc root, so we shouldn't have +* ->new_bytenr set and this shouldn't be in the changed +* list. If it is then we have multiple roots pointing +* at the same bytenr which indicates corruption, or +* we've made a mistake in the backref walking code. +*/ + ASSERT(next->new_bytenr == 0); + ASSERT(list_empty(&next->list)); + if (next->new_bytenr || !list_empty(&next->list)) { + btrfs_err(trans->fs_info, +"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", + node->bytenr, next->bytenr); + return ERR_PTR(-EUCLEAN); + } + next->new_bytenr = root->node->start; btrfs_put_root(next->root); next->root = btrfs_grab_root(root); -- 2.26.2
[PATCH v8 01/39] btrfs: convert some BUG_ON()'s to ASSERT()'s in do_relocation
A few of these are checking for correctness, and won't be triggered by corrupted file systems, so convert them to ASSERT() instead of BUG_ON() and add a comment explaining their existence. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 232d5da7b7be..63e416ac79ae 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2181,7 +2181,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, int slot; int ret = 0; - BUG_ON(lowest && node->eb); + /* +* If we are lowest then this is the first time we're processing this +* block, and thus shouldn't have an eb associated with it yet. +*/ + ASSERT(!lowest || !node->eb); path->lowest_level = node->level + 1; rc->backref_cache.path[node->level] = node; @@ -2266,7 +2270,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, free_extent_buffer(eb); if (ret < 0) goto next; - BUG_ON(node->eb != eb); + /* +* We've just cow'ed this block, it should have updated +* the correct backref node entry. +*/ + ASSERT(node->eb == eb); } else { btrfs_set_node_blockptr(upper->eb, slot, node->eb->start); @@ -2302,7 +2310,12 @@ static int do_relocation(struct btrfs_trans_handle *trans, } path->lowest_level = 0; - BUG_ON(ret == -ENOSPC); + + /* +* We should have allocated all of our space in the block rsv and thus +* shouldn't ENOSPC. +*/ + ASSERT(ret != -ENOSPC); return ret; } -- 2.26.2
[PATCH v8 00/39] Cleanup error handling in relocation
v7->v8: - Reordered some of the patches, so that the callers of functions that added new error cases were fixed first, and then added the new error handler. - Moved a few of the ASSERT(0) to where they made sense. Left the others that are in functions that can have multiple cases of the same error that indicates a logic error. v6->v7: - Broke up the series into 3 series, 1 for cosmetic things, 1 for all the major issues (including those reported on v6 of this set), and this new set which is solely the error handling related patches for relocation. It's still a lot of patches, sorry about that. v5->v6: - Reworked "btrfs: handle errors from select_reloc_root()" because Zygo reported hitting an ASSERT(ret != -ENOENT) during his testing. This was because I changed select_reloc_root() to return -ENOENT if we happened to race with somebody else who failed to init the reloc root, however we had an ASSERT() to check for this because it indicated corruption. I modified that patch to move the ASSERT() to where the problem actually is, so select_reloc_root() can return whatever error and it'll pass it along. I also removed Qu's reviewed-by for the patch because of the change. v4->v5: - Dropped "btrfs: fix error handling in commit_fs_roots" as it was merged. - Fixed an ASSERT() that happened during relocation recovery that Zygo reported, I moved the error condition out of another condition which broke recovery if we had deleted subvols pending with relocation. v3->v4: - Squashed the __add_reloc_root error handling patches in btrfs_recover_relocation as they were small and in the same function. - Squashed the record_root_in_trans failure handling patches for select_reloc_root as they were small and in the same function. - Added a new patch to address an existing error handling problem with subvol creation. 
- Fixed up the various cases that Qu noticed where I got things wrong, cleaning up a leaked root extent ref, a leaked inode item, and where I accidentally stopped dealing with errors from btrfs_drop_subtree. - Reworked a bunch of the ASSERT()'s to do ASSERT(0) in their respective if statements. - Added reviewed-bys. v2->v3: - A lot of extra patches fixing various things that I encountered while debugging the corruption problem that was uncovered by these patches. - Fixed the panic that Zygo was seeing and other issues. - Fixed up the comments from Nikolay and Filipe. A slight note, the first set of patches could probably be taken now, and in fact btrfs: fix error handling in commit_fs_roots Was sent earlier this week and is very important and needs to be reviewed and merged ASAP. The following are safe and could be merged outside of the rest of this series btrfs: allow error injection for btrfs_search_slot and btrfs_cow_block btrfs: fix lockdep splat in btrfs_recover_relocation btrfs: keep track of the root owner for relocation reads btrfs: noinline btrfs_should_cancel_balance btrfs: do not cleanup upper nodes in btrfs_backref_cleanup_node btrfs: pass down the tree block level through ref-verify btrfs: make sure owner is set in ref-verify btrfs: don't clear ret in btrfs_start_dirty_block_groups The rest obviously are all around the actual error handling. v1->v2: - fixed a bug where I accidentally dropped reading flags in relocate_block_group when I dropped the extra checks that we handle in the tree checker. --- Original message --- Hello, Relocation is the last place that is not able to handle errors at all, which results in all sorts of lovely panics if you encounter corruptions or IO errors. I'm going to start cleaning up relocation, but before I move code around I want the error handling to be somewhat sane, so I'm not changing behavior and error handling at the same time. 
These patches are purely about error handling, there is no behavior changing other than returning errors up the chain properly. There is a lot of room for follow up cleanups, which will happen next. However I wanted to get this series done today and out so we could get it merged ASAP, and then the follow up cleanups can happen later as they are less important and less critical. The only exception to the above is the patch to add the error injection sites for btrfs_cow_block and btrfs_search_slot, and a lockdep fix that I discovered while running my tests, those are the first two patches in the series. I tested this with my error injection stress test, where I keep track of all stack traces that have been tested and only inject errors when we have a new stack trace, which means I should have covered all of the various error conditions. With this patchset I'm no longer panicing while stressing the error conditions. Thanks, Josef Josef Bacik (39): btrfs: convert some BUG_ON()'s to ASSERT()'s in do_relocation btrfs: convert BUG_ON()'s in relocate_tree_block
Re: [PATCH v7 03/38] btrfs: handle errors from select_reloc_root()
On 2/26/21 1:30 PM, David Sterba wrote: On Wed, Dec 16, 2020 at 11:26:19AM -0500, Josef Bacik wrote: Currently select_reloc_root() doesn't return an error, but followup patches will make it possible for it to return an error. We do have proper error recovery in do_relocation however, so handle the possibility of select_reloc_root() having an error properly instead of BUG_ON(!root). I've also adjusted select_reloc_root() to return ERR_PTR(-ENOENT) if we don't find a root, instead of NULL, to make the error case easier to deal with. I've replaced the BUG_ON(!root) with an ASSERT(0) for this case as it indicates we messed up the backref walking code, but it could also indicate corruption. Signed-off-by: Josef Bacik --- fs/btrfs/relocation.c | 15 --- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 08ffaa93b78f..741068580455 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2024,8 +2024,14 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (!next || next->level <= node->level) break; } - if (!root) - return NULL; + if (!root) { + /* +* This can happen if there's fs corruption or if there's a bug +* in the backref lookup code. +*/ + ASSERT(0); You've added these assert(0) to several patches and I think it's wrong. The asserts are supposed to verify invariants, you can hardly say that we're expecting 0 to be true, so the construct serves as "please crash here", which is no better than BUG(). It's been spreading, there are like 25 now. They are much better than a BUG_ON(), as they won't trip in production, they'll only trip for developers. And we need them in many of these cases, and this example you've called out is a clear example of where we absolutely want to differentiate, because we have 3 different failure modes that will return -EUCLEAN. 
If I add a ASSERT(ret != -EUCLEAN) to all of the callers then when the developer hits a logic bug they'll have to go and manually add in their own assert to figure out which error condition failed. Instead I've added explanations in the comments for each assert and then have an assert for every error condition so that it's clear where things went wrong. So these ASSERT()'s are staying. I could be convinced to change them to match their error condition, so in the above case instead I would be fine changing it to if (!root) { ASSERT(root); ... If that's more acceptable let me know, I'll change them to match their error condition. Thanks, Josef
[PATCH 1/3] btrfs: init devices always
Neal reported a panic trying to use -o rescue=all BUG: kernel NULL pointer dereference, address: 0030 PGD 0 P4D 0 Oops: [#1] SMP NOPTI CPU: 0 PID: 696 Comm: mount Tainted: GW 5.12.0-rc2+ #296 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 RIP: 0010:btrfs_device_init_dev_stats+0x1d/0x200 RSP: 0018:afaec1483bb8 EFLAGS: 00010286 RAX: RBX: 9a5715bcb298 RCX: 0070 RDX: 9a5703248000 RSI: 9a57052ea150 RDI: 9a5715bca400 RBP: 9a57052ea150 R08: 0070 R09: 9a57052ea150 R10: 000130faf0741c10 R11: R12: 9a570370 R13: R14: 9a5715bcb278 R15: 9a57052ea150 FS: 7f600d122c40() GS:9a577bc0() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 0030 CR3: 000112a46005 CR4: 00370ef0 Call Trace: ? btrfs_init_dev_stats+0x1f/0xf0 ? kmem_cache_alloc+0xef/0x1f0 btrfs_init_dev_stats+0x5f/0xf0 open_ctree+0x10cb/0x1720 btrfs_mount_root.cold+0x12/0xea legacy_get_tree+0x27/0x40 vfs_get_tree+0x25/0xb0 vfs_kern_mount.part.0+0x71/0xb0 btrfs_mount+0x10d/0x380 legacy_get_tree+0x27/0x40 vfs_get_tree+0x25/0xb0 path_mount+0x433/0xa00 __x64_sys_mount+0xe3/0x120 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xae This happens because when we call btrfs_init_dev_stats we do device->fs_info->dev_root. However device->fs_info isn't init'ed because we were only calling btrfs_init_devices_late() if we properly read the device root. However we don't actually need the device root to init the devices, this function simply assigns the devices their ->fs_info pointer properly, so this needs to be done unconditionally always so that we can properly deref device->fs_info in rescue cases. 
Reported-by: Neal Gompa Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 41b718cfea40..63656bf23ff2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2387,8 +2387,8 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->dev_root = root; - btrfs_init_devices_late(fs_info); } + btrfs_init_devices_late(fs_info); /* If IGNOREDATACSUMS is set don't bother reading the csum root. */ if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { -- 2.26.2
[PATCH 2/3] btrfs: do not init dev stats if we have no dev_root
Neal reported a panic trying to use -o rescue=all BUG: kernel NULL pointer dereference, address: 0030 PGD 0 P4D 0 Oops: [#1] SMP PTI CPU: 0 PID: 4095 Comm: mount Not tainted 5.11.0-0.rc7.149.fc34.x86_64 #1 RIP: 0010:btrfs_device_init_dev_stats+0x4c/0x1f0 RSP: 0018:a60285fbfb68 EFLAGS: 00010246 RAX: RBX: 88b88f806498 RCX: 88b82e7a2a10 RDX: a60285fbfb97 RSI: 88b82e7a2a10 RDI: RBP: 88b88f806b3c R08: R09: R10: 88b82e7a2a10 R11: R12: 88b88f806a00 R13: 88b88f806478 R14: 88b88f806a00 R15: 88b82e7a2a10 FS: 7f698be1ec40() GS:88b937e0() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 0030 CR3: 92c9c006 CR4: 003706f0 Call Trace: ? btrfs_init_dev_stats+0x1f/0xf0 btrfs_init_dev_stats+0x62/0xf0 open_ctree+0x1019/0x15ff btrfs_mount_root.cold+0x13/0xfa legacy_get_tree+0x27/0x40 vfs_get_tree+0x25/0xb0 vfs_kern_mount.part.0+0x71/0xb0 btrfs_mount+0x131/0x3d0 ? legacy_get_tree+0x27/0x40 ? btrfs_show_options+0x640/0x640 legacy_get_tree+0x27/0x40 vfs_get_tree+0x25/0xb0 path_mount+0x441/0xa80 __x64_sys_mount+0xf4/0x130 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f698c04e52e This happens because we unconditionally attempt to init device stats on mount, but we may not have been able to read the device root. Fix this by skipping init'ing the device stats if we do not have a device root. Reported-by: Neal Gompa Signed-off-by: Josef Bacik --- fs/btrfs/volumes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 995920fcce9b..d4ca721c1d91 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7448,6 +7448,9 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device, int item_size; int i, ret, slot; + if (!device->fs_info->dev_root) + return 0; + key.objectid = BTRFS_DEV_STATS_OBJECTID; key.type = BTRFS_PERSISTENT_ITEM_KEY; key.offset = device->devid; -- 2.26.2
[PATCH 3/3] btrfs: don't init dev replace for bad dev root
While helping Neal fix his broken file system I added a debug patch to catch if we were calling btrfs_search_slot with a NULL root, and this stack trace popped we tried to search with a NULL root CPU: 0 PID: 1760 Comm: mount Not tainted 5.11.0-155.nealbtrfstest.1.fc34.x86_64 #1 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/22/2020 Call Trace: dump_stack+0x6b/0x83 btrfs_search_slot.cold+0x11/0x1b ? btrfs_init_dev_replace+0x36/0x450 btrfs_init_dev_replace+0x71/0x450 open_ctree+0x1054/0x1610 btrfs_mount_root.cold+0x13/0xfa legacy_get_tree+0x27/0x40 vfs_get_tree+0x25/0xb0 vfs_kern_mount.part.0+0x71/0xb0 btrfs_mount+0x131/0x3d0 ? legacy_get_tree+0x27/0x40 ? btrfs_show_options+0x640/0x640 legacy_get_tree+0x27/0x40 vfs_get_tree+0x25/0xb0 path_mount+0x441/0xa80 __x64_sys_mount+0xf4/0x130 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f644730352e Fix this by not starting the device replace stuff if we have a NULL dev root. Reported-by: Neal Gompa Signed-off-by: Josef Bacik --- fs/btrfs/dev-replace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 3a9c1e046ebe..d05f73530af7 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -81,6 +81,9 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) struct btrfs_dev_replace_item *ptr; u64 src_devid; + if (!dev_root) + return 0; + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; -- 2.26.2
[PATCH 0/3] Handle bad dev_root properly with rescue=all
Hello, My recent debugging session with Neal's broken filesystem uncovered a glaring hole in my rescue=all patches: they don't deal with a NULL dev_root properly. In testing I only ever tested corrupting the extent tree or the csum tree, since those are the most problematic. The following 3 fixes allowed Neal to get rescue=all working without panicking the machine, and I verified everything by using btrfs-corrupt-block to corrupt a dev root of a file system. Thanks, Josef Josef Bacik (3): btrfs: init devices always btrfs: do not init dev stats if we have no dev_root btrfs: don't init dev replace for bad dev root fs/btrfs/dev-replace.c | 3 +++ fs/btrfs/disk-io.c | 2 +- fs/btrfs/volumes.c | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) -- 2.26.2
Re: Recovering Btrfs from a freak failure of the disk controller
On 3/9/21 4:06 PM, Neal Gompa wrote: On Tue, Mar 9, 2021 at 2:04 PM Josef Bacik wrote: On 3/8/21 8:12 PM, Neal Gompa wrote: On Mon, Mar 8, 2021 at 5:04 PM Josef Bacik wrote: On 3/8/21 3:01 PM, Neal Gompa wrote: On Mon, Mar 8, 2021 at 1:38 PM Josef Bacik wrote: On 3/5/21 8:03 PM, Neal Gompa wrote: On Fri, Mar 5, 2021 at 5:01 PM Josef Bacik wrote: On 3/5/21 9:41 AM, Neal Gompa wrote: On Fri, Mar 5, 2021 at 9:12 AM Josef Bacik wrote: On 3/4/21 6:54 PM, Neal Gompa wrote: On Thu, Mar 4, 2021 at 3:25 PM Josef Bacik wrote: On 3/3/21 2:38 PM, Neal Gompa wrote: On Wed, Mar 3, 2021 at 1:42 PM Josef Bacik wrote: On 2/24/21 10:47 PM, Neal Gompa wrote: On Wed, Feb 24, 2021 at 10:44 AM Josef Bacik wrote: On 2/24/21 9:23 AM, Neal Gompa wrote: On Tue, Feb 23, 2021 at 10:05 AM Josef Bacik wrote: On 2/22/21 11:03 PM, Neal Gompa wrote: On Mon, Feb 22, 2021 at 2:34 PM Josef Bacik wrote: On 2/21/21 1:27 PM, Neal Gompa wrote: On Wed, Feb 17, 2021 at 11:44 AM Josef Bacik wrote: On 2/17/21 11:29 AM, Neal Gompa wrote: On Wed, Feb 17, 2021 at 9:59 AM Josef Bacik wrote: On 2/17/21 9:50 AM, Neal Gompa wrote: On Wed, Feb 17, 2021 at 9:36 AM Josef Bacik wrote: On 2/16/21 9:05 PM, Neal Gompa wrote: On Tue, Feb 16, 2021 at 4:24 PM Josef Bacik wrote: On 2/16/21 3:29 PM, Neal Gompa wrote: On Tue, Feb 16, 2021 at 1:11 PM Josef Bacik wrote: On 2/16/21 11:27 AM, Neal Gompa wrote: On Tue, Feb 16, 2021 at 10:19 AM Josef Bacik wrote: On 2/14/21 3:25 PM, Neal Gompa wrote: Hey all, So one of my main computers recently had a disk controller failure that caused my machine to freeze. After rebooting, Btrfs refuses to mount. 
I tried to do a mount and the following errors show up in the journal: Feb 14 15:20:49 localhost-live kernel: BTRFS info (device sda3): disk space caching is enabled Feb 14 15:20:49 localhost-live kernel: BTRFS info (device sda3): has skinny extents Feb 14 15:20:49 localhost-live kernel: BTRFS critical (device sda3): corrupt leaf: root=401 block=796082176 slot=15 ino=203657, invalid inode transid: has 96 expect [0, 95] Feb 14 15:20:49 localhost-live kernel: BTRFS error (device sda3): block=796082176 read time tree block corruption detected Feb 14 15:20:49 localhost-live kernel: BTRFS critical (device sda3): corrupt leaf: root=401 block=796082176 slot=15 ino=203657, invalid inode transid: has 96 expect [0, 95] Feb 14 15:20:49 localhost-live kernel: BTRFS error (device sda3): block=796082176 read time tree block corruption detected Feb 14 15:20:49 localhost-live kernel: BTRFS warning (device sda3): couldn't read tree root Feb 14 15:20:49 localhost-live kernel: BTRFS error (device sda3): open_ctree failed I've tried to do -o recovery,ro mount and get the same issue. I can't seem to find any reasonably good information on how to do recovery in this scenario, even to just recover enough to copy data off. I'm on Fedora 33, the system was on Linux kernel version 5.9.16 and the Fedora 33 live ISO I'm using has Linux kernel version 5.10.14. I'm using btrfs-progs v5.10. Can anyone help? Can you try btrfs check --clear-space-cache v1 /dev/whatever That should fix the inode generation thing so it's sane, and then the tree checker will allow the fs to be read, hopefully. If not we can work out some other magic. Thanks, Josef I got the same error as I did with btrfs-check --readonly... Oh lovely, what does btrfs check --readonly --backup do? No dice... # btrfs check --readonly --backup /dev/sda3 Opening filesystem to check... 
parent transid verify failed on 791281664 wanted 93 found 95 parent transid verify failed on 791281664 wanted 93 found 95 parent transid verify failed on 791281664 wanted 93 found 95 Hey look the block we're looking for, I wrote you some magic, just pull https://github.com/josefbacik/btrfs-progs/tree/for-neal build, and then run btrfs-neal-magic /dev/sda3 791281664 95 This will force us to point at the old root with (hopefully) the right bytenr and gen, and then hopefully you'll be able to recover from there. This is kind of saucy, so yolo, but I can undo it if it makes things worse. Thanks, # btrfs check --readonly /dev/sda3 Opening filesystem to check... ERROR: could not setup extent tree ERROR: cannot open file system # btrfs check --clear-space-cache v1 /dev/sda3 Opening filesystem to check... ERROR: could not setup extent tree ERROR: cannot open file system It's better, but still no dice... :( Hmm it's not telling us what's wrong with the extent tree, which is annoying. Does mount -o rescue=all,ro work now that the root tree is normal? Thanks, Nope, I see this in the journal: Feb 17 09:49:40 localhost-live kernel: BTRFS info (device sda3): enabling all of the rescue options Feb 17 09:49:40 localhost-live kernel: BTRFS info (device sda3): ignoring data csums Feb 17 09:49:40 localhost-live