On 2025/2/18 8:39, Kent Overstreet wrote:
Hongbo - please go over this thoroughly. It turns out (and I should've
caught this before I merged it) your check code in fsck.c was completely
fubar, and wouldn't ever log an error when i_size was off, so bugs
elsewhere would have been masked.
-- >8 --
Error handling was wrong, causing unhandled transaction restart errors.
check_directory_size() was also inefficient, since keys in multiple
snapshots would be iterated over once for every snapshot. Convert it to
the same scheme used for i_sectors and subdir count checking.
Cc: Hongbo Li <[email protected]>
Signed-off-by: Kent Overstreet <[email protected]>
---
fs/bcachefs/fsck.c | 78 ++++++++++++++++++----------------------------
1 file changed, 31 insertions(+), 47 deletions(-)
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 53a421ff136d..d406336e9a0a 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -823,6 +823,7 @@ struct inode_walker_entry {
struct bch_inode_unpacked inode;
u32 snapshot;
u64 count;
+ u64 i_size;
};
struct inode_walker {
@@ -910,8 +911,9 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct
inode_walker *w, struct bkey_
if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
struct inode_walker_entry new = *i;
- new.snapshot = k.k->p.snapshot;
- new.count = 0;
+ new.snapshot = k.k->p.snapshot;
+ new.count = 0;
+ new.i_size = 0;
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
@@ -1116,37 +1118,6 @@ static int get_snapshot_root_inode(struct btree_trans
*trans,
return ret;
}
-static int check_directory_size(struct btree_trans *trans,
- struct bch_inode_unpacked *inode_u,
- struct bkey_s_c inode_k, bool *write_inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 new_size = 0;
- int ret;
-
- for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot),
- POS(inode_k.k->p.offset, U64_MAX),
- 0, k, ret) {
- if (k.k->type != KEY_TYPE_dirent)
- continue;
-
- struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k);
- struct qstr name = bch2_dirent_get_name(dirent);
-
- new_size += dirent_occupied_size(&name);
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (!ret && inode_u->bi_size != new_size) {
- inode_u->bi_size = new_size;
- *write_inode = true;
- }
-
- return ret;
-}
-
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -1335,16 +1306,6 @@ static int check_inode(struct btree_trans *trans,
u.bi_journal_seq = journal_cur_seq(&c->journal);
do_update = true;
}
-
- if (S_ISDIR(u.bi_mode)) {
- ret = check_directory_size(trans, &u, k, &do_update);
-
- fsck_err_on(ret,
- trans, directory_size_mismatch,
- "directory inode %llu:%u with the mismatch directory
size",
- u.bi_inum, k.k->p.snapshot);
It only tries to fix the mismatched i_size, and only logs the error
during recalculating. And why the i_size will be off?
- ret = 0;
- }
do_update:
if (do_update) {
ret = __bch2_fsck_write_inode(trans, &u);
@@ -2017,10 +1978,31 @@ static int check_subdir_count_notnested(struct
btree_trans *trans, struct inode_
return ret;
}
-static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+static int check_dir_i_size_notnested(struct btree_trans *trans, struct
inode_walker *w)
+{
+ struct bch_fs *c = trans->c;
+ int ret = 0;
+
+ darray_for_each(w->inodes, i)
+ if (fsck_err_on(i->inode.bi_size != i->i_size,
+ trans, inode_dir_wrong_nlink,
+ "directory %llu:%u with wrong i_nlink: got %u,
should be %llu",
+ w->last_pos.inode, i->snapshot, i->inode.bi_nlink,
i->count)) {
This shows the wrong message. It should use directory_size_mismatch
error and corresponding values.
+ i->inode.bi_size = i->i_size;
+ ret = bch2_fsck_write_inode(trans, &i->inode);
+ if (ret)
+ break;
+ }
+fsck_err:
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+static int check_subdir_dirents_count(struct btree_trans *trans, struct
inode_walker *w)
{
u32 restart_count = trans->restart_count;
return check_subdir_count_notnested(trans, w) ?:
+ check_dir_i_size_notnested(trans, w) ?:
trans_was_restarted(trans, restart_count);
}
@@ -2367,7 +2349,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto out;
if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
- ret = check_subdir_count(trans, dir);
+ ret = check_subdir_dirents_count(trans, dir);
if (ret)
goto err;
}
@@ -2457,9 +2439,11 @@ static int check_dirent(struct btree_trans *trans,
struct btree_iter *iter,
if (ret)
goto err;
- if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
+ if (d.v->d_type == DT_DIR)
i->count++;
+ i->i_size += bkey_bytes(d.k);
We cannot rely on the value of bkey_bytes of the older version (the
older is 0 for directory). The traversal order will affect the accounts
of i_size. We should recalculate the sub-entries.
Also I did the following test:
older version without dir i_size:
├── foo
│ ├── MAINTAINERS
│ ├── foo-2
│ │ ├── MAINTAINERS
│ │ └── profile
│ └── profile
└── lost+found
After fsck, it shows the i_size of foo-2 is still 0.
Thanks,
Hongbo
+ }
out:
err:
fsck_err: