[patch 135/208] iget: stop EXT2 from using iget() and read_inode()
From: David Howells [EMAIL PROTECTED] Stop the EXT2 filesystem from using iget() and read_inode(). Replace ext2_read_inode() with ext2_iget(), and call that instead of iget(). ext2_iget() then uses iget_locked() directly and returns a proper error code instead of an inode in the event of an error. ext2_fill_super() returns any error incurred when getting the root inode instead of EINVAL. [EMAIL PROTECTED]: coding-style fixes] Signed-off-by: David Howells [EMAIL PROTECTED] Acked-by: Theodore Ts'o [EMAIL PROTECTED] Cc: linux-ext4@vger.kernel.org Acked-by: Christoph Hellwig [EMAIL PROTECTED] Signed-off-by: Andrew Morton [EMAIL PROTECTED] --- fs/ext2/ext2.h |2 +- fs/ext2/inode.c | 29 + fs/ext2/namei.c | 12 ++-- fs/ext2/super.c | 32 ++-- 4 files changed, 46 insertions(+), 29 deletions(-) diff -puN fs/ext2/ext2.h~iget-stop-ext2-from-using-iget-and-read_inode-try fs/ext2/ext2.h --- a/fs/ext2/ext2.h~iget-stop-ext2-from-using-iget-and-read_inode-try +++ a/fs/ext2/ext2.h @@ -124,7 +124,7 @@ extern void ext2_check_inodes_bitmap (st extern unsigned long ext2_count_free (struct buffer_head *, unsigned); /* inode.c */ -extern void ext2_read_inode (struct inode *); +extern struct inode *ext2_iget (struct super_block *, unsigned long); extern int ext2_write_inode (struct inode *, int); extern void ext2_put_inode (struct inode *); extern void ext2_delete_inode (struct inode *); diff -puN fs/ext2/inode.c~iget-stop-ext2-from-using-iget-and-read_inode-try fs/ext2/inode.c --- a/fs/ext2/inode.c~iget-stop-ext2-from-using-iget-and-read_inode-try +++ a/fs/ext2/inode.c @@ -1181,22 +1181,33 @@ void ext2_get_inode_flags(struct ext2_in ei-i_flags |= EXT2_DIRSYNC_FL; } -void ext2_read_inode (struct inode * inode) +struct inode *ext2_iget (struct super_block *sb, unsigned long ino) { - struct ext2_inode_info *ei = EXT2_I(inode); - ino_t ino = inode-i_ino; + struct ext2_inode_info *ei; struct buffer_head * bh; - struct ext2_inode * raw_inode = ext2_get_inode(inode-i_sb, ino, bh); + struct ext2_inode *raw_inode; + struct inode *inode; + long ret = -EIO; int n; + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode-i_state I_NEW)) + return inode; + + ei = EXT2_I(inode); #ifdef CONFIG_EXT2_FS_POSIX_ACL ei-i_acl = EXT2_ACL_NOT_CACHED; ei-i_default_acl = EXT2_ACL_NOT_CACHED; #endif ei-i_block_alloc_info = NULL; - if (IS_ERR(raw_inode)) + raw_inode = ext2_get_inode(inode-i_sb, ino, bh); + if (IS_ERR(raw_inode)) { + ret = PTR_ERR(raw_inode); goto bad_inode; + } inode-i_mode = le16_to_cpu(raw_inode-i_mode); inode-i_uid = (uid_t)le16_to_cpu(raw_inode-i_uid_low); @@ -1220,6 +1231,7 @@ void ext2_read_inode (struct inode * ino if (inode-i_nlink == 0 (inode-i_mode == 0 || ei-i_dtime)) { /* this inode is deleted */ brelse (bh); + ret = -ESTALE; goto bad_inode; } inode-i_blocks = le32_to_cpu(raw_inode-i_blocks); @@ -1286,11 +1298,12 @@ void ext2_read_inode (struct inode * ino } brelse (bh); ext2_set_inode_flags(inode); - return; + unlock_new_inode(inode); + return inode; bad_inode: - make_bad_inode(inode); - return; + iget_failed(inode); + return ERR_PTR(ret); } static int ext2_update_inode(struct inode * inode, int do_sync) diff -puN fs/ext2/namei.c~iget-stop-ext2-from-using-iget-and-read_inode-try fs/ext2/namei.c --- a/fs/ext2/namei.c~iget-stop-ext2-from-using-iget-and-read_inode-try +++ a/fs/ext2/namei.c @@ -63,9 +63,9 @@ static struct dentry *ext2_lookup(struct ino = ext2_inode_by_name(dir, dentry); inode = NULL; if (ino) { - inode = iget(dir-i_sb, ino); - if (!inode) - return ERR_PTR(-EACCES); + inode = ext2_iget(dir-i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); } return d_splice_alias(inode, dentry); } @@ -83,10 +83,10 @@ struct dentry *ext2_get_parent(struct de ino = ext2_inode_by_name(child-d_inode, dotdot); if (!ino) return ERR_PTR(-ENOENT); - inode = iget(child-d_inode-i_sb, ino); + inode = ext2_iget(child-d_inode-i_sb, ino); - if (!inode) - return ERR_PTR(-EACCES); + if (IS_ERR(inode)) + return ERR_CAST(inode); parent = d_alloc_anon(inode); if (!parent) { iput(inode); diff -puN fs/ext2/super.c~iget-stop-ext2-from-using-iget-and-read_inode-try fs/ext2/super.c --- a/fs/ext2/super.c~iget-stop-ext2-from-using-iget-and-read_inode-try +++ a/fs/ext2/super.c @@
[patch 137/208] iget: stop EXT4 from using iget() and read_inode()
From: David Howells [EMAIL PROTECTED] Stop the EXT4 filesystem from using iget() and read_inode(). Replace ext4_read_inode() with ext4_iget(), and call that instead of iget(). ext4_iget() then uses iget_locked() directly and returns a proper error code instead of an inode in the event of an error. ext4_fill_super() returns any error incurred when getting the root inode instead of EINVAL. Signed-off-by: David Howells [EMAIL PROTECTED] Acked-by: Theodore Ts'o [EMAIL PROTECTED] Acked-by: Jan Kara [EMAIL PROTECTED] Cc: linux-ext4@vger.kernel.org Acked-by: Christoph Hellwig [EMAIL PROTECTED] Signed-off-by: Andrew Morton [EMAIL PROTECTED] --- fs/ext4/ialloc.c| 58 ++ fs/ext4/inode.c | 25 fs/ext4/namei.c | 29 +-- fs/ext4/resize.c|7 +--- fs/ext4/super.c | 36 +-- include/linux/ext4_fs.h |2 - 6 files changed, 87 insertions(+), 70 deletions(-) diff -puN fs/ext4/ialloc.c~iget-stop-ext4-from-using-iget-and-read_inode-try fs/ext4/ialloc.c --- a/fs/ext4/ialloc.c~iget-stop-ext4-from-using-iget-and-read_inode-try +++ a/fs/ext4/ialloc.c @@ -782,14 +782,15 @@ struct inode *ext4_orphan_get(struct sup unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)-s_es-s_inodes_count); ext4_group_t block_group; int bit; - struct buffer_head *bitmap_bh = NULL; + struct buffer_head *bitmap_bh; struct inode *inode = NULL; + long err = -EIO; /* Error cases - e2fsck has already cleaned up for us */ if (ino max_ino) { ext4_warning(sb, __FUNCTION__, bad orphan ino %lu! e2fsck was run?, ino); - goto out; + goto error; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); @@ -798,38 +799,49 @@ struct inode *ext4_orphan_get(struct sup if (!bitmap_bh) { ext4_warning(sb, __FUNCTION__, inode bitmap error for orphan %lu, ino); - goto out; + goto error; } /* Having the inode bit set should be a 100% indicator that this * is a valid orphan (no e2fsck run on fs). Orphans also include * inodes that were being truncated, so we can't check i_nlink==0. */ - if (!ext4_test_bit(bit, bitmap_bh-b_data) || - !(inode = iget(sb, ino)) || is_bad_inode(inode) || - NEXT_ORPHAN(inode) max_ino) { - ext4_warning(sb, __FUNCTION__, -bad orphan inode %lu! e2fsck was run?, ino); - printk(KERN_NOTICE ext4_test_bit(bit=%d, block=%llu) = %d\n, - bit, (unsigned long long)bitmap_bh-b_blocknr, - ext4_test_bit(bit, bitmap_bh-b_data)); - printk(KERN_NOTICE inode=%p\n, inode); - if (inode) { - printk(KERN_NOTICE is_bad_inode(inode)=%d\n, - is_bad_inode(inode)); - printk(KERN_NOTICE NEXT_ORPHAN(inode)=%u\n, - NEXT_ORPHAN(inode)); - printk(KERN_NOTICE max_ino=%lu\n, max_ino); - } + if (!ext4_test_bit(bit, bitmap_bh-b_data)) + goto bad_orphan; + + inode = ext4_iget(sb, ino); + if (IS_ERR(inode)) + goto iget_failed; + + if (NEXT_ORPHAN(inode) max_ino) + goto bad_orphan; + brelse(bitmap_bh); + return inode; + +iget_failed: + err = PTR_ERR(inode); + inode = NULL; +bad_orphan: + ext4_warning(sb, __FUNCTION__, +bad orphan inode %lu! e2fsck was run?, ino); + printk(KERN_NOTICE ext4_test_bit(bit=%d, block=%llu) = %d\n, + bit, (unsigned long long)bitmap_bh-b_blocknr, + ext4_test_bit(bit, bitmap_bh-b_data)); + printk(KERN_NOTICE inode=%p\n, inode); + if (inode) { + printk(KERN_NOTICE is_bad_inode(inode)=%d\n, + is_bad_inode(inode)); + printk(KERN_NOTICE NEXT_ORPHAN(inode)=%u\n, + NEXT_ORPHAN(inode)); + printk(KERN_NOTICE max_ino=%lu\n, max_ino); /* Avoid freeing blocks if we got a bad deleted inode */ - if (inode inode-i_nlink == 0) + if (inode-i_nlink == 0) inode-i_blocks = 0; iput(inode); - inode = NULL; } -out: brelse(bitmap_bh); - return inode; +error: + return ERR_PTR(err); } unsigned long ext4_count_free_inodes (struct super_block * sb) diff -puN fs/ext4/inode.c~iget-stop-ext4-from-using-iget-and-read_inode-try fs/ext4/inode.c --- a/fs/ext4/inode.c~iget-stop-ext4-from-using-iget-and-read_inode-try +++ a/fs/ext4/inode.c @@ -2680,21 +2680,31 @@ static blkcnt_t
Re: BUG_ON at mballoc.c:3752
On Wed, Feb 06, 2008 at 03:59:48PM -0600, Dave Kleikamp wrote: File systems should not call BUG() due to a corrupt file system. Instead the code should fail the operation, possibly marking the file system read-only (or panicking) depending on the errors= mount option. Eric Sandeen explained me the same on IRC. I was busy with the migrate locking bug. That's why i didn't update here. Today i tried to reproduce the problem using the image provided. But in my case it is not hitting the BUG_ON (mostly due to single cpu). I did look at the code and am not still not clear how we can hit that BUG_ON. prealloc free space pa_free is generated out of bitmap. So only if something corrupted bitmap after we initialized prealloc space we will hit this case. In mballoc we error out if the block allocated or fall in system zone. One thing i noticed is, the journal is corrupt. So the only possibility that i have is journal write resulted in bitmap corruption. I also looked at the mballoc to make sure we don't panic in case of a corrupt bitmap. Below is the patch that i have now. This one is yet to go through the ABAT test but it would be nice to see whether the below change cause any other issues. Eric , can you run the test with below patch and see if this makes any difference ?. I know we are not fixing any bugs in the below patch. ext4: Don't panic in case of corrupt bitmap From: Aneesh Kumar K.V [EMAIL PROTECTED] Multiblock allocator was calling BUG_ON in many case if the free and used blocks count obtained looking at the bitmap is different from what the allocator internally accounted for. Use ext4_error in such case and don't panic the system. Signed-off-by: Aneesh Kumar K.V [EMAIL PROTECTED] --- fs/ext4/mballoc.c | 35 +-- 1 files changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 06d1f52..656729b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -680,7 +680,6 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; - /* FIXME!! is this needed */ BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); BUG_ON(max == NULL); @@ -964,7 +963,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, grp-bb_fragments = fragments; if (free != grp-bb_free) { - printk(KERN_DEBUG + ext4_error(sb, __FUNCTION__, EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n, group, free, grp-bb_free); grp-bb_free = free; @@ -1821,13 +1820,24 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, i = ext4_find_next_zero_bit(bitmap, EXT4_BLOCKS_PER_GROUP(sb), i); if (i = EXT4_BLOCKS_PER_GROUP(sb)) { - BUG_ON(free != 0); + /* +* IF we corrupt the bitmap we won't find any +* free blocks even though group info says we +* we have free blocks +*/ + ext4_error(sb, __FUNCTION__, %d free blocks as per + group info. But bitmap says 0\n, + free); break; } mb_find_extent(e4b, 0, i, ac-ac_g_ex.fe_len, ex); BUG_ON(ex.fe_len = 0); - BUG_ON(free ex.fe_len); + if (free ex.fe_len) { + ext4_error(sb, __FUNCTION__, %d free blocks as per + group info. But got %d blocks\n, + free, ex.fe_len); + } ext4_mb_measure_extent(ac, ex, e4b); @@ -3354,13 +3364,10 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, ac-ac_pa = pa; /* we don't correct pa_pstart or pa_plen here to avoid -* possible race when tte group is being loaded concurrently +* possible race when the group is being loaded concurrently * instead we correct pa later, after blocks are marked -* in on-disk bitmap -- see ext4_mb_release_context() */ - /* -* FIXME!! but the other CPUs can look at this particular -* pa and think that it have enought free blocks if we -* don't update pa_free here right ? +* in on-disk bitmap -- see ext4_mb_release_context() +* Other CPUs are prevented from allocating from this pa by lg_mutex */ mb_debug(use %u/%u from group pa %p\n, pa-pa_lstart-len, len, pa); } @@ -3743,13 +3750,13 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, bit = next + 1; } if (free != pa-pa_free) { - printk(KERN_ERR pa %p: logic %lu, phys. %lu, len %lu\n, +
Re: [PATCH 0/3] Move fsck from e2fsprogs to util-linux-ng
On Thu, Feb 07, 2008 at 01:37:43AM +0100, Kay Sievers wrote: Care to explain what ext4 development has to do with the generic fsck program? I don''t see any convincing reason not to move that now. Fsck and mount (and especially mount) needs to be linked against the very latest blkid library in order to make sure things work, and when we spread out packages across the various different packages, it means updating things to assure correct functionality is that much harder. It also means getting the right ext4 detection routines into vol_id, and in the past I've had real problems getting patches that fix real bugs in vol_id past you, and I'm just too busy right now to deal with that right now. - Ted - To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG
New bitmap and inode table allocation for FLEX_BG From: Jose R. Santos [EMAIL PROTECTED] Change the way we allocate bitmaps and inode tables if the FLEX_BG feature is used at mke2fs time. It places calculates a new offset for bitmaps and inode table base on the number of groups that the user wishes to pack together using the new -G option. Creating a filesystem with 64 block groups in a flex group can be done by: mke2fs -j -I 256 -O flex_bg -G 32 /dev/sdX Signed-off-by: Jose R. Santos [EMAIL PROTECTED] Signed-off-by: Valerie Clement [EMAIL PROTECTED] --- lib/ext2fs/alloc_tables.c | 116 - lib/ext2fs/closefs.c |6 ++ lib/ext2fs/ext2_fs.h |6 ++ lib/ext2fs/initialize.c |6 ++ misc/mke2fs.c | 25 +- 5 files changed, 151 insertions(+), 8 deletions(-) diff --git a/lib/ext2fs/alloc_tables.c b/lib/ext2fs/alloc_tables.c index 4ad2ba9..8281858 100644 --- a/lib/ext2fs/alloc_tables.c +++ b/lib/ext2fs/alloc_tables.c @@ -27,18 +27,82 @@ #include ext2_fs.h #include ext2fs.h +void ext2fs_bgd_set_flex_meta_flag(ext2_filsys fs, blk_t block) +{ + dgrp_t group; + + group = ext2fs_group_of_blk(fs, block); + if (!(fs-group_desc[group].bg_flags EXT2_BG_FLEX_METADATA)) + fs-group_desc[group].bg_flags |= EXT2_BG_FLEX_METADATA; +} + +blk_t ext2fs_flexbg_offset(ext2_filsys fs, dgrp_t group, blk_t start_blk, + ext2fs_block_bitmap bmap, int offset, int size) +{ + int flexbg, flexbg_size, elem_size; + blk_t last_blk, first_free = 0; + dgrp_t last_grp; + + flexbg_size = 1 fs-super-s_log_groups_per_flex; + flexbg = group / flexbg_size; + + if (size fs-super-s_blocks_per_group / 8) + size = fs-super-s_blocks_per_group / 8; + + /* +* Dont do a long search if the previous block +* search is still valid. +*/ + if (start_blk group % flexbg_size) { + if (size flexbg_size) + elem_size = fs-inode_blocks_per_group; + else + elem_size = 1; + if (ext2fs_test_block_bitmap_range(bmap, start_blk + elem_size, + size)) + return start_blk + elem_size; + } + + start_blk = ext2fs_group_first_block(fs, flexbg_size * flexbg); + last_grp = (group + (flexbg_size - (group % flexbg_size) - 1)); + last_blk = ext2fs_group_last_block(fs, last_grp); + if (last_grp fs-group_desc_count) + last_grp = fs-group_desc_count; + + /* Find the first available block */ + if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap, + first_free)) + return first_free; + + if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size, + bmap, first_free)) + return first_free; + + return first_free; +} + errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group, ext2fs_block_bitmap bmap) { errcode_t retval; blk_t group_blk, start_blk, last_blk, new_blk, blk; - int j; + dgrp_t last_grp; + int j, rem_grps, flexbg_size = 0; group_blk = ext2fs_group_first_block(fs, group); last_blk = ext2fs_group_last_block(fs, group); if (!bmap) bmap = fs-block_map; + + if (EXT2_HAS_INCOMPAT_FEATURE (fs-super, + EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + flexbg_size = 1 fs-super-s_log_groups_per_flex; + rem_grps = flexbg_size - (group % flexbg_size); + last_grp = group + rem_grps - 1; + if (last_grp fs-group_desc_count) + last_grp = fs-group_desc_count; + } /* * Allocate the block and inode bitmaps, if necessary @@ -56,6 +120,15 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group, } else start_blk = group_blk; + if (flexbg_size) { + int prev_block = 0; + if (group fs-group_desc[group-1].bg_block_bitmap) + prev_block = fs-group_desc[group-1].bg_block_bitmap; + start_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap, +0, rem_grps); + last_blk = ext2fs_group_last_block(fs, last_grp); + } + if (!fs-group_desc[group].bg_block_bitmap) { retval = ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap, new_blk); @@ -66,6 +139,21 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group, return
Re: [PATCH] allocate struct ext4_allocation_context from a kmem cache to save stack space
On Wed, 2008-02-06 at 11:05 -0600, Eric Sandeen wrote: struct ext4_allocation_context is rather large, and this bloats the stack of many functions which use it. Allocating it from a named slab cache will alleviate this. For example, with this change (on top of the noinline patch sent earlier): -ext4_mb_new_blocks 200 +ext4_mb_new_blocks40 -ext4_mb_free_blocks 344 +ext4_mb_free_blocks 168 -ext4_mb_release_inode_pa 216 +ext4_mb_release_inode_pa 40 -ext4_mb_release_group_pa 192 +ext4_mb_release_group_pa 24 Most of these stack-allocated structs are actually used only for mballoc history; and in those cases often a smaller struct would do. So changing that may be another way around it, at least for those functions, if preferred. For now, in those cases where the ac is only for history, an allocation failure simply skips the history recording, and does not cause any other failures. Comments? Signed-off-by: Eric Sandeen [EMAIL PROTECTED] --- Index: linux-2.6.24-rc6-mm1/fs/ext4/mballoc.c === --- linux-2.6.24-rc6-mm1.orig/fs/ext4/mballoc.c +++ linux-2.6.24-rc6-mm1/fs/ext4/mballoc.c @@ -419,6 +419,7 @@ #define MB_DEFAULT_GROUP_PREALLOC512 static struct kmem_cache *ext4_pspace_cachep; +static struct kmem_cache *ext4_ac_cachep; #ifdef EXT4_BB_MAX_BLOCKS #undef EXT4_BB_MAX_BLOCKS @@ -2958,11 +2959,18 @@ int __init init_ext4_mballoc(void) if (ext4_pspace_cachep == NULL) return -ENOMEM; -#ifdef CONFIG_PROC_FS + ext4_ac_cachep = + kmem_cache_create(ext4_alloc_context, + sizeof(struct ext4_allocation_context), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (ext4_ac_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + return -ENOMEM; + } + proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs); if (proc_root_ext4 == NULL) printk(KERN_ERR EXT4-fs: Unable to create %s\n, EXT4_ROOT); -#endif return 0; } @@ -2971,9 +2979,8 @@ void exit_ext4_mballoc(void) { /* XXX: synchronize_rcu(); */ kmem_cache_destroy(ext4_pspace_cachep); -#ifdef CONFIG_PROC_FS + kmem_cache_destroy(ext4_ac_cachep); remove_proc_entry(EXT4_ROOT, proc_root_fs); -#endif } Do you intend to remove the #ifdef CONFIG_PROC_FS, or it's a accident? I think we need keep that to allow ext4 build without procfs configured. Other than this, the patch looks fine to me.:) Mingming - To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] allocate struct ext4_allocation_context from a kmem cache to save stack space
Mingming Cao wrote: Do you intend to remove the #ifdef CONFIG_PROC_FS, or it's a accident? I think we need keep that to allow ext4 build without procfs configured. Other than this, the patch looks fine to me.:) oh, it kind of snuck in. It actually should still build, as remove_proc_entry is a no-op function w/o the config option. Feel free to leave it in if you like though. -Eric - To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: BUG_ON at mballoc.c:3752
On Thu, 2008-02-07 at 18:25 +0530, Aneesh Kumar K.V wrote: ext4: Don't panic in case of corrupt bitmap From: Aneesh Kumar K.V [EMAIL PROTECTED] Multiblock allocator was calling BUG_ON in many case if the free and used blocks count obtained looking at the bitmap is different from what the allocator internally accounted for. Use ext4_error in such case and don't panic the system. There seems a lot of BUG_ON() and BUG() in mballoc code, other than this case. Should it always panic the whole system in those cases? Perhaps replacing with ext4_error() or some cases just WARN_ON is enough. Mingming Signed-off-by: Aneesh Kumar K.V [EMAIL PROTECTED] --- fs/ext4/mballoc.c | 35 +-- 1 files changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 06d1f52..656729b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -680,7 +680,6 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; - /* FIXME!! is this needed */ BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); BUG_ON(max == NULL); @@ -964,7 +963,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, grp-bb_fragments = fragments; if (free != grp-bb_free) { - printk(KERN_DEBUG + ext4_error(sb, __FUNCTION__, EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n, group, free, grp-bb_free); grp-bb_free = free; @@ -1821,13 +1820,24 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, i = ext4_find_next_zero_bit(bitmap, EXT4_BLOCKS_PER_GROUP(sb), i); if (i = EXT4_BLOCKS_PER_GROUP(sb)) { - BUG_ON(free != 0); + /* + * IF we corrupt the bitmap we won't find any + * free blocks even though group info says we + * we have free blocks + */ + ext4_error(sb, __FUNCTION__, %d free blocks as per + group info. But bitmap says 0\n, + free); break; } mb_find_extent(e4b, 0, i, ac-ac_g_ex.fe_len, ex); BUG_ON(ex.fe_len = 0); - BUG_ON(free ex.fe_len); + if (free ex.fe_len) { + ext4_error(sb, __FUNCTION__, %d free blocks as per + group info. But got %d blocks\n, + free, ex.fe_len); + } ext4_mb_measure_extent(ac, ex, e4b); @@ -3354,13 +3364,10 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, ac-ac_pa = pa; /* we don't correct pa_pstart or pa_plen here to avoid - * possible race when tte group is being loaded concurrently + * possible race when the group is being loaded concurrently * instead we correct pa later, after blocks are marked - * in on-disk bitmap -- see ext4_mb_release_context() */ - /* - * FIXME!! but the other CPUs can look at this particular - * pa and think that it have enought free blocks if we - * don't update pa_free here right ? + * in on-disk bitmap -- see ext4_mb_release_context() + * Other CPUs are prevented from allocating from this pa by lg_mutex */ mb_debug(use %u/%u from group pa %p\n, pa-pa_lstart-len, len, pa); } @@ -3743,13 +3750,13 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, bit = next + 1; } if (free != pa-pa_free) { - printk(KERN_ERR pa %p: logic %lu, phys. %lu, len %lu\n, + printk(KERN_CRIT pa %p: logic %lu, phys. %lu, len %lu\n, pa, (unsigned long) pa-pa_lstart, (unsigned long) pa-pa_pstart, (unsigned long) pa-pa_len); - printk(KERN_ERR free %u, pa_free %u\n, free, pa-pa_free); + ext4_error(sb, __FUNCTION__, free %u, pa_free %u\n, + free, pa-pa_free); } - BUG_ON(free != pa-pa_free); atomic_add(free, sbi-s_mb_discarded); return err; @@ -4405,7 +4412,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, unsigned long block, unsigned long count, int metadata, unsigned long *freed) { - struct buffer_head *bitmap_bh = 0; + struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode-i_sb; struct ext4_allocation_context ac; struct ext4_group_desc *gdp; - To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to [EMAIL
Re: [PATCH] allocate struct ext4_allocation_context from a kmem cache to save stack space
On Thu, 2008-02-07 at 19:06 -0600, Eric Sandeen wrote: Mingming Cao wrote: Do you intend to remove the #ifdef CONFIG_PROC_FS, or it's a accident? I think we need keep that to allow ext4 build without procfs configured. Other than this, the patch looks fine to me.:) oh, it kind of snuck in. It actually should still build, as remove_proc_entry is a no-op function w/o the config option. Oh, I mean the proc_mkdir(EXT4_ROOT, proc_root_fs) will complain w/o CONFIG_PROC_FS configured. Mingming - To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] allocate struct ext4_allocation_context from a kmem cache to save stack space
Mingming Cao wrote: On Thu, 2008-02-07 at 19:06 -0600, Eric Sandeen wrote: Mingming Cao wrote: Do you intend to remove the #ifdef CONFIG_PROC_FS, or it's a accident? I think we need keep that to allow ext4 build without procfs configured. Other than this, the patch looks fine to me.:) oh, it kind of snuck in. It actually should still build, as remove_proc_entry is a no-op function w/o the config option. Oh, I mean the proc_mkdir(EXT4_ROOT, proc_root_fs) will complain w/o CONFIG_PROC_FS configured. Mingming it'll build: static inline struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent) {return NULL;} yes, it'll issue a printk though. *shrug* I like fewer #ifdefs better than more, but doesn't matter much to me. -Eric - To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Bugme-new] [Bug 9911] New: fsync blocks concurrent writes to file
On Thu, 7 Feb 2008 17:40:57 -0800 (PST) [EMAIL PROTECTED] wrote: http://bugzilla.kernel.org/show_bug.cgi?id=9911 Summary: fsync blocks concurrent writes to file Product: File System Version: 2.5 KernelVersion: 2.6.23.8 Platform: All OS/Version: Linux Tree: Mainline Status: NEW Severity: normal Priority: P1 Component: ext3 AssignedTo: [EMAIL PROTECTED] ReportedBy: [EMAIL PROTECTED] Latest working kernel version: none Earliest failing kernel version: 2.6.18 Distribution: RHEL 4 Fedora 8 Hardware Environment: Dual Xeon dual-core Athlon Software Environment: Java 1.5.0_14 Problem Description: Multiple threads append transactions to a single file. When fsyncs are issued, no writes complete until the fsync completes. This has the effect of yielding the same write rate regardless of how many threads are writing to the file. Solaris 10 does not exhibit this problem and scales well as additional threads are added. I have confirmed this by running strace and witnessing that writes block until immediately after the fsync completes. When using truss on Solaris I have witnessed multiple writes completing while an fsync was in progress. Steps to reproduce: Open a file for writing. Launch multiple threads with each one writing data and followed by an fsync, but only if data has been written since the last fsync. I have googled this issue substantially and have found no answers. I supposing teaching java about sync_file_range() would be all too hard. - To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: BUG_ON at mballoc.c:3752
Aneesh Kumar K.V wrote: On Thu, Feb 07, 2008 at 05:30:48PM -0800, Mingming Cao wrote: On Thu, 2008-02-07 at 18:25 +0530, Aneesh Kumar K.V wrote: ext4: Don't panic in case of corrupt bitmap From: Aneesh Kumar K.V [EMAIL PROTECTED] Multiblock allocator was calling BUG_ON in many case if the free and used blocks count obtained looking at the bitmap is different from what the allocator internally accounted for. Use ext4_error in such case and don't panic the system. There seems a lot of BUG_ON() and BUG() in mballoc code, other than this case. Should it always panic the whole system in those cases? Perhaps replacing with ext4_error() or some cases just WARN_ON is enough. I had looked at the BUG_ON in mballoc code and found them very useful while stabilizing the mballoc code. It helped to catch wrong usage of functions. Most of the BUG_ON are there to make sure we call the API with the lock held or the API should not return value greater than 'x' Should not call the function with a particular argument as NULL ...etc kind of thing. So i would suggest to keep them as such. Yep - as long as the BUG_ONs are for programming errors, and not things like memory allocation failures or corrupted disks, I think it's ok. Not that I've audited them all. :) I asked about this early on, and got that answer... I'm reasonably satisfied with it. -Eric - To unsubscribe from this list: send the line unsubscribe linux-ext4 in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html