Dmitry Monakhov <dmonak...@openvz.org> writes: > If filesystem holds transaction open 'current->journal_info' it should not > performs memory allocations with __GFP_FS flag enabled otherwise this result > in fs > reentarance which lead to: > 1) reentrance to itself : deadlock or internal assertion failure due to > incorrect journal credits > 1) entrance to another fs: assertion faulure or silient corruption due to > incorrect journal I've run xfstests suite for ext4, xfs and btrfs and it is appeared that btrfs has number of issues with fs re-entrance
#BUG1: btrfs_create, btrfs_mknode, btrfs_synlink, etc btrfs_create: ->btrfs_start_transaction ->btrfs_find_free_ino ->start_caching ->kthread_run -> try to allocate mem with GFP_KERNEL I'm not expert in btrfs but it looks like this issue may be fixed easily by moving btrfs_find_free_ino out of transaction scope. #BUG2 btrfs_ioctl_send create fake journal transaction current->journal_info = BTRFS_SEND_TRANS_STUB and then call vfs_write which performs various mem allocation WARNING: CPU: 1 PID: 30532 at mm/page_alloc.c:2808 __alloc_pages_nodemask+0xca/0x65d() Modules linked in: CPU: 1 PID: 30532 Comm: btrfs Not tainted 3.18.0-rc2-00012-g9f89e906-dirty #219 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 0000000000000af8 ffff88007acd79d8 ffffffff816bc74b 0000000000000af8 0000000000000000 ffff88007acd7a18 ffffffff81086599 00000000000200d2 ffffffff81144d2d 00000000000200d2 0000000000000000 0000000000000000 Call Trace: [<ffffffff816bc74b>] dump_stack+0x49/0x5e [<ffffffff81086599>] warn_slowpath_common+0x81/0x9b [<ffffffff81144d2d>] ? __alloc_pages_nodemask+0xca/0x65d [<ffffffff810865cd>] warn_slowpath_null+0x1a/0x1c [<ffffffff81144d2d>] __alloc_pages_nodemask+0xca/0x65d [<ffffffff810bc9fa>] ? trace_hardirqs_on_caller+0x164/0x19b [<ffffffff810bca3e>] ? trace_hardirqs_on+0xd/0xf [<ffffffff811863d6>] ? pipe_write+0x40/0x419 [<ffffffff811863d6>] ? pipe_write+0x40/0x419 [<ffffffff8118658b>] pipe_write+0x1f5/0x419 [<ffffffff8117e80a>] new_sync_write+0x8a/0xb2 [<ffffffff8117fdb7>] vfs_write+0xb5/0x14d [<ffffffff81375b9b>] write_buf+0x58/0x8e [<ffffffff8137c382>] btrfs_ioctl_send+0x5fa/0xdf8 [<ffffffff810ab549>] ? sched_clock_local+0x1c/0x82 [<ffffffff810bc488>] ? mark_lock+0x2d/0x1ec [<ffffffff810bdc0f>] ? __lock_acquire+0x3e8/0xf39 [<ffffffff810ab852>] ? sched_clock_cpu+0x8e/0xaa [<ffffffff816c1d83>] ? _raw_spin_unlock_irqrestore+0x55/0x72 [<ffffffff8134f6f8>] btrfs_ioctl+0x1258/0x1420 [<ffffffff8104a9b1>] ? sched_clock+0x17/0x1b [<ffffffff810ab549>] ? sched_clock_local+0x1c/0x82 [<ffffffff810ab852>] ? sched_clock_cpu+0x8e/0xaa [<ffffffff8118e227>] do_vfs_ioctl+0x43f/0x485 [<ffffffff81197154>] ? __fget+0x8c/0x97 [<ffffffff8118e2c7>] SyS_ioctl+0x5a/0x7f [<ffffffff816c2229>] system_call_fastpath+0x12/0x17 > > Signed-off-by: Dmitry Monakhov <dmonak...@openvz.org> > --- > include/linux/kernel.h | 7 +++++++ > mm/dmapool.c | 1 + > mm/mempool.c | 1 + > mm/page_alloc.c | 1 + > mm/slab.c | 1 + > mm/slub.c | 1 + > 6 files changed, 12 insertions(+), 0 deletions(-) > > diff --git a/include/linux/kernel.h b/include/linux/kernel.h > index 3d770f5..69923d4 100644 > --- a/include/linux/kernel.h > +++ b/include/linux/kernel.h > @@ -232,6 +232,13 @@ void might_fault(void); > static inline void might_fault(void) { } > #endif > > +#ifdef CONFIG_PROVE_LOCKING > +#define might_enter_fs_if(cond) \ > + WARN_ON_ONCE((cond) && current->journal_info) > +#else > +static inline void might_enter_fs_if(bool cond) { } > +#endif > + > extern struct atomic_notifier_head panic_notifier_list; > extern long (*panic_blink)(int state); > __printf(1, 2) > diff --git a/mm/dmapool.c b/mm/dmapool.c > index fd5fe43..c543eb8 100644 > --- a/mm/dmapool.c > +++ b/mm/dmapool.c > @@ -324,6 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t > mem_flags, > void *retval; > > might_sleep_if(mem_flags & __GFP_WAIT); > + might_enter_fs_if(mem_flags & __GFP_FS); > > spin_lock_irqsave(&pool->lock, flags); > list_for_each_entry(page, &pool->page_list, page_list) { > diff --git a/mm/mempool.c b/mm/mempool.c > index e209c98..b5bb86f 100644 > --- a/mm/mempool.c > +++ b/mm/mempool.c > @@ -204,6 +204,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) > > VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); > might_sleep_if(gfp_mask & __GFP_WAIT); > + might_enter_fs_if(gfp_mask & __GFP_FS); > > gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ > gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 9cd36b8..284a699 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -2805,6 +2805,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int > order, > lockdep_trace_alloc(gfp_mask); > > might_sleep_if(gfp_mask & __GFP_WAIT); > + might_enter_fs_if(gfp_mask & __GFP_FS); > > if (should_fail_alloc_page(gfp_mask, order)) > return NULL; > diff --git a/mm/slab.c b/mm/slab.c > index eb2b2ea..43b0d2f 100644 > --- a/mm/slab.c > +++ b/mm/slab.c > @@ -2844,6 +2844,7 @@ static inline void cache_alloc_debugcheck_before(struct > kmem_cache *cachep, > gfp_t flags) > { > might_sleep_if(flags & __GFP_WAIT); > + might_enter_fs_if(flags & __GFP_FS); > #if DEBUG > kmem_flagcheck(cachep, flags); > #endif > diff --git a/mm/slub.c b/mm/slub.c > index ae7b9f1..474fc53 100644 > --- a/mm/slub.c > +++ b/mm/slub.c > @@ -1238,6 +1238,7 @@ static inline int slab_pre_alloc_hook(struct kmem_cache > *s, gfp_t flags) > flags &= gfp_allowed_mask; > lockdep_trace_alloc(flags); > might_sleep_if(flags & __GFP_WAIT); > + might_enter_fs_if(flags & __GFP_FS); > > return should_failslab(s->object_size, flags, s->flags); > } > -- > 1.7.1
signature.asc
Description: PGP signature