[PATCH] printk: Remove no longer used LOG_PREFIX.
When commit 5becfb1df5ac8e49 ("kmsg: merge continuation records while printing") introduced LOG_PREFIX, we used KERN_DEFAULT etc. as a flag for setting LOG_PREFIX in order to tell whether to call cont_add() (i.e. whether to append the message to "struct cont"). But since commit 4bcc595ccd80decb ("printk: reinstate KERN_CONT for printing continuation lines") inverted the behavior (i.e. don't append the message to "struct cont" unless KERN_CONT is specified) and commit 5aa068ea4082b39e ("printk: remove games with previous record flags") removed the last LOG_PREFIX check, setting LOG_PREFIX via KERN_DEFAULT etc. is no longer meaningful. Therefore, we can remove LOG_PREFIX and make KERN_DEFAULT empty string. Signed-off-by: Tetsuo Handa --- include/linux/kern_levels.h | 2 +- include/linux/printk.h | 1 - kernel/printk/printk.c | 6 +- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/include/linux/kern_levels.h b/include/linux/kern_levels.h index d237fe8..bf2389c 100644 --- a/include/linux/kern_levels.h +++ b/include/linux/kern_levels.h @@ -14,7 +14,7 @@ #define KERN_INFO KERN_SOH "6"/* informational */ #define KERN_DEBUG KERN_SOH "7"/* debug-level messages */ -#define KERN_DEFAULT KERN_SOH "d"/* the default kernel loglevel */ +#define KERN_DEFAULT "" /* the default kernel loglevel */ /* * Annotation for a "continued" line of log printout (only done after a diff --git a/include/linux/printk.h b/include/linux/printk.h index 77740a5..9fe6c9e 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -18,7 +18,6 @@ static inline int printk_get_level(const char *buffer) if (buffer[0] == KERN_SOH_ASCII && buffer[1]) { switch (buffer[1]) { case '0' ... '7': - case 'd': /* KERN_DEFAULT */ case 'c': /* KERN_CONT */ return buffer[1]; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d3d1703..3e4a735 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -344,7 +344,6 @@ enum con_msg_format_flags { enum log_flags { LOG_NEWLINE = 2,/* text ended with a newline */ - LOG_PREFIX = 4,/* text started with a prefix */ LOG_CONT= 8,/* text is a fragment of a continuation line */ }; @@ -1867,9 +1866,6 @@ int vprintk_store(int facility, int level, case '0' ... '7': if (level == LOGLEVEL_DEFAULT) level = kern_level - '0'; - /* fallthrough */ - case 'd': /* KERN_DEFAULT */ - lflags |= LOG_PREFIX; break; case 'c': /* KERN_CONT */ lflags |= LOG_CONT; @@ -1884,7 +1880,7 @@ int vprintk_store(int facility, int level, level = default_message_loglevel; if (dict) - lflags |= LOG_PREFIX|LOG_NEWLINE; + lflags |= LOG_NEWLINE; return log_output(facility, level, lflags, dict, dictlen, text, text_len); -- 1.8.3.1
Re: [Bug 199931] New: systemd/rtorrent file data corruption when using echo 3 >/proc/sys/vm/drop_caches
On 2018/06/06 5:03, Andrew Morton wrote: > > (switched to email. Please respond via emailed reply-to-all, not via the > bugzilla web interface). > > On Tue, 05 Jun 2018 18:01:36 + bugzilla-dae...@bugzilla.kernel.org wrote: > >> https://bugzilla.kernel.org/show_bug.cgi?id=199931 >> >> Bug ID: 199931 >>Summary: systemd/rtorrent file data corruption when using echo >> 3 >/proc/sys/vm/drop_caches > > A long tale of woe here. Chris, do you think the pagecache corruption > is a general thing, or is it possible that btrfs is contributing? According to timestamp of my testcases, I was observing corrupted-bytes issue upon OOM-kill (without using btrfs) as of 2017 Aug 11. Thus, I don't think that this is specific to btrfs. But I can't find which patch fixed this issue. #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #define NUMTHREADS 512 #define STACKSIZE 8192 static int pipe_fd[2] = { EOF, EOF }; static int file_writer(void *i) { char buffer[4096] = { }; int fd; snprintf(buffer, sizeof(buffer), "/tmp/file.%lu", (unsigned long) i); fd = open(buffer, O_WRONLY | O_CREAT | O_APPEND, 0600); memset(buffer, 0xFF, sizeof(buffer)); read(pipe_fd[0], buffer, 1); while (write(fd, buffer, sizeof(buffer)) == sizeof(buffer)); return 0; } int main(int argc, char *argv[]) { char *buf = NULL; unsigned long size; unsigned long i; char *stack; if (pipe(pipe_fd)) return 1; stack = malloc(STACKSIZE * NUMTHREADS); for (size = 1048576; size < 512UL * (1 << 30); size <<= 1) { char *cp = realloc(buf, size); if (!cp) { size >>= 1; break; } buf = cp; } for (i = 0; i < NUMTHREADS; i++) if (clone(file_writer, stack + (i + 1) * STACKSIZE, CLONE_THREAD | CLONE_SIGHAND | CLONE_VM | CLONE_FS | CLONE_FILES, (void *) i) == -1) break; close(pipe_fd[1]); /* Will cause OOM due to overcommit; if not use SysRq-f */ for (i = 0; i < size; i += 4096) buf[i] = 0; kill(-1, SIGKILL); return 0; } #include #include #include #include #include #include int main(int argc, char *argv[]) { char buffer2[64] = { }; int ret = 0; int i; for (i = 0; i < 1024; i++) { int flag = 0; int fd; unsigned int byte[256]; int j; snprintf(buffer2, sizeof(buffer2), "/tmp/file.%u", i); fd = open(buffer2, O_RDONLY); if (fd == EOF) continue; lseek(fd, -4096, SEEK_END); memset(byte, 0, sizeof(byte)); while (1) { static unsigned char buffer[1048576]; int len = read(fd, (char *) buffer, sizeof(buffer)); if (len <= 0) break; for (j = 0; j < len; j++) if (buffer[j] != 0xFF) byte[buffer[j]]++; } close(fd); for (j = 0; j < 255; j++) if (byte[j]) { printf("ERROR: %u %u in %s\n", byte[j], j, buffer2); flag = 1; } if (flag == 0) unlink(buffer2); else ret = 1; } return ret; } -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] lockdep: Fix fs_reclaim warning.
Peter, are you OK with this patch? Tetsuo Handa wrote: > From 361d37a7d36978020dfb4c11ec1f4800937ccb68 Mon Sep 17 00:00:00 2001 > From: Tetsuo Handa > Date: Thu, 8 Feb 2018 10:35:35 +0900 > Subject: [PATCH v2] lockdep: Fix fs_reclaim warning. > > Dave Jones reported fs_reclaim lockdep warnings. > > > WARNING: possible recursive locking detected > 4.15.0-rc9-backup-debug+ #1 Not tainted > > sshd/24800 is trying to acquire lock: >(fs_reclaim){+.+.}, at: [<84f438c2>] > fs_reclaim_acquire.part.102+0x5/0x30 > > but task is already holding lock: >(fs_reclaim){+.+.}, at: [<84f438c2>] > fs_reclaim_acquire.part.102+0x5/0x30 > > other info that might help us debug this: >Possible unsafe locking scenario: > > CPU0 > > lock(fs_reclaim); > lock(fs_reclaim); > >*** DEADLOCK *** > >May be due to missing lock nesting notation > > 2 locks held by sshd/24800: >#0: (sk_lock-AF_INET6){+.+.}, at: [<1a069652>] > tcp_sendmsg+0x19/0x40 >#1: (fs_reclaim){+.+.}, at: [<84f438c2>] > fs_reclaim_acquire.part.102+0x5/0x30 > > stack backtrace: > CPU: 3 PID: 24800 Comm: sshd Not tainted 4.15.0-rc9-backup-debug+ #1 > Call Trace: >dump_stack+0xbc/0x13f >__lock_acquire+0xa09/0x2040 >lock_acquire+0x12e/0x350 >fs_reclaim_acquire.part.102+0x29/0x30 >kmem_cache_alloc+0x3d/0x2c0 >alloc_extent_state+0xa7/0x410 >__clear_extent_bit+0x3ea/0x570 >try_release_extent_mapping+0x21a/0x260 >__btrfs_releasepage+0xb0/0x1c0 >btrfs_releasepage+0x161/0x170 >try_to_release_page+0x162/0x1c0 >shrink_page_list+0x1d5a/0x2fb0 >shrink_inactive_list+0x451/0x940 >shrink_node_memcg.constprop.88+0x4c9/0x5e0 >shrink_node+0x12d/0x260 >try_to_free_pages+0x418/0xaf0 >__alloc_pages_slowpath+0x976/0x1790 >__alloc_pages_nodemask+0x52c/0x5c0 >new_slab+0x374/0x3f0 >___slab_alloc.constprop.81+0x47e/0x5a0 >__slab_alloc.constprop.80+0x32/0x60 >__kmalloc_track_caller+0x267/0x310 >__kmalloc_reserve.isra.40+0x29/0x80 >__alloc_skb+0xee/0x390 >sk_stream_alloc_skb+0xb8/0x340 >tcp_sendmsg_locked+0x8e6/0x1d30 >tcp_sendmsg+0x27/0x40 >inet_sendmsg+0xd0/0x310 >sock_write_iter+0x17a/0x240 >__vfs_write+0x2ab/0x380 >vfs_write+0xfb/0x260 >SyS_write+0xb6/0x140 >do_syscall_64+0x1e5/0xc05 >entry_SYSCALL64_slow_path+0x25/0x25 > > This warning is caused by commit d92a8cfcb37ecd13 ("locking/lockdep: Rework > FS_RECLAIM annotation") which replaced lockdep_set_current_reclaim_state()/ > lockdep_clear_current_reclaim_state() in __perform_reclaim() and > lockdep_trace_alloc() in slab_pre_alloc_hook() with fs_reclaim_acquire()/ > fs_reclaim_release(). Since __kmalloc_reserve() from __alloc_skb() adds > __GFP_NOMEMALLOC | __GFP_NOWARN to gfp_mask, and all reclaim path simply > propagates __GFP_NOMEMALLOC, fs_reclaim_acquire() in slab_pre_alloc_hook() > is trying to grab the 'fake' lock again when __perform_reclaim() already > grabbed the 'fake' lock. > > The > > /* this guy won't enter reclaim */ > if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) > return false; > > test which causes slab_pre_alloc_hook() to try to grab the 'fake' lock > was added by commit cf40bd16fdad42c0 ("lockdep: annotate reclaim context > (__GFP_NOFS)"). But that test is outdated because PF_MEMALLOC thread won't > enter reclaim regardless of __GFP_NOMEMALLOC after commit 341ce06f69abfafa > ("page allocator: calculate the alloc_flags for allocation only once") > added the PF_MEMALLOC safeguard ( > > /* Avoid recursion of direct reclaim */ > if (p->flags & PF_MEMALLOC) > goto nopage; > > in __alloc_pages_slowpath()). > > Thus, let's fix outdated test by removing __GFP_NOMEMALLOC test and allow > __need_fs_reclaim() to return false. > > Reported-and-tested-by: Dave Jones > Signed-off-by: Tetsuo Handa > Cc: Peter Zijlstra > Cc: Nick Piggin > --- > mm/page_alloc.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 81e18ce..19fb76b 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -3590,7 +3590,7 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) > return false; > > /* this guy won't enter reclaim */ > - if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) > + if (current->flags & PF_MEMALLOC) > return false; > > /* We're only interested __GFP_FS allocations for now */ > -- > 1.8.3.1 > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] lockdep: Fix fs_reclaim warning.
Nikolay Borisov wrote: > I think I've hit another incarnation of that one. The call stack is: > http://paste.opensuse.org/3f22d013 > > The cleaned up callstack of all the ? entries look like: > > __lock_acquire+0x2d8a/0x4b70 > lock_acquire+0x110/0x330 > kmem_cache_alloc+0x29/0x2c0 > __clear_extent_bit+0x488/0x800 > try_release_extent_mapping+0x288/0x3c0 > __btrfs_releasepage+0x6c/0x140 > shrink_page_list+0x227e/0x3110 > shrink_inactive_list+0x414/0xdb0 > shrink_node_memcg+0x7c8/0x1250 > shrink_node+0x2ae/0xb50 > do_try_to_free_pages+0x2b1/0xe20 > try_to_free_pages+0x205/0x570 > __alloc_pages_nodemask+0xb91/0x2160 > new_slab+0x27a/0x4e0 > ___slab_alloc+0x355/0x610 > __slab_alloc+0x4c/0xa0 > kmem_cache_alloc+0x22d/0x2c0 > mempool_alloc+0xe1/0x280 Yes, for mempool_alloc() is adding __GFP_NOMEMALLOC | __GFP_NOWARN to gfp_mask. gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ > bio_alloc_bioset+0x1d7/0x830 > ext4_mpage_readpages+0x99f/0x1000 <- > __do_page_cache_readahead+0x4be/0x840 > filemap_fault+0x8c8/0xfc0 > ext4_filemap_fault+0x7d/0xb0 > __do_fault+0x7a/0x150 > __handle_mm_fault+0x1542/0x29d0 > __do_page_fault+0x557/0xa30 > async_page_fault+0x4c/0x60 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2] lockdep: Fix fs_reclaim warning.
>From 361d37a7d36978020dfb4c11ec1f4800937ccb68 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 8 Feb 2018 10:35:35 +0900 Subject: [PATCH v2] lockdep: Fix fs_reclaim warning. Dave Jones reported fs_reclaim lockdep warnings. WARNING: possible recursive locking detected 4.15.0-rc9-backup-debug+ #1 Not tainted sshd/24800 is trying to acquire lock: (fs_reclaim){+.+.}, at: [<84f438c2>] fs_reclaim_acquire.part.102+0x5/0x30 but task is already holding lock: (fs_reclaim){+.+.}, at: [<84f438c2>] fs_reclaim_acquire.part.102+0x5/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 lock(fs_reclaim); lock(fs_reclaim); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by sshd/24800: #0: (sk_lock-AF_INET6){+.+.}, at: [<1a069652>] tcp_sendmsg+0x19/0x40 #1: (fs_reclaim){+.+.}, at: [<84f438c2>] fs_reclaim_acquire.part.102+0x5/0x30 stack backtrace: CPU: 3 PID: 24800 Comm: sshd Not tainted 4.15.0-rc9-backup-debug+ #1 Call Trace: dump_stack+0xbc/0x13f __lock_acquire+0xa09/0x2040 lock_acquire+0x12e/0x350 fs_reclaim_acquire.part.102+0x29/0x30 kmem_cache_alloc+0x3d/0x2c0 alloc_extent_state+0xa7/0x410 __clear_extent_bit+0x3ea/0x570 try_release_extent_mapping+0x21a/0x260 __btrfs_releasepage+0xb0/0x1c0 btrfs_releasepage+0x161/0x170 try_to_release_page+0x162/0x1c0 shrink_page_list+0x1d5a/0x2fb0 shrink_inactive_list+0x451/0x940 shrink_node_memcg.constprop.88+0x4c9/0x5e0 shrink_node+0x12d/0x260 try_to_free_pages+0x418/0xaf0 __alloc_pages_slowpath+0x976/0x1790 __alloc_pages_nodemask+0x52c/0x5c0 new_slab+0x374/0x3f0 ___slab_alloc.constprop.81+0x47e/0x5a0 __slab_alloc.constprop.80+0x32/0x60 __kmalloc_track_caller+0x267/0x310 __kmalloc_reserve.isra.40+0x29/0x80 __alloc_skb+0xee/0x390 sk_stream_alloc_skb+0xb8/0x340 tcp_sendmsg_locked+0x8e6/0x1d30 tcp_sendmsg+0x27/0x40 inet_sendmsg+0xd0/0x310 sock_write_iter+0x17a/0x240 __vfs_write+0x2ab/0x380 vfs_write+0xfb/0x260 SyS_write+0xb6/0x140 do_syscall_64+0x1e5/0xc05 entry_SYSCALL64_slow_path+0x25/0x25 This warning is caused by commit d92a8cfcb37ecd13 ("locking/lockdep: Rework FS_RECLAIM annotation") which replaced lockdep_set_current_reclaim_state()/ lockdep_clear_current_reclaim_state() in __perform_reclaim() and lockdep_trace_alloc() in slab_pre_alloc_hook() with fs_reclaim_acquire()/ fs_reclaim_release(). Since __kmalloc_reserve() from __alloc_skb() adds __GFP_NOMEMALLOC | __GFP_NOWARN to gfp_mask, and all reclaim path simply propagates __GFP_NOMEMALLOC, fs_reclaim_acquire() in slab_pre_alloc_hook() is trying to grab the 'fake' lock again when __perform_reclaim() already grabbed the 'fake' lock. The /* this guy won't enter reclaim */ if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) return false; test which causes slab_pre_alloc_hook() to try to grab the 'fake' lock was added by commit cf40bd16fdad42c0 ("lockdep: annotate reclaim context (__GFP_NOFS)"). But that test is outdated because PF_MEMALLOC thread won't enter reclaim regardless of __GFP_NOMEMALLOC after commit 341ce06f69abfafa ("page allocator: calculate the alloc_flags for allocation only once") added the PF_MEMALLOC safeguard ( /* Avoid recursion of direct reclaim */ if (p->flags & PF_MEMALLOC) goto nopage; in __alloc_pages_slowpath()). Thus, let's fix outdated test by removing __GFP_NOMEMALLOC test and allow __need_fs_reclaim() to return false. Reported-and-tested-by: Dave Jones Signed-off-by: Tetsuo Handa Cc: Peter Zijlstra Cc: Nick Piggin --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 81e18ce..19fb76b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3590,7 +3590,7 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) return false; /* this guy won't enter reclaim */ - if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + if (current->flags & PF_MEMALLOC) return false; /* We're only interested __GFP_FS allocations for now */ -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [4.15-rc9] fs_reclaim lockdep trace
Peter Zijlstra wrote: > On Mon, Jan 29, 2018 at 08:47:20PM +0900, Tetsuo Handa wrote: > > Peter Zijlstra wrote: > > > On Sun, Jan 28, 2018 at 02:55:28PM +0900, Tetsuo Handa wrote: > > > > This warning seems to be caused by commit d92a8cfcb37ecd13 > > > > ("locking/lockdep: Rework FS_RECLAIM annotation") which moved the > > > > location of > > > > > > > > /* this guy won't enter reclaim */ > > > > if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) > > > > return false; > > > > > > > > check added by commit cf40bd16fdad42c0 ("lockdep: annotate reclaim > > > > context > > > > (__GFP_NOFS)"). > > > > > > I'm not entirly sure I get what you mean here. How did I move it? It was > > > part of lockdep_trace_alloc(), if __GFP_NOMEMALLOC was set, it would not > > > mark the lock as held. > > > > d92a8cfcb37ecd13 replaced lockdep_set_current_reclaim_state() with > > fs_reclaim_acquire(), and removed current->lockdep_recursion handling. > > > > -- > > # git show d92a8cfcb37ecd13 | grep recursion > > -# define INIT_LOCKDEP .lockdep_recursion = 0, > > .lockdep_reclaim_gfp = 0, > > +# define INIT_LOCKDEP .lockdep_recursion = 0, > > unsigned intlockdep_recursion; > > - if (unlikely(current->lockdep_recursion)) > > - current->lockdep_recursion = 1; > > - current->lockdep_recursion = 0; > > -* context checking code. This tests GFP_FS recursion (a lock taken > > -- > > That should not matter at all. The only case that would matter for is if > lockdep itself would ever call into lockdep again. Not something that > happens here. > > > > The new code has it in fs_reclaim_acquire/release to the same effect, if > > > __GFP_NOMEMALLOC, we'll not acquire/release the lock. > > > > Excuse me, but I can't catch. > > We currently acquire/release __fs_reclaim_map if __GFP_NOMEMALLOC. > > Right, got the case inverted, same difference though. Before we'd do > mark_held_lock(), now we do acquire/release under the same conditions. > > > > > Since __kmalloc_reserve() from __alloc_skb() adds > > > > __GFP_NOMEMALLOC | __GFP_NOWARN to gfp_mask, __need_fs_reclaim() is > > > > failing to return false despite PF_MEMALLOC context (and resulted in > > > > lockdep warning). > > > > > > But that's correct right, __GFP_NOMEMALLOC should negate PF_MEMALLOC. > > > That's what the name says. > > > > __GFP_NOMEMALLOC negates PF_MEMALLOC regarding what watermark that > > allocation > > request should use. > > Right. > > > But at the same time, PF_MEMALLOC negates __GFP_DIRECT_RECLAIM. > > Ah indeed. > > > Then, how can fs_reclaim contribute to deadlock? > > Not sure it can. But if we're going to allow this, it needs to come with > a clear description on why. Not a few clues to a puzzle. > Let's decode Dave's report. -- stack backtrace: CPU: 3 PID: 24800 Comm: sshd Not tainted 4.15.0-rc9-backup-debug+ #1 Call Trace: dump_stack+0xbc/0x13f __lock_acquire+0xa09/0x2040 lock_acquire+0x12e/0x350 fs_reclaim_acquire.part.102+0x29/0x30 kmem_cache_alloc+0x3d/0x2c0 alloc_extent_state+0xa7/0x410 __clear_extent_bit+0x3ea/0x570 try_release_extent_mapping+0x21a/0x260 __btrfs_releasepage+0xb0/0x1c0 btrfs_releasepage+0x161/0x170 try_to_release_page+0x162/0x1c0 shrink_page_list+0x1d5a/0x2fb0 shrink_inactive_list+0x451/0x940 shrink_node_memcg.constprop.88+0x4c9/0x5e0 shrink_node+0x12d/0x260 try_to_free_pages+0x418/0xaf0 __alloc_pages_slowpath+0x976/0x1790 __alloc_pages_nodemask+0x52c/0x5c0 new_slab+0x374/0x3f0 ___slab_alloc.constprop.81+0x47e/0x5a0 __slab_alloc.constprop.80+0x32/0x60 __kmalloc_track_caller+0x267/0x310 __kmalloc_reserve.isra.40+0x29/0x80 __alloc_skb+0xee/0x390 sk_stream_alloc_skb+0xb8/0x340 -- struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, bool force_schedule) { skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp) = { // gfp == GFP_KERNEL static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { // priority == GFP_KERNEL return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE) = { data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc) = { // gfp_mask == GFP_KERNEL obj = kmalloc_node_track_caller(size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node) =
Re: OOM: Better, but still there on
Nils Holland wrote: > Well, the issue is that I could only do everything via ssh today and > don't have any physical access to the machines. In fact, both seem to > have suffered a genuine kernel panic, which is also visible in the > last few lines of the log I provided today. So, basically, both > machines are now sitting at my home in panic state and I'll only be > able to resurrect them wheh I'm physically there again tonight. # echo 10 > /proc/sys/kernel/panic -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: OOM: Better, but still there on
Michal Hocko wrote: > TL;DR > there is another version of the debugging patch. Just revert the > previous one and apply this one instead. It's still not clear what > is going on but I suspect either some misaccounting or unexpeted > pages on the LRU lists. I have added one more tracepoint, so please > enable also mm_vmscan_inactive_list_is_low. > > Hopefully the additional data will tell us more. > > On Tue 20-12-16 03:08:29, Nils Holland wrote: > > On Mon, Dec 19, 2016 at 02:45:34PM +0100, Michal Hocko wrote: > > > > > Unfortunatelly shrink_active_list doesn't have any tracepoint so we do > > > not know whether we managed to rotate those pages. If they are referenced > > > quickly enough we might just keep refaulting them... Could you try to > > > apply > > > the followin diff on top what you have currently. It should add some more > > > tracepoint data which might tell us more. We can reduce the amount of > > > tracing data by enabling only mm_vmscan_lru_isolate, > > > mm_vmscan_lru_shrink_inactive and mm_vmscan_lru_shrink_active. > > > > So, the results are in! I applied your patch and rebuild the kernel, > > then I rebooted the machine, set up tracing so that only the three > > events you mentioned were being traced, and captured the output over > > the network. > > > > Things went a bit different this time: The trace events started to > > appear after a while and a whole lot of them were generated, but > > suddenly they stopped. A short while later, we get "cat /debug/trace/trace_pipe > /dev/udp/$ip/$port" stops reporting if /bin/cat is disturbed by page fault and/or memory allocation needed for sending UDP packets. Since netconsole can send UDP packets without involving memory allocation, printk() is preferable than tracing under OOM. > > It is possible that you are hitting multiple issues so it would be > great to focus at one at the time. The underlying problem might be > same/similar in the end but this is hard to tell now. Could you try to > reproduce and provide data for the OOM killer situation as well? > > > [ 1661.485568] btrfs-transacti: page alloction stalls for 611058ms, > > order:0, mode:0x2420048(GFP_NOFS|__GFP_HARDWALL|__GFP_MOVABLE) > > > > along with a backtrace and memory information, and then there was > > silence. > > > When I walked up to the machine, it had completely died; it > > wouldn't turn on its screen on key press any more, blindly trying to > > reboot via SysRequest had no effect, but the caps lock LED also wasn't > > blinking, like it normally does when a kernel panic occurs. Good > > question what state it was in. The OOM reaper didn't really seem to > > kick in and kill processes this time, it seems. > > > > The complete capture is up at: > > > > http://ftp.tisys.org/pub/misc/teela_2016-12-20.log.xz > > This is the stall report: > [ 1661.485568] btrfs-transacti: page alloction stalls for 611058ms, order:0, > mode:0x2420048(GFP_NOFS|__GFP_HARDWALL|__GFP_MOVABLE) > [ 1661.485859] CPU: 1 PID: 1950 Comm: btrfs-transacti Not tainted > 4.9.0-gentoo #4 > > pid 1950 is trying to allocate for a _long_ time. Considering that this > is the only stall report, this means that reclaim took really long so we > didn't get to the page allocator for that long. It sounds really crazy! warn_alloc() reports only if !__GFP_NOWARN. We can report where they were looping using kmallocwd at http://lkml.kernel.org/r/1478416501-10104-1-git-send-email-penguin-ker...@i-love.sakura.ne.jp (and extend it to call printk() for reporting values using SystemTap which your trace hooks would report, only during memory allocations are stalling, without delay caused by page fault and/or memory allocation needed for sending UDP packets). But if trying to reboot via SysRq-b did not work, I think that the system was in hard lockup state. That would be a different problem. By the way, Michal, I'm feeling strange because it seems to me that your analysis does not refer to the implications of "x86_32 kernel". Maybe you already referred x86_32 by "they are from the highmem zone" though. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: OOM: Better, but still there on
Nils Holland wrote: > On Sat, Dec 17, 2016 at 11:44:45PM +0900, Tetsuo Handa wrote: > > On 2016/12/17 21:59, Nils Holland wrote: > > > On Sat, Dec 17, 2016 at 01:02:03AM +0100, Michal Hocko wrote: > > >> mount -t tracefs none /debug/trace > > >> echo 1 > /debug/trace/events/vmscan/enable > > >> cat /debug/trace/trace_pipe > trace.log > > >> > > >> should help > > >> [...] > > > > > > No problem! I enabled writing the trace data to a file and then tried > > > to trigger another OOM situation. That worked, this time without a > > > complete kernel panic, but with only my processes being killed and the > > > system becoming unresponsive. > > > > Under OOM situation, writing to a file on disk unlikely works. Maybe > > logging via network ( "cat /debug/trace/trace_pipe > /dev/udp/$ip/$port" > > if your are using bash) works better. (I wish we can do it from kernel > > so that /bin/cat is not disturbed by delays due to page fault.) > > > > If you can configure netconsole for logging OOM killer messages and > > UDP socket for logging trace_pipe messages, udplogger at > > https://osdn.net/projects/akari/scm/svn/tree/head/branches/udplogger/ > > might fit for logging both output with timestamp into a single file. > > Actually, I decided to give this a try once more on machine #2, i.e. > not the one that produced the previous trace, but the other one. > > I logged via netconsole as well as 'cat /debug/trace/trace_pipe' via > the network to another machine running udplogger. After the machine > had been frehsly booted and I had set up the logging, unpacking of the > firefox source tarball started. After it had been unpacking for a > while, the first load of trace messages started to appear. Some time > later, OOMs started to appear - I've got quite a lot of them in my > capture file this time. Thank you for capturing. I think it worked well. Let's wait for Michal. The first OOM killer invocation was 2016-12-17 21:36:56 192.168.17.23:6665 [ 1276.828639] Killed process 3894 (xz) total-vm:68640kB, anon-rss:65920kB, file-rss:1696kB, shmem-rss:0kB and the last OOM killer invocation was 2016-12-17 21:39:27 192.168.17.23:6665 [ 1426.800677] Killed process 3070 (screen) total-vm:7440kB, anon-rss:960kB, file-rss:2360kB, shmem-rss:0kB and trace output was sent until 2016-12-17 21:37:07 192.168.17.23:48468 kworker/u4:4-3896 [000] 1287.202958: mm_shrink_slab_start: super_cache_scan+0x0/0x170 f4436ed4: nid: 0 objects to shrink 86 gfp_flags GFP_NOFS|__GFP_NOFAIL pgs_scanned 32 lru_pgs 406078 cache items 412 delta 0 total_scan 86 which (I hope) should be sufficient for analysis. > > Unfortunately, the reclaim trace messages stopped a while after the first > OOM messages show up - most likely my "cat" had been killed at that > point or became unresponsive. :-/ > > In the end, the machine didn't completely panic, but after nothing new > showed up being logged via the network, I walked up to the > machine and found it in a state where I couldn't really log in to it > anymore, but all that worked was, as always, a magic SysRequest reboot. There is a known issue (since Linux 2.6.32) that all memory allocation requests get stuck due to kswapd v.s. shrink_inactive_list() livelock which occurs under almost OOM situation ( http://lkml.kernel.org/r/20160211225929.GU14668@dastard ). If we hit it, even "page allocation stalls for " messages do not show up. Even if we didn't hit it, although agetty and sshd were still alive 2016-12-17 21:39:27 192.168.17.23:6665 [ 1426.800614] [ 2800] 0 2800 1152 494 6 30 0 agetty 2016-12-17 21:39:27 192.168.17.23:6665 [ 1426.800618] [ 2802] 0 2802 1457 1055 6 30 -1000 sshd memory allocation was delaying too much 2016-12-17 21:41:03 192.168.17.23:6665 [ 1521.034624] btrfs-transacti: page alloction stalls for 93995ms, order:0, mode:0x2400840(GFP_NOFS|__GFP_NOFAIL) 2016-12-17 21:41:03 192.168.17.23:6665 [ 1521.034628] CPU: 1 PID: 1949 Comm: btrfs-transacti Not tainted 4.9.0-gentoo #3 2016-12-17 21:41:03 192.168.17.23:6665 [ 1521.034630] Hardware name: Hewlett-Packard Compaq 15 Notebook PC/21F7, BIOS F.22 08/06/2014 2016-12-17 21:41:03 192.168.17.23:6665 [ 1521.034638] f162f94c c142bd8e 0001 f162f970 c110ad7e c1b58833 02400840 2016-12-17 21:41:03 192.168.17.23:6665 [ 1521.034645] f162f978 f162f980 c1b55814 f162f960 0160 f162fa38 c110b78c 02400840 2016-12-17 21:41:03 192.168.17.23:6665 [ 1521.034652] c1b55814 00016f2b 0040 f21d f21d 0001 2016-12-17 21:41:03 192.168.17.23:6665 [ 1521.03465
Re: OOM: Better, but still there on
On 2016/12/17 21:59, Nils Holland wrote: > On Sat, Dec 17, 2016 at 01:02:03AM +0100, Michal Hocko wrote: >> mount -t tracefs none /debug/trace >> echo 1 > /debug/trace/events/vmscan/enable >> cat /debug/trace/trace_pipe > trace.log >> >> should help >> [...] > > No problem! I enabled writing the trace data to a file and then tried > to trigger another OOM situation. That worked, this time without a > complete kernel panic, but with only my processes being killed and the > system becoming unresponsive. When that happened, I let it run for > another minute or two so that in case it was still logging something > to the trace file, it could continue to do so some time longer. Then I > rebooted with the only thing that still worked, i.e. by means of magic > SysRequest. Under OOM situation, writing to a file on disk unlikely works. Maybe logging via network ( "cat /debug/trace/trace_pipe > /dev/udp/$ip/$port" if your are using bash) works better. (I wish we can do it from kernel so that /bin/cat is not disturbed by delays due to page fault.) If you can configure netconsole for logging OOM killer messages and UDP socket for logging trace_pipe messages, udplogger at https://osdn.net/projects/akari/scm/svn/tree/head/branches/udplogger/ might fit for logging both output with timestamp into a single file. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] mm, oom: do not enfore OOM killer for __GFP_NOFAIL automatically
Michal Hocko wrote: > On Fri 16-12-16 12:31:51, Johannes Weiner wrote: >>> @@ -3737,6 +3752,16 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int >>> order, >>> */ >>> WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); >>> >>> + /* >>> +* Help non-failing allocations by giving them access to memory >>> +* reserves but do not use ALLOC_NO_WATERMARKS because this >>> +* could deplete whole memory reserves which would just make >>> +* the situation worse >>> +*/ >>> + page = __alloc_pages_cpuset_fallback(gfp_mask, order, >>> ALLOC_HARDER, ac); >>> + if (page) >>> + goto got_pg; >>> + >> >> But this should be a separate patch, IMO. >> >> Do we observe GFP_NOFS lockups when we don't do this? > > this is hard to tell but considering users like grow_dev_page we can get > stuck with a very slow progress I believe. Those allocations could see > some help. > >> Don't we risk >> premature exhaustion of the memory reserves, and it's better to wait >> for other reclaimers to make some progress instead? > > waiting for other reclaimers would be preferable but we should at least > give these some priority, which is what ALLOC_HARDER should help with. > >> Should we give >> reserve access to all GFP_NOFS allocations, or just the ones from a >> reclaim/cleaning context? > > I would focus only for those which are important enough. Which are those > is a harder question. But certainly those with GFP_NOFAIL are important > enough. > >> All that should go into the changelog of a separate allocation booster >> patch, I think. > > The reason I did both in the same patch is to address the concern about > potential lockups when NOFS|NOFAIL cannot make any progress. I've chosen > ALLOC_HARDER to give the minimum portion of the reserves so that we do > not risk other high priority users to be blocked out but still help a > bit at least and prevent from starvation when other reclaimers are > faster to consume the reclaimed memory. > > I can extend the changelog of course but I believe that having both > changes together makes some sense. NOFS|NOFAIL allocations are not all > that rare and sometimes we really depend on them making a further > progress. > I feel that allowing access to memory reserves based on __GFP_NOFAIL might not make sense. My understanding is that actual I/O operation triggered by I/O requests by filesystem code are processed by other threads. Even if we grant access to memory reserves to GFP_NOFS | __GFP_NOFAIL allocations by fs code, I think that it is possible that memory allocations by underlying bio code fails to make a further progress unless memory reserves are granted as well. Below is a typical trace which I observe under OOM lockuped situation (though this trace is from an OOM stress test using XFS). [ 1845.187246] MemAlloc: kworker/2:1(14498) flags=0x4208060 switches=323636 seq=48 gfp=0x240(GFP_NOIO) order=0 delay=430400 uninterruptible [ 1845.187248] kworker/2:1 D12712 14498 2 0x0080 [ 1845.187251] Workqueue: events_freezable_power_ disk_events_workfn [ 1845.187252] Call Trace: [ 1845.187253] ? __schedule+0x23f/0xba0 [ 1845.187254] schedule+0x38/0x90 [ 1845.187255] schedule_timeout+0x205/0x4a0 [ 1845.187256] ? del_timer_sync+0xd0/0xd0 [ 1845.187257] schedule_timeout_uninterruptible+0x25/0x30 [ 1845.187258] __alloc_pages_nodemask+0x1035/0x10e0 [ 1845.187259] ? alloc_request_struct+0x14/0x20 [ 1845.187261] alloc_pages_current+0x96/0x1b0 [ 1845.187262] ? bio_alloc_bioset+0x20f/0x2e0 [ 1845.187264] bio_copy_kern+0xc4/0x180 [ 1845.187265] blk_rq_map_kern+0x6f/0x120 [ 1845.187268] __scsi_execute.isra.23+0x12f/0x160 [ 1845.187270] scsi_execute_req_flags+0x8f/0x100 [ 1845.187271] sr_check_events+0xba/0x2b0 [sr_mod] [ 1845.187274] cdrom_check_events+0x13/0x30 [cdrom] [ 1845.187275] sr_block_check_events+0x25/0x30 [sr_mod] [ 1845.187276] disk_check_events+0x5b/0x150 [ 1845.187277] disk_events_workfn+0x17/0x20 [ 1845.187278] process_one_work+0x1fc/0x750 [ 1845.187279] ? process_one_work+0x167/0x750 [ 1845.187279] worker_thread+0x126/0x4a0 [ 1845.187280] kthread+0x10a/0x140 [ 1845.187281] ? process_one_work+0x750/0x750 [ 1845.187282] ? kthread_create_on_node+0x60/0x60 [ 1845.187283] ret_from_fork+0x2a/0x40 I think that this GFP_NOIO allocation request needs to consume more memory reserves than GFP_NOFS allocation request to make progress. Do we want to add __GFP_NOFAIL to this GFP_NOIO allocation request in order to allow access to memory reserves as well as GFP_NOFS | __GFP_NOFAIL allocation request? -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Bug 186671] New: OOM on system with just rsync running 32GB of ram 30GB of pagecache
On 2016/11/18 6:49, Vlastimil Babka wrote: > On 11/16/2016 02:39 PM, E V wrote: >> System panic'd overnight running 4.9rc5 & rsync. Attached a photo of >> the stack trace, and the 38 call traces in a 2 minute window shortly >> before, to the bugzilla case for those not on it's e-mail list: >> >> https://bugzilla.kernel.org/show_bug.cgi?id=186671 > > The panic screenshot has only the last part, but the end marker says > it's OOM with no killable processes. The DEBUG_VM config thus didn't > trigger anything, and still there's tons of pagecache, mostly clean, > that's not being reclaimed. > > Could you now try this? > - enable CONFIG_PAGE_OWNER > - boot with kernel option: page_owner=on > - after the first oom, "cat /sys/kernel/debug/page_owner > file" > - provide the file (compressed, it will be quite large) Excuse me for a noise, but do we really need to do "cat /sys/kernel/debug/page_owner > file" after the first OOM killer invocation? I worry that it might be too difficult to do. Shouldn't we rather do "cat /sys/kernel/debug/page_owner > file" hourly and compare tendency between the latest one and previous one? This system has swap, and /var/log/messages before panic reports that swapin was stalling at memory allocation. [130346.262510] dsm_sa_datamgrd: page allocation stalls for 52400ms, order:0, mode:0x24200ca(GFP_HIGHUSER_MOVABLE) [130346.262572] CPU: 1 PID: 3622 Comm: dsm_sa_datamgrd Tainted: GW I 4.9.0-rc5 #2 [130346.262662] 8129ba69 8170e4c8 c90003ccb8d8 [130346.262714] 8113449f 024200ca1ca11b40 8170e4c8 c90003ccb880 [130346.262765] 0010 c90003ccb8e8 c90003ccb898 88041f226e80 [130346.262817] Call Trace: [130346.262843] [] ? dump_stack+0x46/0x5d [130346.262872] [] ? warn_alloc+0x11f/0x140 [130346.262899] [] ? __alloc_pages_slowpath+0x84b/0xa80 [130346.262929] [] ? __alloc_pages_nodemask+0x2b0/0x2f0 [130346.262960] [] ? alloc_pages_vma+0xbe/0x260 [130346.262989] [] ? pagecache_get_page+0x22/0x280 [130346.263019] [] ? __read_swap_cache_async+0x118/0x1a0 [130346.263048] [] ? read_swap_cache_async+0xf/0x30 [130346.263077] [] ? swapin_readahead+0x16e/0x1c0 [130346.263106] [] ? radix_tree_lookup_slot+0xe/0x20 [130346.263135] [] ? find_get_entry+0x14/0x130 [130346.263162] [] ? pagecache_get_page+0x22/0x280 [130346.263193] [] ? do_swap_page+0x44f/0x5f0 [130346.263220] [] ? __radix_tree_lookup+0x62/0xc0 [130346.263249] [] ? handle_mm_fault+0x66a/0xf00 [130346.263277] [] ? find_get_entry+0x14/0x130 [130346.263305] [] ? __do_page_fault+0x1c5/0x490 [130346.263336] [] ? page_fault+0x22/0x30 [130346.263364] [] ? copy_user_generic_string+0x2c/0x40 [130346.263395] [] ? set_fd_set+0x1d/0x30 [130346.263422] [] ? core_sys_select+0x1a5/0x260 [130346.263450] [] ? getname_flags+0x6a/0x1e0 [130346.263479] [] ? cp_new_stat+0x115/0x130 [130346.263509] [] ? ktime_get_ts64+0x3f/0xf0 [130346.263537] [] ? SyS_select+0xa5/0xe0 [130346.263564] [] ? entry_SYSCALL_64_fastpath+0x13/0x94 Under such situation, trying to login and execute /bin/cat could take minutes. Also, writing to btrfs and ext4 seems to be stalling. The btrfs one is a situation where WQ_MEM_RECLAIM kernel workqueue is unable to make progress. [130420.008231] kworker/u34:21: page allocation stalls for 35028ms, order:0, mode:0x2400840(GFP_NOFS|__GFP_NOFAIL) [130420.008287] CPU: 5 PID: 24286 Comm: kworker/u34:21 Tainted: GW I 4.9.0-rc5 #2 [130420.008401] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs] [130420.008432] 8129ba69 8170e4c8 c900087836a0 [130420.008483] 8113449f 024008401e3f1b40 8170e4c8 c90008783648 [130420.008534] 0010 c900087836b0 c90008783660 88041ecc4340 [130420.008586] Call Trace: [130420.008611] [] ? dump_stack+0x46/0x5d [130420.008640] [] ? warn_alloc+0x11f/0x140 [130420.008667] [] ? __alloc_pages_slowpath+0x84b/0xa80 [130420.008707] [] ? search_bitmap+0xc2/0x140 [btrfs] [130420.008736] [] ? __alloc_pages_nodemask+0x2b0/0x2f0 [130420.008766] [] ? alloc_pages_current+0x8a/0x110 [130420.008796] [] ? pagecache_get_page+0xec/0x280 [130420.008836] [] ? alloc_extent_buffer+0x108/0x430 [btrfs] [130420.008875] [] ? btrfs_alloc_tree_block+0x118/0x4d0 [btrfs] [130420.008927] [] ? __btrfs_cow_block+0x148/0x5d0 [btrfs] [130420.008964] [] ? btrfs_cow_block+0x114/0x1d0 [btrfs] [130420.009001] [] ? btrfs_search_slot+0x206/0xa40 [btrfs] [130420.009039] [] ? lookup_inline_extent_backref+0xd9/0x620 [btrfs] [130420.009095] [] ? set_extent_bit+0x24/0x30 [btrfs] [130420.009124] [] ? kmem_cache_alloc+0x17f/0x1b0 [130420.009161] [] ? __btrfs_free_extent.isra.69+0xef/0xd10 [btrfs] [130420.009215] [] ? btrfs_merge_delayed_refs+0x56/0x6f0 [btrfs] [130420.009269] [] ? __btrfs_run_delayed_ref