Re: x86: Hardware breakpoints are not always triggered
On Thu, Jan 28, 2016 at 10:33:28PM +0100, Paolo Bonzini wrote: > > > On 28/01/2016 09:31, Andrey Wagin wrote: > > I tried to print drX registers after a break-point. Looks like they > > are set correctly. > > Can you try this KVM patch? Looks like it fixes a case when reproducers are running only in VM. If I execute the reproducer a few times on the host and then execute it in VM, it exits very fast. Thanks, Andrew > > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > index c13a64b7d789..32bae1c70a50 100644 > --- a/arch/x86/kvm/svm.c > +++ b/arch/x86/kvm/svm.c > @@ -1612,6 +1612,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu > *vcpu) > vcpu->arch.dr7 = svm->vmcb->save.dr7; > > vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; > + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; > set_dr_intercepts(svm); > } > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index e2951b6edbbc..505a4663b9f4 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -5659,6 +5659,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu > *vcpu) > vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); > > vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; > + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; > > cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); > cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; > > Paolo
x86: Hardware breakpoints are not always triggered
Hi, We use hardware breakpoints in CRIU and we found that sometimes we set a break-point, but a process doesn't stop on it. I write a small reproducer for this bug. It create two processes, where a parent process traces a child. The parent process sets a break-point and each time when the child stop on it, the parent sets the variable "xxx" to A in a child process. The child runs an infinite loop, where it check the variable "xxx" and sets it to B. If a child process finds that xxx is equal to B, it exits with a non-zero code, what means that a break-point was not triggered. The source code is attached. The reproducer uses a different break-point address if it is executed with arguments than when it executed without arguments. Then I made a few experiments. The bug is triggered, if we execute this program a few times in a KVM virtual machine. [root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint > /dev/null || { echo "FAIL - $?"; break; }; done ) & [3] 4088 [root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint > /dev/null || { echo "FAIL - $?"; break; }; done ) & [4] 4091 [root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint 1 2 > /dev/null || { echo "FAIL - $?"; break; }; done ) & [5] 4094 [root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint 1 2 > /dev/null || { echo "FAIL - $?"; break; }; done ) & [6] 4097 [8] 4103 [root@fc22-vm ptrace]# 0087: exit - 5 0131: exited, status=1 0126: wait: No child processes FAIL - 3 I tried to execute the reproducer on the host (where kvm VM-s are running), but the bug was not triggered during one hour. When I executed the reproducer in VM without stopping processes on the host, I found that a bug is triggered much faster in this case. [root@fc22-vm ptrace]# ./ptrace_breakpoint 1 stop 24675 cont child2 1 stop 24676 cont child2 1 child2 5 0088: exit - 5 stop 24677 0132: exited, status=1 cont 0127: wait: No child processes I know that this bug can be reproduced starting with the 4.2 kernel. I haven't test older versions of the kernel. I tried to print drX registers after a break-point. Looks like they are set correctly. Maybe someone has any ideas where a problem is or how it can be investigated. Here is a criu issue for this problem: https://github.com/xemul/criu/issues/107 Thanks, Andrew #define _GNU_SOURCE /* See feature_test_macros(7) */ #include/* For SYS_xxx definitions */ #include #include #include #include #include #include #include #include #include #include #include /* Copied from the gdb header gdb/nat/x86-dregs.h */ /* Debug registers' indices. */ #define DR_FIRSTADDR 0 #define DR_LASTADDR 3 #define DR_NADDR 4 /* The number of debug address registers. */ #define DR_STATUS6 /* Index of debug status register (DR6). */ #define DR_CONTROL 7 /* Index of debug control register (DR7). */ #define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit. */ #define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit. */ #define DR_ENABLE_SIZE 2 /* Two enable bits per debug register. */ /* Locally enable the break/watchpoint in the I'th debug register. */ #define X86_DR_LOCAL_ENABLE(i) (1 << (DR_LOCAL_ENABLE_SHIFT + DR_ENABLE_SIZE * (i))) # define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) #define pr_perror(fmt, ...) \ fprintf(stderr, "%04d: " fmt ": %s\n", __LINE__, ##__VA_ARGS__, strerror(errno)) #define pr_err(fmt, ...) \ fprintf(stderr, "%04d: " fmt "\n", __LINE__, ##__VA_ARGS__) int ptrace_set_breakpoint(pid_t pid, void *addr) { int ret; /* Set a breakpoint */ if (ptrace(PTRACE_POKEUSER, pid, offsetof(struct user, u_debugreg[DR_FIRSTADDR]), addr)) { pr_perror("Unable to setup a breakpoint into %d", pid); return -1; } /* Enable the breakpoint */ if (ptrace(PTRACE_POKEUSER, pid, offsetof(struct user, u_debugreg[DR_CONTROL]), X86_DR_LOCAL_ENABLE(DR_FIRSTADDR))) { pr_perror("Unable to enable the breakpoint for %d", pid); return -1; } ret = ptrace(PTRACE_CONT, pid, NULL, NULL); if (ret) { pr_perror("Unable to restart the stopped tracee process %d", pid); return -1; } return 1; } static long xxx = -1; int child() { printf("child %d\n", xxx); syscall(__NR_getppid); if (xxx == 5) { pr_err("exit - %d", xxx); exit(1); } if (xxx > 0) xxx = 5; return 0; } int child2() { printf("child2 %d\n", xxx); syscall(__NR_getppid); if (xxx == 5) { pr_err("exit - %d", xxx); exit(1); } if (xxx > 0) xxx = 5; return 0; } int main(int argc, char **argv) { pid_t pid; int status, i = 0; int (*addr)(); if (argc > 1) addr = child2; else addr = child; pid = fork();
Re: x86: Hardware breakpoints are not always triggered
On Thu, Jan 28, 2016 at 10:33:28PM +0100, Paolo Bonzini wrote: > > > On 28/01/2016 09:31, Andrey Wagin wrote: > > I tried to print drX registers after a break-point. Looks like they > > are set correctly. > > Can you try this KVM patch? Looks like it fixes a case when reproducers are running only in VM. If I execute the reproducer a few times on the host and then execute it in VM, it exits very fast. Thanks, Andrew > > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > index c13a64b7d789..32bae1c70a50 100644 > --- a/arch/x86/kvm/svm.c > +++ b/arch/x86/kvm/svm.c > @@ -1612,6 +1612,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu > *vcpu) > vcpu->arch.dr7 = svm->vmcb->save.dr7; > > vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; > + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; > set_dr_intercepts(svm); > } > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index e2951b6edbbc..505a4663b9f4 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -5659,6 +5659,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu > *vcpu) > vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); > > vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; > + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; > > cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); > cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; > > Paolo
x86: Hardware breakpoints are not always triggered
Hi, We use hardware breakpoints in CRIU and we found that sometimes we set a break-point, but a process doesn't stop on it. I write a small reproducer for this bug. It create two processes, where a parent process traces a child. The parent process sets a break-point and each time when the child stop on it, the parent sets the variable "xxx" to A in a child process. The child runs an infinite loop, where it check the variable "xxx" and sets it to B. If a child process finds that xxx is equal to B, it exits with a non-zero code, what means that a break-point was not triggered. The source code is attached. The reproducer uses a different break-point address if it is executed with arguments than when it executed without arguments. Then I made a few experiments. The bug is triggered, if we execute this program a few times in a KVM virtual machine. [root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint > /dev/null || { echo "FAIL - $?"; break; }; done ) & [3] 4088 [root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint > /dev/null || { echo "FAIL - $?"; break; }; done ) & [4] 4091 [root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint 1 2 > /dev/null || { echo "FAIL - $?"; break; }; done ) & [5] 4094 [root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint 1 2 > /dev/null || { echo "FAIL - $?"; break; }; done ) & [6] 4097 [8] 4103 [root@fc22-vm ptrace]# 0087: exit - 5 0131: exited, status=1 0126: wait: No child processes FAIL - 3 I tried to execute the reproducer on the host (where kvm VM-s are running), but the bug was not triggered during one hour. When I executed the reproducer in VM without stopping processes on the host, I found that a bug is triggered much faster in this case. [root@fc22-vm ptrace]# ./ptrace_breakpoint 1 stop 24675 cont child2 1 stop 24676 cont child2 1 child2 5 0088: exit - 5 stop 24677 0132: exited, status=1 cont 0127: wait: No child processes I know that this bug can be reproduced starting with the 4.2 kernel. I haven't test older versions of the kernel. I tried to print drX registers after a break-point. Looks like they are set correctly. Maybe someone has any ideas where a problem is or how it can be investigated. Here is a criu issue for this problem: https://github.com/xemul/criu/issues/107 Thanks, Andrew #define _GNU_SOURCE /* See feature_test_macros(7) */ #include/* For SYS_xxx definitions */ #include #include #include #include #include #include #include #include #include #include #include /* Copied from the gdb header gdb/nat/x86-dregs.h */ /* Debug registers' indices. */ #define DR_FIRSTADDR 0 #define DR_LASTADDR 3 #define DR_NADDR 4 /* The number of debug address registers. */ #define DR_STATUS6 /* Index of debug status register (DR6). */ #define DR_CONTROL 7 /* Index of debug control register (DR7). */ #define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit. */ #define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit. */ #define DR_ENABLE_SIZE 2 /* Two enable bits per debug register. */ /* Locally enable the break/watchpoint in the I'th debug register. */ #define X86_DR_LOCAL_ENABLE(i) (1 << (DR_LOCAL_ENABLE_SHIFT + DR_ENABLE_SIZE * (i))) # define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) #define pr_perror(fmt, ...) \ fprintf(stderr, "%04d: " fmt ": %s\n", __LINE__, ##__VA_ARGS__, strerror(errno)) #define pr_err(fmt, ...) \ fprintf(stderr, "%04d: " fmt "\n", __LINE__, ##__VA_ARGS__) int ptrace_set_breakpoint(pid_t pid, void *addr) { int ret; /* Set a breakpoint */ if (ptrace(PTRACE_POKEUSER, pid, offsetof(struct user, u_debugreg[DR_FIRSTADDR]), addr)) { pr_perror("Unable to setup a breakpoint into %d", pid); return -1; } /* Enable the breakpoint */ if (ptrace(PTRACE_POKEUSER, pid, offsetof(struct user, u_debugreg[DR_CONTROL]), X86_DR_LOCAL_ENABLE(DR_FIRSTADDR))) { pr_perror("Unable to enable the breakpoint for %d", pid); return -1; } ret = ptrace(PTRACE_CONT, pid, NULL, NULL); if (ret) { pr_perror("Unable to restart the stopped tracee process %d", pid); return -1; } return 1; } static long xxx = -1; int child() { printf("child %d\n", xxx); syscall(__NR_getppid); if (xxx == 5) { pr_err("exit - %d", xxx); exit(1); } if (xxx > 0) xxx = 5; return 0; } int child2() { printf("child2 %d\n", xxx); syscall(__NR_getppid); if (xxx == 5) { pr_err("exit - %d", xxx); exit(1); } if (xxx > 0) xxx = 5; return 0; } int main(int argc, char **argv) { pid_t pid; int status, i = 0; int (*addr)(); if (argc > 1) addr = child2; else addr = child; pid = fork();
Re: WARNING: static_key_slow_dec used before call to jump_label_init
2015-09-24 9:57 GMT+03:00 Andrey Wagin : > Hello, > > I booted kernel with cgroup_disable=cpu and get this warning: > > [0.00] Kernel command line: > BOOT_IMAGE=/boot/vmlinuz-4.3.0-rc2-next-20150923 no_timer_check > console=ttyS0,115200n8 root=UUID=01bc7316-b1f4-45c9-a23a-0 [0.00] Kernel command line: BOOT_IMAGE=/boot/vmlinuz-4.3.0-rc2-next-20150923 no_timer_check console=ttyS0,115200n8 root=UUID=01bc7316-b1f4-45c9-a23a-00c5a2336ef2 console=tty1 ro rhgb quiet LANG=en_US.UTF-8 initrd=/boot/initramfs-4.3.0-rc2-next-20150923.img debug cgroup_disable=cpu You have new mail in /var/spool/mail/root > [0.00] [ cut here ] > [0.00] WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:99 > static_key_slow_dec+0x44/0x60() > [0.00] static_key_slow_dec used before call to jump_label_init > [0.00] Modules linked in: > > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted > 4.3.0-rc2-next-20150924 #1 > [0.00] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 > [0.00] ebd8af5cfcdffb28 81c03da8 > 813b18c2 > [0.00] 81c03df0 81c03de0 8108dd52 > 81c5bad0 > [0.00] 88003ffe663a 81c49a00 0001 > 88003ffe662b > [0.00] Call Trace: > [0.00] [] dump_stack+0x44/0x62 > [0.00] [] warn_slowpath_common+0x82/0xc0 > [0.00] [] warn_slowpath_fmt+0x5c/0x80 > [0.00] [] static_key_slow_dec+0x44/0x60 > [0.00] [] cgroup_disable+0xaf/0xd6 > [0.00] [] unknown_bootoption+0x8c/0x194 > [0.00] [] parse_args+0x273/0x4a0 > [0.00] [] ? printk+0x57/0x73 > [0.00] [] start_kernel+0x205/0x4b8 > [0.00] [] ? set_init_arg+0x55/0x55 > [0.00] [] ? early_idt_handler_array+0x120/0x120 > [0.00] [] x86_64_start_reservations+0x2a/0x2c > [0.00] [] x86_64_start_kernel+0x14a/0x16d > [0.00] ---[ end trace 8a774ae3ff020690 ]--- > > Thanks, > Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
WARNING: static_key_slow_dec used before call to jump_label_init
Hello, I booted kernel with cgroup_disable=cpu and get this warning: [0.00] Kernel command line: BOOT_IMAGE=/boot/vmlinuz-4.3.0-rc2-next-20150923 no_timer_check console=ttyS0,115200n8 root=UUID=01bc7316-b1f4-45c9-a23a-0 [0.00] [ cut here ] [0.00] WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:99 static_key_slow_dec+0x44/0x60() [0.00] static_key_slow_dec used before call to jump_label_init [0.00] Modules linked in: [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 4.3.0-rc2-next-20150924 #1 [0.00] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [0.00] ebd8af5cfcdffb28 81c03da8 813b18c2 [0.00] 81c03df0 81c03de0 8108dd52 81c5bad0 [0.00] 88003ffe663a 81c49a00 0001 88003ffe662b [0.00] Call Trace: [0.00] [] dump_stack+0x44/0x62 [0.00] [] warn_slowpath_common+0x82/0xc0 [0.00] [] warn_slowpath_fmt+0x5c/0x80 [0.00] [] static_key_slow_dec+0x44/0x60 [0.00] [] cgroup_disable+0xaf/0xd6 [0.00] [] unknown_bootoption+0x8c/0x194 [0.00] [] parse_args+0x273/0x4a0 [0.00] [] ? printk+0x57/0x73 [0.00] [] start_kernel+0x205/0x4b8 [0.00] [] ? set_init_arg+0x55/0x55 [0.00] [] ? early_idt_handler_array+0x120/0x120 [0.00] [] x86_64_start_reservations+0x2a/0x2c [0.00] [] x86_64_start_kernel+0x14a/0x16d [0.00] ---[ end trace 8a774ae3ff020690 ]--- Thanks, Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
WARNING: static_key_slow_dec used before call to jump_label_init
Hello, I booted kernel with cgroup_disable=cpu and get this warning: [0.00] Kernel command line: BOOT_IMAGE=/boot/vmlinuz-4.3.0-rc2-next-20150923 no_timer_check console=ttyS0,115200n8 root=UUID=01bc7316-b1f4-45c9-a23a-0 [0.00] [ cut here ] [0.00] WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:99 static_key_slow_dec+0x44/0x60() [0.00] static_key_slow_dec used before call to jump_label_init [0.00] Modules linked in: [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 4.3.0-rc2-next-20150924 #1 [0.00] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [0.00] ebd8af5cfcdffb28 81c03da8 813b18c2 [0.00] 81c03df0 81c03de0 8108dd52 81c5bad0 [0.00] 88003ffe663a 81c49a00 0001 88003ffe662b [0.00] Call Trace: [0.00] [] dump_stack+0x44/0x62 [0.00] [] warn_slowpath_common+0x82/0xc0 [0.00] [] warn_slowpath_fmt+0x5c/0x80 [0.00] [] static_key_slow_dec+0x44/0x60 [0.00] [] cgroup_disable+0xaf/0xd6 [0.00] [] unknown_bootoption+0x8c/0x194 [0.00] [] parse_args+0x273/0x4a0 [0.00] [] ? printk+0x57/0x73 [0.00] [] start_kernel+0x205/0x4b8 [0.00] [] ? set_init_arg+0x55/0x55 [0.00] [] ? early_idt_handler_array+0x120/0x120 [0.00] [] x86_64_start_reservations+0x2a/0x2c [0.00] [] x86_64_start_kernel+0x14a/0x16d [0.00] ---[ end trace 8a774ae3ff020690 ]--- Thanks, Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: WARNING: static_key_slow_dec used before call to jump_label_init
2015-09-24 9:57 GMT+03:00 Andrey Wagin <ava...@gmail.com>: > Hello, > > I booted kernel with cgroup_disable=cpu and get this warning: > > [0.00] Kernel command line: > BOOT_IMAGE=/boot/vmlinuz-4.3.0-rc2-next-20150923 no_timer_check > console=ttyS0,115200n8 root=UUID=01bc7316-b1f4-45c9-a23a-0 [0.00] Kernel command line: BOOT_IMAGE=/boot/vmlinuz-4.3.0-rc2-next-20150923 no_timer_check console=ttyS0,115200n8 root=UUID=01bc7316-b1f4-45c9-a23a-00c5a2336ef2 console=tty1 ro rhgb quiet LANG=en_US.UTF-8 initrd=/boot/initramfs-4.3.0-rc2-next-20150923.img debug cgroup_disable=cpu You have new mail in /var/spool/mail/root > [0.00] [ cut here ] > [0.00] WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:99 > static_key_slow_dec+0x44/0x60() > [0.00] static_key_slow_dec used before call to jump_label_init > [0.00] Modules linked in: > > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted > 4.3.0-rc2-next-20150924 #1 > [0.00] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 > [0.00] ebd8af5cfcdffb28 81c03da8 > 813b18c2 > [0.00] 81c03df0 81c03de0 8108dd52 > 81c5bad0 > [0.00] 88003ffe663a 81c49a00 0001 > 88003ffe662b > [0.00] Call Trace: > [0.00] [] dump_stack+0x44/0x62 > [0.00] [] warn_slowpath_common+0x82/0xc0 > [0.00] [] warn_slowpath_fmt+0x5c/0x80 > [0.00] [] static_key_slow_dec+0x44/0x60 > [0.00] [] cgroup_disable+0xaf/0xd6 > [0.00] [] unknown_bootoption+0x8c/0x194 > [0.00] [] parse_args+0x273/0x4a0 > [0.00] [] ? printk+0x57/0x73 > [0.00] [] start_kernel+0x205/0x4b8 > [0.00] [] ? set_init_arg+0x55/0x55 > [0.00] [] ? early_idt_handler_array+0x120/0x120 > [0.00] [] x86_64_start_reservations+0x2a/0x2c > [0.00] [] x86_64_start_kernel+0x14a/0x16d > [0.00] ---[ end trace 8a774ae3ff020690 ]--- > > Thanks, > Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] inotify: hide internal kernel bits from fdinfo
2015-09-21 21:45 GMT+03:00 Dave Hansen : > > From: Dave Hansen > > There was a report that my patch: > > inotify: actually check for invalid bits in sys_inotify_add_watch() > > broke CRIU. > > The reason is that CRIU looks up raw flags in /proc/$pid/fdinfo/* > to figure out how to rebuild inotify watches and then passes those > flags directly back in to the inotify API. One of those flags > (FS_EVENT_ON_CHILD) is set in mark->mask, but is not part of the > inotify API. It is used inside the kernel to _implement_ inotify > but it is not and has never been part of the API. > > My patch above ensured that we only allow bits which are part of > the API (IN_ALL_EVENTS). This broke CRIU. > > FS_EVENT_ON_CHILD is really internal to the kernel. It is set > _anyway_ on all inotify marks. So, CRIU was really just trying > to set a bit that was already set. > > This patch hides that bit from fdinfo. CRIU will not see the > bit, not try to set it, and should work as before. We should not > have been exposing this bit in the first place, so this is a good > patch independent of the CRIU problem. > > Signed-off-by: Dave Hansen > Reported-by: Andrey Wagin Acked-by: Andrey Vagin Thanks, Andrey > Cc: Andrew Morton > Cc: Cyrill Gorcunov > Cc: xe...@parallels.com > Cc: Eric Paris > Cc: j...@johnmccutchan.com > Cc: rl...@rlove.org > Cc: linux-kernel@vger.kernel.org > --- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] inotify: actually check for invalid bits in sys_inotify_add_watch()
2015-09-10 23:49 GMT+03:00 Andrew Morton : > On Wed, 9 Sep 2015 16:32:37 -0700 Dave Hansen wrote: > >> On 09/09/2015 04:16 PM, Eric Paris wrote: >> > Looks fine to me. And usually akpm picks them up these days. >> >> Is that an Acked-by? :) > > I grabbed it for 4.3. This patch breaks CRIU. When we are dumping an inotify file descriptors, we read event masks from /proc/self/fdinfo. [root@fc22-vm criu]# cat /proc/1065/fdinfo/3 pos: 0 flags: 04000 mnt_id: 11 inotify wd:1 ino:101d5c sdev:fc2 mask:800a708 ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:5c1d10003996cda2 Here the mask contains the 0x800 (FS_EVENT_ON_CHILD) flag, which is set for all inotify and dnotify watchers. On restore, we call inotify_add_watch() with this mask, and it fails with this patch: 1375 inotify_add_watch(3, "/proc/self/fd/5", IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_UNMOUNT|IN_IGNORED|0x800) = -1 EINVAL (Invalid argument) I am not sure that we have to save backward compatibility here. Maybe we should not show FS_EVENT_ON_CHILD in fdinfo. > > I removed your cc:stable. I don't see anything in here which warrants > a backport. If there *is* a reason for backporting then your > changelogological skills are sorely wanting! > > > The changelog is pretty sucky really. What are the reasons for this > change, apart from "do what the comment said"? What's the benefit? > > And the code comment sucks. "don't allow invalid bits": well duh. And > "we don't want flags set" is also useless: it doesn't explain *why* we > don't want those flags set. > > And given that there is potential to break existing userspace, we need > some decent reasons for making this change. > > > > > From: Dave Hansen > Subject: inotify: actually check for invalid bits in sys_inotify_add_watch() > > The comment here says that it is checking for invalid bits. But, the mask > is *actually* checking to ensure that _any_ valid bit is set, which is > quite different. > > Add the actual check which was intended. Retain the existing check > because it actually does something useful: ensure that some inotify bits > are being added to the watch. Plus, this is existing behavior which would > be nice to preserve. > > I did a quick sniff test that inotify functions and that my > 'inotify-tools' package passes 'make check'. > > Signed-off-by: Dave Hansen > Cc: John McCutchan > Cc: Robert Love > Cc: Eric Paris > Signed-off-by: Andrew Morton > --- > > fs/notify/inotify/inotify_user.c |3 +++ > 1 file changed, 3 insertions(+) > > diff -puN > fs/notify/inotify/inotify_user.c~inotify-actually-check-for-invalid-bits-in-sys_inotify_add_watch > fs/notify/inotify/inotify_user.c > --- > a/fs/notify/inotify/inotify_user.c~inotify-actually-check-for-invalid-bits-in-sys_inotify_add_watch > +++ a/fs/notify/inotify/inotify_user.c > @@ -707,6 +707,9 @@ SYSCALL_DEFINE3(inotify_add_watch, int, > unsigned flags = 0; > > /* don't allow invalid bits: we don't want flags set */ > + if (unlikely(mask & ~ALL_INOTIFY_BITS)) > + return -EINVAL; > + /* require at least one valid bit set in the mask */ > if (unlikely(!(mask & ALL_INOTIFY_BITS))) > return -EINVAL; > > _ > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] inotify: actually check for invalid bits in sys_inotify_add_watch()
2015-09-10 23:49 GMT+03:00 Andrew Morton: > On Wed, 9 Sep 2015 16:32:37 -0700 Dave Hansen wrote: > >> On 09/09/2015 04:16 PM, Eric Paris wrote: >> > Looks fine to me. And usually akpm picks them up these days. >> >> Is that an Acked-by? :) > > I grabbed it for 4.3. This patch breaks CRIU. When we are dumping an inotify file descriptors, we read event masks from /proc/self/fdinfo. [root@fc22-vm criu]# cat /proc/1065/fdinfo/3 pos: 0 flags: 04000 mnt_id: 11 inotify wd:1 ino:101d5c sdev:fc2 mask:800a708 ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:5c1d10003996cda2 Here the mask contains the 0x800 (FS_EVENT_ON_CHILD) flag, which is set for all inotify and dnotify watchers. On restore, we call inotify_add_watch() with this mask, and it fails with this patch: 1375 inotify_add_watch(3, "/proc/self/fd/5", IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_UNMOUNT|IN_IGNORED|0x800) = -1 EINVAL (Invalid argument) I am not sure that we have to save backward compatibility here. Maybe we should not show FS_EVENT_ON_CHILD in fdinfo. > > I removed your cc:stable. I don't see anything in here which warrants > a backport. If there *is* a reason for backporting then your > changelogological skills are sorely wanting! > > > The changelog is pretty sucky really. What are the reasons for this > change, apart from "do what the comment said"? What's the benefit? > > And the code comment sucks. "don't allow invalid bits": well duh. And > "we don't want flags set" is also useless: it doesn't explain *why* we > don't want those flags set. > > And given that there is potential to break existing userspace, we need > some decent reasons for making this change. > > > > > From: Dave Hansen > Subject: inotify: actually check for invalid bits in sys_inotify_add_watch() > > The comment here says that it is checking for invalid bits. But, the mask > is *actually* checking to ensure that _any_ valid bit is set, which is > quite different. > > Add the actual check which was intended. Retain the existing check > because it actually does something useful: ensure that some inotify bits > are being added to the watch. Plus, this is existing behavior which would > be nice to preserve. > > I did a quick sniff test that inotify functions and that my > 'inotify-tools' package passes 'make check'. > > Signed-off-by: Dave Hansen > Cc: John McCutchan > Cc: Robert Love > Cc: Eric Paris > Signed-off-by: Andrew Morton > --- > > fs/notify/inotify/inotify_user.c |3 +++ > 1 file changed, 3 insertions(+) > > diff -puN > fs/notify/inotify/inotify_user.c~inotify-actually-check-for-invalid-bits-in-sys_inotify_add_watch > fs/notify/inotify/inotify_user.c > --- > a/fs/notify/inotify/inotify_user.c~inotify-actually-check-for-invalid-bits-in-sys_inotify_add_watch > +++ a/fs/notify/inotify/inotify_user.c > @@ -707,6 +707,9 @@ SYSCALL_DEFINE3(inotify_add_watch, int, > unsigned flags = 0; > > /* don't allow invalid bits: we don't want flags set */ > + if (unlikely(mask & ~ALL_INOTIFY_BITS)) > + return -EINVAL; > + /* require at least one valid bit set in the mask */ > if (unlikely(!(mask & ALL_INOTIFY_BITS))) > return -EINVAL; > > _ > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] inotify: hide internal kernel bits from fdinfo
2015-09-21 21:45 GMT+03:00 Dave Hansen <d...@sr71.net>: > > From: Dave Hansen <dave.han...@linux.intel.com> > > There was a report that my patch: > > inotify: actually check for invalid bits in sys_inotify_add_watch() > > broke CRIU. > > The reason is that CRIU looks up raw flags in /proc/$pid/fdinfo/* > to figure out how to rebuild inotify watches and then passes those > flags directly back in to the inotify API. One of those flags > (FS_EVENT_ON_CHILD) is set in mark->mask, but is not part of the > inotify API. It is used inside the kernel to _implement_ inotify > but it is not and has never been part of the API. > > My patch above ensured that we only allow bits which are part of > the API (IN_ALL_EVENTS). This broke CRIU. > > FS_EVENT_ON_CHILD is really internal to the kernel. It is set > _anyway_ on all inotify marks. So, CRIU was really just trying > to set a bit that was already set. > > This patch hides that bit from fdinfo. CRIU will not see the > bit, not try to set it, and should work as before. We should not > have been exposing this bit in the first place, so this is a good > patch independent of the CRIU problem. > > Signed-off-by: Dave Hansen <dave.han...@linux.intel.com> > Reported-by: Andrey Wagin <ava...@gmail.com> Acked-by: Andrey Vagin <ava...@openvz.org> Thanks, Andrey > Cc: Andrew Morton <a...@linux-foundation.org> > Cc: Cyrill Gorcunov <gorcu...@openvz.org> > Cc: xe...@parallels.com > Cc: Eric Paris <epa...@redhat.com> > Cc: j...@johnmccutchan.com > Cc: rl...@rlove.org > Cc: linux-kernel@vger.kernel.org > --- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()
2015-06-19 23:26 GMT+03:00 Andrey Wagin : > 2015-06-19 19:24 GMT+03:00 Jens Axboe : >> On 06/17/2015 03:55 PM, Andrey Wagin wrote: >>> >>> Hi All, >>> >>> I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met >>> this bug. Maybe it will be interested for someone to look at it. >> >> >> This should fix it: >> >> http://git.kernel.dk/cgit/linux-block/commit/?h=for-4.2/core=9470e4a693db84bee7becbba8de01af02bb23c9f > > Hi Jens, > > Thank you for the path. I think we need to fix __cfq_set_weight and > __cfq_set_weight_device too. I've seen that you fixed these functions too. But CRIU tests still fail, because they tries to restore a value of blkio.weight and get EINVAL. It works fine on the upstream kernel. For me the suggested interface looks weird. What if a device which uses cfq isn't permanent. If I detach the device, cgroup configuration will be destroyed and then if I attach the device again, I will need to apply cgroup parameters again. Thanks, Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()
2015-06-19 23:26 GMT+03:00 Andrey Wagin ava...@gmail.com: 2015-06-19 19:24 GMT+03:00 Jens Axboe ax...@kernel.dk: On 06/17/2015 03:55 PM, Andrey Wagin wrote: Hi All, I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met this bug. Maybe it will be interested for someone to look at it. This should fix it: http://git.kernel.dk/cgit/linux-block/commit/?h=for-4.2/coreid=9470e4a693db84bee7becbba8de01af02bb23c9f Hi Jens, Thank you for the path. I think we need to fix __cfq_set_weight and __cfq_set_weight_device too. I've seen that you fixed these functions too. But CRIU tests still fail, because they tries to restore a value of blkio.weight and get EINVAL. It works fine on the upstream kernel. For me the suggested interface looks weird. What if a device which uses cfq isn't permanent. If I detach the device, cgroup configuration will be destroyed and then if I attach the device again, I will need to apply cgroup parameters again. Thanks, Andrew -- To unsubscribe from this list: send the line unsubscribe linux-kernel in Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()
2015-06-19 19:24 GMT+03:00 Jens Axboe : > On 06/17/2015 03:55 PM, Andrey Wagin wrote: >> >> Hi All, >> >> I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met >> this bug. Maybe it will be interested for someone to look at it. > > > This should fix it: > > http://git.kernel.dk/cgit/linux-block/commit/?h=for-4.2/core=9470e4a693db84bee7becbba8de01af02bb23c9f Hi Jens, Thank you for the path. I think we need to fix __cfq_set_weight and __cfq_set_weight_device too. > > -- > Jens Axboe > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()
2015-06-19 19:24 GMT+03:00 Jens Axboe ax...@kernel.dk: On 06/17/2015 03:55 PM, Andrey Wagin wrote: Hi All, I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met this bug. Maybe it will be interested for someone to look at it. This should fix it: http://git.kernel.dk/cgit/linux-block/commit/?h=for-4.2/coreid=9470e4a693db84bee7becbba8de01af02bb23c9f Hi Jens, Thank you for the path. I think we need to fix __cfq_set_weight and __cfq_set_weight_device too. -- Jens Axboe -- To unsubscribe from this list: send the line unsubscribe linux-kernel in Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()
2015-06-18 0:55 GMT+03:00 Andrey Wagin : > Hi All, > > I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met > this bug. Maybe it will be interested for someone to look at it. This bug isn't reproduced if one of devices uses the cfq scheduler: [root@avagin-fc19-cr ~]# cat /sys/block/sda/queue/scheduler noop deadline [cfq] [root@avagin-fc19-cr ~]# cat /sys/fs/cgroup/blkio/blkio.leaf_weight 1000 [root@avagin-fc19-cr ~]# echo noop > /sys/block/sda/queue/scheduler [root@avagin-fc19-cr ~]# cat /sys/block/sda/queue/scheduler [noop] deadline cfq [root@avagin-fc19-cr ~]# cat /sys/fs/cgroup/blkio/blkio.leaf_weight Killed blkcg_to_cfqgd(blkcg) returns NULL. > > [ 30.130897] BUG: unable to handle kernel NULL pointer dereference > at 001c > [ 30.130970] IP: [] cfq_print_leaf_weight+0x31/0x50 > [ 30.131013] PGD 3b77a067 PUD 39883067 PMD 0 > [ 30.131013] Oops: [#1] SMP > [ 30.131013] Modules linked in: tun netlink_diag af_packet_diag > udp_diag tcp_diag inet_diag unix_diag ppdev crct10dif_pclmul > crc32_pclmul crc32c_intel ghash_clmulni_intel virtio_balloon serio_raw > i2c_piix4 parport_pc parport virtio_blk virtio_net cirrus > drm_kms_helper ttm drm virtio_pci virtio_ring virtio ata_generic > pata_acpi > [ 30.131013] CPU: 0 PID: 613 Comm: criu Not tainted 4.1.0-rc8-next-20150617 > #1 > [ 30.131013] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 > [ 30.131013] task: 880039eb ti: 880039db task.ti: > 880039db > [ 30.131013] RIP: 0010:[] [] > cfq_print_leaf_weight+0x31/0x50 > [ 30.131013] RSP: 0018:880039db3d08 EFLAGS: 00010286 > [ 30.131013] RAX: RBX: 88003a524200 RCX: > 0001 > [ 30.131013] RDX: 0001 RSI: 0001 RDI: > 88003a524800 > [ 30.131013] RBP: 880039db3d18 R08: 0001 R09: > > [ 30.131013] R10: 0080 R11: R12: > 81cba6c8 > [ 30.131013] R13: 0001 R14: 880039db3f18 R15: > 88003a524200 > [ 30.131013] FS: 7f2b6c495740() GS:88003d20() > knlGS: > [ 30.131013] CS: 0010 DS: ES: CR0: 80050033 > [ 30.131013] CR2: 001c CR3: 3de3c000 CR4: > 000407f0 > [ 30.131013] Stack: > [ 30.131013] 880039db3d28 88003a524200 880039db3d48 > 8113c9d3 > [ 30.131013] 880039db3d58 880039773380 > 0001 > [ 30.131013] 880039db3d58 812b9df6 880039db3dd8 > 81257b7d > [ 30.131013] Call Trace: > [ 30.131013] [] cgroup_seqfile_show+0x43/0xc0 > [ 30.131013] [] kernfs_seq_show+0x26/0x30 > [ 30.131013] [] seq_read+0x10d/0x3c0 > [ 30.131013] [] kernfs_fop_read+0x129/0x180 > [ 30.131013] [] __vfs_read+0x37/0x100 > [ 30.131013] [] ? security_file_permission+0xa3/0xc0 > [ 30.131013] [] ? rw_verify_area+0x56/0xe0 > [ 30.131013] [] vfs_read+0x86/0x130 > [ 30.131013] [] SyS_read+0x58/0xd0 > [ 30.131013] [] entry_SYSCALL_64_fastpath+0x12/0x76 > [ 30.131013] Code: 48 89 e5 53 48 89 fb 48 83 ec 08 48 8b bf d8 00 > 00 00 e8 93 fe d7 ff 48 85 c0 74 0f 48 63 15 27 f0 8f 00 48 8b 84 d0 > 28 01 00 00 <8b> 50 1c 48 89 df 31 c0 48 c7 c6 f8 f8 ad 81 e8 1b b9 e9 > ff 48 > [ 30.131013] RIP [] cfq_print_leaf_weight+0x31/0x50 > [ 30.131013] RSP > [ 30.131013] CR2: 001c > [ 30.142367] ---[ end trace bad3e020b932bbb1 ]--- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()
2015-06-18 0:55 GMT+03:00 Andrey Wagin ava...@gmail.com: Hi All, I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met this bug. Maybe it will be interested for someone to look at it. This bug isn't reproduced if one of devices uses the cfq scheduler: [root@avagin-fc19-cr ~]# cat /sys/block/sda/queue/scheduler noop deadline [cfq] [root@avagin-fc19-cr ~]# cat /sys/fs/cgroup/blkio/blkio.leaf_weight 1000 [root@avagin-fc19-cr ~]# echo noop /sys/block/sda/queue/scheduler [root@avagin-fc19-cr ~]# cat /sys/block/sda/queue/scheduler [noop] deadline cfq [root@avagin-fc19-cr ~]# cat /sys/fs/cgroup/blkio/blkio.leaf_weight Killed blkcg_to_cfqgd(blkcg) returns NULL. [ 30.130897] BUG: unable to handle kernel NULL pointer dereference at 001c [ 30.130970] IP: [813bc7e1] cfq_print_leaf_weight+0x31/0x50 [ 30.131013] PGD 3b77a067 PUD 39883067 PMD 0 [ 30.131013] Oops: [#1] SMP [ 30.131013] Modules linked in: tun netlink_diag af_packet_diag udp_diag tcp_diag inet_diag unix_diag ppdev crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel virtio_balloon serio_raw i2c_piix4 parport_pc parport virtio_blk virtio_net cirrus drm_kms_helper ttm drm virtio_pci virtio_ring virtio ata_generic pata_acpi [ 30.131013] CPU: 0 PID: 613 Comm: criu Not tainted 4.1.0-rc8-next-20150617 #1 [ 30.131013] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 30.131013] task: 880039eb ti: 880039db task.ti: 880039db [ 30.131013] RIP: 0010:[813bc7e1] [813bc7e1] cfq_print_leaf_weight+0x31/0x50 [ 30.131013] RSP: 0018:880039db3d08 EFLAGS: 00010286 [ 30.131013] RAX: RBX: 88003a524200 RCX: 0001 [ 30.131013] RDX: 0001 RSI: 0001 RDI: 88003a524800 [ 30.131013] RBP: 880039db3d18 R08: 0001 R09: [ 30.131013] R10: 0080 R11: R12: 81cba6c8 [ 30.131013] R13: 0001 R14: 880039db3f18 R15: 88003a524200 [ 30.131013] FS: 7f2b6c495740() GS:88003d20() knlGS: [ 30.131013] CS: 0010 DS: ES: CR0: 80050033 [ 30.131013] CR2: 001c CR3: 3de3c000 CR4: 000407f0 [ 30.131013] Stack: [ 30.131013] 880039db3d28 88003a524200 880039db3d48 8113c9d3 [ 30.131013] 880039db3d58 880039773380 0001 [ 30.131013] 880039db3d58 812b9df6 880039db3dd8 81257b7d [ 30.131013] Call Trace: [ 30.131013] [8113c9d3] cgroup_seqfile_show+0x43/0xc0 [ 30.131013] [812b9df6] kernfs_seq_show+0x26/0x30 [ 30.131013] [81257b7d] seq_read+0x10d/0x3c0 [ 30.131013] [812ba7c9] kernfs_fop_read+0x129/0x180 [ 30.131013] [8122f9d7] __vfs_read+0x37/0x100 [ 30.131013] [81349db3] ? security_file_permission+0xa3/0xc0 [ 30.131013] [8122ff86] ? rw_verify_area+0x56/0xe0 [ 30.131013] [81230096] vfs_read+0x86/0x130 [ 30.131013] [81230f28] SyS_read+0x58/0xd0 [ 30.131013] [817c386e] entry_SYSCALL_64_fastpath+0x12/0x76 [ 30.131013] Code: 48 89 e5 53 48 89 fb 48 83 ec 08 48 8b bf d8 00 00 00 e8 93 fe d7 ff 48 85 c0 74 0f 48 63 15 27 f0 8f 00 48 8b 84 d0 28 01 00 00 8b 50 1c 48 89 df 31 c0 48 c7 c6 f8 f8 ad 81 e8 1b b9 e9 ff 48 [ 30.131013] RIP [813bc7e1] cfq_print_leaf_weight+0x31/0x50 [ 30.131013] RSP 880039db3d08 [ 30.131013] CR2: 001c [ 30.142367] ---[ end trace bad3e020b932bbb1 ]--- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()
Hi All, I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met this bug. Maybe it will be interested for someone to look at it. [ 30.130897] BUG: unable to handle kernel NULL pointer dereference at 001c [ 30.130970] IP: [] cfq_print_leaf_weight+0x31/0x50 [ 30.131013] PGD 3b77a067 PUD 39883067 PMD 0 [ 30.131013] Oops: [#1] SMP [ 30.131013] Modules linked in: tun netlink_diag af_packet_diag udp_diag tcp_diag inet_diag unix_diag ppdev crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel virtio_balloon serio_raw i2c_piix4 parport_pc parport virtio_blk virtio_net cirrus drm_kms_helper ttm drm virtio_pci virtio_ring virtio ata_generic pata_acpi [ 30.131013] CPU: 0 PID: 613 Comm: criu Not tainted 4.1.0-rc8-next-20150617 #1 [ 30.131013] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 30.131013] task: 880039eb ti: 880039db task.ti: 880039db [ 30.131013] RIP: 0010:[] [] cfq_print_leaf_weight+0x31/0x50 [ 30.131013] RSP: 0018:880039db3d08 EFLAGS: 00010286 [ 30.131013] RAX: RBX: 88003a524200 RCX: 0001 [ 30.131013] RDX: 0001 RSI: 0001 RDI: 88003a524800 [ 30.131013] RBP: 880039db3d18 R08: 0001 R09: [ 30.131013] R10: 0080 R11: R12: 81cba6c8 [ 30.131013] R13: 0001 R14: 880039db3f18 R15: 88003a524200 [ 30.131013] FS: 7f2b6c495740() GS:88003d20() knlGS: [ 30.131013] CS: 0010 DS: ES: CR0: 80050033 [ 30.131013] CR2: 001c CR3: 3de3c000 CR4: 000407f0 [ 30.131013] Stack: [ 30.131013] 880039db3d28 88003a524200 880039db3d48 8113c9d3 [ 30.131013] 880039db3d58 880039773380 0001 [ 30.131013] 880039db3d58 812b9df6 880039db3dd8 81257b7d [ 30.131013] Call Trace: [ 30.131013] [] cgroup_seqfile_show+0x43/0xc0 [ 30.131013] [] kernfs_seq_show+0x26/0x30 [ 30.131013] [] seq_read+0x10d/0x3c0 [ 30.131013] [] kernfs_fop_read+0x129/0x180 [ 30.131013] [] __vfs_read+0x37/0x100 [ 30.131013] [] ? security_file_permission+0xa3/0xc0 [ 30.131013] [] ? rw_verify_area+0x56/0xe0 [ 30.131013] [] vfs_read+0x86/0x130 [ 30.131013] [] SyS_read+0x58/0xd0 [ 30.131013] [] entry_SYSCALL_64_fastpath+0x12/0x76 [ 30.131013] Code: 48 89 e5 53 48 89 fb 48 83 ec 08 48 8b bf d8 00 00 00 e8 93 fe d7 ff 48 85 c0 74 0f 48 63 15 27 f0 8f 00 48 8b 84 d0 28 01 00 00 <8b> 50 1c 48 89 df 31 c0 48 c7 c6 f8 f8 ad 81 e8 1b b9 e9 ff 48 [ 30.131013] RIP [] cfq_print_leaf_weight+0x31/0x50 [ 30.131013] RSP [ 30.131013] CR2: 001c [ 30.142367] ---[ end trace bad3e020b932bbb1 ]--- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()
Hi All, I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met this bug. Maybe it will be interested for someone to look at it. [ 30.130897] BUG: unable to handle kernel NULL pointer dereference at 001c [ 30.130970] IP: [813bc7e1] cfq_print_leaf_weight+0x31/0x50 [ 30.131013] PGD 3b77a067 PUD 39883067 PMD 0 [ 30.131013] Oops: [#1] SMP [ 30.131013] Modules linked in: tun netlink_diag af_packet_diag udp_diag tcp_diag inet_diag unix_diag ppdev crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel virtio_balloon serio_raw i2c_piix4 parport_pc parport virtio_blk virtio_net cirrus drm_kms_helper ttm drm virtio_pci virtio_ring virtio ata_generic pata_acpi [ 30.131013] CPU: 0 PID: 613 Comm: criu Not tainted 4.1.0-rc8-next-20150617 #1 [ 30.131013] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 30.131013] task: 880039eb ti: 880039db task.ti: 880039db [ 30.131013] RIP: 0010:[813bc7e1] [813bc7e1] cfq_print_leaf_weight+0x31/0x50 [ 30.131013] RSP: 0018:880039db3d08 EFLAGS: 00010286 [ 30.131013] RAX: RBX: 88003a524200 RCX: 0001 [ 30.131013] RDX: 0001 RSI: 0001 RDI: 88003a524800 [ 30.131013] RBP: 880039db3d18 R08: 0001 R09: [ 30.131013] R10: 0080 R11: R12: 81cba6c8 [ 30.131013] R13: 0001 R14: 880039db3f18 R15: 88003a524200 [ 30.131013] FS: 7f2b6c495740() GS:88003d20() knlGS: [ 30.131013] CS: 0010 DS: ES: CR0: 80050033 [ 30.131013] CR2: 001c CR3: 3de3c000 CR4: 000407f0 [ 30.131013] Stack: [ 30.131013] 880039db3d28 88003a524200 880039db3d48 8113c9d3 [ 30.131013] 880039db3d58 880039773380 0001 [ 30.131013] 880039db3d58 812b9df6 880039db3dd8 81257b7d [ 30.131013] Call Trace: [ 30.131013] [8113c9d3] cgroup_seqfile_show+0x43/0xc0 [ 30.131013] [812b9df6] kernfs_seq_show+0x26/0x30 [ 30.131013] [81257b7d] seq_read+0x10d/0x3c0 [ 30.131013] [812ba7c9] kernfs_fop_read+0x129/0x180 [ 30.131013] [8122f9d7] __vfs_read+0x37/0x100 [ 30.131013] [81349db3] ? security_file_permission+0xa3/0xc0 [ 30.131013] [8122ff86] ? rw_verify_area+0x56/0xe0 [ 30.131013] [81230096] vfs_read+0x86/0x130 [ 30.131013] [81230f28] SyS_read+0x58/0xd0 [ 30.131013] [817c386e] entry_SYSCALL_64_fastpath+0x12/0x76 [ 30.131013] Code: 48 89 e5 53 48 89 fb 48 83 ec 08 48 8b bf d8 00 00 00 e8 93 fe d7 ff 48 85 c0 74 0f 48 63 15 27 f0 8f 00 48 8b 84 d0 28 01 00 00 8b 50 1c 48 89 df 31 c0 48 c7 c6 f8 f8 ad 81 e8 1b b9 e9 ff 48 [ 30.131013] RIP [813bc7e1] cfq_print_leaf_weight+0x31/0x50 [ 30.131013] RSP 880039db3d08 [ 30.131013] CR2: 001c [ 30.142367] ---[ end trace bad3e020b932bbb1 ]--- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] seccomp: add ptrace commands for suspend/resume
2015-06-01 22:28 GMT+03:00 Tycho Andersen : > This patch is the first step in enabling checkpoint/restore of processes > with seccomp enabled. > > One of the things CRIU does while dumping tasks is inject code into them > via ptrace to collect information that is only available to the process > itself. However, if we are in a seccomp mode where these processes are > prohibited from making these syscalls, then what CRIU does kills the task. > > This patch adds a new ptrace command, PTRACE_SUSPEND_SECCOMP that enables a > task from the init user namespace which has CAP_SYS_ADMIN to disable (and > re-enable) seccomp filters for another task so that they can be > successfully dumped (and restored). Do we need to re-enable seccomp if a tracer detaches unexpectedly. CRIU can be killed and we should try to not affect the task state even in this case. Thanks, Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] seccomp: add ptrace commands for suspend/resume
2015-06-01 22:28 GMT+03:00 Tycho Andersen tycho.ander...@canonical.com: This patch is the first step in enabling checkpoint/restore of processes with seccomp enabled. One of the things CRIU does while dumping tasks is inject code into them via ptrace to collect information that is only available to the process itself. However, if we are in a seccomp mode where these processes are prohibited from making these syscalls, then what CRIU does kills the task. This patch adds a new ptrace command, PTRACE_SUSPEND_SECCOMP that enables a task from the init user namespace which has CAP_SYS_ADMIN to disable (and re-enable) seccomp filters for another task so that they can be successfully dumped (and restored). Do we need to re-enable seccomp if a tracer detaches unexpectedly. CRIU can be killed and we should try to not affect the task state even in this case. Thanks, Andrew -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fbcon: Avoid deleting a timer in IRQ context
2015-05-21 10:58 GMT+03:00 Thierry Reding : > From: Thierry Reding > > Commit 27a4c827c34a ("fbcon: use the cursor blink interval provided by > vt") unconditionally removes the cursor blink timer. Unfortunately that > wreaks havoc under some circumstances. An easily reproducible way is to > use both the framebuffer console and a debug serial port as the console > output for kernel messages (e.g. "console=ttyS0 console=tty1" on the > kernel command-line. Upon boot this triggers a warning from within the > del_timer_sync() function because it is called from IRQ context: > > [5.070096] [ cut here ] > [5.070110] WARNING: CPU: 0 PID: 0 at ../kernel/time/timer.c:1098 > del_timer_sync+0x4c/0x54() > [5.070115] Modules linked in: > [5.070120] CPU: 0 PID: 0 Comm: swapper/0 Not tainted > 4.1.0-rc4-next-20150519 #1 > [5.070123] Hardware name: SAMSUNG EXYNOS (Flattened Device Tree) > [5.070142] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) > [5.070156] [] (show_stack) from [] (dump_stack+0x70/0xbc) > [5.070164] [] (dump_stack) from [] > (warn_slowpath_common+0x74/0xb0) > [5.070169] [] (warn_slowpath_common) from [] > (warn_slowpath_null+0x1c/0x24) > [5.070174] [] (warn_slowpath_null) from [] > (del_timer_sync+0x4c/0x54) > [5.070183] [] (del_timer_sync) from [] > (fbcon_del_cursor_timer+0x2c/0x40) > [5.070190] [] (fbcon_del_cursor_timer) from [] > (fbcon_cursor+0x9c/0x180) > [5.070198] [] (fbcon_cursor) from [] (hide_cursor+0x30/0x98) > [5.070204] [] (hide_cursor) from [] (vt_console_print+0x2a8/0x340) > [5.070212] [] (vt_console_print) from [] > (call_console_drivers.constprop.23+0xc8/0xec) > [5.070218] [] (call_console_drivers.constprop.23) from [] > (console_unlock+0x498/0x4f0) > [5.070223] [] (console_unlock) from [] (vprintk_emit+0x1f0/0x508) > [5.070228] [] (vprintk_emit) from [] (vprintk_default+0x24/0x2c) > [5.070234] [] (vprintk_default) from [] (printk+0x70/0x88) > > After which the system starts spewing all kinds of weird and seemingly > unrelated error messages. > > This commit fixes this by restoring the condition under which the call > to fbcon_del_cursor_timer() happens. Tested-by: Andrew Vagin It would be good to push this patch into linux-next asap. Without this patch I can't execute tests on linux-next. Thanks. > > Reported-by: Daniel Stone > Reported-by: Kevin Hilman > Tested-by: Kevin Hilman > Tested-by: Scot Doyle > Signed-off-by: Thierry Reding > --- > drivers/video/console/fbcon.c | 5 +++-- > 1 file changed, 3 insertions(+), 2 deletions(-) > > diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c > index 05b1d1a71ef9..658c34bb9076 100644 > --- a/drivers/video/console/fbcon.c > +++ b/drivers/video/console/fbcon.c > @@ -1310,8 +1310,9 @@ static void fbcon_cursor(struct vc_data *vc, int mode) > return; > > ops->cur_blink_jiffies = msecs_to_jiffies(vc->vc_cur_blink_ms); > - fbcon_del_cursor_timer(info); > - if (!(vc->vc_cursor_type & 0x10)) > + if (vc->vc_cursor_type & 0x10) > + fbcon_del_cursor_timer(info); > + else > fbcon_add_cursor_timer(info); > > ops->cursor_flash = (mode == CM_ERASE) ? 0 : 1; > -- > 2.4.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] fbcon: use the cursor blink interval provided by vt
2015-05-27 10:52 GMT+03:00 Scot Doyle : > On Wed, 27 May 2015, Andrey Wagin wrote: > ... >> I regularly execute criu tests on linux-next. For this, I use virtual >> machine from the digitalocean clould. The current version of >> linux-next hangs after a few seconds. I use git bisect to find the >> commit where the problem is appeaed. And it looks like the problem is >> in this patch. > ... >> Thanks, >> Andrew > > Perhaps this pending patch will help: > http://marc.info/?l=linux-kernel=143219509918969=2 Yes, it helps. Thanks. > > Thanks, > Scot > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] fbcon: use the cursor blink interval provided by vt
2015-05-27 10:52 GMT+03:00 Scot Doyle lkm...@scotdoyle.com: On Wed, 27 May 2015, Andrey Wagin wrote: ... I regularly execute criu tests on linux-next. For this, I use virtual machine from the digitalocean clould. The current version of linux-next hangs after a few seconds. I use git bisect to find the commit where the problem is appeaed. And it looks like the problem is in this patch. ... Thanks, Andrew Perhaps this pending patch will help: http://marc.info/?l=linux-kernelm=143219509918969w=2 Yes, it helps. Thanks. Thanks, Scot -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fbcon: Avoid deleting a timer in IRQ context
2015-05-21 10:58 GMT+03:00 Thierry Reding thierry.red...@gmail.com: From: Thierry Reding tred...@nvidia.com Commit 27a4c827c34a (fbcon: use the cursor blink interval provided by vt) unconditionally removes the cursor blink timer. Unfortunately that wreaks havoc under some circumstances. An easily reproducible way is to use both the framebuffer console and a debug serial port as the console output for kernel messages (e.g. console=ttyS0 console=tty1 on the kernel command-line. Upon boot this triggers a warning from within the del_timer_sync() function because it is called from IRQ context: [5.070096] [ cut here ] [5.070110] WARNING: CPU: 0 PID: 0 at ../kernel/time/timer.c:1098 del_timer_sync+0x4c/0x54() [5.070115] Modules linked in: [5.070120] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.1.0-rc4-next-20150519 #1 [5.070123] Hardware name: SAMSUNG EXYNOS (Flattened Device Tree) [5.070142] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [5.070156] [] (show_stack) from [] (dump_stack+0x70/0xbc) [5.070164] [] (dump_stack) from [] (warn_slowpath_common+0x74/0xb0) [5.070169] [] (warn_slowpath_common) from [] (warn_slowpath_null+0x1c/0x24) [5.070174] [] (warn_slowpath_null) from [] (del_timer_sync+0x4c/0x54) [5.070183] [] (del_timer_sync) from [] (fbcon_del_cursor_timer+0x2c/0x40) [5.070190] [] (fbcon_del_cursor_timer) from [] (fbcon_cursor+0x9c/0x180) [5.070198] [] (fbcon_cursor) from [] (hide_cursor+0x30/0x98) [5.070204] [] (hide_cursor) from [] (vt_console_print+0x2a8/0x340) [5.070212] [] (vt_console_print) from [] (call_console_drivers.constprop.23+0xc8/0xec) [5.070218] [] (call_console_drivers.constprop.23) from [] (console_unlock+0x498/0x4f0) [5.070223] [] (console_unlock) from [] (vprintk_emit+0x1f0/0x508) [5.070228] [] (vprintk_emit) from [] (vprintk_default+0x24/0x2c) [5.070234] [] (vprintk_default) from [] (printk+0x70/0x88) After which the system starts spewing all kinds of weird and seemingly unrelated error messages. This commit fixes this by restoring the condition under which the call to fbcon_del_cursor_timer() happens. Tested-by: Andrew Vagin ava...@virtuozzo.com It would be good to push this patch into linux-next asap. Without this patch I can't execute tests on linux-next. Thanks. Reported-by: Daniel Stone dan...@fooishbar.org Reported-by: Kevin Hilman khil...@kernel.org Tested-by: Kevin Hilman khil...@linaro.org Tested-by: Scot Doyle lkm...@scotdoyle.com Signed-off-by: Thierry Reding tred...@nvidia.com --- drivers/video/console/fbcon.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c index 05b1d1a71ef9..658c34bb9076 100644 --- a/drivers/video/console/fbcon.c +++ b/drivers/video/console/fbcon.c @@ -1310,8 +1310,9 @@ static void fbcon_cursor(struct vc_data *vc, int mode) return; ops-cur_blink_jiffies = msecs_to_jiffies(vc-vc_cur_blink_ms); - fbcon_del_cursor_timer(info); - if (!(vc-vc_cursor_type 0x10)) + if (vc-vc_cursor_type 0x10) + fbcon_del_cursor_timer(info); + else fbcon_add_cursor_timer(info); ops-cursor_flash = (mode == CM_ERASE) ? 0 : 1; -- 2.4.1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] fbcon: use the cursor blink interval provided by vt
2015-02-27 22:15 GMT+03:00 Scot Doyle : > vt now provides a cursor blink interval via vc_data. Use this > interval instead of the currently hardcoded 200 msecs. Store it in > fbcon_ops to avoid locking the console in cursor_timer_handler(). I regularly execute criu tests on linux-next. For this, I use virtual machine from the digitalocean clould. The current version of linux-next hangs after a few seconds. I use git bisect to find the commit where the problem is appeaed. And it looks like the problem is in this patch. When the kernel hangs, it doesn't report anything on the screen and there is nothing suspicious in logs after reboot. I will try to reproduce the problem in my local enviroment to get more information. There is my config file: https://github.com/avagin/criu-jenkins-digitalocean/blob/d95d9e30a7da8755c47b290630bac7ee1fe7132d/jenkins-scripts/config Thanks, Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] fbcon: use the cursor blink interval provided by vt
2015-02-27 22:15 GMT+03:00 Scot Doyle lkm...@scotdoyle.com: vt now provides a cursor blink interval via vc_data. Use this interval instead of the currently hardcoded 200 msecs. Store it in fbcon_ops to avoid locking the console in cursor_timer_handler(). I regularly execute criu tests on linux-next. For this, I use virtual machine from the digitalocean clould. The current version of linux-next hangs after a few seconds. I use git bisect to find the commit where the problem is appeaed. And it looks like the problem is in this patch. When the kernel hangs, it doesn't report anything on the screen and there is nothing suspicious in logs after reboot. I will try to reproduce the problem in my local enviroment to get more information. There is my config file: https://github.com/avagin/criu-jenkins-digitalocean/blob/d95d9e30a7da8755c47b290630bac7ee1fe7132d/jenkins-scripts/config Thanks, Andrew -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs: show locked and lock_ro options in mountinfo
2015-03-28 1:47 GMT+03:00 Richard Weinberger : > Hi! > > Am 27.03.2015 um 23:35 schrieb Andrey Wagin: >> 2015-03-28 0:42 GMT+03:00 Richard Weinberger : >>> On Fri, Mar 27, 2015 at 9:39 PM, Andrey Vagin wrote: >>>> I don't see any reasons to hide them. This information can help to >>>> understand errors. >>> >>> Because these flags are set/read only internally by the VFS. In contrast >>> to the other flags shown by mountinfo MNT_LOCKED is not a mount option. >> >> But this flag is set as a result of the specified user action, when he >> unshares userns and mntns. This flag affects visiable behaviour. > > It is a implicit result. Used by the VFS internally. > If you expose it it becomes ABI and changing the behavior will be > tricky or impossible. > >>> >>> Why does it help to debug errors? >>> How would a user know that mount() with MS_BIND returns EINVAL because >>> the mount source is MNT_LOCKED? This information is useless for her. >> >> If I see lock_ro, I can be sure that mount -o remount,bind,rw /XXX will fail. >> If I see locked, I know that this mount can't be umounted or moved >> and can be bind-mounted only recursively. >> >> If a user see these flags, he can check that a mount namespace is >> configured correctly without security issues. >> >> Sorry but I don't understand why you think that this information is >> useless for users. > > You can only know if you know how the VFS works internally. > If know that EINVAL from mount(2) with MS_BIND can be caused by MNT_LOCKED > because I know the source. I bet you know the source too. But not Joey random > admin who looks into mountinfo to figure out why something does not work. > > If you expose MNT_LOCKED to userspace you'll have to update also the mount(2) > manpage with all glory details of that flag. > >>> If you argue like that you'd have to expose the whole VFS state to userland. >> >> I have not noticed other MNT_LOCK_* flags. I should think more about >> what information are a really required for dumping mount namespaces. >> >>> >>>> And this information is required for correct checkpoint/restore of mount >>>> namespaces. >>> >>> Why especially MNT_LOCKED and not all the other flags used by VFS? >> >> My goal is to dump enough information about a mount namespace to be >> able to restore it back later. I don't know how to do this without >> knowledge about locked mounts. I will think. > > How do you plan to restore a MNT_LOCKED mount? > IIRC we have currently no way to directly set MNT_LOCKED from userspace. It's the second question. The first question is how to check that we will be able to restore what we are dumping. If CRIU meets something what it doesn't know how to restore, it (must) refuses to dump this configuration. > >>> Say MNT_DOOMED? >> >> Mounts with MNT_DOOMED are never shown in mountinfo, are they? > > It was just an example. There are other flags too, did you double check > which ones you really need? > > To make the story short, my concern is that exposing such flags to userspace > has to be well thought. :-) > As long they are just internal we can change them as we like, as soon > userspace > depends somehow on it the pain begins. I'm agree with you. I'm going to think more about this. Thank you for response. > > Thanks, > //richard -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs: show locked and lock_ro options in mountinfo
2015-03-28 1:47 GMT+03:00 Richard Weinberger rich...@nod.at: Hi! Am 27.03.2015 um 23:35 schrieb Andrey Wagin: 2015-03-28 0:42 GMT+03:00 Richard Weinberger richard.weinber...@gmail.com: On Fri, Mar 27, 2015 at 9:39 PM, Andrey Vagin ava...@openvz.org wrote: I don't see any reasons to hide them. This information can help to understand errors. Because these flags are set/read only internally by the VFS. In contrast to the other flags shown by mountinfo MNT_LOCKED is not a mount option. But this flag is set as a result of the specified user action, when he unshares userns and mntns. This flag affects visiable behaviour. It is a implicit result. Used by the VFS internally. If you expose it it becomes ABI and changing the behavior will be tricky or impossible. Why does it help to debug errors? How would a user know that mount() with MS_BIND returns EINVAL because the mount source is MNT_LOCKED? This information is useless for her. If I see lock_ro, I can be sure that mount -o remount,bind,rw /XXX will fail. If I see locked, I know that this mount can't be umounted or moved and can be bind-mounted only recursively. If a user see these flags, he can check that a mount namespace is configured correctly without security issues. Sorry but I don't understand why you think that this information is useless for users. You can only know if you know how the VFS works internally. If know that EINVAL from mount(2) with MS_BIND can be caused by MNT_LOCKED because I know the source. I bet you know the source too. But not Joey random admin who looks into mountinfo to figure out why something does not work. If you expose MNT_LOCKED to userspace you'll have to update also the mount(2) manpage with all glory details of that flag. If you argue like that you'd have to expose the whole VFS state to userland. I have not noticed other MNT_LOCK_* flags. I should think more about what information are a really required for dumping mount namespaces. And this information is required for correct checkpoint/restore of mount namespaces. Why especially MNT_LOCKED and not all the other flags used by VFS? My goal is to dump enough information about a mount namespace to be able to restore it back later. I don't know how to do this without knowledge about locked mounts. I will think. How do you plan to restore a MNT_LOCKED mount? IIRC we have currently no way to directly set MNT_LOCKED from userspace. It's the second question. The first question is how to check that we will be able to restore what we are dumping. If CRIU meets something what it doesn't know how to restore, it (must) refuses to dump this configuration. Say MNT_DOOMED? Mounts with MNT_DOOMED are never shown in mountinfo, are they? It was just an example. There are other flags too, did you double check which ones you really need? To make the story short, my concern is that exposing such flags to userspace has to be well thought. :-) As long they are just internal we can change them as we like, as soon userspace depends somehow on it the pain begins. I'm agree with you. I'm going to think more about this. Thank you for response. Thanks, //richard -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs: show locked and lock_ro options in mountinfo
2015-03-28 0:42 GMT+03:00 Richard Weinberger : > On Fri, Mar 27, 2015 at 9:39 PM, Andrey Vagin wrote: >> I don't see any reasons to hide them. This information can help to >> understand errors. > > Because these flags are set/read only internally by the VFS. In contrast > to the other flags shown by mountinfo MNT_LOCKED is not a mount option. But this flag is set as a result of the specified user action, when he unshares userns and mntns. This flag affects visiable behaviour. > > Why does it help to debug errors? > How would a user know that mount() with MS_BIND returns EINVAL because > the mount source is MNT_LOCKED? This information is useless for her. If I see lock_ro, I can be sure that mount -o remount,bind,rw /XXX will fail. If I see locked, I know that this mount can't be umounted or moved and can be bind-mounted only recursively. If a user see these flags, he can check that a mount namespace is configured correctly without security issues. Sorry but I don't understand why you think that this information is useless for users. > If you argue like that you'd have to expose the whole VFS state to userland. I have not noticed other MNT_LOCK_* flags. I should think more about what information are a really required for dumping mount namespaces. > >> And this information is required for correct checkpoint/restore of mount >> namespaces. > > Why especially MNT_LOCKED and not all the other flags used by VFS? My goal is to dump enough information about a mount namespace to be able to restore it back later. I don't know how to do this without knowledge about locked mounts. I will think. > Say MNT_DOOMED? Mounts with MNT_DOOMED are never shown in mountinfo, are they? Thank you for looking at this patch. > > -- > Thanks, > //richard -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs: show locked and lock_ro options in mountinfo
2015-03-28 0:42 GMT+03:00 Richard Weinberger richard.weinber...@gmail.com: On Fri, Mar 27, 2015 at 9:39 PM, Andrey Vagin ava...@openvz.org wrote: I don't see any reasons to hide them. This information can help to understand errors. Because these flags are set/read only internally by the VFS. In contrast to the other flags shown by mountinfo MNT_LOCKED is not a mount option. But this flag is set as a result of the specified user action, when he unshares userns and mntns. This flag affects visiable behaviour. Why does it help to debug errors? How would a user know that mount() with MS_BIND returns EINVAL because the mount source is MNT_LOCKED? This information is useless for her. If I see lock_ro, I can be sure that mount -o remount,bind,rw /XXX will fail. If I see locked, I know that this mount can't be umounted or moved and can be bind-mounted only recursively. If a user see these flags, he can check that a mount namespace is configured correctly without security issues. Sorry but I don't understand why you think that this information is useless for users. If you argue like that you'd have to expose the whole VFS state to userland. I have not noticed other MNT_LOCK_* flags. I should think more about what information are a really required for dumping mount namespaces. And this information is required for correct checkpoint/restore of mount namespaces. Why especially MNT_LOCKED and not all the other flags used by VFS? My goal is to dump enough information about a mount namespace to be able to restore it back later. I don't know how to do this without knowledge about locked mounts. I will think. Say MNT_DOOMED? Mounts with MNT_DOOMED are never shown in mountinfo, are they? Thank you for looking at this patch. -- Thanks, //richard -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v3 1/2] x86_64,signal: Fix SS handling for signals delivered to 64-bit programs
2015-03-18 21:50 GMT+03:00 Cyrill Gorcunov : > On Wed, Mar 18, 2015 at 07:31:33PM +0100, Oleg Nesterov wrote: >> On 03/18, Cyrill Gorcunov wrote: >> > >> > On Wed, Mar 18, 2015 at 06:48:43PM +0100, Oleg Nesterov wrote: >> > > >> > > Shot in a dark afer a quick grep: restore_gpregs() should initialize >> > > ->ss? >> > >> > It hasn't been needed earlier, if this would help it means abi is broken, >> > no? :) >> > Otherwise I don't understand what's happening. >> >> until this commit the kernel simply forgot to restore ->ss in sigreturn(). >> >> after this commit, if your rt_sigframe has garbage in ->ss then SIGSEGV >> is clear. > > OK, so setting up a proper ss here should fix problem. > >> > > >> > > Seriously, where is UserX86RegsEntry? >> > >> > It's in protobif/core-x86.proto, welcome to protobuf hell. >> >> Still can't find UserX86RegsEntry... OK, perhaps it comes from >> user_x86_regs_entry in protobuf/core-x86.proto. Then I guess the patch I sent >> can be compiled at least ;) > > Yeah, protobuf-c mangles "_" in message name so user_x86_regs_entry -> > UserX86RegsEntry. > Andrew should be able to test it tomorrow (hopefully it's no urgent right? ;) This patch fixes the problem. Oleg, could you send this path in the criu maillist? -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v3 1/2] x86_64,signal: Fix SS handling for signals delivered to 64-bit programs
2015-03-12 23:57 GMT+03:00 Andy Lutomirski : > From: Andy Lutomirski > > The comment in the signal code says that apps can save/restore other > segments on their own. It's true that apps can *save* SS on their > own, but there's no way for apps to restore it: SYSCALL effectively > resets SS to __USER_DS, so any value that user code tries to load > into SS gets lost on entry to sigreturn. > > This recycles two padding bytes in the segment selector area for SS. > > While we're at it, we need a second change to make this useful. If > the signal we're delivering is caused by a bad SS value, saving that > value isn't enough. We need to remove that bad value from the regs > before we try to deliver the signal. Oddly, x32 already got this > right. > > I suspect that 64-bit programs that try to run 16-bit code and use > signals will have a lot of trouble without this. This commit breaks CRIU. I don't have any details yet. I'm going to investigate this issue and provide more details tomorrow. [root@avagin-fc19-cr criu]# setsid sleep 1000 & [1] 1225 [root@avagin-fc19-cr criu]# ps -C sleep PID TTY TIME CMD 1226 ?00:00:00 sleep [root@avagin-fc19-cr criu]# ./criu dump -t 1226 -D dump --shell-job [root@avagin-fc19-cr criu]# ./criu restore -D dump --shell-job Error (parasite-syscall.c:923): Task is in unexpected state: b7f (SIGSEGV) > > Signed-off-by: Andy Lutomirski > --- > arch/x86/include/asm/sigcontext.h | 2 +- > arch/x86/include/uapi/asm/sigcontext.h | 2 +- > arch/x86/kernel/signal.c | 22 +- > 3 files changed, 15 insertions(+), 11 deletions(-) > > diff --git a/arch/x86/include/asm/sigcontext.h > b/arch/x86/include/asm/sigcontext.h > index 9dfce4e0417d..f910cdcb71fd 100644 > --- a/arch/x86/include/asm/sigcontext.h > +++ b/arch/x86/include/asm/sigcontext.h > @@ -59,7 +59,7 @@ struct sigcontext { > unsigned short cs; > unsigned short gs; > unsigned short fs; > - unsigned short __pad0; > + unsigned short ss; > unsigned long err; > unsigned long trapno; > unsigned long oldmask; > diff --git a/arch/x86/include/uapi/asm/sigcontext.h > b/arch/x86/include/uapi/asm/sigcontext.h > index d8b9f9081e86..076b11fd6fa1 100644 > --- a/arch/x86/include/uapi/asm/sigcontext.h > +++ b/arch/x86/include/uapi/asm/sigcontext.h > @@ -179,7 +179,7 @@ struct sigcontext { > __u16 cs; > __u16 gs; > __u16 fs; > - __u16 __pad0; > + __u16 ss; > __u64 err; > __u64 trapno; > __u64 oldmask; > diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c > index ed37a768d0fc..9511eb7f17b0 100644 > --- a/arch/x86/kernel/signal.c > +++ b/arch/x86/kernel/signal.c > @@ -94,15 +94,8 @@ int restore_sigcontext(struct pt_regs *regs, struct > sigcontext __user *sc, > COPY(r15); > #endif /* CONFIG_X86_64 */ > > -#ifdef CONFIG_X86_32 > COPY_SEG_CPL3(cs); > COPY_SEG_CPL3(ss); > -#else /* !CONFIG_X86_32 */ > - /* Kernel saves and restores only the CS segment register on > signals, > -* which is the bare minimum needed to allow mixed 32/64-bit > code. > -* App's signal handler can save/restore other segments if > needed. */ > - COPY_SEG_CPL3(cs); > -#endif /* CONFIG_X86_32 */ > > get_user_ex(tmpflags, >flags); > regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & > FIX_EFLAGS); > @@ -164,6 +157,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void > __user *fpstate, > put_user_ex(regs->cs, >cs); > put_user_ex(0, >gs); > put_user_ex(0, >fs); > + put_user_ex(regs->ss, >ss); > #endif /* CONFIG_X86_32 */ > > put_user_ex(fpstate, >fpstate); > @@ -457,9 +451,19 @@ static int __setup_rt_frame(int sig, struct ksignal > *ksig, > > regs->sp = (unsigned long)frame; > > - /* Set up the CS register to run signal handlers in 64-bit mode, > - even if the handler happens to be interrupting 32-bit code. */ > + /* > +* Set up the CS and SS registers to run signal handlers in > +* 64-bit mode, even if the handler happens to be interrupting > +* 32-bit or 16-bit code. > +* > +* SS is subtle. In 64-bit mode, we don't need any particular > +* SS descriptor, but we do need SS to be valid. It's possible > +* that the old SS is entirely bogus -- this can happen if the > +* signal we're trying to deliver is #GP or #SS caused by a bad > +* SS value. > +*/ > regs->cs = __USER_CS; > + regs->ss = __USER_DS; > > return 0; > } > -- > 2.3.0 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at
Re: [PATCH v3 1/2] x86_64,signal: Fix SS handling for signals delivered to 64-bit programs
2015-03-12 23:57 GMT+03:00 Andy Lutomirski l...@kernel.org: From: Andy Lutomirski l...@amacapital.net The comment in the signal code says that apps can save/restore other segments on their own. It's true that apps can *save* SS on their own, but there's no way for apps to restore it: SYSCALL effectively resets SS to __USER_DS, so any value that user code tries to load into SS gets lost on entry to sigreturn. This recycles two padding bytes in the segment selector area for SS. While we're at it, we need a second change to make this useful. If the signal we're delivering is caused by a bad SS value, saving that value isn't enough. We need to remove that bad value from the regs before we try to deliver the signal. Oddly, x32 already got this right. I suspect that 64-bit programs that try to run 16-bit code and use signals will have a lot of trouble without this. This commit breaks CRIU. I don't have any details yet. I'm going to investigate this issue and provide more details tomorrow. [root@avagin-fc19-cr criu]# setsid sleep 1000 [1] 1225 [root@avagin-fc19-cr criu]# ps -C sleep PID TTY TIME CMD 1226 ?00:00:00 sleep [root@avagin-fc19-cr criu]# ./criu dump -t 1226 -D dump --shell-job [root@avagin-fc19-cr criu]# ./criu restore -D dump --shell-job Error (parasite-syscall.c:923): Task is in unexpected state: b7f (SIGSEGV) Signed-off-by: Andy Lutomirski l...@amacapital.net --- arch/x86/include/asm/sigcontext.h | 2 +- arch/x86/include/uapi/asm/sigcontext.h | 2 +- arch/x86/kernel/signal.c | 22 +- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 9dfce4e0417d..f910cdcb71fd 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -59,7 +59,7 @@ struct sigcontext { unsigned short cs; unsigned short gs; unsigned short fs; - unsigned short __pad0; + unsigned short ss; unsigned long err; unsigned long trapno; unsigned long oldmask; diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index d8b9f9081e86..076b11fd6fa1 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -179,7 +179,7 @@ struct sigcontext { __u16 cs; __u16 gs; __u16 fs; - __u16 __pad0; + __u16 ss; __u64 err; __u64 trapno; __u64 oldmask; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ed37a768d0fc..9511eb7f17b0 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -94,15 +94,8 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, COPY(r15); #endif /* CONFIG_X86_64 */ -#ifdef CONFIG_X86_32 COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, -* which is the bare minimum needed to allow mixed 32/64-bit code. -* App's signal handler can save/restore other segments if needed. */ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ get_user_ex(tmpflags, sc-flags); regs-flags = (regs-flags ~FIX_EFLAGS) | (tmpflags FIX_EFLAGS); @@ -164,6 +157,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs-cs, sc-cs); put_user_ex(0, sc-gs); put_user_ex(0, sc-fs); + put_user_ex(regs-ss, sc-ss); #endif /* CONFIG_X86_32 */ put_user_ex(fpstate, sc-fpstate); @@ -457,9 +451,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs-sp = (unsigned long)frame; - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ + /* +* Set up the CS and SS registers to run signal handlers in +* 64-bit mode, even if the handler happens to be interrupting +* 32-bit or 16-bit code. +* +* SS is subtle. In 64-bit mode, we don't need any particular +* SS descriptor, but we do need SS to be valid. It's possible +* that the old SS is entirely bogus -- this can happen if the +* signal we're trying to deliver is #GP or #SS caused by a bad +* SS value. +*/ regs-cs = __USER_CS; + regs-ss = __USER_DS; return 0; } -- 2.3.0 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ -- To
Re: [PATCH v3 1/2] x86_64,signal: Fix SS handling for signals delivered to 64-bit programs
2015-03-18 21:50 GMT+03:00 Cyrill Gorcunov gorcu...@gmail.com: On Wed, Mar 18, 2015 at 07:31:33PM +0100, Oleg Nesterov wrote: On 03/18, Cyrill Gorcunov wrote: On Wed, Mar 18, 2015 at 06:48:43PM +0100, Oleg Nesterov wrote: Shot in a dark afer a quick grep: restore_gpregs() should initialize -ss? It hasn't been needed earlier, if this would help it means abi is broken, no? :) Otherwise I don't understand what's happening. until this commit the kernel simply forgot to restore -ss in sigreturn(). after this commit, if your rt_sigframe has garbage in -ss then SIGSEGV is clear. OK, so setting up a proper ss here should fix problem. Seriously, where is UserX86RegsEntry? It's in protobif/core-x86.proto, welcome to protobuf hell. Still can't find UserX86RegsEntry... OK, perhaps it comes from user_x86_regs_entry in protobuf/core-x86.proto. Then I guess the patch I sent can be compiled at least ;) Yeah, protobuf-c mangles _ in message name so user_x86_regs_entry - UserX86RegsEntry. Andrew should be able to test it tomorrow (hopefully it's no urgent right? ;) This patch fixes the problem. Oleg, could you send this path in the criu maillist? -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] selftests/kcmp: exit with non-zero code in a fail case
2015-03-13 13:37 GMT+03:00 Michael Ellerman : > On Fri, 2015-03-13 at 12:27 +0300, Andrey Vagin wrote: >> diff --git a/tools/testing/selftests/kselftest.h >> b/tools/testing/selftests/kselftest.h >> index 572c888..a0ec8b8 100644 >> --- a/tools/testing/selftests/kselftest.h >> +++ b/tools/testing/selftests/kselftest.h >> @@ -58,5 +58,17 @@ static inline int ksft_exit_skip(void) >> { >> exit(4); >> } >> +static inline int ksft_exit(void) >> +{ >> + if (ksft_cnt.ksft_fail) >> + return ksft_exit_fail(); >> + if (ksft_cnt.ksft_xpass) >> + return ksft_exit_xpass(); >> + if (ksft_cnt.ksft_xskip) >> + return ksft_exit_skip(); >> + if (ksft_cnt.ksft_xfail) >> + return ksft_exit_xfail(); >> + ksft_exit_pass(); >> +} > > This function claims to return 'int', but doesn't. So do all the others. I agree with this. I haven't seen any warning, because exit() never returns back. > > It could be as simple as: I think it isn't a good idea. In my version we can be sure that ksft_exit_fail() and ksft_exit() return the same code in a fail case. > > static inline void ksft_exit(void) > { > if (ksft_cnt.ksft_fail) > exit(1); > > if (ksft_cnt.ksft_xpass) > exit(3); > > if (ksft_cnt.ksft_xskip) > exit(4); > > if (ksft_cnt.ksft_xfail) > exit(2); > > exit(0); > } > > cheers > > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] selftests/kcmp: exit with non-zero code in a fail case
2015-03-13 13:37 GMT+03:00 Michael Ellerman m...@ellerman.id.au: On Fri, 2015-03-13 at 12:27 +0300, Andrey Vagin wrote: diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 572c888..a0ec8b8 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -58,5 +58,17 @@ static inline int ksft_exit_skip(void) { exit(4); } +static inline int ksft_exit(void) +{ + if (ksft_cnt.ksft_fail) + return ksft_exit_fail(); + if (ksft_cnt.ksft_xpass) + return ksft_exit_xpass(); + if (ksft_cnt.ksft_xskip) + return ksft_exit_skip(); + if (ksft_cnt.ksft_xfail) + return ksft_exit_xfail(); + ksft_exit_pass(); +} This function claims to return 'int', but doesn't. So do all the others. I agree with this. I haven't seen any warning, because exit() never returns back. It could be as simple as: I think it isn't a good idea. In my version we can be sure that ksft_exit_fail() and ksft_exit() return the same code in a fail case. static inline void ksft_exit(void) { if (ksft_cnt.ksft_fail) exit(1); if (ksft_cnt.ksft_xpass) exit(3); if (ksft_cnt.ksft_xskip) exit(4); if (ksft_cnt.ksft_xfail) exit(2); exit(0); } cheers -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] proc: show locks in /proc/pid/fdinfo/X
2015-03-12 22:23 GMT+03:00 Andrew Morton : > On Thu, 12 Mar 2015 18:54:42 +0300 Andrew Vagin wrote: > >> v2: use seq_has_overflowed() properly > > --- a/fs/proc/fd.c~proc-show-locks-in-proc-pid-fdinfo-x-v2 > +++ a/fs/proc/fd.c > @@ -57,17 +57,15 @@ static int seq_show(struct seq_file *m, >real_mount(file->f_path.mnt)->mnt_id); > > show_fd_locks(m, file, files); > - ret = seq_has_overflowed(m); > - if (ret) > + if (seq_has_overflowed(m)) > goto out; > > if (file->f_op->show_fdinfo) > file->f_op->show_fdinfo(m, file); > - ret = seq_has_overflowed(m); > > out: > fput(file); > - return ret; > + return 0; > } > > static int seq_fdinfo_open(struct inode *inode, struct file *file) > > > So it returns "success" when the output has overflowed? Why this, > rather than returning an error? I have read fs/seq_file.c and looks like it's the right way. seq_has_overflowed() is used to avoid useless work. If we call it or don't call it, the result must be the same in both cases. So from this point of view it looks logically correct too. There are two interesting places from seq_file.c error = m->op->show(m, p); if (error < 0) break; if (unlikely(error)) { error = 0; m->count = 0; } if (seq_has_overflowed(m)) goto Eoverflow; ... err = m->op->show(m, p); if (err < 0) break; if (unlikely(err)) m->count = 0; if (unlikely(!m->count)) { p = m->op->next(m, p, ); m->index = pos; continue; } if (m->count < m->size) goto Fill; m->op->stop(m, p); kvfree(m->buf); m->count = 0; m->buf = seq_buf_alloc(m->size <<= 1); Thanks, Andrey -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] proc: show locks in /proc/pid/fdinfo/X
2015-03-12 22:23 GMT+03:00 Andrew Morton a...@linux-foundation.org: On Thu, 12 Mar 2015 18:54:42 +0300 Andrew Vagin ava...@parallels.com wrote: v2: use seq_has_overflowed() properly --- a/fs/proc/fd.c~proc-show-locks-in-proc-pid-fdinfo-x-v2 +++ a/fs/proc/fd.c @@ -57,17 +57,15 @@ static int seq_show(struct seq_file *m, real_mount(file-f_path.mnt)-mnt_id); show_fd_locks(m, file, files); - ret = seq_has_overflowed(m); - if (ret) + if (seq_has_overflowed(m)) goto out; if (file-f_op-show_fdinfo) file-f_op-show_fdinfo(m, file); - ret = seq_has_overflowed(m); out: fput(file); - return ret; + return 0; } static int seq_fdinfo_open(struct inode *inode, struct file *file) So it returns success when the output has overflowed? Why this, rather than returning an error? I have read fs/seq_file.c and looks like it's the right way. seq_has_overflowed() is used to avoid useless work. If we call it or don't call it, the result must be the same in both cases. So from this point of view it looks logically correct too. There are two interesting places from seq_file.c error = m-op-show(m, p); if (error 0) break; if (unlikely(error)) { error = 0; m-count = 0; } if (seq_has_overflowed(m)) goto Eoverflow; ... err = m-op-show(m, p); if (err 0) break; if (unlikely(err)) m-count = 0; if (unlikely(!m-count)) { p = m-op-next(m, p, pos); m-index = pos; continue; } if (m-count m-size) goto Fill; m-op-stop(m, p); kvfree(m-buf); m-count = 0; m-buf = seq_buf_alloc(m-size = 1); Thanks, Andrey -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/3 v3] x86: entry_64.S: always allocate complete "struct pt_regs"
2015-02-25 21:42 GMT+03:00 Denys Vlasenko : > On 02/25/2015 01:37 PM, Andrey Wagin wrote: >> 2015-02-13 0:54 GMT+03:00 Denys Vlasenko : >>> 64-bit code was using six stack slots less by not saving/restoring >>> registers which are callee-preserved according to C ABI, >>> and not allocating space for them. >>> Only when syscall needed a complete "struct pt_regs", >>> the complete area was allocated and filled in. >>> As an additional twist, on interrupt entry a "slightly less truncated >>> pt_regs" >>> trick is used, to make nested interrupt stacks easier to unwind. >>> >>> This proved to be a source of significant obfuscation and subtle bugs. >>> For example, stub_fork had to pop the return address, >>> extend the struct, save registers, and push return address back. Ugly. >>> ia32_ptregs_common pops return address and "returns" via jmp insn, >>> throwing a wrench into CPU return stack cache. >>> >>> This patch changes code to always allocate a complete "struct pt_regs". >>> The saving of registers is still done lazily. >>> >>> "Partial pt_regs" trick on interrupt stack is retained. >>> >>> Macros which manipulate "struct pt_regs" on stack are reworked: >>> ALLOC_PT_GPREGS_ON_STACK allocates the structure. >>> SAVE_C_REGS saves to it those registers which are clobbered by C code. >>> SAVE_EXTRA_REGS saves to it all other registers. >>> Corresponding RESTORE_* and REMOVE_PT_GPREGS_FROM_STACK macros reverse it. >>> >>> ia32_ptregs_common, stub_fork and friends lost their ugly dance with >>> return pointer. >>> >>> LOAD_ARGS32 in ia32entry.S now uses symbolic stack offsets >>> instead of magic numbers. >>> >>> error_entry and save_paranoid now use SAVE_C_REGS + SAVE_EXTRA_REGS >>> instead of having it open-coded yet again. >>> >>> Patch was run-tested: 64-bit executables, 32-bit executables, >>> strace works. >>> Timing tests did not show measurable difference in 32-bit >>> and 64-bit syscalls. >> >> Hello Denys, >> >> My test vm doesn't boot with this patch. Could you help to investigate >> this issue? > > I think I found it. This part of my patch is possibly wrong: > > @@ -171,9 +171,9 @@ static inline int arch_irqs_disabled(void) > #define ARCH_LOCKDEP_SYS_EXIT_IRQ \ > TRACE_IRQS_ON; \ > sti; \ > - SAVE_REST; \ > + SAVE_EXTRA_REGS; \ > LOCKDEP_SYS_EXIT; \ > - RESTORE_REST; \ > + RESTORE_EXTRA_REGS; \ > cli; \ > TRACE_IRQS_OFF; > > The "SAVE_REST" here is intended to really *push* extra regs on stack, > but the patch changed it so that they are written to existing stack > slots above. > > From code inspection it should work in almost all cases, but some > locations where it is used are really obscure. > > If there are places where *pushing* regs is really necessary, > this can corrupt rbp,rbx,r12-15 registers. > > Your config has CONFIG_LOCKDEP=y, I think it's worth trying whether the bug > was here. > Please find updated patch attached. Can you try it? It doesn't work [2.282198] microcode: CPU0 sig=0x623, pf=0x0, revision=0x1 [2.288321] microcode: CPU1 sig=0x623, pf=0x0, revision=0x1 [2.289139] microcode: CPU2 sig=0x623, pf=0x0, revision=0x1 [2.290618] microcode: CPU3 sig=0x623, pf=0x0, revision=0x1 [2.292417] microcode: Microcode Update Driver: v2.00 , Peter Oruba [2.392592] piix4_smbus :00:01.3: SMBus Host Controller at 0xb100, revision 0 [2.510882] systemd-fsck[349]: /dev/sda1: clean, 343/128016 files, 166911/512000 blocks [2.521463] Adding 4128764k swap on /dev/sda2. Priority:-1 extents:1 across:4128764k FS [2.573597] EXT4-fs (sda1): mounted filesystem with ordered data mode. Opts: (null) [2.597771] systemd-journald[283]: Received request to flush runtime journal from PID 1 [2.802288] audit: type=1305 audit(1424892361.629:3): audit_pid=369 old=0 auid=4294967295 ses=4294967295 res=1 [2.819660] traps: systemd-cgroups[380] general protection ip:7fb227939028 sp:7fff6bac59c8 error:0 in ld-2.18.so[7fb227921000+2] [3.016262] traps: systemd-cgroups[390] general protection ip:7f456f7b6028 sp:7fffdc059718 error:0 in ld-2.18.so[7f456f79e000+2] > > -- > vda > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/3 v3] x86: entry_64.S: always allocate complete struct pt_regs
2015-02-25 21:42 GMT+03:00 Denys Vlasenko dvlas...@redhat.com: On 02/25/2015 01:37 PM, Andrey Wagin wrote: 2015-02-13 0:54 GMT+03:00 Denys Vlasenko dvlas...@redhat.com: 64-bit code was using six stack slots less by not saving/restoring registers which are callee-preserved according to C ABI, and not allocating space for them. Only when syscall needed a complete struct pt_regs, the complete area was allocated and filled in. As an additional twist, on interrupt entry a slightly less truncated pt_regs trick is used, to make nested interrupt stacks easier to unwind. This proved to be a source of significant obfuscation and subtle bugs. For example, stub_fork had to pop the return address, extend the struct, save registers, and push return address back. Ugly. ia32_ptregs_common pops return address and returns via jmp insn, throwing a wrench into CPU return stack cache. This patch changes code to always allocate a complete struct pt_regs. The saving of registers is still done lazily. Partial pt_regs trick on interrupt stack is retained. Macros which manipulate struct pt_regs on stack are reworked: ALLOC_PT_GPREGS_ON_STACK allocates the structure. SAVE_C_REGS saves to it those registers which are clobbered by C code. SAVE_EXTRA_REGS saves to it all other registers. Corresponding RESTORE_* and REMOVE_PT_GPREGS_FROM_STACK macros reverse it. ia32_ptregs_common, stub_fork and friends lost their ugly dance with return pointer. LOAD_ARGS32 in ia32entry.S now uses symbolic stack offsets instead of magic numbers. error_entry and save_paranoid now use SAVE_C_REGS + SAVE_EXTRA_REGS instead of having it open-coded yet again. Patch was run-tested: 64-bit executables, 32-bit executables, strace works. Timing tests did not show measurable difference in 32-bit and 64-bit syscalls. Hello Denys, My test vm doesn't boot with this patch. Could you help to investigate this issue? I think I found it. This part of my patch is possibly wrong: @@ -171,9 +171,9 @@ static inline int arch_irqs_disabled(void) #define ARCH_LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ sti; \ - SAVE_REST; \ + SAVE_EXTRA_REGS; \ LOCKDEP_SYS_EXIT; \ - RESTORE_REST; \ + RESTORE_EXTRA_REGS; \ cli; \ TRACE_IRQS_OFF; The SAVE_REST here is intended to really *push* extra regs on stack, but the patch changed it so that they are written to existing stack slots above. From code inspection it should work in almost all cases, but some locations where it is used are really obscure. If there are places where *pushing* regs is really necessary, this can corrupt rbp,rbx,r12-15 registers. Your config has CONFIG_LOCKDEP=y, I think it's worth trying whether the bug was here. Please find updated patch attached. Can you try it? It doesn't work [2.282198] microcode: CPU0 sig=0x623, pf=0x0, revision=0x1 [2.288321] microcode: CPU1 sig=0x623, pf=0x0, revision=0x1 [2.289139] microcode: CPU2 sig=0x623, pf=0x0, revision=0x1 [2.290618] microcode: CPU3 sig=0x623, pf=0x0, revision=0x1 [2.292417] microcode: Microcode Update Driver: v2.00 tig...@aivazian.fsnet.co.uk, Peter Oruba [2.392592] piix4_smbus :00:01.3: SMBus Host Controller at 0xb100, revision 0 [2.510882] systemd-fsck[349]: /dev/sda1: clean, 343/128016 files, 166911/512000 blocks [2.521463] Adding 4128764k swap on /dev/sda2. Priority:-1 extents:1 across:4128764k FS [2.573597] EXT4-fs (sda1): mounted filesystem with ordered data mode. Opts: (null) [2.597771] systemd-journald[283]: Received request to flush runtime journal from PID 1 [2.802288] audit: type=1305 audit(1424892361.629:3): audit_pid=369 old=0 auid=4294967295 ses=4294967295 res=1 [2.819660] traps: systemd-cgroups[380] general protection ip:7fb227939028 sp:7fff6bac59c8 error:0 in ld-2.18.so[7fb227921000+2] [3.016262] traps: systemd-cgroups[390] general protection ip:7f456f7b6028 sp:7fffdc059718 error:0 in ld-2.18.so[7f456f79e000+2] -- vda -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!
2014-10-06 23:44 GMT+04:00 Serge E. Hallyn : > Quoting Andrey Wagin (ava...@gmail.com): >> 2014-10-06 22:19 GMT+04:00 Serge E. Hallyn : >> > Quoting Andrey Wagin (ava...@gmail.com): >> >> Hi All, >> >> >> >> Here is a small program, which triggers a bug. Is it interested for >> >> someone? >> > >> > Yup, would >> > >> > diff --git a/fs/namespace.c b/fs/namespace.c >> > index ef42d9b..ddef25d 100644 >> > --- a/fs/namespace.c >> > +++ b/fs/namespace.c >> > @@ -3046,6 +3046,9 @@ static int mntns_install(struct nsproxy *nsproxy, >> > void *ns) >> > struct mnt_namespace *mnt_ns = ns; >> > struct path root; >> > >> > + if (mnt_ns == nsproxy->mnt_ns) >> > + return 0; >> > + >> > if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || >> > !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || >> > !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) >> > >> > fix it? >> >> Actually the bug is triggered from the second umount. I think we need > > Oh, I see. > >> something like this: >> diff --git a/fs/namespace.c b/fs/namespace.c >> index 68685b2..bbcd92c 100644 >> --- a/fs/namespace.c >> +++ b/fs/namespace.c >> @@ -1363,6 +1363,9 @@ static int do_umount(struct mount *mnt, int flags) >> return retval; >> } >> >> + if (mnt->mnt_parent == mnt) > > looks good (or the equivalent mnt_has_parent(mnt)). Have you tested that > this prevents your test program from causing a crash? Yes, I have. Look the strace output below. Today here is too late for sending a proper patch. > >> + return -EINVAL; >> + >> namespace_lock(); >> lock_mount_hash(); >> event++; >> >> root@ubuntu:/home/avagin# strace ./a.out >> ... >> open("/proc/self/ns/mnt", O_RDONLY) = 3 >> umount("/", MNT_DETACH) = 0 >> setns(3, 131072)= 0 >> umount("/", MNT_DETACH) = -1 EINVAL (Invalid argument) >> exit_group(0) = ? >> +++ exited with 0 +++ >> >> > >> >> root@ubuntu:/home/avagin# cat nsenter.c >> >> #define _GNU_SOURCE /* See feature_test_macros(7) */ >> >> #include >> >> #include >> >> #include >> >> #include >> >> #include >> >> #include >> >> >> >> int main(int argc, char **argv) >> >> { >> >> int fd; >> >> >> >> fd = open("/proc/self/ns/mnt", O_RDONLY); >> >> if (fd < 0) >> >> return 1; >> >> umount2("/", MNT_DETACH); >> >> if (setns(fd, CLONE_NEWNS)) >> >> return 1; >> >> umount2("/", MNT_DETACH); >> >> >> >> return 0; >> >> } >> >> >> >> root@ubuntu:/home/avagin# gcc nsenter.c -o nsenter >> >> root@ubuntu:/home/avagin# ./nsenter >> >> >> >> >> >> [ 260.548301] [ cut here ] >> >> [ 260.550941] kernel BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372! >> >> [ 260.552068] invalid opcode: [#1] SMP >> >> [ 260.552068] Modules linked in: xt_CHECKSUM iptable_mangle xt_tcpudp >> >> xt_addrtype xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 >> >> nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack bridge stp llc >> >> dm_thin_pool dm_persistent_data dm_bufio dm_bio_prison iptable_filter >> >> ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel >> >> binfmt_misc nfsd auth_rpcgss nfs_acl aesni_intel nfs lockd aes_x86_64 >> >> sunrpc fscache lrw gf128mul glue_helper ablk_helper cryptd serio_raw >> >> ppdev parport_pc lp parport btrfs xor raid6_pq libcrc32c psmouse >> >> floppy >> >> [ 260.552068] CPU: 0 PID: 1723 Comm: nsenter Not tainted >> >> 3.13.0-30-generic #55-Ubuntu >> >> [ 260.552068] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 >> >> [ 260.552068] task: 8800376097f0 ti: 880074824000 task.ti: >> >> 880074824000 >> >> [ 260.552068] RIP: 0010:[] [] >> >> propagate_umount+0x123/0x130 >> >> [ 26
Re: BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!
2014-10-06 22:19 GMT+04:00 Serge E. Hallyn : > Quoting Andrey Wagin (ava...@gmail.com): >> Hi All, >> >> Here is a small program, which triggers a bug. Is it interested for someone? > > Yup, would > > diff --git a/fs/namespace.c b/fs/namespace.c > index ef42d9b..ddef25d 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -3046,6 +3046,9 @@ static int mntns_install(struct nsproxy *nsproxy, void > *ns) > struct mnt_namespace *mnt_ns = ns; > struct path root; > > + if (mnt_ns == nsproxy->mnt_ns) > + return 0; > + > if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || > !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || > !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) > > fix it? Actually the bug is triggered from the second umount. I think we need something like this: diff --git a/fs/namespace.c b/fs/namespace.c index 68685b2..bbcd92c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1363,6 +1363,9 @@ static int do_umount(struct mount *mnt, int flags) return retval; } + if (mnt->mnt_parent == mnt) + return -EINVAL; + namespace_lock(); lock_mount_hash(); event++; root@ubuntu:/home/avagin# strace ./a.out ... open("/proc/self/ns/mnt", O_RDONLY) = 3 umount("/", MNT_DETACH) = 0 setns(3, 131072)= 0 umount("/", MNT_DETACH) = -1 EINVAL (Invalid argument) exit_group(0) = ? +++ exited with 0 +++ > >> root@ubuntu:/home/avagin# cat nsenter.c >> #define _GNU_SOURCE /* See feature_test_macros(7) */ >> #include >> #include >> #include >> #include >> #include >> #include >> >> int main(int argc, char **argv) >> { >> int fd; >> >> fd = open("/proc/self/ns/mnt", O_RDONLY); >> if (fd < 0) >> return 1; >> umount2("/", MNT_DETACH); >> if (setns(fd, CLONE_NEWNS)) >> return 1; >> umount2("/", MNT_DETACH); >> >> return 0; >> } >> >> root@ubuntu:/home/avagin# gcc nsenter.c -o nsenter >> root@ubuntu:/home/avagin# ./nsenter >> >> >> [ 260.548301] [ cut here ] >> [ 260.550941] kernel BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372! >> [ 260.552068] invalid opcode: [#1] SMP >> [ 260.552068] Modules linked in: xt_CHECKSUM iptable_mangle xt_tcpudp >> xt_addrtype xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 >> nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack bridge stp llc >> dm_thin_pool dm_persistent_data dm_bufio dm_bio_prison iptable_filter >> ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel >> binfmt_misc nfsd auth_rpcgss nfs_acl aesni_intel nfs lockd aes_x86_64 >> sunrpc fscache lrw gf128mul glue_helper ablk_helper cryptd serio_raw >> ppdev parport_pc lp parport btrfs xor raid6_pq libcrc32c psmouse >> floppy >> [ 260.552068] CPU: 0 PID: 1723 Comm: nsenter Not tainted >> 3.13.0-30-generic #55-Ubuntu >> [ 260.552068] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 >> [ 260.552068] task: 8800376097f0 ti: 880074824000 task.ti: >> 880074824000 >> [ 260.552068] RIP: 0010:[] [] >> propagate_umount+0x123/0x130 >> [ 260.552068] RSP: 0018:880074825e98 EFLAGS: 00010246 >> [ 260.552068] RAX: 88007c741140 RBX: 0002 RCX: >> 88007c741190 >> [ 260.552068] RDX: 88007c741190 RSI: 880074825ec0 RDI: >> 880074825ec0 >> [ 260.552068] RBP: 880074825eb0 R08: 000172e0 R09: >> 88007fc172e0 >> [ 260.552068] R10: 811cc642 R11: ea0001d59000 R12: >> 88007c741140 >> [ 260.552068] R13: 88007c741140 R14: 88007c741140 R15: >> >> [ 260.552068] FS: 7fd5c7e41740() GS:88007fc0() >> knlGS: >> [ 260.552068] CS: 0010 DS: ES: CR0: 80050033 >> [ 260.552068] CR2: 7fd5c7968050 CR3: 70124000 CR4: >> 000406f0 >> [ 260.552068] Stack: >> [ 260.552068] 0002 0002 88007c631000 >> 880074825ed8 >> [ 260.552068] 811dcfac 88007c741140 0002 >> 88007c741160 >> [ 260.552068] 880074825f38 811dd12b 811cc642 >> 7564 >> [ 260.552068] Call Trace: >> [ 260.552068] [] umount_tree+0x20c/0x260 >> [ 260.552068] [] do
BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!
Hi All, Here is a small program, which triggers a bug. Is it interested for someone? root@ubuntu:/home/avagin# cat nsenter.c #define _GNU_SOURCE /* See feature_test_macros(7) */ #include #include #include #include #include #include int main(int argc, char **argv) { int fd; fd = open("/proc/self/ns/mnt", O_RDONLY); if (fd < 0) return 1; umount2("/", MNT_DETACH); if (setns(fd, CLONE_NEWNS)) return 1; umount2("/", MNT_DETACH); return 0; } root@ubuntu:/home/avagin# gcc nsenter.c -o nsenter root@ubuntu:/home/avagin# ./nsenter [ 260.548301] [ cut here ] [ 260.550941] kernel BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372! [ 260.552068] invalid opcode: [#1] SMP [ 260.552068] Modules linked in: xt_CHECKSUM iptable_mangle xt_tcpudp xt_addrtype xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack bridge stp llc dm_thin_pool dm_persistent_data dm_bufio dm_bio_prison iptable_filter ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel binfmt_misc nfsd auth_rpcgss nfs_acl aesni_intel nfs lockd aes_x86_64 sunrpc fscache lrw gf128mul glue_helper ablk_helper cryptd serio_raw ppdev parport_pc lp parport btrfs xor raid6_pq libcrc32c psmouse floppy [ 260.552068] CPU: 0 PID: 1723 Comm: nsenter Not tainted 3.13.0-30-generic #55-Ubuntu [ 260.552068] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 260.552068] task: 8800376097f0 ti: 880074824000 task.ti: 880074824000 [ 260.552068] RIP: 0010:[] [] propagate_umount+0x123/0x130 [ 260.552068] RSP: 0018:880074825e98 EFLAGS: 00010246 [ 260.552068] RAX: 88007c741140 RBX: 0002 RCX: 88007c741190 [ 260.552068] RDX: 88007c741190 RSI: 880074825ec0 RDI: 880074825ec0 [ 260.552068] RBP: 880074825eb0 R08: 000172e0 R09: 88007fc172e0 [ 260.552068] R10: 811cc642 R11: ea0001d59000 R12: 88007c741140 [ 260.552068] R13: 88007c741140 R14: 88007c741140 R15: [ 260.552068] FS: 7fd5c7e41740() GS:88007fc0() knlGS: [ 260.552068] CS: 0010 DS: ES: CR0: 80050033 [ 260.552068] CR2: 7fd5c7968050 CR3: 70124000 CR4: 000406f0 [ 260.552068] Stack: [ 260.552068] 0002 0002 88007c631000 880074825ed8 [ 260.552068] 811dcfac 88007c741140 0002 88007c741160 [ 260.552068] 880074825f38 811dd12b 811cc642 7564 [ 260.552068] Call Trace: [ 260.552068] [] umount_tree+0x20c/0x260 [ 260.552068] [] do_umount+0x12b/0x300 [ 260.552068] [] ? final_putname+0x22/0x50 [ 260.552068] [] ? putname+0x29/0x40 [ 260.552068] [] SyS_umount+0xdc/0x100 [ 260.552068] [] tracesys+0xe1/0xe6 [ 260.552068] Code: 89 50 08 48 8b 50 08 48 89 02 49 89 45 08 e9 72 ff ff ff 0f 1f 44 00 00 4c 89 e6 4c 89 e7 e8 f5 f6 ff ff 48 89 c3 e9 39 ff ff ff <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 90 66 66 66 66 90 55 b8 01 [ 260.552068] RIP [] propagate_umount+0x123/0x130 [ 260.552068] RSP [ 260.611451] ---[ end trace 11c33d85f1d4c652 ]--- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!
Hi All, Here is a small program, which triggers a bug. Is it interested for someone? root@ubuntu:/home/avagin# cat nsenter.c #define _GNU_SOURCE /* See feature_test_macros(7) */ #include sched.h #include sys/types.h #include sys/stat.h #include fcntl.h #include unistd.h #include sys/mount.h int main(int argc, char **argv) { int fd; fd = open(/proc/self/ns/mnt, O_RDONLY); if (fd 0) return 1; umount2(/, MNT_DETACH); if (setns(fd, CLONE_NEWNS)) return 1; umount2(/, MNT_DETACH); return 0; } root@ubuntu:/home/avagin# gcc nsenter.c -o nsenter root@ubuntu:/home/avagin# ./nsenter [ 260.548301] [ cut here ] [ 260.550941] kernel BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372! [ 260.552068] invalid opcode: [#1] SMP [ 260.552068] Modules linked in: xt_CHECKSUM iptable_mangle xt_tcpudp xt_addrtype xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack bridge stp llc dm_thin_pool dm_persistent_data dm_bufio dm_bio_prison iptable_filter ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel binfmt_misc nfsd auth_rpcgss nfs_acl aesni_intel nfs lockd aes_x86_64 sunrpc fscache lrw gf128mul glue_helper ablk_helper cryptd serio_raw ppdev parport_pc lp parport btrfs xor raid6_pq libcrc32c psmouse floppy [ 260.552068] CPU: 0 PID: 1723 Comm: nsenter Not tainted 3.13.0-30-generic #55-Ubuntu [ 260.552068] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 260.552068] task: 8800376097f0 ti: 880074824000 task.ti: 880074824000 [ 260.552068] RIP: 0010:[811e9483] [811e9483] propagate_umount+0x123/0x130 [ 260.552068] RSP: 0018:880074825e98 EFLAGS: 00010246 [ 260.552068] RAX: 88007c741140 RBX: 0002 RCX: 88007c741190 [ 260.552068] RDX: 88007c741190 RSI: 880074825ec0 RDI: 880074825ec0 [ 260.552068] RBP: 880074825eb0 R08: 000172e0 R09: 88007fc172e0 [ 260.552068] R10: 811cc642 R11: ea0001d59000 R12: 88007c741140 [ 260.552068] R13: 88007c741140 R14: 88007c741140 R15: [ 260.552068] FS: 7fd5c7e41740() GS:88007fc0() knlGS: [ 260.552068] CS: 0010 DS: ES: CR0: 80050033 [ 260.552068] CR2: 7fd5c7968050 CR3: 70124000 CR4: 000406f0 [ 260.552068] Stack: [ 260.552068] 0002 0002 88007c631000 880074825ed8 [ 260.552068] 811dcfac 88007c741140 0002 88007c741160 [ 260.552068] 880074825f38 811dd12b 811cc642 7564 [ 260.552068] Call Trace: [ 260.552068] [811dcfac] umount_tree+0x20c/0x260 [ 260.552068] [811dd12b] do_umount+0x12b/0x300 [ 260.552068] [811cc642] ? final_putname+0x22/0x50 [ 260.552068] [811cc849] ? putname+0x29/0x40 [ 260.552068] [811dd88c] SyS_umount+0xdc/0x100 [ 260.552068] [8172aeff] tracesys+0xe1/0xe6 [ 260.552068] Code: 89 50 08 48 8b 50 08 48 89 02 49 89 45 08 e9 72 ff ff ff 0f 1f 44 00 00 4c 89 e6 4c 89 e7 e8 f5 f6 ff ff 48 89 c3 e9 39 ff ff ff 0f 0b 66 2e 0f 1f 84 00 00 00 00 00 90 66 66 66 66 90 55 b8 01 [ 260.552068] RIP [811e9483] propagate_umount+0x123/0x130 [ 260.552068] RSP 880074825e98 [ 260.611451] ---[ end trace 11c33d85f1d4c652 ]--- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!
2014-10-06 22:19 GMT+04:00 Serge E. Hallyn se...@hallyn.com: Quoting Andrey Wagin (ava...@gmail.com): Hi All, Here is a small program, which triggers a bug. Is it interested for someone? Yup, would diff --git a/fs/namespace.c b/fs/namespace.c index ef42d9b..ddef25d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3046,6 +3046,9 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) struct mnt_namespace *mnt_ns = ns; struct path root; + if (mnt_ns == nsproxy-mnt_ns) + return 0; + if (!ns_capable(mnt_ns-user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) fix it? Actually the bug is triggered from the second umount. I think we need something like this: diff --git a/fs/namespace.c b/fs/namespace.c index 68685b2..bbcd92c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1363,6 +1363,9 @@ static int do_umount(struct mount *mnt, int flags) return retval; } + if (mnt-mnt_parent == mnt) + return -EINVAL; + namespace_lock(); lock_mount_hash(); event++; root@ubuntu:/home/avagin# strace ./a.out ... open(/proc/self/ns/mnt, O_RDONLY) = 3 umount(/, MNT_DETACH) = 0 setns(3, 131072)= 0 umount(/, MNT_DETACH) = -1 EINVAL (Invalid argument) exit_group(0) = ? +++ exited with 0 +++ root@ubuntu:/home/avagin# cat nsenter.c #define _GNU_SOURCE /* See feature_test_macros(7) */ #include sched.h #include sys/types.h #include sys/stat.h #include fcntl.h #include unistd.h #include sys/mount.h int main(int argc, char **argv) { int fd; fd = open(/proc/self/ns/mnt, O_RDONLY); if (fd 0) return 1; umount2(/, MNT_DETACH); if (setns(fd, CLONE_NEWNS)) return 1; umount2(/, MNT_DETACH); return 0; } root@ubuntu:/home/avagin# gcc nsenter.c -o nsenter root@ubuntu:/home/avagin# ./nsenter [ 260.548301] [ cut here ] [ 260.550941] kernel BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372! [ 260.552068] invalid opcode: [#1] SMP [ 260.552068] Modules linked in: xt_CHECKSUM iptable_mangle xt_tcpudp xt_addrtype xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack bridge stp llc dm_thin_pool dm_persistent_data dm_bufio dm_bio_prison iptable_filter ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel binfmt_misc nfsd auth_rpcgss nfs_acl aesni_intel nfs lockd aes_x86_64 sunrpc fscache lrw gf128mul glue_helper ablk_helper cryptd serio_raw ppdev parport_pc lp parport btrfs xor raid6_pq libcrc32c psmouse floppy [ 260.552068] CPU: 0 PID: 1723 Comm: nsenter Not tainted 3.13.0-30-generic #55-Ubuntu [ 260.552068] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 260.552068] task: 8800376097f0 ti: 880074824000 task.ti: 880074824000 [ 260.552068] RIP: 0010:[811e9483] [811e9483] propagate_umount+0x123/0x130 [ 260.552068] RSP: 0018:880074825e98 EFLAGS: 00010246 [ 260.552068] RAX: 88007c741140 RBX: 0002 RCX: 88007c741190 [ 260.552068] RDX: 88007c741190 RSI: 880074825ec0 RDI: 880074825ec0 [ 260.552068] RBP: 880074825eb0 R08: 000172e0 R09: 88007fc172e0 [ 260.552068] R10: 811cc642 R11: ea0001d59000 R12: 88007c741140 [ 260.552068] R13: 88007c741140 R14: 88007c741140 R15: [ 260.552068] FS: 7fd5c7e41740() GS:88007fc0() knlGS: [ 260.552068] CS: 0010 DS: ES: CR0: 80050033 [ 260.552068] CR2: 7fd5c7968050 CR3: 70124000 CR4: 000406f0 [ 260.552068] Stack: [ 260.552068] 0002 0002 88007c631000 880074825ed8 [ 260.552068] 811dcfac 88007c741140 0002 88007c741160 [ 260.552068] 880074825f38 811dd12b 811cc642 7564 [ 260.552068] Call Trace: [ 260.552068] [811dcfac] umount_tree+0x20c/0x260 [ 260.552068] [811dd12b] do_umount+0x12b/0x300 [ 260.552068] [811cc642] ? final_putname+0x22/0x50 [ 260.552068] [811cc849] ? putname+0x29/0x40 [ 260.552068] [811dd88c] SyS_umount+0xdc/0x100 [ 260.552068] [8172aeff] tracesys+0xe1/0xe6 [ 260.552068] Code: 89 50 08 48 8b 50 08 48 89 02 49 89 45 08 e9 72 ff ff ff 0f 1f 44 00 00 4c 89 e6 4c 89 e7 e8 f5 f6 ff ff 48 89 c3 e9 39 ff ff ff 0f 0b 66 2e 0f 1f 84 00 00 00 00 00 90 66 66 66 66 90 55 b8 01 [ 260.552068] RIP [811e9483] propagate_umount+0x123/0x130 [ 260.552068] RSP 880074825e98 [ 260.611451] ---[ end trace 11c33d85f1d4c652 ]--- -- To unsubscribe from
Re: BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!
2014-10-06 23:44 GMT+04:00 Serge E. Hallyn se...@hallyn.com: Quoting Andrey Wagin (ava...@gmail.com): 2014-10-06 22:19 GMT+04:00 Serge E. Hallyn se...@hallyn.com: Quoting Andrey Wagin (ava...@gmail.com): Hi All, Here is a small program, which triggers a bug. Is it interested for someone? Yup, would diff --git a/fs/namespace.c b/fs/namespace.c index ef42d9b..ddef25d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3046,6 +3046,9 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) struct mnt_namespace *mnt_ns = ns; struct path root; + if (mnt_ns == nsproxy-mnt_ns) + return 0; + if (!ns_capable(mnt_ns-user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) fix it? Actually the bug is triggered from the second umount. I think we need Oh, I see. something like this: diff --git a/fs/namespace.c b/fs/namespace.c index 68685b2..bbcd92c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1363,6 +1363,9 @@ static int do_umount(struct mount *mnt, int flags) return retval; } + if (mnt-mnt_parent == mnt) looks good (or the equivalent mnt_has_parent(mnt)). Have you tested that this prevents your test program from causing a crash? Yes, I have. Look the strace output below. Today here is too late for sending a proper patch. + return -EINVAL; + namespace_lock(); lock_mount_hash(); event++; root@ubuntu:/home/avagin# strace ./a.out ... open(/proc/self/ns/mnt, O_RDONLY) = 3 umount(/, MNT_DETACH) = 0 setns(3, 131072)= 0 umount(/, MNT_DETACH) = -1 EINVAL (Invalid argument) exit_group(0) = ? +++ exited with 0 +++ root@ubuntu:/home/avagin# cat nsenter.c #define _GNU_SOURCE /* See feature_test_macros(7) */ #include sched.h #include sys/types.h #include sys/stat.h #include fcntl.h #include unistd.h #include sys/mount.h int main(int argc, char **argv) { int fd; fd = open(/proc/self/ns/mnt, O_RDONLY); if (fd 0) return 1; umount2(/, MNT_DETACH); if (setns(fd, CLONE_NEWNS)) return 1; umount2(/, MNT_DETACH); return 0; } root@ubuntu:/home/avagin# gcc nsenter.c -o nsenter root@ubuntu:/home/avagin# ./nsenter [ 260.548301] [ cut here ] [ 260.550941] kernel BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372! [ 260.552068] invalid opcode: [#1] SMP [ 260.552068] Modules linked in: xt_CHECKSUM iptable_mangle xt_tcpudp xt_addrtype xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack bridge stp llc dm_thin_pool dm_persistent_data dm_bufio dm_bio_prison iptable_filter ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel binfmt_misc nfsd auth_rpcgss nfs_acl aesni_intel nfs lockd aes_x86_64 sunrpc fscache lrw gf128mul glue_helper ablk_helper cryptd serio_raw ppdev parport_pc lp parport btrfs xor raid6_pq libcrc32c psmouse floppy [ 260.552068] CPU: 0 PID: 1723 Comm: nsenter Not tainted 3.13.0-30-generic #55-Ubuntu [ 260.552068] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 260.552068] task: 8800376097f0 ti: 880074824000 task.ti: 880074824000 [ 260.552068] RIP: 0010:[811e9483] [811e9483] propagate_umount+0x123/0x130 [ 260.552068] RSP: 0018:880074825e98 EFLAGS: 00010246 [ 260.552068] RAX: 88007c741140 RBX: 0002 RCX: 88007c741190 [ 260.552068] RDX: 88007c741190 RSI: 880074825ec0 RDI: 880074825ec0 [ 260.552068] RBP: 880074825eb0 R08: 000172e0 R09: 88007fc172e0 [ 260.552068] R10: 811cc642 R11: ea0001d59000 R12: 88007c741140 [ 260.552068] R13: 88007c741140 R14: 88007c741140 R15: [ 260.552068] FS: 7fd5c7e41740() GS:88007fc0() knlGS: [ 260.552068] CS: 0010 DS: ES: CR0: 80050033 [ 260.552068] CR2: 7fd5c7968050 CR3: 70124000 CR4: 000406f0 [ 260.552068] Stack: [ 260.552068] 0002 0002 88007c631000 880074825ed8 [ 260.552068] 811dcfac 88007c741140 0002 88007c741160 [ 260.552068] 880074825f38 811dd12b 811cc642 7564 [ 260.552068] Call Trace: [ 260.552068] [811dcfac] umount_tree+0x20c/0x260 [ 260.552068] [811dd12b] do_umount+0x12b/0x300 [ 260.552068] [811cc642] ? final_putname+0x22/0x50 [ 260.552068] [811cc849] ? putname+0x29/0x40 [ 260.552068] [811dd88c
Does MNT_LOCKED work as expected?
Hi All, I think I found a case, when MNT_LOCKED (5ff9d8a65ce8 "vfs: Lock in place mounts from more privileged users") doesn't help to hide overmounted parts from unprivileged users. The problem exists for mounts, which are not overmounted entirely. In such cases we can open a directory from a target mount, which is not overmounted. Then we do pivot_root to move all mounts in a temporary directory. At the final step we deatch all mounts from the temporary directory. After that all children mounts are umounted from the target mount and we can use our file descriptor to open files, which have been overmount before. Here is a example https://github.com/avagin/userns_vs_mntns. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Does MNT_LOCKED work as expected?
Hi All, I think I found a case, when MNT_LOCKED (5ff9d8a65ce8 vfs: Lock in place mounts from more privileged users) doesn't help to hide overmounted parts from unprivileged users. The problem exists for mounts, which are not overmounted entirely. In such cases we can open a directory from a target mount, which is not overmounted. Then we do pivot_root to move all mounts in a temporary directory. At the final step we deatch all mounts from the temporary directory. After that all children mounts are umounted from the target mount and we can use our file descriptor to open files, which have been overmount before. Here is a example https://github.com/avagin/userns_vs_mntns. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: cgroup_mount() falls asleep forever
2014-09-24 14:31 GMT+04:00 Andrey Wagin : > Hi All, > > I execute CRIU tests on linux-next. Today I found that one of tests > hangs up forever. > > [root@linux-next-test linux-next]# git describe HEAD > next-20140922 > [root@linux-next-test ~]# ps axf > ... > 698 ?Ss 0:05 \_ sshd: root@notty > 700 ?Ss 0:00 | \_ bash -x > jenkins-scripts/jenkins-ct.sh jenkins.sh > 706 ?S 0:00 | \_ bash -c ( mount --make-rprivate > / && umount -l /proc && mount -t proc proc /proc/ && bash -x > jenkins.sh) > 707 ?S 0:00 | \_ bash -c ( mount > --make-rprivate / && umount -l /proc && mount -t proc proc /proc/ && > bash -x jenkins.sh) > 711 ?S 0:00 | \_ bash -x jenkins.sh > 2981 ?S 0:05 | \_ bash -x > test/zdtm.sh -C -x .*\(maps01\|maps04\) > 6717 ?S 0:00 | \_ /bin/bash > zdtm/live/static/cgroup00.hook --clean > 6719 ?D 11:13 | \_ mount -t > cgroup none cgclean.cWgK71 -o none,name=zdtmtst > > [root@linux-next-test ~]# cat /proc/6719/stack > [] msleep+0x37/0x50 > [] cgroup_mount+0x712/0xa50 > [] mount_fs+0x39/0x1b0 > [] vfs_kern_mount+0x6b/0x150 > [] do_mount+0x22b/0xc20 > [] SyS_mount+0xa2/0x110 > [] system_call_fastpath+0x12/0x17 > [] 0x > > Let me know which info do you need for investigation. > > Steps to reproduce: > > git clone https://github.com/xemul/criu.git > cd criu > make > bash test/zdtm.sh static/cgroup00 Steps to reproduce without CRIU: [root@avagin-fc19-cr ~]# mount -t cgroup -o none,name=zdtmtst xxx /mnt/test/ [root@avagin-fc19-cr ~]# mkdir /mnt/test/xxx [root@avagin-fc19-cr ~]# umount /mnt/test/ [root@avagin-fc19-cr ~]# mount -t cgroup -o none,name=zdtmtst xxx /mnt/test/ [1]+ Stopped mount -t cgroup -o none,name=zdtmtst xxx /mnt/test/ [root@avagin-fc19-cr ~]# bg [root@avagin-fc19-cr ~]# cat /proc/633/stack [] msleep+0x37/0x50 [] cgroup_mount+0x482/0x670 [] mount_fs+0x39/0x1b0 [] vfs_kern_mount+0x6b/0x150 [] do_mount+0x22b/0xc20 [] SyS_mount+0xa2/0x110 [] system_call_fastpath+0x12/0x17 [] 0x > > Thanks, > Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: cgroup_mount() falls asleep forever
2014-09-24 14:31 GMT+04:00 Andrey Wagin : > Hi All, The problem is in a following commit: commit 0c7bf3e8cab7900e17ce7f97104c39927d835469 Author: Zefan Li Date: Sat Sep 20 14:49:10 2014 +0800 cgroup: remove redundant variable in cgroup_mount() Both pinned_sb and new_sb indicate if a new superblock is needed, so we can just remove new_sb. Note now we must check if kernfs_tryget_sb() returns NULL, because when it returns NULL, kernfs_mount() may still re-use an existing superblock, which is just allocated by another concurent mount. Suggested-by: Tejun Heo Signed-off-by: Zefan Li Signed-off-by: Tejun Heo > > I execute CRIU tests on linux-next. Today I found that one of tests > hangs up forever. > > [root@linux-next-test linux-next]# git describe HEAD > next-20140922 > [root@linux-next-test ~]# ps axf > ... > 698 ?Ss 0:05 \_ sshd: root@notty > 700 ?Ss 0:00 | \_ bash -x > jenkins-scripts/jenkins-ct.sh jenkins.sh > 706 ?S 0:00 | \_ bash -c ( mount --make-rprivate > / && umount -l /proc && mount -t proc proc /proc/ && bash -x > jenkins.sh) > 707 ?S 0:00 | \_ bash -c ( mount > --make-rprivate / && umount -l /proc && mount -t proc proc /proc/ && > bash -x jenkins.sh) > 711 ?S 0:00 | \_ bash -x jenkins.sh > 2981 ?S 0:05 | \_ bash -x > test/zdtm.sh -C -x .*\(maps01\|maps04\) > 6717 ?S 0:00 | \_ /bin/bash > zdtm/live/static/cgroup00.hook --clean > 6719 ?D 11:13 | \_ mount -t > cgroup none cgclean.cWgK71 -o none,name=zdtmtst > > [root@linux-next-test ~]# cat /proc/6719/stack > [] msleep+0x37/0x50 > [] cgroup_mount+0x712/0xa50 > [] mount_fs+0x39/0x1b0 > [] vfs_kern_mount+0x6b/0x150 > [] do_mount+0x22b/0xc20 > [] SyS_mount+0xa2/0x110 > [] system_call_fastpath+0x12/0x17 > [] 0x > > Let me know which info do you need for investigation. > > Steps to reproduce: > > git clone https://github.com/xemul/criu.git > cd criu > make > bash test/zdtm.sh static/cgroup00 > > Thanks, > Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
linux-next: cgroup_mount() falls asleep forever
Hi All, I execute CRIU tests on linux-next. Today I found that one of tests hangs up forever. [root@linux-next-test linux-next]# git describe HEAD next-20140922 [root@linux-next-test ~]# ps axf ... 698 ?Ss 0:05 \_ sshd: root@notty 700 ?Ss 0:00 | \_ bash -x jenkins-scripts/jenkins-ct.sh jenkins.sh 706 ?S 0:00 | \_ bash -c ( mount --make-rprivate / && umount -l /proc && mount -t proc proc /proc/ && bash -x jenkins.sh) 707 ?S 0:00 | \_ bash -c ( mount --make-rprivate / && umount -l /proc && mount -t proc proc /proc/ && bash -x jenkins.sh) 711 ?S 0:00 | \_ bash -x jenkins.sh 2981 ?S 0:05 | \_ bash -x test/zdtm.sh -C -x .*\(maps01\|maps04\) 6717 ?S 0:00 | \_ /bin/bash zdtm/live/static/cgroup00.hook --clean 6719 ?D 11:13 | \_ mount -t cgroup none cgclean.cWgK71 -o none,name=zdtmtst [root@linux-next-test ~]# cat /proc/6719/stack [] msleep+0x37/0x50 [] cgroup_mount+0x712/0xa50 [] mount_fs+0x39/0x1b0 [] vfs_kern_mount+0x6b/0x150 [] do_mount+0x22b/0xc20 [] SyS_mount+0xa2/0x110 [] system_call_fastpath+0x12/0x17 [] 0x Let me know which info do you need for investigation. Steps to reproduce: git clone https://github.com/xemul/criu.git cd criu make bash test/zdtm.sh static/cgroup00 Thanks, Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
linux-next: cgroup_mount() falls asleep forever
Hi All, I execute CRIU tests on linux-next. Today I found that one of tests hangs up forever. [root@linux-next-test linux-next]# git describe HEAD next-20140922 [root@linux-next-test ~]# ps axf ... 698 ?Ss 0:05 \_ sshd: root@notty 700 ?Ss 0:00 | \_ bash -x jenkins-scripts/jenkins-ct.sh jenkins.sh 706 ?S 0:00 | \_ bash -c ( mount --make-rprivate / umount -l /proc mount -t proc proc /proc/ bash -x jenkins.sh) 707 ?S 0:00 | \_ bash -c ( mount --make-rprivate / umount -l /proc mount -t proc proc /proc/ bash -x jenkins.sh) 711 ?S 0:00 | \_ bash -x jenkins.sh 2981 ?S 0:05 | \_ bash -x test/zdtm.sh -C -x .*\(maps01\|maps04\) 6717 ?S 0:00 | \_ /bin/bash zdtm/live/static/cgroup00.hook --clean 6719 ?D 11:13 | \_ mount -t cgroup none cgclean.cWgK71 -o none,name=zdtmtst [root@linux-next-test ~]# cat /proc/6719/stack [8111fcb7] msleep+0x37/0x50 [8114bde2] cgroup_mount+0x712/0xa50 [8124d0e9] mount_fs+0x39/0x1b0 [8126d76b] vfs_kern_mount+0x6b/0x150 [8127075b] do_mount+0x22b/0xc20 [81271492] SyS_mount+0xa2/0x110 [817e3ba9] system_call_fastpath+0x12/0x17 [] 0x Let me know which info do you need for investigation. Steps to reproduce: git clone https://github.com/xemul/criu.git cd criu make bash test/zdtm.sh static/cgroup00 Thanks, Andrew -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: cgroup_mount() falls asleep forever
2014-09-24 14:31 GMT+04:00 Andrey Wagin ava...@gmail.com: Hi All, The problem is in a following commit: commit 0c7bf3e8cab7900e17ce7f97104c39927d835469 Author: Zefan Li lize...@huawei.com Date: Sat Sep 20 14:49:10 2014 +0800 cgroup: remove redundant variable in cgroup_mount() Both pinned_sb and new_sb indicate if a new superblock is needed, so we can just remove new_sb. Note now we must check if kernfs_tryget_sb() returns NULL, because when it returns NULL, kernfs_mount() may still re-use an existing superblock, which is just allocated by another concurent mount. Suggested-by: Tejun Heo t...@kernel.org Signed-off-by: Zefan Li lize...@huawei.com Signed-off-by: Tejun Heo t...@kernel.org I execute CRIU tests on linux-next. Today I found that one of tests hangs up forever. [root@linux-next-test linux-next]# git describe HEAD next-20140922 [root@linux-next-test ~]# ps axf ... 698 ?Ss 0:05 \_ sshd: root@notty 700 ?Ss 0:00 | \_ bash -x jenkins-scripts/jenkins-ct.sh jenkins.sh 706 ?S 0:00 | \_ bash -c ( mount --make-rprivate / umount -l /proc mount -t proc proc /proc/ bash -x jenkins.sh) 707 ?S 0:00 | \_ bash -c ( mount --make-rprivate / umount -l /proc mount -t proc proc /proc/ bash -x jenkins.sh) 711 ?S 0:00 | \_ bash -x jenkins.sh 2981 ?S 0:05 | \_ bash -x test/zdtm.sh -C -x .*\(maps01\|maps04\) 6717 ?S 0:00 | \_ /bin/bash zdtm/live/static/cgroup00.hook --clean 6719 ?D 11:13 | \_ mount -t cgroup none cgclean.cWgK71 -o none,name=zdtmtst [root@linux-next-test ~]# cat /proc/6719/stack [8111fcb7] msleep+0x37/0x50 [8114bde2] cgroup_mount+0x712/0xa50 [8124d0e9] mount_fs+0x39/0x1b0 [8126d76b] vfs_kern_mount+0x6b/0x150 [8127075b] do_mount+0x22b/0xc20 [81271492] SyS_mount+0xa2/0x110 [817e3ba9] system_call_fastpath+0x12/0x17 [] 0x Let me know which info do you need for investigation. Steps to reproduce: git clone https://github.com/xemul/criu.git cd criu make bash test/zdtm.sh static/cgroup00 Thanks, Andrew -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: cgroup_mount() falls asleep forever
2014-09-24 14:31 GMT+04:00 Andrey Wagin ava...@gmail.com: Hi All, I execute CRIU tests on linux-next. Today I found that one of tests hangs up forever. [root@linux-next-test linux-next]# git describe HEAD next-20140922 [root@linux-next-test ~]# ps axf ... 698 ?Ss 0:05 \_ sshd: root@notty 700 ?Ss 0:00 | \_ bash -x jenkins-scripts/jenkins-ct.sh jenkins.sh 706 ?S 0:00 | \_ bash -c ( mount --make-rprivate / umount -l /proc mount -t proc proc /proc/ bash -x jenkins.sh) 707 ?S 0:00 | \_ bash -c ( mount --make-rprivate / umount -l /proc mount -t proc proc /proc/ bash -x jenkins.sh) 711 ?S 0:00 | \_ bash -x jenkins.sh 2981 ?S 0:05 | \_ bash -x test/zdtm.sh -C -x .*\(maps01\|maps04\) 6717 ?S 0:00 | \_ /bin/bash zdtm/live/static/cgroup00.hook --clean 6719 ?D 11:13 | \_ mount -t cgroup none cgclean.cWgK71 -o none,name=zdtmtst [root@linux-next-test ~]# cat /proc/6719/stack [8111fcb7] msleep+0x37/0x50 [8114bde2] cgroup_mount+0x712/0xa50 [8124d0e9] mount_fs+0x39/0x1b0 [8126d76b] vfs_kern_mount+0x6b/0x150 [8127075b] do_mount+0x22b/0xc20 [81271492] SyS_mount+0xa2/0x110 [817e3ba9] system_call_fastpath+0x12/0x17 [] 0x Let me know which info do you need for investigation. Steps to reproduce: git clone https://github.com/xemul/criu.git cd criu make bash test/zdtm.sh static/cgroup00 Steps to reproduce without CRIU: [root@avagin-fc19-cr ~]# mount -t cgroup -o none,name=zdtmtst xxx /mnt/test/ [root@avagin-fc19-cr ~]# mkdir /mnt/test/xxx [root@avagin-fc19-cr ~]# umount /mnt/test/ [root@avagin-fc19-cr ~]# mount -t cgroup -o none,name=zdtmtst xxx /mnt/test/ [1]+ Stopped mount -t cgroup -o none,name=zdtmtst xxx /mnt/test/ [root@avagin-fc19-cr ~]# bg [root@avagin-fc19-cr ~]# cat /proc/633/stack [810e9377] msleep+0x37/0x50 [88a2] cgroup_mount+0x482/0x670 [811e3769] mount_fs+0x39/0x1b0 [8120157b] vfs_kern_mount+0x6b/0x150 [8120422b] do_mount+0x22b/0xc20 [81204f62] SyS_mount+0xa2/0x110 [816fcc69] system_call_fastpath+0x12/0x17 [] 0x Thanks, Andrew -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] Documentation/video4linux: don't build without CONFIG_VIDEO_V4L2
2014-08-29 0:42 GMT+04:00 Randy Dunlap : > On 08/28/14 13:34, Andrey Vagin wrote: >> Otherwise we get warnings: >> WARNING: "vb2_ops_wait_finish" >> [Documentation//video4linux/v4l2-pci-skeleton.ko] undefined! >> WARNING: "vb2_ops_wait_prepare" >> [Documentation//video4linux/v4l2-pci-skeleton.ko] undefined! >> ... >> WARNING: "video_unregister_device" >> [Documentation//video4linux/v4l2-pci-skeleton.ko] undefined! >> >> Fixes: 8db5ab4b50fb ("Documentation: add makefiles for more targets") >> >> Cc: Peter Foley >> Cc: Mauro Carvalho Chehab >> Cc: Randy Dunlap >> Signed-off-by: Andrey Vagin >> --- >> Documentation/video4linux/Makefile | 2 ++ >> 1 file changed, 2 insertions(+) >> >> diff --git a/Documentation/video4linux/Makefile >> b/Documentation/video4linux/Makefile >> index d58101e..f19f38e 100644 >> --- a/Documentation/video4linux/Makefile >> +++ b/Documentation/video4linux/Makefile >> @@ -1 +1,3 @@ >> +ifneq ($(CONFIG_VIDEO_V4L2),) >> obj-m := v4l2-pci-skeleton.o >> +endif >> > > The Kconfig file for this module says: > > config VIDEO_PCI_SKELETON > tristate "Skeleton PCI V4L2 driver" > depends on PCI && BUILD_DOCSRC > depends on VIDEO_V4L2 && VIDEOBUF2_CORE && VIDEOBUF2_MEMOPS > > so it should already be limited to VIDEO_V4L2 being enabled. > > What kernel or linux-next version did you see a problem with? Eh, I'm late. It was fixed already commit 81820f32ffaf393d9379c326d670257c63306a26 Author: Mark Brown Date: Wed Aug 27 10:18:51 2014 +1000 v4l2-pci-skeleton: Only build if PCI is available Sorry for the noise. > > Please send the failing .config file so that I can check it. > > Thanks. > > -- > ~Randy -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] Documentation/video4linux: don't build without CONFIG_VIDEO_V4L2
2014-08-29 0:42 GMT+04:00 Randy Dunlap rdun...@infradead.org: On 08/28/14 13:34, Andrey Vagin wrote: Otherwise we get warnings: WARNING: vb2_ops_wait_finish [Documentation//video4linux/v4l2-pci-skeleton.ko] undefined! WARNING: vb2_ops_wait_prepare [Documentation//video4linux/v4l2-pci-skeleton.ko] undefined! ... WARNING: video_unregister_device [Documentation//video4linux/v4l2-pci-skeleton.ko] undefined! Fixes: 8db5ab4b50fb (Documentation: add makefiles for more targets) Cc: Peter Foley pefol...@pefoley.com Cc: Mauro Carvalho Chehab m.che...@samsung.com Cc: Randy Dunlap rdun...@infradead.org Signed-off-by: Andrey Vagin ava...@openvz.org --- Documentation/video4linux/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/video4linux/Makefile b/Documentation/video4linux/Makefile index d58101e..f19f38e 100644 --- a/Documentation/video4linux/Makefile +++ b/Documentation/video4linux/Makefile @@ -1 +1,3 @@ +ifneq ($(CONFIG_VIDEO_V4L2),) obj-m := v4l2-pci-skeleton.o +endif The Kconfig file for this module says: config VIDEO_PCI_SKELETON tristate Skeleton PCI V4L2 driver depends on PCI BUILD_DOCSRC depends on VIDEO_V4L2 VIDEOBUF2_CORE VIDEOBUF2_MEMOPS so it should already be limited to VIDEO_V4L2 being enabled. What kernel or linux-next version did you see a problem with? Eh, I'm late. It was fixed already commit 81820f32ffaf393d9379c326d670257c63306a26 Author: Mark Brown broo...@kernel.org Date: Wed Aug 27 10:18:51 2014 +1000 v4l2-pci-skeleton: Only build if PCI is available Sorry for the noise. Please send the failing .config file so that I can check it. Thanks. -- ~Randy -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/4] Documentation: add makefiles for more targets
Hi Peter, I have got following errors with this patch. I think Documentation/video4linux should be built only if a config contains a proper options set. Building modules, stage 2. MODPOST 73 modules ERROR: "vb2_ops_wait_finish" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_ops_wait_prepare" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "v4l2_event_unsubscribe" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "v4l2_ctrl_subscribe_event" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "v4l2_ctrl_log_status" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_ioctl_streamoff" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_ioctl_streamon" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_ioctl_create_bufs" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_ioctl_dqbuf" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_ioctl_expbuf" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_ioctl_qbuf" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_ioctl_querybuf" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_ioctl_reqbufs" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_fop_release" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "v4l2_fh_open" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_fop_mmap" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "video_ioctl2" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_fop_poll" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_fop_read" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_buffer_done" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "__video_register_device" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "video_device_release_empty" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_dma_contig_init_ctx" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_queue_init" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_dma_contig_memops" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "v4l2_ctrl_new_std" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "v4l2_ctrl_handler_init_class" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "v4l2_device_register" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! HOSTCC arch/x86/tools/insn_sanity ERROR: "v4l2_match_dv_timings" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "v4l2_find_dv_timings_cap" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "v4l2_valid_dv_timings" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "v4l2_enum_dv_timings_cap" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "video_devdata" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "v4l2_device_unregister" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "vb2_dma_contig_cleanup_ctx" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "v4l2_ctrl_handler_free" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: "video_unregister_device" [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! make[1]: *** [__modpost] Error 1 make: *** [modules] Error 2 make: *** Waiting for unfinished jobs -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/4] Documentation: add makefiles for more targets
Hi Peter, I have got following errors with this patch. I think Documentation/video4linux should be built only if a config contains a proper options set. Building modules, stage 2. MODPOST 73 modules ERROR: vb2_ops_wait_finish [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_ops_wait_prepare [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: v4l2_event_unsubscribe [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: v4l2_ctrl_subscribe_event [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: v4l2_ctrl_log_status [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_ioctl_streamoff [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_ioctl_streamon [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_ioctl_create_bufs [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_ioctl_dqbuf [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_ioctl_expbuf [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_ioctl_qbuf [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_ioctl_querybuf [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_ioctl_reqbufs [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_fop_release [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: v4l2_fh_open [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_fop_mmap [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: video_ioctl2 [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_fop_poll [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_fop_read [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_buffer_done [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: __video_register_device [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: video_device_release_empty [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_dma_contig_init_ctx [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_queue_init [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_dma_contig_memops [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: v4l2_ctrl_new_std [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: v4l2_ctrl_handler_init_class [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: v4l2_device_register [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! HOSTCC arch/x86/tools/insn_sanity ERROR: v4l2_match_dv_timings [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: v4l2_find_dv_timings_cap [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: v4l2_valid_dv_timings [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: v4l2_enum_dv_timings_cap [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: video_devdata [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: v4l2_device_unregister [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: vb2_dma_contig_cleanup_ctx [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: v4l2_ctrl_handler_free [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! ERROR: video_unregister_device [Documentation/video4linux/v4l2-pci-skeleton.ko] undefined! make[1]: *** [__modpost] Error 1 make: *** [modules] Error 2 make: *** Waiting for unfinished jobs -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH driver-core-linus] kernfs, sysfs, cgroup: restrict extra perm check on open to sysfs
On Mon, May 12, 2014 at 01:56:27PM -0400, Tejun Heo wrote: > The kernfs open method - kernfs_fop_open() - inherited extra > permission checks from sysfs. While the vfs layer allows ignoring the > read/write permissions checks if the issuer has CAP_DAC_OVERRIDE, > sysfs explicitly denied open regardless of the cap if the file doesn't > have any of the UGO perms of the requested access or doesn't implement > the requested operation. It can be debated whether this was a good > idea or not but the behavior is too subtle and dangerous to change at > this point. > > After cgroup got converted to kernfs, this extra perm check also got > applied to cgroup breaking libcgroup which opens write-only files with > O_RDWR as root. This patch gates the extra open permission check with > a new flag KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK and enables it for sysfs. > For sysfs, nothing changes. For cgroup, root now can perform any > operation regardless of the permissions as it was before kernfs > conversion. Note that kernfs still fails unimplemented operations > with -EINVAL. > > While at it, add comments explaining KERNFS_ROOT flags. > It works for me. Thank you for the quick response. Tested-by: Andrey Wagin > Signed-off-by: Tejun Heo > Reported-by: Andrey Wagin > Cc: Li Zefan > References: > http://lkml.kernel.org/g/CANaxB-xUm3rJ-Cbp72q-rQJO5mZe1qK6qXsQM=vh0u8upj4...@mail.gmail.com > Fixes: 2bd59d48ebfb ("cgroup: convert to kernfs") > --- > fs/kernfs/file.c | 19 +++ > fs/sysfs/mount.c |3 ++- > include/linux/kernfs.h | 19 ++- > 3 files changed, 31 insertions(+), 10 deletions(-) > > --- a/fs/kernfs/file.c > +++ b/fs/kernfs/file.c > @@ -610,6 +610,7 @@ static void kernfs_put_open_node(struct > static int kernfs_fop_open(struct inode *inode, struct file *file) > { > struct kernfs_node *kn = file->f_path.dentry->d_fsdata; > + struct kernfs_root *root = kernfs_root(kn); > const struct kernfs_ops *ops; > struct kernfs_open_file *of; > bool has_read, has_write, has_mmap; > @@ -624,14 +625,16 @@ static int kernfs_fop_open(struct inode > has_write = ops->write || ops->mmap; > has_mmap = ops->mmap; > > - /* check perms and supported operations */ > - if ((file->f_mode & FMODE_WRITE) && > - (!(inode->i_mode & S_IWUGO) || !has_write)) > - goto err_out; > - > - if ((file->f_mode & FMODE_READ) && > - (!(inode->i_mode & S_IRUGO) || !has_read)) > - goto err_out; > + /* see the flag definition for details */ > + if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { > + if ((file->f_mode & FMODE_WRITE) && > + (!(inode->i_mode & S_IWUGO) || !has_write)) > + goto err_out; > + > + if ((file->f_mode & FMODE_READ) && > + (!(inode->i_mode & S_IRUGO) || !has_read)) > + goto err_out; > + } > > /* allocate a kernfs_open_file for the file */ > error = -ENOMEM; > --- a/fs/sysfs/mount.c > +++ b/fs/sysfs/mount.c > @@ -63,7 +63,8 @@ int __init sysfs_init(void) > { > int err; > > - sysfs_root = kernfs_create_root(NULL, 0, NULL); > + sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, > + NULL); > if (IS_ERR(sysfs_root)) > return PTR_ERR(sysfs_root); > > --- a/include/linux/kernfs.h > +++ b/include/linux/kernfs.h > @@ -50,7 +50,24 @@ enum kernfs_node_flag { > > /* @flags for kernfs_create_root() */ > enum kernfs_root_flag { > - KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001, > + /* > + * kernfs_nodes are created in the deactivated state and invisible. > + * They require explicit kernfs_activate() to become visible. This > + * can be used to make related nodes become visible atomically > + * after all nodes are created successfully. > + */ > + KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001, > + > + /* > + * For regular flies, if the opener has CAP_DAC_OVERRIDE, open(2) > + * succeeds regardless of the RW permissions. sysfs had an extra > + * layer of enforcement where open(2) fails with -EACCES regardless > + * of CAP_DAC_OVERRIDE if the permission doesn't have the > + * respective read or write access at all (none of S_IRUGO or > + * S_IWUGO) or the respective operation isn't implemented. The > + * following flag enables that behavior. > + */ > + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002, > }; > > /* type-specific structures for kernfs_node union members */ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH driver-core-linus] kernfs, sysfs, cgroup: restrict extra perm check on open to sysfs
On Mon, May 12, 2014 at 01:56:27PM -0400, Tejun Heo wrote: The kernfs open method - kernfs_fop_open() - inherited extra permission checks from sysfs. While the vfs layer allows ignoring the read/write permissions checks if the issuer has CAP_DAC_OVERRIDE, sysfs explicitly denied open regardless of the cap if the file doesn't have any of the UGO perms of the requested access or doesn't implement the requested operation. It can be debated whether this was a good idea or not but the behavior is too subtle and dangerous to change at this point. After cgroup got converted to kernfs, this extra perm check also got applied to cgroup breaking libcgroup which opens write-only files with O_RDWR as root. This patch gates the extra open permission check with a new flag KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK and enables it for sysfs. For sysfs, nothing changes. For cgroup, root now can perform any operation regardless of the permissions as it was before kernfs conversion. Note that kernfs still fails unimplemented operations with -EINVAL. While at it, add comments explaining KERNFS_ROOT flags. It works for me. Thank you for the quick response. Tested-by: Andrey Wagin ava...@gmail.com Signed-off-by: Tejun Heo t...@kernel.org Reported-by: Andrey Wagin ava...@gmail.com Cc: Li Zefan lize...@huawei.com References: http://lkml.kernel.org/g/CANaxB-xUm3rJ-Cbp72q-rQJO5mZe1qK6qXsQM=vh0u8upj4...@mail.gmail.com Fixes: 2bd59d48ebfb (cgroup: convert to kernfs) --- fs/kernfs/file.c | 19 +++ fs/sysfs/mount.c |3 ++- include/linux/kernfs.h | 19 ++- 3 files changed, 31 insertions(+), 10 deletions(-) --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -610,6 +610,7 @@ static void kernfs_put_open_node(struct static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = file-f_path.dentry-d_fsdata; + struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; @@ -624,14 +625,16 @@ static int kernfs_fop_open(struct inode has_write = ops-write || ops-mmap; has_mmap = ops-mmap; - /* check perms and supported operations */ - if ((file-f_mode FMODE_WRITE) - (!(inode-i_mode S_IWUGO) || !has_write)) - goto err_out; - - if ((file-f_mode FMODE_READ) - (!(inode-i_mode S_IRUGO) || !has_read)) - goto err_out; + /* see the flag definition for details */ + if (root-flags KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { + if ((file-f_mode FMODE_WRITE) + (!(inode-i_mode S_IWUGO) || !has_write)) + goto err_out; + + if ((file-f_mode FMODE_READ) + (!(inode-i_mode S_IRUGO) || !has_read)) + goto err_out; + } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -63,7 +63,8 @@ int __init sysfs_init(void) { int err; - sysfs_root = kernfs_create_root(NULL, 0, NULL); + sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + NULL); if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -50,7 +50,24 @@ enum kernfs_node_flag { /* @flags for kernfs_create_root() */ enum kernfs_root_flag { - KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001, + /* + * kernfs_nodes are created in the deactivated state and invisible. + * They require explicit kernfs_activate() to become visible. This + * can be used to make related nodes become visible atomically + * after all nodes are created successfully. + */ + KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001, + + /* + * For regular flies, if the opener has CAP_DAC_OVERRIDE, open(2) + * succeeds regardless of the RW permissions. sysfs had an extra + * layer of enforcement where open(2) fails with -EACCES regardless + * of CAP_DAC_OVERRIDE if the permission doesn't have the + * respective read or write access at all (none of S_IRUGO or + * S_IWUGO) or the respective operation isn't implemented. The + * following flag enables that behavior. + */ + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002, }; /* type-specific structures for kernfs_node union members */ -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [rfc 2/2] docs: procfs -- Document timerfd output
2014-03-31 21:54 GMT+04:00 Cyrill Gorcunov : > CC: Shawn Landden > CC: Thomas Gleixner > CC: Andrew Morton > CC: Andrey Vagin > CC: Pavel Emelyanov > Signed-off-by: Cyrill Gorcunov > --- > Documentation/filesystems/proc.txt |9 + > 1 file changed, 9 insertions(+) > > Index: linux-2.6.git/Documentation/filesystems/proc.txt > === > --- linux-2.6.git.orig/Documentation/filesystems/proc.txt > +++ linux-2.6.git/Documentation/filesystems/proc.txt > @@ -1741,6 +1741,15 @@ pair provide additional information part > While the first three lines are mandatory and always printed, the > rest is > optional and may be omitted if no marks created yet. > > + Timerfd files > + ~ > + pos:0 > + flags: 02 > + mnt_id: 9 > + clockid: 0 ticks: 0 I would prefer to print "tick" on a separate line. And we can print the current setting of the timer here. It can be useful not only for CRIU but for other users too. it_value: (5, 149234) it_interval: (10, 0) > + > + where 'clockid' is the clock type and 'ticks' is the number of the > timer expirations > + that have occurred [see timerfd_create(2) for details]. > > > -- > Configuring procfs > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [rfc 2/2] docs: procfs -- Document timerfd output
2014-03-31 21:54 GMT+04:00 Cyrill Gorcunov gorcu...@openvz.org: CC: Shawn Landden sh...@churchofgit.com CC: Thomas Gleixner t...@linutronix.de CC: Andrew Morton a...@linux-foundation.org CC: Andrey Vagin ava...@openvz.org CC: Pavel Emelyanov xe...@parallels.com Signed-off-by: Cyrill Gorcunov gorcu...@openvz.org --- Documentation/filesystems/proc.txt |9 + 1 file changed, 9 insertions(+) Index: linux-2.6.git/Documentation/filesystems/proc.txt === --- linux-2.6.git.orig/Documentation/filesystems/proc.txt +++ linux-2.6.git/Documentation/filesystems/proc.txt @@ -1741,6 +1741,15 @@ pair provide additional information part While the first three lines are mandatory and always printed, the rest is optional and may be omitted if no marks created yet. + Timerfd files + ~ + pos:0 + flags: 02 + mnt_id: 9 + clockid: 0 ticks: 0 I would prefer to print tick on a separate line. And we can print the current setting of the timer here. It can be useful not only for CRIU but for other users too. it_value: (5, 149234) it_interval: (10, 0) + + where 'clockid' is the clock type and 'ticks' is the number of the timer expirations + that have occurred [see timerfd_create(2) for details]. -- Configuring procfs -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [CRIU] [PATCH 1/3] prctl: reduce permissions to change boundaries of data, brk and stack
2014-02-14 23:16 GMT+04:00 Eric W. Biederman : > Cyrill Gorcunov writes: > >> On Fri, Feb 14, 2014 at 09:43:14PM +0400, Andrew Vagin wrote: >>> > My brain hurts just looking at this patch and how you are justifying it. >>> > >>> > For the resources you are mucking with below all you have to do is to >>> > verify that you are below the appropriate rlimit at all times and no >>> > CAP_SYS_RESOURCE check is needed. You only need CAP_SYS_RESOURCE >>> > to exceed your per process limits. >>> > >>> > All you have to do is to fix the current code to properly enforce the >>> > limits. >>> >>> I'm afraid what you are suggesting doesn't work. >>> >>> The first reason is that we can not change both boundaries in one call. >>> But when we are restoring these attributes, we may need to move their >>> too far. >> >> When this code was introduced, there were no user-namespace implementation, >> if I remember correctly, so CAP_SYS_RESOURCE was enough barrier point >> to prevent modifying this values by anyone. Now user-ns brings a limit -- >> we need somehow to provide a way to modify these mm fields having no >> CAP_SYS_RESOURCE set. "Verifying rlimit" not an option here because >> we're modifying members one by one (looking back I think this was not >> a good idea to modify the fields in this manner). >> >> Maybe we could improve this api and provide argument as a pointer >> to a structure, which would have all the fields we're going to >> modify, which in turn would allow us to verify that all new values >> are sane and fit rlimits, then we could (probably) deprecate old >> api if noone except c/r camp is using it (I actually can't imagine >> who else might need this api). Then CAP_SYS_RESOURCE requirement >> could be ripped off. Hm? (sure touching api is always "no-no" >> case, but maybe...) > > Hmm. Let me rewind this a little bit. > > I want to be very stupid and ask the following. > > Why can't you have the process of interest do: > ptrace(PTRACE_ATTACHME); > execve(executable, args, ...); > > /* Have the ptracer inject the recovery/fixup code */ > /* Fix up the mostly correct process to look like it has been > * executing for a while. > */ > > That should work, set all of the interesting fields, and works as > non-root today. My gut feel says do that and we can just > deprecate/remove prctl_set_mm. start_brk and start_stack are randomized each time. I don't understand how execve() can restore the origin values of attributes. > > I am hoping we can move this conversation what makes sense from oh ick > checkpoint/restort does not work with user namespaces. > > Eric -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [CRIU] [PATCH 1/3] prctl: reduce permissions to change boundaries of data, brk and stack
2014-02-14 23:16 GMT+04:00 Eric W. Biederman ebied...@xmission.com: Cyrill Gorcunov gorcu...@gmail.com writes: On Fri, Feb 14, 2014 at 09:43:14PM +0400, Andrew Vagin wrote: My brain hurts just looking at this patch and how you are justifying it. For the resources you are mucking with below all you have to do is to verify that you are below the appropriate rlimit at all times and no CAP_SYS_RESOURCE check is needed. You only need CAP_SYS_RESOURCE to exceed your per process limits. All you have to do is to fix the current code to properly enforce the limits. I'm afraid what you are suggesting doesn't work. The first reason is that we can not change both boundaries in one call. But when we are restoring these attributes, we may need to move their too far. When this code was introduced, there were no user-namespace implementation, if I remember correctly, so CAP_SYS_RESOURCE was enough barrier point to prevent modifying this values by anyone. Now user-ns brings a limit -- we need somehow to provide a way to modify these mm fields having no CAP_SYS_RESOURCE set. Verifying rlimit not an option here because we're modifying members one by one (looking back I think this was not a good idea to modify the fields in this manner). Maybe we could improve this api and provide argument as a pointer to a structure, which would have all the fields we're going to modify, which in turn would allow us to verify that all new values are sane and fit rlimits, then we could (probably) deprecate old api if noone except c/r camp is using it (I actually can't imagine who else might need this api). Then CAP_SYS_RESOURCE requirement could be ripped off. Hm? (sure touching api is always no-no case, but maybe...) Hmm. Let me rewind this a little bit. I want to be very stupid and ask the following. Why can't you have the process of interest do: ptrace(PTRACE_ATTACHME); execve(executable, args, ...); /* Have the ptracer inject the recovery/fixup code */ /* Fix up the mostly correct process to look like it has been * executing for a while. */ That should work, set all of the interesting fields, and works as non-root today. My gut feel says do that and we can just deprecate/remove prctl_set_mm. start_brk and start_stack are randomized each time. I don't understand how execve() can restore the origin values of attributes. I am hoping we can move this conversation what makes sense from oh ick checkpoint/restort does not work with user namespaces. Eric -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get (v3)
> > Eh, looks like this path is incomplete too:( > > I think we can't set a reference counter for objects which is allocated > from a SLAB_DESTROY_BY_RCU cache. Look at the following backtrace. > > cpu1cpu2 > ct = nf_conntrack_find() > destroy_conntrack > atomic_inc_not_zero(ct) ct->ct_general.use is zero after destroy_conntrack(). Sorry for the noise. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get (v3)
Eh, looks like this path is incomplete too:( I think we can't set a reference counter for objects which is allocated from a SLAB_DESTROY_BY_RCU cache. Look at the following backtrace. cpu1cpu2 ct = nf_conntrack_find() destroy_conntrack atomic_inc_not_zero(ct) ct-ct_general.use is zero after destroy_conntrack(). Sorry for the noise. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get
2014/1/9 Eric Dumazet : > On Thu, 2014-01-09 at 09:24 +0400, Andrew Vagin wrote: > >> I have one more question. Looks like I found another problem. >> >> init_conntrack: >> hlist_nulls_add_head_rcu(>tuplehash[IP_CT_DIR_ORIGINAL].hnnode, >> >ct.unconfirmed); >> >> __nf_conntrack_hash_insert: >> hlist_nulls_add_head_rcu(>tuplehash[IP_CT_DIR_ORIGINAL].hnnode, >> >ct.hash[hash]); >> >> We use one hlist_nulls_node to add a conntrack into two different lists. >> As far as I understand, it's unacceptable in case of >> SLAB_DESTROY_BY_RCU. > > I guess you missed : > > net/netfilter/nf_conntrack_core.c:1598: > INIT_HLIST_NULLS_HEAD(>ct.unconfirmed, UNCONFIRMED_NULLS_VAL); but we can look up something suitable and return this value, but it will be unconfirmed. Ok, I see. This situation is fixed by this patch too. I don't understand the result of your discussion with Florian. Here are a few states of conntracts: it can be used and it's initialized. The sign of the first state is non-zero refcnt and the sign of the second state is the confirmed bit. In the first state conntrack is attached to skb and inserted in the unconfirmed list. In this state the use count can be incremented in ctnetlink_dump_list() and skb_clone(). In the second state conntrack may be attached to a few skb-s and inserted to net->ct.hash. I have read all emails again and I can't understand when this patch doesn't work. Maybe you could give a sequence of actions? Thanks. > > >> >> Lets imagine that we have two threads. The first one enumerates objects >> from a first list, the second one recreates an object and insert it in a >> second list. If a first thread in this moment stays on the object, it >> can read "next", when it's in the second list, so it will continue >> to enumerate objects from the second list. It is not what we want, isn't >> it? >> >> cpu1 cpu2 >> hlist_nulls_for_each_entry_rcu(ct) >> destroy_conntrack >> kmem_cache_free >> >> init_conntrack >> hlist_nulls_add_head_rcu >> ct = ct->next >> > > This will be fine. > > I think we even have a counter to count number of occurrence of this > rare event. (I personally never read a non null search_restart value) > > NF_CT_STAT_INC(net, search_restart); Thank you for explanation. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get
2014/1/9 Eric Dumazet eric.duma...@gmail.com: On Thu, 2014-01-09 at 09:24 +0400, Andrew Vagin wrote: I have one more question. Looks like I found another problem. init_conntrack: hlist_nulls_add_head_rcu(ct-tuplehash[IP_CT_DIR_ORIGINAL].hnnode, net-ct.unconfirmed); __nf_conntrack_hash_insert: hlist_nulls_add_head_rcu(ct-tuplehash[IP_CT_DIR_ORIGINAL].hnnode, net-ct.hash[hash]); We use one hlist_nulls_node to add a conntrack into two different lists. As far as I understand, it's unacceptable in case of SLAB_DESTROY_BY_RCU. I guess you missed : net/netfilter/nf_conntrack_core.c:1598: INIT_HLIST_NULLS_HEAD(net-ct.unconfirmed, UNCONFIRMED_NULLS_VAL); but we can look up something suitable and return this value, but it will be unconfirmed. Ok, I see. This situation is fixed by this patch too. I don't understand the result of your discussion with Florian. Here are a few states of conntracts: it can be used and it's initialized. The sign of the first state is non-zero refcnt and the sign of the second state is the confirmed bit. In the first state conntrack is attached to skb and inserted in the unconfirmed list. In this state the use count can be incremented in ctnetlink_dump_list() and skb_clone(). In the second state conntrack may be attached to a few skb-s and inserted to net-ct.hash. I have read all emails again and I can't understand when this patch doesn't work. Maybe you could give a sequence of actions? Thanks. Lets imagine that we have two threads. The first one enumerates objects from a first list, the second one recreates an object and insert it in a second list. If a first thread in this moment stays on the object, it can read next, when it's in the second list, so it will continue to enumerate objects from the second list. It is not what we want, isn't it? cpu1 cpu2 hlist_nulls_for_each_entry_rcu(ct) destroy_conntrack kmem_cache_free init_conntrack hlist_nulls_add_head_rcu ct = ct-next This will be fine. I think we even have a counter to count number of occurrence of this rare event. (I personally never read a non null search_restart value) NF_CT_STAT_INC(net, search_restart); Thank you for explanation. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] netfilter: nf_conntrack: release conntrack from rcu callback
Hi Florian, 2014/1/7 Florian Westphal : > Andrew Vagin wrote: >> > ct = nf_ct_tuplehash_to_ctrack(h); >> > if (unlikely(nf_ct_is_dying(ct) || >> > !atomic_inc_not_zero(>ct_general.use))) >> > // which means we should hit this path (0 ref). >> > h = NULL; >> > else { >> > // otherwise, it cannot go away from under us, since >> > // we own a reference now. >> > if (unlikely(!nf_ct_tuple_equal(tuple, >tuple) >> > || >> > nf_ct_zone(ct) != zone)) { > > Perhaps this needs additional !nf_ct_is_confirmed()? Yes, it think it must help. Thank you for the comments. I resent this patch: [PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get > > It would cover your case (found a recycled element that has been put on > the unconfirmed list (refcnt already set to 1, ct->tuple is set) on another > cpu, > extensions possibly not yet fully initialised), and the same tuple). > > Regards, > Florian Thanks, Andrey -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] netfilter: nf_conntrack: release conntrack from rcu callback
Hi Florian, 2014/1/7 Florian Westphal f...@strlen.de: Andrew Vagin ava...@gmail.com wrote: ct = nf_ct_tuplehash_to_ctrack(h); if (unlikely(nf_ct_is_dying(ct) || !atomic_inc_not_zero(ct-ct_general.use))) // which means we should hit this path (0 ref). h = NULL; else { // otherwise, it cannot go away from under us, since // we own a reference now. if (unlikely(!nf_ct_tuple_equal(tuple, h-tuple) || nf_ct_zone(ct) != zone)) { Perhaps this needs additional !nf_ct_is_confirmed()? Yes, it think it must help. Thank you for the comments. I resent this patch: [PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get It would cover your case (found a recycled element that has been put on the unconfirmed list (refcnt already set to 1, ct-tuple is set) on another cpu, extensions possibly not yet fully initialised), and the same tuple). Regards, Florian Thanks, Andrey -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [CRIU] [PATCH] timerfd: show procfs fdinfo helper
2013/12/24 Shawn Landden : > | pos: 0 > | flags:02004002 > | clockid: 0 > > Cc: Thomas Gleixner > Cc: Alexander Viro > Signed-off-by: Shawn Landden > --- > fs/timerfd.c | 17 + > 1 file changed, 17 insertions(+) > > diff --git a/fs/timerfd.c b/fs/timerfd.c > index 9293121..e5fa587 100644 > --- a/fs/timerfd.c > +++ b/fs/timerfd.c > @@ -25,6 +25,7 @@ > #include > #include > #include > +#include > > struct timerfd_ctx { > union { > @@ -284,7 +285,23 @@ static ssize_t timerfd_read(struct file *file, char > __user *buf, size_t count, > return res; > } > > +#ifdef CONFIG_PROC_FS > +static int timerfd_show_fdinfo(struct seq_file *m, struct file *f) > +{ > + struct timerfd_ctx *ctx = f->private_data; > + int clockid; > + > + clockid = ctx->clockid; > + seq_printf(m, "clockid:\t%d\n", clockid); I think we can show ctx->ticks, itimerspec here. The ctx->ticks is required for proper dumping and restoring timerfd. > + > + return 0; > +} > +#endif > + > static const struct file_operations timerfd_fops = { > +#ifdef CONFIG_PROC_FS > + .show_fdinfo= timerfd_show_fdinfo, > +#endif > .release= timerfd_release, > .poll = timerfd_poll, > .read = timerfd_read, > -- > 1.8.5.2.297.g3e57c29 > > ___ > CRIU mailing list > c...@openvz.org > https://lists.openvz.org/mailman/listinfo/criu -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [CRIU] [PATCH] timerfd: show procfs fdinfo helper
2013/12/24 Shawn Landden sh...@churchofgit.com: | pos: 0 | flags:02004002 | clockid: 0 Cc: Thomas Gleixner t...@linutronix.de Cc: Alexander Viro v...@zeniv.linux.org.uk Signed-off-by: Shawn Landden sh...@churchofgit.com --- fs/timerfd.c | 17 + 1 file changed, 17 insertions(+) diff --git a/fs/timerfd.c b/fs/timerfd.c index 9293121..e5fa587 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -25,6 +25,7 @@ #include linux/syscalls.h #include linux/compat.h #include linux/rcupdate.h +#include linux/seq_file.h struct timerfd_ctx { union { @@ -284,7 +285,23 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, return res; } +#ifdef CONFIG_PROC_FS +static int timerfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct timerfd_ctx *ctx = f-private_data; + int clockid; + + clockid = ctx-clockid; + seq_printf(m, clockid:\t%d\n, clockid); I think we can show ctx-ticks, itimerspec here. The ctx-ticks is required for proper dumping and restoring timerfd. + + return 0; +} +#endif + static const struct file_operations timerfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo= timerfd_show_fdinfo, +#endif .release= timerfd_release, .poll = timerfd_poll, .read = timerfd_read, -- 1.8.5.2.297.g3e57c29 ___ CRIU mailing list c...@openvz.org https://lists.openvz.org/mailman/listinfo/criu -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] thp: move preallocated PTE page table on move_huge_pmd()
2013/12/4 Kirill A. Shutemov : > Andrey Wagin reported crash on VM_BUG_ON() in pgtable_pmd_page_dtor() > with fallowing backtrace: > > [] free_pgd_range+0x2bf/0x410 > [] free_pgtables+0xce/0x120 > [] unmap_region+0xe0/0x120 > [] ? move_page_tables+0x526/0x6b0 > [] do_munmap+0x249/0x360 > [] move_vma+0x144/0x270 > [] SyS_mremap+0x3b9/0x510 > [] system_call_fastpath+0x16/0x1b > > The crash can be reproduce with this test case: > > #define _GNU_SOURCE > #include > #include > #include > > #define MB (1024 * 1024UL) > #define GB (1024 * MB) > > int main(int argc, char **argv) > { > char *p; > int i; > > p = mmap((void *) GB, 10 * MB, PROT_READ | PROT_WRITE, > MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); > for (i = 0; i < 10 * MB; i += 4096) > p[i] = 1; > mremap(p, 10 * MB, 10 * MB, MREMAP_FIXED | MREMAP_MAYMOVE, 2 * GB); > return 0; > } > > Due to split PMD lock, we now store preallocated PTE tables for THP > pages per-PMD table. It means we need to move them to other PMD table > if huge PMD moved there. > > Signed-off-by: Kirill A. Shutemov > Reported-by: Andrey Vagin My tests were working for the night without any problem. Thanks for the quick response. Tested-by: Andrey Vagin > --- > mm/huge_memory.c | 12 +++- > 1 file changed, 11 insertions(+), 1 deletion(-) > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index bccd5a628ea6..33a5dc492810 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -1481,8 +1481,18 @@ int move_huge_pmd(struct vm_area_struct *vma, struct > vm_area_struct *new_vma, > pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); > VM_BUG_ON(!pmd_none(*new_pmd)); > set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); > - if (new_ptl != old_ptl) > + if (new_ptl != old_ptl) { > + pgtable_t pgtable; > + > + /* > +* Move preallocated PTE page table if new_pmd is on > +* different PMD page table. > +*/ > + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); > + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); > + > spin_unlock(new_ptl); > + } > spin_unlock(old_ptl); > } > out: > -- > 1.8.4.4 > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] thp: move preallocated PTE page table on move_huge_pmd()
2013/12/4 Kirill A. Shutemov kirill.shute...@linux.intel.com: Andrey Wagin reported crash on VM_BUG_ON() in pgtable_pmd_page_dtor() with fallowing backtrace: [8119427f] free_pgd_range+0x2bf/0x410 [8119449e] free_pgtables+0xce/0x120 [8119b900] unmap_region+0xe0/0x120 [811a0036] ? move_page_tables+0x526/0x6b0 [8119d6a9] do_munmap+0x249/0x360 [811a0304] move_vma+0x144/0x270 [811a07e9] SyS_mremap+0x3b9/0x510 [8172d512] system_call_fastpath+0x16/0x1b The crash can be reproduce with this test case: #define _GNU_SOURCE #include sys/mman.h #include stdio.h #include unistd.h #define MB (1024 * 1024UL) #define GB (1024 * MB) int main(int argc, char **argv) { char *p; int i; p = mmap((void *) GB, 10 * MB, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); for (i = 0; i 10 * MB; i += 4096) p[i] = 1; mremap(p, 10 * MB, 10 * MB, MREMAP_FIXED | MREMAP_MAYMOVE, 2 * GB); return 0; } Due to split PMD lock, we now store preallocated PTE tables for THP pages per-PMD table. It means we need to move them to other PMD table if huge PMD moved there. Signed-off-by: Kirill A. Shutemov kirill.shute...@linux.intel.com Reported-by: Andrey Vagin ava...@openvz.org My tests were working for the night without any problem. Thanks for the quick response. Tested-by: Andrey Vagin ava...@openvz.org --- mm/huge_memory.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bccd5a628ea6..33a5dc492810 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1481,8 +1481,18 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - if (new_ptl != old_ptl) + if (new_ptl != old_ptl) { + pgtable_t pgtable; + + /* +* Move preallocated PTE page table if new_pmd is on +* different PMD page table. +*/ + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); + spin_unlock(new_ptl); + } spin_unlock(old_ptl); } out: -- 1.8.4.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 34/34] mm: dynamically allocate page->ptl if it cannot be embedded to struct page
2013/11/20 Kirill A. Shutemov : > Andrey Wagin wrote: >> 2013/11/20 Kirill A. Shutemov : >> > Andrey Wagin wrote: >> >> Hi Kirill, >> >> >> >> Looks like this patch adds memory leaks. >> >> [ 116.188310] kmemleak: 15672 new suspected memory leaks (see >> >> /sys/kernel/debug/kmemleak) >> >> unreferenced object 0x8800da45a350 (size 96): >> >> comm "dracut-initqueu", pid 93, jiffies 4294671391 (age 362.277s) >> >> hex dump (first 32 bytes): >> >> 07 00 07 00 ad 4e ad de ff ff ff ff 6b 6b 6b 6b .N.. >> >> ff ff ff ff ff ff ff ff 80 24 b4 82 ff ff ff ff .$.. >> >> backtrace: >> >> [] kmemleak_alloc+0x5e/0xc0 >> >> [] kmem_cache_alloc_trace+0x113/0x290 >> >> [] __ptlock_alloc+0x27/0x50 >> >> [] __pmd_alloc+0x59/0x170 >> >> [] copy_page_range+0x38a/0x3e0 >> >> [] dup_mm+0x313/0x540 >> >> [] copy_process+0x161a/0x1880 >> >> [] do_fork+0x8b/0x360 >> >> [] SyS_clone+0x16/0x20 >> >> [] stub_clone+0x69/0x90 >> >> [] 0x >> >> >> >> It's quite serious, because my test host went to panic in a few hours. >> > >> > Sorry for that. >> > >> > Could you test patch below. >> >> Yes, it works. >> >> I found this too a few minutes ago:) > > Nice > > Tested-by ? Tested-by: Andrey Vagin > > -- > Kirill A. Shutemov -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 34/34] mm: dynamically allocate page->ptl if it cannot be embedded to struct page
2013/11/20 Kirill A. Shutemov : > Andrey Wagin wrote: >> Hi Kirill, >> >> Looks like this patch adds memory leaks. >> [ 116.188310] kmemleak: 15672 new suspected memory leaks (see >> /sys/kernel/debug/kmemleak) >> unreferenced object 0x8800da45a350 (size 96): >> comm "dracut-initqueu", pid 93, jiffies 4294671391 (age 362.277s) >> hex dump (first 32 bytes): >> 07 00 07 00 ad 4e ad de ff ff ff ff 6b 6b 6b 6b .N.. >> ff ff ff ff ff ff ff ff 80 24 b4 82 ff ff ff ff .$.. >> backtrace: >> [] kmemleak_alloc+0x5e/0xc0 >> [] kmem_cache_alloc_trace+0x113/0x290 >> [] __ptlock_alloc+0x27/0x50 >> [] __pmd_alloc+0x59/0x170 >> [] copy_page_range+0x38a/0x3e0 >> [] dup_mm+0x313/0x540 >> [] copy_process+0x161a/0x1880 >> [] do_fork+0x8b/0x360 >> [] SyS_clone+0x16/0x20 >> [] stub_clone+0x69/0x90 >> [] 0x >> >> It's quite serious, because my test host went to panic in a few hours. > > Sorry for that. > > Could you test patch below. Yes, it works. I found this too a few minutes ago:) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index a7cccb6..44c366c 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -62,6 +62,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); + pgtable_pmd_page_dtor(virt_to_page(pmd)); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table * entries need a full cr3 reload to flush. Thanks. > > diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c > index a7cccb6d7fec..7be5809754cf 100644 > --- a/arch/x86/mm/pgtable.c > +++ b/arch/x86/mm/pgtable.c > @@ -61,6 +61,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page > *pte) > #if PAGETABLE_LEVELS > 2 > void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) > { > + struct page *page = virt_to_page(pmd); > paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); > /* > * NOTE! For PAE, any changes to the top page-directory-pointer-table > @@ -69,7 +70,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) > #ifdef CONFIG_X86_PAE > tlb->need_flush_all = 1; > #endif > - tlb_remove_page(tlb, virt_to_page(pmd)); > + pgtable_pmd_page_dtor(page); > + tlb_remove_page(tlb, page); > } > > #if PAGETABLE_LEVELS > 3 > -- > Kirill A. Shutemov -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 34/34] mm: dynamically allocate page->ptl if it cannot be embedded to struct page
Hi Kirill, Looks like this patch adds memory leaks. [ 116.188310] kmemleak: 15672 new suspected memory leaks (see /sys/kernel/debug/kmemleak) unreferenced object 0x8800da45a350 (size 96): comm "dracut-initqueu", pid 93, jiffies 4294671391 (age 362.277s) hex dump (first 32 bytes): 07 00 07 00 ad 4e ad de ff ff ff ff 6b 6b 6b 6b .N.. ff ff ff ff ff ff ff ff 80 24 b4 82 ff ff ff ff .$.. backtrace: [] kmemleak_alloc+0x5e/0xc0 [] kmem_cache_alloc_trace+0x113/0x290 [] __ptlock_alloc+0x27/0x50 [] __pmd_alloc+0x59/0x170 [] copy_page_range+0x38a/0x3e0 [] dup_mm+0x313/0x540 [] copy_process+0x161a/0x1880 [] do_fork+0x8b/0x360 [] SyS_clone+0x16/0x20 [] stub_clone+0x69/0x90 [] 0x It's quite serious, because my test host went to panic in a few hours. [12000.632734] kmemleak: 74155 new suspected memory leaks (see /sys/kernel/debug/kmemleak) [12080.734075] zombie00[29282]: segfault at 0 ip 00401862 sp 7fffc509bc20 error 6 in zombie00[40+5000] [12619.799052] BUG: unable to handle kernel paging request at 7aa9e3a0 [12619.800044] IP: [] cpuacct_charge+0x97/0x1e0 [12619.800044] PGD 0 [12619.800044] Thread overran stack, or stack corrupted [12619.800044] Oops: [#1] SMP [12619.800044] Modules linked in: binfmt_misc ip6table_filter ip6_tables tun netlink_diag af_packet_diag udp_diag tcp_diag inet_diag unix_diag joydev microcode pcspkr i2c_piix4 virtio_balloon virtio_net i2c_core virtio_blk floppy [12619.800044] CPU: 1 PID: 1324 Comm: kworker/u4:2 Not tainted 3.12.0+ #142 [12619.800044] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2007 [12619.800044] Workqueue: writeback bdi_writeback_workfn (flush-252:0) [12619.800044] task: 88001f1a8000 ti: 880096f26000 task.ti: 880096f26000 [12619.800044] RIP: 0010:[] [] cpuacct_charge+0x97/0x1e0 [12619.800044] RSP: 0018:88011b403ce8 EFLAGS: 00010002 [12619.800044] RAX: d580 RBX: 000f11b1 RCX: 0003 [12619.800044] RDX: 81c49e40 RSI: 81c4bb00 RDI: 88001f1a8c68 [12619.800044] RBP: 88011b403d18 R08: 0001 R09: 0001 [12619.800044] R10: 0001 R11: 0007 R12: 88001f1a8000 [12619.800044] R13: 1f1a8000 R14: 82a86320 R15: 06b1bda1e433 [12619.800044] FS: () GS:88011b40() knlGS: [12619.800044] CS: 0010 DS: ES: CR0: 8005003b [12619.800044] CR2: 7aa9e3a0 CR3: 01c0b000 CR4: 06e0 [12619.800044] Stack: [12619.800044] 810b2b70 0002 88011b5d40c0 000f11b1 [12619.800044] 88001f1a8068 88001f1a8000 88011b403d58 810a108f [12619.800044] 88011b403d88 88001f1a8068 88011b5d40c0 88011b5d4000 [12619.800044] Call Trace: [12619.800044] [12619.800044] [] ? cpuacct_css_alloc+0xb0/0xb0 [12619.800044] [] update_curr+0x13f/0x230 [12619.800044] [] task_tick_fair+0x2d7/0x650 [12619.800044] [] ? sched_clock_cpu+0xb8/0x120 [12619.800044] [] scheduler_tick+0x6d/0xf0 [12619.800044] [] update_process_times+0x61/0x80 [12619.800044] [] tick_sched_handle+0x37/0x80 [12619.800044] [] tick_sched_timer+0x54/0x90 [12619.800044] [] __run_hrtimer+0x71/0x2d0 [12619.800044] [] ? tick_nohz_handler+0xc0/0xc0 [12619.800044] [] hrtimer_interrupt+0x116/0x2a0 [12619.800044] [] ? __local_bh_enable+0x49/0x70 [12619.800044] [] local_apic_timer_interrupt+0x3b/0x60 [12619.800044] [] smp_apic_timer_interrupt+0x45/0x60 [12619.800044] [] apic_timer_interrupt+0x6f/0x80 [12619.800044] [12619.800044] [] ? mark_held_locks+0x90/0x150 [12619.800044] [] ? _raw_spin_unlock_irqrestore+0x42/0x70 [12619.800044] [] virtio_queue_rq+0xdb/0x1b0 [virtio_blk] [12619.800044] [] __blk_mq_run_hw_queue+0x1ca/0x520 [12619.800044] [] blk_mq_run_hw_queue+0x35/0x40 [12619.800044] [] blk_mq_insert_requests+0xe2/0x190 [12619.800044] [] blk_mq_flush_plug_list+0x134/0x150 [12619.800044] [] blk_flush_plug_list+0xbd/0x220 [12619.800044] [] blk_mq_make_request+0x3da/0x4d0 [12619.800044] [] generic_make_request+0xca/0x100 [12619.800044] [] submit_bio+0x76/0x160 [12619.800044] [] ? test_set_page_writeback+0x36/0x2b0 [12619.800044] [] ? end_swap_bio_read+0xc0/0xc0 [12619.800044] [] __swap_writepage+0x198/0x230 [12619.800044] [] ? _raw_spin_unlock+0x2b/0x40 [12619.800044] [] ? page_swapcount+0x53/0x70 [12619.800044] [] swap_writepage+0x43/0x90 [12619.800044] [] shrink_page_list+0x6cf/0xaa0 [12619.800044] [] shrink_inactive_list+0x1c2/0x5b0 [12619.800044] [] ? __lock_acquire+0x23f/0x1810 [12619.800044] [] shrink_lruvec+0x335/0x600 [12619.800044] [] ? mem_cgroup_iter+0x1f5/0x510 [12619.800044] [] shrink_zone+0x96/0x1d0 [12619.800044] [] do_try_to_free_pages+0x103/0x600 [12619.800044] [] ? sched_clock_local+0x25/0x90 [12619.800044] [] try_to_free_pages+0x222/0x440 [12619.800044] [] __alloc_pages_nodemask+0x8af/0xc70
Re: [PATCH 34/34] mm: dynamically allocate page-ptl if it cannot be embedded to struct page
Hi Kirill, Looks like this patch adds memory leaks. [ 116.188310] kmemleak: 15672 new suspected memory leaks (see /sys/kernel/debug/kmemleak) unreferenced object 0x8800da45a350 (size 96): comm dracut-initqueu, pid 93, jiffies 4294671391 (age 362.277s) hex dump (first 32 bytes): 07 00 07 00 ad 4e ad de ff ff ff ff 6b 6b 6b 6b .N.. ff ff ff ff ff ff ff ff 80 24 b4 82 ff ff ff ff .$.. backtrace: [817152fe] kmemleak_alloc+0x5e/0xc0 [811c34f3] kmem_cache_alloc_trace+0x113/0x290 [811920f7] __ptlock_alloc+0x27/0x50 [81192849] __pmd_alloc+0x59/0x170 [81195ffa] copy_page_range+0x38a/0x3e0 [8105a013] dup_mm+0x313/0x540 [8105b9da] copy_process+0x161a/0x1880 [8105c01b] do_fork+0x8b/0x360 [8105c306] SyS_clone+0x16/0x20 [81727b79] stub_clone+0x69/0x90 [] 0x It's quite serious, because my test host went to panic in a few hours. [12000.632734] kmemleak: 74155 new suspected memory leaks (see /sys/kernel/debug/kmemleak) [12080.734075] zombie00[29282]: segfault at 0 ip 00401862 sp 7fffc509bc20 error 6 in zombie00[40+5000] [12619.799052] BUG: unable to handle kernel paging request at 7aa9e3a0 [12619.800044] IP: [810b2c07] cpuacct_charge+0x97/0x1e0 [12619.800044] PGD 0 [12619.800044] Thread overran stack, or stack corrupted [12619.800044] Oops: [#1] SMP [12619.800044] Modules linked in: binfmt_misc ip6table_filter ip6_tables tun netlink_diag af_packet_diag udp_diag tcp_diag inet_diag unix_diag joydev microcode pcspkr i2c_piix4 virtio_balloon virtio_net i2c_core virtio_blk floppy [12619.800044] CPU: 1 PID: 1324 Comm: kworker/u4:2 Not tainted 3.12.0+ #142 [12619.800044] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2007 [12619.800044] Workqueue: writeback bdi_writeback_workfn (flush-252:0) [12619.800044] task: 88001f1a8000 ti: 880096f26000 task.ti: 880096f26000 [12619.800044] RIP: 0010:[810b2c07] [810b2c07] cpuacct_charge+0x97/0x1e0 [12619.800044] RSP: 0018:88011b403ce8 EFLAGS: 00010002 [12619.800044] RAX: d580 RBX: 000f11b1 RCX: 0003 [12619.800044] RDX: 81c49e40 RSI: 81c4bb00 RDI: 88001f1a8c68 [12619.800044] RBP: 88011b403d18 R08: 0001 R09: 0001 [12619.800044] R10: 0001 R11: 0007 R12: 88001f1a8000 [12619.800044] R13: 1f1a8000 R14: 82a86320 R15: 06b1bda1e433 [12619.800044] FS: () GS:88011b40() knlGS: [12619.800044] CS: 0010 DS: ES: CR0: 8005003b [12619.800044] CR2: 7aa9e3a0 CR3: 01c0b000 CR4: 06e0 [12619.800044] Stack: [12619.800044] 810b2b70 0002 88011b5d40c0 000f11b1 [12619.800044] 88001f1a8068 88001f1a8000 88011b403d58 810a108f [12619.800044] 88011b403d88 88001f1a8068 88011b5d40c0 88011b5d4000 [12619.800044] Call Trace: [12619.800044] IRQ [12619.800044] [810b2b70] ? cpuacct_css_alloc+0xb0/0xb0 [12619.800044] [810a108f] update_curr+0x13f/0x230 [12619.800044] [810a9e57] task_tick_fair+0x2d7/0x650 [12619.800044] [8109dcc8] ? sched_clock_cpu+0xb8/0x120 [12619.800044] [8109482d] scheduler_tick+0x6d/0xf0 [12619.800044] [8106afd1] update_process_times+0x61/0x80 [12619.800044] [810e38c7] tick_sched_handle+0x37/0x80 [12619.800044] [810e3e74] tick_sched_timer+0x54/0x90 [12619.800044] [8108bd21] __run_hrtimer+0x71/0x2d0 [12619.800044] [810e3e20] ? tick_nohz_handler+0xc0/0xc0 [12619.800044] [8108c246] hrtimer_interrupt+0x116/0x2a0 [12619.800044] [81062959] ? __local_bh_enable+0x49/0x70 [12619.800044] [81033dcb] local_apic_timer_interrupt+0x3b/0x60 [12619.800044] [81727c05] smp_apic_timer_interrupt+0x45/0x60 [12619.800044] [8172686f] apic_timer_interrupt+0x6f/0x80 [12619.800044] EOI [12619.800044] [810b8e10] ? mark_held_locks+0x90/0x150 [12619.800044] [8171c6f2] ? _raw_spin_unlock_irqrestore+0x42/0x70 [12619.800044] [a001b71b] virtio_queue_rq+0xdb/0x1b0 [virtio_blk] [12619.800044] [8134647a] __blk_mq_run_hw_queue+0x1ca/0x520 [12619.800044] [81346b35] blk_mq_run_hw_queue+0x35/0x40 [12619.800044] [813470f2] blk_mq_insert_requests+0xe2/0x190 [12619.800044] [813472d4] blk_mq_flush_plug_list+0x134/0x150 [12619.800044] [8133d0cd] blk_flush_plug_list+0xbd/0x220 [12619.800044] [81346f1a] blk_mq_make_request+0x3da/0x4d0 [12619.800044] [813397aa] generic_make_request+0xca/0x100 [12619.800044] [81339856] submit_bio+0x76/0x160 [12619.800044] [81173c66] ? test_set_page_writeback+0x36/0x2b0 [12619.800044] [811a9ae0] ?
Re: [PATCH 34/34] mm: dynamically allocate page-ptl if it cannot be embedded to struct page
2013/11/20 Kirill A. Shutemov kirill.shute...@linux.intel.com: Andrey Wagin wrote: Hi Kirill, Looks like this patch adds memory leaks. [ 116.188310] kmemleak: 15672 new suspected memory leaks (see /sys/kernel/debug/kmemleak) unreferenced object 0x8800da45a350 (size 96): comm dracut-initqueu, pid 93, jiffies 4294671391 (age 362.277s) hex dump (first 32 bytes): 07 00 07 00 ad 4e ad de ff ff ff ff 6b 6b 6b 6b .N.. ff ff ff ff ff ff ff ff 80 24 b4 82 ff ff ff ff .$.. backtrace: [817152fe] kmemleak_alloc+0x5e/0xc0 [811c34f3] kmem_cache_alloc_trace+0x113/0x290 [811920f7] __ptlock_alloc+0x27/0x50 [81192849] __pmd_alloc+0x59/0x170 [81195ffa] copy_page_range+0x38a/0x3e0 [8105a013] dup_mm+0x313/0x540 [8105b9da] copy_process+0x161a/0x1880 [8105c01b] do_fork+0x8b/0x360 [8105c306] SyS_clone+0x16/0x20 [81727b79] stub_clone+0x69/0x90 [] 0x It's quite serious, because my test host went to panic in a few hours. Sorry for that. Could you test patch below. Yes, it works. I found this too a few minutes ago:) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index a7cccb6..44c366c 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -62,6 +62,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) PAGE_SHIFT); + pgtable_pmd_page_dtor(virt_to_page(pmd)); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table * entries need a full cr3 reload to flush. Thanks. diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index a7cccb6d7fec..7be5809754cf 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -61,6 +61,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) #if PAGETABLE_LEVELS 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { + struct page *page = virt_to_page(pmd); paravirt_release_pmd(__pa(pmd) PAGE_SHIFT); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table @@ -69,7 +70,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb-need_flush_all = 1; #endif - tlb_remove_page(tlb, virt_to_page(pmd)); + pgtable_pmd_page_dtor(page); + tlb_remove_page(tlb, page); } #if PAGETABLE_LEVELS 3 -- Kirill A. Shutemov -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 34/34] mm: dynamically allocate page-ptl if it cannot be embedded to struct page
2013/11/20 Kirill A. Shutemov kirill.shute...@linux.intel.com: Andrey Wagin wrote: 2013/11/20 Kirill A. Shutemov kirill.shute...@linux.intel.com: Andrey Wagin wrote: Hi Kirill, Looks like this patch adds memory leaks. [ 116.188310] kmemleak: 15672 new suspected memory leaks (see /sys/kernel/debug/kmemleak) unreferenced object 0x8800da45a350 (size 96): comm dracut-initqueu, pid 93, jiffies 4294671391 (age 362.277s) hex dump (first 32 bytes): 07 00 07 00 ad 4e ad de ff ff ff ff 6b 6b 6b 6b .N.. ff ff ff ff ff ff ff ff 80 24 b4 82 ff ff ff ff .$.. backtrace: [817152fe] kmemleak_alloc+0x5e/0xc0 [811c34f3] kmem_cache_alloc_trace+0x113/0x290 [811920f7] __ptlock_alloc+0x27/0x50 [81192849] __pmd_alloc+0x59/0x170 [81195ffa] copy_page_range+0x38a/0x3e0 [8105a013] dup_mm+0x313/0x540 [8105b9da] copy_process+0x161a/0x1880 [8105c01b] do_fork+0x8b/0x360 [8105c306] SyS_clone+0x16/0x20 [81727b79] stub_clone+0x69/0x90 [] 0x It's quite serious, because my test host went to panic in a few hours. Sorry for that. Could you test patch below. Yes, it works. I found this too a few minutes ago:) Nice Tested-by ? Tested-by: Andrey Vagin ava...@openvz.org -- Kirill A. Shutemov -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] [RFC] kmemcg: remove union from memcg_params
2013/8/13 Andrew Morton : > On Fri, 9 Aug 2013 00:51:26 +0400 Andrey Vagin wrote: > >> struct memcg_cache_params { >> bool is_root_cache; >> union { >> struct kmem_cache *memcg_caches[0]; >> struct { >> struct mem_cgroup *memcg; >> struct list_head list; >> struct kmem_cache *root_cache; >> bool dead; >> atomic_t nr_pages; >> struct work_struct destroy; >> }; >> }; >> }; >> >> This union is a bit dangerous. //Andrew Morton >> >> The first problem was fixed in v3.10-rc5-67-gf101a94. >> The second problem is that the size of memory for root >> caches is calculated incorrectly: >> >> ssize_t size = memcg_caches_array_size(num_groups); >> >> size *= sizeof(void *); >> size += sizeof(struct memcg_cache_params); >> >> The last line should be fixed like this: >> size += offsetof(struct memcg_cache_params, memcg_caches) >> >> Andrew suggested to rework this code without union and >> this patch tries to do that. > > hm, did I? I reread your messages. I have seen in it, what I want. Sorry, you suggested to rework this code how you explained bellow in this message. "without union" is my fantasy. http://lkml.indiana.edu/hypermail/linux/kernel/1305.3/01985.html > >> This patch removes is_root_cache and union. The size of the >> memcg_cache_params structure is not changed. >> > > It's a bit sad to consume more space because we're sucky programmers. > It would be better to retain the union and to stop writing buggy code > to handle it! I decided to implement this approach, because it doesn't increase memory consumptions. This patch is replace is_root_cache on a pointer, but due to alignment the size of the structure is not changed. but the size of struct kmem_cache is increased on one pointer, if accounting of kernel memory is not enabled. The overhead of this patch on a real system is about 1K if the kernel memory accounting is disabled and the overhead is zero after enabling the accounting. > > Maybe there are things we can do to reduce the likelihood of people > mishandling the union - don't use anonymous fields, name each member, > access it via helper functions, etc. Ok, I wil try this way. Thanks, Andrey -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] [RFC] kmemcg: remove union from memcg_params
2013/8/13 Andrew Morton a...@linux-foundation.org: On Fri, 9 Aug 2013 00:51:26 +0400 Andrey Vagin ava...@openvz.org wrote: struct memcg_cache_params { bool is_root_cache; union { struct kmem_cache *memcg_caches[0]; struct { struct mem_cgroup *memcg; struct list_head list; struct kmem_cache *root_cache; bool dead; atomic_t nr_pages; struct work_struct destroy; }; }; }; This union is a bit dangerous. //Andrew Morton The first problem was fixed in v3.10-rc5-67-gf101a94. The second problem is that the size of memory for root caches is calculated incorrectly: ssize_t size = memcg_caches_array_size(num_groups); size *= sizeof(void *); size += sizeof(struct memcg_cache_params); The last line should be fixed like this: size += offsetof(struct memcg_cache_params, memcg_caches) Andrew suggested to rework this code without union and this patch tries to do that. hm, did I? I reread your messages. I have seen in it, what I want. Sorry, you suggested to rework this code how you explained bellow in this message. without union is my fantasy. http://lkml.indiana.edu/hypermail/linux/kernel/1305.3/01985.html This patch removes is_root_cache and union. The size of the memcg_cache_params structure is not changed. It's a bit sad to consume more space because we're sucky programmers. It would be better to retain the union and to stop writing buggy code to handle it! I decided to implement this approach, because it doesn't increase memory consumptions. This patch is replace is_root_cache on a pointer, but due to alignment the size of the structure is not changed. but the size of struct kmem_cache is increased on one pointer, if accounting of kernel memory is not enabled. The overhead of this patch on a real system is about 1K if the kernel memory accounting is disabled and the overhead is zero after enabling the accounting. Maybe there are things we can do to reduce the likelihood of people mishandling the union - don't use anonymous fields, name each member, access it via helper functions, etc. Ok, I wil try this way. Thanks, Andrey -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"
On Tue, Jun 18, 2013 at 02:56:51AM +0400, Andrey Wagin wrote: > 2013/6/17 Eric W. Biederman : > > So for anyone seriously worried about this kind of thing in general we > > already have the memory control group, which is quite capable of > > limiting this kind of thing, > > > and it limits all memory allocations not just mount. > > And that is problem, we can't to limit a particular slab. Let's > imagine a real container with 4Gb of RAM. What is a kernel memory > limit resonable for it? I setup 64 Mb (it may be not enough for real > CT, but it's enough to make host inaccessible for some minutes). > > $ mkdir /sys/fs/cgroup/memory/test > $ echo $((64 << 20)) > /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes > $ unshare -m > $ echo $$ > /sys/fs/cgroup/memory/test/tasks > $ mount --make-rprivate / > $ mount -t tmpfs xxx /mnt > $ mount --make-shared /mnt > $ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt > `mktemp -d /mnt/test.XX` & done; for i in `seq 30`; do wait; > done' > real 0m23.141s > user 0m0.016s > sys 0m22.881s > > While the last script is working, nobody can't to read /proc/mounts or > mount something. I don't think that users from other containers will > be glad. This problem is not so significant in compared with umounting > of this tree. > > $ strace -T umount -l /mnt > umount("/mnt", MNT_DETACH) = 0 <548.898244> > The host is inaccessible, it writes messages about soft lockup in > kernel log and eats 100% cpu. Eric, do you agree that * It is a problem * Currently we don't have a mechanism to prevent this problem * We need to find a way to prevent this problem > > > > > > Is there some reason we want to go down the path of adding and tuning > > static limits all over the kernel? As opposed to streamlining the memory > > control group so it is low overhead and everyone that cares can use it? > > The memory control group doesn't help in this case... I need to look > at this code in more details, maybe we can limit a depth of nested > mount points. Complexity of the umount algorithm does not depends on a depth of nested mounts, it depends on a number of mounts and sometimes complexity is O(n^2). For example: mount -t tmpfs xxx /mnt mount --make-shared /mnt mkdir /mnt/tmp mount -t tmpfs xxx /mnt/tmp mkdir /mnt/d for ((i = 0; i < $1; i++)); do d=`mktemp -d /mnt/d/xxx.XX` mount --bind /mnt/tmp $d || break done mkdir /mnt/tmp/d for ((i = 0; i < $1; i++)); do d=`mktemp -d /mnt/tmp/xxx.XX` mount --bind /mnt/tmp/d $d || break done perf data for umount -l /mnt 29.60% dbus-daemon [kernel.kallsyms][k] __ticket_spin_lock | --- __ticket_spin_lock lg_local_lock path_init path_openat do_filp_open do_sys_open SyS_openat system_call_fastpath __openat64_nocancel 0x747379732f312d73 20.20% umount [kernel.kallsyms][k] propagation_next | --- propagation_next | |--65.35%-- umount_tree | SyS_umount | system_call_fastpath | __umount2 | __libc_start_main | --34.65%-- propagate_umount umount_tree SyS_umount system_call_fastpath __umount2 __libc_start_main 17.81% umount [kernel.kallsyms][k] __lookup_mnt | --- __lookup_mnt | |--82.78%-- propagate_umount | umount_tree | SyS_umount | system_call_fastpath | __umount2 | __libc_start_main | --17.22%-- umount_tree SyS_umount system_call_fastpath __umount2 __libc_start_main -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] [RFC] mnt: restrict a number of struct mnt
On Tue, Jun 18, 2013 at 02:56:51AM +0400, Andrey Wagin wrote: 2013/6/17 Eric W. Biederman ebied...@xmission.com: So for anyone seriously worried about this kind of thing in general we already have the memory control group, which is quite capable of limiting this kind of thing, and it limits all memory allocations not just mount. And that is problem, we can't to limit a particular slab. Let's imagine a real container with 4Gb of RAM. What is a kernel memory limit resonable for it? I setup 64 Mb (it may be not enough for real CT, but it's enough to make host inaccessible for some minutes). $ mkdir /sys/fs/cgroup/memory/test $ echo $((64 20)) /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes $ unshare -m $ echo $$ /sys/fs/cgroup/memory/test/tasks $ mount --make-rprivate / $ mount -t tmpfs xxx /mnt $ mount --make-shared /mnt $ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XX` done; for i in `seq 30`; do wait; done' real 0m23.141s user 0m0.016s sys 0m22.881s While the last script is working, nobody can't to read /proc/mounts or mount something. I don't think that users from other containers will be glad. This problem is not so significant in compared with umounting of this tree. $ strace -T umount -l /mnt umount(/mnt, MNT_DETACH) = 0 548.898244 The host is inaccessible, it writes messages about soft lockup in kernel log and eats 100% cpu. Eric, do you agree that * It is a problem * Currently we don't have a mechanism to prevent this problem * We need to find a way to prevent this problem Is there some reason we want to go down the path of adding and tuning static limits all over the kernel? As opposed to streamlining the memory control group so it is low overhead and everyone that cares can use it? The memory control group doesn't help in this case... I need to look at this code in more details, maybe we can limit a depth of nested mount points. Complexity of the umount algorithm does not depends on a depth of nested mounts, it depends on a number of mounts and sometimes complexity is O(n^2). For example: mount -t tmpfs xxx /mnt mount --make-shared /mnt mkdir /mnt/tmp mount -t tmpfs xxx /mnt/tmp mkdir /mnt/d for ((i = 0; i $1; i++)); do d=`mktemp -d /mnt/d/xxx.XX` mount --bind /mnt/tmp $d || break done mkdir /mnt/tmp/d for ((i = 0; i $1; i++)); do d=`mktemp -d /mnt/tmp/xxx.XX` mount --bind /mnt/tmp/d $d || break done perf data for umount -l /mnt 29.60% dbus-daemon [kernel.kallsyms][k] __ticket_spin_lock | --- __ticket_spin_lock lg_local_lock path_init path_openat do_filp_open do_sys_open SyS_openat system_call_fastpath __openat64_nocancel 0x747379732f312d73 20.20% umount [kernel.kallsyms][k] propagation_next | --- propagation_next | |--65.35%-- umount_tree | SyS_umount | system_call_fastpath | __umount2 | __libc_start_main | --34.65%-- propagate_umount umount_tree SyS_umount system_call_fastpath __umount2 __libc_start_main 17.81% umount [kernel.kallsyms][k] __lookup_mnt | --- __lookup_mnt | |--82.78%-- propagate_umount | umount_tree | SyS_umount | system_call_fastpath | __umount2 | __libc_start_main | --17.22%-- umount_tree SyS_umount system_call_fastpath __umount2 __libc_start_main -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"
2013/6/17 Eric W. Biederman : > Andrey Vagin writes: > >> I found that a few processes can eat all host memory and nobody can kill >> them. >> $ mount -t tmpfs xxx /mnt >> $ mount --make-shared /mnt >> $ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XX` & done >> >> All this processes are unkillable, because they took i_mutex and waits >> namespace_lock. >> >> ... >> 21715 pts/0D 0:00 \_ mount --bind /mnt /mnt/test.ht6jzO >> 21716 pts/0D 0:00 \_ mount --bind /mnt /mnt/test.97K4mI >> 21717 pts/0R 0:01 \_ mount --bind /mnt /mnt/test.gO2CD9 >> ... >> >> Each of this process doubles a number of mounts, so at the end we will >> have about 2^32 mounts and the size of struct mnt is 256 bytes, so we >> need about 1TB of RAM. >> >> Another problem is that “umount” of a big tree is very hard operation >> and it requires a lot of time. >> E.g.: >> 16411 >> umount("/tmp/xxx", MNT_DETACH) = 0 <7.852066> (7.8 sec) >> 32795 >> umount("/tmp/xxx", MNT_DETACH) = 0 <34.485501> ( 34 sec) >> >> For all this time sys_umoun takes namespace_sem and vfsmount_lock... >> >> Due to all this reasons I suggest to restrict a number of mounts. >> Probably we can optimize this code in a future, but now this restriction >> can help. > > So for anyone seriously worried about this kind of thing in general we > already have the memory control group, which is quite capable of > limiting this kind of thing, > and it limits all memory allocations not just mount. And that is problem, we can't to limit a particular slab. Let's imagine a real container with 4Gb of RAM. What is a kernel memory limit resonable for it? I setup 64 Mb (it may be not enough for real CT, but it's enough to make host inaccessible for some minutes). $ mkdir /sys/fs/cgroup/memory/test $ echo $((64 << 20)) > /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes $ unshare -m $ echo $$ > /sys/fs/cgroup/memory/test/tasks $ mount --make-rprivate / $ mount -t tmpfs xxx /mnt $ mount --make-shared /mnt $ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XX` & done; for i in `seq 30`; do wait; done' real 0m23.141s user 0m0.016s sys 0m22.881s While the last script is working, nobody can't to read /proc/mounts or mount something. I don't think that users from other containers will be glad. This problem is not so significant in compared with umounting of this tree. $ strace -T umount -l /mnt umount("/mnt", MNT_DETACH) = 0 <548.898244> The host is inaccessible, it writes messages about soft lockup in kernel log and eats 100% cpu. > > Is there some reason we want to go down the path of adding and tuning > static limits all over the kernel? As opposed to streamlining the memory > control group so it is low overhead and everyone that cares can use it? The memory control group doesn't help in this case... I need to look at this code in more details, maybe we can limit a depth of nested mount points. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] [RFC] mnt: restrict a number of struct mnt
2013/6/17 Eric W. Biederman ebied...@xmission.com: Andrey Vagin ava...@openvz.org writes: I found that a few processes can eat all host memory and nobody can kill them. $ mount -t tmpfs xxx /mnt $ mount --make-shared /mnt $ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XX` done All this processes are unkillable, because they took i_mutex and waits namespace_lock. ... 21715 pts/0D 0:00 \_ mount --bind /mnt /mnt/test.ht6jzO 21716 pts/0D 0:00 \_ mount --bind /mnt /mnt/test.97K4mI 21717 pts/0R 0:01 \_ mount --bind /mnt /mnt/test.gO2CD9 ... Each of this process doubles a number of mounts, so at the end we will have about 2^32 mounts and the size of struct mnt is 256 bytes, so we need about 1TB of RAM. Another problem is that “umount” of a big tree is very hard operation and it requires a lot of time. E.g.: 16411 umount(/tmp/xxx, MNT_DETACH) = 0 7.852066 (7.8 sec) 32795 umount(/tmp/xxx, MNT_DETACH) = 0 34.485501 ( 34 sec) For all this time sys_umoun takes namespace_sem and vfsmount_lock... Due to all this reasons I suggest to restrict a number of mounts. Probably we can optimize this code in a future, but now this restriction can help. So for anyone seriously worried about this kind of thing in general we already have the memory control group, which is quite capable of limiting this kind of thing, and it limits all memory allocations not just mount. And that is problem, we can't to limit a particular slab. Let's imagine a real container with 4Gb of RAM. What is a kernel memory limit resonable for it? I setup 64 Mb (it may be not enough for real CT, but it's enough to make host inaccessible for some minutes). $ mkdir /sys/fs/cgroup/memory/test $ echo $((64 20)) /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes $ unshare -m $ echo $$ /sys/fs/cgroup/memory/test/tasks $ mount --make-rprivate / $ mount -t tmpfs xxx /mnt $ mount --make-shared /mnt $ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XX` done; for i in `seq 30`; do wait; done' real 0m23.141s user 0m0.016s sys 0m22.881s While the last script is working, nobody can't to read /proc/mounts or mount something. I don't think that users from other containers will be glad. This problem is not so significant in compared with umounting of this tree. $ strace -T umount -l /mnt umount(/mnt, MNT_DETACH) = 0 548.898244 The host is inaccessible, it writes messages about soft lockup in kernel log and eats 100% cpu. Is there some reason we want to go down the path of adding and tuning static limits all over the kernel? As opposed to streamlining the memory control group so it is low overhead and everyone that cares can use it? The memory control group doesn't help in this case... I need to look at this code in more details, maybe we can limit a depth of nested mount points. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/1] move exit_task_namespaces() outside of exit_notify()
2013/4/13 Oleg Nesterov > > exit_notify() does exit_task_namespaces() after > forget_original_parent(). This was needed to ensure that ->nsproxy > can't be cleared prematurely, an exiting child we are going to > reparent can do do_notify_parent() and use the parent's (ours) pid_ns. > > However, after 32084504 "pidns: use task_active_pid_ns in > do_notify_parent" ->nsproxy != NULL is no longer needed, we rely > on task_active_pid_ns(). > > Move exit_task_namespaces() from exit_notify() to do_exit(), after > exit_fs() and before exit_task_work(). > > This solves the problem reported by Andrey, free_ipc_ns()->shm_destroy() > does fput() which needs task_work_add(). And this allows us do simplify > exit_notify(), we can avoid unlock/lock(tasklist) and we can change > ->exit_state instead of PF_EXITING in forget_original_parent(). > It looks good for me. I have tested it a bit and don't find any problem. Oleg, thank you. Acked-by: Andrew Vagin > Reported-by: Andrey Vagin > Signed-off-by: Oleg Nesterov > > --- x/kernel/exit.c > +++ x/kernel/exit.c > @@ -649,7 +649,6 @@ static void exit_notify(struct task_stru > * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) > */ > forget_original_parent(tsk); > - exit_task_namespaces(tsk); > > write_lock_irq(_lock); > if (group_dead) > @@ -795,6 +794,7 @@ void do_exit(long code) > exit_shm(tsk); > exit_files(tsk); > exit_fs(tsk); > + exit_task_namespaces(tsk); > exit_task_work(tsk); > check_stack_usage(); > exit_thread(); > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/1] move exit_task_namespaces() outside of exit_notify()
2013/4/13 Oleg Nesterov o...@redhat.com exit_notify() does exit_task_namespaces() after forget_original_parent(). This was needed to ensure that -nsproxy can't be cleared prematurely, an exiting child we are going to reparent can do do_notify_parent() and use the parent's (ours) pid_ns. However, after 32084504 pidns: use task_active_pid_ns in do_notify_parent -nsproxy != NULL is no longer needed, we rely on task_active_pid_ns(). Move exit_task_namespaces() from exit_notify() to do_exit(), after exit_fs() and before exit_task_work(). This solves the problem reported by Andrey, free_ipc_ns()-shm_destroy() does fput() which needs task_work_add(). And this allows us do simplify exit_notify(), we can avoid unlock/lock(tasklist) and we can change -exit_state instead of PF_EXITING in forget_original_parent(). It looks good for me. I have tested it a bit and don't find any problem. Oleg, thank you. Acked-by: Andrew Vagin ava...@gmail.com Reported-by: Andrey Vagin ava...@openvz.org Signed-off-by: Oleg Nesterov o...@redhat.com --- x/kernel/exit.c +++ x/kernel/exit.c @@ -649,7 +649,6 @@ static void exit_notify(struct task_stru * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ forget_original_parent(tsk); - exit_task_namespaces(tsk); write_lock_irq(tasklist_lock); if (group_dead) @@ -795,6 +794,7 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + exit_task_namespaces(tsk); exit_task_work(tsk); check_stack_usage(); exit_thread(); -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ptrace: add ability to retrieve signals without removing from a queue (v2)
Andrew, thank you for the comments. I will fix them and send a new patch. A few comments are inline. 2013/2/26 Andrew Morton : > On Mon, 25 Feb 2013 14:06:53 +0400 >> + for (i = 0; i < arg.nr; i++) { >> + off = arg.off + i; > > "long long" = "u64" + "int". > > Wanna have a little think about what we're doing here, see if we can > pick the most appropriate types? A number of signal has the type int, because the system call ptrace() return "int" and my interface supposed to return a number of dumped signals. "off" is signed to check on a negative value and it can not be more then MAX_RAM / sizeof(siginfo), so long long is enough. It can be changed on s64 for avoiding misleading. > > `off' could be made local to this loop, which is a bit neater. > >> + spin_lock_irq(>sighand->siglock); >> + list_for_each_entry(q, >list, list) { >> + if (!off--) { >> + copy_siginfo(, >info); >> + break; >> + } >> + } >> + spin_unlock_irq(>sighand->siglock); >> + >> + if (off >= 0) >> + break; > > > > Oh I see what you did there - the user requested an entry which is > beyond the end of the list? A code comment would be nice. > > What's the return value in this case? "i". So how does userspace find > out what happened? The request PTRACE_PEEKSIGINFO returns a number of dumped signals. If a signal with the specified sequence number doesn't exist, ptrace returns 0. > > Please fully describe the interface so things such as this can be > understood. > >> +#ifdef CONFIG_COMPAT >> + if (unlikely(is_compat_task())) { >> + compat_siginfo_t __user *uinfo = compat_ptr(data); >> + >> + ret = copy_siginfo_to_user32(uinfo, ); >> + if (!ret) >> + ret = __put_user(info.si_code, >> >si_code); >> + } else >> +#endif >> + { >> + siginfo_t __user *uinfo = (siginfo_t __user *) data; >> + >> + ret = copy_siginfo_to_user(uinfo, ); >> + if (!ret) >> + ret = __put_user(info.si_code, >> >si_code); >> + } >> + >> + if (ret) >> + break; >> + >> + data += sizeof(siginfo_t); >> + >> + if (signal_pending(current)) { >> + i++; > > What does the i++ do? For accounting a current siginfo. I will add a comment here. > >> + break; >> + } >> + >> + cond_resched(); >> + } >> + >> + return i ? : ret; > > I hate that gcc thing :( > > Also, is it buggy? `ret' might be -EFAULT here. We appear to be using > read() return semantics here? I have tried to implement read() semantics here. ptrace() returns a number of dumped signals even if dumping of the last signal is failed. And it returns an error if not one signal has been dumped successfully. The similar logic is used in read() generic_file_aio_read() retval += desc.written; if (desc.error) { retval = retval ?: desc.error; break; } Thanks, Andrey -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ptrace: add ability to retrieve signals without removing from a queue (v2)
2013/2/25 Pavel Emelyanov : >> + for (i = 0; i < arg.nr; i++) { >> + off = arg.off + i; >> + >> + spin_lock_irq(>sighand->siglock); >> + list_for_each_entry(q, >list, list) { >> + if (!off--) { >> + copy_siginfo(, >info); >> + break; >> + } >> + } >> + spin_unlock_irq(>sighand->siglock); > > What's the point of arg.nr if you for every single siginfo drop the lock > and perform linear search anyway? arg.nr may be useful, if we will decide to optimize this part in a future. Currently the problem with complexity of finding a signal in a queue is true not only for this code. It exists for delivering real-time signals as well. Probably in a future this problem will be fixed for both cases by the same way. Thanks, Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ptrace: add ability to retrieve signals without removing from a queue (v2)
Andrew, thank you for the comments. I will fix them and send a new patch. A few comments are inline. 2013/2/26 Andrew Morton a...@linux-foundation.org: On Mon, 25 Feb 2013 14:06:53 +0400 + for (i = 0; i arg.nr; i++) { + off = arg.off + i; long long = u64 + int. Wanna have a little think about what we're doing here, see if we can pick the most appropriate types? A number of signal has the type int, because the system call ptrace() return int and my interface supposed to return a number of dumped signals. off is signed to check on a negative value and it can not be more then MAX_RAM / sizeof(siginfo), so long long is enough. It can be changed on s64 for avoiding misleading. `off' could be made local to this loop, which is a bit neater. + spin_lock_irq(child-sighand-siglock); + list_for_each_entry(q, pending-list, list) { + if (!off--) { + copy_siginfo(info, q-info); + break; + } + } + spin_unlock_irq(child-sighand-siglock); + + if (off = 0) + break; scratches head Oh I see what you did there - the user requested an entry which is beyond the end of the list? A code comment would be nice. What's the return value in this case? i. So how does userspace find out what happened? The request PTRACE_PEEKSIGINFO returns a number of dumped signals. If a signal with the specified sequence number doesn't exist, ptrace returns 0. Please fully describe the interface so things such as this can be understood. +#ifdef CONFIG_COMPAT + if (unlikely(is_compat_task())) { + compat_siginfo_t __user *uinfo = compat_ptr(data); + + ret = copy_siginfo_to_user32(uinfo, info); + if (!ret) + ret = __put_user(info.si_code, uinfo-si_code); + } else +#endif + { + siginfo_t __user *uinfo = (siginfo_t __user *) data; + + ret = copy_siginfo_to_user(uinfo, info); + if (!ret) + ret = __put_user(info.si_code, uinfo-si_code); + } + + if (ret) + break; + + data += sizeof(siginfo_t); + + if (signal_pending(current)) { + i++; What does the i++ do? For accounting a current siginfo. I will add a comment here. + break; + } + + cond_resched(); + } + + return i ? : ret; I hate that gcc thing :( Also, is it buggy? `ret' might be -EFAULT here. We appear to be using read() return semantics here? I have tried to implement read() semantics here. ptrace() returns a number of dumped signals even if dumping of the last signal is failed. And it returns an error if not one signal has been dumped successfully. The similar logic is used in read() generic_file_aio_read() retval += desc.written; if (desc.error) { retval = retval ?: desc.error; break; } Thanks, Andrey -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ptrace: add ability to retrieve signals without removing from a queue (v2)
2013/2/25 Pavel Emelyanov xe...@parallels.com: + for (i = 0; i arg.nr; i++) { + off = arg.off + i; + + spin_lock_irq(child-sighand-siglock); + list_for_each_entry(q, pending-list, list) { + if (!off--) { + copy_siginfo(info, q-info); + break; + } + } + spin_unlock_irq(child-sighand-siglock); What's the point of arg.nr if you for every single siginfo drop the lock and perform linear search anyway? arg.nr may be useful, if we will decide to optimize this part in a future. Currently the problem with complexity of finding a signal in a queue is true not only for this code. It exists for delivering real-time signals as well. Probably in a future this problem will be fixed for both cases by the same way. Thanks, Andrew -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ptrace: add ability to retrieve signals without removing them from a queue
2013/2/16 Pedro Alves : > Forgot to reply to this bit: > > On 02/15/2013 07:43 PM, Oleg Nesterov wrote: >>> We'd miss the poke >>> > variant, but that looks like something that could be always be added >>> > later. >> Yes. _POKE_ or _QUEUE_ or _DEQUEUE_, we can add more features if user- >> space wants them. > > In general, IMO, I agree with Roland at https://lkml.org/lkml/2002/12/20/84 > in that it's good to have setters for completeness, so that you can > change all the state via ptrace that you can read via ptrace. > > But I'm not doing any of this work, hence my "could always be > added later" comment instead of actually requesting it. But if > we had it, we could make e.g., gdb inspect the signal queues, > and then be able to tweak a realtime signal before it is > delivered. PTRACE_POKESIGINFO is more complicated than PTRACE_PEEKSIGINFO. Looks like PTRACE_POKESIGINFO should replace a siginfo with the specified sequence number. Should it be able to change signo? If it is able, what to do with signalfd, which already got a notification about the previous signal?.. My opinion is that it "could always be added later", when we will understand what exactly we want to have. > > -- > Pedro Alves -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ptrace: add ability to retrieve signals without removing them from a queue
2013/2/16 Pedro Alves pal...@redhat.com: Forgot to reply to this bit: On 02/15/2013 07:43 PM, Oleg Nesterov wrote: We'd miss the poke variant, but that looks like something that could be always be added later. Yes. _POKE_ or _QUEUE_ or _DEQUEUE_, we can add more features if user- space wants them. In general, IMO, I agree with Roland at https://lkml.org/lkml/2002/12/20/84 in that it's good to have setters for completeness, so that you can change all the state via ptrace that you can read via ptrace. But I'm not doing any of this work, hence my could always be added later comment instead of actually requesting it. But if we had it, we could make e.g., gdb inspect the signal queues, and then be able to tweak a realtime signal before it is delivered. PTRACE_POKESIGINFO is more complicated than PTRACE_PEEKSIGINFO. Looks like PTRACE_POKESIGINFO should replace a siginfo with the specified sequence number. Should it be able to change signo? If it is able, what to do with signalfd, which already got a notification about the previous signal?.. My opinion is that it could always be added later, when we will understand what exactly we want to have. -- Pedro Alves -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 3/3] signalfd: add ability to read siginfo-s without dequeuing signals (v2)
2013/2/7 Oleg Nesterov : > Andrey, sorry for delay. > > As for API, I leave this to you and Michael. Not that I like these > new flags, but I agree that pread() hack was not pretty too. > > On 01/29, Andrey Vagin wrote: >> +static ssize_t signalfd_peek(struct signalfd_ctx *ctx, >> + siginfo_t *info, loff_t *ppos, int queue_mask) >> +{ >> + loff_t seq = *ppos / sizeof(struct signalfd_siginfo); >> + int signr = 0; >> + >> + if (queue_mask & SIGQUEUE_PRIVATE) >> + signr = peek_signal(>pending, >> + >sigmask, info, ); >> + else if (queue_mask & SIGQUEUE_SHARED) >> + signr = peek_signal(>signal->shared_pending, >> + >sigmask, info, ); >> + (*ppos) += sizeof(struct signalfd_siginfo); > > Now that this can work even with normal read(), we will actually change > f_pos. Then perhaps signalfd_fops->llseek() should work too. But this > is minor... lseek works only if FMODE_LSEEK is set. You have explained why read have strange semantics for SIGNALFD_PEEK. >Damn. But after I wrote this email I realized that llseek() probably can't > work. Because peek_offset/f_pos/whatever has to be shared with all processes > which have this file opened. > > Suppose that the task forks after sys_signalfd(). Now if parent or child > do llseek this affects them both. This is insane because signalfd is > "strange" to say at least, fork/dup/etc inherits signalfd_ctx but not the > "source" of the data. So I want to suggest a way how to forbid read() for SIGNALFD_PEEK. file->f_pos can be initialized to -1. read() returns EINVAL in this case. In a man page we will write that signals can be dumped only with help pread(). Is it overload or too ugly? > > Hmm. but since it works with read(), we shouldn't increment *ppos unless > signalfd_copyinfo() succeeds? No, we shouldn't. > > Btw, why do you pass seq by reference? Looks unneeded. You are right. I created this code for reading signals from both queues, but then we decided to forbid using SIGNALFD_PEEK for both queues simultaneously. Oleg, thank you for the comments. I'm waiting an answer on the question and after that I'm going to send a final version. > > Oleg. > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 3/3] signalfd: add ability to read siginfo-s without dequeuing signals (v2)
2013/2/7 Oleg Nesterov o...@redhat.com: Andrey, sorry for delay. As for API, I leave this to you and Michael. Not that I like these new flags, but I agree that pread() hack was not pretty too. On 01/29, Andrey Vagin wrote: +static ssize_t signalfd_peek(struct signalfd_ctx *ctx, + siginfo_t *info, loff_t *ppos, int queue_mask) +{ + loff_t seq = *ppos / sizeof(struct signalfd_siginfo); + int signr = 0; + + if (queue_mask SIGQUEUE_PRIVATE) + signr = peek_signal(current-pending, + ctx-sigmask, info, seq); + else if (queue_mask SIGQUEUE_SHARED) + signr = peek_signal(current-signal-shared_pending, + ctx-sigmask, info, seq); + (*ppos) += sizeof(struct signalfd_siginfo); Now that this can work even with normal read(), we will actually change f_pos. Then perhaps signalfd_fops-llseek() should work too. But this is minor... lseek works only if FMODE_LSEEK is set. You have explained why readlseek have strange semantics for SIGNALFD_PEEK. Damn. But after I wrote this email I realized that llseek() probably can't work. Because peek_offset/f_pos/whatever has to be shared with all processes which have this file opened. Suppose that the task forks after sys_signalfd(). Now if parent or child do llseek this affects them both. This is insane because signalfd is strange to say at least, fork/dup/etc inherits signalfd_ctx but not the source of the data. So I want to suggest a way how to forbid read() for SIGNALFD_PEEK. file-f_pos can be initialized to -1. read() returns EINVAL in this case. In a man page we will write that signals can be dumped only with help pread(). Is it overload or too ugly? Hmm. but since it works with read(), we shouldn't increment *ppos unless signalfd_copyinfo() succeeds? No, we shouldn't. Btw, why do you pass seq by reference? Looks unneeded. You are right. I created this code for reading signals from both queues, but then we decided to forbid using SIGNALFD_PEEK for both queues simultaneously. Oleg, thank you for the comments. I'm waiting an answer on the question and after that I'm going to send a final version. Oleg. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 3/3] signalfd: add ability to read siginfo-s without dequeuing signals (v2)
2013/2/2 Michael Kerrisk (man-pages) : > > On Jan 30, 2013 8:05 AM, "Andrey Vagin" wrote: >> >> If signalfd is created with the flag SFD_PEEK, it reads siginfo-s >> without dequeuing signals. >> >> For reading not first siginfo pread(fd, buf, size, pos) can be used, >> where ppos / sizeof(signalfd_siginfo) is a sequence number of a signal >> in a queue. > > Andrey, > > Is it perhaps worth erroring (EINVAL) if ((ppos % sizeof > (signalfd_siginfo)) != 0) ? Yes, it's already in the patch. @@ -257,9 +298,15 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, if (!count) return -EINVAL; + if (*ppos % sizeof(struct signalfd_siginfo)) + return -EINVAL; + > > Cheers, > > Michael > >> This functionality is required for checkpointing pending signals. >> >> v2: * signals can be dumped only from one queue. >> * treat pos as offset in bytes, not in elements, so pos should be >> aligned to the size of signalfd_siginfo. >> >> Cc: Oleg Nesterov >> Cc: Alexander Viro >> Cc: "Paul E. McKenney" >> Cc: David Howells >> Cc: Dave Jones >> Cc: Andrey Vagin >> Cc: Michael Kerrisk >> Cc: Pavel Emelyanov >> CC: Cyrill Gorcunov >> Signed-off-by: Andrey Vagin >> --- >> fs/signalfd.c | 61 >> --- >> include/uapi/linux/signalfd.h | 2 ++ >> 2 files changed, 60 insertions(+), 3 deletions(-) >> >> diff --git a/fs/signalfd.c b/fs/signalfd.c >> index 8019ec9..0da6a30 100644 >> --- a/fs/signalfd.c >> +++ b/fs/signalfd.c >> @@ -51,6 +51,47 @@ struct signalfd_ctx { >> sigset_t sigmask; >> }; >> >> +static int peek_signal(struct sigpending *pending, sigset_t *mask, >> + siginfo_t *info, loff_t *pseq) >> +{ >> + struct sigqueue *q; >> + int ret = 0; >> + >> + spin_lock_irq(>sighand->siglock); >> + >> + list_for_each_entry(q, >list, list) { >> + if (sigismember(mask, q->info.si_signo)) >> + continue; >> + >> + if ((*pseq)-- == 0) { >> + copy_siginfo(info, >info); >> + ret = info->si_signo; >> + break; >> + } >> + } >> + >> + spin_unlock_irq(>sighand->siglock); >> + >> + return ret; >> +} >> + >> +static ssize_t signalfd_peek(struct signalfd_ctx *ctx, >> + siginfo_t *info, loff_t *ppos, int >> queue_mask) >> +{ >> + loff_t seq = *ppos / sizeof(struct signalfd_siginfo); >> + int signr = 0; >> + >> + if (queue_mask & SIGQUEUE_PRIVATE) >> + signr = peek_signal(>pending, >> + >sigmask, info, ); >> + else if (queue_mask & SIGQUEUE_SHARED) >> + signr = peek_signal(>signal->shared_pending, >> +>sigmask, info, ); >> + (*ppos) += sizeof(struct signalfd_siginfo); >> + >> + return signr; >> +} >> + >> static int signalfd_release(struct inode *inode, struct file *file) >> { >> kfree(file->private_data); >> @@ -257,9 +298,15 @@ static ssize_t signalfd_read(struct file *file, char >> __user *buf, size_t count, >> if (!count) >> return -EINVAL; >> >> + if (*ppos % sizeof(struct signalfd_siginfo)) >> + return -EINVAL; >> + >> siginfo = (struct signalfd_siginfo __user *) buf; >> do { >> - ret = signalfd_dequeue(ctx, , nonblock, qmask); >> + if (file->f_flags & SFD_PEEK) >> + ret = signalfd_peek(ctx, , ppos, qmask); >> + else >> + ret = signalfd_dequeue(ctx, , nonblock, >> qmask); >> >> if (unlikely(ret <= 0)) >> break; >> @@ -315,7 +362,12 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user >> *, user_mask, >> BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC); >> BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK); >> >> - if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK | SFD_RAW | SFD_QUEUES)) >> + if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK | >> + SFD_RAW | SFD_PEEK | SFD_QUEUES)) >> + return -EINVAL; >> + >> + /* SFD_PEEK can be used for one queue only */ >> + if ((flags & SFD_PEEK) && ((flags & SFD_QUEUES) == SFD_QUEUES)) >> return -EINVAL; >> >> if (sizemask != sizeof(sigset_t) || >> @@ -352,7 +404,10 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user >> *, user_mask, >> } >> >> file->f_flags |= (flags & SFD_QUEUES) ? : SFD_QUEUES; >> - file->f_flags |= flags & SFD_RAW; >> + file->f_flags |= flags & (SFD_RAW | SFD_PEEK); >> + >> + if (file->f_flags & SFD_PEEK) >> + file->f_mode |= FMODE_PREAD; >> >> fd_install(ufd, file); >> } else { >>
Re: [PATCH 3/3] signalfd: add ability to read siginfo-s without dequeuing signals (v2)
2013/2/2 Michael Kerrisk (man-pages) mtk.manpa...@gmail.com: On Jan 30, 2013 8:05 AM, Andrey Vagin ava...@openvz.org wrote: If signalfd is created with the flag SFD_PEEK, it reads siginfo-s without dequeuing signals. For reading not first siginfo pread(fd, buf, size, pos) can be used, where ppos / sizeof(signalfd_siginfo) is a sequence number of a signal in a queue. Andrey, Is it perhaps worth erroring (EINVAL) if ((ppos % sizeof (signalfd_siginfo)) != 0) ? Yes, it's already in the patch. @@ -257,9 +298,15 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, if (!count) return -EINVAL; + if (*ppos % sizeof(struct signalfd_siginfo)) + return -EINVAL; + Cheers, Michael This functionality is required for checkpointing pending signals. v2: * signals can be dumped only from one queue. * treat pos as offset in bytes, not in elements, so pos should be aligned to the size of signalfd_siginfo. Cc: Oleg Nesterov o...@redhat.com Cc: Alexander Viro v...@zeniv.linux.org.uk Cc: Paul E. McKenney paul...@linux.vnet.ibm.com Cc: David Howells dhowe...@redhat.com Cc: Dave Jones da...@redhat.com Cc: Andrey Vagin ava...@openvz.org Cc: Michael Kerrisk mtk.manpa...@gmail.com Cc: Pavel Emelyanov xe...@parallels.com CC: Cyrill Gorcunov gorcu...@openvz.org Signed-off-by: Andrey Vagin ava...@openvz.org --- fs/signalfd.c | 61 --- include/uapi/linux/signalfd.h | 2 ++ 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/fs/signalfd.c b/fs/signalfd.c index 8019ec9..0da6a30 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -51,6 +51,47 @@ struct signalfd_ctx { sigset_t sigmask; }; +static int peek_signal(struct sigpending *pending, sigset_t *mask, + siginfo_t *info, loff_t *pseq) +{ + struct sigqueue *q; + int ret = 0; + + spin_lock_irq(current-sighand-siglock); + + list_for_each_entry(q, pending-list, list) { + if (sigismember(mask, q-info.si_signo)) + continue; + + if ((*pseq)-- == 0) { + copy_siginfo(info, q-info); + ret = info-si_signo; + break; + } + } + + spin_unlock_irq(current-sighand-siglock); + + return ret; +} + +static ssize_t signalfd_peek(struct signalfd_ctx *ctx, + siginfo_t *info, loff_t *ppos, int queue_mask) +{ + loff_t seq = *ppos / sizeof(struct signalfd_siginfo); + int signr = 0; + + if (queue_mask SIGQUEUE_PRIVATE) + signr = peek_signal(current-pending, + ctx-sigmask, info, seq); + else if (queue_mask SIGQUEUE_SHARED) + signr = peek_signal(current-signal-shared_pending, +ctx-sigmask, info, seq); + (*ppos) += sizeof(struct signalfd_siginfo); + + return signr; +} + static int signalfd_release(struct inode *inode, struct file *file) { kfree(file-private_data); @@ -257,9 +298,15 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, if (!count) return -EINVAL; + if (*ppos % sizeof(struct signalfd_siginfo)) + return -EINVAL; + siginfo = (struct signalfd_siginfo __user *) buf; do { - ret = signalfd_dequeue(ctx, info, nonblock, qmask); + if (file-f_flags SFD_PEEK) + ret = signalfd_peek(ctx, info, ppos, qmask); + else + ret = signalfd_dequeue(ctx, info, nonblock, qmask); if (unlikely(ret = 0)) break; @@ -315,7 +362,12 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK); - if (flags ~(SFD_CLOEXEC | SFD_NONBLOCK | SFD_RAW | SFD_QUEUES)) + if (flags ~(SFD_CLOEXEC | SFD_NONBLOCK | + SFD_RAW | SFD_PEEK | SFD_QUEUES)) + return -EINVAL; + + /* SFD_PEEK can be used for one queue only */ + if ((flags SFD_PEEK) ((flags SFD_QUEUES) == SFD_QUEUES)) return -EINVAL; if (sizemask != sizeof(sigset_t) || @@ -352,7 +404,10 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, } file-f_flags |= (flags SFD_QUEUES) ? : SFD_QUEUES; - file-f_flags |= flags SFD_RAW; + file-f_flags |= flags (SFD_RAW | SFD_PEEK); + + if (file-f_flags SFD_PEEK) + file-f_mode |= FMODE_PREAD; fd_install(ufd, file); } else { diff --git