Re: x86: Hardware breakpoints are not always triggered

2016-01-28 Thread Andrey Wagin
On Thu, Jan 28, 2016 at 10:33:28PM +0100, Paolo Bonzini wrote:
> 
> 
> On 28/01/2016 09:31, Andrey Wagin wrote:
> > I tried to print drX registers after a break-point. Looks like they
> > are set correctly.
> 
> Can you try this KVM patch?

Looks like it fixes a case when reproducers are running only in VM.

If I execute the reproducer a few times on the host and then execute it
in VM, it exits very fast.

Thanks,
Andrew

> 
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index c13a64b7d789..32bae1c70a50 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -1612,6 +1612,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu 
> *vcpu)
>   vcpu->arch.dr7 = svm->vmcb->save.dr7;
>  
>   vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
> + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
>   set_dr_intercepts(svm);
>  }
>  
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index e2951b6edbbc..505a4663b9f4 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -5659,6 +5659,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu 
> *vcpu)
>   vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
>  
>   vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
> + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
>  
>   cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
>   cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
> 
> Paolo


x86: Hardware breakpoints are not always triggered

2016-01-28 Thread Andrey Wagin
Hi,

We use hardware breakpoints in CRIU and we found that sometimes we set
a break-point, but a process doesn't stop on it.

I write a small reproducer for this bug. It create two processes,
where a parent process traces a child. The parent process sets a
break-point and each time when the child stop on it, the parent sets
the variable "xxx" to A in a child process. The child runs an infinite
loop, where it check the variable "xxx" and sets it to B. If a child
process finds that xxx is equal to B, it exits with a non-zero code,
what means that a break-point was not triggered. The source code is
attached.

The reproducer uses a different break-point address if it is executed
with arguments than when it executed without arguments.

Then I made a few experiments. The bug is triggered, if we execute
this program a few times in a KVM virtual machine.

[root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint > /dev/null
|| { echo "FAIL - $?"; break; }; done ) &
[3] 4088
[root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint > /dev/null
|| { echo "FAIL - $?"; break; }; done ) &
[4] 4091
[root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint 1 2 >
/dev/null || { echo "FAIL - $?"; break; }; done ) &
[5] 4094
[root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint 1 2 >
/dev/null || { echo "FAIL - $?"; break; }; done ) &
[6] 4097
[8] 4103
[root@fc22-vm ptrace]# 0087: exit - 5
0131: exited, status=1
0126: wait: No child processes
FAIL - 3

I tried to execute the reproducer on the host (where kvm VM-s are
running), but the bug was not triggered during one hour.

When I executed the reproducer in VM without stopping processes on the
host, I found that a bug is triggered much faster in this case.

[root@fc22-vm ptrace]# ./ptrace_breakpoint 1

stop 24675
cont
child2 1
stop 24676
cont
child2 1
child2 5
0088: exit - 5
stop 24677
0132: exited, status=1
cont
0127: wait: No child processes

I know that this bug can be reproduced starting with the 4.2 kernel. I
haven't test older versions of the kernel.

I tried to print drX registers after a break-point. Looks like they
are set correctly.

Maybe someone has any ideas where a problem is or how it can be investigated.

Here is a criu issue for this problem:
https://github.com/xemul/criu/issues/107

Thanks,
Andrew
#define _GNU_SOURCE /* See feature_test_macros(7) */
#include/* For SYS_xxx definitions */

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

/* Copied from the gdb header gdb/nat/x86-dregs.h */

/* Debug registers' indices.  */
#define DR_FIRSTADDR 0
#define DR_LASTADDR  3
#define DR_NADDR 4  /* The number of debug address registers.  */
#define DR_STATUS6  /* Index of debug status register (DR6).  */
#define DR_CONTROL   7  /* Index of debug control register (DR7).  */

#define DR_LOCAL_ENABLE_SHIFT   0 /* Extra shift to the local enable bit.  */
#define DR_GLOBAL_ENABLE_SHIFT  1 /* Extra shift to the global enable bit.  */
#define DR_ENABLE_SIZE  2 /* Two enable bits per debug register.  */

/* Locally enable the break/watchpoint in the I'th debug register.  */
#define X86_DR_LOCAL_ENABLE(i) (1 << (DR_LOCAL_ENABLE_SHIFT + DR_ENABLE_SIZE * (i)))


# define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#define pr_perror(fmt, ...) \
		fprintf(stderr, "%04d: " fmt ": %s\n", __LINE__, ##__VA_ARGS__, strerror(errno))
#define pr_err(fmt, ...) \
		fprintf(stderr, "%04d: " fmt "\n", __LINE__, ##__VA_ARGS__)

int ptrace_set_breakpoint(pid_t pid, void *addr)
{
int ret;

/* Set a breakpoint */
if (ptrace(PTRACE_POKEUSER, pid,
offsetof(struct user, u_debugreg[DR_FIRSTADDR]),
addr)) {
pr_perror("Unable to setup a breakpoint into %d", pid);
return -1;
}

/* Enable the breakpoint */
if (ptrace(PTRACE_POKEUSER, pid,
offsetof(struct user, u_debugreg[DR_CONTROL]),
X86_DR_LOCAL_ENABLE(DR_FIRSTADDR))) {
pr_perror("Unable to enable the breakpoint for %d", pid);
return -1;
}

ret = ptrace(PTRACE_CONT, pid, NULL, NULL);
if (ret) {
pr_perror("Unable to restart the  stopped tracee process %d", pid);
return -1;
}

return 1;
}

static long xxx = -1;
int child()
{
	printf("child %d\n", xxx);
	syscall(__NR_getppid);
	if (xxx == 5) {
		pr_err("exit - %d", xxx);
		exit(1);
	}
	if (xxx > 0)
		xxx = 5;
	return 0;
}

int child2()
{
	printf("child2 %d\n", xxx);
	syscall(__NR_getppid);
	if (xxx == 5) {
		pr_err("exit - %d", xxx);
		exit(1);
	}
	if (xxx > 0)
		xxx = 5;
	return 0;
}

int main(int argc, char **argv)
{
	pid_t pid;
	int status, i = 0;
	int (*addr)();

	if (argc > 1)
		addr = child2;
	else
		addr = child;

	pid = fork();
	

Re: x86: Hardware breakpoints are not always triggered

2016-01-28 Thread Andrey Wagin
On Thu, Jan 28, 2016 at 10:33:28PM +0100, Paolo Bonzini wrote:
> 
> 
> On 28/01/2016 09:31, Andrey Wagin wrote:
> > I tried to print drX registers after a break-point. Looks like they
> > are set correctly.
> 
> Can you try this KVM patch?

Looks like it fixes a case when reproducers are running only in VM.

If I execute the reproducer a few times on the host and then execute it
in VM, it exits very fast.

Thanks,
Andrew

> 
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index c13a64b7d789..32bae1c70a50 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -1612,6 +1612,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu 
> *vcpu)
>   vcpu->arch.dr7 = svm->vmcb->save.dr7;
>  
>   vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
> + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
>   set_dr_intercepts(svm);
>  }
>  
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index e2951b6edbbc..505a4663b9f4 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -5659,6 +5659,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu 
> *vcpu)
>   vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
>  
>   vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
> + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
>  
>   cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
>   cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
> 
> Paolo


x86: Hardware breakpoints are not always triggered

2016-01-28 Thread Andrey Wagin
Hi,

We use hardware breakpoints in CRIU and we found that sometimes we set
a break-point, but a process doesn't stop on it.

I write a small reproducer for this bug. It create two processes,
where a parent process traces a child. The parent process sets a
break-point and each time when the child stop on it, the parent sets
the variable "xxx" to A in a child process. The child runs an infinite
loop, where it check the variable "xxx" and sets it to B. If a child
process finds that xxx is equal to B, it exits with a non-zero code,
what means that a break-point was not triggered. The source code is
attached.

The reproducer uses a different break-point address if it is executed
with arguments than when it executed without arguments.

Then I made a few experiments. The bug is triggered, if we execute
this program a few times in a KVM virtual machine.

[root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint > /dev/null
|| { echo "FAIL - $?"; break; }; done ) &
[3] 4088
[root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint > /dev/null
|| { echo "FAIL - $?"; break; }; done ) &
[4] 4091
[root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint 1 2 >
/dev/null || { echo "FAIL - $?"; break; }; done ) &
[5] 4094
[root@fc22-vm ptrace]# ( while :; do ./ptrace_breakpoint 1 2 >
/dev/null || { echo "FAIL - $?"; break; }; done ) &
[6] 4097
[8] 4103
[root@fc22-vm ptrace]# 0087: exit - 5
0131: exited, status=1
0126: wait: No child processes
FAIL - 3

I tried to execute the reproducer on the host (where kvm VM-s are
running), but the bug was not triggered during one hour.

When I executed the reproducer in VM without stopping processes on the
host, I found that a bug is triggered much faster in this case.

[root@fc22-vm ptrace]# ./ptrace_breakpoint 1

stop 24675
cont
child2 1
stop 24676
cont
child2 1
child2 5
0088: exit - 5
stop 24677
0132: exited, status=1
cont
0127: wait: No child processes

I know that this bug can be reproduced starting with the 4.2 kernel. I
haven't test older versions of the kernel.

I tried to print drX registers after a break-point. Looks like they
are set correctly.

Maybe someone has any ideas where a problem is or how it can be investigated.

Here is a criu issue for this problem:
https://github.com/xemul/criu/issues/107

Thanks,
Andrew
#define _GNU_SOURCE /* See feature_test_macros(7) */
#include/* For SYS_xxx definitions */

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

/* Copied from the gdb header gdb/nat/x86-dregs.h */

/* Debug registers' indices.  */
#define DR_FIRSTADDR 0
#define DR_LASTADDR  3
#define DR_NADDR 4  /* The number of debug address registers.  */
#define DR_STATUS6  /* Index of debug status register (DR6).  */
#define DR_CONTROL   7  /* Index of debug control register (DR7).  */

#define DR_LOCAL_ENABLE_SHIFT   0 /* Extra shift to the local enable bit.  */
#define DR_GLOBAL_ENABLE_SHIFT  1 /* Extra shift to the global enable bit.  */
#define DR_ENABLE_SIZE  2 /* Two enable bits per debug register.  */

/* Locally enable the break/watchpoint in the I'th debug register.  */
#define X86_DR_LOCAL_ENABLE(i) (1 << (DR_LOCAL_ENABLE_SHIFT + DR_ENABLE_SIZE * (i)))


# define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#define pr_perror(fmt, ...) \
		fprintf(stderr, "%04d: " fmt ": %s\n", __LINE__, ##__VA_ARGS__, strerror(errno))
#define pr_err(fmt, ...) \
		fprintf(stderr, "%04d: " fmt "\n", __LINE__, ##__VA_ARGS__)

int ptrace_set_breakpoint(pid_t pid, void *addr)
{
int ret;

/* Set a breakpoint */
if (ptrace(PTRACE_POKEUSER, pid,
offsetof(struct user, u_debugreg[DR_FIRSTADDR]),
addr)) {
pr_perror("Unable to setup a breakpoint into %d", pid);
return -1;
}

/* Enable the breakpoint */
if (ptrace(PTRACE_POKEUSER, pid,
offsetof(struct user, u_debugreg[DR_CONTROL]),
X86_DR_LOCAL_ENABLE(DR_FIRSTADDR))) {
pr_perror("Unable to enable the breakpoint for %d", pid);
return -1;
}

ret = ptrace(PTRACE_CONT, pid, NULL, NULL);
if (ret) {
pr_perror("Unable to restart the  stopped tracee process %d", pid);
return -1;
}

return 1;
}

static long xxx = -1;
int child()
{
	printf("child %d\n", xxx);
	syscall(__NR_getppid);
	if (xxx == 5) {
		pr_err("exit - %d", xxx);
		exit(1);
	}
	if (xxx > 0)
		xxx = 5;
	return 0;
}

int child2()
{
	printf("child2 %d\n", xxx);
	syscall(__NR_getppid);
	if (xxx == 5) {
		pr_err("exit - %d", xxx);
		exit(1);
	}
	if (xxx > 0)
		xxx = 5;
	return 0;
}

int main(int argc, char **argv)
{
	pid_t pid;
	int status, i = 0;
	int (*addr)();

	if (argc > 1)
		addr = child2;
	else
		addr = child;

	pid = fork();
	

Re: WARNING: static_key_slow_dec used before call to jump_label_init

2015-09-24 Thread Andrey Wagin
2015-09-24 9:57 GMT+03:00 Andrey Wagin :
> Hello,
>
> I booted kernel with cgroup_disable=cpu and get this warning:
>
> [0.00] Kernel command line:
> BOOT_IMAGE=/boot/vmlinuz-4.3.0-rc2-next-20150923 no_timer_check
> console=ttyS0,115200n8 root=UUID=01bc7316-b1f4-45c9-a23a-0

[0.00] Kernel command line:
BOOT_IMAGE=/boot/vmlinuz-4.3.0-rc2-next-20150923 no_timer_check
console=ttyS0,115200n8 root=UUID=01bc7316-b1f4-45c9-a23a-00c5a2336ef2
console=tty1 ro rhgb quiet LANG=en_US.UTF-8
initrd=/boot/initramfs-4.3.0-rc2-next-20150923.img debug
cgroup_disable=cpu
You have new mail in /var/spool/mail/root

> [0.00] [ cut here ]
> [0.00] WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:99
> static_key_slow_dec+0x44/0x60()
> [0.00] static_key_slow_dec used before call to jump_label_init
> [0.00] Modules linked in:
>
> [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted
> 4.3.0-rc2-next-20150924 #1
> [0.00] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> [0.00]   ebd8af5cfcdffb28 81c03da8
> 813b18c2
> [0.00]  81c03df0 81c03de0 8108dd52
> 81c5bad0
> [0.00]  88003ffe663a 81c49a00 0001
> 88003ffe662b
> [0.00] Call Trace:
> [0.00]  [] dump_stack+0x44/0x62
> [0.00]  [] warn_slowpath_common+0x82/0xc0
> [0.00]  [] warn_slowpath_fmt+0x5c/0x80
> [0.00]  [] static_key_slow_dec+0x44/0x60
> [0.00]  [] cgroup_disable+0xaf/0xd6
> [0.00]  [] unknown_bootoption+0x8c/0x194
> [0.00]  [] parse_args+0x273/0x4a0
> [0.00]  [] ? printk+0x57/0x73
> [0.00]  [] start_kernel+0x205/0x4b8
> [0.00]  [] ? set_init_arg+0x55/0x55
> [0.00]  [] ? early_idt_handler_array+0x120/0x120
> [0.00]  [] x86_64_start_reservations+0x2a/0x2c
> [0.00]  [] x86_64_start_kernel+0x14a/0x16d
> [0.00] ---[ end trace 8a774ae3ff020690 ]---
>
> Thanks,
> Andrew
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


WARNING: static_key_slow_dec used before call to jump_label_init

2015-09-24 Thread Andrey Wagin
Hello,

I booted kernel with cgroup_disable=cpu and get this warning:

[0.00] Kernel command line:
BOOT_IMAGE=/boot/vmlinuz-4.3.0-rc2-next-20150923 no_timer_check
console=ttyS0,115200n8 root=UUID=01bc7316-b1f4-45c9-a23a-0
[0.00] [ cut here ]
[0.00] WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:99
static_key_slow_dec+0x44/0x60()
[0.00] static_key_slow_dec used before call to jump_label_init
[0.00] Modules linked in:

[0.00] CPU: 0 PID: 0 Comm: swapper Not tainted
4.3.0-rc2-next-20150924 #1
[0.00] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[0.00]   ebd8af5cfcdffb28 81c03da8
813b18c2
[0.00]  81c03df0 81c03de0 8108dd52
81c5bad0
[0.00]  88003ffe663a 81c49a00 0001
88003ffe662b
[0.00] Call Trace:
[0.00]  [] dump_stack+0x44/0x62
[0.00]  [] warn_slowpath_common+0x82/0xc0
[0.00]  [] warn_slowpath_fmt+0x5c/0x80
[0.00]  [] static_key_slow_dec+0x44/0x60
[0.00]  [] cgroup_disable+0xaf/0xd6
[0.00]  [] unknown_bootoption+0x8c/0x194
[0.00]  [] parse_args+0x273/0x4a0
[0.00]  [] ? printk+0x57/0x73
[0.00]  [] start_kernel+0x205/0x4b8
[0.00]  [] ? set_init_arg+0x55/0x55
[0.00]  [] ? early_idt_handler_array+0x120/0x120
[0.00]  [] x86_64_start_reservations+0x2a/0x2c
[0.00]  [] x86_64_start_kernel+0x14a/0x16d
[0.00] ---[ end trace 8a774ae3ff020690 ]---

Thanks,
Andrew
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


WARNING: static_key_slow_dec used before call to jump_label_init

2015-09-24 Thread Andrey Wagin
Hello,

I booted kernel with cgroup_disable=cpu and get this warning:

[0.00] Kernel command line:
BOOT_IMAGE=/boot/vmlinuz-4.3.0-rc2-next-20150923 no_timer_check
console=ttyS0,115200n8 root=UUID=01bc7316-b1f4-45c9-a23a-0
[0.00] [ cut here ]
[0.00] WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:99
static_key_slow_dec+0x44/0x60()
[0.00] static_key_slow_dec used before call to jump_label_init
[0.00] Modules linked in:

[0.00] CPU: 0 PID: 0 Comm: swapper Not tainted
4.3.0-rc2-next-20150924 #1
[0.00] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[0.00]   ebd8af5cfcdffb28 81c03da8
813b18c2
[0.00]  81c03df0 81c03de0 8108dd52
81c5bad0
[0.00]  88003ffe663a 81c49a00 0001
88003ffe662b
[0.00] Call Trace:
[0.00]  [] dump_stack+0x44/0x62
[0.00]  [] warn_slowpath_common+0x82/0xc0
[0.00]  [] warn_slowpath_fmt+0x5c/0x80
[0.00]  [] static_key_slow_dec+0x44/0x60
[0.00]  [] cgroup_disable+0xaf/0xd6
[0.00]  [] unknown_bootoption+0x8c/0x194
[0.00]  [] parse_args+0x273/0x4a0
[0.00]  [] ? printk+0x57/0x73
[0.00]  [] start_kernel+0x205/0x4b8
[0.00]  [] ? set_init_arg+0x55/0x55
[0.00]  [] ? early_idt_handler_array+0x120/0x120
[0.00]  [] x86_64_start_reservations+0x2a/0x2c
[0.00]  [] x86_64_start_kernel+0x14a/0x16d
[0.00] ---[ end trace 8a774ae3ff020690 ]---

Thanks,
Andrew
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: WARNING: static_key_slow_dec used before call to jump_label_init

2015-09-24 Thread Andrey Wagin
2015-09-24 9:57 GMT+03:00 Andrey Wagin <ava...@gmail.com>:
> Hello,
>
> I booted kernel with cgroup_disable=cpu and get this warning:
>
> [0.00] Kernel command line:
> BOOT_IMAGE=/boot/vmlinuz-4.3.0-rc2-next-20150923 no_timer_check
> console=ttyS0,115200n8 root=UUID=01bc7316-b1f4-45c9-a23a-0

[0.00] Kernel command line:
BOOT_IMAGE=/boot/vmlinuz-4.3.0-rc2-next-20150923 no_timer_check
console=ttyS0,115200n8 root=UUID=01bc7316-b1f4-45c9-a23a-00c5a2336ef2
console=tty1 ro rhgb quiet LANG=en_US.UTF-8
initrd=/boot/initramfs-4.3.0-rc2-next-20150923.img debug
cgroup_disable=cpu
You have new mail in /var/spool/mail/root

> [0.00] [ cut here ]
> [0.00] WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:99
> static_key_slow_dec+0x44/0x60()
> [0.00] static_key_slow_dec used before call to jump_label_init
> [0.00] Modules linked in:
>
> [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted
> 4.3.0-rc2-next-20150924 #1
> [0.00] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> [0.00]   ebd8af5cfcdffb28 81c03da8
> 813b18c2
> [0.00]  81c03df0 81c03de0 8108dd52
> 81c5bad0
> [0.00]  88003ffe663a 81c49a00 0001
> 88003ffe662b
> [0.00] Call Trace:
> [0.00]  [] dump_stack+0x44/0x62
> [0.00]  [] warn_slowpath_common+0x82/0xc0
> [0.00]  [] warn_slowpath_fmt+0x5c/0x80
> [0.00]  [] static_key_slow_dec+0x44/0x60
> [0.00]  [] cgroup_disable+0xaf/0xd6
> [0.00]  [] unknown_bootoption+0x8c/0x194
> [0.00]  [] parse_args+0x273/0x4a0
> [0.00]  [] ? printk+0x57/0x73
> [0.00]  [] start_kernel+0x205/0x4b8
> [0.00]  [] ? set_init_arg+0x55/0x55
> [0.00]  [] ? early_idt_handler_array+0x120/0x120
> [0.00]  [] x86_64_start_reservations+0x2a/0x2c
> [0.00]  [] x86_64_start_kernel+0x14a/0x16d
> [0.00] ---[ end trace 8a774ae3ff020690 ]---
>
> Thanks,
> Andrew
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] inotify: hide internal kernel bits from fdinfo

2015-09-21 Thread Andrey Wagin
2015-09-21 21:45 GMT+03:00 Dave Hansen :
>
> From: Dave Hansen 
>
> There was a report that my patch:
>
> inotify: actually check for invalid bits in sys_inotify_add_watch()
>
> broke CRIU.
>
> The reason is that CRIU looks up raw flags in /proc/$pid/fdinfo/*
> to figure out how to rebuild inotify watches and then passes those
> flags directly back in to the inotify API.  One of those flags
> (FS_EVENT_ON_CHILD) is set in mark->mask, but is not part of the
> inotify API.  It is used inside the kernel to _implement_ inotify
> but it is not and has never been part of the API.
>
> My patch above ensured that we only allow bits which are part of
> the API (IN_ALL_EVENTS).  This broke CRIU.
>
> FS_EVENT_ON_CHILD is really internal to the kernel.  It is set
> _anyway_ on all inotify marks.  So, CRIU was really just trying
> to set a bit that was already set.
>
> This patch hides that bit from fdinfo.  CRIU will not see the
> bit, not try to set it, and should work as before.  We should not
> have been exposing this bit in the first place, so this is a good
> patch independent of the CRIU problem.
>
> Signed-off-by: Dave Hansen 
> Reported-by: Andrey Wagin 

Acked-by: Andrey Vagin 

Thanks,
Andrey

> Cc: Andrew Morton 
> Cc: Cyrill Gorcunov 
> Cc: xe...@parallels.com
> Cc: Eric Paris 
> Cc: j...@johnmccutchan.com
> Cc: rl...@rlove.org
> Cc: linux-kernel@vger.kernel.org
> ---
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] inotify: actually check for invalid bits in sys_inotify_add_watch()

2015-09-21 Thread Andrey Wagin
2015-09-10 23:49 GMT+03:00 Andrew Morton :
> On Wed, 9 Sep 2015 16:32:37 -0700 Dave Hansen  wrote:
>
>> On 09/09/2015 04:16 PM, Eric Paris wrote:
>> > Looks fine to me. And usually akpm picks them up these days.
>>
>> Is that an Acked-by? :)
>
> I grabbed it for 4.3.

This patch breaks CRIU. When we are dumping an inotify file
descriptors, we read event masks from /proc/self/fdinfo.

[root@fc22-vm criu]# cat /proc/1065/fdinfo/3
pos: 0
flags: 04000
mnt_id: 11
inotify wd:1 ino:101d5c sdev:fc2 mask:800a708 ignored_mask:0
fhandle-bytes:8 fhandle-type:1 f_handle:5c1d10003996cda2

Here the mask contains the 0x800 (FS_EVENT_ON_CHILD) flag, which
is set for all inotify and dnotify watchers. On restore, we call
inotify_add_watch() with this mask, and it fails with this patch:

1375  inotify_add_watch(3, "/proc/self/fd/5",
IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_UNMOUNT|IN_IGNORED|0x800)
= -1 EINVAL (Invalid argument)

I am not sure that we have to save backward compatibility here. Maybe
we should not show FS_EVENT_ON_CHILD in fdinfo.

>
> I removed your cc:stable.  I don't see anything in here which warrants
> a backport.  If there *is* a reason for backporting then your
> changelogological skills are sorely wanting!
>
>
> The changelog is pretty sucky really.  What are the reasons for this
> change, apart from "do what the comment said"?  What's the benefit?
>
> And the code comment sucks.  "don't allow invalid bits": well duh.  And
> "we don't want flags set" is also useless: it doesn't explain *why* we
> don't want those flags set.
>
> And given that there is potential to break existing userspace, we need
> some decent reasons for making this change.
>
>
>
>
> From: Dave Hansen 
> Subject: inotify: actually check for invalid bits in sys_inotify_add_watch()
>
> The comment here says that it is checking for invalid bits.  But, the mask
> is *actually* checking to ensure that _any_ valid bit is set, which is
> quite different.
>
> Add the actual check which was intended.  Retain the existing check
> because it actually does something useful: ensure that some inotify bits
> are being added to the watch.  Plus, this is existing behavior which would
> be nice to preserve.
>
> I did a quick sniff test that inotify functions and that my
> 'inotify-tools' package passes 'make check'.
>
> Signed-off-by: Dave Hansen 
> Cc: John McCutchan 
> Cc: Robert Love 
> Cc: Eric Paris 
> Signed-off-by: Andrew Morton 
> ---
>
>  fs/notify/inotify/inotify_user.c |3 +++
>  1 file changed, 3 insertions(+)
>
> diff -puN 
> fs/notify/inotify/inotify_user.c~inotify-actually-check-for-invalid-bits-in-sys_inotify_add_watch
>  fs/notify/inotify/inotify_user.c
> --- 
> a/fs/notify/inotify/inotify_user.c~inotify-actually-check-for-invalid-bits-in-sys_inotify_add_watch
> +++ a/fs/notify/inotify/inotify_user.c
> @@ -707,6 +707,9 @@ SYSCALL_DEFINE3(inotify_add_watch, int,
> unsigned flags = 0;
>
> /* don't allow invalid bits: we don't want flags set */
> +   if (unlikely(mask & ~ALL_INOTIFY_BITS))
> +   return -EINVAL;
> +   /* require at least one valid bit set in the mask */
> if (unlikely(!(mask & ALL_INOTIFY_BITS)))
> return -EINVAL;
>
> _
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] inotify: actually check for invalid bits in sys_inotify_add_watch()

2015-09-21 Thread Andrey Wagin
2015-09-10 23:49 GMT+03:00 Andrew Morton :
> On Wed, 9 Sep 2015 16:32:37 -0700 Dave Hansen  wrote:
>
>> On 09/09/2015 04:16 PM, Eric Paris wrote:
>> > Looks fine to me. And usually akpm picks them up these days.
>>
>> Is that an Acked-by? :)
>
> I grabbed it for 4.3.

This patch breaks CRIU. When we are dumping an inotify file
descriptors, we read event masks from /proc/self/fdinfo.

[root@fc22-vm criu]# cat /proc/1065/fdinfo/3
pos: 0
flags: 04000
mnt_id: 11
inotify wd:1 ino:101d5c sdev:fc2 mask:800a708 ignored_mask:0
fhandle-bytes:8 fhandle-type:1 f_handle:5c1d10003996cda2

Here the mask contains the 0x800 (FS_EVENT_ON_CHILD) flag, which
is set for all inotify and dnotify watchers. On restore, we call
inotify_add_watch() with this mask, and it fails with this patch:

1375  inotify_add_watch(3, "/proc/self/fd/5",
IN_CLOSE_WRITE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_UNMOUNT|IN_IGNORED|0x800)
= -1 EINVAL (Invalid argument)

I am not sure that we have to save backward compatibility here. Maybe
we should not show FS_EVENT_ON_CHILD in fdinfo.

>
> I removed your cc:stable.  I don't see anything in here which warrants
> a backport.  If there *is* a reason for backporting then your
> changelogological skills are sorely wanting!
>
>
> The changelog is pretty sucky really.  What are the reasons for this
> change, apart from "do what the comment said"?  What's the benefit?
>
> And the code comment sucks.  "don't allow invalid bits": well duh.  And
> "we don't want flags set" is also useless: it doesn't explain *why* we
> don't want those flags set.
>
> And given that there is potential to break existing userspace, we need
> some decent reasons for making this change.
>
>
>
>
> From: Dave Hansen 
> Subject: inotify: actually check for invalid bits in sys_inotify_add_watch()
>
> The comment here says that it is checking for invalid bits.  But, the mask
> is *actually* checking to ensure that _any_ valid bit is set, which is
> quite different.
>
> Add the actual check which was intended.  Retain the existing check
> because it actually does something useful: ensure that some inotify bits
> are being added to the watch.  Plus, this is existing behavior which would
> be nice to preserve.
>
> I did a quick sniff test that inotify functions and that my
> 'inotify-tools' package passes 'make check'.
>
> Signed-off-by: Dave Hansen 
> Cc: John McCutchan 
> Cc: Robert Love 
> Cc: Eric Paris 
> Signed-off-by: Andrew Morton 
> ---
>
>  fs/notify/inotify/inotify_user.c |3 +++
>  1 file changed, 3 insertions(+)
>
> diff -puN 
> fs/notify/inotify/inotify_user.c~inotify-actually-check-for-invalid-bits-in-sys_inotify_add_watch
>  fs/notify/inotify/inotify_user.c
> --- 
> a/fs/notify/inotify/inotify_user.c~inotify-actually-check-for-invalid-bits-in-sys_inotify_add_watch
> +++ a/fs/notify/inotify/inotify_user.c
> @@ -707,6 +707,9 @@ SYSCALL_DEFINE3(inotify_add_watch, int,
> unsigned flags = 0;
>
> /* don't allow invalid bits: we don't want flags set */
> +   if (unlikely(mask & ~ALL_INOTIFY_BITS))
> +   return -EINVAL;
> +   /* require at least one valid bit set in the mask */
> if (unlikely(!(mask & ALL_INOTIFY_BITS)))
> return -EINVAL;
>
> _
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] inotify: hide internal kernel bits from fdinfo

2015-09-21 Thread Andrey Wagin
2015-09-21 21:45 GMT+03:00 Dave Hansen <d...@sr71.net>:
>
> From: Dave Hansen <dave.han...@linux.intel.com>
>
> There was a report that my patch:
>
> inotify: actually check for invalid bits in sys_inotify_add_watch()
>
> broke CRIU.
>
> The reason is that CRIU looks up raw flags in /proc/$pid/fdinfo/*
> to figure out how to rebuild inotify watches and then passes those
> flags directly back in to the inotify API.  One of those flags
> (FS_EVENT_ON_CHILD) is set in mark->mask, but is not part of the
> inotify API.  It is used inside the kernel to _implement_ inotify
> but it is not and has never been part of the API.
>
> My patch above ensured that we only allow bits which are part of
> the API (IN_ALL_EVENTS).  This broke CRIU.
>
> FS_EVENT_ON_CHILD is really internal to the kernel.  It is set
> _anyway_ on all inotify marks.  So, CRIU was really just trying
> to set a bit that was already set.
>
> This patch hides that bit from fdinfo.  CRIU will not see the
> bit, not try to set it, and should work as before.  We should not
> have been exposing this bit in the first place, so this is a good
> patch independent of the CRIU problem.
>
> Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
> Reported-by: Andrey Wagin <ava...@gmail.com>

Acked-by: Andrey Vagin <ava...@openvz.org>

Thanks,
Andrey

> Cc: Andrew Morton <a...@linux-foundation.org>
> Cc: Cyrill Gorcunov <gorcu...@openvz.org>
> Cc: xe...@parallels.com
> Cc: Eric Paris <epa...@redhat.com>
> Cc: j...@johnmccutchan.com
> Cc: rl...@rlove.org
> Cc: linux-kernel@vger.kernel.org
> ---
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()

2015-06-22 Thread Andrey Wagin
2015-06-19 23:26 GMT+03:00 Andrey Wagin :
> 2015-06-19 19:24 GMT+03:00 Jens Axboe :
>> On 06/17/2015 03:55 PM, Andrey Wagin wrote:
>>>
>>> Hi All,
>>>
>>> I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met
>>> this bug. Maybe it will be interested for someone to look at it.
>>
>>
>> This should fix it:
>>
>> http://git.kernel.dk/cgit/linux-block/commit/?h=for-4.2/core=9470e4a693db84bee7becbba8de01af02bb23c9f
>
> Hi Jens,
>
> Thank you for the path. I think we need to fix __cfq_set_weight and
> __cfq_set_weight_device too.

I've seen that you fixed these functions too. But CRIU tests still
fail, because they tries to restore a value of blkio.weight and get
EINVAL. It works fine on the upstream kernel.

For me the suggested interface looks weird. What if a device which
uses cfq isn't permanent. If I detach the device, cgroup configuration
will be destroyed and then if I attach the device again, I will need
to apply cgroup parameters again.

Thanks,
Andrew
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()

2015-06-22 Thread Andrey Wagin
2015-06-19 23:26 GMT+03:00 Andrey Wagin ava...@gmail.com:
 2015-06-19 19:24 GMT+03:00 Jens Axboe ax...@kernel.dk:
 On 06/17/2015 03:55 PM, Andrey Wagin wrote:

 Hi All,

 I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met
 this bug. Maybe it will be interested for someone to look at it.


 This should fix it:

 http://git.kernel.dk/cgit/linux-block/commit/?h=for-4.2/coreid=9470e4a693db84bee7becbba8de01af02bb23c9f

 Hi Jens,

 Thank you for the path. I think we need to fix __cfq_set_weight and
 __cfq_set_weight_device too.

I've seen that you fixed these functions too. But CRIU tests still
fail, because they tries to restore a value of blkio.weight and get
EINVAL. It works fine on the upstream kernel.

For me the suggested interface looks weird. What if a device which
uses cfq isn't permanent. If I detach the device, cgroup configuration
will be destroyed and then if I attach the device again, I will need
to apply cgroup parameters again.

Thanks,
Andrew
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()

2015-06-19 Thread Andrey Wagin
2015-06-19 19:24 GMT+03:00 Jens Axboe :
> On 06/17/2015 03:55 PM, Andrey Wagin wrote:
>>
>> Hi All,
>>
>> I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met
>> this bug. Maybe it will be interested for someone to look at it.
>
>
> This should fix it:
>
> http://git.kernel.dk/cgit/linux-block/commit/?h=for-4.2/core=9470e4a693db84bee7becbba8de01af02bb23c9f

Hi Jens,

Thank you for the path. I think we need to fix __cfq_set_weight and
__cfq_set_weight_device too.

>
> --
> Jens Axboe
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()

2015-06-19 Thread Andrey Wagin
2015-06-19 19:24 GMT+03:00 Jens Axboe ax...@kernel.dk:
 On 06/17/2015 03:55 PM, Andrey Wagin wrote:

 Hi All,

 I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met
 this bug. Maybe it will be interested for someone to look at it.


 This should fix it:

 http://git.kernel.dk/cgit/linux-block/commit/?h=for-4.2/coreid=9470e4a693db84bee7becbba8de01af02bb23c9f

Hi Jens,

Thank you for the path. I think we need to fix __cfq_set_weight and
__cfq_set_weight_device too.


 --
 Jens Axboe

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()

2015-06-18 Thread Andrey Wagin
2015-06-18 0:55 GMT+03:00 Andrey Wagin :
> Hi All,
>
> I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met
> this bug. Maybe it will be interested for someone to look at it.

This bug isn't reproduced if one of devices uses the cfq scheduler:

[root@avagin-fc19-cr ~]# cat /sys/block/sda/queue/scheduler
noop deadline [cfq]
[root@avagin-fc19-cr ~]# cat /sys/fs/cgroup/blkio/blkio.leaf_weight
1000

[root@avagin-fc19-cr ~]# echo noop > /sys/block/sda/queue/scheduler
[root@avagin-fc19-cr ~]# cat /sys/block/sda/queue/scheduler
[noop] deadline cfq
[root@avagin-fc19-cr ~]# cat /sys/fs/cgroup/blkio/blkio.leaf_weight
Killed

blkcg_to_cfqgd(blkcg) returns NULL.

>
> [   30.130897] BUG: unable to handle kernel NULL pointer dereference
> at 001c
> [   30.130970] IP: [] cfq_print_leaf_weight+0x31/0x50
> [   30.131013] PGD 3b77a067 PUD 39883067 PMD 0
> [   30.131013] Oops:  [#1] SMP
> [   30.131013] Modules linked in: tun netlink_diag af_packet_diag
> udp_diag tcp_diag inet_diag unix_diag ppdev crct10dif_pclmul
> crc32_pclmul crc32c_intel ghash_clmulni_intel virtio_balloon serio_raw
> i2c_piix4 parport_pc parport virtio_blk virtio_net cirrus
> drm_kms_helper ttm drm virtio_pci virtio_ring virtio ata_generic
> pata_acpi
> [   30.131013] CPU: 0 PID: 613 Comm: criu Not tainted 4.1.0-rc8-next-20150617 
> #1
> [   30.131013] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> [   30.131013] task: 880039eb ti: 880039db task.ti:
> 880039db
> [   30.131013] RIP: 0010:[]  []
> cfq_print_leaf_weight+0x31/0x50
> [   30.131013] RSP: 0018:880039db3d08  EFLAGS: 00010286
> [   30.131013] RAX:  RBX: 88003a524200 RCX: 
> 0001
> [   30.131013] RDX: 0001 RSI: 0001 RDI: 
> 88003a524800
> [   30.131013] RBP: 880039db3d18 R08: 0001 R09: 
> 
> [   30.131013] R10: 0080 R11:  R12: 
> 81cba6c8
> [   30.131013] R13: 0001 R14: 880039db3f18 R15: 
> 88003a524200
> [   30.131013] FS:  7f2b6c495740() GS:88003d20()
> knlGS:
> [   30.131013] CS:  0010 DS:  ES:  CR0: 80050033
> [   30.131013] CR2: 001c CR3: 3de3c000 CR4: 
> 000407f0
> [   30.131013] Stack:
> [   30.131013]  880039db3d28 88003a524200 880039db3d48
> 8113c9d3
> [   30.131013]  880039db3d58  880039773380
> 0001
> [   30.131013]  880039db3d58 812b9df6 880039db3dd8
> 81257b7d
> [   30.131013] Call Trace:
> [   30.131013]  [] cgroup_seqfile_show+0x43/0xc0
> [   30.131013]  [] kernfs_seq_show+0x26/0x30
> [   30.131013]  [] seq_read+0x10d/0x3c0
> [   30.131013]  [] kernfs_fop_read+0x129/0x180
> [   30.131013]  [] __vfs_read+0x37/0x100
> [   30.131013]  [] ? security_file_permission+0xa3/0xc0
> [   30.131013]  [] ? rw_verify_area+0x56/0xe0
> [   30.131013]  [] vfs_read+0x86/0x130
> [   30.131013]  [] SyS_read+0x58/0xd0
> [   30.131013]  [] entry_SYSCALL_64_fastpath+0x12/0x76
> [   30.131013] Code: 48 89 e5 53 48 89 fb 48 83 ec 08 48 8b bf d8 00
> 00 00 e8 93 fe d7 ff 48 85 c0 74 0f 48 63 15 27 f0 8f 00 48 8b 84 d0
> 28 01 00 00 <8b> 50 1c 48 89 df 31 c0 48 c7 c6 f8 f8 ad 81 e8 1b b9 e9
> ff 48
> [   30.131013] RIP  [] cfq_print_leaf_weight+0x31/0x50
> [   30.131013]  RSP 
> [   30.131013] CR2: 001c
> [   30.142367] ---[ end trace bad3e020b932bbb1 ]---
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()

2015-06-18 Thread Andrey Wagin
2015-06-18 0:55 GMT+03:00 Andrey Wagin ava...@gmail.com:
 Hi All,

 I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met
 this bug. Maybe it will be interested for someone to look at it.

This bug isn't reproduced if one of devices uses the cfq scheduler:

[root@avagin-fc19-cr ~]# cat /sys/block/sda/queue/scheduler
noop deadline [cfq]
[root@avagin-fc19-cr ~]# cat /sys/fs/cgroup/blkio/blkio.leaf_weight
1000

[root@avagin-fc19-cr ~]# echo noop  /sys/block/sda/queue/scheduler
[root@avagin-fc19-cr ~]# cat /sys/block/sda/queue/scheduler
[noop] deadline cfq
[root@avagin-fc19-cr ~]# cat /sys/fs/cgroup/blkio/blkio.leaf_weight
Killed

blkcg_to_cfqgd(blkcg) returns NULL.


 [   30.130897] BUG: unable to handle kernel NULL pointer dereference
 at 001c
 [   30.130970] IP: [813bc7e1] cfq_print_leaf_weight+0x31/0x50
 [   30.131013] PGD 3b77a067 PUD 39883067 PMD 0
 [   30.131013] Oops:  [#1] SMP
 [   30.131013] Modules linked in: tun netlink_diag af_packet_diag
 udp_diag tcp_diag inet_diag unix_diag ppdev crct10dif_pclmul
 crc32_pclmul crc32c_intel ghash_clmulni_intel virtio_balloon serio_raw
 i2c_piix4 parport_pc parport virtio_blk virtio_net cirrus
 drm_kms_helper ttm drm virtio_pci virtio_ring virtio ata_generic
 pata_acpi
 [   30.131013] CPU: 0 PID: 613 Comm: criu Not tainted 4.1.0-rc8-next-20150617 
 #1
 [   30.131013] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
 [   30.131013] task: 880039eb ti: 880039db task.ti:
 880039db
 [   30.131013] RIP: 0010:[813bc7e1]  [813bc7e1]
 cfq_print_leaf_weight+0x31/0x50
 [   30.131013] RSP: 0018:880039db3d08  EFLAGS: 00010286
 [   30.131013] RAX:  RBX: 88003a524200 RCX: 
 0001
 [   30.131013] RDX: 0001 RSI: 0001 RDI: 
 88003a524800
 [   30.131013] RBP: 880039db3d18 R08: 0001 R09: 
 
 [   30.131013] R10: 0080 R11:  R12: 
 81cba6c8
 [   30.131013] R13: 0001 R14: 880039db3f18 R15: 
 88003a524200
 [   30.131013] FS:  7f2b6c495740() GS:88003d20()
 knlGS:
 [   30.131013] CS:  0010 DS:  ES:  CR0: 80050033
 [   30.131013] CR2: 001c CR3: 3de3c000 CR4: 
 000407f0
 [   30.131013] Stack:
 [   30.131013]  880039db3d28 88003a524200 880039db3d48
 8113c9d3
 [   30.131013]  880039db3d58  880039773380
 0001
 [   30.131013]  880039db3d58 812b9df6 880039db3dd8
 81257b7d
 [   30.131013] Call Trace:
 [   30.131013]  [8113c9d3] cgroup_seqfile_show+0x43/0xc0
 [   30.131013]  [812b9df6] kernfs_seq_show+0x26/0x30
 [   30.131013]  [81257b7d] seq_read+0x10d/0x3c0
 [   30.131013]  [812ba7c9] kernfs_fop_read+0x129/0x180
 [   30.131013]  [8122f9d7] __vfs_read+0x37/0x100
 [   30.131013]  [81349db3] ? security_file_permission+0xa3/0xc0
 [   30.131013]  [8122ff86] ? rw_verify_area+0x56/0xe0
 [   30.131013]  [81230096] vfs_read+0x86/0x130
 [   30.131013]  [81230f28] SyS_read+0x58/0xd0
 [   30.131013]  [817c386e] entry_SYSCALL_64_fastpath+0x12/0x76
 [   30.131013] Code: 48 89 e5 53 48 89 fb 48 83 ec 08 48 8b bf d8 00
 00 00 e8 93 fe d7 ff 48 85 c0 74 0f 48 63 15 27 f0 8f 00 48 8b 84 d0
 28 01 00 00 8b 50 1c 48 89 df 31 c0 48 c7 c6 f8 f8 ad 81 e8 1b b9 e9
 ff 48
 [   30.131013] RIP  [813bc7e1] cfq_print_leaf_weight+0x31/0x50
 [   30.131013]  RSP 880039db3d08
 [   30.131013] CR2: 001c
 [   30.142367] ---[ end trace bad3e020b932bbb1 ]---
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()

2015-06-17 Thread Andrey Wagin
Hi All,

I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met
this bug. Maybe it will be interested for someone to look at it.

[   30.130897] BUG: unable to handle kernel NULL pointer dereference
at 001c
[   30.130970] IP: [] cfq_print_leaf_weight+0x31/0x50
[   30.131013] PGD 3b77a067 PUD 39883067 PMD 0
[   30.131013] Oops:  [#1] SMP
[   30.131013] Modules linked in: tun netlink_diag af_packet_diag
udp_diag tcp_diag inet_diag unix_diag ppdev crct10dif_pclmul
crc32_pclmul crc32c_intel ghash_clmulni_intel virtio_balloon serio_raw
i2c_piix4 parport_pc parport virtio_blk virtio_net cirrus
drm_kms_helper ttm drm virtio_pci virtio_ring virtio ata_generic
pata_acpi
[   30.131013] CPU: 0 PID: 613 Comm: criu Not tainted 4.1.0-rc8-next-20150617 #1
[   30.131013] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[   30.131013] task: 880039eb ti: 880039db task.ti:
880039db
[   30.131013] RIP: 0010:[]  []
cfq_print_leaf_weight+0x31/0x50
[   30.131013] RSP: 0018:880039db3d08  EFLAGS: 00010286
[   30.131013] RAX:  RBX: 88003a524200 RCX: 0001
[   30.131013] RDX: 0001 RSI: 0001 RDI: 88003a524800
[   30.131013] RBP: 880039db3d18 R08: 0001 R09: 
[   30.131013] R10: 0080 R11:  R12: 81cba6c8
[   30.131013] R13: 0001 R14: 880039db3f18 R15: 88003a524200
[   30.131013] FS:  7f2b6c495740() GS:88003d20()
knlGS:
[   30.131013] CS:  0010 DS:  ES:  CR0: 80050033
[   30.131013] CR2: 001c CR3: 3de3c000 CR4: 000407f0
[   30.131013] Stack:
[   30.131013]  880039db3d28 88003a524200 880039db3d48
8113c9d3
[   30.131013]  880039db3d58  880039773380
0001
[   30.131013]  880039db3d58 812b9df6 880039db3dd8
81257b7d
[   30.131013] Call Trace:
[   30.131013]  [] cgroup_seqfile_show+0x43/0xc0
[   30.131013]  [] kernfs_seq_show+0x26/0x30
[   30.131013]  [] seq_read+0x10d/0x3c0
[   30.131013]  [] kernfs_fop_read+0x129/0x180
[   30.131013]  [] __vfs_read+0x37/0x100
[   30.131013]  [] ? security_file_permission+0xa3/0xc0
[   30.131013]  [] ? rw_verify_area+0x56/0xe0
[   30.131013]  [] vfs_read+0x86/0x130
[   30.131013]  [] SyS_read+0x58/0xd0
[   30.131013]  [] entry_SYSCALL_64_fastpath+0x12/0x76
[   30.131013] Code: 48 89 e5 53 48 89 fb 48 83 ec 08 48 8b bf d8 00
00 00 e8 93 fe d7 ff 48 85 c0 74 0f 48 63 15 27 f0 8f 00 48 8b 84 d0
28 01 00 00 <8b> 50 1c 48 89 df 31 c0 48 c7 c6 f8 f8 ad 81 e8 1b b9 e9
ff 48
[   30.131013] RIP  [] cfq_print_leaf_weight+0x31/0x50
[   30.131013]  RSP 
[   30.131013] CR2: 001c
[   30.142367] ---[ end trace bad3e020b932bbb1 ]---
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


linux-next: BUG: unable to handle kernel NULL pointer dereference in cfq_print_leaf_weight()

2015-06-17 Thread Andrey Wagin
Hi All,

I executed CRIU tests on the 4.1.0-rc8-next-20150617 kernel and met
this bug. Maybe it will be interested for someone to look at it.

[   30.130897] BUG: unable to handle kernel NULL pointer dereference
at 001c
[   30.130970] IP: [813bc7e1] cfq_print_leaf_weight+0x31/0x50
[   30.131013] PGD 3b77a067 PUD 39883067 PMD 0
[   30.131013] Oops:  [#1] SMP
[   30.131013] Modules linked in: tun netlink_diag af_packet_diag
udp_diag tcp_diag inet_diag unix_diag ppdev crct10dif_pclmul
crc32_pclmul crc32c_intel ghash_clmulni_intel virtio_balloon serio_raw
i2c_piix4 parport_pc parport virtio_blk virtio_net cirrus
drm_kms_helper ttm drm virtio_pci virtio_ring virtio ata_generic
pata_acpi
[   30.131013] CPU: 0 PID: 613 Comm: criu Not tainted 4.1.0-rc8-next-20150617 #1
[   30.131013] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[   30.131013] task: 880039eb ti: 880039db task.ti:
880039db
[   30.131013] RIP: 0010:[813bc7e1]  [813bc7e1]
cfq_print_leaf_weight+0x31/0x50
[   30.131013] RSP: 0018:880039db3d08  EFLAGS: 00010286
[   30.131013] RAX:  RBX: 88003a524200 RCX: 0001
[   30.131013] RDX: 0001 RSI: 0001 RDI: 88003a524800
[   30.131013] RBP: 880039db3d18 R08: 0001 R09: 
[   30.131013] R10: 0080 R11:  R12: 81cba6c8
[   30.131013] R13: 0001 R14: 880039db3f18 R15: 88003a524200
[   30.131013] FS:  7f2b6c495740() GS:88003d20()
knlGS:
[   30.131013] CS:  0010 DS:  ES:  CR0: 80050033
[   30.131013] CR2: 001c CR3: 3de3c000 CR4: 000407f0
[   30.131013] Stack:
[   30.131013]  880039db3d28 88003a524200 880039db3d48
8113c9d3
[   30.131013]  880039db3d58  880039773380
0001
[   30.131013]  880039db3d58 812b9df6 880039db3dd8
81257b7d
[   30.131013] Call Trace:
[   30.131013]  [8113c9d3] cgroup_seqfile_show+0x43/0xc0
[   30.131013]  [812b9df6] kernfs_seq_show+0x26/0x30
[   30.131013]  [81257b7d] seq_read+0x10d/0x3c0
[   30.131013]  [812ba7c9] kernfs_fop_read+0x129/0x180
[   30.131013]  [8122f9d7] __vfs_read+0x37/0x100
[   30.131013]  [81349db3] ? security_file_permission+0xa3/0xc0
[   30.131013]  [8122ff86] ? rw_verify_area+0x56/0xe0
[   30.131013]  [81230096] vfs_read+0x86/0x130
[   30.131013]  [81230f28] SyS_read+0x58/0xd0
[   30.131013]  [817c386e] entry_SYSCALL_64_fastpath+0x12/0x76
[   30.131013] Code: 48 89 e5 53 48 89 fb 48 83 ec 08 48 8b bf d8 00
00 00 e8 93 fe d7 ff 48 85 c0 74 0f 48 63 15 27 f0 8f 00 48 8b 84 d0
28 01 00 00 8b 50 1c 48 89 df 31 c0 48 c7 c6 f8 f8 ad 81 e8 1b b9 e9
ff 48
[   30.131013] RIP  [813bc7e1] cfq_print_leaf_weight+0x31/0x50
[   30.131013]  RSP 880039db3d08
[   30.131013] CR2: 001c
[   30.142367] ---[ end trace bad3e020b932bbb1 ]---
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] seccomp: add ptrace commands for suspend/resume

2015-06-02 Thread Andrey Wagin
2015-06-01 22:28 GMT+03:00 Tycho Andersen :
> This patch is the first step in enabling checkpoint/restore of processes
> with seccomp enabled.
>
> One of the things CRIU does while dumping tasks is inject code into them
> via ptrace to collect information that is only available to the process
> itself. However, if we are in a seccomp mode where these processes are
> prohibited from making these syscalls, then what CRIU does kills the task.
>
> This patch adds a new ptrace command, PTRACE_SUSPEND_SECCOMP that enables a
> task from the init user namespace which has CAP_SYS_ADMIN to disable (and
> re-enable) seccomp filters for another task so that they can be
> successfully dumped (and restored).

Do we need to re-enable seccomp if a tracer detaches unexpectedly.
CRIU can be killed and we should try to not affect the task state even
in this case.

Thanks,
Andrew
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] seccomp: add ptrace commands for suspend/resume

2015-06-02 Thread Andrey Wagin
2015-06-01 22:28 GMT+03:00 Tycho Andersen tycho.ander...@canonical.com:
 This patch is the first step in enabling checkpoint/restore of processes
 with seccomp enabled.

 One of the things CRIU does while dumping tasks is inject code into them
 via ptrace to collect information that is only available to the process
 itself. However, if we are in a seccomp mode where these processes are
 prohibited from making these syscalls, then what CRIU does kills the task.

 This patch adds a new ptrace command, PTRACE_SUSPEND_SECCOMP that enables a
 task from the init user namespace which has CAP_SYS_ADMIN to disable (and
 re-enable) seccomp filters for another task so that they can be
 successfully dumped (and restored).

Do we need to re-enable seccomp if a tracer detaches unexpectedly.
CRIU can be killed and we should try to not affect the task state even
in this case.

Thanks,
Andrew
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] fbcon: Avoid deleting a timer in IRQ context

2015-05-27 Thread Andrey Wagin
2015-05-21 10:58 GMT+03:00 Thierry Reding :
> From: Thierry Reding 
>
> Commit 27a4c827c34a ("fbcon: use the cursor blink interval provided by
> vt") unconditionally removes the cursor blink timer. Unfortunately that
> wreaks havoc under some circumstances. An easily reproducible way is to
> use both the framebuffer console and a debug serial port as the console
> output for kernel messages (e.g. "console=ttyS0 console=tty1" on the
> kernel command-line. Upon boot this triggers a warning from within the
> del_timer_sync() function because it is called from IRQ context:
>
> [5.070096] [ cut here ]
> [5.070110] WARNING: CPU: 0 PID: 0 at ../kernel/time/timer.c:1098 
> del_timer_sync+0x4c/0x54()
> [5.070115] Modules linked in:
> [5.070120] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 
> 4.1.0-rc4-next-20150519 #1
> [5.070123] Hardware name: SAMSUNG EXYNOS (Flattened Device Tree)
> [5.070142] [] (unwind_backtrace) from [] (show_stack+0x10/0x14)
> [5.070156] [] (show_stack) from [] (dump_stack+0x70/0xbc)
> [5.070164] [] (dump_stack) from [] 
> (warn_slowpath_common+0x74/0xb0)
> [5.070169] [] (warn_slowpath_common) from [] 
> (warn_slowpath_null+0x1c/0x24)
> [5.070174] [] (warn_slowpath_null) from [] 
> (del_timer_sync+0x4c/0x54)
> [5.070183] [] (del_timer_sync) from [] 
> (fbcon_del_cursor_timer+0x2c/0x40)
> [5.070190] [] (fbcon_del_cursor_timer) from [] 
> (fbcon_cursor+0x9c/0x180)
> [5.070198] [] (fbcon_cursor) from [] (hide_cursor+0x30/0x98)
> [5.070204] [] (hide_cursor) from [] (vt_console_print+0x2a8/0x340)
> [5.070212] [] (vt_console_print) from [] 
> (call_console_drivers.constprop.23+0xc8/0xec)
> [5.070218] [] (call_console_drivers.constprop.23) from [] 
> (console_unlock+0x498/0x4f0)
> [5.070223] [] (console_unlock) from [] (vprintk_emit+0x1f0/0x508)
> [5.070228] [] (vprintk_emit) from [] (vprintk_default+0x24/0x2c)
> [5.070234] [] (vprintk_default) from [] (printk+0x70/0x88)
>
> After which the system starts spewing all kinds of weird and seemingly
> unrelated error messages.
>
> This commit fixes this by restoring the condition under which the call
> to fbcon_del_cursor_timer() happens.

Tested-by: Andrew Vagin 

It would be good to push this patch into linux-next asap. Without this
patch I can't execute tests on linux-next. Thanks.

>
> Reported-by: Daniel Stone 
> Reported-by: Kevin Hilman 
> Tested-by: Kevin Hilman 
> Tested-by: Scot Doyle 
> Signed-off-by: Thierry Reding 
> ---
>  drivers/video/console/fbcon.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
> index 05b1d1a71ef9..658c34bb9076 100644
> --- a/drivers/video/console/fbcon.c
> +++ b/drivers/video/console/fbcon.c
> @@ -1310,8 +1310,9 @@ static void fbcon_cursor(struct vc_data *vc, int mode)
> return;
>
> ops->cur_blink_jiffies = msecs_to_jiffies(vc->vc_cur_blink_ms);
> -   fbcon_del_cursor_timer(info);
> -   if (!(vc->vc_cursor_type & 0x10))
> +   if (vc->vc_cursor_type & 0x10)
> +   fbcon_del_cursor_timer(info);
> +   else
> fbcon_add_cursor_timer(info);
>
> ops->cursor_flash = (mode == CM_ERASE) ? 0 : 1;
> --
> 2.4.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] fbcon: use the cursor blink interval provided by vt

2015-05-27 Thread Andrey Wagin
2015-05-27 10:52 GMT+03:00 Scot Doyle :
> On Wed, 27 May 2015, Andrey Wagin wrote:
> ...
>> I regularly execute criu tests on linux-next. For this, I use virtual
>> machine from the digitalocean clould. The current version of
>> linux-next hangs after a few seconds. I use git bisect to find the
>> commit where the problem is appeaed. And it looks like the problem is
>> in this patch.
> ...
>> Thanks,
>> Andrew
>
> Perhaps this pending patch will help:
> http://marc.info/?l=linux-kernel=143219509918969=2

Yes, it helps. Thanks.
>
> Thanks,
> Scot
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] fbcon: use the cursor blink interval provided by vt

2015-05-27 Thread Andrey Wagin
2015-05-27 10:52 GMT+03:00 Scot Doyle lkm...@scotdoyle.com:
 On Wed, 27 May 2015, Andrey Wagin wrote:
 ...
 I regularly execute criu tests on linux-next. For this, I use virtual
 machine from the digitalocean clould. The current version of
 linux-next hangs after a few seconds. I use git bisect to find the
 commit where the problem is appeaed. And it looks like the problem is
 in this patch.
 ...
 Thanks,
 Andrew

 Perhaps this pending patch will help:
 http://marc.info/?l=linux-kernelm=143219509918969w=2

Yes, it helps. Thanks.

 Thanks,
 Scot

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] fbcon: Avoid deleting a timer in IRQ context

2015-05-27 Thread Andrey Wagin
2015-05-21 10:58 GMT+03:00 Thierry Reding thierry.red...@gmail.com:
 From: Thierry Reding tred...@nvidia.com

 Commit 27a4c827c34a (fbcon: use the cursor blink interval provided by
 vt) unconditionally removes the cursor blink timer. Unfortunately that
 wreaks havoc under some circumstances. An easily reproducible way is to
 use both the framebuffer console and a debug serial port as the console
 output for kernel messages (e.g. console=ttyS0 console=tty1 on the
 kernel command-line. Upon boot this triggers a warning from within the
 del_timer_sync() function because it is called from IRQ context:

 [5.070096] [ cut here ]
 [5.070110] WARNING: CPU: 0 PID: 0 at ../kernel/time/timer.c:1098 
 del_timer_sync+0x4c/0x54()
 [5.070115] Modules linked in:
 [5.070120] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 
 4.1.0-rc4-next-20150519 #1
 [5.070123] Hardware name: SAMSUNG EXYNOS (Flattened Device Tree)
 [5.070142] [] (unwind_backtrace) from [] (show_stack+0x10/0x14)
 [5.070156] [] (show_stack) from [] (dump_stack+0x70/0xbc)
 [5.070164] [] (dump_stack) from [] 
 (warn_slowpath_common+0x74/0xb0)
 [5.070169] [] (warn_slowpath_common) from [] 
 (warn_slowpath_null+0x1c/0x24)
 [5.070174] [] (warn_slowpath_null) from [] 
 (del_timer_sync+0x4c/0x54)
 [5.070183] [] (del_timer_sync) from [] 
 (fbcon_del_cursor_timer+0x2c/0x40)
 [5.070190] [] (fbcon_del_cursor_timer) from [] 
 (fbcon_cursor+0x9c/0x180)
 [5.070198] [] (fbcon_cursor) from [] (hide_cursor+0x30/0x98)
 [5.070204] [] (hide_cursor) from [] (vt_console_print+0x2a8/0x340)
 [5.070212] [] (vt_console_print) from [] 
 (call_console_drivers.constprop.23+0xc8/0xec)
 [5.070218] [] (call_console_drivers.constprop.23) from [] 
 (console_unlock+0x498/0x4f0)
 [5.070223] [] (console_unlock) from [] (vprintk_emit+0x1f0/0x508)
 [5.070228] [] (vprintk_emit) from [] (vprintk_default+0x24/0x2c)
 [5.070234] [] (vprintk_default) from [] (printk+0x70/0x88)

 After which the system starts spewing all kinds of weird and seemingly
 unrelated error messages.

 This commit fixes this by restoring the condition under which the call
 to fbcon_del_cursor_timer() happens.

Tested-by: Andrew Vagin ava...@virtuozzo.com

It would be good to push this patch into linux-next asap. Without this
patch I can't execute tests on linux-next. Thanks.


 Reported-by: Daniel Stone dan...@fooishbar.org
 Reported-by: Kevin Hilman khil...@kernel.org
 Tested-by: Kevin Hilman khil...@linaro.org
 Tested-by: Scot Doyle lkm...@scotdoyle.com
 Signed-off-by: Thierry Reding tred...@nvidia.com
 ---
  drivers/video/console/fbcon.c | 5 +++--
  1 file changed, 3 insertions(+), 2 deletions(-)

 diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
 index 05b1d1a71ef9..658c34bb9076 100644
 --- a/drivers/video/console/fbcon.c
 +++ b/drivers/video/console/fbcon.c
 @@ -1310,8 +1310,9 @@ static void fbcon_cursor(struct vc_data *vc, int mode)
 return;

 ops-cur_blink_jiffies = msecs_to_jiffies(vc-vc_cur_blink_ms);
 -   fbcon_del_cursor_timer(info);
 -   if (!(vc-vc_cursor_type  0x10))
 +   if (vc-vc_cursor_type  0x10)
 +   fbcon_del_cursor_timer(info);
 +   else
 fbcon_add_cursor_timer(info);

 ops-cursor_flash = (mode == CM_ERASE) ? 0 : 1;
 --
 2.4.1

 --
 To unsubscribe from this list: send the line unsubscribe linux-kernel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] fbcon: use the cursor blink interval provided by vt

2015-05-26 Thread Andrey Wagin
2015-02-27 22:15 GMT+03:00 Scot Doyle :
> vt now provides a cursor blink interval via vc_data. Use this
> interval instead of the currently hardcoded 200 msecs. Store it in
> fbcon_ops to avoid locking the console in cursor_timer_handler().

I regularly execute criu tests on linux-next. For this, I use virtual
machine from the digitalocean clould. The current version of
linux-next hangs after a few seconds. I use git bisect to find the
commit where the problem is appeaed. And it looks like the problem is
in this patch.

When the kernel hangs, it doesn't report anything on the screen and
there is nothing suspicious in logs after reboot.

I will try to reproduce the problem in my local enviroment to get more
information.

There is my config file:
https://github.com/avagin/criu-jenkins-digitalocean/blob/d95d9e30a7da8755c47b290630bac7ee1fe7132d/jenkins-scripts/config

Thanks,
Andrew
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] fbcon: use the cursor blink interval provided by vt

2015-05-26 Thread Andrey Wagin
2015-02-27 22:15 GMT+03:00 Scot Doyle lkm...@scotdoyle.com:
 vt now provides a cursor blink interval via vc_data. Use this
 interval instead of the currently hardcoded 200 msecs. Store it in
 fbcon_ops to avoid locking the console in cursor_timer_handler().

I regularly execute criu tests on linux-next. For this, I use virtual
machine from the digitalocean clould. The current version of
linux-next hangs after a few seconds. I use git bisect to find the
commit where the problem is appeaed. And it looks like the problem is
in this patch.

When the kernel hangs, it doesn't report anything on the screen and
there is nothing suspicious in logs after reboot.

I will try to reproduce the problem in my local enviroment to get more
information.

There is my config file:
https://github.com/avagin/criu-jenkins-digitalocean/blob/d95d9e30a7da8755c47b290630bac7ee1fe7132d/jenkins-scripts/config

Thanks,
Andrew
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] fs: show locked and lock_ro options in mountinfo

2015-03-31 Thread Andrey Wagin
2015-03-28 1:47 GMT+03:00 Richard Weinberger :
> Hi!
>
> Am 27.03.2015 um 23:35 schrieb Andrey Wagin:
>> 2015-03-28 0:42 GMT+03:00 Richard Weinberger :
>>> On Fri, Mar 27, 2015 at 9:39 PM, Andrey Vagin  wrote:
>>>> I don't see any reasons to hide them. This information can help to
>>>> understand errors.
>>>
>>> Because these flags are set/read only internally by the VFS. In contrast
>>> to the other flags shown by mountinfo MNT_LOCKED is not a mount option.
>>
>> But this flag is set as a result of the specified user action, when he
>> unshares userns and mntns. This flag affects visiable behaviour.
>
> It is a implicit result. Used by the VFS internally.
> If you expose it it becomes ABI and changing the behavior will be
> tricky or impossible.
>
>>>
>>> Why does it help to debug errors?
>>> How would a user know that mount() with MS_BIND returns EINVAL because
>>> the mount source is MNT_LOCKED? This information is useless for her.
>>
>> If I see lock_ro, I can be sure that mount -o remount,bind,rw /XXX will fail.
>> If I see locked, I  know that this mount can't be umounted or moved
>> and can be bind-mounted only recursively.
>>
>> If a user see these flags, he can check that a mount namespace is
>> configured correctly without security issues.
>>
>> Sorry but I don't understand why you think that this information is
>> useless for users.
>
> You can only know if you know how the VFS works internally.
> If know that EINVAL from mount(2) with MS_BIND can be caused by MNT_LOCKED
> because I know the source. I bet you know the source too. But not Joey random
> admin who looks into mountinfo to figure out why something does not work.
>
> If you expose MNT_LOCKED to userspace you'll have to update also the mount(2)
> manpage with all glory details of that flag.
>
>>> If you argue like that you'd have to expose the whole VFS state to userland.
>>
>> I have not noticed other MNT_LOCK_* flags. I should think more about
>> what information are a really required for dumping mount namespaces.
>>
>>>
>>>> And this information is required for correct checkpoint/restore of mount
>>>> namespaces.
>>>
>>> Why especially MNT_LOCKED and not all the other flags used by VFS?
>>
>> My goal is to dump enough information about a mount namespace to be
>> able to restore it back later. I don't know how to do this without
>> knowledge about locked mounts. I will think.
>
> How do you plan to restore a MNT_LOCKED mount?
> IIRC we have currently no way to directly set MNT_LOCKED from userspace.

It's the second question. The first question is how to check that we
will be able to restore what we are dumping.

If CRIU meets something what it doesn't know how to restore, it (must)
refuses to dump this configuration.

>
>>> Say MNT_DOOMED?
>>
>> Mounts with MNT_DOOMED are never shown in mountinfo, are they?
>
> It was just an example. There are other flags too, did you double check
> which ones you really need?
>
> To make the story short, my concern is that exposing such flags to userspace
> has to be well thought. :-)
> As long they are just internal we can change them as we like, as soon 
> userspace
> depends somehow on it the pain begins.

I'm agree with you. I'm going to think more about this. Thank you for response.

>
> Thanks,
> //richard
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] fs: show locked and lock_ro options in mountinfo

2015-03-31 Thread Andrey Wagin
2015-03-28 1:47 GMT+03:00 Richard Weinberger rich...@nod.at:
 Hi!

 Am 27.03.2015 um 23:35 schrieb Andrey Wagin:
 2015-03-28 0:42 GMT+03:00 Richard Weinberger richard.weinber...@gmail.com:
 On Fri, Mar 27, 2015 at 9:39 PM, Andrey Vagin ava...@openvz.org wrote:
 I don't see any reasons to hide them. This information can help to
 understand errors.

 Because these flags are set/read only internally by the VFS. In contrast
 to the other flags shown by mountinfo MNT_LOCKED is not a mount option.

 But this flag is set as a result of the specified user action, when he
 unshares userns and mntns. This flag affects visiable behaviour.

 It is a implicit result. Used by the VFS internally.
 If you expose it it becomes ABI and changing the behavior will be
 tricky or impossible.


 Why does it help to debug errors?
 How would a user know that mount() with MS_BIND returns EINVAL because
 the mount source is MNT_LOCKED? This information is useless for her.

 If I see lock_ro, I can be sure that mount -o remount,bind,rw /XXX will fail.
 If I see locked, I  know that this mount can't be umounted or moved
 and can be bind-mounted only recursively.

 If a user see these flags, he can check that a mount namespace is
 configured correctly without security issues.

 Sorry but I don't understand why you think that this information is
 useless for users.

 You can only know if you know how the VFS works internally.
 If know that EINVAL from mount(2) with MS_BIND can be caused by MNT_LOCKED
 because I know the source. I bet you know the source too. But not Joey random
 admin who looks into mountinfo to figure out why something does not work.

 If you expose MNT_LOCKED to userspace you'll have to update also the mount(2)
 manpage with all glory details of that flag.

 If you argue like that you'd have to expose the whole VFS state to userland.

 I have not noticed other MNT_LOCK_* flags. I should think more about
 what information are a really required for dumping mount namespaces.


 And this information is required for correct checkpoint/restore of mount
 namespaces.

 Why especially MNT_LOCKED and not all the other flags used by VFS?

 My goal is to dump enough information about a mount namespace to be
 able to restore it back later. I don't know how to do this without
 knowledge about locked mounts. I will think.

 How do you plan to restore a MNT_LOCKED mount?
 IIRC we have currently no way to directly set MNT_LOCKED from userspace.

It's the second question. The first question is how to check that we
will be able to restore what we are dumping.

If CRIU meets something what it doesn't know how to restore, it (must)
refuses to dump this configuration.


 Say MNT_DOOMED?

 Mounts with MNT_DOOMED are never shown in mountinfo, are they?

 It was just an example. There are other flags too, did you double check
 which ones you really need?

 To make the story short, my concern is that exposing such flags to userspace
 has to be well thought. :-)
 As long they are just internal we can change them as we like, as soon 
 userspace
 depends somehow on it the pain begins.

I'm agree with you. I'm going to think more about this. Thank you for response.


 Thanks,
 //richard
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] fs: show locked and lock_ro options in mountinfo

2015-03-27 Thread Andrey Wagin
2015-03-28 0:42 GMT+03:00 Richard Weinberger :
> On Fri, Mar 27, 2015 at 9:39 PM, Andrey Vagin  wrote:
>> I don't see any reasons to hide them. This information can help to
>> understand errors.
>
> Because these flags are set/read only internally by the VFS. In contrast
> to the other flags shown by mountinfo MNT_LOCKED is not a mount option.

But this flag is set as a result of the specified user action, when he
unshares userns and mntns. This flag affects visiable behaviour.

>
> Why does it help to debug errors?
> How would a user know that mount() with MS_BIND returns EINVAL because
> the mount source is MNT_LOCKED? This information is useless for her.

If I see lock_ro, I can be sure that mount -o remount,bind,rw /XXX will fail.
If I see locked, I  know that this mount can't be umounted or moved
and can be bind-mounted only recursively.

If a user see these flags, he can check that a mount namespace is
configured correctly without security issues.

Sorry but I don't understand why you think that this information is
useless for users.

> If you argue like that you'd have to expose the whole VFS state to userland.

I have not noticed other MNT_LOCK_* flags. I should think more about
what information are a really required for dumping mount namespaces.

>
>> And this information is required for correct checkpoint/restore of mount
>> namespaces.
>
> Why especially MNT_LOCKED and not all the other flags used by VFS?

My goal is to dump enough information about a mount namespace to be
able to restore it back later. I don't know how to do this without
knowledge about locked mounts. I will think.

> Say MNT_DOOMED?

Mounts with MNT_DOOMED are never shown in mountinfo, are they?

Thank you for looking at this patch.


>
> --
> Thanks,
> //richard
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] fs: show locked and lock_ro options in mountinfo

2015-03-27 Thread Andrey Wagin
2015-03-28 0:42 GMT+03:00 Richard Weinberger richard.weinber...@gmail.com:
 On Fri, Mar 27, 2015 at 9:39 PM, Andrey Vagin ava...@openvz.org wrote:
 I don't see any reasons to hide them. This information can help to
 understand errors.

 Because these flags are set/read only internally by the VFS. In contrast
 to the other flags shown by mountinfo MNT_LOCKED is not a mount option.

But this flag is set as a result of the specified user action, when he
unshares userns and mntns. This flag affects visiable behaviour.


 Why does it help to debug errors?
 How would a user know that mount() with MS_BIND returns EINVAL because
 the mount source is MNT_LOCKED? This information is useless for her.

If I see lock_ro, I can be sure that mount -o remount,bind,rw /XXX will fail.
If I see locked, I  know that this mount can't be umounted or moved
and can be bind-mounted only recursively.

If a user see these flags, he can check that a mount namespace is
configured correctly without security issues.

Sorry but I don't understand why you think that this information is
useless for users.

 If you argue like that you'd have to expose the whole VFS state to userland.

I have not noticed other MNT_LOCK_* flags. I should think more about
what information are a really required for dumping mount namespaces.


 And this information is required for correct checkpoint/restore of mount
 namespaces.

 Why especially MNT_LOCKED and not all the other flags used by VFS?

My goal is to dump enough information about a mount namespace to be
able to restore it back later. I don't know how to do this without
knowledge about locked mounts. I will think.

 Say MNT_DOOMED?

Mounts with MNT_DOOMED are never shown in mountinfo, are they?

Thank you for looking at this patch.



 --
 Thanks,
 //richard
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 1/2] x86_64,signal: Fix SS handling for signals delivered to 64-bit programs

2015-03-18 Thread Andrey Wagin
2015-03-18 21:50 GMT+03:00 Cyrill Gorcunov :
> On Wed, Mar 18, 2015 at 07:31:33PM +0100, Oleg Nesterov wrote:
>> On 03/18, Cyrill Gorcunov wrote:
>> >
>> > On Wed, Mar 18, 2015 at 06:48:43PM +0100, Oleg Nesterov wrote:
>> > >
>> > > Shot in a dark afer a quick grep: restore_gpregs() should initialize 
>> > > ->ss?
>> >
>> > It hasn't been needed earlier, if this would help it means abi is broken, 
>> > no? :)
>> > Otherwise I don't understand what's happening.
>>
>> until this commit the kernel simply forgot to restore ->ss in sigreturn().
>>
>> after this commit, if your rt_sigframe has garbage in ->ss then SIGSEGV
>> is clear.
>
> OK, so setting up a proper ss here should fix problem.
>
>> > >
>> > > Seriously, where is UserX86RegsEntry?
>> >
>> > It's in protobif/core-x86.proto, welcome to protobuf hell.
>>
>> Still can't find UserX86RegsEntry... OK, perhaps it comes from
>> user_x86_regs_entry in protobuf/core-x86.proto. Then I guess the patch I sent
>> can be compiled at least ;)
>
> Yeah, protobuf-c mangles "_" in message name so user_x86_regs_entry -> 
> UserX86RegsEntry.
> Andrew should be able to test it tomorrow (hopefully it's no urgent right? ;)

This patch fixes the problem. Oleg, could you send this path in the
criu maillist?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 1/2] x86_64,signal: Fix SS handling for signals delivered to 64-bit programs

2015-03-18 Thread Andrey Wagin
2015-03-12 23:57 GMT+03:00 Andy Lutomirski :
> From: Andy Lutomirski 
>
> The comment in the signal code says that apps can save/restore other
> segments on their own.  It's true that apps can *save* SS on their
> own, but there's no way for apps to restore it: SYSCALL effectively
> resets SS to __USER_DS, so any value that user code tries to load
> into SS gets lost on entry to sigreturn.
>
> This recycles two padding bytes in the segment selector area for SS.
>
> While we're at it, we need a second change to make this useful.  If
> the signal we're delivering is caused by a bad SS value, saving that
> value isn't enough.  We need to remove that bad value from the regs
> before we try to deliver the signal.  Oddly, x32 already got this
> right.
>
> I suspect that 64-bit programs that try to run 16-bit code and use
> signals will have a lot of trouble without this.

This commit breaks CRIU. I don't have any details yet. I'm going to
investigate this issue and provide more details tomorrow.

[root@avagin-fc19-cr criu]# setsid sleep 1000 &
[1] 1225
[root@avagin-fc19-cr criu]# ps -C sleep
  PID TTY  TIME CMD
 1226 ?00:00:00 sleep
[root@avagin-fc19-cr criu]# ./criu dump -t 1226 -D dump --shell-job
[root@avagin-fc19-cr criu]# ./criu restore -D dump --shell-job
Error (parasite-syscall.c:923): Task is in unexpected state: b7f (SIGSEGV)


>
> Signed-off-by: Andy Lutomirski 
> ---
>  arch/x86/include/asm/sigcontext.h  |  2 +-
>  arch/x86/include/uapi/asm/sigcontext.h |  2 +-
>  arch/x86/kernel/signal.c   | 22 +-
>  3 files changed, 15 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/include/asm/sigcontext.h 
> b/arch/x86/include/asm/sigcontext.h
> index 9dfce4e0417d..f910cdcb71fd 100644
> --- a/arch/x86/include/asm/sigcontext.h
> +++ b/arch/x86/include/asm/sigcontext.h
> @@ -59,7 +59,7 @@ struct sigcontext {
> unsigned short cs;
> unsigned short gs;
> unsigned short fs;
> -   unsigned short __pad0;
> +   unsigned short ss;
> unsigned long err;
> unsigned long trapno;
> unsigned long oldmask;
> diff --git a/arch/x86/include/uapi/asm/sigcontext.h 
> b/arch/x86/include/uapi/asm/sigcontext.h
> index d8b9f9081e86..076b11fd6fa1 100644
> --- a/arch/x86/include/uapi/asm/sigcontext.h
> +++ b/arch/x86/include/uapi/asm/sigcontext.h
> @@ -179,7 +179,7 @@ struct sigcontext {
> __u16 cs;
> __u16 gs;
> __u16 fs;
> -   __u16 __pad0;
> +   __u16 ss;
> __u64 err;
> __u64 trapno;
> __u64 oldmask;
> diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
> index ed37a768d0fc..9511eb7f17b0 100644
> --- a/arch/x86/kernel/signal.c
> +++ b/arch/x86/kernel/signal.c
> @@ -94,15 +94,8 @@ int restore_sigcontext(struct pt_regs *regs, struct 
> sigcontext __user *sc,
> COPY(r15);
>  #endif /* CONFIG_X86_64 */
>
> -#ifdef CONFIG_X86_32
> COPY_SEG_CPL3(cs);
> COPY_SEG_CPL3(ss);
> -#else /* !CONFIG_X86_32 */
> -   /* Kernel saves and restores only the CS segment register on 
> signals,
> -* which is the bare minimum needed to allow mixed 32/64-bit 
> code.
> -* App's signal handler can save/restore other segments if 
> needed. */
> -   COPY_SEG_CPL3(cs);
> -#endif /* CONFIG_X86_32 */
>
> get_user_ex(tmpflags, >flags);
> regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & 
> FIX_EFLAGS);
> @@ -164,6 +157,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void 
> __user *fpstate,
> put_user_ex(regs->cs, >cs);
> put_user_ex(0, >gs);
> put_user_ex(0, >fs);
> +   put_user_ex(regs->ss, >ss);
>  #endif /* CONFIG_X86_32 */
>
> put_user_ex(fpstate, >fpstate);
> @@ -457,9 +451,19 @@ static int __setup_rt_frame(int sig, struct ksignal 
> *ksig,
>
> regs->sp = (unsigned long)frame;
>
> -   /* Set up the CS register to run signal handlers in 64-bit mode,
> -  even if the handler happens to be interrupting 32-bit code. */
> +   /*
> +* Set up the CS and SS registers to run signal handlers in
> +* 64-bit mode, even if the handler happens to be interrupting
> +* 32-bit or 16-bit code.
> +*
> +* SS is subtle.  In 64-bit mode, we don't need any particular
> +* SS descriptor, but we do need SS to be valid.  It's possible
> +* that the old SS is entirely bogus -- this can happen if the
> +* signal we're trying to deliver is #GP or #SS caused by a bad
> +* SS value.
> +*/
> regs->cs = __USER_CS;
> +   regs->ss = __USER_DS;
>
> return 0;
>  }
> --
> 2.3.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  

Re: [PATCH v3 1/2] x86_64,signal: Fix SS handling for signals delivered to 64-bit programs

2015-03-18 Thread Andrey Wagin
2015-03-12 23:57 GMT+03:00 Andy Lutomirski l...@kernel.org:
 From: Andy Lutomirski l...@amacapital.net

 The comment in the signal code says that apps can save/restore other
 segments on their own.  It's true that apps can *save* SS on their
 own, but there's no way for apps to restore it: SYSCALL effectively
 resets SS to __USER_DS, so any value that user code tries to load
 into SS gets lost on entry to sigreturn.

 This recycles two padding bytes in the segment selector area for SS.

 While we're at it, we need a second change to make this useful.  If
 the signal we're delivering is caused by a bad SS value, saving that
 value isn't enough.  We need to remove that bad value from the regs
 before we try to deliver the signal.  Oddly, x32 already got this
 right.

 I suspect that 64-bit programs that try to run 16-bit code and use
 signals will have a lot of trouble without this.

This commit breaks CRIU. I don't have any details yet. I'm going to
investigate this issue and provide more details tomorrow.

[root@avagin-fc19-cr criu]# setsid sleep 1000 
[1] 1225
[root@avagin-fc19-cr criu]# ps -C sleep
  PID TTY  TIME CMD
 1226 ?00:00:00 sleep
[root@avagin-fc19-cr criu]# ./criu dump -t 1226 -D dump --shell-job
[root@avagin-fc19-cr criu]# ./criu restore -D dump --shell-job
Error (parasite-syscall.c:923): Task is in unexpected state: b7f (SIGSEGV)



 Signed-off-by: Andy Lutomirski l...@amacapital.net
 ---
  arch/x86/include/asm/sigcontext.h  |  2 +-
  arch/x86/include/uapi/asm/sigcontext.h |  2 +-
  arch/x86/kernel/signal.c   | 22 +-
  3 files changed, 15 insertions(+), 11 deletions(-)

 diff --git a/arch/x86/include/asm/sigcontext.h 
 b/arch/x86/include/asm/sigcontext.h
 index 9dfce4e0417d..f910cdcb71fd 100644
 --- a/arch/x86/include/asm/sigcontext.h
 +++ b/arch/x86/include/asm/sigcontext.h
 @@ -59,7 +59,7 @@ struct sigcontext {
 unsigned short cs;
 unsigned short gs;
 unsigned short fs;
 -   unsigned short __pad0;
 +   unsigned short ss;
 unsigned long err;
 unsigned long trapno;
 unsigned long oldmask;
 diff --git a/arch/x86/include/uapi/asm/sigcontext.h 
 b/arch/x86/include/uapi/asm/sigcontext.h
 index d8b9f9081e86..076b11fd6fa1 100644
 --- a/arch/x86/include/uapi/asm/sigcontext.h
 +++ b/arch/x86/include/uapi/asm/sigcontext.h
 @@ -179,7 +179,7 @@ struct sigcontext {
 __u16 cs;
 __u16 gs;
 __u16 fs;
 -   __u16 __pad0;
 +   __u16 ss;
 __u64 err;
 __u64 trapno;
 __u64 oldmask;
 diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
 index ed37a768d0fc..9511eb7f17b0 100644
 --- a/arch/x86/kernel/signal.c
 +++ b/arch/x86/kernel/signal.c
 @@ -94,15 +94,8 @@ int restore_sigcontext(struct pt_regs *regs, struct 
 sigcontext __user *sc,
 COPY(r15);
  #endif /* CONFIG_X86_64 */

 -#ifdef CONFIG_X86_32
 COPY_SEG_CPL3(cs);
 COPY_SEG_CPL3(ss);
 -#else /* !CONFIG_X86_32 */
 -   /* Kernel saves and restores only the CS segment register on 
 signals,
 -* which is the bare minimum needed to allow mixed 32/64-bit 
 code.
 -* App's signal handler can save/restore other segments if 
 needed. */
 -   COPY_SEG_CPL3(cs);
 -#endif /* CONFIG_X86_32 */

 get_user_ex(tmpflags, sc-flags);
 regs-flags = (regs-flags  ~FIX_EFLAGS) | (tmpflags  
 FIX_EFLAGS);
 @@ -164,6 +157,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void 
 __user *fpstate,
 put_user_ex(regs-cs, sc-cs);
 put_user_ex(0, sc-gs);
 put_user_ex(0, sc-fs);
 +   put_user_ex(regs-ss, sc-ss);
  #endif /* CONFIG_X86_32 */

 put_user_ex(fpstate, sc-fpstate);
 @@ -457,9 +451,19 @@ static int __setup_rt_frame(int sig, struct ksignal 
 *ksig,

 regs-sp = (unsigned long)frame;

 -   /* Set up the CS register to run signal handlers in 64-bit mode,
 -  even if the handler happens to be interrupting 32-bit code. */
 +   /*
 +* Set up the CS and SS registers to run signal handlers in
 +* 64-bit mode, even if the handler happens to be interrupting
 +* 32-bit or 16-bit code.
 +*
 +* SS is subtle.  In 64-bit mode, we don't need any particular
 +* SS descriptor, but we do need SS to be valid.  It's possible
 +* that the old SS is entirely bogus -- this can happen if the
 +* signal we're trying to deliver is #GP or #SS caused by a bad
 +* SS value.
 +*/
 regs-cs = __USER_CS;
 +   regs-ss = __USER_DS;

 return 0;
  }
 --
 2.3.0

 --
 To unsubscribe from this list: send the line unsubscribe linux-kernel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 Please read the FAQ at  http://www.tux.org/lkml/
--
To 

Re: [PATCH v3 1/2] x86_64,signal: Fix SS handling for signals delivered to 64-bit programs

2015-03-18 Thread Andrey Wagin
2015-03-18 21:50 GMT+03:00 Cyrill Gorcunov gorcu...@gmail.com:
 On Wed, Mar 18, 2015 at 07:31:33PM +0100, Oleg Nesterov wrote:
 On 03/18, Cyrill Gorcunov wrote:
 
  On Wed, Mar 18, 2015 at 06:48:43PM +0100, Oleg Nesterov wrote:
  
   Shot in a dark afer a quick grep: restore_gpregs() should initialize 
   -ss?
 
  It hasn't been needed earlier, if this would help it means abi is broken, 
  no? :)
  Otherwise I don't understand what's happening.

 until this commit the kernel simply forgot to restore -ss in sigreturn().

 after this commit, if your rt_sigframe has garbage in -ss then SIGSEGV
 is clear.

 OK, so setting up a proper ss here should fix problem.

  
   Seriously, where is UserX86RegsEntry?
 
  It's in protobif/core-x86.proto, welcome to protobuf hell.

 Still can't find UserX86RegsEntry... OK, perhaps it comes from
 user_x86_regs_entry in protobuf/core-x86.proto. Then I guess the patch I sent
 can be compiled at least ;)

 Yeah, protobuf-c mangles _ in message name so user_x86_regs_entry - 
 UserX86RegsEntry.
 Andrew should be able to test it tomorrow (hopefully it's no urgent right? ;)

This patch fixes the problem. Oleg, could you send this path in the
criu maillist?
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] selftests/kcmp: exit with non-zero code in a fail case

2015-03-13 Thread Andrey Wagin
2015-03-13 13:37 GMT+03:00 Michael Ellerman :
> On Fri, 2015-03-13 at 12:27 +0300, Andrey Vagin wrote:
>> diff --git a/tools/testing/selftests/kselftest.h 
>> b/tools/testing/selftests/kselftest.h
>> index 572c888..a0ec8b8 100644
>> --- a/tools/testing/selftests/kselftest.h
>> +++ b/tools/testing/selftests/kselftest.h
>> @@ -58,5 +58,17 @@ static inline int ksft_exit_skip(void)
>>  {
>>   exit(4);
>>  }
>> +static inline int ksft_exit(void)
>> +{
>> + if (ksft_cnt.ksft_fail)
>> + return ksft_exit_fail();
>> + if (ksft_cnt.ksft_xpass)
>> + return ksft_exit_xpass();
>> + if (ksft_cnt.ksft_xskip)
>> + return ksft_exit_skip();
>> + if (ksft_cnt.ksft_xfail)
>> + return ksft_exit_xfail();
>> + ksft_exit_pass();
>> +}
>
> This function claims to return 'int', but doesn't. So do all the others.

I agree with this. I haven't seen any warning, because exit() never
returns back.

>
> It could be as simple as:

I think it isn't a good idea. In my version we can be sure that
ksft_exit_fail() and ksft_exit() return the same code in a fail case.

>
> static inline void ksft_exit(void)
> {
> if (ksft_cnt.ksft_fail)
> exit(1);
>
> if (ksft_cnt.ksft_xpass)
> exit(3);
>
> if (ksft_cnt.ksft_xskip)
> exit(4);
>
> if (ksft_cnt.ksft_xfail)
> exit(2);
>
> exit(0);
> }
>
> cheers
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] selftests/kcmp: exit with non-zero code in a fail case

2015-03-13 Thread Andrey Wagin
2015-03-13 13:37 GMT+03:00 Michael Ellerman m...@ellerman.id.au:
 On Fri, 2015-03-13 at 12:27 +0300, Andrey Vagin wrote:
 diff --git a/tools/testing/selftests/kselftest.h 
 b/tools/testing/selftests/kselftest.h
 index 572c888..a0ec8b8 100644
 --- a/tools/testing/selftests/kselftest.h
 +++ b/tools/testing/selftests/kselftest.h
 @@ -58,5 +58,17 @@ static inline int ksft_exit_skip(void)
  {
   exit(4);
  }
 +static inline int ksft_exit(void)
 +{
 + if (ksft_cnt.ksft_fail)
 + return ksft_exit_fail();
 + if (ksft_cnt.ksft_xpass)
 + return ksft_exit_xpass();
 + if (ksft_cnt.ksft_xskip)
 + return ksft_exit_skip();
 + if (ksft_cnt.ksft_xfail)
 + return ksft_exit_xfail();
 + ksft_exit_pass();
 +}

 This function claims to return 'int', but doesn't. So do all the others.

I agree with this. I haven't seen any warning, because exit() never
returns back.


 It could be as simple as:

I think it isn't a good idea. In my version we can be sure that
ksft_exit_fail() and ksft_exit() return the same code in a fail case.


 static inline void ksft_exit(void)
 {
 if (ksft_cnt.ksft_fail)
 exit(1);

 if (ksft_cnt.ksft_xpass)
 exit(3);

 if (ksft_cnt.ksft_xskip)
 exit(4);

 if (ksft_cnt.ksft_xfail)
 exit(2);

 exit(0);
 }

 cheers



--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] proc: show locks in /proc/pid/fdinfo/X

2015-03-12 Thread Andrey Wagin
2015-03-12 22:23 GMT+03:00 Andrew Morton :
> On Thu, 12 Mar 2015 18:54:42 +0300 Andrew Vagin  wrote:
>
>> v2: use seq_has_overflowed() properly
>
> --- a/fs/proc/fd.c~proc-show-locks-in-proc-pid-fdinfo-x-v2
> +++ a/fs/proc/fd.c
> @@ -57,17 +57,15 @@ static int seq_show(struct seq_file *m,
>real_mount(file->f_path.mnt)->mnt_id);
>
> show_fd_locks(m, file, files);
> -   ret = seq_has_overflowed(m);
> -   if (ret)
> +   if (seq_has_overflowed(m))
> goto out;
>
> if (file->f_op->show_fdinfo)
> file->f_op->show_fdinfo(m, file);
> -   ret = seq_has_overflowed(m);
>
>  out:
> fput(file);
> -   return ret;
> +   return 0;
>  }
>
>  static int seq_fdinfo_open(struct inode *inode, struct file *file)
>
>
> So it returns "success" when the output has overflowed?  Why this,
> rather than returning an error?

I have read fs/seq_file.c and looks like it's the right way.
seq_has_overflowed() is used to avoid useless work. If we call it or
don't call it, the result must be the same in both cases. So from this
point of view it looks logically correct too.

There are two interesting places from seq_file.c
error = m->op->show(m, p);
if (error < 0)
break;
if (unlikely(error)) {
error = 0;
m->count = 0;
}
if (seq_has_overflowed(m))
goto Eoverflow;

...
err = m->op->show(m, p);
if (err < 0)
break;
if (unlikely(err))
m->count = 0;
if (unlikely(!m->count)) {
p = m->op->next(m, p, );
m->index = pos;
continue;
}
if (m->count < m->size)
goto Fill;
m->op->stop(m, p);
kvfree(m->buf);
m->count = 0;
m->buf = seq_buf_alloc(m->size <<= 1);

Thanks,
Andrey
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] proc: show locks in /proc/pid/fdinfo/X

2015-03-12 Thread Andrey Wagin
2015-03-12 22:23 GMT+03:00 Andrew Morton a...@linux-foundation.org:
 On Thu, 12 Mar 2015 18:54:42 +0300 Andrew Vagin ava...@parallels.com wrote:

 v2: use seq_has_overflowed() properly

 --- a/fs/proc/fd.c~proc-show-locks-in-proc-pid-fdinfo-x-v2
 +++ a/fs/proc/fd.c
 @@ -57,17 +57,15 @@ static int seq_show(struct seq_file *m,
real_mount(file-f_path.mnt)-mnt_id);

 show_fd_locks(m, file, files);
 -   ret = seq_has_overflowed(m);
 -   if (ret)
 +   if (seq_has_overflowed(m))
 goto out;

 if (file-f_op-show_fdinfo)
 file-f_op-show_fdinfo(m, file);
 -   ret = seq_has_overflowed(m);

  out:
 fput(file);
 -   return ret;
 +   return 0;
  }

  static int seq_fdinfo_open(struct inode *inode, struct file *file)


 So it returns success when the output has overflowed?  Why this,
 rather than returning an error?

I have read fs/seq_file.c and looks like it's the right way.
seq_has_overflowed() is used to avoid useless work. If we call it or
don't call it, the result must be the same in both cases. So from this
point of view it looks logically correct too.

There are two interesting places from seq_file.c
error = m-op-show(m, p);
if (error  0)
break;
if (unlikely(error)) {
error = 0;
m-count = 0;
}
if (seq_has_overflowed(m))
goto Eoverflow;

...
err = m-op-show(m, p);
if (err  0)
break;
if (unlikely(err))
m-count = 0;
if (unlikely(!m-count)) {
p = m-op-next(m, p, pos);
m-index = pos;
continue;
}
if (m-count  m-size)
goto Fill;
m-op-stop(m, p);
kvfree(m-buf);
m-count = 0;
m-buf = seq_buf_alloc(m-size = 1);

Thanks,
Andrey
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/3 v3] x86: entry_64.S: always allocate complete "struct pt_regs"

2015-02-25 Thread Andrey Wagin
2015-02-25 21:42 GMT+03:00 Denys Vlasenko :
> On 02/25/2015 01:37 PM, Andrey Wagin wrote:
>> 2015-02-13 0:54 GMT+03:00 Denys Vlasenko :
>>> 64-bit code was using six stack slots less by not saving/restoring
>>> registers which are callee-preserved according to C ABI,
>>> and not allocating space for them.
>>> Only when syscall needed a complete "struct pt_regs",
>>> the complete area was allocated and filled in.
>>> As an additional twist, on interrupt entry a "slightly less truncated 
>>> pt_regs"
>>> trick is used, to make nested interrupt stacks easier to unwind.
>>>
>>> This proved to be a source of significant obfuscation and subtle bugs.
>>> For example, stub_fork had to pop the return address,
>>> extend the struct, save registers, and push return address back. Ugly.
>>> ia32_ptregs_common pops return address and "returns" via jmp insn,
>>> throwing a wrench into CPU return stack cache.
>>>
>>> This patch changes code to always allocate a complete "struct pt_regs".
>>> The saving of registers is still done lazily.
>>>
>>> "Partial pt_regs" trick on interrupt stack is retained.
>>>
>>> Macros which manipulate "struct pt_regs" on stack are reworked:
>>> ALLOC_PT_GPREGS_ON_STACK allocates the structure.
>>> SAVE_C_REGS saves to it those registers which are clobbered by C code.
>>> SAVE_EXTRA_REGS saves to it all other registers.
>>> Corresponding RESTORE_* and REMOVE_PT_GPREGS_FROM_STACK macros reverse it.
>>>
>>> ia32_ptregs_common, stub_fork and friends lost their ugly dance with
>>> return pointer.
>>>
>>> LOAD_ARGS32 in ia32entry.S now uses symbolic stack offsets
>>> instead of magic numbers.
>>>
>>> error_entry and save_paranoid now use SAVE_C_REGS + SAVE_EXTRA_REGS
>>> instead of having it open-coded yet again.
>>>
>>> Patch was run-tested: 64-bit executables, 32-bit executables,
>>> strace works.
>>> Timing tests did not show measurable difference in 32-bit
>>> and 64-bit syscalls.
>>
>> Hello Denys,
>>
>> My test vm doesn't boot with this patch. Could you help to investigate
>> this issue?
>
> I think I found it. This part of my patch is possibly wrong:
>
> @@ -171,9 +171,9 @@ static inline int arch_irqs_disabled(void)
>  #define ARCH_LOCKDEP_SYS_EXIT_IRQ  \
> TRACE_IRQS_ON; \
> sti; \
> -   SAVE_REST; \
> +   SAVE_EXTRA_REGS; \
> LOCKDEP_SYS_EXIT; \
> -   RESTORE_REST; \
> +   RESTORE_EXTRA_REGS; \
> cli; \
> TRACE_IRQS_OFF;
>
> The "SAVE_REST" here is intended to really *push* extra regs on stack,
> but the patch changed it so that they are written to existing stack
> slots above.
>
> From code inspection it should work in almost all cases, but some
> locations where it is used are really obscure.
>
> If there are places where *pushing* regs is really necessary,
> this can corrupt rbp,rbx,r12-15 registers.
>
> Your config has CONFIG_LOCKDEP=y, I think it's worth trying whether the bug
> was here.
> Please find updated patch attached. Can you try it?

It doesn't work

[2.282198] microcode: CPU0 sig=0x623, pf=0x0, revision=0x1
[2.288321] microcode: CPU1 sig=0x623, pf=0x0, revision=0x1
[2.289139] microcode: CPU2 sig=0x623, pf=0x0, revision=0x1
[2.290618] microcode: CPU3 sig=0x623, pf=0x0, revision=0x1
[2.292417] microcode: Microcode Update Driver: v2.00
, Peter Oruba
[2.392592] piix4_smbus :00:01.3: SMBus Host Controller at
0xb100, revision 0
[2.510882] systemd-fsck[349]: /dev/sda1: clean, 343/128016 files,
166911/512000 blocks
[2.521463] Adding 4128764k swap on /dev/sda2.  Priority:-1
extents:1 across:4128764k FS
[2.573597] EXT4-fs (sda1): mounted filesystem with ordered data
mode. Opts: (null)
[2.597771] systemd-journald[283]: Received request to flush
runtime journal from PID 1
[2.802288] audit: type=1305 audit(1424892361.629:3): audit_pid=369
old=0 auid=4294967295 ses=4294967295 res=1
[2.819660] traps: systemd-cgroups[380] general protection
ip:7fb227939028 sp:7fff6bac59c8 error:0 in
ld-2.18.so[7fb227921000+2]
[3.016262] traps: systemd-cgroups[390] general protection
ip:7f456f7b6028 sp:7fffdc059718 error:0 in
ld-2.18.so[7f456f79e000+2]

>
> --
> vda
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/3 v3] x86: entry_64.S: always allocate complete struct pt_regs

2015-02-25 Thread Andrey Wagin
2015-02-25 21:42 GMT+03:00 Denys Vlasenko dvlas...@redhat.com:
 On 02/25/2015 01:37 PM, Andrey Wagin wrote:
 2015-02-13 0:54 GMT+03:00 Denys Vlasenko dvlas...@redhat.com:
 64-bit code was using six stack slots less by not saving/restoring
 registers which are callee-preserved according to C ABI,
 and not allocating space for them.
 Only when syscall needed a complete struct pt_regs,
 the complete area was allocated and filled in.
 As an additional twist, on interrupt entry a slightly less truncated 
 pt_regs
 trick is used, to make nested interrupt stacks easier to unwind.

 This proved to be a source of significant obfuscation and subtle bugs.
 For example, stub_fork had to pop the return address,
 extend the struct, save registers, and push return address back. Ugly.
 ia32_ptregs_common pops return address and returns via jmp insn,
 throwing a wrench into CPU return stack cache.

 This patch changes code to always allocate a complete struct pt_regs.
 The saving of registers is still done lazily.

 Partial pt_regs trick on interrupt stack is retained.

 Macros which manipulate struct pt_regs on stack are reworked:
 ALLOC_PT_GPREGS_ON_STACK allocates the structure.
 SAVE_C_REGS saves to it those registers which are clobbered by C code.
 SAVE_EXTRA_REGS saves to it all other registers.
 Corresponding RESTORE_* and REMOVE_PT_GPREGS_FROM_STACK macros reverse it.

 ia32_ptregs_common, stub_fork and friends lost their ugly dance with
 return pointer.

 LOAD_ARGS32 in ia32entry.S now uses symbolic stack offsets
 instead of magic numbers.

 error_entry and save_paranoid now use SAVE_C_REGS + SAVE_EXTRA_REGS
 instead of having it open-coded yet again.

 Patch was run-tested: 64-bit executables, 32-bit executables,
 strace works.
 Timing tests did not show measurable difference in 32-bit
 and 64-bit syscalls.

 Hello Denys,

 My test vm doesn't boot with this patch. Could you help to investigate
 this issue?

 I think I found it. This part of my patch is possibly wrong:

 @@ -171,9 +171,9 @@ static inline int arch_irqs_disabled(void)
  #define ARCH_LOCKDEP_SYS_EXIT_IRQ  \
 TRACE_IRQS_ON; \
 sti; \
 -   SAVE_REST; \
 +   SAVE_EXTRA_REGS; \
 LOCKDEP_SYS_EXIT; \
 -   RESTORE_REST; \
 +   RESTORE_EXTRA_REGS; \
 cli; \
 TRACE_IRQS_OFF;

 The SAVE_REST here is intended to really *push* extra regs on stack,
 but the patch changed it so that they are written to existing stack
 slots above.

 From code inspection it should work in almost all cases, but some
 locations where it is used are really obscure.

 If there are places where *pushing* regs is really necessary,
 this can corrupt rbp,rbx,r12-15 registers.

 Your config has CONFIG_LOCKDEP=y, I think it's worth trying whether the bug
 was here.
 Please find updated patch attached. Can you try it?

It doesn't work

[2.282198] microcode: CPU0 sig=0x623, pf=0x0, revision=0x1
[2.288321] microcode: CPU1 sig=0x623, pf=0x0, revision=0x1
[2.289139] microcode: CPU2 sig=0x623, pf=0x0, revision=0x1
[2.290618] microcode: CPU3 sig=0x623, pf=0x0, revision=0x1
[2.292417] microcode: Microcode Update Driver: v2.00
tig...@aivazian.fsnet.co.uk, Peter Oruba
[2.392592] piix4_smbus :00:01.3: SMBus Host Controller at
0xb100, revision 0
[2.510882] systemd-fsck[349]: /dev/sda1: clean, 343/128016 files,
166911/512000 blocks
[2.521463] Adding 4128764k swap on /dev/sda2.  Priority:-1
extents:1 across:4128764k FS
[2.573597] EXT4-fs (sda1): mounted filesystem with ordered data
mode. Opts: (null)
[2.597771] systemd-journald[283]: Received request to flush
runtime journal from PID 1
[2.802288] audit: type=1305 audit(1424892361.629:3): audit_pid=369
old=0 auid=4294967295 ses=4294967295 res=1
[2.819660] traps: systemd-cgroups[380] general protection
ip:7fb227939028 sp:7fff6bac59c8 error:0 in
ld-2.18.so[7fb227921000+2]
[3.016262] traps: systemd-cgroups[390] general protection
ip:7f456f7b6028 sp:7fffdc059718 error:0 in
ld-2.18.so[7f456f79e000+2]


 --
 vda

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!

2014-10-06 Thread Andrey Wagin
2014-10-06 23:44 GMT+04:00 Serge E. Hallyn :
> Quoting Andrey Wagin (ava...@gmail.com):
>> 2014-10-06 22:19 GMT+04:00 Serge E. Hallyn :
>> > Quoting Andrey Wagin (ava...@gmail.com):
>> >> Hi All,
>> >>
>> >> Here is a small program, which triggers a bug. Is it interested for 
>> >> someone?
>> >
>> > Yup, would
>> >
>> > diff --git a/fs/namespace.c b/fs/namespace.c
>> > index ef42d9b..ddef25d 100644
>> > --- a/fs/namespace.c
>> > +++ b/fs/namespace.c
>> > @@ -3046,6 +3046,9 @@ static int mntns_install(struct nsproxy *nsproxy, 
>> > void *ns)
>> > struct mnt_namespace *mnt_ns = ns;
>> > struct path root;
>> >
>> > +   if (mnt_ns == nsproxy->mnt_ns)
>> > +   return 0;
>> > +
>> > if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
>> > !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
>> > !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
>> >
>> > fix it?
>>
>> Actually the bug is triggered from the second umount. I think we need
>
> Oh, I see.
>
>> something like this:
>> diff --git a/fs/namespace.c b/fs/namespace.c
>> index 68685b2..bbcd92c 100644
>> --- a/fs/namespace.c
>> +++ b/fs/namespace.c
>> @@ -1363,6 +1363,9 @@ static int do_umount(struct mount *mnt, int flags)
>> return retval;
>> }
>>
>> +   if (mnt->mnt_parent == mnt)
>
> looks good (or the equivalent  mnt_has_parent(mnt)).  Have you tested that
> this prevents your test program from causing a crash?

Yes, I have. Look the strace output below. Today here is too late for
sending a proper patch.

>
>> +   return -EINVAL;
>> +
>> namespace_lock();
>> lock_mount_hash();
>> event++;
>>
>> root@ubuntu:/home/avagin# strace ./a.out
>> ...
>> open("/proc/self/ns/mnt", O_RDONLY) = 3
>> umount("/", MNT_DETACH) = 0
>> setns(3, 131072)= 0
>> umount("/", MNT_DETACH) = -1 EINVAL (Invalid argument)
>> exit_group(0)   = ?
>> +++ exited with 0 +++
>>
>> >
>> >> root@ubuntu:/home/avagin# cat nsenter.c
>> >> #define _GNU_SOURCE /* See feature_test_macros(7) */
>> >> #include 
>> >> #include 
>> >> #include 
>> >> #include 
>> >> #include 
>> >> #include 
>> >>
>> >> int main(int argc, char **argv)
>> >> {
>> >> int fd;
>> >>
>> >> fd = open("/proc/self/ns/mnt", O_RDONLY);
>> >> if (fd < 0)
>> >> return 1;
>> >> umount2("/", MNT_DETACH);
>> >> if (setns(fd, CLONE_NEWNS))
>> >> return 1;
>> >> umount2("/", MNT_DETACH);
>> >>
>> >> return 0;
>> >> }
>> >>
>> >> root@ubuntu:/home/avagin# gcc nsenter.c -o nsenter
>> >> root@ubuntu:/home/avagin# ./nsenter
>> >>
>> >>
>> >> [  260.548301] [ cut here ]
>> >> [  260.550941] kernel BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!
>> >> [  260.552068] invalid opcode:  [#1] SMP
>> >> [  260.552068] Modules linked in: xt_CHECKSUM iptable_mangle xt_tcpudp
>> >> xt_addrtype xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4
>> >> nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack bridge stp llc
>> >> dm_thin_pool dm_persistent_data dm_bufio dm_bio_prison iptable_filter
>> >> ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel
>> >> binfmt_misc nfsd auth_rpcgss nfs_acl aesni_intel nfs lockd aes_x86_64
>> >> sunrpc fscache lrw gf128mul glue_helper ablk_helper cryptd serio_raw
>> >> ppdev parport_pc lp parport btrfs xor raid6_pq libcrc32c psmouse
>> >> floppy
>> >> [  260.552068] CPU: 0 PID: 1723 Comm: nsenter Not tainted
>> >> 3.13.0-30-generic #55-Ubuntu
>> >> [  260.552068] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
>> >> [  260.552068] task: 8800376097f0 ti: 880074824000 task.ti:
>> >> 880074824000
>> >> [  260.552068] RIP: 0010:[]  []
>> >> propagate_umount+0x123/0x130
>> >> [  26

Re: BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!

2014-10-06 Thread Andrey Wagin
2014-10-06 22:19 GMT+04:00 Serge E. Hallyn :
> Quoting Andrey Wagin (ava...@gmail.com):
>> Hi All,
>>
>> Here is a small program, which triggers a bug. Is it interested for someone?
>
> Yup, would
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index ef42d9b..ddef25d 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -3046,6 +3046,9 @@ static int mntns_install(struct nsproxy *nsproxy, void 
> *ns)
> struct mnt_namespace *mnt_ns = ns;
> struct path root;
>
> +   if (mnt_ns == nsproxy->mnt_ns)
> +   return 0;
> +
> if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
> !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
> !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
>
> fix it?

Actually the bug is triggered from the second umount. I think we need
something like this:
diff --git a/fs/namespace.c b/fs/namespace.c
index 68685b2..bbcd92c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1363,6 +1363,9 @@ static int do_umount(struct mount *mnt, int flags)
return retval;
}

+   if (mnt->mnt_parent == mnt)
+   return -EINVAL;
+
namespace_lock();
lock_mount_hash();
event++;

root@ubuntu:/home/avagin# strace ./a.out
...
open("/proc/self/ns/mnt", O_RDONLY) = 3
umount("/", MNT_DETACH) = 0
setns(3, 131072)= 0
umount("/", MNT_DETACH) = -1 EINVAL (Invalid argument)
exit_group(0)   = ?
+++ exited with 0 +++

>
>> root@ubuntu:/home/avagin# cat nsenter.c
>> #define _GNU_SOURCE /* See feature_test_macros(7) */
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>>
>> int main(int argc, char **argv)
>> {
>> int fd;
>>
>> fd = open("/proc/self/ns/mnt", O_RDONLY);
>> if (fd < 0)
>> return 1;
>> umount2("/", MNT_DETACH);
>> if (setns(fd, CLONE_NEWNS))
>> return 1;
>> umount2("/", MNT_DETACH);
>>
>> return 0;
>> }
>>
>> root@ubuntu:/home/avagin# gcc nsenter.c -o nsenter
>> root@ubuntu:/home/avagin# ./nsenter
>>
>>
>> [  260.548301] [ cut here ]
>> [  260.550941] kernel BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!
>> [  260.552068] invalid opcode:  [#1] SMP
>> [  260.552068] Modules linked in: xt_CHECKSUM iptable_mangle xt_tcpudp
>> xt_addrtype xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4
>> nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack bridge stp llc
>> dm_thin_pool dm_persistent_data dm_bufio dm_bio_prison iptable_filter
>> ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel
>> binfmt_misc nfsd auth_rpcgss nfs_acl aesni_intel nfs lockd aes_x86_64
>> sunrpc fscache lrw gf128mul glue_helper ablk_helper cryptd serio_raw
>> ppdev parport_pc lp parport btrfs xor raid6_pq libcrc32c psmouse
>> floppy
>> [  260.552068] CPU: 0 PID: 1723 Comm: nsenter Not tainted
>> 3.13.0-30-generic #55-Ubuntu
>> [  260.552068] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
>> [  260.552068] task: 8800376097f0 ti: 880074824000 task.ti:
>> 880074824000
>> [  260.552068] RIP: 0010:[]  []
>> propagate_umount+0x123/0x130
>> [  260.552068] RSP: 0018:880074825e98  EFLAGS: 00010246
>> [  260.552068] RAX: 88007c741140 RBX: 0002 RCX: 
>> 88007c741190
>> [  260.552068] RDX: 88007c741190 RSI: 880074825ec0 RDI: 
>> 880074825ec0
>> [  260.552068] RBP: 880074825eb0 R08: 000172e0 R09: 
>> 88007fc172e0
>> [  260.552068] R10: 811cc642 R11: ea0001d59000 R12: 
>> 88007c741140
>> [  260.552068] R13: 88007c741140 R14: 88007c741140 R15: 
>> 
>> [  260.552068] FS:  7fd5c7e41740() GS:88007fc0()
>> knlGS:
>> [  260.552068] CS:  0010 DS:  ES:  CR0: 80050033
>> [  260.552068] CR2: 7fd5c7968050 CR3: 70124000 CR4: 
>> 000406f0
>> [  260.552068] Stack:
>> [  260.552068]  0002 0002 88007c631000
>> 880074825ed8
>> [  260.552068]  811dcfac 88007c741140 0002
>> 88007c741160
>> [  260.552068]  880074825f38 811dd12b 811cc642
>> 7564
>> [  260.552068] Call Trace:
>> [  260.552068]  [] umount_tree+0x20c/0x260
>> [  260.552068]  [] do

BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!

2014-10-06 Thread Andrey Wagin
Hi All,

Here is a small program, which triggers a bug. Is it interested for someone?

root@ubuntu:/home/avagin# cat nsenter.c
#define _GNU_SOURCE /* See feature_test_macros(7) */
#include 
#include 
#include 
#include 
#include 
#include 

int main(int argc, char **argv)
{
int fd;

fd = open("/proc/self/ns/mnt", O_RDONLY);
if (fd < 0)
return 1;
umount2("/", MNT_DETACH);
if (setns(fd, CLONE_NEWNS))
return 1;
umount2("/", MNT_DETACH);

return 0;
}

root@ubuntu:/home/avagin# gcc nsenter.c -o nsenter
root@ubuntu:/home/avagin# ./nsenter


[  260.548301] [ cut here ]
[  260.550941] kernel BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!
[  260.552068] invalid opcode:  [#1] SMP
[  260.552068] Modules linked in: xt_CHECKSUM iptable_mangle xt_tcpudp
xt_addrtype xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4
nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack bridge stp llc
dm_thin_pool dm_persistent_data dm_bufio dm_bio_prison iptable_filter
ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel
binfmt_misc nfsd auth_rpcgss nfs_acl aesni_intel nfs lockd aes_x86_64
sunrpc fscache lrw gf128mul glue_helper ablk_helper cryptd serio_raw
ppdev parport_pc lp parport btrfs xor raid6_pq libcrc32c psmouse
floppy
[  260.552068] CPU: 0 PID: 1723 Comm: nsenter Not tainted
3.13.0-30-generic #55-Ubuntu
[  260.552068] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[  260.552068] task: 8800376097f0 ti: 880074824000 task.ti:
880074824000
[  260.552068] RIP: 0010:[]  []
propagate_umount+0x123/0x130
[  260.552068] RSP: 0018:880074825e98  EFLAGS: 00010246
[  260.552068] RAX: 88007c741140 RBX: 0002 RCX: 88007c741190
[  260.552068] RDX: 88007c741190 RSI: 880074825ec0 RDI: 880074825ec0
[  260.552068] RBP: 880074825eb0 R08: 000172e0 R09: 88007fc172e0
[  260.552068] R10: 811cc642 R11: ea0001d59000 R12: 88007c741140
[  260.552068] R13: 88007c741140 R14: 88007c741140 R15: 
[  260.552068] FS:  7fd5c7e41740() GS:88007fc0()
knlGS:
[  260.552068] CS:  0010 DS:  ES:  CR0: 80050033
[  260.552068] CR2: 7fd5c7968050 CR3: 70124000 CR4: 000406f0
[  260.552068] Stack:
[  260.552068]  0002 0002 88007c631000
880074825ed8
[  260.552068]  811dcfac 88007c741140 0002
88007c741160
[  260.552068]  880074825f38 811dd12b 811cc642
7564
[  260.552068] Call Trace:
[  260.552068]  [] umount_tree+0x20c/0x260
[  260.552068]  [] do_umount+0x12b/0x300
[  260.552068]  [] ? final_putname+0x22/0x50
[  260.552068]  [] ? putname+0x29/0x40
[  260.552068]  [] SyS_umount+0xdc/0x100
[  260.552068]  [] tracesys+0xe1/0xe6
[  260.552068] Code: 89 50 08 48 8b 50 08 48 89 02 49 89 45 08 e9 72
ff ff ff 0f 1f 44 00 00 4c 89 e6 4c 89 e7 e8 f5 f6 ff ff 48 89 c3 e9
39 ff ff ff <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 90 66 66 66 66 90 55
b8 01
[  260.552068] RIP  [] propagate_umount+0x123/0x130
[  260.552068]  RSP 
[  260.611451] ---[ end trace 11c33d85f1d4c652 ]---
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!

2014-10-06 Thread Andrey Wagin
Hi All,

Here is a small program, which triggers a bug. Is it interested for someone?

root@ubuntu:/home/avagin# cat nsenter.c
#define _GNU_SOURCE /* See feature_test_macros(7) */
#include sched.h
#include sys/types.h
#include sys/stat.h
#include fcntl.h
#include unistd.h
#include sys/mount.h

int main(int argc, char **argv)
{
int fd;

fd = open(/proc/self/ns/mnt, O_RDONLY);
if (fd  0)
return 1;
umount2(/, MNT_DETACH);
if (setns(fd, CLONE_NEWNS))
return 1;
umount2(/, MNT_DETACH);

return 0;
}

root@ubuntu:/home/avagin# gcc nsenter.c -o nsenter
root@ubuntu:/home/avagin# ./nsenter


[  260.548301] [ cut here ]
[  260.550941] kernel BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!
[  260.552068] invalid opcode:  [#1] SMP
[  260.552068] Modules linked in: xt_CHECKSUM iptable_mangle xt_tcpudp
xt_addrtype xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4
nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack bridge stp llc
dm_thin_pool dm_persistent_data dm_bufio dm_bio_prison iptable_filter
ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel
binfmt_misc nfsd auth_rpcgss nfs_acl aesni_intel nfs lockd aes_x86_64
sunrpc fscache lrw gf128mul glue_helper ablk_helper cryptd serio_raw
ppdev parport_pc lp parport btrfs xor raid6_pq libcrc32c psmouse
floppy
[  260.552068] CPU: 0 PID: 1723 Comm: nsenter Not tainted
3.13.0-30-generic #55-Ubuntu
[  260.552068] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[  260.552068] task: 8800376097f0 ti: 880074824000 task.ti:
880074824000
[  260.552068] RIP: 0010:[811e9483]  [811e9483]
propagate_umount+0x123/0x130
[  260.552068] RSP: 0018:880074825e98  EFLAGS: 00010246
[  260.552068] RAX: 88007c741140 RBX: 0002 RCX: 88007c741190
[  260.552068] RDX: 88007c741190 RSI: 880074825ec0 RDI: 880074825ec0
[  260.552068] RBP: 880074825eb0 R08: 000172e0 R09: 88007fc172e0
[  260.552068] R10: 811cc642 R11: ea0001d59000 R12: 88007c741140
[  260.552068] R13: 88007c741140 R14: 88007c741140 R15: 
[  260.552068] FS:  7fd5c7e41740() GS:88007fc0()
knlGS:
[  260.552068] CS:  0010 DS:  ES:  CR0: 80050033
[  260.552068] CR2: 7fd5c7968050 CR3: 70124000 CR4: 000406f0
[  260.552068] Stack:
[  260.552068]  0002 0002 88007c631000
880074825ed8
[  260.552068]  811dcfac 88007c741140 0002
88007c741160
[  260.552068]  880074825f38 811dd12b 811cc642
7564
[  260.552068] Call Trace:
[  260.552068]  [811dcfac] umount_tree+0x20c/0x260
[  260.552068]  [811dd12b] do_umount+0x12b/0x300
[  260.552068]  [811cc642] ? final_putname+0x22/0x50
[  260.552068]  [811cc849] ? putname+0x29/0x40
[  260.552068]  [811dd88c] SyS_umount+0xdc/0x100
[  260.552068]  [8172aeff] tracesys+0xe1/0xe6
[  260.552068] Code: 89 50 08 48 8b 50 08 48 89 02 49 89 45 08 e9 72
ff ff ff 0f 1f 44 00 00 4c 89 e6 4c 89 e7 e8 f5 f6 ff ff 48 89 c3 e9
39 ff ff ff 0f 0b 66 2e 0f 1f 84 00 00 00 00 00 90 66 66 66 66 90 55
b8 01
[  260.552068] RIP  [811e9483] propagate_umount+0x123/0x130
[  260.552068]  RSP 880074825e98
[  260.611451] ---[ end trace 11c33d85f1d4c652 ]---
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!

2014-10-06 Thread Andrey Wagin
2014-10-06 22:19 GMT+04:00 Serge E. Hallyn se...@hallyn.com:
 Quoting Andrey Wagin (ava...@gmail.com):
 Hi All,

 Here is a small program, which triggers a bug. Is it interested for someone?

 Yup, would

 diff --git a/fs/namespace.c b/fs/namespace.c
 index ef42d9b..ddef25d 100644
 --- a/fs/namespace.c
 +++ b/fs/namespace.c
 @@ -3046,6 +3046,9 @@ static int mntns_install(struct nsproxy *nsproxy, void 
 *ns)
 struct mnt_namespace *mnt_ns = ns;
 struct path root;

 +   if (mnt_ns == nsproxy-mnt_ns)
 +   return 0;
 +
 if (!ns_capable(mnt_ns-user_ns, CAP_SYS_ADMIN) ||
 !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))

 fix it?

Actually the bug is triggered from the second umount. I think we need
something like this:
diff --git a/fs/namespace.c b/fs/namespace.c
index 68685b2..bbcd92c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1363,6 +1363,9 @@ static int do_umount(struct mount *mnt, int flags)
return retval;
}

+   if (mnt-mnt_parent == mnt)
+   return -EINVAL;
+
namespace_lock();
lock_mount_hash();
event++;

root@ubuntu:/home/avagin# strace ./a.out
...
open(/proc/self/ns/mnt, O_RDONLY) = 3
umount(/, MNT_DETACH) = 0
setns(3, 131072)= 0
umount(/, MNT_DETACH) = -1 EINVAL (Invalid argument)
exit_group(0)   = ?
+++ exited with 0 +++


 root@ubuntu:/home/avagin# cat nsenter.c
 #define _GNU_SOURCE /* See feature_test_macros(7) */
 #include sched.h
 #include sys/types.h
 #include sys/stat.h
 #include fcntl.h
 #include unistd.h
 #include sys/mount.h

 int main(int argc, char **argv)
 {
 int fd;

 fd = open(/proc/self/ns/mnt, O_RDONLY);
 if (fd  0)
 return 1;
 umount2(/, MNT_DETACH);
 if (setns(fd, CLONE_NEWNS))
 return 1;
 umount2(/, MNT_DETACH);

 return 0;
 }

 root@ubuntu:/home/avagin# gcc nsenter.c -o nsenter
 root@ubuntu:/home/avagin# ./nsenter


 [  260.548301] [ cut here ]
 [  260.550941] kernel BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!
 [  260.552068] invalid opcode:  [#1] SMP
 [  260.552068] Modules linked in: xt_CHECKSUM iptable_mangle xt_tcpudp
 xt_addrtype xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4
 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack bridge stp llc
 dm_thin_pool dm_persistent_data dm_bufio dm_bio_prison iptable_filter
 ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel
 binfmt_misc nfsd auth_rpcgss nfs_acl aesni_intel nfs lockd aes_x86_64
 sunrpc fscache lrw gf128mul glue_helper ablk_helper cryptd serio_raw
 ppdev parport_pc lp parport btrfs xor raid6_pq libcrc32c psmouse
 floppy
 [  260.552068] CPU: 0 PID: 1723 Comm: nsenter Not tainted
 3.13.0-30-generic #55-Ubuntu
 [  260.552068] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
 [  260.552068] task: 8800376097f0 ti: 880074824000 task.ti:
 880074824000
 [  260.552068] RIP: 0010:[811e9483]  [811e9483]
 propagate_umount+0x123/0x130
 [  260.552068] RSP: 0018:880074825e98  EFLAGS: 00010246
 [  260.552068] RAX: 88007c741140 RBX: 0002 RCX: 
 88007c741190
 [  260.552068] RDX: 88007c741190 RSI: 880074825ec0 RDI: 
 880074825ec0
 [  260.552068] RBP: 880074825eb0 R08: 000172e0 R09: 
 88007fc172e0
 [  260.552068] R10: 811cc642 R11: ea0001d59000 R12: 
 88007c741140
 [  260.552068] R13: 88007c741140 R14: 88007c741140 R15: 
 
 [  260.552068] FS:  7fd5c7e41740() GS:88007fc0()
 knlGS:
 [  260.552068] CS:  0010 DS:  ES:  CR0: 80050033
 [  260.552068] CR2: 7fd5c7968050 CR3: 70124000 CR4: 
 000406f0
 [  260.552068] Stack:
 [  260.552068]  0002 0002 88007c631000
 880074825ed8
 [  260.552068]  811dcfac 88007c741140 0002
 88007c741160
 [  260.552068]  880074825f38 811dd12b 811cc642
 7564
 [  260.552068] Call Trace:
 [  260.552068]  [811dcfac] umount_tree+0x20c/0x260
 [  260.552068]  [811dd12b] do_umount+0x12b/0x300
 [  260.552068]  [811cc642] ? final_putname+0x22/0x50
 [  260.552068]  [811cc849] ? putname+0x29/0x40
 [  260.552068]  [811dd88c] SyS_umount+0xdc/0x100
 [  260.552068]  [8172aeff] tracesys+0xe1/0xe6
 [  260.552068] Code: 89 50 08 48 8b 50 08 48 89 02 49 89 45 08 e9 72
 ff ff ff 0f 1f 44 00 00 4c 89 e6 4c 89 e7 e8 f5 f6 ff ff 48 89 c3 e9
 39 ff ff ff 0f 0b 66 2e 0f 1f 84 00 00 00 00 00 90 66 66 66 66 90 55
 b8 01
 [  260.552068] RIP  [811e9483] propagate_umount+0x123/0x130
 [  260.552068]  RSP 880074825e98
 [  260.611451] ---[ end trace 11c33d85f1d4c652 ]---
 --
 To unsubscribe from

Re: BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!

2014-10-06 Thread Andrey Wagin
2014-10-06 23:44 GMT+04:00 Serge E. Hallyn se...@hallyn.com:
 Quoting Andrey Wagin (ava...@gmail.com):
 2014-10-06 22:19 GMT+04:00 Serge E. Hallyn se...@hallyn.com:
  Quoting Andrey Wagin (ava...@gmail.com):
  Hi All,
 
  Here is a small program, which triggers a bug. Is it interested for 
  someone?
 
  Yup, would
 
  diff --git a/fs/namespace.c b/fs/namespace.c
  index ef42d9b..ddef25d 100644
  --- a/fs/namespace.c
  +++ b/fs/namespace.c
  @@ -3046,6 +3046,9 @@ static int mntns_install(struct nsproxy *nsproxy, 
  void *ns)
  struct mnt_namespace *mnt_ns = ns;
  struct path root;
 
  +   if (mnt_ns == nsproxy-mnt_ns)
  +   return 0;
  +
  if (!ns_capable(mnt_ns-user_ns, CAP_SYS_ADMIN) ||
  !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
  !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
 
  fix it?

 Actually the bug is triggered from the second umount. I think we need

 Oh, I see.

 something like this:
 diff --git a/fs/namespace.c b/fs/namespace.c
 index 68685b2..bbcd92c 100644
 --- a/fs/namespace.c
 +++ b/fs/namespace.c
 @@ -1363,6 +1363,9 @@ static int do_umount(struct mount *mnt, int flags)
 return retval;
 }

 +   if (mnt-mnt_parent == mnt)

 looks good (or the equivalent  mnt_has_parent(mnt)).  Have you tested that
 this prevents your test program from causing a crash?

Yes, I have. Look the strace output below. Today here is too late for
sending a proper patch.


 +   return -EINVAL;
 +
 namespace_lock();
 lock_mount_hash();
 event++;

 root@ubuntu:/home/avagin# strace ./a.out
 ...
 open(/proc/self/ns/mnt, O_RDONLY) = 3
 umount(/, MNT_DETACH) = 0
 setns(3, 131072)= 0
 umount(/, MNT_DETACH) = -1 EINVAL (Invalid argument)
 exit_group(0)   = ?
 +++ exited with 0 +++

 
  root@ubuntu:/home/avagin# cat nsenter.c
  #define _GNU_SOURCE /* See feature_test_macros(7) */
  #include sched.h
  #include sys/types.h
  #include sys/stat.h
  #include fcntl.h
  #include unistd.h
  #include sys/mount.h
 
  int main(int argc, char **argv)
  {
  int fd;
 
  fd = open(/proc/self/ns/mnt, O_RDONLY);
  if (fd  0)
  return 1;
  umount2(/, MNT_DETACH);
  if (setns(fd, CLONE_NEWNS))
  return 1;
  umount2(/, MNT_DETACH);
 
  return 0;
  }
 
  root@ubuntu:/home/avagin# gcc nsenter.c -o nsenter
  root@ubuntu:/home/avagin# ./nsenter
 
 
  [  260.548301] [ cut here ]
  [  260.550941] kernel BUG at /build/buildd/linux-3.13.0/fs/pnode.c:372!
  [  260.552068] invalid opcode:  [#1] SMP
  [  260.552068] Modules linked in: xt_CHECKSUM iptable_mangle xt_tcpudp
  xt_addrtype xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4
  nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack bridge stp llc
  dm_thin_pool dm_persistent_data dm_bufio dm_bio_prison iptable_filter
  ip_tables x_tables crct10dif_pclmul crc32_pclmul ghash_clmulni_intel
  binfmt_misc nfsd auth_rpcgss nfs_acl aesni_intel nfs lockd aes_x86_64
  sunrpc fscache lrw gf128mul glue_helper ablk_helper cryptd serio_raw
  ppdev parport_pc lp parport btrfs xor raid6_pq libcrc32c psmouse
  floppy
  [  260.552068] CPU: 0 PID: 1723 Comm: nsenter Not tainted
  3.13.0-30-generic #55-Ubuntu
  [  260.552068] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  [  260.552068] task: 8800376097f0 ti: 880074824000 task.ti:
  880074824000
  [  260.552068] RIP: 0010:[811e9483]  [811e9483]
  propagate_umount+0x123/0x130
  [  260.552068] RSP: 0018:880074825e98  EFLAGS: 00010246
  [  260.552068] RAX: 88007c741140 RBX: 0002 RCX: 
  88007c741190
  [  260.552068] RDX: 88007c741190 RSI: 880074825ec0 RDI: 
  880074825ec0
  [  260.552068] RBP: 880074825eb0 R08: 000172e0 R09: 
  88007fc172e0
  [  260.552068] R10: 811cc642 R11: ea0001d59000 R12: 
  88007c741140
  [  260.552068] R13: 88007c741140 R14: 88007c741140 R15: 
  
  [  260.552068] FS:  7fd5c7e41740() GS:88007fc0()
  knlGS:
  [  260.552068] CS:  0010 DS:  ES:  CR0: 80050033
  [  260.552068] CR2: 7fd5c7968050 CR3: 70124000 CR4: 
  000406f0
  [  260.552068] Stack:
  [  260.552068]  0002 0002 88007c631000
  880074825ed8
  [  260.552068]  811dcfac 88007c741140 0002
  88007c741160
  [  260.552068]  880074825f38 811dd12b 811cc642
  7564
  [  260.552068] Call Trace:
  [  260.552068]  [811dcfac] umount_tree+0x20c/0x260
  [  260.552068]  [811dd12b] do_umount+0x12b/0x300
  [  260.552068]  [811cc642] ? final_putname+0x22/0x50
  [  260.552068]  [811cc849] ? putname+0x29/0x40
  [  260.552068]  [811dd88c

Does MNT_LOCKED work as expected?

2014-09-29 Thread Andrey Wagin
Hi All,

I think I found a case, when MNT_LOCKED (5ff9d8a65ce8 "vfs: Lock in
place mounts from more privileged users") doesn't help to hide
overmounted parts from unprivileged users.
The problem exists for mounts, which are not overmounted entirely. In
such cases we can open a directory from a target mount, which is not
overmounted. Then we do pivot_root to move all mounts in a temporary
directory. At the final step we deatch all mounts from the temporary
directory. After that all children mounts are umounted from the target
mount and we can use our file descriptor to open files, which have
been overmount before.

Here is a example https://github.com/avagin/userns_vs_mntns.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Does MNT_LOCKED work as expected?

2014-09-29 Thread Andrey Wagin
Hi All,

I think I found a case, when MNT_LOCKED (5ff9d8a65ce8 vfs: Lock in
place mounts from more privileged users) doesn't help to hide
overmounted parts from unprivileged users.
The problem exists for mounts, which are not overmounted entirely. In
such cases we can open a directory from a target mount, which is not
overmounted. Then we do pivot_root to move all mounts in a temporary
directory. At the final step we deatch all mounts from the temporary
directory. After that all children mounts are umounted from the target
mount and we can use our file descriptor to open files, which have
been overmount before.

Here is a example https://github.com/avagin/userns_vs_mntns.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: cgroup_mount() falls asleep forever

2014-09-24 Thread Andrey Wagin
2014-09-24 14:31 GMT+04:00 Andrey Wagin :
> Hi All,
>
> I execute CRIU tests on linux-next. Today I found that one of tests
> hangs up forever.
>
> [root@linux-next-test linux-next]# git describe HEAD
> next-20140922
> [root@linux-next-test ~]# ps axf
> ...
>   698 ?Ss 0:05  \_ sshd: root@notty
>   700 ?Ss 0:00  |   \_ bash -x
> jenkins-scripts/jenkins-ct.sh jenkins.sh
>   706 ?S  0:00  |   \_ bash -c ( mount --make-rprivate
> / && umount -l  /proc && mount -t proc proc /proc/ && bash -x
> jenkins.sh)
>   707 ?S  0:00  |   \_ bash -c ( mount
> --make-rprivate / && umount -l  /proc && mount -t proc proc /proc/ &&
> bash -x jenkins.sh)
>   711 ?S  0:00  |   \_ bash -x jenkins.sh
>  2981 ?S  0:05  |   \_ bash -x
> test/zdtm.sh -C -x .*\(maps01\|maps04\)
>  6717 ?S  0:00  |   \_ /bin/bash
> zdtm/live/static/cgroup00.hook --clean
>  6719 ?D 11:13  |   \_ mount -t
> cgroup none cgclean.cWgK71 -o none,name=zdtmtst
>
> [root@linux-next-test ~]# cat /proc/6719/stack
> [] msleep+0x37/0x50
> [] cgroup_mount+0x712/0xa50
> [] mount_fs+0x39/0x1b0
> [] vfs_kern_mount+0x6b/0x150
> [] do_mount+0x22b/0xc20
> [] SyS_mount+0xa2/0x110
> [] system_call_fastpath+0x12/0x17
> [] 0x
>
> Let me know which info do you need for investigation.
>
> Steps to reproduce:
>
> git clone https://github.com/xemul/criu.git
> cd criu
> make
> bash test/zdtm.sh static/cgroup00

Steps to reproduce without CRIU:

[root@avagin-fc19-cr ~]# mount -t cgroup -o none,name=zdtmtst xxx /mnt/test/
[root@avagin-fc19-cr ~]# mkdir /mnt/test/xxx
[root@avagin-fc19-cr ~]# umount /mnt/test/
[root@avagin-fc19-cr ~]# mount -t cgroup -o none,name=zdtmtst xxx /mnt/test/
[1]+  Stopped mount -t cgroup -o none,name=zdtmtst xxx
/mnt/test/
[root@avagin-fc19-cr ~]# bg
[root@avagin-fc19-cr ~]# cat /proc/633/stack
[] msleep+0x37/0x50
[] cgroup_mount+0x482/0x670
[] mount_fs+0x39/0x1b0
[] vfs_kern_mount+0x6b/0x150
[] do_mount+0x22b/0xc20
[] SyS_mount+0xa2/0x110
[] system_call_fastpath+0x12/0x17
[] 0x

>
> Thanks,
> Andrew
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: cgroup_mount() falls asleep forever

2014-09-24 Thread Andrey Wagin
2014-09-24 14:31 GMT+04:00 Andrey Wagin :
> Hi All,

The problem is in a following commit:

commit 0c7bf3e8cab7900e17ce7f97104c39927d835469
Author: Zefan Li 
Date:   Sat Sep 20 14:49:10 2014 +0800

cgroup: remove redundant variable in cgroup_mount()

Both pinned_sb and new_sb indicate if a new superblock is needed,
so we can just remove new_sb.

Note now we must check if kernfs_tryget_sb() returns NULL, because
when it returns NULL, kernfs_mount() may still re-use an existing
superblock, which is just allocated by another concurent mount.

Suggested-by: Tejun Heo 
Signed-off-by: Zefan Li 
Signed-off-by: Tejun Heo 

>
> I execute CRIU tests on linux-next. Today I found that one of tests
> hangs up forever.
>
> [root@linux-next-test linux-next]# git describe HEAD
> next-20140922
> [root@linux-next-test ~]# ps axf
> ...
>   698 ?Ss 0:05  \_ sshd: root@notty
>   700 ?Ss 0:00  |   \_ bash -x
> jenkins-scripts/jenkins-ct.sh jenkins.sh
>   706 ?S  0:00  |   \_ bash -c ( mount --make-rprivate
> / && umount -l  /proc && mount -t proc proc /proc/ && bash -x
> jenkins.sh)
>   707 ?S  0:00  |   \_ bash -c ( mount
> --make-rprivate / && umount -l  /proc && mount -t proc proc /proc/ &&
> bash -x jenkins.sh)
>   711 ?S  0:00  |   \_ bash -x jenkins.sh
>  2981 ?S  0:05  |   \_ bash -x
> test/zdtm.sh -C -x .*\(maps01\|maps04\)
>  6717 ?S  0:00  |   \_ /bin/bash
> zdtm/live/static/cgroup00.hook --clean
>  6719 ?D 11:13  |   \_ mount -t
> cgroup none cgclean.cWgK71 -o none,name=zdtmtst
>
> [root@linux-next-test ~]# cat /proc/6719/stack
> [] msleep+0x37/0x50
> [] cgroup_mount+0x712/0xa50
> [] mount_fs+0x39/0x1b0
> [] vfs_kern_mount+0x6b/0x150
> [] do_mount+0x22b/0xc20
> [] SyS_mount+0xa2/0x110
> [] system_call_fastpath+0x12/0x17
> [] 0x
>
> Let me know which info do you need for investigation.
>
> Steps to reproduce:
>
> git clone https://github.com/xemul/criu.git
> cd criu
> make
> bash test/zdtm.sh static/cgroup00
>
> Thanks,
> Andrew
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


linux-next: cgroup_mount() falls asleep forever

2014-09-24 Thread Andrey Wagin
Hi All,

I execute CRIU tests on linux-next. Today I found that one of tests
hangs up forever.

[root@linux-next-test linux-next]# git describe HEAD
next-20140922
[root@linux-next-test ~]# ps axf
...
  698 ?Ss 0:05  \_ sshd: root@notty
  700 ?Ss 0:00  |   \_ bash -x
jenkins-scripts/jenkins-ct.sh jenkins.sh
  706 ?S  0:00  |   \_ bash -c ( mount --make-rprivate
/ && umount -l  /proc && mount -t proc proc /proc/ && bash -x
jenkins.sh)
  707 ?S  0:00  |   \_ bash -c ( mount
--make-rprivate / && umount -l  /proc && mount -t proc proc /proc/ &&
bash -x jenkins.sh)
  711 ?S  0:00  |   \_ bash -x jenkins.sh
 2981 ?S  0:05  |   \_ bash -x
test/zdtm.sh -C -x .*\(maps01\|maps04\)
 6717 ?S  0:00  |   \_ /bin/bash
zdtm/live/static/cgroup00.hook --clean
 6719 ?D 11:13  |   \_ mount -t
cgroup none cgclean.cWgK71 -o none,name=zdtmtst

[root@linux-next-test ~]# cat /proc/6719/stack
[] msleep+0x37/0x50
[] cgroup_mount+0x712/0xa50
[] mount_fs+0x39/0x1b0
[] vfs_kern_mount+0x6b/0x150
[] do_mount+0x22b/0xc20
[] SyS_mount+0xa2/0x110
[] system_call_fastpath+0x12/0x17
[] 0x

Let me know which info do you need for investigation.

Steps to reproduce:

git clone https://github.com/xemul/criu.git
cd criu
make
bash test/zdtm.sh static/cgroup00

Thanks,
Andrew
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


linux-next: cgroup_mount() falls asleep forever

2014-09-24 Thread Andrey Wagin
Hi All,

I execute CRIU tests on linux-next. Today I found that one of tests
hangs up forever.

[root@linux-next-test linux-next]# git describe HEAD
next-20140922
[root@linux-next-test ~]# ps axf
...
  698 ?Ss 0:05  \_ sshd: root@notty
  700 ?Ss 0:00  |   \_ bash -x
jenkins-scripts/jenkins-ct.sh jenkins.sh
  706 ?S  0:00  |   \_ bash -c ( mount --make-rprivate
/  umount -l  /proc  mount -t proc proc /proc/  bash -x
jenkins.sh)
  707 ?S  0:00  |   \_ bash -c ( mount
--make-rprivate /  umount -l  /proc  mount -t proc proc /proc/ 
bash -x jenkins.sh)
  711 ?S  0:00  |   \_ bash -x jenkins.sh
 2981 ?S  0:05  |   \_ bash -x
test/zdtm.sh -C -x .*\(maps01\|maps04\)
 6717 ?S  0:00  |   \_ /bin/bash
zdtm/live/static/cgroup00.hook --clean
 6719 ?D 11:13  |   \_ mount -t
cgroup none cgclean.cWgK71 -o none,name=zdtmtst

[root@linux-next-test ~]# cat /proc/6719/stack
[8111fcb7] msleep+0x37/0x50
[8114bde2] cgroup_mount+0x712/0xa50
[8124d0e9] mount_fs+0x39/0x1b0
[8126d76b] vfs_kern_mount+0x6b/0x150
[8127075b] do_mount+0x22b/0xc20
[81271492] SyS_mount+0xa2/0x110
[817e3ba9] system_call_fastpath+0x12/0x17
[] 0x

Let me know which info do you need for investigation.

Steps to reproduce:

git clone https://github.com/xemul/criu.git
cd criu
make
bash test/zdtm.sh static/cgroup00

Thanks,
Andrew
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: cgroup_mount() falls asleep forever

2014-09-24 Thread Andrey Wagin
2014-09-24 14:31 GMT+04:00 Andrey Wagin ava...@gmail.com:
 Hi All,

The problem is in a following commit:

commit 0c7bf3e8cab7900e17ce7f97104c39927d835469
Author: Zefan Li lize...@huawei.com
Date:   Sat Sep 20 14:49:10 2014 +0800

cgroup: remove redundant variable in cgroup_mount()

Both pinned_sb and new_sb indicate if a new superblock is needed,
so we can just remove new_sb.

Note now we must check if kernfs_tryget_sb() returns NULL, because
when it returns NULL, kernfs_mount() may still re-use an existing
superblock, which is just allocated by another concurent mount.

Suggested-by: Tejun Heo t...@kernel.org
Signed-off-by: Zefan Li lize...@huawei.com
Signed-off-by: Tejun Heo t...@kernel.org


 I execute CRIU tests on linux-next. Today I found that one of tests
 hangs up forever.

 [root@linux-next-test linux-next]# git describe HEAD
 next-20140922
 [root@linux-next-test ~]# ps axf
 ...
   698 ?Ss 0:05  \_ sshd: root@notty
   700 ?Ss 0:00  |   \_ bash -x
 jenkins-scripts/jenkins-ct.sh jenkins.sh
   706 ?S  0:00  |   \_ bash -c ( mount --make-rprivate
 /  umount -l  /proc  mount -t proc proc /proc/  bash -x
 jenkins.sh)
   707 ?S  0:00  |   \_ bash -c ( mount
 --make-rprivate /  umount -l  /proc  mount -t proc proc /proc/ 
 bash -x jenkins.sh)
   711 ?S  0:00  |   \_ bash -x jenkins.sh
  2981 ?S  0:05  |   \_ bash -x
 test/zdtm.sh -C -x .*\(maps01\|maps04\)
  6717 ?S  0:00  |   \_ /bin/bash
 zdtm/live/static/cgroup00.hook --clean
  6719 ?D 11:13  |   \_ mount -t
 cgroup none cgclean.cWgK71 -o none,name=zdtmtst

 [root@linux-next-test ~]# cat /proc/6719/stack
 [8111fcb7] msleep+0x37/0x50
 [8114bde2] cgroup_mount+0x712/0xa50
 [8124d0e9] mount_fs+0x39/0x1b0
 [8126d76b] vfs_kern_mount+0x6b/0x150
 [8127075b] do_mount+0x22b/0xc20
 [81271492] SyS_mount+0xa2/0x110
 [817e3ba9] system_call_fastpath+0x12/0x17
 [] 0x

 Let me know which info do you need for investigation.

 Steps to reproduce:

 git clone https://github.com/xemul/criu.git
 cd criu
 make
 bash test/zdtm.sh static/cgroup00

 Thanks,
 Andrew
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: linux-next: cgroup_mount() falls asleep forever

2014-09-24 Thread Andrey Wagin
2014-09-24 14:31 GMT+04:00 Andrey Wagin ava...@gmail.com:
 Hi All,

 I execute CRIU tests on linux-next. Today I found that one of tests
 hangs up forever.

 [root@linux-next-test linux-next]# git describe HEAD
 next-20140922
 [root@linux-next-test ~]# ps axf
 ...
   698 ?Ss 0:05  \_ sshd: root@notty
   700 ?Ss 0:00  |   \_ bash -x
 jenkins-scripts/jenkins-ct.sh jenkins.sh
   706 ?S  0:00  |   \_ bash -c ( mount --make-rprivate
 /  umount -l  /proc  mount -t proc proc /proc/  bash -x
 jenkins.sh)
   707 ?S  0:00  |   \_ bash -c ( mount
 --make-rprivate /  umount -l  /proc  mount -t proc proc /proc/ 
 bash -x jenkins.sh)
   711 ?S  0:00  |   \_ bash -x jenkins.sh
  2981 ?S  0:05  |   \_ bash -x
 test/zdtm.sh -C -x .*\(maps01\|maps04\)
  6717 ?S  0:00  |   \_ /bin/bash
 zdtm/live/static/cgroup00.hook --clean
  6719 ?D 11:13  |   \_ mount -t
 cgroup none cgclean.cWgK71 -o none,name=zdtmtst

 [root@linux-next-test ~]# cat /proc/6719/stack
 [8111fcb7] msleep+0x37/0x50
 [8114bde2] cgroup_mount+0x712/0xa50
 [8124d0e9] mount_fs+0x39/0x1b0
 [8126d76b] vfs_kern_mount+0x6b/0x150
 [8127075b] do_mount+0x22b/0xc20
 [81271492] SyS_mount+0xa2/0x110
 [817e3ba9] system_call_fastpath+0x12/0x17
 [] 0x

 Let me know which info do you need for investigation.

 Steps to reproduce:

 git clone https://github.com/xemul/criu.git
 cd criu
 make
 bash test/zdtm.sh static/cgroup00

Steps to reproduce without CRIU:

[root@avagin-fc19-cr ~]# mount -t cgroup -o none,name=zdtmtst xxx /mnt/test/
[root@avagin-fc19-cr ~]# mkdir /mnt/test/xxx
[root@avagin-fc19-cr ~]# umount /mnt/test/
[root@avagin-fc19-cr ~]# mount -t cgroup -o none,name=zdtmtst xxx /mnt/test/
[1]+  Stopped mount -t cgroup -o none,name=zdtmtst xxx
/mnt/test/
[root@avagin-fc19-cr ~]# bg
[root@avagin-fc19-cr ~]# cat /proc/633/stack
[810e9377] msleep+0x37/0x50
[88a2] cgroup_mount+0x482/0x670
[811e3769] mount_fs+0x39/0x1b0
[8120157b] vfs_kern_mount+0x6b/0x150
[8120422b] do_mount+0x22b/0xc20
[81204f62] SyS_mount+0xa2/0x110
[816fcc69] system_call_fastpath+0x12/0x17
[] 0x


 Thanks,
 Andrew
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] Documentation/video4linux: don't build without CONFIG_VIDEO_V4L2

2014-08-28 Thread Andrey Wagin
2014-08-29 0:42 GMT+04:00 Randy Dunlap :
> On 08/28/14 13:34, Andrey Vagin wrote:
>> Otherwise we get warnings:
>> WARNING: "vb2_ops_wait_finish" 
>> [Documentation//video4linux/v4l2-pci-skeleton.ko] undefined!
>> WARNING: "vb2_ops_wait_prepare" 
>> [Documentation//video4linux/v4l2-pci-skeleton.ko] undefined!
>> ...
>> WARNING: "video_unregister_device" 
>> [Documentation//video4linux/v4l2-pci-skeleton.ko] undefined!
>>
>> Fixes: 8db5ab4b50fb ("Documentation: add makefiles for more targets")
>>
>> Cc: Peter Foley 
>> Cc: Mauro Carvalho Chehab 
>> Cc: Randy Dunlap 
>> Signed-off-by: Andrey Vagin 
>> ---
>>  Documentation/video4linux/Makefile | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/Documentation/video4linux/Makefile 
>> b/Documentation/video4linux/Makefile
>> index d58101e..f19f38e 100644
>> --- a/Documentation/video4linux/Makefile
>> +++ b/Documentation/video4linux/Makefile
>> @@ -1 +1,3 @@
>> +ifneq ($(CONFIG_VIDEO_V4L2),)
>>  obj-m := v4l2-pci-skeleton.o
>> +endif
>>
>
> The Kconfig file for this module says:
>
> config VIDEO_PCI_SKELETON
> tristate "Skeleton PCI V4L2 driver"
> depends on PCI && BUILD_DOCSRC
> depends on VIDEO_V4L2 && VIDEOBUF2_CORE && VIDEOBUF2_MEMOPS
>
> so it should already be limited to VIDEO_V4L2 being enabled.
>
> What kernel or linux-next version did you see a problem with?

Eh, I'm late. It was fixed already

commit 81820f32ffaf393d9379c326d670257c63306a26
Author: Mark Brown 
Date:   Wed Aug 27 10:18:51 2014 +1000

v4l2-pci-skeleton: Only build if PCI is available

Sorry for the noise.

>
> Please send the failing .config file so that I can check it.
>
> Thanks.
>
> --
> ~Randy
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] Documentation/video4linux: don't build without CONFIG_VIDEO_V4L2

2014-08-28 Thread Andrey Wagin
2014-08-29 0:42 GMT+04:00 Randy Dunlap rdun...@infradead.org:
 On 08/28/14 13:34, Andrey Vagin wrote:
 Otherwise we get warnings:
 WARNING: vb2_ops_wait_finish 
 [Documentation//video4linux/v4l2-pci-skeleton.ko] undefined!
 WARNING: vb2_ops_wait_prepare 
 [Documentation//video4linux/v4l2-pci-skeleton.ko] undefined!
 ...
 WARNING: video_unregister_device 
 [Documentation//video4linux/v4l2-pci-skeleton.ko] undefined!

 Fixes: 8db5ab4b50fb (Documentation: add makefiles for more targets)

 Cc: Peter Foley pefol...@pefoley.com
 Cc: Mauro Carvalho Chehab m.che...@samsung.com
 Cc: Randy Dunlap rdun...@infradead.org
 Signed-off-by: Andrey Vagin ava...@openvz.org
 ---
  Documentation/video4linux/Makefile | 2 ++
  1 file changed, 2 insertions(+)

 diff --git a/Documentation/video4linux/Makefile 
 b/Documentation/video4linux/Makefile
 index d58101e..f19f38e 100644
 --- a/Documentation/video4linux/Makefile
 +++ b/Documentation/video4linux/Makefile
 @@ -1 +1,3 @@
 +ifneq ($(CONFIG_VIDEO_V4L2),)
  obj-m := v4l2-pci-skeleton.o
 +endif


 The Kconfig file for this module says:

 config VIDEO_PCI_SKELETON
 tristate Skeleton PCI V4L2 driver
 depends on PCI  BUILD_DOCSRC
 depends on VIDEO_V4L2  VIDEOBUF2_CORE  VIDEOBUF2_MEMOPS

 so it should already be limited to VIDEO_V4L2 being enabled.

 What kernel or linux-next version did you see a problem with?

Eh, I'm late. It was fixed already

commit 81820f32ffaf393d9379c326d670257c63306a26
Author: Mark Brown broo...@kernel.org
Date:   Wed Aug 27 10:18:51 2014 +1000

v4l2-pci-skeleton: Only build if PCI is available

Sorry for the noise.


 Please send the failing .config file so that I can check it.

 Thanks.

 --
 ~Randy
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/4] Documentation: add makefiles for more targets

2014-08-26 Thread Andrey Wagin
Hi Peter,

I have got following errors with this patch. I think
Documentation/video4linux should be built only if a config contains a
proper options set.

  Building modules, stage 2.
  MODPOST 73 modules
ERROR: "vb2_ops_wait_finish"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_ops_wait_prepare"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "v4l2_event_unsubscribe"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "v4l2_ctrl_subscribe_event"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "v4l2_ctrl_log_status"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_ioctl_streamoff"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_ioctl_streamon"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_ioctl_create_bufs"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_ioctl_dqbuf"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_ioctl_expbuf"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_ioctl_qbuf"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_ioctl_querybuf"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_ioctl_reqbufs"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_fop_release"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "v4l2_fh_open" [Documentation/video4linux/v4l2-pci-skeleton.ko]
undefined!
ERROR: "vb2_fop_mmap" [Documentation/video4linux/v4l2-pci-skeleton.ko]
undefined!
ERROR: "video_ioctl2" [Documentation/video4linux/v4l2-pci-skeleton.ko]
undefined!
ERROR: "vb2_fop_poll" [Documentation/video4linux/v4l2-pci-skeleton.ko]
undefined!
ERROR: "vb2_fop_read" [Documentation/video4linux/v4l2-pci-skeleton.ko]
undefined!
ERROR: "vb2_buffer_done"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "__video_register_device"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "video_device_release_empty"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_dma_contig_init_ctx"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_queue_init"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_dma_contig_memops"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "v4l2_ctrl_new_std"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "v4l2_ctrl_handler_init_class"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "v4l2_device_register"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
  HOSTCC  arch/x86/tools/insn_sanity
ERROR: "v4l2_match_dv_timings"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "v4l2_find_dv_timings_cap"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "v4l2_valid_dv_timings"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "v4l2_enum_dv_timings_cap"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "video_devdata"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "v4l2_device_unregister"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "vb2_dma_contig_cleanup_ctx"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "v4l2_ctrl_handler_free"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: "video_unregister_device"
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
make[1]: *** [__modpost] Error 1
make: *** [modules] Error 2
make: *** Waiting for unfinished jobs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/4] Documentation: add makefiles for more targets

2014-08-26 Thread Andrey Wagin
Hi Peter,

I have got following errors with this patch. I think
Documentation/video4linux should be built only if a config contains a
proper options set.

  Building modules, stage 2.
  MODPOST 73 modules
ERROR: vb2_ops_wait_finish
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_ops_wait_prepare
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: v4l2_event_unsubscribe
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: v4l2_ctrl_subscribe_event
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: v4l2_ctrl_log_status
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_ioctl_streamoff
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_ioctl_streamon
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_ioctl_create_bufs
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_ioctl_dqbuf
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_ioctl_expbuf
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_ioctl_qbuf
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_ioctl_querybuf
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_ioctl_reqbufs
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_fop_release
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: v4l2_fh_open [Documentation/video4linux/v4l2-pci-skeleton.ko]
undefined!
ERROR: vb2_fop_mmap [Documentation/video4linux/v4l2-pci-skeleton.ko]
undefined!
ERROR: video_ioctl2 [Documentation/video4linux/v4l2-pci-skeleton.ko]
undefined!
ERROR: vb2_fop_poll [Documentation/video4linux/v4l2-pci-skeleton.ko]
undefined!
ERROR: vb2_fop_read [Documentation/video4linux/v4l2-pci-skeleton.ko]
undefined!
ERROR: vb2_buffer_done
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: __video_register_device
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: video_device_release_empty
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_dma_contig_init_ctx
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_queue_init
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_dma_contig_memops
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: v4l2_ctrl_new_std
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: v4l2_ctrl_handler_init_class
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: v4l2_device_register
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
  HOSTCC  arch/x86/tools/insn_sanity
ERROR: v4l2_match_dv_timings
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: v4l2_find_dv_timings_cap
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: v4l2_valid_dv_timings
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: v4l2_enum_dv_timings_cap
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: video_devdata
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: v4l2_device_unregister
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: vb2_dma_contig_cleanup_ctx
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: v4l2_ctrl_handler_free
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
ERROR: video_unregister_device
[Documentation/video4linux/v4l2-pci-skeleton.ko] undefined!
make[1]: *** [__modpost] Error 1
make: *** [modules] Error 2
make: *** Waiting for unfinished jobs
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH driver-core-linus] kernfs, sysfs, cgroup: restrict extra perm check on open to sysfs

2014-05-12 Thread Andrey Wagin
On Mon, May 12, 2014 at 01:56:27PM -0400, Tejun Heo wrote:
> The kernfs open method - kernfs_fop_open() - inherited extra
> permission checks from sysfs.  While the vfs layer allows ignoring the
> read/write permissions checks if the issuer has CAP_DAC_OVERRIDE,
> sysfs explicitly denied open regardless of the cap if the file doesn't
> have any of the UGO perms of the requested access or doesn't implement
> the requested operation.  It can be debated whether this was a good
> idea or not but the behavior is too subtle and dangerous to change at
> this point.
> 
> After cgroup got converted to kernfs, this extra perm check also got
> applied to cgroup breaking libcgroup which opens write-only files with
> O_RDWR as root.  This patch gates the extra open permission check with
> a new flag KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK and enables it for sysfs.
> For sysfs, nothing changes.  For cgroup, root now can perform any
> operation regardless of the permissions as it was before kernfs
> conversion.  Note that kernfs still fails unimplemented operations
> with -EINVAL.
> 
> While at it, add comments explaining KERNFS_ROOT flags.
>

It works for me. Thank you for the quick response.

Tested-by: Andrey Wagin 

> Signed-off-by: Tejun Heo 
> Reported-by: Andrey Wagin 
> Cc: Li Zefan 
> References: 
> http://lkml.kernel.org/g/CANaxB-xUm3rJ-Cbp72q-rQJO5mZe1qK6qXsQM=vh0u8upj4...@mail.gmail.com
> Fixes: 2bd59d48ebfb ("cgroup: convert to kernfs")
> ---
>  fs/kernfs/file.c   |   19 +++
>  fs/sysfs/mount.c   |3 ++-
>  include/linux/kernfs.h |   19 ++-
>  3 files changed, 31 insertions(+), 10 deletions(-)
> 
> --- a/fs/kernfs/file.c
> +++ b/fs/kernfs/file.c
> @@ -610,6 +610,7 @@ static void kernfs_put_open_node(struct
>  static int kernfs_fop_open(struct inode *inode, struct file *file)
>  {
>   struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
> + struct kernfs_root *root = kernfs_root(kn);
>   const struct kernfs_ops *ops;
>   struct kernfs_open_file *of;
>   bool has_read, has_write, has_mmap;
> @@ -624,14 +625,16 @@ static int kernfs_fop_open(struct inode
>   has_write = ops->write || ops->mmap;
>   has_mmap = ops->mmap;
>  
> - /* check perms and supported operations */
> - if ((file->f_mode & FMODE_WRITE) &&
> - (!(inode->i_mode & S_IWUGO) || !has_write))
> - goto err_out;
> -
> - if ((file->f_mode & FMODE_READ) &&
> - (!(inode->i_mode & S_IRUGO) || !has_read))
> - goto err_out;
> + /* see the flag definition for details */
> + if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) {
> + if ((file->f_mode & FMODE_WRITE) &&
> + (!(inode->i_mode & S_IWUGO) || !has_write))
> + goto err_out;
> +
> + if ((file->f_mode & FMODE_READ) &&
> + (!(inode->i_mode & S_IRUGO) || !has_read))
> + goto err_out;
> + }
>  
>   /* allocate a kernfs_open_file for the file */
>   error = -ENOMEM;
> --- a/fs/sysfs/mount.c
> +++ b/fs/sysfs/mount.c
> @@ -63,7 +63,8 @@ int __init sysfs_init(void)
>  {
>   int err;
>  
> - sysfs_root = kernfs_create_root(NULL, 0, NULL);
> + sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
> + NULL);
>   if (IS_ERR(sysfs_root))
>   return PTR_ERR(sysfs_root);
>  
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -50,7 +50,24 @@ enum kernfs_node_flag {
>  
>  /* @flags for kernfs_create_root() */
>  enum kernfs_root_flag {
> - KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001,
> + /*
> +  * kernfs_nodes are created in the deactivated state and invisible.
> +  * They require explicit kernfs_activate() to become visible.  This
> +  * can be used to make related nodes become visible atomically
> +  * after all nodes are created successfully.
> +  */
> + KERNFS_ROOT_CREATE_DEACTIVATED  = 0x0001,
> +
> + /*
> +  * For regular flies, if the opener has CAP_DAC_OVERRIDE, open(2)
> +  * succeeds regardless of the RW permissions.  sysfs had an extra
> +  * layer of enforcement where open(2) fails with -EACCES regardless
> +  * of CAP_DAC_OVERRIDE if the permission doesn't have the
> +  * respective read or write access at all (none of S_IRUGO or
> +  * S_IWUGO) or the respective operation isn't implemented.  The
> +  * following flag enables that behavior.
> +  */
> + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK   = 0x0002,
>  };
>  
>  /* type-specific structures for kernfs_node union members */
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH driver-core-linus] kernfs, sysfs, cgroup: restrict extra perm check on open to sysfs

2014-05-12 Thread Andrey Wagin
On Mon, May 12, 2014 at 01:56:27PM -0400, Tejun Heo wrote:
 The kernfs open method - kernfs_fop_open() - inherited extra
 permission checks from sysfs.  While the vfs layer allows ignoring the
 read/write permissions checks if the issuer has CAP_DAC_OVERRIDE,
 sysfs explicitly denied open regardless of the cap if the file doesn't
 have any of the UGO perms of the requested access or doesn't implement
 the requested operation.  It can be debated whether this was a good
 idea or not but the behavior is too subtle and dangerous to change at
 this point.
 
 After cgroup got converted to kernfs, this extra perm check also got
 applied to cgroup breaking libcgroup which opens write-only files with
 O_RDWR as root.  This patch gates the extra open permission check with
 a new flag KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK and enables it for sysfs.
 For sysfs, nothing changes.  For cgroup, root now can perform any
 operation regardless of the permissions as it was before kernfs
 conversion.  Note that kernfs still fails unimplemented operations
 with -EINVAL.
 
 While at it, add comments explaining KERNFS_ROOT flags.


It works for me. Thank you for the quick response.

Tested-by: Andrey Wagin ava...@gmail.com

 Signed-off-by: Tejun Heo t...@kernel.org
 Reported-by: Andrey Wagin ava...@gmail.com
 Cc: Li Zefan lize...@huawei.com
 References: 
 http://lkml.kernel.org/g/CANaxB-xUm3rJ-Cbp72q-rQJO5mZe1qK6qXsQM=vh0u8upj4...@mail.gmail.com
 Fixes: 2bd59d48ebfb (cgroup: convert to kernfs)
 ---
  fs/kernfs/file.c   |   19 +++
  fs/sysfs/mount.c   |3 ++-
  include/linux/kernfs.h |   19 ++-
  3 files changed, 31 insertions(+), 10 deletions(-)
 
 --- a/fs/kernfs/file.c
 +++ b/fs/kernfs/file.c
 @@ -610,6 +610,7 @@ static void kernfs_put_open_node(struct
  static int kernfs_fop_open(struct inode *inode, struct file *file)
  {
   struct kernfs_node *kn = file-f_path.dentry-d_fsdata;
 + struct kernfs_root *root = kernfs_root(kn);
   const struct kernfs_ops *ops;
   struct kernfs_open_file *of;
   bool has_read, has_write, has_mmap;
 @@ -624,14 +625,16 @@ static int kernfs_fop_open(struct inode
   has_write = ops-write || ops-mmap;
   has_mmap = ops-mmap;
  
 - /* check perms and supported operations */
 - if ((file-f_mode  FMODE_WRITE) 
 - (!(inode-i_mode  S_IWUGO) || !has_write))
 - goto err_out;
 -
 - if ((file-f_mode  FMODE_READ) 
 - (!(inode-i_mode  S_IRUGO) || !has_read))
 - goto err_out;
 + /* see the flag definition for details */
 + if (root-flags  KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) {
 + if ((file-f_mode  FMODE_WRITE) 
 + (!(inode-i_mode  S_IWUGO) || !has_write))
 + goto err_out;
 +
 + if ((file-f_mode  FMODE_READ) 
 + (!(inode-i_mode  S_IRUGO) || !has_read))
 + goto err_out;
 + }
  
   /* allocate a kernfs_open_file for the file */
   error = -ENOMEM;
 --- a/fs/sysfs/mount.c
 +++ b/fs/sysfs/mount.c
 @@ -63,7 +63,8 @@ int __init sysfs_init(void)
  {
   int err;
  
 - sysfs_root = kernfs_create_root(NULL, 0, NULL);
 + sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
 + NULL);
   if (IS_ERR(sysfs_root))
   return PTR_ERR(sysfs_root);
  
 --- a/include/linux/kernfs.h
 +++ b/include/linux/kernfs.h
 @@ -50,7 +50,24 @@ enum kernfs_node_flag {
  
  /* @flags for kernfs_create_root() */
  enum kernfs_root_flag {
 - KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001,
 + /*
 +  * kernfs_nodes are created in the deactivated state and invisible.
 +  * They require explicit kernfs_activate() to become visible.  This
 +  * can be used to make related nodes become visible atomically
 +  * after all nodes are created successfully.
 +  */
 + KERNFS_ROOT_CREATE_DEACTIVATED  = 0x0001,
 +
 + /*
 +  * For regular flies, if the opener has CAP_DAC_OVERRIDE, open(2)
 +  * succeeds regardless of the RW permissions.  sysfs had an extra
 +  * layer of enforcement where open(2) fails with -EACCES regardless
 +  * of CAP_DAC_OVERRIDE if the permission doesn't have the
 +  * respective read or write access at all (none of S_IRUGO or
 +  * S_IWUGO) or the respective operation isn't implemented.  The
 +  * following flag enables that behavior.
 +  */
 + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK   = 0x0002,
  };
  
  /* type-specific structures for kernfs_node union members */
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [rfc 2/2] docs: procfs -- Document timerfd output

2014-03-31 Thread Andrey Wagin
2014-03-31 21:54 GMT+04:00 Cyrill Gorcunov :
> CC: Shawn Landden 
> CC: Thomas Gleixner 
> CC: Andrew Morton 
> CC: Andrey Vagin 
> CC: Pavel Emelyanov 
> Signed-off-by: Cyrill Gorcunov 
> ---
>  Documentation/filesystems/proc.txt |9 +
>  1 file changed, 9 insertions(+)
>
> Index: linux-2.6.git/Documentation/filesystems/proc.txt
> ===
> --- linux-2.6.git.orig/Documentation/filesystems/proc.txt
> +++ linux-2.6.git/Documentation/filesystems/proc.txt
> @@ -1741,6 +1741,15 @@ pair provide additional information part
> While the first three lines are mandatory and always printed, the 
> rest is
> optional and may be omitted if no marks created yet.
>
> +   Timerfd files
> +   ~
> +   pos:0
> +   flags:  02
> +   mnt_id: 9
> +   clockid: 0 ticks: 0

I would prefer to print "tick" on a separate line.

And we can print the current setting of the timer here. It can be
useful not only for CRIU but for other users too.

it_value: (5, 149234)
it_interval: (10, 0)

> +
> +   where 'clockid' is the clock type and 'ticks' is the number of the 
> timer expirations
> +   that have occurred [see timerfd_create(2) for details].
>
>  
> --
>  Configuring procfs
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [rfc 2/2] docs: procfs -- Document timerfd output

2014-03-31 Thread Andrey Wagin
2014-03-31 21:54 GMT+04:00 Cyrill Gorcunov gorcu...@openvz.org:
 CC: Shawn Landden sh...@churchofgit.com
 CC: Thomas Gleixner t...@linutronix.de
 CC: Andrew Morton a...@linux-foundation.org
 CC: Andrey Vagin ava...@openvz.org
 CC: Pavel Emelyanov xe...@parallels.com
 Signed-off-by: Cyrill Gorcunov gorcu...@openvz.org
 ---
  Documentation/filesystems/proc.txt |9 +
  1 file changed, 9 insertions(+)

 Index: linux-2.6.git/Documentation/filesystems/proc.txt
 ===
 --- linux-2.6.git.orig/Documentation/filesystems/proc.txt
 +++ linux-2.6.git/Documentation/filesystems/proc.txt
 @@ -1741,6 +1741,15 @@ pair provide additional information part
 While the first three lines are mandatory and always printed, the 
 rest is
 optional and may be omitted if no marks created yet.

 +   Timerfd files
 +   ~
 +   pos:0
 +   flags:  02
 +   mnt_id: 9
 +   clockid: 0 ticks: 0

I would prefer to print tick on a separate line.

And we can print the current setting of the timer here. It can be
useful not only for CRIU but for other users too.

it_value: (5, 149234)
it_interval: (10, 0)

 +
 +   where 'clockid' is the clock type and 'ticks' is the number of the 
 timer expirations
 +   that have occurred [see timerfd_create(2) for details].

  
 --
  Configuring procfs

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [CRIU] [PATCH 1/3] prctl: reduce permissions to change boundaries of data, brk and stack

2014-02-14 Thread Andrey Wagin
2014-02-14 23:16 GMT+04:00 Eric W. Biederman :
> Cyrill Gorcunov  writes:
>
>> On Fri, Feb 14, 2014 at 09:43:14PM +0400, Andrew Vagin wrote:
>>> > My brain hurts just looking at this patch and how you are justifying it.
>>> >
>>> > For the resources you are mucking with below all you have to do is to
>>> > verify that you are below the appropriate rlimit at all times and no
>>> > CAP_SYS_RESOURCE check is needed.  You only need CAP_SYS_RESOURCE
>>> > to exceed your per process limits.
>>> >
>>> > All you have to do is to fix the current code to properly enforce the
>>> > limits.
>>>
>>> I'm afraid what you are suggesting doesn't work.
>>>
>>> The first reason is that we can not change both boundaries in one call.
>>> But when we are restoring these attributes, we may need to move their
>>> too far.
>>
>> When this code was introduced, there were no user-namespace implementation,
>> if I remember correctly, so CAP_SYS_RESOURCE was enough barrier point
>> to prevent modifying this values by anyone. Now user-ns brings a limit --
>> we need somehow to provide a way to modify these mm fields having no
>> CAP_SYS_RESOURCE set. "Verifying rlimit" not an option here because
>> we're modifying members one by one (looking back I think this was not
>> a good idea to modify the fields in this manner).
>>
>> Maybe we could improve this api and provide argument as a pointer
>> to a structure, which would have all the fields we're going to
>> modify, which in turn would allow us to verify that all new values
>> are sane and fit rlimits, then we could (probably) deprecate old
>> api if noone except c/r camp is using it (I actually can't imagine
>> who else might need this api). Then CAP_SYS_RESOURCE requirement
>> could be ripped off. Hm? (sure touching api is always "no-no"
>> case, but maybe...)
>
> Hmm.  Let me rewind this a little bit.
>
> I want to be very stupid and ask the following.
>
> Why can't you have the process of interest do:
> ptrace(PTRACE_ATTACHME);
> execve(executable, args, ...);
>
> /* Have the ptracer inject the recovery/fixup code */
> /* Fix up the mostly correct process to look like it has been
>  * executing for a while.
>  */
>
> That should work, set all of the interesting fields, and works as
> non-root today.  My gut feel says do that and we can just
> deprecate/remove prctl_set_mm.

start_brk and start_stack are randomized each time. I don't understand
how execve() can restore the origin values of attributes.

>
> I am hoping we can move this conversation what makes sense from oh ick
> checkpoint/restort does not work with user namespaces.
>
> Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [CRIU] [PATCH 1/3] prctl: reduce permissions to change boundaries of data, brk and stack

2014-02-14 Thread Andrey Wagin
2014-02-14 23:16 GMT+04:00 Eric W. Biederman ebied...@xmission.com:
 Cyrill Gorcunov gorcu...@gmail.com writes:

 On Fri, Feb 14, 2014 at 09:43:14PM +0400, Andrew Vagin wrote:
  My brain hurts just looking at this patch and how you are justifying it.
 
  For the resources you are mucking with below all you have to do is to
  verify that you are below the appropriate rlimit at all times and no
  CAP_SYS_RESOURCE check is needed.  You only need CAP_SYS_RESOURCE
  to exceed your per process limits.
 
  All you have to do is to fix the current code to properly enforce the
  limits.

 I'm afraid what you are suggesting doesn't work.

 The first reason is that we can not change both boundaries in one call.
 But when we are restoring these attributes, we may need to move their
 too far.

 When this code was introduced, there were no user-namespace implementation,
 if I remember correctly, so CAP_SYS_RESOURCE was enough barrier point
 to prevent modifying this values by anyone. Now user-ns brings a limit --
 we need somehow to provide a way to modify these mm fields having no
 CAP_SYS_RESOURCE set. Verifying rlimit not an option here because
 we're modifying members one by one (looking back I think this was not
 a good idea to modify the fields in this manner).

 Maybe we could improve this api and provide argument as a pointer
 to a structure, which would have all the fields we're going to
 modify, which in turn would allow us to verify that all new values
 are sane and fit rlimits, then we could (probably) deprecate old
 api if noone except c/r camp is using it (I actually can't imagine
 who else might need this api). Then CAP_SYS_RESOURCE requirement
 could be ripped off. Hm? (sure touching api is always no-no
 case, but maybe...)

 Hmm.  Let me rewind this a little bit.

 I want to be very stupid and ask the following.

 Why can't you have the process of interest do:
 ptrace(PTRACE_ATTACHME);
 execve(executable, args, ...);

 /* Have the ptracer inject the recovery/fixup code */
 /* Fix up the mostly correct process to look like it has been
  * executing for a while.
  */

 That should work, set all of the interesting fields, and works as
 non-root today.  My gut feel says do that and we can just
 deprecate/remove prctl_set_mm.

start_brk and start_stack are randomized each time. I don't understand
how execve() can restore the origin values of attributes.


 I am hoping we can move this conversation what makes sense from oh ick
 checkpoint/restort does not work with user namespaces.

 Eric
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get (v3)

2014-01-14 Thread Andrey Wagin
>
> Eh, looks like this path is incomplete too:(
>
> I think we can't set a reference counter for objects which is allocated
> from a SLAB_DESTROY_BY_RCU cache. Look at the following backtrace.
>
> cpu1cpu2
> ct = nf_conntrack_find()
> destroy_conntrack
> atomic_inc_not_zero(ct)

ct->ct_general.use is zero after destroy_conntrack(). Sorry for the noise.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get (v3)

2014-01-14 Thread Andrey Wagin

 Eh, looks like this path is incomplete too:(

 I think we can't set a reference counter for objects which is allocated
 from a SLAB_DESTROY_BY_RCU cache. Look at the following backtrace.

 cpu1cpu2
 ct = nf_conntrack_find()
 destroy_conntrack
 atomic_inc_not_zero(ct)

ct-ct_general.use is zero after destroy_conntrack(). Sorry for the noise.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get

2014-01-09 Thread Andrey Wagin
2014/1/9 Eric Dumazet :
> On Thu, 2014-01-09 at 09:24 +0400, Andrew Vagin wrote:
>
>> I have one more question. Looks like I found another problem.
>>
>> init_conntrack:
>> hlist_nulls_add_head_rcu(>tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
>>   >ct.unconfirmed);
>>
>> __nf_conntrack_hash_insert:
>>   hlist_nulls_add_head_rcu(>tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
>>  >ct.hash[hash]);
>>
>> We use one hlist_nulls_node to add a conntrack into two different lists.
>> As far as I understand, it's unacceptable in case of
>> SLAB_DESTROY_BY_RCU.
>
> I guess you missed :
>
> net/netfilter/nf_conntrack_core.c:1598: 
> INIT_HLIST_NULLS_HEAD(>ct.unconfirmed, UNCONFIRMED_NULLS_VAL);

but we can look up something suitable and return this value, but it
will be unconfirmed. Ok, I see. This situation is fixed by this patch
too.

I don't understand the result of your discussion with Florian.  Here
are a few states of conntracts: it can be used and it's initialized.
The sign of the first state is non-zero refcnt and the sign of the
second state is the confirmed bit.

In the first state conntrack is attached to skb and inserted in the
unconfirmed list. In this state the use count can be incremented in
ctnetlink_dump_list() and skb_clone().

In the second state conntrack may be attached to a few skb-s and
inserted to net->ct.hash.

I have read all emails again and I can't understand when this patch
doesn't work.  Maybe you could give a sequence of actions? Thanks.

>
>
>>
>> Lets imagine that we have two threads. The first one enumerates objects
>> from a first list, the second one recreates an object and insert it in a
>> second list.  If a first thread in this moment stays on the object, it
>> can read "next", when it's in the second list, so it will continue
>> to enumerate objects from the second list. It is not what we want, isn't
>> it?
>>
>> cpu1  cpu2
>>   hlist_nulls_for_each_entry_rcu(ct)
>> destroy_conntrack
>>  kmem_cache_free
>>
>> init_conntrack
>>  hlist_nulls_add_head_rcu
>>   ct = ct->next
>>
>
> This will be fine.
>
> I think we even have a counter to count number of occurrence of this
> rare event. (I personally never read a non null search_restart value)
>
>  NF_CT_STAT_INC(net, search_restart);

Thank you for explanation.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get

2014-01-09 Thread Andrey Wagin
2014/1/9 Eric Dumazet eric.duma...@gmail.com:
 On Thu, 2014-01-09 at 09:24 +0400, Andrew Vagin wrote:

 I have one more question. Looks like I found another problem.

 init_conntrack:
 hlist_nulls_add_head_rcu(ct-tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
   net-ct.unconfirmed);

 __nf_conntrack_hash_insert:
   hlist_nulls_add_head_rcu(ct-tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
  net-ct.hash[hash]);

 We use one hlist_nulls_node to add a conntrack into two different lists.
 As far as I understand, it's unacceptable in case of
 SLAB_DESTROY_BY_RCU.

 I guess you missed :

 net/netfilter/nf_conntrack_core.c:1598: 
 INIT_HLIST_NULLS_HEAD(net-ct.unconfirmed, UNCONFIRMED_NULLS_VAL);

but we can look up something suitable and return this value, but it
will be unconfirmed. Ok, I see. This situation is fixed by this patch
too.

I don't understand the result of your discussion with Florian.  Here
are a few states of conntracts: it can be used and it's initialized.
The sign of the first state is non-zero refcnt and the sign of the
second state is the confirmed bit.

In the first state conntrack is attached to skb and inserted in the
unconfirmed list. In this state the use count can be incremented in
ctnetlink_dump_list() and skb_clone().

In the second state conntrack may be attached to a few skb-s and
inserted to net-ct.hash.

I have read all emails again and I can't understand when this patch
doesn't work.  Maybe you could give a sequence of actions? Thanks.




 Lets imagine that we have two threads. The first one enumerates objects
 from a first list, the second one recreates an object and insert it in a
 second list.  If a first thread in this moment stays on the object, it
 can read next, when it's in the second list, so it will continue
 to enumerate objects from the second list. It is not what we want, isn't
 it?

 cpu1  cpu2
   hlist_nulls_for_each_entry_rcu(ct)
 destroy_conntrack
  kmem_cache_free

 init_conntrack
  hlist_nulls_add_head_rcu
   ct = ct-next


 This will be fine.

 I think we even have a counter to count number of occurrence of this
 rare event. (I personally never read a non null search_restart value)

  NF_CT_STAT_INC(net, search_restart);

Thank you for explanation.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] netfilter: nf_conntrack: release conntrack from rcu callback

2014-01-07 Thread Andrey Wagin
Hi Florian,

2014/1/7 Florian Westphal :
> Andrew Vagin  wrote:

>> > ct = nf_ct_tuplehash_to_ctrack(h);
>> > if (unlikely(nf_ct_is_dying(ct) ||
>> >  !atomic_inc_not_zero(>ct_general.use)))
>> > // which means we should hit this path (0 ref).
>> > h = NULL;
>> > else {
>> > // otherwise, it cannot go away from under us, since
>> > // we own a reference now.
>> > if (unlikely(!nf_ct_tuple_equal(tuple, >tuple) 
>> > ||
>> >  nf_ct_zone(ct) != zone)) {
>
> Perhaps this needs additional !nf_ct_is_confirmed()?

Yes, it think it must help. Thank you for the comments. I resent this patch:
[PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get

>
> It would cover your case (found a recycled element that has been put on
> the unconfirmed list (refcnt already set to 1, ct->tuple is set) on another 
> cpu,
> extensions possibly not yet fully initialised), and the same tuple).
>
> Regards,
> Florian

Thanks,
Andrey
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] netfilter: nf_conntrack: release conntrack from rcu callback

2014-01-07 Thread Andrey Wagin
Hi Florian,

2014/1/7 Florian Westphal f...@strlen.de:
 Andrew Vagin ava...@gmail.com wrote:

  ct = nf_ct_tuplehash_to_ctrack(h);
  if (unlikely(nf_ct_is_dying(ct) ||
   !atomic_inc_not_zero(ct-ct_general.use)))
  // which means we should hit this path (0 ref).
  h = NULL;
  else {
  // otherwise, it cannot go away from under us, since
  // we own a reference now.
  if (unlikely(!nf_ct_tuple_equal(tuple, h-tuple) 
  ||
   nf_ct_zone(ct) != zone)) {

 Perhaps this needs additional !nf_ct_is_confirmed()?

Yes, it think it must help. Thank you for the comments. I resent this patch:
[PATCH] netfilter: nf_conntrack: fix RCU race in nf_conntrack_find_get


 It would cover your case (found a recycled element that has been put on
 the unconfirmed list (refcnt already set to 1, ct-tuple is set) on another 
 cpu,
 extensions possibly not yet fully initialised), and the same tuple).

 Regards,
 Florian

Thanks,
Andrey
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [CRIU] [PATCH] timerfd: show procfs fdinfo helper

2013-12-25 Thread Andrey Wagin
2013/12/24 Shawn Landden :
> | pos:  0
> | flags:02004002
> | clockid:  0
>
> Cc: Thomas Gleixner 
> Cc: Alexander Viro 
> Signed-off-by: Shawn Landden 
> ---
>  fs/timerfd.c | 17 +
>  1 file changed, 17 insertions(+)
>
> diff --git a/fs/timerfd.c b/fs/timerfd.c
> index 9293121..e5fa587 100644
> --- a/fs/timerfd.c
> +++ b/fs/timerfd.c
> @@ -25,6 +25,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  struct timerfd_ctx {
> union {
> @@ -284,7 +285,23 @@ static ssize_t timerfd_read(struct file *file, char 
> __user *buf, size_t count,
> return res;
>  }
>
> +#ifdef CONFIG_PROC_FS
> +static int timerfd_show_fdinfo(struct seq_file *m, struct file *f)
> +{
> +   struct timerfd_ctx *ctx = f->private_data;
> +   int clockid;
> +
> +   clockid = ctx->clockid;
> +   seq_printf(m, "clockid:\t%d\n", clockid);

I think we can show ctx->ticks, itimerspec here. The ctx->ticks is
required for proper dumping and restoring timerfd.

> +
> +   return 0;
> +}
> +#endif
> +
>  static const struct file_operations timerfd_fops = {
> +#ifdef CONFIG_PROC_FS
> +   .show_fdinfo= timerfd_show_fdinfo,
> +#endif
> .release= timerfd_release,
> .poll   = timerfd_poll,
> .read   = timerfd_read,
> --
> 1.8.5.2.297.g3e57c29
>
> ___
> CRIU mailing list
> c...@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [CRIU] [PATCH] timerfd: show procfs fdinfo helper

2013-12-25 Thread Andrey Wagin
2013/12/24 Shawn Landden sh...@churchofgit.com:
 | pos:  0
 | flags:02004002
 | clockid:  0

 Cc: Thomas Gleixner t...@linutronix.de
 Cc: Alexander Viro v...@zeniv.linux.org.uk
 Signed-off-by: Shawn Landden sh...@churchofgit.com
 ---
  fs/timerfd.c | 17 +
  1 file changed, 17 insertions(+)

 diff --git a/fs/timerfd.c b/fs/timerfd.c
 index 9293121..e5fa587 100644
 --- a/fs/timerfd.c
 +++ b/fs/timerfd.c
 @@ -25,6 +25,7 @@
  #include linux/syscalls.h
  #include linux/compat.h
  #include linux/rcupdate.h
 +#include linux/seq_file.h

  struct timerfd_ctx {
 union {
 @@ -284,7 +285,23 @@ static ssize_t timerfd_read(struct file *file, char 
 __user *buf, size_t count,
 return res;
  }

 +#ifdef CONFIG_PROC_FS
 +static int timerfd_show_fdinfo(struct seq_file *m, struct file *f)
 +{
 +   struct timerfd_ctx *ctx = f-private_data;
 +   int clockid;
 +
 +   clockid = ctx-clockid;
 +   seq_printf(m, clockid:\t%d\n, clockid);

I think we can show ctx-ticks, itimerspec here. The ctx-ticks is
required for proper dumping and restoring timerfd.

 +
 +   return 0;
 +}
 +#endif
 +
  static const struct file_operations timerfd_fops = {
 +#ifdef CONFIG_PROC_FS
 +   .show_fdinfo= timerfd_show_fdinfo,
 +#endif
 .release= timerfd_release,
 .poll   = timerfd_poll,
 .read   = timerfd_read,
 --
 1.8.5.2.297.g3e57c29

 ___
 CRIU mailing list
 c...@openvz.org
 https://lists.openvz.org/mailman/listinfo/criu
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] thp: move preallocated PTE page table on move_huge_pmd()

2013-12-04 Thread Andrey Wagin
2013/12/4 Kirill A. Shutemov :
> Andrey Wagin reported crash on VM_BUG_ON() in pgtable_pmd_page_dtor()
> with fallowing backtrace:
>
>   [] free_pgd_range+0x2bf/0x410
>   [] free_pgtables+0xce/0x120
>   [] unmap_region+0xe0/0x120
>   [] ? move_page_tables+0x526/0x6b0
>   [] do_munmap+0x249/0x360
>   [] move_vma+0x144/0x270
>   [] SyS_mremap+0x3b9/0x510
>   [] system_call_fastpath+0x16/0x1b
>
> The crash can be reproduce with this test case:
>
>   #define _GNU_SOURCE
>   #include 
>   #include 
>   #include 
>
>   #define MB (1024 * 1024UL)
>   #define GB (1024 * MB)
>
>   int main(int argc, char **argv)
>   {
> char *p;
> int i;
>
> p = mmap((void *) GB, 10 * MB, PROT_READ | PROT_WRITE,
> MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
> for (i = 0; i < 10 * MB; i += 4096)
> p[i] = 1;
> mremap(p, 10 * MB, 10 * MB, MREMAP_FIXED | MREMAP_MAYMOVE, 2 * GB);
> return 0;
>   }
>
> Due to split PMD lock, we now store preallocated PTE tables for THP
> pages per-PMD table.  It means we need to move them to other PMD table
> if huge PMD moved there.
>
> Signed-off-by: Kirill A. Shutemov 
> Reported-by: Andrey Vagin 

My tests were working for the night without any problem.  Thanks for
the quick response.

Tested-by: Andrey Vagin 

> ---
>  mm/huge_memory.c | 12 +++-
>  1 file changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index bccd5a628ea6..33a5dc492810 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1481,8 +1481,18 @@ int move_huge_pmd(struct vm_area_struct *vma, struct 
> vm_area_struct *new_vma,
> pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
> VM_BUG_ON(!pmd_none(*new_pmd));
> set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
> -   if (new_ptl != old_ptl)
> +   if (new_ptl != old_ptl) {
> +   pgtable_t pgtable;
> +
> +   /*
> +* Move preallocated PTE page table if new_pmd is on
> +* different PMD page table.
> +*/
> +   pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
> +   pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
> +
> spin_unlock(new_ptl);
> +   }
> spin_unlock(old_ptl);
> }
>  out:
> --
> 1.8.4.4
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] thp: move preallocated PTE page table on move_huge_pmd()

2013-12-04 Thread Andrey Wagin
2013/12/4 Kirill A. Shutemov kirill.shute...@linux.intel.com:
 Andrey Wagin reported crash on VM_BUG_ON() in pgtable_pmd_page_dtor()
 with fallowing backtrace:

   [8119427f] free_pgd_range+0x2bf/0x410
   [8119449e] free_pgtables+0xce/0x120
   [8119b900] unmap_region+0xe0/0x120
   [811a0036] ? move_page_tables+0x526/0x6b0
   [8119d6a9] do_munmap+0x249/0x360
   [811a0304] move_vma+0x144/0x270
   [811a07e9] SyS_mremap+0x3b9/0x510
   [8172d512] system_call_fastpath+0x16/0x1b

 The crash can be reproduce with this test case:

   #define _GNU_SOURCE
   #include sys/mman.h
   #include stdio.h
   #include unistd.h

   #define MB (1024 * 1024UL)
   #define GB (1024 * MB)

   int main(int argc, char **argv)
   {
 char *p;
 int i;

 p = mmap((void *) GB, 10 * MB, PROT_READ | PROT_WRITE,
 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
 for (i = 0; i  10 * MB; i += 4096)
 p[i] = 1;
 mremap(p, 10 * MB, 10 * MB, MREMAP_FIXED | MREMAP_MAYMOVE, 2 * GB);
 return 0;
   }

 Due to split PMD lock, we now store preallocated PTE tables for THP
 pages per-PMD table.  It means we need to move them to other PMD table
 if huge PMD moved there.

 Signed-off-by: Kirill A. Shutemov kirill.shute...@linux.intel.com
 Reported-by: Andrey Vagin ava...@openvz.org

My tests were working for the night without any problem.  Thanks for
the quick response.

Tested-by: Andrey Vagin ava...@openvz.org

 ---
  mm/huge_memory.c | 12 +++-
  1 file changed, 11 insertions(+), 1 deletion(-)

 diff --git a/mm/huge_memory.c b/mm/huge_memory.c
 index bccd5a628ea6..33a5dc492810 100644
 --- a/mm/huge_memory.c
 +++ b/mm/huge_memory.c
 @@ -1481,8 +1481,18 @@ int move_huge_pmd(struct vm_area_struct *vma, struct 
 vm_area_struct *new_vma,
 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
 VM_BUG_ON(!pmd_none(*new_pmd));
 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
 -   if (new_ptl != old_ptl)
 +   if (new_ptl != old_ptl) {
 +   pgtable_t pgtable;
 +
 +   /*
 +* Move preallocated PTE page table if new_pmd is on
 +* different PMD page table.
 +*/
 +   pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
 +   pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
 +
 spin_unlock(new_ptl);
 +   }
 spin_unlock(old_ptl);
 }
  out:
 --
 1.8.4.4

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 34/34] mm: dynamically allocate page->ptl if it cannot be embedded to struct page

2013-11-20 Thread Andrey Wagin
2013/11/20 Kirill A. Shutemov :
> Andrey Wagin wrote:
>> 2013/11/20 Kirill A. Shutemov :
>> > Andrey Wagin wrote:
>> >> Hi Kirill,
>> >>
>> >> Looks like this patch adds memory leaks.
>> >> [  116.188310] kmemleak: 15672 new suspected memory leaks (see
>> >> /sys/kernel/debug/kmemleak)
>> >> unreferenced object 0x8800da45a350 (size 96):
>> >>   comm "dracut-initqueu", pid 93, jiffies 4294671391 (age 362.277s)
>> >>   hex dump (first 32 bytes):
>> >> 07 00 07 00 ad 4e ad de ff ff ff ff 6b 6b 6b 6b  .N..
>> >> ff ff ff ff ff ff ff ff 80 24 b4 82 ff ff ff ff  .$..
>> >>   backtrace:
>> >> [] kmemleak_alloc+0x5e/0xc0
>> >> [] kmem_cache_alloc_trace+0x113/0x290
>> >> [] __ptlock_alloc+0x27/0x50
>> >> [] __pmd_alloc+0x59/0x170
>> >> [] copy_page_range+0x38a/0x3e0
>> >> [] dup_mm+0x313/0x540
>> >> [] copy_process+0x161a/0x1880
>> >> [] do_fork+0x8b/0x360
>> >> [] SyS_clone+0x16/0x20
>> >> [] stub_clone+0x69/0x90
>> >> [] 0x
>> >>
>> >> It's quite serious, because my test host went to panic in a few hours.
>> >
>> > Sorry for that.
>> >
>> > Could you test patch below.
>>
>> Yes, it works.
>>
>> I found this too a few minutes ago:)
>
> Nice
>
> Tested-by ?

Tested-by: Andrey Vagin 

>
> --
>  Kirill A. Shutemov
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 34/34] mm: dynamically allocate page->ptl if it cannot be embedded to struct page

2013-11-20 Thread Andrey Wagin
2013/11/20 Kirill A. Shutemov :
> Andrey Wagin wrote:
>> Hi Kirill,
>>
>> Looks like this patch adds memory leaks.
>> [  116.188310] kmemleak: 15672 new suspected memory leaks (see
>> /sys/kernel/debug/kmemleak)
>> unreferenced object 0x8800da45a350 (size 96):
>>   comm "dracut-initqueu", pid 93, jiffies 4294671391 (age 362.277s)
>>   hex dump (first 32 bytes):
>> 07 00 07 00 ad 4e ad de ff ff ff ff 6b 6b 6b 6b  .N..
>> ff ff ff ff ff ff ff ff 80 24 b4 82 ff ff ff ff  .$..
>>   backtrace:
>> [] kmemleak_alloc+0x5e/0xc0
>> [] kmem_cache_alloc_trace+0x113/0x290
>> [] __ptlock_alloc+0x27/0x50
>> [] __pmd_alloc+0x59/0x170
>> [] copy_page_range+0x38a/0x3e0
>> [] dup_mm+0x313/0x540
>> [] copy_process+0x161a/0x1880
>> [] do_fork+0x8b/0x360
>> [] SyS_clone+0x16/0x20
>> [] stub_clone+0x69/0x90
>> [] 0x
>>
>> It's quite serious, because my test host went to panic in a few hours.
>
> Sorry for that.
>
> Could you test patch below.

Yes, it works.

I found this too a few minutes ago:)

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index a7cccb6..44c366c 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -62,6 +62,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
+   pgtable_pmd_page_dtor(virt_to_page(pmd));
/*
 * NOTE! For PAE, any changes to the top page-directory-pointer-table
 * entries need a full cr3 reload to flush.

Thanks.

>
> diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
> index a7cccb6d7fec..7be5809754cf 100644
> --- a/arch/x86/mm/pgtable.c
> +++ b/arch/x86/mm/pgtable.c
> @@ -61,6 +61,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page 
> *pte)
>  #if PAGETABLE_LEVELS > 2
>  void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
>  {
> +   struct page *page = virt_to_page(pmd);
> paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
> /*
>  * NOTE! For PAE, any changes to the top page-directory-pointer-table
> @@ -69,7 +70,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
>  #ifdef CONFIG_X86_PAE
> tlb->need_flush_all = 1;
>  #endif
> -   tlb_remove_page(tlb, virt_to_page(pmd));
> +   pgtable_pmd_page_dtor(page);
> +   tlb_remove_page(tlb, page);
>  }
>
>  #if PAGETABLE_LEVELS > 3
> --
>  Kirill A. Shutemov
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 34/34] mm: dynamically allocate page->ptl if it cannot be embedded to struct page

2013-11-20 Thread Andrey Wagin
Hi Kirill,

Looks like this patch adds memory leaks.
[  116.188310] kmemleak: 15672 new suspected memory leaks (see
/sys/kernel/debug/kmemleak)
unreferenced object 0x8800da45a350 (size 96):
  comm "dracut-initqueu", pid 93, jiffies 4294671391 (age 362.277s)
  hex dump (first 32 bytes):
07 00 07 00 ad 4e ad de ff ff ff ff 6b 6b 6b 6b  .N..
ff ff ff ff ff ff ff ff 80 24 b4 82 ff ff ff ff  .$..
  backtrace:
[] kmemleak_alloc+0x5e/0xc0
[] kmem_cache_alloc_trace+0x113/0x290
[] __ptlock_alloc+0x27/0x50
[] __pmd_alloc+0x59/0x170
[] copy_page_range+0x38a/0x3e0
[] dup_mm+0x313/0x540
[] copy_process+0x161a/0x1880
[] do_fork+0x8b/0x360
[] SyS_clone+0x16/0x20
[] stub_clone+0x69/0x90
[] 0x

It's quite serious, because my test host went to panic in a few hours.

[12000.632734] kmemleak: 74155 new suspected memory leaks (see
/sys/kernel/debug/kmemleak)
[12080.734075] zombie00[29282]: segfault at 0 ip 00401862 sp
7fffc509bc20 error 6 in zombie00[40+5000]
[12619.799052] BUG: unable to handle kernel paging request at 7aa9e3a0
[12619.800044] IP: [] cpuacct_charge+0x97/0x1e0
[12619.800044] PGD 0
[12619.800044] Thread overran stack, or stack corrupted
[12619.800044] Oops:  [#1] SMP
[12619.800044] Modules linked in: binfmt_misc ip6table_filter
ip6_tables tun netlink_diag af_packet_diag udp_diag tcp_diag inet_diag
unix_diag joydev microcode pcspkr i2c_piix4 virtio_balloon virtio_net
i2c_core virtio_blk floppy
[12619.800044] CPU: 1 PID: 1324 Comm: kworker/u4:2 Not tainted 3.12.0+ #142
[12619.800044] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2007
[12619.800044] Workqueue: writeback bdi_writeback_workfn (flush-252:0)
[12619.800044] task: 88001f1a8000 ti: 880096f26000 task.ti:
880096f26000
[12619.800044] RIP: 0010:[]  []
cpuacct_charge+0x97/0x1e0
[12619.800044] RSP: 0018:88011b403ce8  EFLAGS: 00010002
[12619.800044] RAX: d580 RBX: 000f11b1 RCX: 0003
[12619.800044] RDX: 81c49e40 RSI: 81c4bb00 RDI: 88001f1a8c68
[12619.800044] RBP: 88011b403d18 R08: 0001 R09: 0001
[12619.800044] R10: 0001 R11: 0007 R12: 88001f1a8000
[12619.800044] R13: 1f1a8000 R14: 82a86320 R15: 06b1bda1e433
[12619.800044] FS:  () GS:88011b40()
knlGS:
[12619.800044] CS:  0010 DS:  ES:  CR0: 8005003b
[12619.800044] CR2: 7aa9e3a0 CR3: 01c0b000 CR4: 06e0
[12619.800044] Stack:
[12619.800044]  810b2b70 0002 88011b5d40c0
000f11b1
[12619.800044]  88001f1a8068 88001f1a8000 88011b403d58
810a108f
[12619.800044]  88011b403d88 88001f1a8068 88011b5d40c0
88011b5d4000
[12619.800044] Call Trace:
[12619.800044]  
[12619.800044]  [] ? cpuacct_css_alloc+0xb0/0xb0
[12619.800044]  [] update_curr+0x13f/0x230
[12619.800044]  [] task_tick_fair+0x2d7/0x650
[12619.800044]  [] ? sched_clock_cpu+0xb8/0x120
[12619.800044]  [] scheduler_tick+0x6d/0xf0
[12619.800044]  [] update_process_times+0x61/0x80
[12619.800044]  [] tick_sched_handle+0x37/0x80
[12619.800044]  [] tick_sched_timer+0x54/0x90
[12619.800044]  [] __run_hrtimer+0x71/0x2d0
[12619.800044]  [] ? tick_nohz_handler+0xc0/0xc0
[12619.800044]  [] hrtimer_interrupt+0x116/0x2a0
[12619.800044]  [] ? __local_bh_enable+0x49/0x70
[12619.800044]  [] local_apic_timer_interrupt+0x3b/0x60
[12619.800044]  [] smp_apic_timer_interrupt+0x45/0x60
[12619.800044]  [] apic_timer_interrupt+0x6f/0x80
[12619.800044]  
[12619.800044]  [] ? mark_held_locks+0x90/0x150
[12619.800044]  [] ? _raw_spin_unlock_irqrestore+0x42/0x70
[12619.800044]  [] virtio_queue_rq+0xdb/0x1b0 [virtio_blk]
[12619.800044]  [] __blk_mq_run_hw_queue+0x1ca/0x520
[12619.800044]  [] blk_mq_run_hw_queue+0x35/0x40
[12619.800044]  [] blk_mq_insert_requests+0xe2/0x190
[12619.800044]  [] blk_mq_flush_plug_list+0x134/0x150
[12619.800044]  [] blk_flush_plug_list+0xbd/0x220
[12619.800044]  [] blk_mq_make_request+0x3da/0x4d0
[12619.800044]  [] generic_make_request+0xca/0x100
[12619.800044]  [] submit_bio+0x76/0x160
[12619.800044]  [] ? test_set_page_writeback+0x36/0x2b0
[12619.800044]  [] ? end_swap_bio_read+0xc0/0xc0
[12619.800044]  [] __swap_writepage+0x198/0x230
[12619.800044]  [] ? _raw_spin_unlock+0x2b/0x40
[12619.800044]  [] ? page_swapcount+0x53/0x70
[12619.800044]  [] swap_writepage+0x43/0x90
[12619.800044]  [] shrink_page_list+0x6cf/0xaa0
[12619.800044]  [] shrink_inactive_list+0x1c2/0x5b0
[12619.800044]  [] ? __lock_acquire+0x23f/0x1810
[12619.800044]  [] shrink_lruvec+0x335/0x600
[12619.800044]  [] ? mem_cgroup_iter+0x1f5/0x510
[12619.800044]  [] shrink_zone+0x96/0x1d0
[12619.800044]  [] do_try_to_free_pages+0x103/0x600
[12619.800044]  [] ? sched_clock_local+0x25/0x90
[12619.800044]  [] try_to_free_pages+0x222/0x440
[12619.800044]  [] __alloc_pages_nodemask+0x8af/0xc70

Re: [PATCH 34/34] mm: dynamically allocate page-ptl if it cannot be embedded to struct page

2013-11-20 Thread Andrey Wagin
Hi Kirill,

Looks like this patch adds memory leaks.
[  116.188310] kmemleak: 15672 new suspected memory leaks (see
/sys/kernel/debug/kmemleak)
unreferenced object 0x8800da45a350 (size 96):
  comm dracut-initqueu, pid 93, jiffies 4294671391 (age 362.277s)
  hex dump (first 32 bytes):
07 00 07 00 ad 4e ad de ff ff ff ff 6b 6b 6b 6b  .N..
ff ff ff ff ff ff ff ff 80 24 b4 82 ff ff ff ff  .$..
  backtrace:
[817152fe] kmemleak_alloc+0x5e/0xc0
[811c34f3] kmem_cache_alloc_trace+0x113/0x290
[811920f7] __ptlock_alloc+0x27/0x50
[81192849] __pmd_alloc+0x59/0x170
[81195ffa] copy_page_range+0x38a/0x3e0
[8105a013] dup_mm+0x313/0x540
[8105b9da] copy_process+0x161a/0x1880
[8105c01b] do_fork+0x8b/0x360
[8105c306] SyS_clone+0x16/0x20
[81727b79] stub_clone+0x69/0x90
[] 0x

It's quite serious, because my test host went to panic in a few hours.

[12000.632734] kmemleak: 74155 new suspected memory leaks (see
/sys/kernel/debug/kmemleak)
[12080.734075] zombie00[29282]: segfault at 0 ip 00401862 sp
7fffc509bc20 error 6 in zombie00[40+5000]
[12619.799052] BUG: unable to handle kernel paging request at 7aa9e3a0
[12619.800044] IP: [810b2c07] cpuacct_charge+0x97/0x1e0
[12619.800044] PGD 0
[12619.800044] Thread overran stack, or stack corrupted
[12619.800044] Oops:  [#1] SMP
[12619.800044] Modules linked in: binfmt_misc ip6table_filter
ip6_tables tun netlink_diag af_packet_diag udp_diag tcp_diag inet_diag
unix_diag joydev microcode pcspkr i2c_piix4 virtio_balloon virtio_net
i2c_core virtio_blk floppy
[12619.800044] CPU: 1 PID: 1324 Comm: kworker/u4:2 Not tainted 3.12.0+ #142
[12619.800044] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2007
[12619.800044] Workqueue: writeback bdi_writeback_workfn (flush-252:0)
[12619.800044] task: 88001f1a8000 ti: 880096f26000 task.ti:
880096f26000
[12619.800044] RIP: 0010:[810b2c07]  [810b2c07]
cpuacct_charge+0x97/0x1e0
[12619.800044] RSP: 0018:88011b403ce8  EFLAGS: 00010002
[12619.800044] RAX: d580 RBX: 000f11b1 RCX: 0003
[12619.800044] RDX: 81c49e40 RSI: 81c4bb00 RDI: 88001f1a8c68
[12619.800044] RBP: 88011b403d18 R08: 0001 R09: 0001
[12619.800044] R10: 0001 R11: 0007 R12: 88001f1a8000
[12619.800044] R13: 1f1a8000 R14: 82a86320 R15: 06b1bda1e433
[12619.800044] FS:  () GS:88011b40()
knlGS:
[12619.800044] CS:  0010 DS:  ES:  CR0: 8005003b
[12619.800044] CR2: 7aa9e3a0 CR3: 01c0b000 CR4: 06e0
[12619.800044] Stack:
[12619.800044]  810b2b70 0002 88011b5d40c0
000f11b1
[12619.800044]  88001f1a8068 88001f1a8000 88011b403d58
810a108f
[12619.800044]  88011b403d88 88001f1a8068 88011b5d40c0
88011b5d4000
[12619.800044] Call Trace:
[12619.800044]  IRQ
[12619.800044]  [810b2b70] ? cpuacct_css_alloc+0xb0/0xb0
[12619.800044]  [810a108f] update_curr+0x13f/0x230
[12619.800044]  [810a9e57] task_tick_fair+0x2d7/0x650
[12619.800044]  [8109dcc8] ? sched_clock_cpu+0xb8/0x120
[12619.800044]  [8109482d] scheduler_tick+0x6d/0xf0
[12619.800044]  [8106afd1] update_process_times+0x61/0x80
[12619.800044]  [810e38c7] tick_sched_handle+0x37/0x80
[12619.800044]  [810e3e74] tick_sched_timer+0x54/0x90
[12619.800044]  [8108bd21] __run_hrtimer+0x71/0x2d0
[12619.800044]  [810e3e20] ? tick_nohz_handler+0xc0/0xc0
[12619.800044]  [8108c246] hrtimer_interrupt+0x116/0x2a0
[12619.800044]  [81062959] ? __local_bh_enable+0x49/0x70
[12619.800044]  [81033dcb] local_apic_timer_interrupt+0x3b/0x60
[12619.800044]  [81727c05] smp_apic_timer_interrupt+0x45/0x60
[12619.800044]  [8172686f] apic_timer_interrupt+0x6f/0x80
[12619.800044]  EOI
[12619.800044]  [810b8e10] ? mark_held_locks+0x90/0x150
[12619.800044]  [8171c6f2] ? _raw_spin_unlock_irqrestore+0x42/0x70
[12619.800044]  [a001b71b] virtio_queue_rq+0xdb/0x1b0 [virtio_blk]
[12619.800044]  [8134647a] __blk_mq_run_hw_queue+0x1ca/0x520
[12619.800044]  [81346b35] blk_mq_run_hw_queue+0x35/0x40
[12619.800044]  [813470f2] blk_mq_insert_requests+0xe2/0x190
[12619.800044]  [813472d4] blk_mq_flush_plug_list+0x134/0x150
[12619.800044]  [8133d0cd] blk_flush_plug_list+0xbd/0x220
[12619.800044]  [81346f1a] blk_mq_make_request+0x3da/0x4d0
[12619.800044]  [813397aa] generic_make_request+0xca/0x100
[12619.800044]  [81339856] submit_bio+0x76/0x160
[12619.800044]  [81173c66] ? test_set_page_writeback+0x36/0x2b0
[12619.800044]  [811a9ae0] ? 

Re: [PATCH 34/34] mm: dynamically allocate page-ptl if it cannot be embedded to struct page

2013-11-20 Thread Andrey Wagin
2013/11/20 Kirill A. Shutemov kirill.shute...@linux.intel.com:
 Andrey Wagin wrote:
 Hi Kirill,

 Looks like this patch adds memory leaks.
 [  116.188310] kmemleak: 15672 new suspected memory leaks (see
 /sys/kernel/debug/kmemleak)
 unreferenced object 0x8800da45a350 (size 96):
   comm dracut-initqueu, pid 93, jiffies 4294671391 (age 362.277s)
   hex dump (first 32 bytes):
 07 00 07 00 ad 4e ad de ff ff ff ff 6b 6b 6b 6b  .N..
 ff ff ff ff ff ff ff ff 80 24 b4 82 ff ff ff ff  .$..
   backtrace:
 [817152fe] kmemleak_alloc+0x5e/0xc0
 [811c34f3] kmem_cache_alloc_trace+0x113/0x290
 [811920f7] __ptlock_alloc+0x27/0x50
 [81192849] __pmd_alloc+0x59/0x170
 [81195ffa] copy_page_range+0x38a/0x3e0
 [8105a013] dup_mm+0x313/0x540
 [8105b9da] copy_process+0x161a/0x1880
 [8105c01b] do_fork+0x8b/0x360
 [8105c306] SyS_clone+0x16/0x20
 [81727b79] stub_clone+0x69/0x90
 [] 0x

 It's quite serious, because my test host went to panic in a few hours.

 Sorry for that.

 Could you test patch below.

Yes, it works.

I found this too a few minutes ago:)

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index a7cccb6..44c366c 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -62,6 +62,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
paravirt_release_pmd(__pa(pmd)  PAGE_SHIFT);
+   pgtable_pmd_page_dtor(virt_to_page(pmd));
/*
 * NOTE! For PAE, any changes to the top page-directory-pointer-table
 * entries need a full cr3 reload to flush.

Thanks.


 diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
 index a7cccb6d7fec..7be5809754cf 100644
 --- a/arch/x86/mm/pgtable.c
 +++ b/arch/x86/mm/pgtable.c
 @@ -61,6 +61,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page 
 *pte)
  #if PAGETABLE_LEVELS  2
  void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
  {
 +   struct page *page = virt_to_page(pmd);
 paravirt_release_pmd(__pa(pmd)  PAGE_SHIFT);
 /*
  * NOTE! For PAE, any changes to the top page-directory-pointer-table
 @@ -69,7 +70,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
  #ifdef CONFIG_X86_PAE
 tlb-need_flush_all = 1;
  #endif
 -   tlb_remove_page(tlb, virt_to_page(pmd));
 +   pgtable_pmd_page_dtor(page);
 +   tlb_remove_page(tlb, page);
  }

  #if PAGETABLE_LEVELS  3
 --
  Kirill A. Shutemov
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 34/34] mm: dynamically allocate page-ptl if it cannot be embedded to struct page

2013-11-20 Thread Andrey Wagin
2013/11/20 Kirill A. Shutemov kirill.shute...@linux.intel.com:
 Andrey Wagin wrote:
 2013/11/20 Kirill A. Shutemov kirill.shute...@linux.intel.com:
  Andrey Wagin wrote:
  Hi Kirill,
 
  Looks like this patch adds memory leaks.
  [  116.188310] kmemleak: 15672 new suspected memory leaks (see
  /sys/kernel/debug/kmemleak)
  unreferenced object 0x8800da45a350 (size 96):
comm dracut-initqueu, pid 93, jiffies 4294671391 (age 362.277s)
hex dump (first 32 bytes):
  07 00 07 00 ad 4e ad de ff ff ff ff 6b 6b 6b 6b  .N..
  ff ff ff ff ff ff ff ff 80 24 b4 82 ff ff ff ff  .$..
backtrace:
  [817152fe] kmemleak_alloc+0x5e/0xc0
  [811c34f3] kmem_cache_alloc_trace+0x113/0x290
  [811920f7] __ptlock_alloc+0x27/0x50
  [81192849] __pmd_alloc+0x59/0x170
  [81195ffa] copy_page_range+0x38a/0x3e0
  [8105a013] dup_mm+0x313/0x540
  [8105b9da] copy_process+0x161a/0x1880
  [8105c01b] do_fork+0x8b/0x360
  [8105c306] SyS_clone+0x16/0x20
  [81727b79] stub_clone+0x69/0x90
  [] 0x
 
  It's quite serious, because my test host went to panic in a few hours.
 
  Sorry for that.
 
  Could you test patch below.

 Yes, it works.

 I found this too a few minutes ago:)

 Nice

 Tested-by ?

Tested-by: Andrey Vagin ava...@openvz.org


 --
  Kirill A. Shutemov
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [RFC] kmemcg: remove union from memcg_params

2013-08-12 Thread Andrey Wagin
2013/8/13 Andrew Morton :
> On Fri,  9 Aug 2013 00:51:26 +0400 Andrey Vagin  wrote:
>
>> struct memcg_cache_params {
>> bool is_root_cache;
>> union {
>> struct kmem_cache *memcg_caches[0];
>> struct {
>> struct mem_cgroup *memcg;
>> struct list_head list;
>> struct kmem_cache *root_cache;
>> bool dead;
>> atomic_t nr_pages;
>> struct work_struct destroy;
>> };
>> };
>> };
>>
>> This union is a bit dangerous. //Andrew Morton
>>
>> The first problem was fixed in v3.10-rc5-67-gf101a94.
>> The second problem is that the size of memory for root
>> caches is calculated incorrectly:
>>
>>   ssize_t size = memcg_caches_array_size(num_groups);
>>
>>   size *= sizeof(void *);
>>   size += sizeof(struct memcg_cache_params);
>>
>> The last line should be fixed like this:
>>   size += offsetof(struct memcg_cache_params, memcg_caches)
>>
>> Andrew suggested to rework this code without union and
>> this patch tries to do that.
>
> hm, did I?

I reread your messages. I have seen in it, what I want. Sorry, you
suggested to rework this code how you explained bellow in this
message. "without union" is my fantasy.
http://lkml.indiana.edu/hypermail/linux/kernel/1305.3/01985.html

>
>> This patch removes is_root_cache and union. The size of the
>> memcg_cache_params structure is not changed.
>>
>
> It's a bit sad to consume more space because we're sucky programmers.
> It would be better to retain the union and to stop writing buggy code
> to handle it!

I decided to implement this approach, because it doesn't increase
memory consumptions. This patch is replace is_root_cache on a pointer,
but due to alignment the size of the structure is not changed.

but the size of struct kmem_cache is increased on one pointer, if
accounting of kernel memory is not enabled.

The overhead of this patch on a real system is about 1K if the kernel
memory accounting is disabled and the overhead is zero after enabling
the accounting.

>
> Maybe there are things we can do to reduce the likelihood of people
> mishandling the union - don't use anonymous fields, name each member,
> access it via helper functions, etc.

Ok, I wil try this way.

Thanks,
Andrey
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [RFC] kmemcg: remove union from memcg_params

2013-08-12 Thread Andrey Wagin
2013/8/13 Andrew Morton a...@linux-foundation.org:
 On Fri,  9 Aug 2013 00:51:26 +0400 Andrey Vagin ava...@openvz.org wrote:

 struct memcg_cache_params {
 bool is_root_cache;
 union {
 struct kmem_cache *memcg_caches[0];
 struct {
 struct mem_cgroup *memcg;
 struct list_head list;
 struct kmem_cache *root_cache;
 bool dead;
 atomic_t nr_pages;
 struct work_struct destroy;
 };
 };
 };

 This union is a bit dangerous. //Andrew Morton

 The first problem was fixed in v3.10-rc5-67-gf101a94.
 The second problem is that the size of memory for root
 caches is calculated incorrectly:

   ssize_t size = memcg_caches_array_size(num_groups);

   size *= sizeof(void *);
   size += sizeof(struct memcg_cache_params);

 The last line should be fixed like this:
   size += offsetof(struct memcg_cache_params, memcg_caches)

 Andrew suggested to rework this code without union and
 this patch tries to do that.

 hm, did I?

I reread your messages. I have seen in it, what I want. Sorry, you
suggested to rework this code how you explained bellow in this
message. without union is my fantasy.
http://lkml.indiana.edu/hypermail/linux/kernel/1305.3/01985.html


 This patch removes is_root_cache and union. The size of the
 memcg_cache_params structure is not changed.


 It's a bit sad to consume more space because we're sucky programmers.
 It would be better to retain the union and to stop writing buggy code
 to handle it!

I decided to implement this approach, because it doesn't increase
memory consumptions. This patch is replace is_root_cache on a pointer,
but due to alignment the size of the structure is not changed.

but the size of struct kmem_cache is increased on one pointer, if
accounting of kernel memory is not enabled.

The overhead of this patch on a real system is about 1K if the kernel
memory accounting is disabled and the overhead is zero after enabling
the accounting.


 Maybe there are things we can do to reduce the likelihood of people
 mishandling the union - don't use anonymous fields, name each member,
 access it via helper functions, etc.

Ok, I wil try this way.

Thanks,
Andrey
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"

2013-06-19 Thread Andrey Wagin
On Tue, Jun 18, 2013 at 02:56:51AM +0400, Andrey Wagin wrote:
> 2013/6/17 Eric W. Biederman :
> > So for anyone seriously worried about this kind of thing in general we
> > already have the memory control group, which is quite capable of
> > limiting this kind of thing,
> 
> > and it limits all memory allocations not just mount.
> 
> And that is problem, we can't to limit a particular slab. Let's
> imagine a real container with 4Gb of RAM. What is a kernel memory
> limit resonable for it? I setup 64 Mb (it may be not enough for real
> CT, but it's enough to make host inaccessible for some minutes).
> 
> $ mkdir /sys/fs/cgroup/memory/test
> $ echo $((64 << 20)) > /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes
> $ unshare -m
> $ echo $$ > /sys/fs/cgroup/memory/test/tasks
> $ mount --make-rprivate /
> $ mount -t tmpfs xxx /mnt
> $ mount --make-shared /mnt
> $ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt
> `mktemp -d /mnt/test.XX` & done;  for i in `seq 30`; do wait;
> done'
> real 0m23.141s
> user 0m0.016s
> sys 0m22.881s
> 
> While the last script is working, nobody can't to read /proc/mounts or
> mount something. I don't think that users from other containers will
> be glad. This problem is not so significant in compared with umounting
> of this tree.
> 
> $ strace -T umount -l /mnt
> umount("/mnt", MNT_DETACH)  = 0 <548.898244>
> The host is inaccessible, it writes messages about soft lockup in
> kernel log and eats 100% cpu.

Eric, do you agree that
* It is a problem
* Currently we don't have a mechanism to prevent this problem
* We need to find a way to prevent this problem

> 
> 
> >
> > Is there some reason we want to go down the path of adding and tuning
> > static limits all over the kernel?  As opposed to streamlining the memory
> > control group so it is low overhead and everyone that cares can use it?
> 
> The memory control group doesn't help in this case... I need to look
> at this code in more details, maybe we can limit a depth of nested
> mount points.

Complexity of the umount algorithm does not depends on a depth of nested
mounts, it depends on a number of mounts and sometimes complexity is O(n^2).

For example:

mount -t tmpfs xxx /mnt
mount --make-shared /mnt

mkdir /mnt/tmp
mount -t tmpfs xxx /mnt/tmp
mkdir /mnt/d

for ((i = 0; i < $1; i++)); do
d=`mktemp -d /mnt/d/xxx.XX`
mount --bind /mnt/tmp $d || break
done

mkdir /mnt/tmp/d
for ((i = 0; i < $1; i++)); do
d=`mktemp -d /mnt/tmp/xxx.XX`
mount --bind /mnt/tmp/d $d || break
done

perf data for umount -l /mnt
29.60% dbus-daemon  [kernel.kallsyms][k] __ticket_spin_lock
   |
   --- __ticket_spin_lock
   lg_local_lock
   path_init
   path_openat
   do_filp_open
   do_sys_open
   SyS_openat
   system_call_fastpath
   __openat64_nocancel
   0x747379732f312d73

20.20%  umount  [kernel.kallsyms][k] propagation_next
|
--- propagation_next
   |
   |--65.35%-- umount_tree
   |  SyS_umount
   |  system_call_fastpath
   |  __umount2
   |  __libc_start_main
   |
--34.65%-- propagate_umount
  umount_tree
  SyS_umount
  system_call_fastpath
  __umount2
  __libc_start_main

17.81%  umount  [kernel.kallsyms][k] __lookup_mnt
|
--- __lookup_mnt
   |
   |--82.78%-- propagate_umount
   |  umount_tree
   |  SyS_umount
   |  system_call_fastpath
   |  __umount2
   |  __libc_start_main
   |
--17.22%-- umount_tree
  SyS_umount
  system_call_fastpath
  __umount2
  __libc_start_main

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [RFC] mnt: restrict a number of struct mnt

2013-06-19 Thread Andrey Wagin
On Tue, Jun 18, 2013 at 02:56:51AM +0400, Andrey Wagin wrote:
 2013/6/17 Eric W. Biederman ebied...@xmission.com:
  So for anyone seriously worried about this kind of thing in general we
  already have the memory control group, which is quite capable of
  limiting this kind of thing,
 
  and it limits all memory allocations not just mount.
 
 And that is problem, we can't to limit a particular slab. Let's
 imagine a real container with 4Gb of RAM. What is a kernel memory
 limit resonable for it? I setup 64 Mb (it may be not enough for real
 CT, but it's enough to make host inaccessible for some minutes).
 
 $ mkdir /sys/fs/cgroup/memory/test
 $ echo $((64  20))  /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes
 $ unshare -m
 $ echo $$  /sys/fs/cgroup/memory/test/tasks
 $ mount --make-rprivate /
 $ mount -t tmpfs xxx /mnt
 $ mount --make-shared /mnt
 $ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt
 `mktemp -d /mnt/test.XX`  done;  for i in `seq 30`; do wait;
 done'
 real 0m23.141s
 user 0m0.016s
 sys 0m22.881s
 
 While the last script is working, nobody can't to read /proc/mounts or
 mount something. I don't think that users from other containers will
 be glad. This problem is not so significant in compared with umounting
 of this tree.
 
 $ strace -T umount -l /mnt
 umount(/mnt, MNT_DETACH)  = 0 548.898244
 The host is inaccessible, it writes messages about soft lockup in
 kernel log and eats 100% cpu.

Eric, do you agree that
* It is a problem
* Currently we don't have a mechanism to prevent this problem
* We need to find a way to prevent this problem

 
 
 
  Is there some reason we want to go down the path of adding and tuning
  static limits all over the kernel?  As opposed to streamlining the memory
  control group so it is low overhead and everyone that cares can use it?
 
 The memory control group doesn't help in this case... I need to look
 at this code in more details, maybe we can limit a depth of nested
 mount points.

Complexity of the umount algorithm does not depends on a depth of nested
mounts, it depends on a number of mounts and sometimes complexity is O(n^2).

For example:

mount -t tmpfs xxx /mnt
mount --make-shared /mnt

mkdir /mnt/tmp
mount -t tmpfs xxx /mnt/tmp
mkdir /mnt/d

for ((i = 0; i  $1; i++)); do
d=`mktemp -d /mnt/d/xxx.XX`
mount --bind /mnt/tmp $d || break
done

mkdir /mnt/tmp/d
for ((i = 0; i  $1; i++)); do
d=`mktemp -d /mnt/tmp/xxx.XX`
mount --bind /mnt/tmp/d $d || break
done

perf data for umount -l /mnt
29.60% dbus-daemon  [kernel.kallsyms][k] __ticket_spin_lock
   |
   --- __ticket_spin_lock
   lg_local_lock
   path_init
   path_openat
   do_filp_open
   do_sys_open
   SyS_openat
   system_call_fastpath
   __openat64_nocancel
   0x747379732f312d73

20.20%  umount  [kernel.kallsyms][k] propagation_next
|
--- propagation_next
   |
   |--65.35%-- umount_tree
   |  SyS_umount
   |  system_call_fastpath
   |  __umount2
   |  __libc_start_main
   |
--34.65%-- propagate_umount
  umount_tree
  SyS_umount
  system_call_fastpath
  __umount2
  __libc_start_main

17.81%  umount  [kernel.kallsyms][k] __lookup_mnt
|
--- __lookup_mnt
   |
   |--82.78%-- propagate_umount
   |  umount_tree
   |  SyS_umount
   |  system_call_fastpath
   |  __umount2
   |  __libc_start_main
   |
--17.22%-- umount_tree
  SyS_umount
  system_call_fastpath
  __umount2
  __libc_start_main

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [RFC] mnt: restrict a number of "struct mnt"

2013-06-17 Thread Andrey Wagin
2013/6/17 Eric W. Biederman :
> Andrey Vagin  writes:
>
>> I found that a few processes can eat all host memory and nobody can kill 
>> them.
>> $ mount -t tmpfs xxx /mnt
>> $ mount --make-shared /mnt
>> $ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XX` & done
>>
>> All this processes are unkillable, because they took i_mutex and waits
>> namespace_lock.
>>
>> ...
>> 21715 pts/0D  0:00  \_ mount --bind /mnt /mnt/test.ht6jzO
>> 21716 pts/0D  0:00  \_ mount --bind /mnt /mnt/test.97K4mI
>> 21717 pts/0R  0:01  \_ mount --bind /mnt /mnt/test.gO2CD9
>> ...
>>
>> Each of this process doubles a number of mounts, so at the end we will
>> have about 2^32 mounts and the size of struct mnt is 256 bytes, so we
>> need about 1TB of RAM.
>>
>> Another problem is that “umount” of a big tree is very hard operation
>> and it requires a lot of time.
>> E.g.:
>> 16411
>> umount("/tmp/xxx", MNT_DETACH)  = 0 <7.852066> (7.8 sec)
>> 32795
>> umount("/tmp/xxx", MNT_DETACH)  = 0 <34.485501> ( 34 sec)
>>
>> For all this time sys_umoun takes namespace_sem and vfsmount_lock...
>>
>> Due to all this reasons I suggest to restrict a number of mounts.
>> Probably we can optimize this code in a future, but now this restriction
>> can help.
>
> So for anyone seriously worried about this kind of thing in general we
> already have the memory control group, which is quite capable of
> limiting this kind of thing,

> and it limits all memory allocations not just mount.

And that is problem, we can't to limit a particular slab. Let's
imagine a real container with 4Gb of RAM. What is a kernel memory
limit resonable for it? I setup 64 Mb (it may be not enough for real
CT, but it's enough to make host inaccessible for some minutes).

$ mkdir /sys/fs/cgroup/memory/test
$ echo $((64 << 20)) > /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes
$ unshare -m
$ echo $$ > /sys/fs/cgroup/memory/test/tasks
$ mount --make-rprivate /
$ mount -t tmpfs xxx /mnt
$ mount --make-shared /mnt
$ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt
`mktemp -d /mnt/test.XX` & done;  for i in `seq 30`; do wait;
done'
real 0m23.141s
user 0m0.016s
sys 0m22.881s

While the last script is working, nobody can't to read /proc/mounts or
mount something. I don't think that users from other containers will
be glad. This problem is not so significant in compared with umounting
of this tree.

$ strace -T umount -l /mnt
umount("/mnt", MNT_DETACH)  = 0 <548.898244>
The host is inaccessible, it writes messages about soft lockup in
kernel log and eats 100% cpu.


>
> Is there some reason we want to go down the path of adding and tuning
> static limits all over the kernel?  As opposed to streamlining the memory
> control group so it is low overhead and everyone that cares can use it?

The memory control group doesn't help in this case... I need to look
at this code in more details, maybe we can limit a depth of nested
mount points.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] [RFC] mnt: restrict a number of struct mnt

2013-06-17 Thread Andrey Wagin
2013/6/17 Eric W. Biederman ebied...@xmission.com:
 Andrey Vagin ava...@openvz.org writes:

 I found that a few processes can eat all host memory and nobody can kill 
 them.
 $ mount -t tmpfs xxx /mnt
 $ mount --make-shared /mnt
 $ for i in `seq 30`; do mount --bind /mnt `mktemp -d /mnt/test.XX`  done

 All this processes are unkillable, because they took i_mutex and waits
 namespace_lock.

 ...
 21715 pts/0D  0:00  \_ mount --bind /mnt /mnt/test.ht6jzO
 21716 pts/0D  0:00  \_ mount --bind /mnt /mnt/test.97K4mI
 21717 pts/0R  0:01  \_ mount --bind /mnt /mnt/test.gO2CD9
 ...

 Each of this process doubles a number of mounts, so at the end we will
 have about 2^32 mounts and the size of struct mnt is 256 bytes, so we
 need about 1TB of RAM.

 Another problem is that “umount” of a big tree is very hard operation
 and it requires a lot of time.
 E.g.:
 16411
 umount(/tmp/xxx, MNT_DETACH)  = 0 7.852066 (7.8 sec)
 32795
 umount(/tmp/xxx, MNT_DETACH)  = 0 34.485501 ( 34 sec)

 For all this time sys_umoun takes namespace_sem and vfsmount_lock...

 Due to all this reasons I suggest to restrict a number of mounts.
 Probably we can optimize this code in a future, but now this restriction
 can help.

 So for anyone seriously worried about this kind of thing in general we
 already have the memory control group, which is quite capable of
 limiting this kind of thing,

 and it limits all memory allocations not just mount.

And that is problem, we can't to limit a particular slab. Let's
imagine a real container with 4Gb of RAM. What is a kernel memory
limit resonable for it? I setup 64 Mb (it may be not enough for real
CT, but it's enough to make host inaccessible for some minutes).

$ mkdir /sys/fs/cgroup/memory/test
$ echo $((64  20))  /sys/fs/cgroup/memory/test/memory.kmem.limit_in_bytes
$ unshare -m
$ echo $$  /sys/fs/cgroup/memory/test/tasks
$ mount --make-rprivate /
$ mount -t tmpfs xxx /mnt
$ mount --make-shared /mnt
$ time bash -c 'set -m; for i in `seq 30`; do mount --bind /mnt
`mktemp -d /mnt/test.XX`  done;  for i in `seq 30`; do wait;
done'
real 0m23.141s
user 0m0.016s
sys 0m22.881s

While the last script is working, nobody can't to read /proc/mounts or
mount something. I don't think that users from other containers will
be glad. This problem is not so significant in compared with umounting
of this tree.

$ strace -T umount -l /mnt
umount(/mnt, MNT_DETACH)  = 0 548.898244
The host is inaccessible, it writes messages about soft lockup in
kernel log and eats 100% cpu.



 Is there some reason we want to go down the path of adding and tuning
 static limits all over the kernel?  As opposed to streamlining the memory
 control group so it is low overhead and everyone that cares can use it?

The memory control group doesn't help in this case... I need to look
at this code in more details, maybe we can limit a depth of nested
mount points.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/1] move exit_task_namespaces() outside of exit_notify()

2013-04-15 Thread Andrey Wagin
2013/4/13 Oleg Nesterov 
>
> exit_notify() does exit_task_namespaces() after
> forget_original_parent(). This was needed to ensure that ->nsproxy
> can't be cleared prematurely, an exiting child we are going to
> reparent can do do_notify_parent() and use the parent's (ours) pid_ns.
>
> However, after 32084504 "pidns: use task_active_pid_ns in
> do_notify_parent" ->nsproxy != NULL is no longer needed, we rely
> on task_active_pid_ns().
>
> Move exit_task_namespaces() from exit_notify() to do_exit(), after
> exit_fs() and before exit_task_work().
>
> This solves the problem reported by Andrey, free_ipc_ns()->shm_destroy()
> does fput() which needs task_work_add(). And this allows us do simplify
> exit_notify(), we can avoid unlock/lock(tasklist) and we can change
> ->exit_state instead of PF_EXITING in forget_original_parent().
>

It looks good for me. I have tested it a bit and don't find any problem.
Oleg, thank you.

Acked-by: Andrew Vagin 

> Reported-by: Andrey Vagin 
> Signed-off-by: Oleg Nesterov 
>
> --- x/kernel/exit.c
> +++ x/kernel/exit.c
> @@ -649,7 +649,6 @@ static void exit_notify(struct task_stru
>  *  jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
>  */
> forget_original_parent(tsk);
> -   exit_task_namespaces(tsk);
>
> write_lock_irq(_lock);
> if (group_dead)
> @@ -795,6 +794,7 @@ void do_exit(long code)
> exit_shm(tsk);
> exit_files(tsk);
> exit_fs(tsk);
> +   exit_task_namespaces(tsk);
> exit_task_work(tsk);
> check_stack_usage();
> exit_thread();
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/1] move exit_task_namespaces() outside of exit_notify()

2013-04-15 Thread Andrey Wagin
2013/4/13 Oleg Nesterov o...@redhat.com

 exit_notify() does exit_task_namespaces() after
 forget_original_parent(). This was needed to ensure that -nsproxy
 can't be cleared prematurely, an exiting child we are going to
 reparent can do do_notify_parent() and use the parent's (ours) pid_ns.

 However, after 32084504 pidns: use task_active_pid_ns in
 do_notify_parent -nsproxy != NULL is no longer needed, we rely
 on task_active_pid_ns().

 Move exit_task_namespaces() from exit_notify() to do_exit(), after
 exit_fs() and before exit_task_work().

 This solves the problem reported by Andrey, free_ipc_ns()-shm_destroy()
 does fput() which needs task_work_add(). And this allows us do simplify
 exit_notify(), we can avoid unlock/lock(tasklist) and we can change
 -exit_state instead of PF_EXITING in forget_original_parent().


It looks good for me. I have tested it a bit and don't find any problem.
Oleg, thank you.

Acked-by: Andrew Vagin ava...@gmail.com

 Reported-by: Andrey Vagin ava...@openvz.org
 Signed-off-by: Oleg Nesterov o...@redhat.com

 --- x/kernel/exit.c
 +++ x/kernel/exit.c
 @@ -649,7 +649,6 @@ static void exit_notify(struct task_stru
  *  jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
  */
 forget_original_parent(tsk);
 -   exit_task_namespaces(tsk);

 write_lock_irq(tasklist_lock);
 if (group_dead)
 @@ -795,6 +794,7 @@ void do_exit(long code)
 exit_shm(tsk);
 exit_files(tsk);
 exit_fs(tsk);
 +   exit_task_namespaces(tsk);
 exit_task_work(tsk);
 check_stack_usage();
 exit_thread();

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ptrace: add ability to retrieve signals without removing from a queue (v2)

2013-02-25 Thread Andrey Wagin
Andrew, thank you for the comments. I will fix them and send a new
patch. A few comments are inline.

2013/2/26 Andrew Morton :
> On Mon, 25 Feb 2013 14:06:53 +0400
>> + for (i = 0; i < arg.nr; i++) {
>> + off = arg.off + i;
>
> "long long" = "u64" + "int".
>
> Wanna have a little think about what we're doing here, see if we can
> pick the most appropriate types?

A number of signal has the type int, because the system call ptrace()
return "int" and my interface supposed to return a number of dumped
signals.
"off" is signed to check on a negative value and it can not be more
then MAX_RAM / sizeof(siginfo), so
long long is enough. It can be changed on s64 for avoiding misleading.

>
> `off' could be made local to this loop, which is a bit neater.
>
>> + spin_lock_irq(>sighand->siglock);
>> + list_for_each_entry(q, >list, list) {
>> + if (!off--) {
>> + copy_siginfo(, >info);
>> + break;
>> + }
>> + }
>> + spin_unlock_irq(>sighand->siglock);
>> +
>> + if (off >= 0)
>> + break;
>
> 
>
> Oh I see what you did there - the user requested an entry which is
> beyond the end of the list? A code comment would be nice.
>
> What's the return value in this case?  "i".  So how does userspace find
> out what happened?

The request PTRACE_PEEKSIGINFO returns a number of dumped signals. If
a signal with the specified sequence number doesn't exist, ptrace
returns 0.

>
> Please fully describe the interface so things such as this can be
> understood.
>
>> +#ifdef CONFIG_COMPAT
>> + if (unlikely(is_compat_task())) {
>> + compat_siginfo_t __user *uinfo = compat_ptr(data);
>> +
>> + ret = copy_siginfo_to_user32(uinfo, );
>> + if (!ret)
>> + ret = __put_user(info.si_code, 
>> >si_code);
>> + } else
>> +#endif
>> + {
>> + siginfo_t __user *uinfo = (siginfo_t __user *) data;
>> +
>> + ret = copy_siginfo_to_user(uinfo, );
>> + if (!ret)
>> + ret = __put_user(info.si_code, 
>> >si_code);
>> + }
>> +
>> + if (ret)
>> + break;
>> +
>> + data += sizeof(siginfo_t);
>> +
>> + if (signal_pending(current)) {
>> + i++;
>
> What does the i++ do?

For accounting a current siginfo. I will add a comment here.

>
>> + break;
>> + }
>> +
>> + cond_resched();
>> + }
>> +
>> + return i ? : ret;
>
> I hate that gcc thing :(
>
> Also, is it buggy?  `ret' might be -EFAULT here.  We appear to be using
> read() return semantics here?

I have tried to implement read() semantics here. ptrace() returns a
number of dumped signals even if dumping of the last signal is failed.
And it returns an error if not one signal has been dumped
successfully.

The similar logic is used in read()
generic_file_aio_read()

retval += desc.written;
if (desc.error) {
retval = retval ?: desc.error;
break;
}

Thanks,
  Andrey
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ptrace: add ability to retrieve signals without removing from a queue (v2)

2013-02-25 Thread Andrey Wagin
2013/2/25 Pavel Emelyanov :
>> + for (i = 0; i < arg.nr; i++) {
>> + off = arg.off + i;
>> +
>> + spin_lock_irq(>sighand->siglock);
>> + list_for_each_entry(q, >list, list) {
>> + if (!off--) {
>> + copy_siginfo(, >info);
>> + break;
>> + }
>> + }
>> + spin_unlock_irq(>sighand->siglock);
>
> What's the point of arg.nr if you for every single siginfo drop the lock
> and perform linear search anyway?

arg.nr may be useful, if we will decide to optimize this part in a
future. Currently the problem with complexity of finding a signal in a
queue is true not only for this code. It exists for delivering
real-time signals as well. Probably in a future this problem will be
fixed for both cases by the same way.

Thanks,
  Andrew
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ptrace: add ability to retrieve signals without removing from a queue (v2)

2013-02-25 Thread Andrey Wagin
Andrew, thank you for the comments. I will fix them and send a new
patch. A few comments are inline.

2013/2/26 Andrew Morton a...@linux-foundation.org:
 On Mon, 25 Feb 2013 14:06:53 +0400
 + for (i = 0; i  arg.nr; i++) {
 + off = arg.off + i;

 long long = u64 + int.

 Wanna have a little think about what we're doing here, see if we can
 pick the most appropriate types?

A number of signal has the type int, because the system call ptrace()
return int and my interface supposed to return a number of dumped
signals.
off is signed to check on a negative value and it can not be more
then MAX_RAM / sizeof(siginfo), so
long long is enough. It can be changed on s64 for avoiding misleading.


 `off' could be made local to this loop, which is a bit neater.

 + spin_lock_irq(child-sighand-siglock);
 + list_for_each_entry(q, pending-list, list) {
 + if (!off--) {
 + copy_siginfo(info, q-info);
 + break;
 + }
 + }
 + spin_unlock_irq(child-sighand-siglock);
 +
 + if (off = 0)
 + break;

 scratches head

 Oh I see what you did there - the user requested an entry which is
 beyond the end of the list? A code comment would be nice.

 What's the return value in this case?  i.  So how does userspace find
 out what happened?

The request PTRACE_PEEKSIGINFO returns a number of dumped signals. If
a signal with the specified sequence number doesn't exist, ptrace
returns 0.


 Please fully describe the interface so things such as this can be
 understood.

 +#ifdef CONFIG_COMPAT
 + if (unlikely(is_compat_task())) {
 + compat_siginfo_t __user *uinfo = compat_ptr(data);
 +
 + ret = copy_siginfo_to_user32(uinfo, info);
 + if (!ret)
 + ret = __put_user(info.si_code, 
 uinfo-si_code);
 + } else
 +#endif
 + {
 + siginfo_t __user *uinfo = (siginfo_t __user *) data;
 +
 + ret = copy_siginfo_to_user(uinfo, info);
 + if (!ret)
 + ret = __put_user(info.si_code, 
 uinfo-si_code);
 + }
 +
 + if (ret)
 + break;
 +
 + data += sizeof(siginfo_t);
 +
 + if (signal_pending(current)) {
 + i++;

 What does the i++ do?

For accounting a current siginfo. I will add a comment here.


 + break;
 + }
 +
 + cond_resched();
 + }
 +
 + return i ? : ret;

 I hate that gcc thing :(

 Also, is it buggy?  `ret' might be -EFAULT here.  We appear to be using
 read() return semantics here?

I have tried to implement read() semantics here. ptrace() returns a
number of dumped signals even if dumping of the last signal is failed.
And it returns an error if not one signal has been dumped
successfully.

The similar logic is used in read()
generic_file_aio_read()

retval += desc.written;
if (desc.error) {
retval = retval ?: desc.error;
break;
}

Thanks,
  Andrey
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ptrace: add ability to retrieve signals without removing from a queue (v2)

2013-02-25 Thread Andrey Wagin
2013/2/25 Pavel Emelyanov xe...@parallels.com:
 + for (i = 0; i  arg.nr; i++) {
 + off = arg.off + i;
 +
 + spin_lock_irq(child-sighand-siglock);
 + list_for_each_entry(q, pending-list, list) {
 + if (!off--) {
 + copy_siginfo(info, q-info);
 + break;
 + }
 + }
 + spin_unlock_irq(child-sighand-siglock);

 What's the point of arg.nr if you for every single siginfo drop the lock
 and perform linear search anyway?

arg.nr may be useful, if we will decide to optimize this part in a
future. Currently the problem with complexity of finding a signal in a
queue is true not only for this code. It exists for delivering
real-time signals as well. Probably in a future this problem will be
fixed for both cases by the same way.

Thanks,
  Andrew
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ptrace: add ability to retrieve signals without removing them from a queue

2013-02-19 Thread Andrey Wagin
2013/2/16 Pedro Alves :
> Forgot to reply to this bit:
>
> On 02/15/2013 07:43 PM, Oleg Nesterov wrote:
>>> We'd miss the poke
>>> > variant, but that looks like something that could be always be added
>>> > later.
>> Yes. _POKE_ or _QUEUE_ or _DEQUEUE_, we can add more features if user-
>> space wants them.
>
> In general, IMO, I agree with Roland at https://lkml.org/lkml/2002/12/20/84
> in that it's good to have setters for completeness, so that you can
> change all the state via ptrace that you can read via ptrace.
>
> But I'm not doing any of this work, hence my "could always be
> added later" comment instead of actually requesting it.  But if
> we had it, we could make e.g., gdb inspect the signal queues,
> and then be able to tweak a realtime signal before it is
> delivered.

PTRACE_POKESIGINFO is more complicated than PTRACE_PEEKSIGINFO.
Looks like PTRACE_POKESIGINFO should replace a siginfo with the
specified sequence number. Should it be able to change signo? If it is
able, what to do with signalfd, which already got a notification about
the previous signal?..

My opinion is that it "could always be added later", when we will
understand what exactly we want to have.

>
> --
> Pedro Alves
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ptrace: add ability to retrieve signals without removing them from a queue

2013-02-19 Thread Andrey Wagin
2013/2/16 Pedro Alves pal...@redhat.com:
 Forgot to reply to this bit:

 On 02/15/2013 07:43 PM, Oleg Nesterov wrote:
 We'd miss the poke
  variant, but that looks like something that could be always be added
  later.
 Yes. _POKE_ or _QUEUE_ or _DEQUEUE_, we can add more features if user-
 space wants them.

 In general, IMO, I agree with Roland at https://lkml.org/lkml/2002/12/20/84
 in that it's good to have setters for completeness, so that you can
 change all the state via ptrace that you can read via ptrace.

 But I'm not doing any of this work, hence my could always be
 added later comment instead of actually requesting it.  But if
 we had it, we could make e.g., gdb inspect the signal queues,
 and then be able to tweak a realtime signal before it is
 delivered.

PTRACE_POKESIGINFO is more complicated than PTRACE_PEEKSIGINFO.
Looks like PTRACE_POKESIGINFO should replace a siginfo with the
specified sequence number. Should it be able to change signo? If it is
able, what to do with signalfd, which already got a notification about
the previous signal?..

My opinion is that it could always be added later, when we will
understand what exactly we want to have.


 --
 Pedro Alves
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/3] signalfd: add ability to read siginfo-s without dequeuing signals (v2)

2013-02-07 Thread Andrey Wagin
2013/2/7 Oleg Nesterov :
> Andrey, sorry for delay.
>
> As for API, I leave this to you and Michael. Not that I like these
> new flags, but I agree that pread() hack was not pretty too.
>
> On 01/29, Andrey Vagin wrote:
>> +static ssize_t signalfd_peek(struct signalfd_ctx *ctx,
>> + siginfo_t *info, loff_t *ppos, int queue_mask)
>> +{
>> + loff_t seq = *ppos / sizeof(struct signalfd_siginfo);
>> + int signr = 0;
>> +
>> + if (queue_mask & SIGQUEUE_PRIVATE)
>> + signr = peek_signal(>pending,
>> + >sigmask, info, );
>> + else if (queue_mask & SIGQUEUE_SHARED)
>> + signr = peek_signal(>signal->shared_pending,
>> +  >sigmask, info, );
>> + (*ppos) += sizeof(struct signalfd_siginfo);
>
> Now that this can work even with normal read(), we will actually change
> f_pos. Then perhaps signalfd_fops->llseek() should work too. But this
> is minor...

lseek works only if FMODE_LSEEK is set.

You have explained why read have strange semantics for SIGNALFD_PEEK.

>Damn. But after I wrote this email I realized that llseek() probably can't
> work. Because peek_offset/f_pos/whatever has to be shared with all processes
> which have this file opened.
>
> Suppose that the task forks after sys_signalfd(). Now if parent or child
> do llseek this affects them both. This is insane because signalfd is
> "strange" to say at least, fork/dup/etc inherits signalfd_ctx but not the
> "source" of the data.

So I want to suggest a way how to forbid read() for SIGNALFD_PEEK.
file->f_pos can be initialized to -1. read() returns EINVAL in this
case. In a man page we will write that signals can be dumped only with
help pread(). Is it overload or too ugly?

>
> Hmm. but since it works with read(), we shouldn't increment *ppos unless
> signalfd_copyinfo() succeeds?

No, we shouldn't.

>
> Btw, why do you pass seq by reference? Looks unneeded.

You are right. I created this code for reading signals from both
queues, but then we decided to forbid using SIGNALFD_PEEK for both
queues simultaneously.


Oleg, thank you for the comments. I'm waiting an answer on the
question and after that I'm going to send a final version.

>
> Oleg.
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/3] signalfd: add ability to read siginfo-s without dequeuing signals (v2)

2013-02-07 Thread Andrey Wagin
2013/2/7 Oleg Nesterov o...@redhat.com:
 Andrey, sorry for delay.

 As for API, I leave this to you and Michael. Not that I like these
 new flags, but I agree that pread() hack was not pretty too.

 On 01/29, Andrey Vagin wrote:
 +static ssize_t signalfd_peek(struct signalfd_ctx *ctx,
 + siginfo_t *info, loff_t *ppos, int queue_mask)
 +{
 + loff_t seq = *ppos / sizeof(struct signalfd_siginfo);
 + int signr = 0;
 +
 + if (queue_mask  SIGQUEUE_PRIVATE)
 + signr = peek_signal(current-pending,
 + ctx-sigmask, info, seq);
 + else if (queue_mask  SIGQUEUE_SHARED)
 + signr = peek_signal(current-signal-shared_pending,
 +  ctx-sigmask, info, seq);
 + (*ppos) += sizeof(struct signalfd_siginfo);

 Now that this can work even with normal read(), we will actually change
 f_pos. Then perhaps signalfd_fops-llseek() should work too. But this
 is minor...

lseek works only if FMODE_LSEEK is set.

You have explained why readlseek have strange semantics for SIGNALFD_PEEK.

Damn. But after I wrote this email I realized that llseek() probably can't
 work. Because peek_offset/f_pos/whatever has to be shared with all processes
 which have this file opened.

 Suppose that the task forks after sys_signalfd(). Now if parent or child
 do llseek this affects them both. This is insane because signalfd is
 strange to say at least, fork/dup/etc inherits signalfd_ctx but not the
 source of the data.

So I want to suggest a way how to forbid read() for SIGNALFD_PEEK.
file-f_pos can be initialized to -1. read() returns EINVAL in this
case. In a man page we will write that signals can be dumped only with
help pread(). Is it overload or too ugly?


 Hmm. but since it works with read(), we shouldn't increment *ppos unless
 signalfd_copyinfo() succeeds?

No, we shouldn't.


 Btw, why do you pass seq by reference? Looks unneeded.

You are right. I created this code for reading signals from both
queues, but then we decided to forbid using SIGNALFD_PEEK for both
queues simultaneously.


Oleg, thank you for the comments. I'm waiting an answer on the
question and after that I'm going to send a final version.


 Oleg.

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/3] signalfd: add ability to read siginfo-s without dequeuing signals (v2)

2013-02-01 Thread Andrey Wagin
2013/2/2 Michael Kerrisk (man-pages) :
>
> On Jan 30, 2013 8:05 AM, "Andrey Vagin"  wrote:
>>
>> If signalfd is created with the flag SFD_PEEK, it reads siginfo-s
>> without dequeuing signals.
>>
>> For reading not first siginfo pread(fd, buf, size, pos) can be used,
>> where ppos / sizeof(signalfd_siginfo) is a sequence number of a signal
>> in a queue.
>
> Andrey,
>
> Is it perhaps worth erroring (EINVAL) if  ((ppos % sizeof
> (signalfd_siginfo)) != 0) ?

Yes, it's already in the patch.

@@ -257,9 +298,15 @@ static ssize_t signalfd_read(struct file *file,
char __user *buf, size_t count,
if (!count)
return -EINVAL;

+   if (*ppos % sizeof(struct signalfd_siginfo))
+   return -EINVAL;
+


>
> Cheers,
>
> Michael
>
>> This functionality is required for checkpointing pending signals.
>>
>> v2: * signals can be dumped only from one queue.
>> * treat pos as offset in bytes, not in elements, so pos should be
>>   aligned to the size of signalfd_siginfo.
>>
>> Cc: Oleg Nesterov 
>> Cc: Alexander Viro 
>> Cc: "Paul E. McKenney" 
>> Cc: David Howells 
>> Cc: Dave Jones 
>> Cc: Andrey Vagin 
>> Cc: Michael Kerrisk 
>> Cc: Pavel Emelyanov 
>> CC: Cyrill Gorcunov 
>> Signed-off-by: Andrey Vagin 
>> ---
>>  fs/signalfd.c | 61
>> ---
>>  include/uapi/linux/signalfd.h |  2 ++
>>  2 files changed, 60 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/signalfd.c b/fs/signalfd.c
>> index 8019ec9..0da6a30 100644
>> --- a/fs/signalfd.c
>> +++ b/fs/signalfd.c
>> @@ -51,6 +51,47 @@ struct signalfd_ctx {
>> sigset_t sigmask;
>>  };
>>
>> +static int peek_signal(struct sigpending *pending, sigset_t *mask,
>> +   siginfo_t *info, loff_t *pseq)
>> +{
>> +   struct sigqueue *q;
>> +   int ret = 0;
>> +
>> +   spin_lock_irq(>sighand->siglock);
>> +
>> +   list_for_each_entry(q, >list, list) {
>> +   if (sigismember(mask, q->info.si_signo))
>> +   continue;
>> +
>> +   if ((*pseq)-- == 0) {
>> +   copy_siginfo(info, >info);
>> +   ret = info->si_signo;
>> +   break;
>> +   }
>> +   }
>> +
>> +   spin_unlock_irq(>sighand->siglock);
>> +
>> +   return ret;
>> +}
>> +
>> +static ssize_t signalfd_peek(struct signalfd_ctx *ctx,
>> +   siginfo_t *info, loff_t *ppos, int
>> queue_mask)
>> +{
>> +   loff_t seq = *ppos / sizeof(struct signalfd_siginfo);
>> +   int signr = 0;
>> +
>> +   if (queue_mask & SIGQUEUE_PRIVATE)
>> +   signr = peek_signal(>pending,
>> +   >sigmask, info, );
>> +   else if (queue_mask & SIGQUEUE_SHARED)
>> +   signr = peek_signal(>signal->shared_pending,
>> +>sigmask, info, );
>> +   (*ppos) += sizeof(struct signalfd_siginfo);
>> +
>> +   return signr;
>> +}
>> +
>>  static int signalfd_release(struct inode *inode, struct file *file)
>>  {
>> kfree(file->private_data);
>> @@ -257,9 +298,15 @@ static ssize_t signalfd_read(struct file *file, char
>> __user *buf, size_t count,
>> if (!count)
>> return -EINVAL;
>>
>> +   if (*ppos % sizeof(struct signalfd_siginfo))
>> +   return -EINVAL;
>> +
>> siginfo = (struct signalfd_siginfo __user *) buf;
>> do {
>> -   ret = signalfd_dequeue(ctx, , nonblock, qmask);
>> +   if (file->f_flags & SFD_PEEK)
>> +   ret = signalfd_peek(ctx, , ppos, qmask);
>> +   else
>> +   ret = signalfd_dequeue(ctx, , nonblock,
>> qmask);
>>
>> if (unlikely(ret <= 0))
>> break;
>> @@ -315,7 +362,12 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user
>> *, user_mask,
>> BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
>> BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);
>>
>> -   if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK | SFD_RAW | SFD_QUEUES))
>> +   if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK |
>> +   SFD_RAW | SFD_PEEK | SFD_QUEUES))
>> +   return -EINVAL;
>> +
>> +   /* SFD_PEEK can be used for one queue only */
>> +   if ((flags & SFD_PEEK) && ((flags & SFD_QUEUES) == SFD_QUEUES))
>> return -EINVAL;
>>
>> if (sizemask != sizeof(sigset_t) ||
>> @@ -352,7 +404,10 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user
>> *, user_mask,
>> }
>>
>> file->f_flags |= (flags & SFD_QUEUES) ? : SFD_QUEUES;
>> -   file->f_flags |= flags & SFD_RAW;
>> +   file->f_flags |= flags & (SFD_RAW | SFD_PEEK);
>> +
>> +   if (file->f_flags & SFD_PEEK)
>> +   file->f_mode |= FMODE_PREAD;
>>
>> fd_install(ufd, file);
>> } else {
>> 

Re: [PATCH 3/3] signalfd: add ability to read siginfo-s without dequeuing signals (v2)

2013-02-01 Thread Andrey Wagin
2013/2/2 Michael Kerrisk (man-pages) mtk.manpa...@gmail.com:

 On Jan 30, 2013 8:05 AM, Andrey Vagin ava...@openvz.org wrote:

 If signalfd is created with the flag SFD_PEEK, it reads siginfo-s
 without dequeuing signals.

 For reading not first siginfo pread(fd, buf, size, pos) can be used,
 where ppos / sizeof(signalfd_siginfo) is a sequence number of a signal
 in a queue.

 Andrey,

 Is it perhaps worth erroring (EINVAL) if  ((ppos % sizeof
 (signalfd_siginfo)) != 0) ?

Yes, it's already in the patch.

@@ -257,9 +298,15 @@ static ssize_t signalfd_read(struct file *file,
char __user *buf, size_t count,
if (!count)
return -EINVAL;

+   if (*ppos % sizeof(struct signalfd_siginfo))
+   return -EINVAL;
+



 Cheers,

 Michael

 This functionality is required for checkpointing pending signals.

 v2: * signals can be dumped only from one queue.
 * treat pos as offset in bytes, not in elements, so pos should be
   aligned to the size of signalfd_siginfo.

 Cc: Oleg Nesterov o...@redhat.com
 Cc: Alexander Viro v...@zeniv.linux.org.uk
 Cc: Paul E. McKenney paul...@linux.vnet.ibm.com
 Cc: David Howells dhowe...@redhat.com
 Cc: Dave Jones da...@redhat.com
 Cc: Andrey Vagin ava...@openvz.org
 Cc: Michael Kerrisk mtk.manpa...@gmail.com
 Cc: Pavel Emelyanov xe...@parallels.com
 CC: Cyrill Gorcunov gorcu...@openvz.org
 Signed-off-by: Andrey Vagin ava...@openvz.org
 ---
  fs/signalfd.c | 61
 ---
  include/uapi/linux/signalfd.h |  2 ++
  2 files changed, 60 insertions(+), 3 deletions(-)

 diff --git a/fs/signalfd.c b/fs/signalfd.c
 index 8019ec9..0da6a30 100644
 --- a/fs/signalfd.c
 +++ b/fs/signalfd.c
 @@ -51,6 +51,47 @@ struct signalfd_ctx {
 sigset_t sigmask;
  };

 +static int peek_signal(struct sigpending *pending, sigset_t *mask,
 +   siginfo_t *info, loff_t *pseq)
 +{
 +   struct sigqueue *q;
 +   int ret = 0;
 +
 +   spin_lock_irq(current-sighand-siglock);
 +
 +   list_for_each_entry(q, pending-list, list) {
 +   if (sigismember(mask, q-info.si_signo))
 +   continue;
 +
 +   if ((*pseq)-- == 0) {
 +   copy_siginfo(info, q-info);
 +   ret = info-si_signo;
 +   break;
 +   }
 +   }
 +
 +   spin_unlock_irq(current-sighand-siglock);
 +
 +   return ret;
 +}
 +
 +static ssize_t signalfd_peek(struct signalfd_ctx *ctx,
 +   siginfo_t *info, loff_t *ppos, int
 queue_mask)
 +{
 +   loff_t seq = *ppos / sizeof(struct signalfd_siginfo);
 +   int signr = 0;
 +
 +   if (queue_mask  SIGQUEUE_PRIVATE)
 +   signr = peek_signal(current-pending,
 +   ctx-sigmask, info, seq);
 +   else if (queue_mask  SIGQUEUE_SHARED)
 +   signr = peek_signal(current-signal-shared_pending,
 +ctx-sigmask, info, seq);
 +   (*ppos) += sizeof(struct signalfd_siginfo);
 +
 +   return signr;
 +}
 +
  static int signalfd_release(struct inode *inode, struct file *file)
  {
 kfree(file-private_data);
 @@ -257,9 +298,15 @@ static ssize_t signalfd_read(struct file *file, char
 __user *buf, size_t count,
 if (!count)
 return -EINVAL;

 +   if (*ppos % sizeof(struct signalfd_siginfo))
 +   return -EINVAL;
 +
 siginfo = (struct signalfd_siginfo __user *) buf;
 do {
 -   ret = signalfd_dequeue(ctx, info, nonblock, qmask);
 +   if (file-f_flags  SFD_PEEK)
 +   ret = signalfd_peek(ctx, info, ppos, qmask);
 +   else
 +   ret = signalfd_dequeue(ctx, info, nonblock,
 qmask);

 if (unlikely(ret = 0))
 break;
 @@ -315,7 +362,12 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user
 *, user_mask,
 BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
 BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);

 -   if (flags  ~(SFD_CLOEXEC | SFD_NONBLOCK | SFD_RAW | SFD_QUEUES))
 +   if (flags  ~(SFD_CLOEXEC | SFD_NONBLOCK |
 +   SFD_RAW | SFD_PEEK | SFD_QUEUES))
 +   return -EINVAL;
 +
 +   /* SFD_PEEK can be used for one queue only */
 +   if ((flags  SFD_PEEK)  ((flags  SFD_QUEUES) == SFD_QUEUES))
 return -EINVAL;

 if (sizemask != sizeof(sigset_t) ||
 @@ -352,7 +404,10 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user
 *, user_mask,
 }

 file-f_flags |= (flags  SFD_QUEUES) ? : SFD_QUEUES;
 -   file-f_flags |= flags  SFD_RAW;
 +   file-f_flags |= flags  (SFD_RAW | SFD_PEEK);
 +
 +   if (file-f_flags  SFD_PEEK)
 +   file-f_mode |= FMODE_PREAD;

 fd_install(ufd, file);
 } else {
 diff --git 

  1   2   >