[tip:x86/urgent] x86/purgatory: Avoid creating stray .<pid>.d files, remove -MD from KBUILD_CFLAGS

2018-03-25 Thread tip-bot for Sven Wegener
Commit-ID:  e847f6aaf68f6156a5e9b26afe1a7316b9ab697e
Gitweb: https://git.kernel.org/tip/e847f6aaf68f6156a5e9b26afe1a7316b9ab697e
Author: Sven Wegener 
AuthorDate: Sat, 24 Mar 2018 22:21:13 +0100
Committer:  Ingo Molnar 
CommitDate: Sun, 25 Mar 2018 11:04:02 +0200

x86/purgatory: Avoid creating stray .<pid>.d files, remove -MD from 
KBUILD_CFLAGS

The kernel build system already takes care of generating the dependency
files. Having the additional -MD in KBUILD_CFLAGS leads to stray
.<pid>.d files in the build directory when we call the cc-option macro.

Signed-off-by: Sven Wegener 
Cc: H. Peter Anvin 
Cc: Kees Cook 
Cc: Linus Torvalds 
Cc: Matthias Kaehlcke 
Cc: Peter Zijlstra 
Cc: Sam Ravnborg 
Cc: Thomas Gleixner 
Cc: Vivek Goyal 
Link: 
http://lkml.kernel.org/r/alpine.lnx.2.21.1803242219380.30...@titan.int.lan.stealer.net
Signed-off-by: Ingo Molnar 
---
 arch/x86/purgatory/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 2f15a2ac4209..d70c15de417b 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -16,7 +16,7 @@ KCOV_INSTRUMENT := n
 # in turn leaves some undefined symbols like __fentry__ in purgatory and not
 # sure how to relocate those. Like kexec-tools, use custom flags.
 
-KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes 
-fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os 
-mcmodel=large
+KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes 
-fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -Os -mcmodel=large
 KBUILD_CFLAGS += -m$(BITS)
 KBUILD_CFLAGS += $(call cc-option,-fno-PIE)
 


[PATCH] x86/purgatory: Remove -MD from KBUILD_CFLAGS

2018-03-24 Thread Sven Wegener
The kernel build system already takes care of generating the dependency
files. Having the additional -MD in KBUILD_CFLAGS leads to stray
.<pid>.d files in the build directory when we call the cc-option macro.

Signed-off-by: Sven Wegener 
---
 arch/x86/purgatory/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 2f15a2ac4209..d70c15de417b 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -16,7 +16,7 @@ KCOV_INSTRUMENT := n
 # in turn leaves some undefined symbols like __fentry__ in purgatory and not
 # sure how to relocate those. Like kexec-tools, use custom flags.
 
-KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes 
-fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os 
-mcmodel=large
+KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes 
-fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -Os -mcmodel=large
 KBUILD_CFLAGS += -m$(BITS)
 KBUILD_CFLAGS += $(call cc-option,-fno-PIE)
 


[tip:x86/urgent] x86_32, entry: Store badsys error code in %eax

2014-07-22 Thread tip-bot for Sven Wegener
Commit-ID:  8142b215501f8b291a108a202b3a053a265b03dd
Gitweb: http://git.kernel.org/tip/8142b215501f8b291a108a202b3a053a265b03dd
Author: Sven Wegener 
AuthorDate: Tue, 22 Jul 2014 10:26:06 +0200
Committer:  H. Peter Anvin 
CommitDate: Tue, 22 Jul 2014 02:34:05 -0700

x86_32, entry: Store badsys error code in %eax

Commit 554086d ("x86_32, entry: Do syscall exit work on badsys
(CVE-2014-4508)") introduced a regression in the x86_32 syscall entry
code, resulting in syscall() not returning proper errors for undefined
syscalls on CPUs supporting the sysenter feature.

The following code:

> int result = syscall(666);
> printf("result=%d errno=%d error=%s\n", result, errno, strerror(errno));

results in:

> result=666 errno=0 error=Success

Obviously, the syscall return value is the called syscall number, but it
should have been an ENOSYS error. When run under ptrace it behaves
correctly, which makes it hard to debug in the wild:

> result=-1 errno=38 error=Function not implemented

The %eax register is the return value register. For debugging via ptrace
the syscall entry code stores the complete register context on the
stack. The badsys handlers only store the ENOSYS error code in the
ptrace register set and do not set %eax like a regular syscall handler
would. The old resume_userspace call chain contains code that clobbers
%eax and it restores %eax from the ptrace registers afterwards. The same
goes for the ptrace-enabled call chain. When ptrace is not used, the
syscall return value is the passed-in syscall number from the untouched
%eax register.

Use %eax as the return value register in syscall_badsys and
sysenter_badsys, like a real syscall handler does, and have the caller
push the value onto the stack for ptrace access.

Signed-off-by: Sven Wegener 
Link: 
http://lkml.kernel.org/r/alpine.lnx.2.11.1407221022380.31...@titan.int.lan.stealer.net
Reviewed-and-tested-by: Andy Lutomirski 
Cc:  # If 554086d is backported
Signed-off-by: H. Peter Anvin 
---
 arch/x86/kernel/entry_32.S | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index dbaa23e..0d0c9d4 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -425,8 +425,8 @@ sysenter_do_call:
cmpl $(NR_syscalls), %eax
jae sysenter_badsys
call *sys_call_table(,%eax,4)
-   movl %eax,PT_EAX(%esp)
 sysenter_after_call:
+   movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
@@ -502,6 +502,7 @@ ENTRY(system_call)
jae syscall_badsys
 syscall_call:
call *sys_call_table(,%eax,4)
+syscall_after_call:
movl %eax,PT_EAX(%esp)  # store the return value
 syscall_exit:
LOCKDEP_SYS_EXIT
@@ -675,12 +676,12 @@ syscall_fault:
 END(syscall_fault)
 
 syscall_badsys:
-   movl $-ENOSYS,PT_EAX(%esp)
-   jmp syscall_exit
+   movl $-ENOSYS,%eax
+   jmp syscall_after_call
 END(syscall_badsys)
 
 sysenter_badsys:
-   movl $-ENOSYS,PT_EAX(%esp)
+   movl $-ENOSYS,%eax
jmp sysenter_after_call
 END(syscall_badsys)
CFI_ENDPROC
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] x86_32, entry: store badsys error code in %eax

2014-07-22 Thread Sven Wegener
Commit 554086d ("x86_32, entry: Do syscall exit work on badsys
(CVE-2014-4508)") introduced a regression in the x86_32 syscall entry
code, resulting in syscall() not returning proper errors for undefined
syscalls on CPUs supporting the sysenter feature.

The following code:

> int result = syscall(666);
> printf("result=%d errno=%d error=%s\n", result, errno, strerror(errno));

results in:

> result=666 errno=0 error=Success

Obviously, the syscall return value is the called syscall number, but it
should have been an ENOSYS error. When run under ptrace it behaves
correctly, which makes it hard to debug in the wild:

> result=-1 errno=38 error=Function not implemented

The %eax register is the return value register. For debugging via ptrace
the syscall entry code stores the complete register context on the
stack. The badsys handlers only store the ENOSYS error code in the
ptrace register set and do not set %eax like a regular syscall handler
would. The old resume_userspace call chain contains code that clobbers
%eax and it restores %eax from the ptrace registers afterwards. The same
goes for the ptrace-enabled call chain. When ptrace is not used, the
syscall return value is the passed-in syscall number from the untouched
%eax register.

Use %eax as the return value register in syscall_badsys and
sysenter_badsys, like a real syscall handler does, and have the caller
push the value onto the stack for ptrace access.

Signed-off-by: Sven Wegener 
Reviewed-and-tested-by: Andy Lutomirski 
Cc: sta...@vger.kernel.org
---

I've updated the commit message and added the Reviewed-and-tested-by and 
Cc.

 arch/x86/kernel/entry_32.S | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index dbaa23e..0d0c9d4 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -425,8 +425,8 @@ sysenter_do_call:
cmpl $(NR_syscalls), %eax
jae sysenter_badsys
call *sys_call_table(,%eax,4)
-   movl %eax,PT_EAX(%esp)
 sysenter_after_call:
+   movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
@@ -502,6 +502,7 @@ ENTRY(system_call)
jae syscall_badsys
 syscall_call:
call *sys_call_table(,%eax,4)
+syscall_after_call:
movl %eax,PT_EAX(%esp)  # store the return value
 syscall_exit:
LOCKDEP_SYS_EXIT
@@ -675,12 +676,12 @@ syscall_fault:
 END(syscall_fault)
 
 syscall_badsys:
-   movl $-ENOSYS,PT_EAX(%esp)
-   jmp syscall_exit
+   movl $-ENOSYS,%eax
+   jmp syscall_after_call
 END(syscall_badsys)
 
 sysenter_badsys:
-   movl $-ENOSYS,PT_EAX(%esp)
+   movl $-ENOSYS,%eax
jmp sysenter_after_call
 END(syscall_badsys)
CFI_ENDPROC
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] x86_32, entry: store badsys error code in %eax

2014-07-21 Thread Sven Wegener
On Mon, 21 Jul 2014, Andy Lutomirski wrote:

> On Sun, Jul 20, 2014 at 2:33 PM, Sven Wegener  
> wrote:
> > Commit 554086d ("x86_32, entry: Do syscall exit work on badsys
> > (CVE-2014-4508)") introduced a subtle regression in the x86_32 syscall
> > entry code, resulting in syscall() not returning proper errors for
> > non-existing syscalls on CPUs not supporting the sysenter feature.
> 
> s/not supporting/supporting/

Looks like I mixed the sep vs. syscall CPU flag. Initially I encountered 
the issue on real hardware (Celeron) having the sep but not the syscall 
flag. During testing it worked on an emulated CPU missing the sep and 
having the syscall flag and broke on an emulated CPU having the sep and 
missing the syscall flag. I only looked at the syscall flag, which is 
completely invariant for this issue, and assumed it stands for sysenter 
support, completely ignoring the sep flag.

Sven
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] x86_32, entry: fix sysenter_badsys END symbol

2014-07-20 Thread Sven Wegener
Signed-off-by: Sven Wegener 
Cc: Andy Lutomirski 
---
 arch/x86/kernel/entry_32.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 793f6c9..0958f2b 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -682,7 +682,7 @@ END(syscall_badsys)
 sysenter_badsys:
movl $-ENOSYS,%eax
jmp sysenter_after_call
-END(syscall_badsys)
+END(sysenter_badsys)
CFI_ENDPROC
 
 .macro FIXUP_ESPFIX_STACK
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] x86_32, entry: use syscall_badsys from syscall_trace_entry

2014-07-20 Thread Sven Wegener
Bring syscall_trace_entry in line with the other syscall_* error
handling code and use syscall_badsys for setting -ENOSYS.

Signed-off-by: Sven Wegener 
---
 arch/x86/kernel/entry_32.S | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 0d0c9d4..793f6c9 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -644,13 +644,12 @@ END(work_pending)
# perform syscall exit tracing
ALIGN
 syscall_trace_entry:
-   movl $-ENOSYS,PT_EAX(%esp)
movl %esp, %eax
call syscall_trace_enter
/* What it returned is what we'll actually use.  */
cmpl $(NR_syscalls), %eax
-   jnae syscall_call
-   jmp syscall_exit
+   jae syscall_badsys
+   jmp syscall_call
 END(syscall_trace_entry)
 
# perform syscall exit tracing
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] x86_32, entry: store badsys error code in %eax

2014-07-20 Thread Sven Wegener
Commit 554086d ("x86_32, entry: Do syscall exit work on badsys
(CVE-2014-4508)") introduced a subtle regression in the x86_32 syscall
entry code, resulting in syscall() not returning proper errors for
non-existing syscalls on CPUs not supporting the sysenter feature.

The following code:

> int result = syscall(666);
> printf("result=%d errno=%d error=%s\n", result, errno, strerror(errno));

results in:

> result=666 errno=0 error=Success

Obviously, the syscall return value is the called syscall number, but it
should have been an ENOSYS error. When run under ptrace it behaves
correctly, which makes it hard to debug in the wild:

> result=-1 errno=38 error=Function not implemented

The %eax register is the return value register. For debugging via ptrace
the syscall entry code stores the complete register context on the
stack. The badsys handlers only store the ENOSYS error code in the
ptrace register set and do not set %eax like a regular syscall handler
would. The old resume_userspace call chain contains code that clobbers
%eax and it restores %eax from the ptrace registers afterwards. The same
goes for the ptrace-enabled call chain. When ptrace is not used, the
syscall return value is the passed-in syscall number from the
%eax register.

Use %eax as the return value register in syscall_badsys and
sysenter_badsys, like a real syscall handler does, and have the caller
push the value onto the stack for ptrace access.

Signed-off-by: Sven Wegener 
---
 arch/x86/kernel/entry_32.S | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index dbaa23e..0d0c9d4 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -425,8 +425,8 @@ sysenter_do_call:
cmpl $(NR_syscalls), %eax
jae sysenter_badsys
call *sys_call_table(,%eax,4)
-   movl %eax,PT_EAX(%esp)
 sysenter_after_call:
+   movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
@@ -502,6 +502,7 @@ ENTRY(system_call)
jae syscall_badsys
 syscall_call:
call *sys_call_table(,%eax,4)
+syscall_after_call:
movl %eax,PT_EAX(%esp)  # store the return value
 syscall_exit:
LOCKDEP_SYS_EXIT
@@ -675,12 +676,12 @@ syscall_fault:
 END(syscall_fault)
 
 syscall_badsys:
-   movl $-ENOSYS,PT_EAX(%esp)
-   jmp syscall_exit
+   movl $-ENOSYS,%eax
+   jmp syscall_after_call
 END(syscall_badsys)
 
 sysenter_badsys:
-   movl $-ENOSYS,PT_EAX(%esp)
+   movl $-ENOSYS,%eax
jmp sysenter_after_call
 END(syscall_badsys)
CFI_ENDPROC
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] NFSv4: Check for buffer length in __nfs4_get_acl_uncached

2012-12-12 Thread Sven Wegener
Commit 1f1ea6c "NFSv4: Fix buffer overflow checking in
__nfs4_get_acl_uncached" accidentally dropped the checking for too small
result buffer length.

If someone uses getxattr on "system.nfs4_acl" on an NFSv4 mount
supporting ACLs, the ACL has not been cached and the buffer supplied is
too short, we still copy the complete ACL, resulting in kernel and user
space memory corruption.

Signed-off-by: Sven Wegener 
Cc: sta...@kernel.org
---
 fs/nfs/nfs4proc.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

Resending, because it did not get any response.

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7bff871..f15be6b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3831,8 +3831,13 @@ static ssize_t __nfs4_get_acl_uncached(struct inode 
*inode, void *buf, size_t bu
goto out_free;
}
nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len);
-   if (buf)
+   if (buf) {
+   if (res.acl_len > buflen) {
+   ret = -ERANGE;
+   goto out_free;
+   }
_copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len);
+   }
 out_ok:
ret = res.acl_len;
 out_free:
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] ipvs: Cleanup sync daemon code

2008-02-10 Thread Sven Wegener

On Sat, 9 Feb 2008, Christoph Hellwig wrote:


On Sun, Feb 10, 2008 at 12:38:11AM +0100, Sven Wegener wrote:

 struct ip_vs_sync_thread_data {
-   struct completion *startup;
+   struct completion *startup; /* set to NULL once completed */


This is not needed anymore.  kthread_run guarantees that the newly
created thread is run before returning to the caller.


The completion is currently used to return an error code for errors that 
happen during initialization in the threads (open socket, allocate 
memory). We could move the setup code out of the threads and have them 
only run an error-safe loop.



+/* wait queue for master sync daemon */
+static DECLARE_WAIT_QUEUE_HEAD(sync_master_wait);


I don't think you need this one either.  You can use wake_up_process
on the task_struct pointer instead.


Thanks, now using schedule_timeout with wake_up_process.


spin_lock(&ip_vs_sync_lock);
list_add_tail(&sb->list, &ip_vs_sync_queue);
+   if (++ip_vs_sync_count == 10)
+   wake_up_interruptible(&sync_master_wait);
spin_unlock(&ip_vs_sync_lock);
 }



-static int sync_thread(void *startup)
+static int sync_thread(void *data)


Btw, it might make sense to remove sync_thread and just call the
master and backup threads directly.


When the setup code has been moved out of the threads, the code gets much 
simpler.



+void __init ip_vs_sync_init(void)
+{
+   /* set up multicast address */
+   mcast_addr.sin_family = AF_INET;
+   mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
+   mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);
 }


Why can't this be initialized at compile time by:

static struct sockaddr_in mcast_addr = {
.sin_family = AF_INET,
.sin_port   = htons(IP_VS_SYNC_PORT),
.sin_addr.s_addr= htonl(IP_VS_SYNC_GROUP),
}

(the hton* might need __constant_hton* also I'm not sure without trying)


Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC] ipvs: Cleanup sync daemon code

2008-02-09 Thread Sven Wegener

Hi all,

I'd like to get your feedback on this:

- Use kthread_run instead of doing a double-fork via kernel_thread()

- Return proper error codes to user-space on failures

Currently ipvsadm --start-daemon with an invalid --mcast-interface will 
silently succeed. With these changes we get an appropriate "No such device" 
error.


- Use wait queues for both master and backup thread

Instead of doing an endless loop with sleeping for one second, we now use 
wait queues. The master sync daemon has its own wait queue and gets woken 
up when we have enough data to send and also at a regular interval. The 
backup sync daemon sits on the wait queue of the mcast socket and gets 
woken up as soon as we have data to process.


diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 56f3c94..519bd96 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -890,6 +890,7 @@ extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
 extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid);
 extern int stop_sync_thread(int state);
 extern void ip_vs_sync_conn(struct ip_vs_conn *cp);
+extern void ip_vs_sync_init(void);


 /*
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
index 963981a..0ccee4b 100644
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -1071,6 +1071,8 @@ static int __init ip_vs_init(void)
 {
int ret;

+   ip_vs_sync_init();
+
ret = ip_vs_control_init();
if (ret < 0) {
IP_VS_ERR("can't setup control.\n");
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 948378d..36063d3 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -29,6 +29,9 @@
 #include 
 #include  /* for ip_mc_join_group */
 #include 
+#include 
+#include 
+#include 

 #include 
 #include 
@@ -68,7 +71,8 @@ struct ip_vs_sync_conn_options {
 };

 struct ip_vs_sync_thread_data {
-   struct completion *startup;
+   struct completion *startup; /* set to NULL once completed */
+   int *retval; /* only valid until startup is completed */
int state;
 };

@@ -123,9 +127,10 @@ struct ip_vs_sync_buff {
 };


-/* the sync_buff list head and the lock */
+/* the sync_buff list head, the lock and the counter */
 static LIST_HEAD(ip_vs_sync_queue);
 static DEFINE_SPINLOCK(ip_vs_sync_lock);
+static unsigned int ip_vs_sync_count;

 /* current sync_buff for accepting new conn entries */
 static struct ip_vs_sync_buff   *curr_sb = NULL;
@@ -140,6 +145,13 @@ volatile int ip_vs_backup_syncid = 0;
 char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
 char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];

+/* sync daemon tasks */
+static struct task_struct *sync_master_thread;
+static struct task_struct *sync_backup_thread;
+
+/* wait queue for master sync daemon */
+static DECLARE_WAIT_QUEUE_HEAD(sync_master_wait);
+
 /* multicast addr */
 static struct sockaddr_in mcast_addr;

@@ -148,6 +160,8 @@ static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
 {
spin_lock(&ip_vs_sync_lock);
list_add_tail(&sb->list, &ip_vs_sync_queue);
+   if (++ip_vs_sync_count == 10)
+   wake_up_interruptible(&sync_master_wait);
spin_unlock(&ip_vs_sync_lock);
 }

@@ -163,6 +177,7 @@ static inline struct ip_vs_sync_buff * sb_dequeue(void)
struct ip_vs_sync_buff,
list);
list_del(&sb->list);
+   ip_vs_sync_count--;
}
spin_unlock_bh(&ip_vs_sync_lock);

@@ -536,14 +551,17 @@ static int bind_mcastif_addr(struct socket *sock, char 
*ifname)
 static struct socket * make_send_sock(void)
 {
struct socket *sock;
+   int result;

/* First create a socket */
-   if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
+   result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+   if (result < 0) {
IP_VS_ERR("Error during creation of socket; terminating\n");
-   return NULL;
+   return ERR_PTR(result);
}

-   if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
+   result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
+   if (result < 0) {
IP_VS_ERR("Error setting outbound mcast interface\n");
goto error;
}
@@ -551,14 +569,16 @@ static struct socket * make_send_sock(void)
set_mcast_loop(sock->sk, 0);
set_mcast_ttl(sock->sk, 1);

-   if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
+   result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
+   if (result < 0) {
IP_VS_ERR("Error binding address of the mcast interface\n");
goto error;
}

-   if (sock->ops->connect(sock,
-  (struct sockaddr*)&mcast_addr,
-  sizeof(struct sockaddr), 0) < 0) {
+   result = sock->ops->connect

Re: [PATCH] leds: Add support for power LED on WRAP systems

2008-02-07 Thread Sven Wegener

On Thu, 7 Feb 2008, Richard Purdie wrote:


On Tue, 2008-02-05 at 21:51 +0100, Sven Wegener wrote:

WRAP systems have an additional LED. The power LED is normally lit after boot
and doesn't serve any other purpose besides showing that the system is powered
on. Nevertheless, its state is controllable and we can attach a trigger to it.


There is already a patch queued up to do something like this in the LED
tree, can you check that does everything you need please?

http://git.o-hand.com/?p=linux-rpurdie-leds;a=shortlog;h=for-mm


It's OK, it does exactly the same as what my patch implements.

Sven
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] ipvs: Make the synchronization interval controllable

2008-02-06 Thread Sven Wegener
The default synchronization interval of 1000 milliseconds is too high for a
heavily loaded director. Collecting the connection information from one second
and then sending it out in a burst will overflow the socket buffer and lead to
synchronization information being dropped. Make the interval controllable by a
sysctl variable so that users can tune it. We enforce a lower limit of 0 and an
upper limit of 2000 ms on the interval. A too large interval can make the
synchronization buffer consume too much memory and will also delay the exit of
the kernel threads.

Signed-off-by: Sven Wegener <[EMAIL PROTECTED]>
---

Changes from the last version include the addition of the range enforcement.
Also place the definitions of the variables where all other ipvs sysctl
variables are.

Documentation/networking/ipvs-sysctl.txt |   10 ++
 include/net/ip_vs.h  |1 +
 net/ipv4/ipvs/ip_vs_ctl.c|   12 
 net/ipv4/ipvs/ip_vs_sync.c   |4 ++--
 4 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/ipvs-sysctl.txt 
b/Documentation/networking/ipvs-sysctl.txt
index 4ccdbca..bb4eb9a 100644
--- a/Documentation/networking/ipvs-sysctl.txt
+++ b/Documentation/networking/ipvs-sysctl.txt
@@ -141,3 +141,13 @@ sync_threshold - INTEGER
 synchronized, every time the number of its incoming packets
 modulus 50 equals the threshold. The range of the threshold is
 from 0 to 49.
+
+sync_interval - INTEGER
+   default 1000
+
+   The information from synchronization is buffered and sent out at a
+   regular interval by a kernel thread. The interval (in ms) is
+   controlled by this value. The default is too high for a heavily loaded
+   director. If you get a lot of "ip_vs_send_async error" messages from
+   your kernel, then you should lower this value. The value of the
+   interval can be chosen from the range from 0 to 2000.
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 56f3c94..9c4498b 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -854,6 +854,7 @@ extern int sysctl_ip_vs_cache_bypass;
 extern int sysctl_ip_vs_expire_nodest_conn;
 extern int sysctl_ip_vs_expire_quiescent_template;
 extern int sysctl_ip_vs_sync_threshold[2];
+extern int sysctl_ip_vs_sync_interval;
 extern int sysctl_ip_vs_nat_icmp_send;
 extern struct ip_vs_stats ip_vs_stats;
 extern struct ctl_path net_vs_ctl_path[];
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 94c5767..c6322f7 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -80,8 +80,11 @@ int sysctl_ip_vs_cache_bypass = 0;
 int sysctl_ip_vs_expire_nodest_conn = 0;
 int sysctl_ip_vs_expire_quiescent_template = 0;
 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
+int sysctl_ip_vs_sync_interval = 1000;
 int sysctl_ip_vs_nat_icmp_send = 0;
 
+static int ip_vs_sync_interval_min = 0;
+static int ip_vs_sync_interval_max = 2000;
 
 #ifdef CONFIG_IP_VS_DEBUG
 static int sysctl_ip_vs_debug_level = 0;
@@ -1582,6 +1585,15 @@ static struct ctl_table vs_vars[] = {
.proc_handler   = &proc_do_sync_threshold,
},
{
+   .procname   = "sync_interval",
+   .data   = &sysctl_ip_vs_sync_interval,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = &proc_dointvec_minmax,
+   .extra1 = &ip_vs_sync_interval_min,
+   .extra2 = &ip_vs_sync_interval_max,
+   },
+   {
.procname   = "nat_icmp_send",
.data   = &sysctl_ip_vs_nat_icmp_send,
.maxlen = sizeof(int),
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 948378d..10ab1b7 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -701,7 +701,7 @@ static void sync_master_loop(void)
if (stop_master_sync)
break;
 
-   msleep_interruptible(1000);
+   msleep_interruptible(sysctl_ip_vs_sync_interval);
}
 
/* clean up the sync_buff queue */
@@ -758,7 +758,7 @@ static void sync_backup_loop(void)
if (stop_backup_sync)
break;
 
-   msleep_interruptible(1000);
+   msleep_interruptible(sysctl_ip_vs_sync_interval);
}
 
/* release the sending multicast socket */
-- 
1.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ipvs: Make the synchronization interval controllable

2008-02-06 Thread Sven Wegener

On Wed, 6 Feb 2008, David Rientjes wrote:


On Wed, 6 Feb 2008, Sven Wegener wrote:


diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 948378d..9b57ad3 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -143,6 +143,8 @@ char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
 /* multicast addr */
 static struct sockaddr_in mcast_addr;

+/* milliseconds between synchronization runs */
+int sysctl_ip_vs_sync_interval = 1000;

 static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
 {


How useful is a negative ip_vs_sync_interval?


Negative values will be converted to MAX_JIFFY_OFFSET by msecs_to_jiffies 
and result in a very long interval. A too long interval will be a good way 
to get your system OOM. We could use an unsigned int or even restrict the 
value with proc_dointvec_minmax. I'd prefer the latter, that's what I 
already had in my mind and it also protects from unintentionally choosing 
a too long interval.


Sven
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] ipvs: Make the synchronization interval controllable

2008-02-06 Thread Sven Wegener
The default synchronization interval of 1000 milliseconds is too high for a
heavily loaded director. Collecting the connection information from one second
and then sending it out in a burst will overflow the socket buffer and lead to
synchronization information being dropped. Make the interval controllable by a
sysctl variable so that users can tune it.

Signed-off-by: Sven Wegener <[EMAIL PROTECTED]>
---
 Documentation/networking/ipvs-sysctl.txt |9 +
 net/ipv4/ipvs/ip_vs_ctl.c|9 -
 net/ipv4/ipvs/ip_vs_sync.c   |6 --
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/ipvs-sysctl.txt 
b/Documentation/networking/ipvs-sysctl.txt
index 4ccdbca..1389e2f 100644
--- a/Documentation/networking/ipvs-sysctl.txt
+++ b/Documentation/networking/ipvs-sysctl.txt
@@ -141,3 +141,12 @@ sync_threshold - INTEGER
 synchronized, every time the number of its incoming packets
 modulus 50 equals the threshold. The range of the threshold is
 from 0 to 49.
+
+sync_interval - INTEGER
+   default 1000
+
+   The information from synchronization is buffered and sent out at
+   regular intervals by a kernel thread. The interval (in ms) is
+   controlled by this value. The default is too high for a heavily loaded
+   director. If you get a lot of "ip_vs_send_async error" messages from
+   your kernel, then you should lower this value.
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 94c5767..2781505 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -81,7 +81,7 @@ int sysctl_ip_vs_expire_nodest_conn = 0;
 int sysctl_ip_vs_expire_quiescent_template = 0;
 int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
 int sysctl_ip_vs_nat_icmp_send = 0;
-
+extern int sysctl_ip_vs_sync_interval;
 
 #ifdef CONFIG_IP_VS_DEBUG
 static int sysctl_ip_vs_debug_level = 0;
@@ -1582,6 +1582,13 @@ static struct ctl_table vs_vars[] = {
.proc_handler   = &proc_do_sync_threshold,
},
{
+   .procname   = "sync_interval",
+   .data   = &sysctl_ip_vs_sync_interval,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = &proc_dointvec,
+   },
+   {
.procname   = "nat_icmp_send",
.data   = &sysctl_ip_vs_nat_icmp_send,
.maxlen = sizeof(int),
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 948378d..9b57ad3 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -143,6 +143,8 @@ char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
 /* multicast addr */
 static struct sockaddr_in mcast_addr;
 
+/* milliseconds between synchronization runs */
+int sysctl_ip_vs_sync_interval = 1000;
 
 static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
 {
@@ -701,7 +703,7 @@ static void sync_master_loop(void)
if (stop_master_sync)
break;
 
-   msleep_interruptible(1000);
+   msleep_interruptible(sysctl_ip_vs_sync_interval);
}
 
/* clean up the sync_buff queue */
@@ -758,7 +760,7 @@ static void sync_backup_loop(void)
if (stop_backup_sync)
break;
 
-   msleep_interruptible(1000);
+   msleep_interruptible(sysctl_ip_vs_sync_interval);
}
 
/* release the sending multicast socket */
-- 
1.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] ipvs: Make wrr "no available servers" error message rate-limited

2008-02-05 Thread Sven Wegener
The "no available servers" message is an error rather than something
informational. It should also be rate-limited; otherwise we are going to flood
our logs on a busy director when all real servers are out of order with a
weight of zero.

Signed-off-by: Sven Wegener <[EMAIL PROTECTED]>
---

Actually, do we need this message at all? The wrr scheduler is the only one
printing an error message in such a case.

 net/ipv4/ipvs/ip_vs_wrr.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
index 749fa04..85c680a 100644
--- a/net/ipv4/ipvs/ip_vs_wrr.c
+++ b/net/ipv4/ipvs/ip_vs_wrr.c
@@ -22,6 +22,7 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/net.h>
 
 #include <net/ip_vs.h>
 
@@ -169,7 +170,7 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 */
if (mark->cw == 0) {
mark->cl = &svc->destinations;
-   IP_VS_INFO("ip_vs_wrr_schedule(): "
+   IP_VS_ERR_RL("ip_vs_wrr_schedule(): "
   "no available servers\n");
dest = NULL;
goto out;
-- 
1.5.3.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] leds: Add support for power LED on WRAP systems

2008-02-05 Thread Sven Wegener
WRAP systems have an additional LED. The power LED is normally lit after boot
and doesn't serve any other purpose besides showing that the system is powered
on. Nevertheless, its state is controllable and we can attach a trigger to it.

Cc: Kristian Kielhofner <[EMAIL PROTECTED]>
Signed-off-by: Sven Wegener <[EMAIL PROTECTED]>
---
 drivers/leds/leds-wrap.c |   41 +++++++++++++++++++++++++++++++++++------
 1 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/drivers/leds/leds-wrap.c b/drivers/leds/leds-wrap.c
index 27fb2d8..0ccb483 100644
--- a/drivers/leds/leds-wrap.c
+++ b/drivers/leds/leds-wrap.c
@@ -19,11 +19,21 @@
 #include <linux/scx200_gpio.h>
 
 #define DRVNAME "wrap-led"
+#define WRAP_POWER_LED_GPIO	2
 #define WRAP_ERROR_LED_GPIO	3
 #define WRAP_EXTRA_LED_GPIO	18
 
 static struct platform_device *pdev;
 
+static void wrap_power_led_set(struct led_classdev *led_cdev,
+   enum led_brightness value)
+{
+   if (value)
+   scx200_gpio_set_low(WRAP_POWER_LED_GPIO);
+   else
+   scx200_gpio_set_high(WRAP_POWER_LED_GPIO);
+}
+
 static void wrap_error_led_set(struct led_classdev *led_cdev,
enum led_brightness value)
 {
@@ -42,6 +52,11 @@ static void wrap_extra_led_set(struct led_classdev *led_cdev,
scx200_gpio_set_high(WRAP_EXTRA_LED_GPIO);
 }
 
+static struct led_classdev wrap_power_led = {
+   .name   = "wrap:power",
+   .brightness_set = wrap_power_led_set,
+};
+
 static struct led_classdev wrap_error_led = {
.name   = "wrap:error",
.brightness_set = wrap_error_led_set,
@@ -56,6 +71,7 @@ static struct led_classdev wrap_extra_led = {
 static int wrap_led_suspend(struct platform_device *dev,
pm_message_t state)
 {
+   led_classdev_suspend(&wrap_power_led);
led_classdev_suspend(&wrap_error_led);
led_classdev_suspend(&wrap_extra_led);
return 0;
@@ -63,6 +79,7 @@ static int wrap_led_suspend(struct platform_device *dev,
 
 static int wrap_led_resume(struct platform_device *dev)
 {
+   led_classdev_resume(&wrap_power_led);
led_classdev_resume(&wrap_error_led);
led_classdev_resume(&wrap_extra_led);
return 0;
@@ -76,17 +93,30 @@ static int wrap_led_probe(struct platform_device *pdev)
 {
int ret;
 
+   ret = led_classdev_register(&pdev->dev, &wrap_power_led);
+   if (ret < 0)
+   goto out;
ret = led_classdev_register(&pdev->dev, &wrap_error_led);
-   if (ret == 0) {
-   ret = led_classdev_register(&pdev->dev, &wrap_extra_led);
-   if (ret < 0)
-   led_classdev_unregister(&wrap_error_led);
-   }
+   if (ret < 0)
+   goto outpower;
+   ret = led_classdev_register(&pdev->dev, &wrap_extra_led);
+   if (ret < 0)
+   goto outerror;
+
+   return 0;
+
+outerror:
+   led_classdev_unregister(&wrap_error_led);
+outpower:
+   led_classdev_unregister(&wrap_power_led);
+out:
+
return ret;
 }
 
 static int wrap_led_remove(struct platform_device *pdev)
 {
+   led_classdev_unregister(&wrap_power_led);
led_classdev_unregister(&wrap_error_led);
led_classdev_unregister(&wrap_extra_led);
return 0;
@@ -139,4 +169,3 @@ module_exit(wrap_led_exit);
 MODULE_AUTHOR("Kristian Kielhofner <[EMAIL PROTECTED]>");
 MODULE_DESCRIPTION("PCEngines WRAP LED driver");
 MODULE_LICENSE("GPL");
-
-- 
1.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/