[take22 1/4] kevent: Core files.
Core files. This patch includes core kevent files: * userspace controlling * kernelspace interfaces * initialization * notification state machines Some bits of documentation can be found on project's homepage (and links from there): http://tservice.net.ru/~s0mbre/old/?section=projectsitem=kevent Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED] diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index 7e639f7..a9560eb 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -318,3 +318,6 @@ ENTRY(sys_call_table) .long sys_vmsplice .long sys_move_pages .long sys_getcpu + .long sys_kevent_get_events + .long sys_kevent_ctl/* 320 */ + .long sys_kevent_wait diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index b4aa875..cf18955 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -714,8 +714,11 @@ #endif .quad compat_sys_get_robust_list .quad sys_splice .quad sys_sync_file_range - .quad sys_tee + .quad sys_tee /* 315 */ .quad compat_sys_vmsplice .quad compat_sys_move_pages .quad sys_getcpu + .quad sys_kevent_get_events + .quad sys_kevent_ctl/* 320 */ + .quad sys_kevent_wait ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index bd99870..f009677 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -324,10 +324,13 @@ #define __NR_tee 315 #define __NR_vmsplice 316 #define __NR_move_pages317 #define __NR_getcpu318 +#define __NR_kevent_get_events 319 +#define __NR_kevent_ctl320 +#define __NR_kevent_wait 321 #ifdef __KERNEL__ -#define NR_syscalls 319 +#define NR_syscalls 322 #include linux/err.h /* diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 6137146..c53d156 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -619,10 +619,16 @@ #define __NR_vmsplice 278 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages279 __SYSCALL(__NR_move_pages, 
sys_move_pages) +#define __NR_kevent_get_events 280 +__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events) +#define __NR_kevent_ctl281 +__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl) +#define __NR_kevent_wait 282 +__SYSCALL(__NR_kevent_wait, sys_kevent_wait) #ifdef __KERNEL__ -#define __NR_syscall_max __NR_move_pages +#define __NR_syscall_max __NR_kevent_wait #include linux/err.h #ifndef __NO_STUBS diff --git a/include/linux/kevent.h b/include/linux/kevent.h new file mode 100644 index 000..743b328 --- /dev/null +++ b/include/linux/kevent.h @@ -0,0 +1,205 @@ +/* + * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED] + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __KEVENT_H +#define __KEVENT_H +#include linux/types.h +#include linux/list.h +#include linux/rbtree.h +#include linux/spinlock.h +#include linux/mutex.h +#include linux/wait.h +#include linux/net.h +#include linux/rcupdate.h +#include linux/kevent_storage.h +#include linux/ukevent.h + +#define KEVENT_MIN_BUFFS_ALLOC 3 + +struct kevent; +struct kevent_storage; +typedef int (* kevent_callback_t)(struct kevent *); + +/* @callback is called each time new event has been caught. */ +/* @enqueue is called each time new event is queued. */ +/* @dequeue is called each time event is dequeued. 
*/ + +struct kevent_callbacks { + kevent_callback_t callback, enqueue, dequeue; +}; + +#define KEVENT_READY 0x1 +#define KEVENT_STORAGE 0x2 +#define KEVENT_USER0x4 + +struct kevent +{ + /* Used for kevent freeing.*/ + struct rcu_head rcu_head; + struct ukevent event; + /* This lock protects ukevent manipulations, e.g. ret_flags changes. */ + spinlock_t ulock; + + /* Entry of user's tree. */ + struct rb_node kevent_node; + /* Entry of origin's queue. */ + struct list_headstorage_entry; + /* Entry of user's ready. */ + struct
Re: [take21 1/4] kevent: Core files.
+/* + * Called under kevent_user-ready_lock, so updates are always protected. + */ +int kevent_user_ring_add_event(struct kevent *k) +{ + unsigned int pidx, off; + struct kevent_mring *ring, *copy_ring; + + ring = k-user-pring[0]; + + if ((ring-kidx + 1 == ring-uidx) || + ((ring-kidx + 1 == KEVENT_MAX_EVENTS) ring-uidx == 0)) { + if (k-user-overflow_kevent == NULL) + k-user-overflow_kevent = k; + return -EAGAIN; + } + I really dont understand how you manage to queue multiple kevents in the 'overflow list'. You just queue one kevent at most. What am I missing ? + + for (i=0; iKEVENT_MAX_PAGES; ++i) { + u-pring[i] = (struct kevent_mring *)__get_free_page(GFP_KERNEL); + if (!u-pring[i]) + break; + } + + if (i != KEVENT_MAX_PAGES) + goto err_out_free; Why dont you use goto directly ? if (!u-pring[i]) goto err_out_free; + + u-pring[0]-uidx = u-pring[0]-kidx = 0; + + return 0; + +err_out_free: + for (i=0; iKEVENT_MAX_PAGES; ++i) { + if (!u-pring[i]) + break; + + free_page((unsigned long)u-pring[i]); + } + return k; +} + +static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg) +{ + int err, cerr = 0, knum = 0, rnum = 0, i; + void __user *orig = arg; + struct ukevent uk; + + mutex_lock(u-ctl_mutex); + + err = -EINVAL; + if (num KEVENT_MIN_BUFFS_ALLOC) { + struct ukevent *ukev; + + ukev = kevent_get_user(num, arg); + if (ukev) { + for (i = 0; i num; ++i) { + err = kevent_user_add_ukevent(ukev[i], u); + if (err) { + kevent_stat_im(u); + if (i != rnum) + memcpy(ukev[rnum], ukev[i], sizeof(struct ukevent)); + rnum++; + } else + knum++; Why are you using/counting knum ? 
+ } + if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent))) + cerr = -EFAULT; + kfree(ukev); + goto out_setup; + } + } + + for (i = 0; i num; ++i) { + if (copy_from_user(uk, arg, sizeof(struct ukevent))) { + cerr = -EFAULT; + break; + } + arg += sizeof(struct ukevent); + + err = kevent_user_add_ukevent(uk, u); + if (err) { + kevent_stat_im(u); + if (copy_to_user(orig, uk, sizeof(struct ukevent))) { + cerr = -EFAULT; + break; + } + orig += sizeof(struct ukevent); + rnum++; + } else + knum++; + } + +out_setup: + if (cerr 0) { + err = cerr; + goto out_remove; + } + + err = rnum; +out_remove: + mutex_unlock(u-ctl_mutex); + + return err; +} - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take21 1/4] kevent: Core files.
On Sat, Oct 28, 2006 at 12:28:12PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: +/* + * Called under kevent_user-ready_lock, so updates are always protected. + */ +int kevent_user_ring_add_event(struct kevent *k) +{ + unsigned int pidx, off; + struct kevent_mring *ring, *copy_ring; + + ring = k-user-pring[0]; + + if ((ring-kidx + 1 == ring-uidx) || + ((ring-kidx + 1 == KEVENT_MAX_EVENTS) ring-uidx == 0)) { + if (k-user-overflow_kevent == NULL) + k-user-overflow_kevent = k; + return -EAGAIN; + } + I really dont understand how you manage to queue multiple kevents in the 'overflow list'. You just queue one kevent at most. What am I missing ? There is no overflow list - it is a pointer to the first kevent in the ready queue, which was not put into ring buffer. It is an optimisation, which allows to not search for that position each time new event should be placed into the buffer, when it starts to have an empty slot. + +for (i=0; iKEVENT_MAX_PAGES; ++i) { +u-pring[i] = (struct kevent_mring *)__get_free_page(GFP_KERNEL); +if (!u-pring[i]) +break; +} + +if (i != KEVENT_MAX_PAGES) +goto err_out_free; Why dont you use goto directly ? if (!u-pring[i]) goto err_out_free; I used a fallback mode here which allowed to use smaller number of pages for kevent ring buffer, but then decided to drop it. So it is possible to use goto directly. 
+ +u-pring[0]-uidx = u-pring[0]-kidx = 0; + +return 0; + +err_out_free: +for (i=0; iKEVENT_MAX_PAGES; ++i) { +if (!u-pring[i]) +break; + +free_page((unsigned long)u-pring[i]); +} +return k; +} + +static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void __user *arg) +{ +int err, cerr = 0, knum = 0, rnum = 0, i; +void __user *orig = arg; +struct ukevent uk; + +mutex_lock(u-ctl_mutex); + +err = -EINVAL; +if (num KEVENT_MIN_BUFFS_ALLOC) { +struct ukevent *ukev; + +ukev = kevent_get_user(num, arg); +if (ukev) { +for (i = 0; i num; ++i) { +err = kevent_user_add_ukevent(ukev[i], u); +if (err) { +kevent_stat_im(u); +if (i != rnum) +memcpy(ukev[rnum], ukev[i], sizeof(struct ukevent)); +rnum++; +} else +knum++; Why are you using/counting knum ? It should go avay. +} +if (copy_to_user(orig, ukev, rnum*sizeof(struct ukevent))) +cerr = -EFAULT; +kfree(ukev); +goto out_setup; +} +} + +for (i = 0; i num; ++i) { +if (copy_from_user(uk, arg, sizeof(struct ukevent))) { +cerr = -EFAULT; +break; +} +arg += sizeof(struct ukevent); + +err = kevent_user_add_ukevent(uk, u); +if (err) { +kevent_stat_im(u); +if (copy_to_user(orig, uk, sizeof(struct ukevent))) { +cerr = -EFAULT; +break; +} +orig += sizeof(struct ukevent); +rnum++; +} else +knum++; +} + +out_setup: +if (cerr 0) { +err = cerr; +goto out_remove; +} + +err = rnum; +out_remove: +mutex_unlock(u-ctl_mutex); + +return err; +} - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take21 1/4] kevent: Core files.
Evgeniy Polyakov a e'crit : On Sat, Oct 28, 2006 at 12:28:12PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: I really dont understand how you manage to queue multiple kevents in the 'overflow list'. You just queue one kevent at most. What am I missing ? There is no overflow list - it is a pointer to the first kevent in the ready queue, which was not put into ring buffer. It is an optimisation, which allows to not search for that position each time new event should be placed into the buffer, when it starts to have an empty slot. This overflow list (you may call it differently, but still it IS a list), is not complete. I feel you add it just to make me happy, but I am not (yet :) ) For example, you make no test at kevent_finish_user_complete() time. Obviously, you can have a dangling pointer, and crash your box in certain conditions. static void kevent_finish_user_complete(struct kevent *k, int deq) { struct kevent_user *u = k-user; unsigned long flags; if (deq) kevent_dequeue(k); spin_lock_irqsave(u-ready_lock, flags); if (k-flags KEVENT_READY) { + if (u-overflow_event == k) { + /* MUST do something to change u-overflow_kevent */ + } list_del(k-ready_entry); k-flags = ~KEVENT_READY; u-ready_num--; } spin_unlock_irqrestore(u-ready_lock, flags); kevent_user_put(u); call_rcu(k-rcu_head, kevent_free_rcu); } Eric - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take21 1/4] kevent: Core files.
On Sat, Oct 28, 2006 at 02:36:31PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: Evgeniy Polyakov a e'crit : On Sat, Oct 28, 2006 at 12:28:12PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: I really dont understand how you manage to queue multiple kevents in the 'overflow list'. You just queue one kevent at most. What am I missing ? There is no overflow list - it is a pointer to the first kevent in the ready queue, which was not put into ring buffer. It is an optimisation, which allows to not search for that position each time new event should be placed into the buffer, when it starts to have an empty slot. This overflow list (you may call it differently, but still it IS a list), is not complete. I feel you add it just to make me happy, but I am not (yet :) ) There is no overflow list. There is ready queue, part of which (first several entries) is copied into the ring buffer, overflow_kevent is a pointer to the first kevent which was not copied. For example, you make no test at kevent_finish_user_complete() time. Obviously, you can have a dangling pointer, and crash your box in certain conditions. You are right, I did not put overflow_kevent check into all places which can remove kevent. Here is a patch I am about to commit into the kevent tree: diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c index 711a8a8..ecee668 100644 --- a/kernel/kevent/kevent_user.c +++ b/kernel/kevent/kevent_user.c @@ -235,6 +235,36 @@ static void kevent_free_rcu(struct rcu_h } /* + * Must be called under u-ready_lock. + * This function removes kevent from ready queue and + * tries to add new kevent into ring buffer. 
+ */ +static void kevent_remove_ready(struct kevent *k) +{ + struct kevent_user *u = k-user; + + list_del(k-ready_entry); + k-flags = ~KEVENT_READY; + u-ready_num--; + if (++u-pring[0]-uidx == KEVENT_MAX_EVENTS) + u-pring[0]-uidx = 0; + + if (u-overflow_kevent) { + int err; + + err = kevent_user_ring_add_event(u-overflow_kevent); + if (!err || u-overflow_kevent == k) { + if (u-overflow_kevent-ready_entry.next == u-ready_list) + u-overflow_kevent = NULL; + else + u-overflow_kevent = + list_entry(u-overflow_kevent-ready_entry.next, + struct kevent, ready_entry); + } + } +} + +/* * Complete kevent removing - it dequeues kevent from storage list * if it is requested, removes kevent from ready list, drops userspace * control block reference counter and schedules kevent freeing through RCU. @@ -248,11 +278,8 @@ static void kevent_finish_user_complete( kevent_dequeue(k); spin_lock_irqsave(u-ready_lock, flags); - if (k-flags KEVENT_READY) { - list_del(k-ready_entry); - k-flags = ~KEVENT_READY; - u-ready_num--; - } + if (k-flags KEVENT_READY) + kevent_remove_ready(k); spin_unlock_irqrestore(u-ready_lock, flags); kevent_user_put(u); @@ -303,25 +330,7 @@ static struct kevent *kqueue_dequeue_rea spin_lock_irqsave(u-ready_lock, flags); if (u-ready_num !list_empty(u-ready_list)) { k = list_entry(u-ready_list.next, struct kevent, ready_entry); - list_del(k-ready_entry); - k-flags = ~KEVENT_READY; - u-ready_num--; - if (++u-pring[0]-uidx == KEVENT_MAX_EVENTS) - u-pring[0]-uidx = 0; - - if (u-overflow_kevent) { - int err; - - err = kevent_user_ring_add_event(u-overflow_kevent); - if (!err) { - if (u-overflow_kevent-ready_entry.next == u-ready_list) - u-overflow_kevent = NULL; - else - u-overflow_kevent = - list_entry(u-overflow_kevent-ready_entry.next, - struct kevent, ready_entry); - } - } + kevent_remove_ready(k); } spin_unlock_irqrestore(u-ready_lock, flags); It tries to put next kevent into the ring and thus update overflow_kevent if new kevent has been put into the buffer or 
kevent being removed is overflow kevent. Patch depends on committed changes of returned error numbers and unused variables cleanup, it will be included into next patchset if there are no problems with it. -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take21 1/4] kevent: Core files.
Evgeniy Polyakov a e'crit : On Sat, Oct 28, 2006 at 02:36:31PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: Evgeniy Polyakov a e'crit : On Sat, Oct 28, 2006 at 12:28:12PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: I really dont understand how you manage to queue multiple kevents in the 'overflow list'. You just queue one kevent at most. What am I missing ? There is no overflow list - it is a pointer to the first kevent in the ready queue, which was not put into ring buffer. It is an optimisation, which allows to not search for that position each time new event should be placed into the buffer, when it starts to have an empty slot. This overflow list (you may call it differently, but still it IS a list), is not complete. I feel you add it just to make me happy, but I am not (yet :) ) There is no overflow list. There is ready queue, part of which (first several entries) is copied into the ring buffer, overflow_kevent is a pointer to the first kevent which was not copied. For example, you make no test at kevent_finish_user_complete() time. Obviously, you can have a dangling pointer, and crash your box in certain conditions. You are right, I did not put overflow_kevent check into all places which can remove kevent. Here is a patch I am about to commit into the kevent tree: diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c index 711a8a8..ecee668 100644 --- a/kernel/kevent/kevent_user.c +++ b/kernel/kevent/kevent_user.c @@ -235,6 +235,36 @@ static void kevent_free_rcu(struct rcu_h } /* + * Must be called under u-ready_lock. + * This function removes kevent from ready queue and + * tries to add new kevent into ring buffer. + */ +static void kevent_remove_ready(struct kevent *k) +{ + struct kevent_user *u = k-user; + + list_del(k-ready_entry); Arg... no You cannot call list_del() , then check overflow_kevent. I you call list_del on what happens to be the kevent pointed by overflow_kevent, you loose... 
+ k-flags = ~KEVENT_READY; + u-ready_num--; + if (++u-pring[0]-uidx == KEVENT_MAX_EVENTS) + u-pring[0]-uidx = 0; + + if (u-overflow_kevent) { + int err; + + err = kevent_user_ring_add_event(u-overflow_kevent); + if (!err || u-overflow_kevent == k) { + if (u-overflow_kevent-ready_entry.next == u-ready_list) + u-overflow_kevent = NULL; + else +u-overflow_kevent = + list_entry(u-overflow_kevent-ready_entry.next, + struct kevent, ready_entry); + } + } +} + +/* * Complete kevent removing - it dequeues kevent from storage list * if it is requested, removes kevent from ready list, drops userspace * control block reference counter and schedules kevent freeing through RCU. @@ -248,11 +278,8 @@ static void kevent_finish_user_complete( kevent_dequeue(k); spin_lock_irqsave(u-ready_lock, flags); - if (k-flags KEVENT_READY) { - list_del(k-ready_entry); - k-flags = ~KEVENT_READY; - u-ready_num--; - } + if (k-flags KEVENT_READY) + kevent_remove_ready(k); spin_unlock_irqrestore(u-ready_lock, flags); kevent_user_put(u); @@ -303,25 +330,7 @@ static struct kevent *kqueue_dequeue_rea spin_lock_irqsave(u-ready_lock, flags); if (u-ready_num !list_empty(u-ready_list)) { k = list_entry(u-ready_list.next, struct kevent, ready_entry); - list_del(k-ready_entry); - k-flags = ~KEVENT_READY; - u-ready_num--; - if (++u-pring[0]-uidx == KEVENT_MAX_EVENTS) - u-pring[0]-uidx = 0; - - if (u-overflow_kevent) { - int err; - - err = kevent_user_ring_add_event(u-overflow_kevent); - if (!err) { - if (u-overflow_kevent-ready_entry.next == u-ready_list) - u-overflow_kevent = NULL; - else - u-overflow_kevent = - list_entry(u-overflow_kevent-ready_entry.next, -struct kevent, ready_entry); - } - } + kevent_remove_ready(k); } spin_unlock_irqrestore(u-ready_lock, flags); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take21 1/4] kevent: Core files.
On Sat, Oct 28, 2006 at 03:23:40PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c index 711a8a8..ecee668 100644 --- a/kernel/kevent/kevent_user.c +++ b/kernel/kevent/kevent_user.c @@ -235,6 +235,36 @@ static void kevent_free_rcu(struct rcu_h } /* + * Must be called under u-ready_lock. + * This function removes kevent from ready queue and + * tries to add new kevent into ring buffer. + */ +static void kevent_remove_ready(struct kevent *k) +{ +struct kevent_user *u = k-user; + +list_del(k-ready_entry); Arg... no You cannot call list_del() , then check overflow_kevent. I you call list_del on what happens to be the kevent pointed by overflow_kevent, you loose... This function is always called from appropriate context, where it is guaranteed that it is safe to call list_del: 1. when kevent is removed. It is called after check, that given kevent is in the ready queue. 2. when dequeued from ready queue, which means that it can be removed from that queue. -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take21 1/4] kevent: Core files.
Evgeniy Polyakov a e'crit : On Sat, Oct 28, 2006 at 03:23:40PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c index 711a8a8..ecee668 100644 --- a/kernel/kevent/kevent_user.c +++ b/kernel/kevent/kevent_user.c @@ -235,6 +235,36 @@ static void kevent_free_rcu(struct rcu_h } /* + * Must be called under u-ready_lock. + * This function removes kevent from ready queue and + * tries to add new kevent into ring buffer. + */ +static void kevent_remove_ready(struct kevent *k) +{ + struct kevent_user *u = k-user; + + list_del(k-ready_entry); Arg... no You cannot call list_del() , then check overflow_kevent. I you call list_del on what happens to be the kevent pointed by overflow_kevent, you loose... This function is always called from appropriate context, where it is guaranteed that it is safe to call list_del: 1. when kevent is removed. It is called after check, that given kevent is in the ready queue. 2. when dequeued from ready queue, which means that it can be removed from that queue. Could you please check the list_del() function ? file include/linux/list.h static inline void list_del(struct list_head *entry) { __list_del(entry-prev, entry-next); entry-next = LIST_POISON1; entry-prev = LIST_POISON2; } So, after calling list_del(k-read_entry); next and prev are basically destroyed. So when you write later : +if (!err || u-overflow_kevent == k) { +if (u-overflow_kevent-ready_entry.next == u-ready_list) +u-overflow_kevent = NULL; +else +u-overflow_kevent = + list_entry(u-overflow_kevent-ready_entry.next, + struct kevent, ready_entry); +} then you have a problem, since list_entry(k-ready_entry.next, struct kevent, ready_entry); will give you garbage. Eric - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take21 1/4] kevent: Core files.
On Sat, Oct 28, 2006 at 03:34:52PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: + list_del(k-ready_entry); Arg... no You cannot call list_del() , then check overflow_kevent. I you call list_del on what happens to be the kevent pointed by overflow_kevent, you loose... This function is always called from appropriate context, where it is guaranteed that it is safe to call list_del: 1. when kevent is removed. It is called after check, that given kevent is in the ready queue. 2. when dequeued from ready queue, which means that it can be removed from that queue. Could you please check the list_del() function ? file include/linux/list.h static inline void list_del(struct list_head *entry) { __list_del(entry-prev, entry-next); entry-next = LIST_POISON1; entry-prev = LIST_POISON2; } So, after calling list_del(k-read_entry); next and prev are basically destroyed. So when you write later : +if (!err || u-overflow_kevent == k) { +if (u-overflow_kevent-ready_entry.next == u-ready_list) +u-overflow_kevent = NULL; +else +u-overflow_kevent = + list_entry(u-overflow_kevent-ready_entry.next, + struct kevent, ready_entry); +} then you have a problem, since list_entry(k-ready_entry.next, struct kevent, ready_entry); will give you garbage. Ok, I understand you now. To remove this issue we can delete entry from the list after all checks with overflow_kevent pointer are completed, i.e. have something like this: diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c index 711a8a8..f3fec9b 100644 --- a/kernel/kevent/kevent_user.c +++ b/kernel/kevent/kevent_user.c @@ -235,6 +235,36 @@ static void kevent_free_rcu(struct rcu_h } /* + * Must be called under u-ready_lock. + * This function removes kevent from ready queue and + * tries to add new kevent into ring buffer. 
+ */ +static void kevent_remove_ready(struct kevent *k) +{ + struct kevent_user *u = k-user; + + if (++u-pring[0]-uidx == KEVENT_MAX_EVENTS) + u-pring[0]-uidx = 0; + + if (u-overflow_kevent) { + int err; + + err = kevent_user_ring_add_event(u-overflow_kevent); + if (!err || u-overflow_kevent == k) { + if (u-overflow_kevent-ready_entry.next == u-ready_list) + u-overflow_kevent = NULL; + else + u-overflow_kevent = + list_entry(u-overflow_kevent-ready_entry.next, + struct kevent, ready_entry); + } + } + list_del(k-ready_entry); + k-flags = ~KEVENT_READY; + u-ready_num--; +} + +/* * Complete kevent removing - it dequeues kevent from storage list * if it is requested, removes kevent from ready list, drops userspace * control block reference counter and schedules kevent freeing through RCU. @@ -248,11 +278,8 @@ static void kevent_finish_user_complete( kevent_dequeue(k); spin_lock_irqsave(u-ready_lock, flags); - if (k-flags KEVENT_READY) { - list_del(k-ready_entry); - k-flags = ~KEVENT_READY; - u-ready_num--; - } + if (k-flags KEVENT_READY) + kevent_remove_ready(k); spin_unlock_irqrestore(u-ready_lock, flags); kevent_user_put(u); @@ -303,25 +330,7 @@ static struct kevent *kqueue_dequeue_rea spin_lock_irqsave(u-ready_lock, flags); if (u-ready_num !list_empty(u-ready_list)) { k = list_entry(u-ready_list.next, struct kevent, ready_entry); - list_del(k-ready_entry); - k-flags = ~KEVENT_READY; - u-ready_num--; - if (++u-pring[0]-uidx == KEVENT_MAX_EVENTS) - u-pring[0]-uidx = 0; - - if (u-overflow_kevent) { - int err; - - err = kevent_user_ring_add_event(u-overflow_kevent); - if (!err) { - if (u-overflow_kevent-ready_entry.next == u-ready_list) - u-overflow_kevent = NULL; - else - u-overflow_kevent = - list_entry(u-overflow_kevent-ready_entry.next, - struct kevent, ready_entry); - } - } + kevent_remove_ready(k); } spin_unlock_irqrestore(u-ready_lock, flags); Thanks. 
Eric -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[take21 1/4] kevent: Core files.
Core files. This patch includes core kevent files: * userspace controlling * kernelspace interfaces * initialization * notification state machines Some bits of documentation can be found on project's homepage (and links from there): http://tservice.net.ru/~s0mbre/old/?section=projectsitem=kevent Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED] diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index 7e639f7..a9560eb 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -318,3 +318,6 @@ ENTRY(sys_call_table) .long sys_vmsplice .long sys_move_pages .long sys_getcpu + .long sys_kevent_get_events + .long sys_kevent_ctl/* 320 */ + .long sys_kevent_wait diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index b4aa875..cf18955 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -714,8 +714,11 @@ #endif .quad compat_sys_get_robust_list .quad sys_splice .quad sys_sync_file_range - .quad sys_tee + .quad sys_tee /* 315 */ .quad compat_sys_vmsplice .quad compat_sys_move_pages .quad sys_getcpu + .quad sys_kevent_get_events + .quad sys_kevent_ctl/* 320 */ + .quad sys_kevent_wait ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index bd99870..f009677 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -324,10 +324,13 @@ #define __NR_tee 315 #define __NR_vmsplice 316 #define __NR_move_pages317 #define __NR_getcpu318 +#define __NR_kevent_get_events 319 +#define __NR_kevent_ctl320 +#define __NR_kevent_wait 321 #ifdef __KERNEL__ -#define NR_syscalls 319 +#define NR_syscalls 322 #include linux/err.h /* diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 6137146..c53d156 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -619,10 +619,16 @@ #define __NR_vmsplice 278 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages279 __SYSCALL(__NR_move_pages, 
sys_move_pages) +#define __NR_kevent_get_events 280 +__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events) +#define __NR_kevent_ctl281 +__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl) +#define __NR_kevent_wait 282 +__SYSCALL(__NR_kevent_wait, sys_kevent_wait) #ifdef __KERNEL__ -#define __NR_syscall_max __NR_move_pages +#define __NR_syscall_max __NR_kevent_wait #include linux/err.h #ifndef __NO_STUBS diff --git a/include/linux/kevent.h b/include/linux/kevent.h new file mode 100644 index 000..125414c --- /dev/null +++ b/include/linux/kevent.h @@ -0,0 +1,205 @@ +/* + * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED] + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __KEVENT_H +#define __KEVENT_H +#include linux/types.h +#include linux/list.h +#include linux/rbtree.h +#include linux/spinlock.h +#include linux/mutex.h +#include linux/wait.h +#include linux/net.h +#include linux/rcupdate.h +#include linux/kevent_storage.h +#include linux/ukevent.h + +#define KEVENT_MIN_BUFFS_ALLOC 3 + +struct kevent; +struct kevent_storage; +typedef int (* kevent_callback_t)(struct kevent *); + +/* @callback is called each time new event has been caught. */ +/* @enqueue is called each time new event is queued. */ +/* @dequeue is called each time event is dequeued. 
*/ + +struct kevent_callbacks { + kevent_callback_t callback, enqueue, dequeue; +}; + +#define KEVENT_READY 0x1 +#define KEVENT_STORAGE 0x2 +#define KEVENT_USER0x4 + +struct kevent +{ + /* Used for kevent freeing.*/ + struct rcu_head rcu_head; + struct ukevent event; + /* This lock protects ukevent manipulations, e.g. ret_flags changes. */ + spinlock_t ulock; + + /* Entry of user's tree. */ + struct rb_node kevent_node; + /* Entry of origin's queue. */ + struct list_headstorage_entry; + /* Entry of user's ready. */ + struct
Re: [take19 1/4] kevent: Core files.
On Tuesday 17 October 2006 00:09, Johann Borck wrote: Regarding mukevent I'm thinking of a event-type specific struct, that is filled by the originating code, and placed into a per-event-type ring buffer (which requires modification of kevent_wait). I'd personally worry about an implementation that used a per-event-type ring buffer, because you're still left having to hack around starvation issues in user-space. It is of course possible under the current model for anyone who wants per-event-type ring buffers to have them - just make separate kevent sets. I haven't thought this through all the way yet, but why not have variable length event structures and have the kernel fill in a next pointer in each one? This could even be used to keep backwards binary compatibility while adding additional fields to the structures over time, though no space would be wasted on modern programs. You still end up with a question of what to do in case of overflow, but I'm thinking the thing to do in that case might be to start pushing overflow events onto a linked list which can be written back into the ring buffer when space becomes available. The appropriate behavior would be to throw new events on the linked list if the linked list had any events, so that things are delivered in order, but write to the mapped buffer directly otherwise. Deciding when to do that is tricky, and I haven't thought through the implications fully when I say this, but what about activating a bottom half when more space becomes available, and let that drain overflowed events back into the mapped buffer? Or perhaps the time to do it would be in the next blocking wait, when the queue emptied? I think it is very important to avoid any limits that can not be adjusted on the fly at run-time by CAP_SYS_ADMIN or what have you. 
Doing it this way may have other problems I've ignored but at least the big one - compile-time capacity limits in the year 2006 - would be largely avoided :P Nothing real solid yet, just some electrical storms in the grey matter... Thanks, Chase - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tue, Oct 17, 2006 at 07:10:14AM +0200, Johann Borck ([EMAIL PROTECTED]) wrote: Ulrich Drepper wrote: Evgeniy Polyakov wrote: Existing design does not allow overflow. And I've pointed out a number of times that this is not practical at best. There are event sources which can create events which cannot be coalesced into one single event as it would be required with your design. Signals are one example, specifically realtime signals. If we do not want the design to be limited from the start this approach has to be thought over. So zap mmap() support completely, since it is not usable at all. We wont discuss on it. Initial implementation did not have it. But I was requested to do it, and it is ready now. No one likes it, but no one provides an alternative implementation. We are stuck. We need the mapped ring buffer. The current design (before it was removed) was broken but this does not mean it shouldn't be implemented. We just need more time to figure out how to implement it correctly. Considering the if at all and if then how of ring buffer implemetation I'd like to throw in some ideas I had when reading the discussion and respective code. If I understood Ulrich Drepper right, his notion of a generic event handling interface is, that it has to be flexible enough to transport additional info from origin to userspace, and to support queuing of events from the same origin, so that additional per-event-occurrence data doesn't get lost, which would happen when coalescing multiple events into one until delivery. From what I read he says ring buffer is broken because of insufficient space for additional data (mukevent) and the limited number of events that can be put into ring buffer. Another argument is missing notification of userspace about dropped events in case ring buffer limit is reached. (is that right?) I can add such notification, but its existense _is_ the broken design. 
After such condition happened, all new events will disappear (although they are still accessible through usual queue) from mapped buffer. While writing this I have come to the idea on how to improve the case of the size of mapped buffer - we can make it with limited size, and when it is full, some bit will be set in the shared area and obviously no new events can be added there, but when user commits some events from that buffer (i.e. says to kernel that appropriate kevents can be freed or requeued according to their flags), new ready events from ready queue can be copied into mapped buffer. It still does not solve (and I do insist that it is broken behaviour) the case when kernel is going to generate infinite number of events for one requested by userspace (as in case of generating new 'data_has_arrived' event when new byte has been received). Userspace events are only marked as ready, they are not generated - it is high-performance _feature_ of the new design, not some kind of a bug. I see no reason why kevent couldn't be modified to fit (all) these needs. While modifying the server-example and writing a client using kevent I came across the coalescing problem, there were more incoming connections than accept events, and I had to work around that. In this Btw, accept() issue is exactly the same as with usual poll() - repeated insertion of the same kevent will fire immediately, which requires event to be one-shot. One of the initial implementations contained number of ready for accept sockets as one of the returned parameters though. case the pure number of coalesced events would suffice, while it wouldn't for the example of RT-signals that Ulrich Drepper gave. So if coalescing can be done at all or if it is impossible depends on the type of event. The same goes for additional data delivered with the events. There might be no panacea for all possible scenarios with one fixed design. 
Either performance suffers for 'lightweight' events which don't need additional data and/or coalescing is not problematic and/or ring buffer, or kevent is not usable for other types of events. Why not treat different things differently, and let the (kernel-)user decide. I don't know if I got all this right, but if, then ring buffer is needed especially for cases where coalescing is not possible and additional data has to be delivered for each triggered notification (so the pure number of events is not enough; other reasons? performance? ). To me it doesn't make sense to have kevent fill memory and use processor-time if buffer is not used at all, which is the case when using kevent_getevents. So here are my Ideas: Make usage of ring buffer optional, if not required for specific event-type it might be chosen by userspace-code. Make limit of events in ring buffer optional and controllable from userspace. It is of course possible, main problem is that existing design of the mapped buffer is not sufficient, and there are no other propositions except that 'it
Re: [take19 1/4] kevent: Core files.
On Tue, Oct 17, 2006 at 12:59:47AM -0500, Chase Venters ([EMAIL PROTECTED]) wrote: On Tuesday 17 October 2006 00:09, Johann Borck wrote: Regarding mukevent I'm thinking of a event-type specific struct, that is filled by the originating code, and placed into a per-event-type ring buffer (which requires modification of kevent_wait). I'd personally worry about an implementation that used a per-event-type ring buffer, because you're still left having to hack around starvation issues in user-space. It is of course possible under the current model for anyone who wants per-event-type ring buffers to have them - just make separate kevent sets. I haven't thought this through all the way yet, but why not have variable length event structures and have the kernel fill in a next pointer in each one? This could even be used to keep backwards binary compatibility while Why do we want variable size structures in mmap ring buffer? adding additional fields to the structures over time, though no space would be wasted on modern programs. You still end up with a question of what to do in case of overflow, but I'm thinking the thing to do in that case might be to start pushing overflow events onto a linked list which can be written back into the ring buffer when space becomes available. The appropriate behavior would be to throw new events on the linked list if the linked list had any events, so that things are delivered in order, but write to the mapped buffer directly otherwise. I think in a similar way. Kevent actually do not require such list, since it has already queue of the ready events. -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tuesday 17 October 2006 05:42, Evgeniy Polyakov wrote: On Tue, Oct 17, 2006 at 12:59:47AM -0500, Chase Venters ([EMAIL PROTECTED]) wrote: On Tuesday 17 October 2006 00:09, Johann Borck wrote: Regarding mukevent I'm thinking of a event-type specific struct, that is filled by the originating code, and placed into a per-event-type ring buffer (which requires modification of kevent_wait). I'd personally worry about an implementation that used a per-event-type ring buffer, because you're still left having to hack around starvation issues in user-space. It is of course possible under the current model for anyone who wants per-event-type ring buffers to have them - just make separate kevent sets. I haven't thought this through all the way yet, but why not have variable length event structures and have the kernel fill in a next pointer in each one? This could even be used to keep backwards binary compatibility while Why do we want variable size structures in mmap ring buffer? Flexibility primarily. So when we all decide to add a new event type six months from now, or add more information to an existing one, we don't run the risk that the existing mukevent isn't big enough. adding additional fields to the structures over time, though no space would be wasted on modern programs. You still end up with a question of what to do in case of overflow, but I'm thinking the thing to do in that case might be to start pushing overflow events onto a linked list which can be written back into the ring buffer when space becomes available. The appropriate behavior would be to throw new events on the linked list if the linked list had any events, so that things are delivered in order, but write to the mapped buffer directly otherwise. I think in a similar way. Kevent actually do not require such list, since it has already queue of the ready events. The current event types coalesce if there are multiple events, correct? 
It sounds like there may be other event types where coalescing multiple events is not the correct approach. Thanks, Chase - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tuesday 17 October 2006 12:39, Evgeniy Polyakov wrote: I can add such notification, but its existence _is_ the broken design. After such condition happened, all new events will disappear (although they are still accessible through usual queue) from mapped buffer. While writing this I have come to the idea on how to improve the case of the size of mapped buffer - we can make it with limited size, and when it is full, some bit will be set in the shared area and obviously no new events can be added there, but when user commits some events from that buffer (i.e. says to kernel that appropriate kevents can be freed or requeued according to their flags), new ready events from ready queue can be copied into mapped buffer. It still does not solve (and I do insist that it is broken behaviour) the case when kernel is going to generate infinite number of events for one requested by userspace (as in case of generating new 'data_has_arrived' event when new byte has been received). Behavior is not broken. It's quite useful and works 99.% of time. I was trying to suggest you but you missed my point. You don't want to use a bit, but a full sequence counter, 32bits. A program may handle XXX.XXX handles, but use a 4096 entries ring buffer 'only'. 
The user program keeps a local copy of a special word named 'ring_buffer_full_counter' Each time the kernel cannot queue an event in the ring buffer, it increase the ring_buffer_was_full_counter (exported to user app in the mmap view) When the user application notice the kernel changed ring_buffer_was_full_counter it does a full scan of all file handles (preferably using poll() to get all relevant info in one syscall) : do { if (read_event_from_mmap()) {handle_event(fd); continue;} /* ring buffer is empty, check if we missed some events */ if (unlikely(mmap-ring_buffer_full_counter != my_ring_buffer_full_counter)) { my_ring_buffer_full_counter = mmap-ring_buffer_full_counter; /* slow PATH */ /* can use a big poll() for example, or just a loop without poll() */ for_all_file_desc_do() { check if some event/data is waiting on THIS fd } /* } else syscall_wait_for_one_available_kevent(queue) } This is how a program can recover. If ring buffer has a reasonable size, this kind of event should not happen very frequently. If it does (because events continue to fill ring_buffer during recovery and might hit FULL again), maybe a smart program is able to resize the ring_buffer, and start using it after yet another recovery pass. If not, we dont care, because a big poll() give us many ready file-descriptors in one syscall, and maybe this is much better than kevent/epoll when XX.XXX events are ready. Eric - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tue, Oct 17, 2006 at 08:12:04AM -0500, Chase Venters ([EMAIL PROTECTED]) wrote: Regarding mukevent I'm thinking of a event-type specific struct, that is filled by the originating code, and placed into a per-event-type ring buffer (which requires modification of kevent_wait). I'd personally worry about an implementation that used a per-event-type ring buffer, because you're still left having to hack around starvation issues in user-space. It is of course possible under the current model for anyone who wants per-event-type ring buffers to have them - just make separate kevent sets. I haven't thought this through all the way yet, but why not have variable length event structures and have the kernel fill in a next pointer in each one? This could even be used to keep backwards binary compatibility while Why do we want variable size structures in mmap ring buffer? Flexibility primarily. So when we all decide to add a new event type six months from now, or add more information to an existing one, we don't run the risk that the existing mukevent isn't big enough. Do we need such flexibility, when we have unique id attached to each event? User can store any information in own buffers, which are indexed by that id. adding additional fields to the structures over time, though no space would be wasted on modern programs. You still end up with a question of what to do in case of overflow, but I'm thinking the thing to do in that case might be to start pushing overflow events onto a linked list which can be written back into the ring buffer when space becomes available. The appropriate behavior would be to throw new events on the linked list if the linked list had any events, so that things are delivered in order, but write to the mapped buffer directly otherwise. I think in a similar way. Kevent actually do not require such list, since it has already queue of the ready events. The current event types coalesce if there are multiple events, correct? 
It sounds like there may be other event types where coalescing multiple events is not the correct approach. There is no events coalescing, I think that it is even incorrect to say, that something is being coalesced in kevents. There is 'new' (which is well forgotten old) approach - user _asks_ kernel about some information, and kernel says when it is ready. Kernel does not say: part of the info is ready, part of the info is ready and so on, it just marks user's request as ready - that means that it is possible that there were zillions of events, each one could mark the _same_ userspace request as ready, and exactly what user requested is transferred back. Thus it is very fast and is correct way to deal with problem of pipes of different diameters. Kernel does not generate events - only user creates requests, which are marked as ready. I made that decision to remove _any_ kind of possible overflows from kernel side - if user was scheduled away, or has insufficient space or bad mood, to not introduce any kind of ugly priorities (higher one could fill the whole pipe while lower could not even send a single event). Instead kernel does just what it was requested to do, and it can provide some hints on how that process happened (for example how many sockets are ready for accept(), or how many bytes are in the receiving queue). And that approach does solve the problem of the cases when it looks like it is logical to _generate_ event - for example in inotify case, where new event is _generated_ each time requested case happens. 
For example the case when new files are created in the directory - it is possible that there will be queue overflow (btw, watch for each file in the kernel tree takes about 2gb of kernel mem), if many files were created, so userspace must rescan the whole directory to check missed files, so why is it needed at all to generate info about first two or ten files, instead userspace asks kernel to notify it when directory has changed or some new files were created, and kernelspace will answer when directory has been changed or new files were created (with some hint with number of them). Likely request for generation of events in kernel is a workaround for some other problems, which in long term will hit us with new troubles - queue length and overflows. Thanks, Chase -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tue, Oct 17, 2006 at 03:19:36PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: On Tuesday 17 October 2006 12:39, Evgeniy Polyakov wrote: I can add such notification, but its existense _is_ the broken design. After such condition happend, all new events will dissapear (although they are still accessible through usual queue) from mapped buffer. While writing this I have come to the idea on how to imrove the case of the size of mapped buffer - we can make it with limited size, and when it is full, some bit will be set in the shared area and obviously no new events can be added there, but when user commits some events from that buffer (i.e. says to kernel that appropriate kevents can be freed or requeued according to theirs flags), new ready events from ready queue can be copied into mapped buffer. It still does not solve (and I do insist that it is broken behaviour) the case when kernel is going to generate infinite number of events for one requested by userspace (as in case of generating new 'data_has_arrived' event when new byte has been received). Behavior is not broken. It's quite usefull and works 99.% of time. I was trying to suggest you but you missed my point. You dont want to use a bit, but a full sequence counter, 32bits. A program may handle XXX.XXX handles, but use a 4096 entries ring buffer 'only'. The user program keeps a local copy of a special word named 'ring_buffer_full_counter' Each time the kernel cannot queue an event in the ring buffer, it increase the ring_buffer_was_full_counter (exported to user app in the mmap view) When the user application notice the kernel changed ring_buffer_was_full_counter it does a full scan of all file handles (preferably using poll() to get all relevant info in one syscall) : I.e. to scan the rest of the xxx.xxx events? 
do { if (read_event_from_mmap()) {handle_event(fd); continue;} /* ring buffer is empty, check if we missed some events */ if (unlikely(mmap-ring_buffer_full_counter != my_ring_buffer_full_counter)) { my_ring_buffer_full_counter = mmap-ring_buffer_full_counter; /* slow PATH */ /* can use a big poll() for example, or just a loop without poll() */ for_all_file_desc_do() { check if some event/data is waiting on THIS fd } /* } else syscall_wait_for_one_available_kevent(queue) } This is how a program can recover. If ring buffer has a reasonable size, this kind of event should not happen very frequently. If it does (because events continue to fill ring_buffer during recovery and might hit FULL again), maybe a smart program is able to resize the ring_buffer, and start using it after yet another recovery pass. If not, we dont care, because a big poll() give us many ready file-descriptors in one syscall, and maybe this is much better than kevent/epoll when XX.XXX events are ready. What about the case, which I described in other e-mail, when in case of the full ring buffer, no new events are written there, and when userspace commits (i.e. marks as ready to be freed or requeued by kernel) some events, new ones will be copied from ready queue into the buffer? Eric -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tuesday 17 October 2006 15:42, Evgeniy Polyakov wrote: On Tue, Oct 17, 2006 at 03:19:36PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: On Tuesday 17 October 2006 12:39, Evgeniy Polyakov wrote: I can add such notification, but its existense _is_ the broken design. After such condition happend, all new events will dissapear (although they are still accessible through usual queue) from mapped buffer. While writing this I have come to the idea on how to imrove the case of the size of mapped buffer - we can make it with limited size, and when it is full, some bit will be set in the shared area and obviously no new events can be added there, but when user commits some events from that buffer (i.e. says to kernel that appropriate kevents can be freed or requeued according to theirs flags), new ready events from ready queue can be copied into mapped buffer. It still does not solve (and I do insist that it is broken behaviour) the case when kernel is going to generate infinite number of events for one requested by userspace (as in case of generating new 'data_has_arrived' event when new byte has been received). Behavior is not broken. It's quite usefull and works 99.% of time. I was trying to suggest you but you missed my point. You dont want to use a bit, but a full sequence counter, 32bits. A program may handle XXX.XXX handles, but use a 4096 entries ring buffer 'only'. The user program keeps a local copy of a special word named 'ring_buffer_full_counter' Each time the kernel cannot queue an event in the ring buffer, it increase the ring_buffer_was_full_counter (exported to user app in the mmap view) When the user application notice the kernel changed ring_buffer_was_full_counter it does a full scan of all file handles (preferably using poll() to get all relevant info in one syscall) : I.e. to scan the rest of the xxx.xxx events? 
do { if (read_event_from_mmap()) {handle_event(fd); continue;} /* ring buffer is empty, check if we missed some events */ if (unlikely(mmap-ring_buffer_full_counter != my_ring_buffer_full_counter)) { my_ring_buffer_full_counter = mmap-ring_buffer_full_counter; /* slow PATH */ /* can use a big poll() for example, or just a loop without poll() */ for_all_file_desc_do() { check if some event/data is waiting on THIS fd } /* } else syscall_wait_for_one_available_kevent(queue) } This is how a program can recover. If ring buffer has a reasonable size, this kind of event should not happen very frequently. If it does (because events continue to fill ring_buffer during recovery and might hit FULL again), maybe a smart program is able to resize the ring_buffer, and start using it after yet another recovery pass. If not, we dont care, because a big poll() give us many ready file-descriptors in one syscall, and maybe this is much better than kevent/epoll when XX.XXX events are ready. What about the case, which I described in other e-mail, when in case of the full ring buffer, no new events are written there, and when userspace commits (i.e. marks as ready to be freed or requeued by kernel) some events, new ones will be copied from ready queue into the buffer? Then, user might receive 'false events', exactly like poll()/select()/epoll() can do sometime. IE a 'ready' indication while there is no current event available on a particular fd / event_source. This should be safe, since those programs already ignore read() returns -EAGAIN and other similar things. Programmer prefers to receive two 'event available' indications than ZERO (and be stuck for infinite time). Of course, hot path (normal cases) should return one 'event' only. In order words, being ultra fast 99.99 % of the time, but being able to block forever once in a while is not an option. 
Eric - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tue, Oct 17, 2006 at 03:52:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: What about the case, which I described in other e-mail, when in case of the full ring buffer, no new events are written there, and when userspace commits (i.e. marks as ready to be freed or requeued by kernel) some events, new ones will be copied from ready queue into the buffer? Then, user might receive 'false events', exactly like poll()/select()/epoll() can do sometime. IE a 'ready' indication while there is no current event available on a particular fd / event_source. Only if user simultaneously uses both interfaces and removes events from the queue when its copy was in mapped buffer, but in that case it's user's problem (and if we do want, we can store pointer/index of the ring buffer entry, so when event is removed from the ready queue (using kevent_get_events()), appropriate entry in the ring buffer will be updated to show that it is no longer valid. This should be safe, since those programs already ignore read() returns -EAGAIN and other similar things. Programmer prefers to receive two 'event available' indications than ZERO (and be stuck for infinite time). Of course, hot path (normal cases) should return one 'event' only. In other words, being ultra fast 99.99 % of the time, but being able to block forever once in a while is not an option. Have I missed something? It looks like the only problematic situation is described above when user simultaneously uses both interfaces. Eric -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tuesday 17 October 2006 16:07, Evgeniy Polyakov wrote: On Tue, Oct 17, 2006 at 03:52:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: What about the case, which I described in other e-mail, when in case of the full ring buffer, no new events are written there, and when userspace commits (i.e. marks as ready to be freed or requeued by kernel) some events, new ones will be copied from ready queue into the buffer? Then, user might receive 'false events', exactly like poll()/select()/epoll() can do sometime. IE a 'ready' indication while there is no current event available on a particular fd / event_source. Only if user simultaneously uses oth interfaces and remove even from the queue when it's copy was in mapped buffer, but in that case it's user's problem (and if we do want, we can store pointer/index of the ring buffer entry, so when event is removed from the ready queue (using kevent_get_events()), appropriate entry in the ring buffer will be updated to show that it is no longer valid. This should be safe, since those programs already ignore read() returns -EAGAIN and other similar things. Programmer prefers to receive two 'event available' indications than ZERO (and be stuck for infinite time). Of course, hot path (normal cases) should return one 'event' only. In order words, being ultra fast 99.99 % of the time, but being able to block forever once in a while is not an option. Have I missed something? It looks like the only problematic situation is described above when user simultaneously uses both interfaces. In my point of view, user of the 'mmaped ring buffer' should be prepared to use both interfaces. Or else you are forced to presize the ring buffer to insane limits. That is : - Most of the time, we expect consuming events via mmaped ring buffer and no syscalls. - In case we notice a 'mmaped ring buffer overflow', syscalls to get/consume events that could not be stored in mmaped buffer (but queued by kevent subsystem). 
If not stored by kevent subsystem (memory failure ?), revert to poll() to fetch all 'missed fds' in one row. Go back to normal mode. - In case of empty ring buffer (or no mmap support at all, because this app doesnt expect lot of events per time unit, or because kevent dont have mmap support) : Be able to syscall and wait for an event. Eric - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tue, Oct 17, 2006 at 04:25:00PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: On Tuesday 17 October 2006 16:07, Evgeniy Polyakov wrote: On Tue, Oct 17, 2006 at 03:52:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: What about the case, which I described in other e-mail, when in case of the full ring buffer, no new events are written there, and when userspace commits (i.e. marks as ready to be freed or requeued by kernel) some events, new ones will be copied from ready queue into the buffer? Then, user might receive 'false events', exactly like poll()/select()/epoll() can do sometime. IE a 'ready' indication while there is no current event available on a particular fd / event_source. Only if user simultaneously uses oth interfaces and remove even from the queue when it's copy was in mapped buffer, but in that case it's user's problem (and if we do want, we can store pointer/index of the ring buffer entry, so when event is removed from the ready queue (using kevent_get_events()), appropriate entry in the ring buffer will be updated to show that it is no longer valid. This should be safe, since those programs already ignore read() returns -EAGAIN and other similar things. Programmer prefers to receive two 'event available' indications than ZERO (and be stuck for infinite time). Of course, hot path (normal cases) should return one 'event' only. In order words, being ultra fast 99.99 % of the time, but being able to block forever once in a while is not an option. Have I missed something? It looks like the only problematic situation is described above when user simultaneously uses both interfaces. In my point of view, user of the 'mmaped ring buffer' should be prepared to use both interfaces. Or else you are forced to presize the ring buffer to insane limits. That is : - Most of the time, we expect consuming events via mmaped ring buffer and no syscalls. 
- In case we notice a 'mmaped ring buffer overflow', syscalls to get/consume events that could not be stored in mmaped buffer (but queued by kevent subsystem). If not stored by kevent subsystem (memory failure ?), revert to poll() to fetch all 'missed fds' in one row. Go back to normal mode. kevent uses smaller amount of memory than epoll() per event, so it is very unlikely that it will be impossible to store new event there and epoll() will succeed. The same can be applied to poll(), which allocates the whole table in syscall. - In case of empty ring buffer (or no mmap support at all, because this app doesnt expect lot of events per time unit, or because kevent dont have mmap support) : Be able to syscall and wait for an event. So the most complex case is when user is going to use both interfaces, and it's steps when mapped ring buffer has overflow. In that case user can either read and mark some events as ready in ring buffer (the latter is being done through special syscall), so kevent core will put there new ready events. User can also get events using usual syscall, in that case events in ring buffer must be updated - and actually I implemented mapped buffer in the way which allows to remove events from the queue - queue is a FIFO, and the first entry to be obtained through syscall is _always_ the first entry in the ring buffer. So when user reads event through syscall (no matter if we are in overflow case or not), even being read is easily accessible in the ring buffer. So I propose following design for ring buffer (quite simple): kernelspace maintains two indexes - to the first and the last events in the ring buffer (and maximum size of the buffer of course). When new event is marked as ready, some info is being copied into ring buffer and index of the last entry is increased. 
When event is being read through syscall it is _guaranteed_ that that event will be at the position pointed by the index of the first element, that index is then increased (thus opening new slot in the buffer). If index of the last entry reaches (with possible wrapping) index of the first entry, that means that overflow has happened. In this case no new events can be copied into ring buffer, so they are only placed into ready queue (accessible through syscall kevent_get_events()). When user calls kevent_get_events() it will obtain the first element (pointed by index of the first element in the ring buffer), and if there is ready event, which is not placed into the ring buffer, it is copied (with appropriate update of the last index and new overflow condition). When userspace calls kevent_wait(num), it means that userspace marks as ready first (from index of the first element) $num elements, which thus can be removed (or requeued) and replaced by pending ready events. Does it sound like clawing over the glass or much better? Eric - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More
Re: [take19 1/4] kevent: Core files.
On Tuesday 17 October 2006 17:09, Evgeniy Polyakov wrote: On Tue, Oct 17, 2006 at 04:25:00PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: On Tuesday 17 October 2006 16:07, Evgeniy Polyakov wrote: On Tue, Oct 17, 2006 at 03:52:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: What about the case, which I described in other e-mail, when in case of the full ring buffer, no new events are written there, and when userspace commits (i.e. marks as ready to be freed or requeued by kernel) some events, new ones will be copied from ready queue into the buffer? Then, user might receive 'false events', exactly like poll()/select()/epoll() can do sometime. IE a 'ready' indication while there is no current event available on a particular fd / event_source. Only if user simultaneously uses oth interfaces and remove even from the queue when it's copy was in mapped buffer, but in that case it's user's problem (and if we do want, we can store pointer/index of the ring buffer entry, so when event is removed from the ready queue (using kevent_get_events()), appropriate entry in the ring buffer will be updated to show that it is no longer valid. This should be safe, since those programs already ignore read() returns -EAGAIN and other similar things. Programmer prefers to receive two 'event available' indications than ZERO (and be stuck for infinite time). Of course, hot path (normal cases) should return one 'event' only. In order words, being ultra fast 99.99 % of the time, but being able to block forever once in a while is not an option. Have I missed something? It looks like the only problematic situation is described above when user simultaneously uses both interfaces. In my point of view, user of the 'mmaped ring buffer' should be prepared to use both interfaces. Or else you are forced to presize the ring buffer to insane limits. That is : - Most of the time, we expect consuming events via mmaped ring buffer and no syscalls. 
- In case we notice a 'mmaped ring buffer overflow', syscalls to get/consume events that could not be stored in mmaped buffer (but queued by kevent subsystem). If not stored by kevent subsystem (memory failure ?), revert to poll() to fetch all 'missed fds' in one row. Go back to normal mode. kevent uses smaller amount of memory than epoll() per event, so it is very unlikely that it will be impossible to store new event there and epoll() will succeed. The same can be applied to poll(), which allocates the whole table in syscall. - In case of empty ring buffer (or no mmap support at all, because this app doesnt expect lot of events per time unit, or because kevent dont have mmap support) : Be able to syscall and wait for an event. So the most complex case is when user is going to use both interfaces, and it's steps when mapped ring buffer has overflow. In that case user can either read and mark some events as ready in ring buffer (the latter is being done through special syscall), so kevent core will put there new ready events. User can also get events using usual syscall, in that case events in ring buffer must be updated - and actually I implemented mapped buffer in the way which allows to remove events from the queue - queue is a FIFO, and the first entry to be obtained through syscall is _always_ the first entry in the ring buffer. So when user reads event through syscall (no matter if we are in overflow case or not), even being read is easily accessible in the ring buffer. So I propose following design for ring buffer (quite simple): kernelspace maintains two indexes - to the first and the last events in the ring buffer (and maximum size of the buffer of course). When new event is marked as ready, some info is being copied into ring buffer and index of the last entry is increased. 
When event is being read through syscall it is _guaranteed_ that that event will be at the position pointed by the index of the first element, that index is then increased (thus opening new slot in the buffer). If index of the last entry reaches (with possible wrapping) index of the first entry, that means that overflow has happend. In this case no new events can be copied into ring buffer, so they are only placed into ready queue (accessible through syscall kevent_get_events()). When user calls kevent_get_events() it will obtain the first element (pointed by index of the first element in the ring buffer), and if there is ready event, which is not placed into the ring buffer, it is copied (with appropriate update of the last index and new overflow condition). Well, I'm not sure its good to do this 'move one event from ready list to slot X', one by one, because this event will likely be flushed out of processor cache (because we will have to consume 4096 events before reaching this one). I think its better to batch
Re: [take19 1/4] kevent: Core files.
On Tuesday 17 October 2006 16:25, Eric Dumazet wrote: On Tuesday 17 October 2006 16:07, Evgeniy Polyakov wrote: On Tue, Oct 17, 2006 at 03:52:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: What about the case, which I described in other e-mail, when in case of the full ring buffer, no new events are written there, and when userspace commits (i.e. marks as ready to be freed or requeued by kernel) some events, new ones will be copied from ready queue into the buffer? Then, user might receive 'false events', exactly like poll()/select()/epoll() can do sometime. IE a 'ready' indication while there is no current event available on a particular fd / event_source. Only if user simultaneously uses oth interfaces and remove even from the queue when it's copy was in mapped buffer, but in that case it's user's problem (and if we do want, we can store pointer/index of the ring buffer entry, so when event is removed from the ready queue (using kevent_get_events()), appropriate entry in the ring buffer will be updated to show that it is no longer valid. This should be safe, since those programs already ignore read() returns -EAGAIN and other similar things. Programmer prefers to receive two 'event available' indications than ZERO (and be stuck for infinite time). Of course, hot path (normal cases) should return one 'event' only. In order words, being ultra fast 99.99 % of the time, but being able to block forever once in a while is not an option. Have I missed something? It looks like the only problematic situation is described above when user simultaneously uses both interfaces. In my point of view, user of the 'mmaped ring buffer' should be prepared to use both interfaces. Or else you are forced to presize the ring buffer to insane limits. I don't see why overflow couldn't be handle by a syscall telling the kernel that the buffer is ready for new events. 
As mentioned most of the time overflow should not happend and if it does the syscall should be amortized nicely by the number of events. That is : - Most of the time, we expect consuming events via mmaped ring buffer and no syscalls. - In case we notice a 'mmaped ring buffer overflow', syscalls to get/consume events that could not be stored in mmaped buffer (but queued by kevent subsystem). If not stored by kevent subsystem (memory failure ?), revert to poll() to fetch all 'missed fds' in one row. Go back to normal mode. - In case of empty ring buffer (or no mmap support at all, because this app doesnt expect lot of events per time unit, or because kevent dont have mmap support) : Be able to syscall and wait for an event. As I see it there are two main problems with a mmapped ring buffer (correct me if I'm wrong): 1. Overflow. 2. Handle multiple kernel event that only needs one user event. I.e. multiple packet arriving at the same socket. The user should only see one IN event at the time he is ready to handle it. In an earlier post I suggested a scheme that solves these issues. It was based on the assumption that kernel and user-space share index variables and can read/update them atomically without much overhead. Only in cases where the buffer is empty and full system call would be required. Hans Henrik Happe - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tue, Oct 17, 2006 at 05:32:28PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: So the most complex case is when user is going to use both interfaces, and it's steps when mapped ring buffer has overflow. In that case user can either read and mark some events as ready in ring buffer (the latter is being done through special syscall), so kevent core will put there new ready events. User can also get events using usual syscall, in that case events in ring buffer must be updated - and actually I implemented mapped buffer in the way which allows to remove events from the queue - queue is a FIFO, and the first entry to be obtained through syscall is _always_ the first entry in the ring buffer. So when user reads event through syscall (no matter if we are in overflow case or not), even being read is easily accessible in the ring buffer. So I propose following design for ring buffer (quite simple): kernelspace maintains two indexes - to the first and the last events in the ring buffer (and maximum size of the buffer of course). When new event is marked as ready, some info is being copied into ring buffer and index of the last entry is increased. When event is being read through syscall it is _guaranteed_ that that event will be at the position pointed by the index of the first element, that index is then increased (thus opening new slot in the buffer). If index of the last entry reaches (with possible wrapping) index of the first entry, that means that overflow has happend. In this case no new events can be copied into ring buffer, so they are only placed into ready queue (accessible through syscall kevent_get_events()). When user calls kevent_get_events() it will obtain the first element (pointed by index of the first element in the ring buffer), and if there is ready event, which is not placed into the ring buffer, it is copied (with appropriate update of the last index and new overflow condition). 
Well, I'm not sure its good to do this 'move one event from ready list to slot X', one by one, because this event will likely be flushed out of processor cache (because we will have to consume 4096 events before reaching this one). I think its better to batch this kind of 'push XX events' later, XX being small enough not to waste CPU cache, and when ring buffer is empty again. Ok, that's possible. mmap buffer is good for latency and minimum synchro between user thread and kernel producer. But once we hit an 'overflow', it is better to revert to a mode feeding XX events per syscall, to be sure it fits CPU caches : The user thread will do the copy between kernel memory to user memory, and this thread will shortly use those events in user land. User can do both - either get events through syscall, or get them from mapped ring buffer when it is refilled. BTW, maintaining coherency on mmap buffer is expensive : once a event is copied to mmap buffer, kernel has to issue a smp_mb() before updating the index, so that a user thread wont start to consume an event with random values because its CPU see the update on index before updates on data. There will be some tricks with barriers indeed. Once all the queue is flushed in efficient way, we can switch to mmap mode again. Eric Ok, there is one apologist for mmap buffer implementation, who forced me to create first implementation, which was dropped due to absense of remote mental reading abilities. Ulrich, does above approach sound good for you? I actually do not want to reimplement something, that will be pointed to with words 'no matter what you say, it is broken and I do not want it' again :). -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tuesday 17 October 2006 18:01, Evgeniy Polyakov wrote: Ok, there is one apologist for mmap buffer implementation, who forced me to create first implementation, which was dropped due to absence of remote mental reading abilities. Ulrich, does above approach sound good for you? I actually do not want to reimplement something, that will be pointed to with words 'no matter what you say, it is broken and I do not want it' again :). In my humble opinion, you should first write a 'real application', to show how the mmap buffer and kevent syscalls would be used (fast path and slow/recovery paths). I am sure it would be easier for everybody to agree on the API *before* you start coding a *lot* of hard (kernel) stuff : It would certainly save your mental CPU cycles (and ours too :) ) This 'real application' could be the event loop of a simple HTTP server, or a basic 'echo all' server. Adding the bits about timers events and signals should be done too. Eric - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tue, Oct 17, 2006 at 06:26:04PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: On Tuesday 17 October 2006 18:01, Evgeniy Polyakov wrote: Ok, there is one apologist for mmap buffer implementation, who forced me to create first implementation, which was dropped due to absense of remote mental reading abilities. Ulrich, does above approach sound good for you? I actually do not want to reimplement something, that will be pointed to with words 'no matter what you say, it is broken and I do not want it' again :). In my humble opinion, you should first write a 'real application', to show how the mmap buffer and kevent syscalls would be used (fast path and slow/recovery paths). I am sure it would be easier for everybody to agree on the API *before* you start coding a *lot* of hard (kernel) stuff : It would certainly save your mental CPU cycles (and ours too :) ) This 'real application' could be the event loop of a simple HTTP server, or a basic 'echo all' server. Adding the bits about timers events and signals should be done too. I wrote one with previous ring buffer implementation - it used timers and echoed when they fired, it was even described in details in one of the lwn.net articles. I'm not going to waste others and my time implementing feature requests without at least _some_ feedback from those who asked them. In case when person, originally requested some feature, does not answer and there are other opinions, only they will be get into account of course. Eric -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tuesday 17 October 2006 18:35, Evgeniy Polyakov wrote: On Tue, Oct 17, 2006 at 06:26:04PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: On Tuesday 17 October 2006 18:01, Evgeniy Polyakov wrote: Ok, there is one apologist for mmap buffer implementation, who forced me to create first implementation, which was dropped due to absense of remote mental reading abilities. Ulrich, does above approach sound good for you? I actually do not want to reimplement something, that will be pointed to with words 'no matter what you say, it is broken and I do not want it' again :). In my humble opinion, you should first write a 'real application', to show how the mmap buffer and kevent syscalls would be used (fast path and slow/recovery paths). I am sure it would be easier for everybody to agree on the API *before* you start coding a *lot* of hard (kernel) stuff : It would certainly save your mental CPU cycles (and ours too :) ) This 'real application' could be the event loop of a simple HTTP server, or a basic 'echo all' server. Adding the bits about timers events and signals should be done too. I wrote one with previous ring buffer implementation - it used timers and echoed when they fired, it was even described in details in one of the lwn.net articles. I'm not going to waste others and my time implementing feature requests without at least _some_ feedback from those who asked them. In case when person, originally requested some feature, does not answer and there are other opinions, only they will be get into account of course. I am not sure I understand what you wrote, English is not our native language. I think many people gave you feedbacks. I feel that all feedback on this mailing list is constructive. Many posts/patches on this list are never commented at all. Eric - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tue, Oct 17, 2006 at 06:45:54PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: On Tuesday 17 October 2006 18:35, Evgeniy Polyakov wrote: On Tue, Oct 17, 2006 at 06:26:04PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: On Tuesday 17 October 2006 18:01, Evgeniy Polyakov wrote: Ok, there is one apologist for mmap buffer implementation, who forced me to create first implementation, which was dropped due to absense of remote mental reading abilities. Ulrich, does above approach sound good for you? I actually do not want to reimplement something, that will be pointed to with words 'no matter what you say, it is broken and I do not want it' again :). In my humble opinion, you should first write a 'real application', to show how the mmap buffer and kevent syscalls would be used (fast path and slow/recovery paths). I am sure it would be easier for everybody to agree on the API *before* you start coding a *lot* of hard (kernel) stuff : It would certainly save your mental CPU cycles (and ours too :) ) This 'real application' could be the event loop of a simple HTTP server, or a basic 'echo all' server. Adding the bits about timers events and signals should be done too. I wrote one with previous ring buffer implementation - it used timers and echoed when they fired, it was even described in details in one of the lwn.net articles. I'm not going to waste others and my time implementing feature requests without at least _some_ feedback from those who asked them. In case when person, originally requested some feature, does not answer and there are other opinions, only they will be get into account of course. I am not sure I understand what you wrote, English is not our native language. I think many people gave you feedbacks. I feel that all feedback on this mailing list is constructive. Many posts/patches on this list are never commented at all. And I do greatly appreciate feedback from those people! 
But I do not understand why I never got feedback on initial design and implementation (and then created as far as I recall at least 10 releases) from Ulrich, who first asked for such a feature. So right now I'm waiting for his opinion on that problem, even if it will be 'it sucks' again, but at least in that case I will not waste people's time. Ulrich, could you please comment on design notes sent a couple of mails above? Eric -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
Evgeniy Polyakov a écrit : On Tue, Oct 17, 2006 at 06:45:54PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: I am not sure I understand what you wrote, English is not our native language. I think many people gave you feedbacks. I feel that all feedback on this mailing list is constructive. Many posts/patches on this list are never commented at all. And I do greatly appreciate feedback from those people! But I do not understand why I never got feedback on initial design and implementation (and then created as far as I recall at least 10 releases) from Ulrich, who first asked for such a feature. So right now I'm waiting for his opinion on that problem, even if it will be 'it sucks' again, but at least in that case I will not waste people's time. Ulrich, could you please comment on design notes sent a couple of mails above? Ulrich is a very busy man. We have to live with that. rant_mode For example, I *complained* one day, that each glibc fopen()/fread()/fclose() pass does a mmap()/munmap() to obtain a single 4KB of memory, without any cache mechanism. This badly hurts performance of multi-threaded programs as we know mmap()/munmap() has to down_write(&mm->mmap_sem); and play VM games. So to avoid this, I manually call setvbuf() in my own programs, to provide a suitable buffer to glibc, because of its suboptimal default allocation, vestige of an old epoch... /rant_mode Eric - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Sun, Oct 15, 2006 at 04:22:45PM -0700, Ulrich Drepper ([EMAIL PROTECTED]) wrote: Evgeniy Polyakov wrote: Existing design does not allow overflow. And I've pointed out a number of times that this is not practical at best. There are event sources which can create events which cannot be coalesced into one single event as it would be required with your design. Signals are one example, specifically realtime signals. If we do not want the design to be limited from the start this approach has to be thought over. The whole idea of mmap buffer seems to be broken, since those who asked for creation do not like existing design and do not show theirs... According to signals and possibility to overflow in existing ring buffer implementation. You seems to not checked the code - each event can be marked as ready only one time, which means only one copy and so on. It was done _specially_. And it is not limitation, but new approach. Queue of the same signals or any other events has fundamental flawness (as any other ring buffer implementation, which has queue size) - it's size of the queue and extremely bad case of the overflow. So, the same event may not be ready several times. Any design which allows to create infinite number of events generated for the same case is broken, since consumer can be in situation, when it can not handle that flow. That is why poll() returns only POLLIN when data is ready in network stack, but is not trying to generate some kind of a signal for each byte/packet/MTU/MSS received. RT signals have design problems, and I will not repeate the same error with similar limits in kevent. So zap mmap() support completely, since it is not usable at all. We wont discuss on it. Initial implementation did not have it. But I was requested to do it, and it is ready now. No one likes it, but no one provides an alternative implementation. We are stuck. We need the mapped ring buffer. 
The current design (before it was removed) was broken but this does not mean it shouldn't be implemented. We just need more time to figure out how to implement it correctly. In the latest patchset it was removed. I'm waiting for your code. Mmap implementation can be added separately, since it does not affect kevent core. -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
Evgeniy Polyakov wrote: The whole idea of mmap buffer seems to be broken, since those who asked for creation do not like existing design and do not show theirs... What kind of argumentation is that? Because my attempt to implement it doesn't work and nobody right away has a better suggestion this means the idea is broken. Nonsense. It just means that time should be spend on thinking about this. You cut all this short by rushing out your attempt without any discussions. Unfortunately nobody else really looked at the approach so it lingered around for some weeks. Well, now it is clear that it is not the right approach and we can start thinking about it again. You seems to not checked the code - each event can be marked as ready only one time, which means only one copy and so on. It was done _specially_. And it is not limitation, but new approach. I know that it is done deliberately and I tell you that this is wrong and unacceptable. Realtime signals are one event which need to have more than one event queued. This is no description of what you have implemented, it's a description of the reality of realtime signals. RT signals are queued. They carry a data value (the sigval_t object) which can be unique for each signal delivery. Coalescing the signal events therefore leads to information loss. Therefore, at the very least for signal we need to have the ability to queue more than one event for each event source. Not having this functionality means that signals and likely other types of events cannot be implemented using kevent queues. Queue of the same signals or any other events has fundamental flawness (as any other ring buffer implementation, which has queue size) - it's size of the queue and extremely bad case of the overflow. Of course there are additional problems. Overflows need to be handled. But this is nothing which is unsolvable. So, the same event may not be ready several times. 
Any design which allows to create infinite number of events generated for the same case is broken, since consumer can be in situation, when it can not handle that flow. That's complete nonsense. Again, for RT signals it is very reasonable and not broken to have multiple outstanding signals. That is why poll() returns only POLLIN when data is ready in network stack, but is not trying to generate some kind of a signal for each byte/packet/MTU/MSS received. It makes no sense to drag poll() into this discussion. poll() is a very limited interface. The new event handling is supposed to be the opposite, namely, usable for all kinds of events. Arguing that because poll() does it like this just means you don't see what big step is needed to get to the goal of a unified event handling. The shackles of poll() must be left behind. RT signals have design problems, and I will not repeate the same error with similar limits in kevent. I don't know what to say. You claim to be the source of all wisdom is OS design. Maybe you should design your own OS, from ground up. I wonder how many people would like that since all your arguments are squarely geared towards optimizing the implementation. But: the implementation is irrelevant without users. The functionality users (= programmers) want and need is what must drive the implementation. And RT signals are definitely heavily used and liked by programmers. You have to accept that you try to modify an OS which has that functionality regardless of how much you hate it and want to fight it. Mmap implementation can be added separately, since it does not affect kevent core. That I doubt very much and it is why I would not want the kevent stuff go into any released kernel until that detail is resolved. -- ➧ Ulrich Drepper ➧ Red Hat, Inc. 
➧ 444 Castro St ➧ Mountain View, CA ❖ - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Mon, Oct 16, 2006 at 03:16:15AM -0700, Ulrich Drepper ([EMAIL PROTECTED]) wrote: Evgeniy Polyakov wrote: The whole idea of mmap buffer seems to be broken, since those who asked for creation do not like existing design and do not show theirs... What kind of argumentation is that? Because my attempt to implement it doesn't work and nobody right away has a better suggestion this means the idea is broken. Nonsense. Ok, let's reformulate: My attempt works, but nobody around likes it, I remove it and wait until some other implement it. It just means that time should be spend on thinking about this. You cut all this short by rushing out your attempt without any discussions. Unfortunately nobody else really looked at the approach so it lingered around for some weeks. Well, now it is clear that it is not the right approach and we can start thinking about it again. I talked about it in the last 13 releases of the kevent, and _noone_ said at least some comments. And now I get - 'it is broken, it does not work, there are problems, we do not want it' and the like. I tried hardly to show that it does work and problems shown can not happen, but noone still hears me. Since I think it is not that interface which is 100% required for correct functionality, I removed it. When there are better suggestions and implementation we can return to them of course. You seems to not checked the code - each event can be marked as ready only one time, which means only one copy and so on. It was done _specially_. And it is not limitation, but new approach. I know that it is done deliberately and I tell you that this is wrong and unacceptable. Realtime signals are one event which need to have more than one event queued. This is no description of what you have implemented, it's a description of the reality of realtime signals. RT signals are queued. They carry a data value (the sigval_t object) which can be unique for each signal delivery. 
Coalescing the signal events therefore leads to information loss. Therefore, at the very least for signal we need to have the ability to queue more than one event for each event source. Not having this functionality means that signals and likely other types of events cannot be implemented using kevent queues. Well, my point about rt-signals is that they do not deserve to be resurrected, but it is only my point :) In case it is still used, each signal setup should create event - many signals means many events, each signal can be sent with different parameters - each event should correspond to one unique case. Queue of the same signals or any other events has fundamental flawness (as any other ring buffer implementation, which has queue size) - it's size of the queue and extremely bad case of the overflow. Of course there are additional problems. Overflows need to be handled. But this is nothing which is unsolvable. I strongly disagree that having design which allows overflows is acceptible - do we really want rt-signals queue overflow problems in new place? Instead some complex allocation scheme can be created. So, the same event may not be ready several times. Any design which allows to create infinite number of events generated for the same case is broken, since consumer can be in situation, when it can not handle that flow. That's complete nonsense. Again, for RT signals it is very reasonable and not broken to have multiple outstanding signals. The same signal with different payload is acceptible, but when number of them increases ulimit and they are started to be forgotten - that's what I call broken design. That is why poll() returns only POLLIN when data is ready in network stack, but is not trying to generate some kind of a signal for each byte/packet/MTU/MSS received. It makes no sense to drag poll() into this discussion. poll() is a very limited interface. The new event handling is supposed to be the opposite, namely, usable for all kinds of events. 
Arguing that because poll() does it like this just means you don't see what big step is needed to get to the goal of a unified event handling. The shackles of poll() must be left behind. Kevent is that subsystem, and for now it works quite well. RT signals have design problems, and I will not repeat the same error with similar limits in kevent. I don't know what to say. You claim to be the source of all wisdom in OS design. Maybe you should design your own OS, from the ground up. I wonder how many people would like that since all your arguments are squarely geared towards optimizing the implementation. But: the implementation is irrelevant without users. The functionality users (= programmers) want and need is what must drive the implementation. And RT signals are definitely heavily used and liked by programmers. You have to accept that you try to modify an OS which has that functionality regardless of how
Re: [take19 1/4] kevent: Core files.
Ulrich Drepper wrote: Evgeniy Polyakov wrote: Existing design does not allow overflow. And I've pointed out a number of times that this is not practical at best. There are event sources which can create events which cannot be coalesced into one single event as it would be required with your design. Signals are one example, specifically realtime signals. If we do not want the design to be limited from the start this approach has to be thought over. So zap mmap() support completely, since it is not usable at all. We wont discuss on it. Initial implementation did not have it. But I was requested to do it, and it is ready now. No one likes it, but no one provides an alternative implementation. We are stuck. We need the mapped ring buffer. The current design (before it was removed) was broken but this does not mean it shouldn't be implemented. We just need more time to figure out how to implement it correctly. Considering the if at all and if then how of ring buffer implemetation I'd like to throw in some ideas I had when reading the discussion and respective code. If I understood Ulrich Drepper right, his notion of a generic event handling interface is, that it has to be flexible enough to transport additional info from origin to userspace, and to support queuing of events from the same origin, so that additional per-event-occurrence data doesn't get lost, which would happen when coalescing multiple events into one until delivery. From what I read he says ring buffer is broken because of insufficient space for additional data (mukevent) and the limited number of events that can be put into ring buffer. Another argument is missing notification of userspace about dropped events in case ring buffer limit is reached. (is that right?) I see no reason why kevent couldn't be modified to fit (all) these needs. 
While modifying the server-example and writing a client using kevent I came across the coalescing problem, there were more incoming connections than accept events, and I had to work around that. In this case the pure number of coalesced events would suffice, while it wouldn't for the example of RT-signals that Ulrich Drepper gave. So if coalescing can be done at all or if it is impossible depends on the type of event. The same goes for additional data delivered with the events. There might be no panacea for all possible scenarios with one fixed design. Either performance suffers for 'lightweight' events which don't need additional data and/or coalescing is not problematic and/or ring buffer, or kevent is not usable for other types of events. Why not treat different things differently, and let the (kernel-)user decide. I don't know if I got all this right, but if, then ring buffer is needed especially for cases where coalescing is not possible and additional data has to be delivered for each triggered notification (so the pure number of events is not enough; other reasons? performance? ). To me it doesn't make sense to have kevent fill memory and use processor-time if buffer is not used at all, which is the case when using kevent_getevents. So here are my Ideas: Make usage of ring buffer optional, if not required for specific event-type it might be chosen by userspace-code. Make limit of events in ring buffer optional and controllable from userspace. Regarding mukevent I'm thinking of a event-type specific struct, that is filled by the originating code, and placed into a per-event-type ring buffer (which requires modification of kevent_wait). To my limited understanding it seems that alternative or modified versions of kevent_storage_ready, (__)kevent_requeue and kevent_user_ring_add_event could return a void pointer to the position in buffer, and all kevent has to know about is the size of the struct. 
If coalescing doesn't hurt for a specific event-type it might just be modified to notify userspace about the number of coalesced events. Make it depend on type of event. I know this doesn't address all objections that have been made, and Evgeniy, big sorry for this being just talk again, and maybe not even applicable for some reasons I do not overlook, but maybe it's worth consideration. I'll gladly try to put that into code, and see where it leads. I think kevent is great, and if things can be done to increase its genericity without sacrificing performance, why not. Sorry for the length of post and repetitions, Johann - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
Evgeniy Polyakov wrote: Existing design does not allow overflow. And I've pointed out a number of times that this is not practical at best. There are event sources which can create events which cannot be coalesced into one single event as it would be required with your design. Signals are one example, specifically realtime signals. If we do not want the design to be limited from the start this approach has to be thought over. So zap mmap() support completely, since it is not usable at all. We wont discuss on it. Initial implementation did not have it. But I was requested to do it, and it is ready now. No one likes it, but no one provides an alternative implementation. We are stuck. We need the mapped ring buffer. The current design (before it was removed) was broken but this does not mean it shouldn't be implemented. We just need more time to figure out how to implement it correctly. -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Wed, Oct 04, 2006 at 10:57:32AM -0700, Ulrich Drepper ([EMAIL PROTECTED]) wrote: On 10/3/06, Evgeniy Polyakov [EMAIL PROTECTED] wrote: http://tservice.net.ru/~s0mbre/archive/kevent/evserver_kevent.c http://tservice.net.ru/~s0mbre/archive/kevent/evtest.c These are simple programs which by themselves have problems. For instance, I consider a very bad idea to hardcode the size of the ring buffer. Specifying macros in the header file counts as hardcoding. Systems grow over time and so will the demand of connections. I have no problem with the kernel hardcoding the value internally (or having a /proc entry to select it) but programs should be able to dynamically learn about the value so they don't have to be recompiled. Well, it is possible to create /sys/proc entry for that, and even now userspace can grow mapping ring until it is forbiden by kernel, which means limit is reached. Actually the whole idea with global limit of kevents does not sound very good to me, but it is required to remove overflow in mapped buffer. But more problematic is that I don't see how the interfaces can be efficiently used in multi-threaded (or multi-process) programs. How would multiple threads using the same kevent queue and running in the same kevent_get_events() loop work out? How do they guarantee that each request is only handled once? kqueue_dequeue_ready() is atomic and this function removes kevent from ready queue so other thread can not get it. From what I see now this means a second data structure is needed to keep track of the state of each entry. But even then, how do we even recognized used ring buffer entries? For instance, assume two threads. Both call get_events, one event is reported, both threads are woken up (which is another thing to consider, more later). One thread uses ring buffer entry, the other goes back to sleep in get_events. Now, how does the kernel know when the other thread is done working on the ring buffer entry? 
There might be lots of entries coming in overflowing the entire buffer. Heck, you don't even need two threads for this scenario. Are you talking about mapped buffer or syscall interface? The former has special syscall kevent_wait(), which reports number of 'processed' events and first processed number, so kernel can remove all appropriate events. The latter is described above - kqueue_dequeue_ready() is atomic, so that event will be removed from the ready queue and optionally from the whole kevent tree. It is possible to work with both interfaces at the same time, since mapped buffer contains a copy of the event, which is potentially freed and processed by other thread. Actually I do not like idea of mapped ring anyway, since if application uses a lot of events, it will batch them into big chunks, so syscall overhead is negligible, if application uses small number of events, syscalls will be rare and will not hurt performance. When I was thinking about this (and discussing it in Ottawa) I was always assuming that we have a status field in the ring buffer entry which lets the userlevel code indicate whether the entry is free again or not. This requires a writable mapping, yes, and potentially causes cache line ping-pong. I think Zach mentioned he has some ideas about this. As far as I can see, there are no other ideas on how to implement ring buffer, so I did it like I wanted. It has some limitation indeed, but since I do not see any other code, how can I say what is better or worse? As for the multiple thread wakeup, I mentioned this before. We have to avoid the trampling herd problem. We cannot wakeup all waiters. But we also cannot assume that, without protocols, waking up just one for each available entry is sufficient. So the first question is: what is the current policy? It is a good practice to _not_ share the same queue between a lot of threads. Currently all waiters are awakened. AIO was removed from patchset by request of Cristoph. 
Timers, network AIO, fs AIO, socket notifications and poll/select events work well with existing structures. Well, excuse me if I don't take your word for it. I agree, the AIO code should not be submitted along with this. The same for any other code using the event handling. But we need to check whether the interface is generic enough to accommodate them in a way which actually makes sense. Again, think highly threaded processes or multiple processes sharing the same event queue. You missed the point. I implemented _all_ above and it does work. Although it was removed from submission patchset. You can find all patches on kevent homepage, they were posted to lkml@ and netdev@ too many times to miss them. It is even possible to create variable sized kevents - each kevent contains pointer to user's data, which can be considered as pointer to additional area (its size kernel implementation for given kevent type can determine from other parameters or use
Re: [take19 1/4] kevent: Core files.
On Thursday 05 October 2006 10:57, Evgeniy Polyakov wrote: Well, it is possible to create /sys/proc entry for that, and even now userspace can grow mapping ring until it is forbiden by kernel, which means limit is reached. No need for yet another /sys/proc entry. Right now, I (for example) may have a use for Generic event handling, but for a program that needs XXX.XXX handles, and about XX.XXX events per second. Right now, this program uses epoll, and reaches no limit at all, once you pass the ulimit -n, and other kernel wide tunes of course, not related to epoll. With your current kevent, I cannot switch to it, because of hardcoded limits. I may be wrong, but what is currently missing for me is : - No hardcoded limit on the max number of events. (A process that can open XXX.XXX files should be allowed to open a kevent queue with at least XXX.XXX events). Right now thats not clear what happens IF the current limit is reached. - In order to avoid touching the whole ring buffer, it might be good to be able to reset the indexes to the beginning when ring buffer is empty. (So if the user land is responsive enough to consume events, only first pages of the mapping would be used : that saves L1/L2 cpu caches) A plus would be - A working/usable mmap ring buffer implementation, but I think its not mandatory. System calls are not that expensive, especially if you can batch XX events per syscall (like epoll). Nice thing with a ring buffer is that we touch less cache lines than say epoll that have lot of linked structures. About mmap, I think you might want a hybrid thing : One writable page where userland can write its index, (and hold one or more futex shared by kernel) (with appropriate thread locking in case multiple threads want to dequeue events). In fast path, no syscalls are needed to maintain this user index. XXX readonly pages (for user, but r/w for kernel), where kernel write its own index, and events of course. 
Using separate cache lines avoids false sharing : kernel can update its own index and events without having to pay the price of cache line ping pongs. It could use futex infrastructure to wake up one thread 'only' instead of all threads waiting for an event. Eric - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Thursday 05 October 2006 12:55, Evgeniy Polyakov wrote: On Thu, Oct 05, 2006 at 12:45:03PM +0200, Eric Dumazet ([EMAIL PROTECTED]) What is missing or not obvious is : If events are skipped because of overflows, What happens ? Connections stuck forever ? Hope that everything will restore itself ? Is kernel able to SIGNAL this problem to user land ? Exisitng code does not overflow by design, but can consume a lot of memory. I talked about the case, when there will be some limit on number of entries put into mapped buffer. You still dont answer my question. Please answer the question. Recap : You have a max of events queued. A network message come and kernel want to add another event. It cannot because limit is reached. How the User Program knows that this problem was hit ? It is the same. What if reing buffer was grown upto 3 entry, and is now empty, and we need to put there 4 entries? Grow it again? It can be done, easily, but it looks like a workaround not as solution. And it is highly unlikely that in situation, when there are a lot of event, ring can be empty. I dont speak of re-allocation of ring buffer. I dont care to allocate at startup a big enough buffer. Say you have allocated a ring buffer of 1024*1024 entries. Then you queue 100 events per second, and dequeue them immediatly. No need to blindly use all 1024*1024 slots in the ring buffer, doing index = (index+1)%(1024*1024) epoll() does not have mmap. Problem is not about how many events can be put into the kernel, but how many of them can be put into mapped buffer. There is no problem if mmap is turned off. So zap mmap() support completely, since it is not usable at all. We wont discuss on it. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Thu, Oct 05, 2006 at 02:09:31PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: On Thursday 05 October 2006 12:55, Evgeniy Polyakov wrote: On Thu, Oct 05, 2006 at 12:45:03PM +0200, Eric Dumazet ([EMAIL PROTECTED]) What is missing or not obvious is : If events are skipped because of overflows, What happens ? Connections stuck forever ? Hope that everything will restore itself ? Is kernel able to SIGNAL this problem to user land ? Exisitng code does not overflow by design, but can consume a lot of memory. I talked about the case, when there will be some limit on number of entries put into mapped buffer. You still dont answer my question. Please answer the question. Recap : You have a max of events queued. A network message come and kernel want to add another event. It cannot because limit is reached. How the User Program knows that this problem was hit ? Existing design does not allow overflow. If event was added into the queue (like user requested notification, when new data has arrived), it is guaranteed that there will be place to put that event into mapped buffer when it is ready. If user wants to add anotehr event (for example after accept() user wants to add another socket with request for notification about data arrival into that socket), it can fail though. This limit is introduced only because of mmap buffer. It is the same. What if reing buffer was grown upto 3 entry, and is now empty, and we need to put there 4 entries? Grow it again? It can be done, easily, but it looks like a workaround not as solution. And it is highly unlikely that in situation, when there are a lot of event, ring can be empty. I dont speak of re-allocation of ring buffer. I dont care to allocate at startup a big enough buffer. Say you have allocated a ring buffer of 1024*1024 entries. Then you queue 100 events per second, and dequeue them immediatly. 
No need to blindly use all 1024*1024 slots in the ring buffer, doing index = (index+1)%(1024*1024) But what if they are not dequeued immediately? What if rate is high and while one tries to dequeue, system adds more events? epoll() does not have mmap. Problem is not about how many events can be put into the kernel, but how many of them can be put into mapped buffer. There is no problem if mmap is turned off. So zap mmap() support completely, since it is not usable at all. We won't discuss it. Initial implementation did not have it. But I was requested to do it, and it is ready now. No one likes it, but no one provides an alternative implementation. We are stuck. -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Thursday 05 October 2006 12:21, Evgeniy Polyakov wrote: On Thu, Oct 05, 2006 at 11:56:24AM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: On Thursday 05 October 2006 10:57, Evgeniy Polyakov wrote: Well, it is possible to create /sys/proc entry for that, and even now userspace can grow mapping ring until it is forbiden by kernel, which means limit is reached. No need for yet another /sys/proc entry. Right now, I (for example) may have a use for Generic event handling, but for a program that needs XXX.XXX handles, and about XX.XXX events per second. Right now, this program uses epoll, and reaches no limit at all, once you pass the ulimit -n, and other kernel wide tunes of course, not related to epoll. With your current kevent, I cannot switch to it, because of hardcoded limits. I may be wrong, but what is currently missing for me is : - No hardcoded limit on the max number of events. (A process that can open XXX.XXX files should be allowed to open a kevent queue with at least XXX.XXX events). Right now thats not clear what happens IF the current limit is reached. This forces to overflows in fixed sized memory mapped buffer. If we remove memory mapped buffer or will allow to have overflows (and thus skipped entries) keven can easily scale to that limits (tested with xx.xxx events though). - In order to avoid touching the whole ring buffer, it might be good to be able to reset the indexes to the beginning when ring buffer is empty. (So if the user land is responsive enough to consume events, only first pages of the mapping would be used : that saves L1/L2 cpu caches) And what happens when there are 3 empty at the beginning and \we need to put there 4 ready events? Couldn't there be 3 areas in the mmap buffer: - Unused: entries that the kernel can alloc from. - Alloced: entries alloced by kernel but not yet used by user. Kernel can update these if new events requires that. - Consumed: entries that the user are processing. 
The user takes a set of alloced entries and make them consumed. Then it processes the events after which it makes them unused. If there are no unused entries and the kernel needs some, it has wait for free entries. The user has to notify when unused entries becomes available. It could set a flag in the mmap'ed area to avoid unnessesary wakeups. The are some details with indexing and wakeup notification that I have left out, but I hope my idea is clear. I could give a more detailed description if requested. Also, I'm a user-level programmer so I might not get the whole picture. Hans Henrik Happe - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Thu, Oct 05, 2006 at 04:01:19PM +0200, Hans Henrik Happe ([EMAIL PROTECTED]) wrote: And what happens when there are 3 empty at the beginning and we need to put there 4 ready events? Couldn't there be 3 areas in the mmap buffer: - Unused: entries that the kernel can alloc from. - Alloced: entries alloced by kernel but not yet used by user. Kernel can update these if new events requires that. - Consumed: entries that the user are processing. The user takes a set of alloced entries and make them consumed. Then it processes the events after which it makes them unused. If there are no unused entries and the kernel needs some, it has to wait for free entries. The user has to notify when unused entries becomes available. It could set a flag in the mmap'ed area to avoid unnecessary wakeups. There are some details with indexing and wakeup notification that I have left out, but I hope my idea is clear. I could give a more detailed description if requested. Also, I'm a user-level programmer so I might not get the whole picture. This looks good on paper, but how can you put it into page-based storage without major and complex shared structures, which should be properly locked between kernelspace and userspace? Hans Henrik Happe -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On 9/20/06, Evgeniy Polyakov [EMAIL PROTECTED] wrote: This patch includes core kevent files: [...] I tried to look at the example programs before and failed. I tried again. Where can I find up-to-date example code? Some other points: - I really would prefer not to rush all this into the upstream kernel. The main problem is that the ring buffer interface is a shared data structure. These are always tricky. We need to find the right combination between size (as small as possible) and supporting all the interfaces. - so far only the timer and aio notification is speced out. What about the rest? Are we sure all aspects can be expressed? I am not yet. - we need an interface to add an event from userlevel. I.e., we need to be able to synthesize events. There are events (like, for instance the async DNS functionality) which come from userlevel code. I would very much prefer we look at the other events before setting the data structures in stone. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On Tue, Oct 03, 2006 at 11:34:02PM -0700, Ulrich Drepper ([EMAIL PROTECTED]) wrote: On 9/20/06, Evgeniy Polyakov [EMAIL PROTECTED] wrote: This patch includes core kevent files: [...] I tried to look at the example programs before and failed. I tried again. Where can I find up-to-date example code? http://tservice.net.ru/~s0mbre/archive/kevent/evserver_kevent.c http://tservice.net.ru/~s0mbre/archive/kevent/evtest.c Structures were not changed from the beginning of kevent project. Some other points: - I really would prefer not to rush all this into the upstream kernel. The main problem is that the ring buffer interface is a shared data structure. These are always tricky. We need to find the right combination between size (as small as possible) and supporting all the interfaces. mmap interface itself is in question, since it allows to create dos since there are no rlimits for pinned memory. - so far only the timer and aio notification is speced out. What about the rest? Are we sure all aspects can be expressed? I am not yet. AIO was removed from patchset by request of Cristoph. Timers, network AIO, fs AIO, socket nortifications and poll/select events work well with existing structures. - we need an interface to add an event from userlevel. I.e., we need to be able to synthesize events. There are events (like, for instance the async DNS functionality) which come from userlevel code. I would very much prefer we look at the other events before setting the data structures in stone. Signals and userspace events (hello solaris) easily fits into existing structures. It is even possible to create variable sized kevents - each kevent contain pointer to user's data, which can be considered as pointer to additional area (it's size kernel implementation for given kevent type can determine from other parameters or use predefined one and fetch additional data in -enqueue() callback). 
-- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take19 1/4] kevent: Core files.
On 10/3/06, Evgeniy Polyakov [EMAIL PROTECTED] wrote: http://tservice.net.ru/~s0mbre/archive/kevent/evserver_kevent.c http://tservice.net.ru/~s0mbre/archive/kevent/evtest.c These are simple programs which by themselves have problems. For instance, I consider a very bad idea to hardcode the size of the ring buffer. Specifying macros in the header file counts as hardcoding. Systems grow over time and so will the demand of connections. I have no problem with the kernel hardcoding the value internally (or having a /proc entry to select it) but programs should be able to dynamically learn about the value so they don't have to be recompiled. But more problematic is that I don't see how the interfaces can be efficiently used in multi-threaded (or multi-process) programs. How would multiple threads using the same kevent queue and running in the same kevent_get_events() loop work out? How do they guarantee that each request is only handled once? From what I see now this means a second data structure is needed to keep track of the state of each entry. But even then, how do we even recognized used ring buffer entries? For instance, assume two threads. Both call get_events, one event is reported, both threads are woken up (which is another thing to consider, more later). One thread uses ring buffer entry, the other goes back to sleep in get_events. Now, how does the kernel know when the other thread is done working on the ring buffer entry? There might be lots of entries coming in overflowing the entire buffer. Heck, you don't even need two threads for this scenario. When I was thinking about this (and discussing it in Ottawa) I was always assuming that we have a status field in the ring buffer entry which lets the userlevel code indicate whether the entry is free again or not. This requires a writable mapping, yes, and potentially causes cache line ping-pong. I think Zach mentioned he has some ideas about this. As for the multiple thread wakeup, I mentioned this before. 
We have to avoid the trampling herd problem. We cannot wakeup all waiters. But we also cannot assume that, without protocols, waking up just one for each available entry is sufficient. So the first question is: what is the current policy? AIO was removed from patchset by request of Cristoph. Timers, network AIO, fs AIO, socket nortifications and poll/select events work well with existing structures. Well, excuse me if I don't take your word for it. I agree, the AIO code should not be submitted along with this. The same for any other code using the event handling. But we need to check whether the interface is generic enough to accomodate them in a way which actually makes sense. Again, think highly threaded processes or multiple processes sharing the same event queue. It is even possible to create variable sized kevents - each kevent contain pointer to user's data, which can be considered as pointer to additional area (it's size kernel implementation for given kevent type can determine from other parameters or use predefined one and fetch additional data in -enqueue() callback). That sounds interesting and certainly helps with securing the interface for the future. But if there is anything we can do to avoid unnecessary costs we should do it, even if this means investigation all this further. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[take19 1/4] kevent: Core files.
Core files. This patch includes core kevent files: - userspace controlling - kernelspace interfaces - initialization - notification state machines Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED] diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index dd63d47..c10698e 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -317,3 +317,6 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .long sys_kevent_get_events + .long sys_kevent_ctl + .long sys_kevent_wait /* 320 */ diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5d4a7d1..a06b76f 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -710,7 +710,10 @@ #endif .quad compat_sys_get_robust_list .quad sys_splice .quad sys_sync_file_range - .quad sys_tee + .quad sys_tee /* 315 */ .quad compat_sys_vmsplice .quad compat_sys_move_pages + .quad sys_kevent_get_events + .quad sys_kevent_ctl + .quad sys_kevent_wait /* 320 */ ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index fc1c8dd..68072b5 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -323,10 +323,13 @@ #define __NR_sync_file_range 314 #define __NR_tee 315 #define __NR_vmsplice 316 #define __NR_move_pages317 +#define __NR_kevent_get_events 318 +#define __NR_kevent_ctl319 +#define __NR_kevent_wait 320 #ifdef __KERNEL__ -#define NR_syscalls 318 +#define NR_syscalls 321 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 94387c9..ee907ad 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -619,10 +619,16 @@ #define __NR_vmsplice 278 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages279 __SYSCALL(__NR_move_pages, sys_move_pages) +#define __NR_kevent_get_events 280 +__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events) 
+#define __NR_kevent_ctl281 +__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl) +#define __NR_kevent_wait 282 +__SYSCALL(__NR_kevent_wait, sys_kevent_wait) #ifdef __KERNEL__ -#define __NR_syscall_max __NR_move_pages +#define __NR_syscall_max __NR_kevent_wait #ifndef __NO_STUBS diff --git a/include/linux/kevent.h b/include/linux/kevent.h new file mode 100644 index 000..24ced10 --- /dev/null +++ b/include/linux/kevent.h @@ -0,0 +1,195 @@ +/* + * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED] + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __KEVENT_H +#define __KEVENT_H +#include linux/types.h +#include linux/list.h +#include linux/rbtree.h +#include linux/spinlock.h +#include linux/mutex.h +#include linux/wait.h +#include linux/net.h +#include linux/rcupdate.h +#include linux/kevent_storage.h +#include linux/ukevent.h + +#define KEVENT_MIN_BUFFS_ALLOC 3 + +struct kevent; +struct kevent_storage; +typedef int (* kevent_callback_t)(struct kevent *); + +/* @callback is called each time new event has been caught. */ +/* @enqueue is called each time new event is queued. */ +/* @dequeue is called each time event is dequeued. 
*/ + +struct kevent_callbacks { + kevent_callback_t callback, enqueue, dequeue; +}; + +#define KEVENT_READY 0x1 +#define KEVENT_STORAGE 0x2 +#define KEVENT_USER0x4 + +struct kevent +{ + /* Used for kevent freeing.*/ + struct rcu_head rcu_head; + struct ukevent event; + /* This lock protects ukevent manipulations, e.g. ret_flags changes. */ + spinlock_t ulock; + + /* Entry of user's tree. */ + struct rb_node kevent_node; + /* Entry of origin's queue. */ + struct list_headstorage_entry; + /* Entry of user's ready. */ + struct list_headready_entry; + + u32 flags; + + /* User who requested this kevent. */ +
[take18 1/4] kevent: Core files.
Core files. This patch includes core kevent files: - userspace controlling - kernelspace interfaces - initialization - notification state machines Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED] diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index dd63d47..c10698e 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -317,3 +317,6 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .long sys_kevent_get_events + .long sys_kevent_ctl + .long sys_kevent_wait /* 320 */ diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5d4a7d1..a06b76f 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -710,7 +710,10 @@ #endif .quad compat_sys_get_robust_list .quad sys_splice .quad sys_sync_file_range - .quad sys_tee + .quad sys_tee /* 315 */ .quad compat_sys_vmsplice .quad compat_sys_move_pages + .quad sys_kevent_get_events + .quad sys_kevent_ctl + .quad sys_kevent_wait /* 320 */ ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index fc1c8dd..68072b5 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -323,10 +323,13 @@ #define __NR_sync_file_range 314 #define __NR_tee 315 #define __NR_vmsplice 316 #define __NR_move_pages317 +#define __NR_kevent_get_events 318 +#define __NR_kevent_ctl319 +#define __NR_kevent_wait 320 #ifdef __KERNEL__ -#define NR_syscalls 318 +#define NR_syscalls 321 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 94387c9..ee907ad 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -619,10 +619,16 @@ #define __NR_vmsplice 278 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages279 __SYSCALL(__NR_move_pages, sys_move_pages) +#define __NR_kevent_get_events 280 +__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events) 
+#define __NR_kevent_ctl281 +__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl) +#define __NR_kevent_wait 282 +__SYSCALL(__NR_kevent_wait, sys_kevent_wait) #ifdef __KERNEL__ -#define __NR_syscall_max __NR_move_pages +#define __NR_syscall_max __NR_kevent_wait #ifndef __NO_STUBS diff --git a/include/linux/kevent.h b/include/linux/kevent.h new file mode 100644 index 000..867820b --- /dev/null +++ b/include/linux/kevent.h @@ -0,0 +1,195 @@ +/* + * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED] + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __KEVENT_H +#define __KEVENT_H +#include linux/types.h +#include linux/list.h +#include linux/rbtree.h +#include linux/spinlock.h +#include linux/mutex.h +#include linux/wait.h +#include linux/net.h +#include linux/rcupdate.h +#include linux/kevent_storage.h +#include linux/ukevent.h + +#define KEVENT_MIN_BUFFS_ALLOC 3 + +struct kevent; +struct kevent_storage; +typedef int (* kevent_callback_t)(struct kevent *); + +/* @callback is called each time new event has been caught. */ +/* @enqueue is called each time new event is queued. */ +/* @dequeue is called each time event is dequeued. 
*/ + +struct kevent_callbacks { + kevent_callback_t callback, enqueue, dequeue; +}; + +#define KEVENT_READY 0x1 +#define KEVENT_STORAGE 0x2 +#define KEVENT_USER0x4 + +struct kevent +{ + /* Used for kevent freeing.*/ + struct rcu_head rcu_head; + struct ukevent event; + /* This lock protects ukevent manipulations, e.g. ret_flags changes. */ + spinlock_t ulock; + + /* Entry of user's tree. */ + struct rb_node kevent_node; + /* Entry of origin's queue. */ + struct list_headstorage_entry; + /* Entry of user's ready. */ + struct list_headready_entry; + + u32 flags; + + /* User who requested this kevent. */ +
Re: [take17 1/4] kevent: Core files.
I stand corrected. On Thursday 07 September 2006 23:38, Evgeniy Polyakov wrote: On Thu, Sep 07, 2006 at 09:05:16PM -0700, [EMAIL PROTECTED] ([EMAIL PROTECTED]) wrote: +static int __devinit kevent_user_init(void) +{ + int err = 0; + + kevent_cache = kmem_cache_create(kevent_cache, + sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL); + + err = misc_register(kevent_miscdev); + if (err) { + printk(KERN_ERR Failed to register kevent miscdev: err=%d.\n, err); + goto err_out_exit; + } + + printk(KEVENT subsystem has been successfully registered.\n); + + return 0; + +err_out_exit: + kmem_cache_destroy(kevent_cache); + return err; +} It's probably best to treat kmem_cache_create like a black box and check for it returning null. It cannot return NULL; it will panic instead since I use the SLAB_PANIC flag. Thanks, Shaw - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take17 1/4] kevent: Core files.
On Thu, Sep 07, 2006 at 09:05:16PM -0700, [EMAIL PROTECTED] ([EMAIL PROTECTED]) wrote: +static int __devinit kevent_user_init(void) +{ + int err = 0; + + kevent_cache = kmem_cache_create(kevent_cache, + sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL); + + err = misc_register(kevent_miscdev); + if (err) { + printk(KERN_ERR Failed to register kevent miscdev: err=%d.\n, err); + goto err_out_exit; + } + + printk(KEVENT subsystem has been successfully registered.\n); + + return 0; + +err_out_exit: + kmem_cache_destroy(kevent_cache); + return err; +} It's probably best to treat kmem_cache_create like a black box and check for it returning null. It can not return NULL, it will panic instead since I use SLAB_PANIC flag. Thanks, Shaw -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[take17 1/4] kevent: Core files.
Core files. This patch includes core kevent files: - userspace controlling - kernelspace interfaces - initialization - notification state machines Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED] diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index dd63d47..c10698e 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -317,3 +317,6 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .long sys_kevent_get_events + .long sys_kevent_ctl + .long sys_kevent_wait /* 320 */ diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5d4a7d1..a06b76f 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -710,7 +710,10 @@ #endif .quad compat_sys_get_robust_list .quad sys_splice .quad sys_sync_file_range - .quad sys_tee + .quad sys_tee /* 315 */ .quad compat_sys_vmsplice .quad compat_sys_move_pages + .quad sys_kevent_get_events + .quad sys_kevent_ctl + .quad sys_kevent_wait /* 320 */ ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index fc1c8dd..68072b5 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -323,10 +323,13 @@ #define __NR_sync_file_range 314 #define __NR_tee 315 #define __NR_vmsplice 316 #define __NR_move_pages317 +#define __NR_kevent_get_events 318 +#define __NR_kevent_ctl319 +#define __NR_kevent_wait 320 #ifdef __KERNEL__ -#define NR_syscalls 318 +#define NR_syscalls 321 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 94387c9..ee907ad 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -619,10 +619,16 @@ #define __NR_vmsplice 278 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages279 __SYSCALL(__NR_move_pages, sys_move_pages) +#define __NR_kevent_get_events 280 +__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events) 
+#define __NR_kevent_ctl281 +__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl) +#define __NR_kevent_wait 282 +__SYSCALL(__NR_kevent_wait, sys_kevent_wait) #ifdef __KERNEL__ -#define __NR_syscall_max __NR_move_pages +#define __NR_syscall_max __NR_kevent_wait #ifndef __NO_STUBS diff --git a/include/linux/kevent.h b/include/linux/kevent.h new file mode 100644 index 000..67007f2 --- /dev/null +++ b/include/linux/kevent.h @@ -0,0 +1,196 @@ +/* + * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED] + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __KEVENT_H +#define __KEVENT_H +#include linux/types.h +#include linux/list.h +#include linux/spinlock.h +#include linux/mutex.h +#include linux/wait.h +#include linux/net.h +#include linux/rcupdate.h +#include linux/kevent_storage.h +#include linux/ukevent.h + +#define KEVENT_MIN_BUFFS_ALLOC 3 + +struct kevent; +struct kevent_storage; +typedef int (* kevent_callback_t)(struct kevent *); + +/* @callback is called each time new event has been caught. */ +/* @enqueue is called each time new event is queued. */ +/* @dequeue is called each time event is dequeued. 
*/ + +struct kevent_callbacks { + kevent_callback_t callback, enqueue, dequeue; +}; + +#define KEVENT_READY 0x1 +#define KEVENT_STORAGE 0x2 +#define KEVENT_USER0x4 + +struct kevent +{ + /* Used for kevent freeing.*/ + struct rcu_head rcu_head; + struct ukevent event; + /* This lock protects ukevent manipulations, e.g. ret_flags changes. */ + spinlock_t ulock; + + /* Entry of user's queue. */ + struct list_headkevent_entry; + /* Entry of origin's queue. */ + struct list_headstorage_entry; + /* Entry of user's ready. */ + struct list_headready_entry; + + u32 flags; + + /* User who requested this kevent. */ + struct kevent_user
Re: [take16 1/4] kevent: Core files.
On Wed, Sep 06, 2006 at 09:23:56AM -0500, Chase Venters ([EMAIL PROTECTED]) wrote: On Wed, 6 Sep 2006, Evgeniy Polyakov wrote: +struct kevent_user +{ These structure names get a little dicey (kevent, kevent_user, ukevent, mukevent)... might there be slightly different names that could be selected to better distinguish the purpose of each? Like what? ukevent means userspace_kevent, but ukevent is much smaller. mukevent is mapped userspace kevent, mukevent is again much smaller. Hmm, well, kevent_user and ukevent are perhaps the only ones I'm concerned about. What about calling kevent_user a kevent_queue, kevent_fd or kevent_set? kevent_user is kernel side representation of, guess what? Yes, kevent user :) I decided to use queue length for mmaped buffer, using size of the mmapped buffer as queue length is possible too. But in any case it is very broken behaviour to introduce any kind of overflow and special marking for that - rt signals already have it, no need to create additional headache. Hmm. The concern here is pinned memory, is it not? I'm trying to think of the best way to avoid compile-time limits. select() has a rather (infamous) compile-time limit of 1,024 thanks to libc (and thanks to the bit vector, a glass ceiling). Now, you'd be a fool to use select() on that many fd's in modern code meant to run on modern UNIXes. But kevent is a new system, the grand unified event loop all of us userspace programmers have been begging for since many years ago. Glass ceilings tend to hurt when you run into them :) Using the size of the memory mapped buffer as queue length sounds like a sane simplification. Pinned memory is not the _main_ issue in a real world application - only if it is some kind of a DoS or really broken behaviour where tons of event queues are going to be created (like many epoll control descriptors). Memory mapped buffer actually can even not exist, if application is not going to use mmap interface. 
+static int kevent_user_ring_init(struct kevent_user *u) +{ + int pnum; + + pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE; This calculation works with the current constants, but it comes up a page short if, say, KEVENT_MAX_EVENTS were 4095. It also looks incorrect visually since the 'sizeof(unsigned int)' is only factored in once (rather than once per page). I suggest a static / inline __max_kevent_pages() function that either does: return KEVENT_MAX_EVENTS / KEVENTS_ON_PAGE + 1; or int pnum = KEVENT_MAX_EVENTS / KEVENTS_ON_PAGE; if (KEVENT_MAX_EVENTS % KEVENTS_ON_PAGE) pnum++; return pnum; Both should be optimized away by the compiler and will give correct answers regardless of the constant values. Above pnum calculation aligns number of mukevents to pages size with appropriate check for (unsigned int), although it is not stated in that comment (more clear commant can be found around KEVENTS_ON_PAGE). You propose esentially the same calcualtion in the seconds case, while first one requires additional page in some cases. You are right about my first suggestion sometimes coming up a page extra. What I'm worried about is that the current ALIGN() based calculation comes up a page short if KEVENT_MAX_EVENTS is certain values (say 4095). This is because the unsigned int index is inside kevent_mring for every page, though the ALIGN() calculation just factors in room for one of them. In these boundary cases (KEVENT_MAX_EVENTS == 4095), your calculation thinks it can fit one last mukevent on a page because it didn't factor in room for unsigned int index at the start of every page; rather just for one page. In this case, the modulus should always come up non-zero, giving us the extra required page. Comment about KEVENTS_ON_PAGE celarly says what must be taken into account when size is calculated, but you are right, I should use there better macros, which should take sizeof(struct kevent_mring). I will update it. 
It is unused, but I'm still waiting on comments if we need kevent_get_events() at all - some people wanted to completely eliminate that function in favour of total mmap domination. Interesting idea. It would certainly simplify the interface. Only for those who really want to use additional mmap interface. I have no strong opinion on how to behave in this situation. kevent can panic, can free cache, can go into an infinite loop or screw up the hard drive. Everything is (almost) the same. Obviously it's not a huge deal :) If kevent is to screw up the hard drive, though, we must put in an exception for it to avoid my music directory. Care to send a patch for kernel command line? :) -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take15 1/4] kevent: Core files.
On Tue, Sep 05, 2006 at 03:28:17PM +0200, Arnd Bergmann ([EMAIL PROTECTED]) wrote: On Monday 04 September 2006 12:14, Evgeniy Polyakov wrote: +asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr, __u64 timeout, void __user *buf, unsigned flags) +asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg) 'void __user *arg' in both of these always points to a struct ukevent, according to your documentation. Shouldn't it be a 'struct ukevent __user *arg' then? Yep. I will update it in the next patchset. Thank you. Arnd -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take16 1/4] kevent: Core files.
Evgeniy, Sorry about the radio silence later. Some reviewer commentary follows. On Wed, 6 Sep 2006, Evgeniy Polyakov wrote: Core files. This patch includes core kevent files: - userspace controlling - kernelspace interfaces - initialization - notification state machines Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED] diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index dd63d47..c10698e 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -317,3 +317,6 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .long sys_kevent_get_events + .long sys_kevent_ctl + .long sys_kevent_wait /* 320 */ diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5d4a7d1..a06b76f 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -710,7 +710,10 @@ #endif .quad compat_sys_get_robust_list .quad sys_splice .quad sys_sync_file_range - .quad sys_tee + .quad sys_tee /* 315 */ .quad compat_sys_vmsplice .quad compat_sys_move_pages + .quad sys_kevent_get_events + .quad sys_kevent_ctl + .quad sys_kevent_wait /* 320 */ ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index fc1c8dd..68072b5 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -323,10 +323,13 @@ #define __NR_sync_file_range 314 #define __NR_tee315 #define __NR_vmsplice 316 #define __NR_move_pages 317 +#define __NR_kevent_get_events 318 +#define __NR_kevent_ctl319 +#define __NR_kevent_wait 320 #ifdef __KERNEL__ -#define NR_syscalls 318 +#define NR_syscalls 321 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 94387c9..ee907ad 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -619,10 +619,16 @@ #define __NR_vmsplice 278 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages 279 
__SYSCALL(__NR_move_pages, sys_move_pages) +#define __NR_kevent_get_events 280 +__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events) +#define __NR_kevent_ctl281 +__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl) +#define __NR_kevent_wait 282 +__SYSCALL(__NR_kevent_wait, sys_kevent_wait) #ifdef __KERNEL__ -#define __NR_syscall_max __NR_move_pages +#define __NR_syscall_max __NR_kevent_wait #ifndef __NO_STUBS diff --git a/include/linux/kevent.h b/include/linux/kevent.h new file mode 100644 index 000..67007f2 --- /dev/null +++ b/include/linux/kevent.h @@ -0,0 +1,196 @@ +/* + * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED] + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __KEVENT_H +#define __KEVENT_H +#include linux/types.h +#include linux/list.h +#include linux/spinlock.h +#include linux/mutex.h +#include linux/wait.h +#include linux/net.h +#include linux/rcupdate.h +#include linux/kevent_storage.h +#include linux/ukevent.h + +#define KEVENT_MIN_BUFFS_ALLOC 3 + +struct kevent; +struct kevent_storage; +typedef int (* kevent_callback_t)(struct kevent *); + +/* @callback is called each time new event has been caught. */ +/* @enqueue is called each time new event is queued. */ +/* @dequeue is called each time event is dequeued. 
*/ + +struct kevent_callbacks { + kevent_callback_t callback, enqueue, dequeue; +}; + +#define KEVENT_READY 0x1 +#define KEVENT_STORAGE 0x2 +#define KEVENT_USER0x4 + +struct kevent +{ + /* Used for kevent freeing.*/ + struct rcu_head rcu_head; + struct ukevent event; + /* This lock protects ukevent manipulations, e.g. ret_flags changes. */ + spinlock_t ulock; + + /* Entry of user's queue. */ + struct list_headkevent_entry; + /* Entry of origin's queue. */ + struct list_headstorage_entry; + /* Entry of user's ready. */ + struct list_headready_entry; + + u32
Re: [take16 1/4] kevent: Core files.
On Wed, 6 Sep 2006, Chase Venters wrote: + if (start + num = KEVENT_MAX_EVENTS || + start = KEVENT_MAX_EVENTS || + num = KEVENT_MAX_EVENTS) Since start and num are unsigned, the last two checks are redundant. If start or num is individually = KEVENT_MAX_EVENTS, start + num must be. Actually, my early-morning brain code optimizer is apparently broken, because it forgot all about integer wraparound. Disregard please. Thanks, Chase - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take16 1/4] kevent: Core files.
On Wed, Sep 06, 2006 at 08:40:21AM -0500, Chase Venters ([EMAIL PROTECTED]) wrote: Evgeniy, Sorry about the radio silence later. Some reviewer commentary follows. +struct kevent +{ +/* Used for kevent freeing.*/ +struct rcu_head rcu_head; +struct ukevent event; +/* This lock protects ukevent manipulations, e.g. ret_flags changes. */ +spinlock_t ulock; + +/* Entry of user's queue. */ +struct list_headkevent_entry; +/* Entry of origin's queue. */ +struct list_headstorage_entry; +/* Entry of user's ready. */ +struct list_headready_entry; + +u32 flags; + +/* User who requested this kevent. */ +struct kevent_user *user; +/* Kevent container. */ +struct kevent_storage *st; + +struct kevent_callbacks callbacks; + +/* Private data for different storages. + * poll()/select storage has a list of wait_queue_t containers + * for each -poll() { poll_wait()' } here. + */ +void*priv; +}; + +#define KEVENT_HASH_MASK0xff + +struct kevent_user +{ These structure names get a little dicey (kevent, kevent_user, ukevent, mukevent)... might there be slightly different names that could be selected to better distinguish the purpose of each? Like what? ukevent means userspace_kevent, but ukevent is much smaller. mukevent is mapped userspace kevent, mukevent is again much smaller. +struct list_headkevent_list[KEVENT_HASH_MASK+1]; +spinlock_t kevent_lock; +/* Number of queued kevents. */ +unsigned intkevent_num; + +/* List of ready kevents. */ +struct list_headready_list; +/* Number of ready kevents. */ +unsigned intready_num; +/* Protects all manipulations with ready queue. */ +spinlock_t ready_lock; + +/* Protects against simultaneous kevent_user control manipulations. */ +struct mutexctl_mutex; +/* Wait until some events are ready. */ +wait_queue_head_t wait; + +/* Reference counter, increased for each new kevent. 
*/ +atomic_trefcnt; + +unsigned intpages_in_use; +/* Array of pages forming mapped ring buffer */ +struct kevent_mring **pring; + +#ifdef CONFIG_KEVENT_USER_STAT +unsigned long im_num; +unsigned long wait_num; +unsigned long total; +#endif +}; +#define KEVENT_MAX_EVENTS 4096 + This limit governs how many simultaneous kevents you can be waiting on / for at once, correct? Would it be possible to drop the hard limit and limit instead, say, the maximum number of kevents you can have pending in the mmap ring-buffer? After the number is exceeded, additional events could get dropped, or some magic number could be put in the kevent_mring-index field to let the process know that it must hit another syscall to drain the rest of the events. I decided to use queue length for mmaped buffer, using size of the mmapped buffer as queue length is possible too. But in any case it is very broken behaviour to introduce any kind of overflow and special marking for that - rt signals already have it, no need to create additional headache. +static struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX]; __read_mostly? Yep, I was told already that some structures can be marked as such. Such change is not 100% requirement though. + +int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos) +{ +struct kevent_callbacks *p; + +if (pos = KEVENT_MAX) +return -EINVAL; + +p = kevent_registered_callbacks[pos]; + +p-enqueue = (cb-enqueue) ? cb-enqueue : kevent_break; +p-dequeue = (cb-dequeue) ? cb-dequeue : kevent_break; +p-callback = (cb-callback) ? cb-callback : kevent_break; Curious... why are these callbacks copied, rather than just retaining a pointer to a const/static ops structure? It simplifies callers of that callbacks to just call a function instead of dereferencing and check for various pointers. + +printk(KERN_INFO KEVENT: Added callbacks for type %d.\n, pos); Is this printk() chatter necessary? 
As any other information printk in kernel it is not neccessary, but it allows user to know which kevent kernel users are enabled. +static char kevent_name[] = kevent; const? Yep. +/* + * Initialize mmap ring buffer. + * It will store ready kevents, so userspace could get them directly instead + * of using syscall. Esentially syscall becomes just a waiting point. + */ +static int kevent_user_ring_init(struct kevent_user *u) +{ +int pnum; + +pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE; This calculation works with the current constants, but it comes
Re: [take16 1/4] kevent: Core files.
On Wed, 6 Sep 2006, Evgeniy Polyakov wrote: + +struct kevent_user +{ These structure names get a little dicey (kevent, kevent_user, ukevent, mukevent)... might there be slightly different names that could be selected to better distinguish the purpose of each? Like what? ukevent means userspace_kevent, but ukevent is much smaller. mukevent is mapped userspace kevent, mukevent is again much smaller. Hmm, well, kevent_user and ukevent are perhaps the only ones I'm concerned about. What about calling kevent_user a kevent_queue, kevent_fd or kevent_set? I decided to use queue length for mmaped buffer, using size of the mmapped buffer as queue length is possible too. But in any case it is very broken behaviour to introduce any kind of overflow and special marking for that - rt signals already have it, no need to create additional headache. Hmm. The concern here is pinned memory, is it not? I'm trying to think of the best way to avoid compile-time limits. select() has a rather (infamous) compile-time limit of 1,024 thanks to libc (and thanks to the bit vector, a glass ceiling). Now, you'd be a fool to use select() on that many fd's in modern code meant to run on modern UNIXes. But kevent is a new system, the grand unified event loop all of us userspace programmers have been begging for since many years ago. Glass ceilings tend to hurt when you run into them :) Using the size of the memory mapped buffer as queue length sounds like a sane simplification. +static int kevent_user_ring_init(struct kevent_user *u) +{ + int pnum; + + pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE; This calculation works with the current constants, but it comes up a page short if, say, KEVENT_MAX_EVENTS were 4095. It also looks incorrect visually since the 'sizeof(unsigned int)' is only factored in once (rather than once per page). 
I suggest a static / inline __max_kevent_pages() function that either does: return KEVENT_MAX_EVENTS / KEVENTS_ON_PAGE + 1; or int pnum = KEVENT_MAX_EVENTS / KEVENTS_ON_PAGE; if (KEVENT_MAX_EVENTS % KEVENTS_ON_PAGE) pnum++; return pnum; Both should be optimized away by the compiler and will give correct answers regardless of the constant values. Above pnum calculation aligns number of mukevents to page size with appropriate check for (unsigned int), although it is not stated in that comment (more clear comment can be found around KEVENTS_ON_PAGE). You propose essentially the same calculation in the second case, while first one requires additional page in some cases. You are right about my first suggestion sometimes coming up a page extra. What I'm worried about is that the current ALIGN() based calculation comes up a page short if KEVENT_MAX_EVENTS is certain values (say 4095). This is because the unsigned int index is inside kevent_mring for every page, though the ALIGN() calculation just factors in room for one of them. In these boundary cases (KEVENT_MAX_EVENTS == 4095), your calculation thinks it can fit one last mukevent on a page because it didn't factor in room for unsigned int index at the start of every page; rather just for one page. In this case, the modulus should always come up non-zero, giving us the extra required page. It is unused, but I'm still waiting on comments if we need kevent_get_events() at all - some people wanted to completely eliminate that function in favour of total mmap domination. Interesting idea. It would certainly simplify the interface. I have no strong opinion on how to behave in this situation. kevent can panic, can free cache, can go to infinite loop or screw up the hard drive. Everything is (almost) the same. Obviously it's not a huge deal :) If kevent is to screw up the hard drive, though, we must put in an exception for it to avoid my music directory. Looking pretty good.
This is my first pass of comments, and I'll probably have questions that follow, but I'm trying to get a really good picture of what is going on here for documentation purposes. Thank you, Chase. I will definitely get your comments into account and change related bits. Thanks again! Chase - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[take16 1/4] kevent: Core files.
Core files. This patch includes core kevent files: - userspace controlling - kernelspace interfaces - initialization - notification state machines Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED] diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index dd63d47..c10698e 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -317,3 +317,6 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .long sys_kevent_get_events + .long sys_kevent_ctl + .long sys_kevent_wait /* 320 */ diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5d4a7d1..a06b76f 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -710,7 +710,10 @@ #endif .quad compat_sys_get_robust_list .quad sys_splice .quad sys_sync_file_range - .quad sys_tee + .quad sys_tee /* 315 */ .quad compat_sys_vmsplice .quad compat_sys_move_pages + .quad sys_kevent_get_events + .quad sys_kevent_ctl + .quad sys_kevent_wait /* 320 */ ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index fc1c8dd..68072b5 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -323,10 +323,13 @@ #define __NR_sync_file_range 314 #define __NR_tee 315 #define __NR_vmsplice 316 #define __NR_move_pages317 +#define __NR_kevent_get_events 318 +#define __NR_kevent_ctl319 +#define __NR_kevent_wait 320 #ifdef __KERNEL__ -#define NR_syscalls 318 +#define NR_syscalls 321 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 94387c9..ee907ad 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -619,10 +619,16 @@ #define __NR_vmsplice 278 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages279 __SYSCALL(__NR_move_pages, sys_move_pages) +#define __NR_kevent_get_events 280 +__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events) 
+#define __NR_kevent_ctl281 +__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl) +#define __NR_kevent_wait 282 +__SYSCALL(__NR_kevent_wait, sys_kevent_wait) #ifdef __KERNEL__ -#define __NR_syscall_max __NR_move_pages +#define __NR_syscall_max __NR_kevent_wait #ifndef __NO_STUBS diff --git a/include/linux/kevent.h b/include/linux/kevent.h new file mode 100644 index 000..67007f2 --- /dev/null +++ b/include/linux/kevent.h @@ -0,0 +1,196 @@ +/* + * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED] + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __KEVENT_H +#define __KEVENT_H +#include linux/types.h +#include linux/list.h +#include linux/spinlock.h +#include linux/mutex.h +#include linux/wait.h +#include linux/net.h +#include linux/rcupdate.h +#include linux/kevent_storage.h +#include linux/ukevent.h + +#define KEVENT_MIN_BUFFS_ALLOC 3 + +struct kevent; +struct kevent_storage; +typedef int (* kevent_callback_t)(struct kevent *); + +/* @callback is called each time new event has been caught. */ +/* @enqueue is called each time new event is queued. */ +/* @dequeue is called each time event is dequeued. 
*/ + +struct kevent_callbacks { + kevent_callback_t callback, enqueue, dequeue; +}; + +#define KEVENT_READY 0x1 +#define KEVENT_STORAGE 0x2 +#define KEVENT_USER0x4 + +struct kevent +{ + /* Used for kevent freeing.*/ + struct rcu_head rcu_head; + struct ukevent event; + /* This lock protects ukevent manipulations, e.g. ret_flags changes. */ + spinlock_t ulock; + + /* Entry of user's queue. */ + struct list_headkevent_entry; + /* Entry of origin's queue. */ + struct list_headstorage_entry; + /* Entry of user's ready. */ + struct list_headready_entry; + + u32 flags; + + /* User who requested this kevent. */ + struct kevent_user
Re: [take15 1/4] kevent: Core files.
On Monday 04 September 2006 12:14, Evgeniy Polyakov wrote: +asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr, unsigned int max_nr, __u64 timeout, void __user *buf, unsigned flags) +asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num, void __user *arg) 'void __user *arg' in both of these always points to a struct ukevent, according to your documentation. Shouldn't it be a 'struct ukevent __user *arg' then? Arnd - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[take15 1/4] kevent: Core files.
Core files. This patch includes core kevent files: - userspace controlling - kernelspace interfaces - initialization - notification state machines Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED] diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index dd63d47..c10698e 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -317,3 +317,6 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .long sys_kevent_get_events + .long sys_kevent_ctl + .long sys_kevent_wait /* 320 */ diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5d4a7d1..a06b76f 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -710,7 +710,10 @@ #endif .quad compat_sys_get_robust_list .quad sys_splice .quad sys_sync_file_range - .quad sys_tee + .quad sys_tee /* 315 */ .quad compat_sys_vmsplice .quad compat_sys_move_pages + .quad sys_kevent_get_events + .quad sys_kevent_ctl + .quad sys_kevent_wait /* 320 */ ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index fc1c8dd..68072b5 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -323,10 +323,13 @@ #define __NR_sync_file_range 314 #define __NR_tee 315 #define __NR_vmsplice 316 #define __NR_move_pages317 +#define __NR_kevent_get_events 318 +#define __NR_kevent_ctl319 +#define __NR_kevent_wait 320 #ifdef __KERNEL__ -#define NR_syscalls 318 +#define NR_syscalls 321 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 94387c9..ee907ad 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -619,10 +619,16 @@ #define __NR_vmsplice 278 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages279 __SYSCALL(__NR_move_pages, sys_move_pages) +#define __NR_kevent_get_events 280 +__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events) 
+#define __NR_kevent_ctl281 +__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl) +#define __NR_kevent_wait 282 +__SYSCALL(__NR_kevent_wait, sys_kevent_wait) #ifdef __KERNEL__ -#define __NR_syscall_max __NR_move_pages +#define __NR_syscall_max __NR_kevent_wait #ifndef __NO_STUBS diff --git a/include/linux/kevent.h b/include/linux/kevent.h new file mode 100644 index 000..67007f2 --- /dev/null +++ b/include/linux/kevent.h @@ -0,0 +1,196 @@ +/* + * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED] + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __KEVENT_H +#define __KEVENT_H +#include linux/types.h +#include linux/list.h +#include linux/spinlock.h +#include linux/mutex.h +#include linux/wait.h +#include linux/net.h +#include linux/rcupdate.h +#include linux/kevent_storage.h +#include linux/ukevent.h + +#define KEVENT_MIN_BUFFS_ALLOC 3 + +struct kevent; +struct kevent_storage; +typedef int (* kevent_callback_t)(struct kevent *); + +/* @callback is called each time new event has been caught. */ +/* @enqueue is called each time new event is queued. */ +/* @dequeue is called each time event is dequeued. 
*/ + +struct kevent_callbacks { + kevent_callback_t callback, enqueue, dequeue; +}; + +#define KEVENT_READY 0x1 +#define KEVENT_STORAGE 0x2 +#define KEVENT_USER0x4 + +struct kevent +{ + /* Used for kevent freeing.*/ + struct rcu_head rcu_head; + struct ukevent event; + /* This lock protects ukevent manipulations, e.g. ret_flags changes. */ + spinlock_t ulock; + + /* Entry of user's queue. */ + struct list_headkevent_entry; + /* Entry of origin's queue. */ + struct list_headstorage_entry; + /* Entry of user's ready. */ + struct list_headready_entry; + + u32 flags; + + /* User who requested this kevent. */ + struct kevent_user
[take5 1/4] kevent: Core files.
Core files. This patch includes core kevent files: - userspace controlling - kernelspace interfaces - initialization - notification state machines It might also inlclude parts from other subsystem (like network related syscalls, so it is possible that it will not compile without other patches applied). Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED] diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index dd63d47..0af988a 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -317,3 +317,7 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .long sys_aio_recv + .long sys_aio_send + .long sys_kevent_get_events + .long sys_kevent_ctl diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5d4a7d1..e157ad4 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -713,4 +713,8 @@ #endif .quad sys_tee .quad compat_sys_vmsplice .quad compat_sys_move_pages + .quad sys_aio_recv + .quad sys_aio_send + .quad sys_kevent_get_events + .quad sys_kevent_ctl ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index fc1c8dd..a76e50d 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -323,10 +323,14 @@ #define __NR_sync_file_range 314 #define __NR_tee 315 #define __NR_vmsplice 316 #define __NR_move_pages317 +#define __NR_aio_recv 318 +#define __NR_aio_send 319 +#define __NR_kevent_get_events 320 +#define __NR_kevent_ctl321 #ifdef __KERNEL__ -#define NR_syscalls 318 +#define NR_syscalls 322 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 94387c9..9a0b581 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -619,10 +619,18 @@ #define __NR_vmsplice 278 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages279 __SYSCALL(__NR_move_pages, sys_move_pages) +#define 
__NR_aio_recv 280 +__SYSCALL(__NR_aio_recv, sys_aio_recv) +#define __NR_aio_send 281 +__SYSCALL(__NR_aio_send, sys_aio_send) +#define __NR_kevent_get_events 282 +__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events) +#define __NR_kevent_ctl283 +__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl) #ifdef __KERNEL__ -#define __NR_syscall_max __NR_move_pages +#define __NR_syscall_max __NR_kevent_ctl #ifndef __NO_STUBS diff --git a/include/linux/kevent.h b/include/linux/kevent.h new file mode 100644 index 000..c32f3bd --- /dev/null +++ b/include/linux/kevent.h @@ -0,0 +1,296 @@ +/* + * kevent.h + * + * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED] + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __KEVENT_H +#define __KEVENT_H + +/* + * Kevent request flags. + */ + +#define KEVENT_REQ_ONESHOT 0x1 /* Process this event only once and then dequeue. */ + +/* + * Kevent return flags. + */ +#define KEVENT_RET_BROKEN 0x1 /* Kevent is broken. */ +#define KEVENT_RET_DONE0x2 /* Kevent processing was finished successfully. */ + +/* + * Kevent type set. + */ +#define KEVENT_SOCKET 0 +#define KEVENT_INODE 1 +#define KEVENT_TIMER 2 +#define KEVENT_POLL3 +#define KEVENT_NAIO4 +#define KEVENT_AIO 5 +#defineKEVENT_MAX 6 + +/* + * Per-type event sets. 
+ * Number of per-event sets should be exactly as number of kevent types. + */ + +/* + * Timer events. + */ +#defineKEVENT_TIMER_FIRED 0x1 + +/* + * Socket/network asynchronous IO events. + */ +#defineKEVENT_SOCKET_RECV 0x1 +#defineKEVENT_SOCKET_ACCEPT0x2 +#defineKEVENT_SOCKET_SEND 0x4 + +/* + * Inode events. + */ +#defineKEVENT_INODE_CREATE 0x1 +#defineKEVENT_INODE_REMOVE 0x2 + +/* + * Poll events. + */ +#defineKEVENT_POLL_POLLIN 0x0001 +#define
Re: [take5 1/4] kevent: Core files.
+++ b/include/linux/kevent.h ... +#ifdef CONFIG_KEVENT_SOCKET + +extern struct file_operations socket_file_ops; This doesn't build because socket_file_ops was left static in net/socket.c. In any case, kevent.h has no business exposing socket_file_ops to users of the kevent api just so the kevent core can test files as being backed by sockets. It'd be more appropriate to call into the socket layer with the filp and let it return -EINVAL or -ESOCKNOOPT instead of trying to do that in the kevent layer. - z - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take5 1/4] kevent: Core files.
On Tue, Aug 08, 2006 at 03:02:59PM -0700, Zach Brown ([EMAIL PROTECTED]) wrote: +++ b/include/linux/kevent.h ... +#ifdef CONFIG_KEVENT_SOCKET + +extern struct file_operations socket_file_ops; This doesn't build because socket_file_ops was left static in net/socket.c. I exported it. It just sneaked out of patchset. In any case, kevent.h has no business exposing socket_file_ops to users of the kevent api just so the kevent core can test files as being backed by sockets. It'd be more appropriate to call into the socket layer with the filp and let it return -EINVAL or -ESOCKNOOPT instead of trying to do that in the kevent layer. Ok, I will move to use some functions from socket code without exporting socket_file_ops. - z -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take4 1/4] kevent: Core files.
On Sat, Aug 05, 2006 at 05:02:38PM +0400, Evgeniy Polyakov wrote: +static int __devinit kevent_user_init(void) +{ + struct class_device *dev; + int err = 0; + + err = register_filesystem(kevent_fs_type); + if (err) + panic(%s: failed to register filesystem: err=%d.\n, +kevent_name, err); + + kevent_mnt = kern_mount(kevent_fs_type); + if (IS_ERR(kevent_mnt)) + panic(%s: failed to mount silesystem: err=%ld.\n, + kevent_name, PTR_ERR(kevent_mnt)); + + kevent_user_major = register_chrdev(0, kevent_name, kevent_user_fops); + if (kevent_user_major 0) { + printk(KERN_ERR Failed to register \%s\ char device: err=%d.\n, + kevent_name, kevent_user_major); + return -ENODEV; + } + + kevent_user_class = class_create(THIS_MODULE, kevent); + if (IS_ERR(kevent_user_class)) { + printk(KERN_ERR Failed to register \%s\ class: err=%ld.\n, + kevent_name, PTR_ERR(kevent_user_class)); + err = PTR_ERR(kevent_user_class); + goto err_out_unregister; + } + + dev = class_device_create(kevent_user_class, NULL, + MKDEV(kevent_user_major, 0), NULL, kevent_name); + if (IS_ERR(dev)) { + printk(KERN_ERR Failed to create %d.%d class device in \%s\ class: err=%ld.\n, + kevent_user_major, 0, kevent_name, PTR_ERR(dev)); + err = PTR_ERR(dev); + goto err_out_class_destroy; + } As you are only using 1 minor number in this code, why not just use a miscdevice instead? It saves a bit of overhead and makes the code a tiny bit smaller :) thanks, greg k-h - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take4 1/4] kevent: Core files.
On Sat, Aug 05, 2006 at 10:57:02AM -0700, GregKH ([EMAIL PROTECTED]) wrote: + dev = class_device_create(kevent_user_class, NULL, + MKDEV(kevent_user_major, 0), NULL, kevent_name); + if (IS_ERR(dev)) { + printk(KERN_ERR Failed to create %d.%d class device in \%s\ class: err=%ld.\n, + kevent_user_major, 0, kevent_name, PTR_ERR(dev)); + err = PTR_ERR(dev); + goto err_out_class_destroy; + } As you are only using 1 minor number in this code, why not just use a miscdevice instead? It saves a bit of overhead and makes the code a tiny bit smaller :) No problem. I will move it to miscdevice instead of full chardev. thanks, greg k-h -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take3 1/4] kevent: Core files.
On Thursday 03 August 2006 11:46, Evgeniy Polyakov wrote: Core files. This patch includes core kevent files: - userspace controlling - kernelspace interfaces - initialization - notification state machines +static int kevent_user_wait(struct file *file, struct kevent_user *u, + unsigned int min_nr, unsigned int max_nr, unsigned int timeout, + void __user *buf) +{ + mutex_lock(u-ctl_mutex); + while (num max_nr ((k = kqueue_dequeue_ready(u)) != NULL)) { + if (copy_to_user(buf + num*sizeof(struct ukevent), + k-event, sizeof(struct ukevent))) { + cerr = -EINVAL; + break; + } It seems quite wrong to hold ctl_mutex while doing a copy_to_user() (of possibly a large amount of data) : A thread can sleep on a page fault and other threads cannot make progress. Eric - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take3 1/4] kevent: Core files.
On Thu, Aug 03, 2006 at 04:40:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: + mutex_lock(u-ctl_mutex); + while (num max_nr ((k = kqueue_dequeue_ready(u)) != NULL)) { + if (copy_to_user(buf + num*sizeof(struct ukevent), + k-event, sizeof(struct ukevent))) { + cerr = -EINVAL; + break; + } It seems quite wrong to hold ctl_mutex while doing a copy_to_user() (of possibly a large amount of data) : A thread can sleep on a page fault and other threads cannot make progress. I would not call that wrong - system prevents some threads from removing kevents which are counted to be transfered to the userspace, i.e. when dequeuing was awakened and it had seen some events it is possible, that when it will dequeue them part will be removed by other thread, so I prevent this. Eric -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take3 1/4] kevent: Core files.
On Thursday 03 August 2006 16:55, Evgeniy Polyakov wrote: On Thu, Aug 03, 2006 at 04:40:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: + mutex_lock(u-ctl_mutex); + while (num max_nr ((k = kqueue_dequeue_ready(u)) != NULL)) { + if (copy_to_user(buf + num*sizeof(struct ukevent), + k-event, sizeof(struct ukevent))) { + cerr = -EINVAL; + break; + } It seems quite wrong to hold ctl_mutex while doing a copy_to_user() (of possibly a large amount of data) : A thread can sleep on a page fault and other threads cannot make progress. I would not call that wrong - system prevents some threads from removing kevents which are counted to be transfered to the userspace, i.e. when dequeuing was awakened and it had seen some events it is possible, that when it will dequeue them part will be removed by other thread, so I prevent this. Hum, wrong was maybe not the good word but kqueue_dequeue_ready() uses a spinlock (ready_lock) to protect ready_list. One particular struct kevent is given to one thread, one at a time. If you look at fs/eventpoll.c, you can see how carefull is ep_send_events() so that multiple threads can in the same time transfer different items to user memory. In a model where several threads are servicing events collected by a single point (epoll, or kevent), this is important to not block all threads because of a single thread waiting a swapin (trigered by copy_to_user() ) Eric - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take3 1/4] kevent: Core files.
On Thu, Aug 03, 2006 at 05:11:58PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: On Thursday 03 August 2006 16:55, Evgeniy Polyakov wrote: On Thu, Aug 03, 2006 at 04:40:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) wrote: + mutex_lock(u-ctl_mutex); + while (num max_nr ((k = kqueue_dequeue_ready(u)) != NULL)) { + if (copy_to_user(buf + num*sizeof(struct ukevent), + k-event, sizeof(struct ukevent))) { + cerr = -EINVAL; + break; + } It seems quite wrong to hold ctl_mutex while doing a copy_to_user() (of possibly a large amount of data) : A thread can sleep on a page fault and other threads cannot make progress. I would not call that wrong - system prevents some threads from removing kevents which are counted to be transfered to the userspace, i.e. when dequeuing was awakened and it had seen some events it is possible, that when it will dequeue them part will be removed by other thread, so I prevent this. Hum, wrong was maybe not the good word but kqueue_dequeue_ready() uses a spinlock (ready_lock) to protect ready_list. One particular struct kevent is given to one thread, one at a time. I mean that wait_event logic will see that there are requested number of events, and when it starts to get them, it is possible that there will be no events at all. If you look at fs/eventpoll.c, you can see how carefull is ep_send_events() so that multiple threads can in the same time transfer different items to user memory. It is done under the same logic under ep-sem semaphore, which is being held for del and read operations. Or do you mean to have rw semahore instead of mutex here? 
In a model where several threads are servicing events collected by a single point (epoll, or kevent), this is important to not block all threads because of a single thread waiting a swapin (triggered by copy_to_user() ) Eric -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take3 1/4] kevent: Core files.
From: Evgeniy Polyakov [EMAIL PROTECTED] Date: Thu, 3 Aug 2006 18:55:57 +0400 I would not call that wrong - system prevents some threads from removing kevents which are counted to be transfered to the userspace, i.e. when dequeuing was awakened and it had seen some events it is possible, that when it will dequeue them part will be removed by other thread, so I prevent this. Queue is all that matters to be synchronized, so it seems better to have a mutex on the queue rather than a global one. That way, user can only hurt himself. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take2 1/4] kevent: core files.
On Tue, Aug 01, 2006 at 04:56:59PM -0700, Zach Brown ([EMAIL PROTECTED]) wrote: OK, here's some of my reactions to the core part. Thanks. +#define KEVENT_SOCKET 0 +#define KEVENT_INODE 1 +#define KEVENT_TIMER 2 +#define KEVENT_POLL3 +#define KEVENT_NAIO4 +#define KEVENT_AIO 5 I guess we can't really avoid some form of centralized list of the constants in the API if we're going for a flat constant namespace. It'll be irritating to manage this list over time, just like it's irritating to manage syscall numbers now. +/* + * Socket/network asynchronous IO events. + */ +#defineKEVENT_SOCKET_RECV 0x1 +#defineKEVENT_SOCKET_ACCEPT0x2 +#defineKEVENT_SOCKET_SEND 0x4 I wonder if these shouldn't live in the subsystems instead of in kevent.h. Yes it could, but it requires including those files in kevent.h, which is exported to userspace, and it is not always possible to publish included file there. +/* + * Poll events. + */ +#defineKEVENT_POLL_POLLIN 0x0001 +#defineKEVENT_POLL_POLLPRI 0x0002 +#defineKEVENT_POLL_POLLOUT 0x0004 +#defineKEVENT_POLL_POLLERR 0x0008 +#defineKEVENT_POLL_POLLHUP 0x0010 +#defineKEVENT_POLL_POLLNVAL0x0020 + +#defineKEVENT_POLL_POLLRDNORM 0x0040 +#defineKEVENT_POLL_POLLRDBAND 0x0080 +#defineKEVENT_POLL_POLLWRNORM 0x0100 +#defineKEVENT_POLL_POLLWRBAND 0x0200 +#defineKEVENT_POLL_POLLMSG 0x0400 +#defineKEVENT_POLL_POLLREMOVE 0x1000 And couldn't we just use the existing poll bit definitions for this? asm/poll.h I expect. linux/poll.h is too heavy or not? +struct kevent_id +{ + __u32 raw[2]; +}; Why not a simple u64? Users can play games with packing it into other types if they want. + __u32 user[2];/* User's data. It is not used, just copied to/from user. */ + void*ptr; + }; Again just a u64 seems like it would be simpler. userspace library wrappers can help massage it, but the kernel is just treating it as an opaque data blob. u64 is not aligned, so I prefer to use u32 as much as possible. 
+}; + +#defineKEVENT_CTL_ADD 0 +#defineKEVENT_CTL_REMOVE 1 +#defineKEVENT_CTL_MODIFY 2 +#defineKEVENT_CTL_INIT 3 + +struct kevent_user_control +{ + unsigned intcmd;/* Control command, e.g. KEVENT_ADD, KEVENT_REMOVE... */ + unsigned intnum;/* Number of ukevents this strucutre controls. */ + unsigned inttimeout;/* Timeout in milliseconds waiting for num events to become ready. */ +}; Even if we only have one syscall with a cmd multiplexer (which I'm not thrilled with), we should at least make these arguments explicit in the system call. It's weird to hide them in a struct. We could also think about making them u32 or u64 so that we don't need compat wrappers, but maybe that's overkill. Ok. Also, can we please use a struct timespec for the timeout? Then the kernel will have the luxury of using whatever mechanism it wants to satisfy the user's precision desires. Just like sys_nanosleep() uses timespec and so can be implemented with hrtimers. It has variable size, I strongly against such things between kernel and userspace. +struct kevent +{ (trivial nit, struct kevent { is the preferred form.) Ok. + struct ukevent event; + spinlock_t lock; /* This lock protects ukevent manipulations, e.g. ret_flags changes. */ It'd be great if these struct members could get a prefix (ala: inode - i_, socket - sk_) so that it's less painful getting tags helpers to look up instances for us. Asking for 'lock' is hilarious. But it requires much less typing :) Will update. +struct kevent_list +{ + struct list_headkevent_list;/* List of all kevents. */ + spinlock_t kevent_lock;/* Protects all manipulations with queue of kevents. */ +}; + +struct kevent_user +{ + struct kevent_list kqueue[KEVENT_HASH_MASK+1]; Hmm. I think the current preference is not to have a lock per bucket. It doesn't scale nearly as well as it seems like it should as the cache footprint is higher and as cacheline contention hits as there are multiple buckets per cacheline. 
For now I'd simplify the hash into a single lock and an array of struct hlist_head. In the future it could be another user of some kind of relatively-generic hash implementation based on rcu that has been talked about for a while. Well, it scales better than one lock per the whole queue, but we can see how it looks with one lock. I used RCU hash table in kevents, but it scales very
Re: [take2 1/4] kevent: core files.
On Tue, Aug 01, 2006 at 05:01:38PM -0700, David Miller ([EMAIL PROTECTED]) wrote: From: Zach Brown [EMAIL PROTECTED] Date: Tue, 01 Aug 2006 16:56:59 -0700 Even if we only have one syscall with a cmd multiplexer (which I'm not thrilled with), we should at least make these arguments explicit in the system call. It's weird to hide them in a struct. We could also think about making them u32 or u64 so that we don't need compat wrappers, but maybe that's overkill. I think making the userspace data structure not require any compat handling is a must, thanks for pointing this out Zach. It does not require compat macros, since unsigned int has the same size on all normal machines where Linux runs, although it can be different. Anyway, I will replace it with explicit syscall parameters. It'd be great if these struct members could get a prefix (ala: inode - i_, socket - sk_) so that it's less painful getting tags helpers to look up instances for us. Asking for 'lock' is hilarious. Agreed. Heh, it was so much less typing... Hmm. I think the current preference is not to have a lock per bucket. Yes, it loses badly, that's why we undid this in the routing cache and just have a fixed sized array of locks which is hashed into. For kevents, I think a single spinlock initially is fine and if we hit performance problems on SMP we can fix it. We should not implement complexity we have no proof of needing yet :) Ok, let's see how it will behave. +#define KEVENT_MAX_REQUESTS PAGE_SIZE/sizeof(struct kevent) This is unused? It is probably groundwork for the mmap() ring buffer... :) A lot of work, isn't it? :) -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take2 1/4] kevent: core files.
From: Evgeniy Polyakov [EMAIL PROTECTED] Date: Wed, 2 Aug 2006 10:39:18 +0400 u64 is not aligned, so I prefer to use u32 as much as possible. We have aligned_u64 exactly for this purpose, netfilter makes use of it to avoid the x86_64 vs. x86 u64 alignment discrepency. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take2 1/4] kevent: core files.
On Wed, Aug 02, 2006 at 12:25:05AM -0700, David Miller ([EMAIL PROTECTED]) wrote: From: Evgeniy Polyakov [EMAIL PROTECTED] Date: Wed, 2 Aug 2006 10:39:18 +0400 u64 is not aligned, so I prefer to use u32 as much as possible. We have aligned_u64 exactly for this purpose, netfilter makes use of it to avoid the x86_64 vs. x86 u64 alignment discrepency. Ok, I will use that type. -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
Herbert Xu wrote: The other to consider is that events don't come from the hardware. Events are written by the kernel. So if user-space is just reading the events that we've written, then there are no cache misses at all. Not quite true. The ring buffer can be written to from another processor. The kernel thread responsible for generating the event (receiving data from network or disk, expired timer) can run independently on another CPU. This is the case to keep in mind here. I thought Zach and the others involved in the discussions in Ottawa said this has been shown to be a problem and that a ring buffer implementation with something other than simple front and back pointers is preferable. -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ signature.asc Description: OpenPGP digital signature
Re: [RFC 1/4] kevent: core files.
From: Ulrich Drepper [EMAIL PROTECTED] Date: Tue, 01 Aug 2006 00:53:10 -0700 This is the case to keep in mind here. I thought Zach and the other involved in the discussions in Ottawa said this has been shown to be a problem and that a ring buffer implementation with something other than simple front and back pointers is preferable. This is part of why I suggested VJ style channel data structure. At worst, the cachelines for the entries get into shared modified state when the remote userland cpu reads the slot. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[take2 1/4] kevent: core files.
This patch includes core kevent files: - userspace controlling - kernelspace interfaces - initialization - notification state machines It might also inlclude parts from other subsystem (like network related syscalls, so it is possible that it will not compile without other patches applied). Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED] diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index dd63d47..0af988a 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -317,3 +317,7 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .long sys_aio_recv + .long sys_aio_send + .long sys_kevent_get_events + .long sys_kevent_ctl diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5d4a7d1..e157ad4 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -713,4 +713,8 @@ #endif .quad sys_tee .quad compat_sys_vmsplice .quad compat_sys_move_pages + .quad sys_aio_recv + .quad sys_aio_send + .quad sys_kevent_get_events + .quad sys_kevent_ctl ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index fc1c8dd..a76e50d 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -323,10 +323,14 @@ #define __NR_sync_file_range 314 #define __NR_tee 315 #define __NR_vmsplice 316 #define __NR_move_pages317 +#define __NR_aio_recv 318 +#define __NR_aio_send 319 +#define __NR_kevent_get_events 320 +#define __NR_kevent_ctl321 #ifdef __KERNEL__ -#define NR_syscalls 318 +#define NR_syscalls 322 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 94387c9..9e61299 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -619,10 +619,18 @@ #define __NR_vmsplice 278 __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages279 __SYSCALL(__NR_move_pages, sys_move_pages) +#define 
__NR_aio_recv 280 +__SYSCALL(__NR_aio_recv, sys_aio_recv) +#define __NR_aio_send 281 +__SYSCALL(__NR_aio_send, sys_aio_send) +#define __NR_aio_sendfile 282 +__SYSCALL(__NR_aio_sendfile, sys_kevent_get_events) +#define __NR_kevent_ctl283 +__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl) #ifdef __KERNEL__ -#define __NR_syscall_max __NR_move_pages +#define __NR_syscall_max __NR_kevent_ctl #ifndef __NO_STUBS diff --git a/include/linux/kevent.h b/include/linux/kevent.h new file mode 100644 index 000..6c36f3f --- /dev/null +++ b/include/linux/kevent.h @@ -0,0 +1,259 @@ +/* + * kevent.h + * + * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED] + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __KEVENT_H +#define __KEVENT_H + +/* + * Kevent request flags. + */ + +#define KEVENT_REQ_ONESHOT 0x1 /* Process this event only once and then dequeue. */ + +/* + * Kevent return flags. + */ +#define KEVENT_RET_BROKEN 0x1 /* Kevent is broken. */ +#define KEVENT_RET_DONE0x2 /* Kevent processing was finished successfully. */ + +/* + * Kevent type set. + */ +#define KEVENT_SOCKET 0 +#define KEVENT_INODE 1 +#define KEVENT_TIMER 2 +#define KEVENT_POLL3 +#define KEVENT_NAIO4 +#define KEVENT_AIO 5 +#defineKEVENT_MAX 6 + +/* + * Per-type event sets. 
+ * Number of per-event sets should be exactly as number of kevent types. + */ + +/* + * Timer events. + */ +#defineKEVENT_TIMER_FIRED 0x1 + +/* + * Socket/network asynchronous IO events. + */ +#defineKEVENT_SOCKET_RECV 0x1 +#defineKEVENT_SOCKET_ACCEPT0x2 +#defineKEVENT_SOCKET_SEND 0x4 + +/* + * Inode events. + */ +#defineKEVENT_INODE_CREATE 0x1 +#defineKEVENT_INODE_REMOVE 0x2 + +/* + * Poll events. + */ +#defineKEVENT_POLL_POLLIN 0x0001 +#defineKEVENT_POLL_POLLPRI 0x0002
Re: [take2 1/4] kevent: core files.
On Tue, 1 Aug 2006, Evgeniy Polyakov wrote: + u-ready_num = 0; +#ifdef CONFIG_KEVENT_USER_STAT + u-wait_num = u-im_num = u-total = 0; +#endif Generally, #ifdefs in the body of the kernel code are discouraged. Can you abstract these out as static inlines? - James -- James Morris [EMAIL PROTECTED] - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take2 1/4] kevent: core files.
On Tue, Aug 01, 2006 at 09:46:58AM -0400, James Morris ([EMAIL PROTECTED]) wrote: On Tue, 1 Aug 2006, Evgeniy Polyakov wrote: + u-ready_num = 0; +#ifdef CONFIG_KEVENT_USER_STAT + u-wait_num = u-im_num = u-total = 0; +#endif Generally, #ifdefs in the body of the kernel code are discouraged. Can you abstract these out as static inlines? Yes, it is possible. I would ask is it needed at all? It contains number of immediately fired events (i.e. those which were ready when event was added and thus syscall returned immediately showing that it is ready), total number of events, which were inserted in the given queue and number of events which were marked as ready after they were inserted. Currently it is compilation option which ends up in printk with above info when kevent queue is removed. - James -- James Morris [EMAIL PROTECTED] -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take2 1/4] kevent: core files.
On Tue, 1 Aug 2006, Evgeniy Polyakov wrote: On Tue, Aug 01, 2006 at 09:46:58AM -0400, James Morris ([EMAIL PROTECTED]) wrote: On Tue, 1 Aug 2006, Evgeniy Polyakov wrote: + u-ready_num = 0; +#ifdef CONFIG_KEVENT_USER_STAT + u-wait_num = u-im_num = u-total = 0; +#endif Generally, #ifdefs in the body of the kernel code are discouraged. Can you abstract these out as static inlines? Yes, it is possible. I would ask is it needed at all? Yes, please, it is standard kernel development practice. Otherwise, the kernel will turn into an unmaintainable #ifdef jungle. It contains number of immediately fired events (i.e. those which were ready when event was added and thus syscall returned immediately showing that it is ready), total number of events, which were inserted in the given queue and number of events which were marked as ready after they were inserted. Currently it is compilation option which ends up in printk with above info when kevent queue is removed. Fine, make static inline void kevent_user_stat_reset(u); etc. which compile to nothing when it's not configured. -- James Morris [EMAIL PROTECTED] - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take2 1/4] kevent: core files.
On Tue, Aug 01, 2006 at 10:27:36AM -0400, James Morris ([EMAIL PROTECTED]) wrote: + u-ready_num = 0; +#ifdef CONFIG_KEVENT_USER_STAT + u-wait_num = u-im_num = u-total = 0; +#endif Generally, #ifdefs in the body of the kernel code are discouraged. Can you abstract these out as static inlines? Yes, it is possible. I would ask is it needed at all? Yes, please, it is standard kernel development practice. Will do. Thanks, James. -- James Morris [EMAIL PROTECTED] -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
I do not think if we do a ring buffer that events should be obtainable via a syscall at all. Rather, I think this system call should be purely sleep until ring is not empty. Mmm, yeah, of course. That's much simpler. I'm looking forward to Evgeniy's next patch set. The ring buffer size, as Evgeniy also tried to describe, is bounded purely by the number of registered events. Yeah. fwiw, fs/aio.c has this property today. - z - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take2 1/4] kevent: core files.
From: Zach Brown [EMAIL PROTECTED] Date: Tue, 01 Aug 2006 16:56:59 -0700 Even if we only have one syscall with a cmd multiplexer (which I'm not thrilled with), we should at least make these arguments explicit in the system call. It's weird to hide them in a struct. We could also think about making them u32 or u64 so that we don't need compat wrappers, but maybe that's overkill. I think making the userspace data structure not require any compat handling is a must, thanks for pointing this out Zach. It'd be great if these struct members could get a prefix (ala: inode - i_, socket - sk_) so that it's less painful getting tags helpers to look up instances for us. Asking for 'lock' is hilarious. Agreed. Hmm. I think the current preference is not to have a lock per bucket. Yes, it loses badly, that's why we undid this in the routing cache and just have a fixed sized array of locks which is hashed into. For kevents, I think a single spinlock initially is fine and if we hit performance problems on SMP we can fix it. We should not implement complexity we have no proof of needing yet :) +#define KEVENT_MAX_REQUESTSPAGE_SIZE/sizeof(struct kevent) This is unused? It is probably groundwork for the mmap() ring buffer... :) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [take2 1/4] kevent: core files.
OK, here's some of my reactions to the core part. +#define KEVENT_SOCKET0 +#define KEVENT_INODE 1 +#define KEVENT_TIMER 2 +#define KEVENT_POLL 3 +#define KEVENT_NAIO 4 +#define KEVENT_AIO 5 I guess we can't really avoid some form of centralized list of the constants in the API if we're going for a flat constant namespace. It'll be irritating to manage this list over time, just like it's irritating to manage syscall numbers now. +/* + * Socket/network asynchronous IO events. + */ +#define KEVENT_SOCKET_RECV 0x1 +#define KEVENT_SOCKET_ACCEPT0x2 +#define KEVENT_SOCKET_SEND 0x4 I wonder if these shouldn't live in the subsystems instead of in kevent.h. +/* + * Poll events. + */ +#define KEVENT_POLL_POLLIN 0x0001 +#define KEVENT_POLL_POLLPRI 0x0002 +#define KEVENT_POLL_POLLOUT 0x0004 +#define KEVENT_POLL_POLLERR 0x0008 +#define KEVENT_POLL_POLLHUP 0x0010 +#define KEVENT_POLL_POLLNVAL0x0020 + +#define KEVENT_POLL_POLLRDNORM 0x0040 +#define KEVENT_POLL_POLLRDBAND 0x0080 +#define KEVENT_POLL_POLLWRNORM 0x0100 +#define KEVENT_POLL_POLLWRBAND 0x0200 +#define KEVENT_POLL_POLLMSG 0x0400 +#define KEVENT_POLL_POLLREMOVE 0x1000 And couldn't we just use the existing poll bit definitions for this? +struct kevent_id +{ + __u32 raw[2]; +}; Why not a simple u64? Users can play games with packing it into other types if they want. + __u32 user[2];/* User's data. It is not used, just copied to/from user. */ + void*ptr; + }; Again just a u64 seems like it would be simpler. userspace library wrappers can help massage it, but the kernel is just treating it as an opaque data blob. +}; + +#define KEVENT_CTL_ADD 0 +#define KEVENT_CTL_REMOVE 1 +#define KEVENT_CTL_MODIFY 2 +#define KEVENT_CTL_INIT 3 + +struct kevent_user_control +{ + unsigned intcmd;/* Control command, e.g. KEVENT_ADD, KEVENT_REMOVE... */ + unsigned intnum;/* Number of ukevents this strucutre controls. */ + unsigned inttimeout;/* Timeout in milliseconds waiting for num events to become ready. 
*/ +}; Even if we only have one syscall with a cmd multiplexer (which I'm not thrilled with), we should at least make these arguments explicit in the system call. It's weird to hide them in a struct. We could also think about making them u32 or u64 so that we don't need compat wrappers, but maybe that's overkill. Also, can we please use a struct timespec for the timeout? Then the kernel will have the luxury of using whatever mechanism it wants to satisfy the user's precision desires. Just like sys_nanosleep() uses timespec and so can be implemented with hrtimers. +struct kevent +{ (trivial nit, struct kevent { is the preferred form.) + struct ukevent event; + spinlock_t lock; /* This lock protects ukevent manipulations, e.g. ret_flags changes. */ It'd be great if these struct members could get a prefix (ala: inode - i_, socket - sk_) so that it's less painful getting tags helpers to look up instances for us. Asking for 'lock' is hilarious. +struct kevent_list +{ + struct list_headkevent_list;/* List of all kevents. */ + spinlock_t kevent_lock;/* Protects all manipulations with queue of kevents. */ +}; + +struct kevent_user +{ + struct kevent_list kqueue[KEVENT_HASH_MASK+1]; Hmm. I think the current preference is not to have a lock per bucket. It doesn't scale nearly as well as it seems like it should as the cache footprint is higher and as cacheline contention hits as there are multiple buckets per cacheline. For now I'd simplify the hash into a single lock and an array of struct hlist_head. In the future it could be another user of some kind of relatively-generic hash implementation based on rcu that has been talked about for a while. +#define KEVENT_MAX_REQUESTS PAGE_SIZE/sizeof(struct kevent) This is unused? 
+#define list_for_each_entry_reverse_safe(pos, n, head, member) \ + for (pos = list_entry((head)-prev, typeof(*pos), member), \ + n = list_entry(pos-member.prev, typeof(*pos), member); \ + prefetch(pos-member.prev), pos-member != (head);\ + pos = n, n = list_entry(pos-member.prev, typeof(*pos), member)) If anyone was calling this they could use list_for_each_entry_safe_reverse() in list.h but nothing is calling it? Either way, it should be removed :). +#define sock_async(__sk) 0 It's a minor complaint, but these kinds of ifdefs that drop arguments can cause unused
Re: [RFC 1/4] kevent: core files.
On Sat, Jul 29, 2006 at 09:18:47AM -0700, Ulrich Drepper ([EMAIL PROTECTED]) wrote: Evgeniy Polyakov wrote: Btw, why do we want mapped ring of ready events? If user requestd some event, he definitely wants to get them back when they are ready, and not to check and then get them? Could you please explain more on this issue? If of course makes no sense to enter the kernel to actually get the event. This should be done by storing the event in the ring buffer. I.e., there are two ways to get an event: - with a syscall. This can report as many events at once as the caller provides space for. And no event which is reported in the run buffer should be reported this way - if there is space, report it in the ring buffer. Yes, the buffer can be optional, then all events are reported by the system call. That requires a copy, which can neglect syscall overhead. Do we really want it to be done? So the use case would be like this: wait_and_get_event: is buffer empty ? yes - make syscall no - get event from buffer To avoid races, the syscall needs to take a parameter indicating the last event checked out from the buffer. If in the meantime the kernel put another event in the buffer the syscall immediately returns. Similar to what we do in the futex syscall. And how misordering between queue and buffer is going to be managed? I.e. when buffer is full and events are placed into queue, so syscall could get them, and then syscall is called to get events from the queue but not from the buffer - we can endup taking events from buffer while old are placed in the queue. And how waiting will be done without syscalls? Will glibc take care of it? The question is how to best represent the ring buffer. Zach and some others had some ready responses in Ottawa. The important thing is to avoid cache line ping pong when possible. Is the ring buffer absolutely necessary? Probably not. But it has the potential to help quite a bit. 
Don't look at the problem to solve in the context of heavy I/O operations when another syscall here and there doesn't matter. With this single event mechanism for every possible event the kernel can generate programming can look quite different. E.g., every read() call can implicitly we changed into an async read call followed by a user-level reschedule. This rescheduling allows another thread of execution to run while the read request is processed. I.e., it's basically a setjmp() followed by a goto into the inner loop to get the next event. And now suddenly the event notification mechanism really should be as fast as possible. If we submit basically every request asynchronously and are not creating dedicated threads for specific tasks anymore we a) have a lot more event notifications b) the probability of an event being reported when we want the receive the next one if higher (i.e., the case where no syscall vs syscall makes a difference) Yes, all this will require changes in the way programs a written but we shouldn't limit the way we can write programs unnecessarily. I think that given increasing discrepancies in relative speed/latency of the peripherals and the CPU this is one possible solution to keep the CPUs busy without resorting to a gazillion separate threads in each program. Ok, let's do it in the following way: I present new version of kevent with new syscalls and fixed issues mentioned before, while people look at it we can end up with mapped buffer design. Is it ok? -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
Evgeniy Polyakov [EMAIL PROTECTED] wrote: - if there is space, report it in the ring buffer. Yes, the buffer can be optional, then all events are reported by the system call. That requires a copy, which can neglect syscall overhead. Do we really want it to be done? Please note that we're talking about events here, not actual data. So only the event is being copied, which is presumably rather small compared to the data. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
On Mon, Jul 31, 2006 at 08:35:55PM +1000, Herbert Xu ([EMAIL PROTECTED]) wrote: Evgeniy Polyakov [EMAIL PROTECTED] wrote: - if there is space, report it in the ring buffer. Yes, the buffer can be optional, then all events are reported by the system call. That requires a copy, which can neglect syscall overhead. Do we really want it to be done? Please note that we're talking about events here, not actual data. So only the event is being copied, which is presumably rather small compared to the data. In syscall time kevents copy 40bytes for each event + 12 bytes of header (number of events, timeout and command number). That's likely two cache lines if only one event is reported. -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
From: Evgeniy Polyakov [EMAIL PROTECTED] Date: Mon, 31 Jul 2006 14:50:37 +0400 In syscall time kevents copy 40bytes for each event + 12 bytes of header (number of events, timeout and command number). That's likely two cache lines if only one event is reported. Do you know how many cachelines are dirtied by system call entry and exit on typical system? On sparc64 it is a minimum of 3 64-byte cachelines just to save and restore the system call time cpu register state. If application is deep in a call chain, register windows might spill and each such register window will dirty 2 more cachelines as they are dumped to the stack. I am not even talking about the other basic necessities of doing a system call such as touching various task_struct and thread_info state to check for pending signals etc. System call overhead is non-trivial especially when you are using it to move only a few small objects into and out of the kernel. So I would say for up to 4 or 5 events, system call overhead alone touches as many cache lines as the events themselves. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
On Mon, Jul 31, 2006 at 03:57:16AM -0700, David Miller wrote: So I would say for up to 4 or 5 events, system call overhead alone touches as many cache lines as the events themselves. Absolutely. The other to consider is that events don't come from the hardware. Events are written by the kernel. So if user-space is just reading the events that we've written, then there are no cache misses at all. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
On Mon, Jul 31, 2006 at 02:33:22PM +0400, Evgeniy Polyakov ([EMAIL PROTECTED]) wrote: Ok, let's do it in the following way: I present new version of kevent with new syscalls and fixed issues mentioned before, while people look at it we can end up with mapped buffer design. Is it ok? Since kevents are never generated by kernel, but only marked as ready, length of the main queue performs as flow control, so we can create a mapped buffer which will have space equal to the main queue length multiplied by size of the copied to userspace structure plus 16 bits for the start index of the kernel writing side, i.e. it will store offset where the oldest event was placed. Since queue length is a limited factor and thus no new events can be added when queue is full, that means that buffer is full too and userspace must read events. When syscall is called to add new kevent and provided there offset differs from what kernel stored, that means that all events from kernel to provided index have been read and new events can be added. Thus we can even allow read-only mapping. Kernel's index is incremented modulo queue length. If kevent was removed after it was marked as ready, it's copy stays in the mapped buffer, but special flag can be assigned to show that kevent is no longer valid. -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
From: Evgeniy Polyakov [EMAIL PROTECTED] Date: Mon, 31 Jul 2006 23:41:43 +0400 Since kevents are never generated by kernel, but only marked as ready, length of the main queue performs as flow control, so we can create a mapped buffer which will have space equal to the main queue length multiplied by size of the copied to userspace structure plus 16 bits for the start index of the kernel writing side, i.e. it will store offset where the oldest event was placed. Since queue length is a limited factor and thus no new events can be added when queue is full, that means that buffer is full too and userspace must read events. When syscall is called to add new kevent and provided there offset differs from what kernel stored, that means that all events from kernel to provided index have been read and new events can be added. Thus we can even allow read-only mapping. Kernel's index is incremented modulo queue length. If kevent was removed after it was marked as ready, it's copy stays in the mapped buffer, but special flag can be assigned to show that kevent is no longer valid. This sounds reasonable. However we must be mindful that the thread of control trying to add a new event might not be in a position to drain the queue of pending events when the queue is full. Usually he will be trying to add an event in response to handling another event. So we'd have cases like this, assume we start with a full event queue: thread Athread B dequeue event aha, new connection accept() register new kevent queue is now full again add kevent on new connection At this point thread A doesn't have very many options when the kevent add fails. You cannot force this thread to read more events, since he may not be in a state where he is easily able to do so. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
On Monday 31 July 2006 17:00, David Miller wrote: So we'd have cases like this, assume we start with a full event queue: thread Athread B dequeue event aha, new connection accept() register new kevent queue is now full again add kevent on new connection At this point thread A doesn't have very many options when the kevent add fails. You cannot force this thread to read more events, since he may not be in a state where he is easily able to do so. There has to be some thread that is responsible for reading events. Perhaps a reasonable thing for a blocked thread that cannot process events to do is to yield to one that can? - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
From: Brent Cook [EMAIL PROTECTED] Date: Mon, 31 Jul 2006 17:16:48 -0500 There has to be some thread that is responsible for reading events. Perhaps a reasonable thing for a blocked thread that cannot process events to do is to yield to one that can? The reason one decentralizes event processing into threads is so that once they are tasked to process some event they need not be concerned with event state. They are designed to process their event through to the end, then return to the top level and say any more work for me? - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
From: Zach Brown [EMAIL PROTECTED] Date: Thu, 27 Jul 2006 12:18:42 -0700 [ I kept this thread around in my inbox because I wanted to give it some deep thought, so sorry for replying to old bits... ] So as the kernel generates events in the ring it only produces an event if the ownership field says that userspace has consumed it and in doing so it sets the ownership field to tell userspace that an event is waiting. userspace and the kernel now each follow their index around the ring as the ownership field lets them produce or consume the event at their index. Can someone tell me if the cache coherence costs of this are extreme? I'm hoping they're not. No need for an owner field, we can use something like a VJ netchannel datastructure for this. Kernel only writes to producer index and user only writes to consumer index. So, great, glibc can now find pending events very quickly if they're waiting in the ring and can fall back to the collection syscall if it wants to wait and the ring is empty. If it consumes events via the syscall it increases its ring index by the number the syscall returned. I do not think if we do a ring buffer that events should be obtainable via a syscall at all. Rather, I think this system call should be purely sleep until ring is not empty. This is actually reasonably simple stuff to implement as Evgeniy has tried to explain. Events in kevent live on a ready list when they have triggered. Existence on a list determined the state, and I think this design btw invalidates some of the arguments against using netlink that Ulrich mentions in his paper. If netlink socket queuing fails, well then kevent stays on ready list and that is all until the kevent can be successfully published to the user. I am not advocating netlink at all for this, as the ring buffer idea is much better. The ring buffer size, as Evgeniy also tried to describe, is bounded purely by the number of registered events. 
So event loop of application might look something like this: struct ukevent cur_event; struct timeval timeo; setup_timeout(timeo); for (;;) { int err; while(!(err = ukevent_dequeue(evt_fd, evt_ring, cur_event, timeo))) { struct my_event_object *o = event_to_object(cur_event); o->dispatch(o, cur_event); setup_timeout(timeo); } if (err == -ETIMEDOUT) timeout_processing(); else event_error_processing(err); } ukevent_dequeue() is perhaps some GLIBC implemented routine which does something like: int err; for (;;) { if (!evt_ring_empty(evt_ring)) { struct ukevent *p = evt_ring_consume(evt_ring); memcpy(event_p, p, sizeof(struct ukevent)); return 0; } err = kevent_wait(evt_fd, timeo_p); if (err < 0) break; } return err; It's just some stupid ideas... we could also choose to expose the ring buffer layout directly to the user event loop and let it perform the dequeue operation and kevent_wait() calls directly. I don't see why not to allow that. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
From: Evgeniy Polyakov [EMAIL PROTECTED] Date: Fri, 28 Jul 2006 09:23:12 +0400 I completely agree that existing kevent interface is not the best, so I'm opened for any suggestions. Should kevent creation/removing/modification be separated too? I do not think so, object for these 3 operations are the same, so there are no typing issues. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
Nicholas Miell wrote: [...] and was wondering if you were familiar with the Solaris port APIs* and, I wasn't. if so, you could please comment on how your proposed event channels are different/better. There indeed is not much difference. The differences are in the details. The way those ports are specified doesn't allow much room for further optimizations. E.g., the userlevel ring buffer isn't possible. But mostly it's the same semantics. The ec_t type in my text is also better a file descriptor since otherwise it cannot be transported via Unix stream sockets. -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ signature.asc Description: OpenPGP digital signature
Re: [RFC 1/4] kevent: core files.
On Fri, Jul 28, 2006 at 08:38:02PM -0700, Ulrich Drepper ([EMAIL PROTECTED]) wrote: Zach Brown wrote: Ulrich, would you be satisfied if we didn't have the userspace mapped ring on the first pass and only had a collection syscall? I'm not the one to make a call but why rush things? Let's do it right from the start. Later changes can only lead to problems with users of the earlier interface. Btw, why do we want mapped ring of ready events? If user requested some event, he definitely wants to get them back when they are ready, and not to check and then get them? Could you please explain more on this issue? -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
On Fri, Jul 28, 2006 at 09:32:42PM -0700, Nicholas Miell ([EMAIL PROTECTED]) wrote: Speaking of API design choices, I saw your OLS paper and was wondering if you were familiar with the Solaris port APIs* and, if so, you could please comment on how your proposed event channels are different/better. As far as it concerns kevents - userspace ports are just usual users of kevents, like timer notifications. Add another syscall to complete requested kevents and you get exactly Solaris ports. It is fairly simple to implement on top of kevents, I just do not see immediate benefits from that. -- Nicholas Miell [EMAIL PROTECTED] -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
Evgeniy Polyakov wrote: Btw, why do we want mapped ring of ready events? If user requested some event, he definitely wants to get them back when they are ready, and not to check and then get them? Could you please explain more on this issue? It of course makes no sense to enter the kernel to actually get the event. This should be done by storing the event in the ring buffer. I.e., there are two ways to get an event: - with a syscall. This can report as many events at once as the caller provides space for. And no event which is reported in the ring buffer should be reported this way - if there is space, report it in the ring buffer. Yes, the buffer can be optional, then all events are reported by the system call. So the use case would be like this: wait_and_get_event: is buffer empty ? yes - make syscall no - get event from buffer To avoid races, the syscall needs to take a parameter indicating the last event checked out from the buffer. If in the meantime the kernel put another event in the buffer the syscall immediately returns. Similar to what we do in the futex syscall. The question is how to best represent the ring buffer. Zach and some others had some ready responses in Ottawa. The important thing is to avoid cache line ping pong when possible. Is the ring buffer absolutely necessary? Probably not. But it has the potential to help quite a bit. Don't look at the problem to solve in the context of heavy I/O operations when another syscall here and there doesn't matter. With this single event mechanism for every possible event the kernel can generate programming can look quite different. E.g., every read() call can implicitly be changed into an async read call followed by a user-level reschedule. This rescheduling allows another thread of execution to run while the read request is processed. I.e., it's basically a setjmp() followed by a goto into the inner loop to get the next event. 
And now suddenly the event notification mechanism really should be as fast as possible. If we submit basically every request asynchronously and are not creating dedicated threads for specific tasks anymore we a) have a lot more event notifications b) the probability of an event being reported when we want to receive the next one is higher (i.e., the case where no syscall vs syscall makes a difference) Yes, all this will require changes in the way programs are written but we shouldn't limit the way we can write programs unnecessarily. I think that given increasing discrepancies in relative speed/latency of the peripherals and the CPU this is one possible solution to keep the CPUs busy without resorting to a gazillion separate threads in each program. -- ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖ signature.asc Description: OpenPGP digital signature
Re: [RFC 1/4] kevent: core files.
On Saturday 29 July 2006 18:18, Ulrich Drepper wrote: Evgeniy Polyakov wrote: Btw, why do we want mapped ring of ready events? If user requested some event, he definitely wants to get them back when they are ready, and not to check and then get them? Could you please explain more on this issue? It of course makes no sense to enter the kernel to actually get the event. This should be done by storing the event in the ring buffer. I.e., there are two ways to get an event: - with a syscall. This can report as many events at once as the caller provides space for. And no event which is reported in the ring buffer should be reported this way - if there is space, report it in the ring buffer. Yes, the buffer can be optional, then all events are reported by the system call. So the use case would be like this: wait_and_get_event: is buffer empty ? yes - make syscall no - get event from buffer To avoid races, the syscall needs to take a parameter indicating the last event checked out from the buffer. If in the meantime the kernel put another event in the buffer the syscall immediately returns. Similar to what we do in the futex syscall. Couldn't this be done in a general way: Given a fd that supports streaming input, map some user-mem as a ring buffer for input. Maybe the kernel should control the buffer in order to make resizing possible (i.e., TCP zero-copy and window scaling). Hans Henrik - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
On Sat, 2006-07-29 at 19:48 +0400, Evgeniy Polyakov wrote: On Fri, Jul 28, 2006 at 09:32:42PM -0700, Nicholas Miell ([EMAIL PROTECTED]) wrote: Speaking of API design choices, I saw your OLS paper and was wondering if you were familiar with the Solaris port APIs* and, if so, you could please comment on how your proposed event channels are different/better. As far as it concerns kevents - userspace ports are just usual users of kevents, like timer notifications. Add another syscall to complete requested kevents and you get exactly Solaris ports. It is fairly simple to implement on top of kevents, I just do not see immediate benefits from that. Sorry, I wasn't talking about kevent, I was talking about the interfaces described in The Need for Asynchronous, Zero-Copy Network I/O by Ulrich Drepper -- specifically the ec_t type and related functions and the modifications to struct sigevent. -- Nicholas Miell [EMAIL PROTECTED] - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
I completely agree that existing kevent interface is not the best, so I'm open to any suggestions. Should kevent creation/removing/modification be separated too? Yeah, I think so. Hmm, it looks like I'm lost here... Yeah, it seems my description might not have sunk in :). We're giving userspace a way to collect events without performing a system call. And why do we want this? So that event collection can be very efficient. How glibc is supposed to determine, that some events already fired and such requests will return immediately, or for example how timer events will be managed? ... That was what my previous mail was all about! - z - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 1/4] kevent: core files.
On Fri, Jul 28, 2006 at 11:33:16AM -0700, Zach Brown ([EMAIL PROTECTED]) wrote: I completely agree that existing kevent interface is not the best, so I'm open to any suggestions. Should kevent creation/removing/modification be separated too? Yeah, I think so. So, I'm going to create kevent_create/destroy/control and kevent_get_events() Or any better names? Hmm, it looks like I'm lost here... Yeah, it seems my description might not have sunk in :). We're giving userspace a way to collect events without performing a system call. And why do we want this? So that event collection can be very efficient. How glibc is supposed to determine, that some events already fired and such requests will return immediately, or for example how timer events will be managed? ... That was what my previous mail was all about! Some events are impossible to create in userspace (like timer notification, which requires timer start and check when timer completed). Actually all events are part of the kernel, since glibc does not have any knowledge about in-kernel state machines which are bound to appropriate kevents, so each kevent takes at least two syscalls (create and get ready), and I do not see how, for example, glibc can avoid them when user requested POLLIN or similar event for network dataflow? According to syscall speed on Linux, last time I checked empty syscall took about 100ns on AMD Athlon 3500+. - z -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html