[take22 1/4] kevent: Core files.

2006-11-01 Thread Evgeniy Polyakov

Core files.

This patch includes core kevent files:
 * userspace controlling
 * kernelspace interfaces
 * initialization
 * notification state machines

Some bits of documentation can be found on project's homepage (and links from 
there):
http://tservice.net.ru/~s0mbre/old/?section=projectsitem=kevent

Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED]

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 7e639f7..a9560eb 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -318,3 +318,6 @@ ENTRY(sys_call_table)
.long sys_vmsplice
.long sys_move_pages
.long sys_getcpu
+   .long sys_kevent_get_events
+   .long sys_kevent_ctl/* 320 */
+   .long sys_kevent_wait
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index b4aa875..cf18955 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -714,8 +714,11 @@ #endif
.quad compat_sys_get_robust_list
.quad sys_splice
.quad sys_sync_file_range
-   .quad sys_tee
+   .quad sys_tee   /* 315 */
.quad compat_sys_vmsplice
.quad compat_sys_move_pages
.quad sys_getcpu
+   .quad sys_kevent_get_events
+   .quad sys_kevent_ctl/* 320 */
+   .quad sys_kevent_wait
 ia32_syscall_end:  
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index bd99870..f009677 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -324,10 +324,13 @@ #define __NR_tee  315
 #define __NR_vmsplice  316
 #define __NR_move_pages317
 #define __NR_getcpu318
+#define __NR_kevent_get_events 319
+#define __NR_kevent_ctl320
+#define __NR_kevent_wait   321
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 319
+#define NR_syscalls 322
 #include linux/err.h
 
 /*
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 6137146..c53d156 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,16 @@ #define __NR_vmsplice 278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events 280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
+#define __NR_kevent_wait   282
+__SYSCALL(__NR_kevent_wait, sys_kevent_wait)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_wait
 #include linux/err.h
 
 #ifndef __NO_STUBS
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 000..743b328
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,205 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED]
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+#include linux/types.h
+#include linux/list.h
+#include linux/rbtree.h
+#include linux/spinlock.h
+#include linux/mutex.h
+#include linux/wait.h
+#include linux/net.h
+#include linux/rcupdate.h
+#include linux/kevent_storage.h
+#include linux/ukevent.h
+
+#define KEVENT_MIN_BUFFS_ALLOC 3
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+
+struct kevent_callbacks {
+   kevent_callback_t   callback, enqueue, dequeue;
+};
+
+#define KEVENT_READY   0x1
+#define KEVENT_STORAGE 0x2
+#define KEVENT_USER0x4
+
+struct kevent
+{
+   /* Used for kevent freeing.*/
+   struct rcu_head rcu_head;
+   struct ukevent  event;
+   /* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+   spinlock_t  ulock;
+
+   /* Entry of user's tree. */
+   struct rb_node  kevent_node;
+   /* Entry of origin's queue. */
+   struct list_headstorage_entry;
+   /* Entry of user's ready. */
+   struct 

Re: [take21 1/4] kevent: Core files.

2006-10-28 Thread Eric Dumazet

+/*
+ * Called under kevent_user-ready_lock, so updates are always protected.
+ */
+int kevent_user_ring_add_event(struct kevent *k)
+{
+   unsigned int pidx, off;
+   struct kevent_mring *ring, *copy_ring;
+   
+   ring = k-user-pring[0];
+
+   if ((ring-kidx + 1 == ring-uidx) ||
+   ((ring-kidx + 1 == KEVENT_MAX_EVENTS)  ring-uidx == 
0)) {
+   if (k-user-overflow_kevent == NULL)
+   k-user-overflow_kevent = k;
+   return -EAGAIN;
+   }
+


I really dont understand how you manage to queue multiple kevents in the 
'overflow list'. You just queue one kevent at most. What am I missing ?





+
+   for (i=0; iKEVENT_MAX_PAGES; ++i) {
+   u-pring[i] = (struct kevent_mring 
*)__get_free_page(GFP_KERNEL);
+   if (!u-pring[i])
+   break;
+   }
+
+   if (i != KEVENT_MAX_PAGES)
+   goto err_out_free;


Why dont you use goto directly ?

if (!u-pring[i])
goto err_out_free;





+
+   u-pring[0]-uidx = u-pring[0]-kidx = 0;
+
+   return 0;
+
+err_out_free:
+   for (i=0; iKEVENT_MAX_PAGES; ++i) {
+   if (!u-pring[i])
+   break;
+
+   free_page((unsigned long)u-pring[i]);
+   }
+   return k;
+}
+






+static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, void 
__user *arg)
+{
+   int err, cerr = 0, knum = 0, rnum = 0, i;
+   void __user *orig = arg;
+   struct ukevent uk;
+
+   mutex_lock(u-ctl_mutex);
+
+   err = -EINVAL;
+   if (num  KEVENT_MIN_BUFFS_ALLOC) {
+   struct ukevent *ukev;
+
+   ukev = kevent_get_user(num, arg);
+   if (ukev) {
+   for (i = 0; i  num; ++i) {
+   err = kevent_user_add_ukevent(ukev[i], u);
+   if (err) {
+   kevent_stat_im(u);
+   if (i != rnum)
+   memcpy(ukev[rnum], ukev[i], 
sizeof(struct ukevent));
+   rnum++;
+   } else
+   knum++;



Why are you using/counting knum ?




+   }
+   if (copy_to_user(orig, ukev, rnum*sizeof(struct 
ukevent)))
+   cerr = -EFAULT;
+   kfree(ukev);
+   goto out_setup;
+   }
+   }
+
+   for (i = 0; i  num; ++i) {
+   if (copy_from_user(uk, arg, sizeof(struct ukevent))) {
+   cerr = -EFAULT;
+   break;
+   }
+   arg += sizeof(struct ukevent);
+
+   err = kevent_user_add_ukevent(uk, u);
+   if (err) {
+   kevent_stat_im(u);
+   if (copy_to_user(orig, uk, sizeof(struct ukevent))) {
+   cerr = -EFAULT;
+   break;
+   }
+   orig += sizeof(struct ukevent);
+   rnum++;
+   } else
+   knum++;
+   }
+
+out_setup:
+   if (cerr  0) {
+   err = cerr;
+   goto out_remove;
+   }
+
+   err = rnum;
+out_remove:
+   mutex_unlock(u-ctl_mutex);
+
+   return err;
+}

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take21 1/4] kevent: Core files.

2006-10-28 Thread Evgeniy Polyakov
On Sat, Oct 28, 2006 at 12:28:12PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
 +/*
 + * Called under kevent_user-ready_lock, so updates are always protected.
 + */
 +int kevent_user_ring_add_event(struct kevent *k)
 +{
 + unsigned int pidx, off;
 + struct kevent_mring *ring, *copy_ring;
 + 
 + ring = k-user-pring[0];
 +
 + if ((ring-kidx + 1 == ring-uidx) ||
 + ((ring-kidx + 1 == KEVENT_MAX_EVENTS)  ring-uidx 
 == 0)) {
 + if (k-user-overflow_kevent == NULL)
 + k-user-overflow_kevent = k;
 + return -EAGAIN;
 + }
 +
 
 
 I really dont understand how you manage to queue multiple kevents in the 
 'overflow list'. You just queue one kevent at most. What am I missing ?

There is no overflow list - it is a pointer to the first kevent in the
ready queue, which was not put into ring buffer. It is an optimisation, 
which allows to not search for that position each time new event should 
be placed into the buffer, when it starts to have an empty slot.
 
 
 +
 +for (i=0; iKEVENT_MAX_PAGES; ++i) {
 +u-pring[i] = (struct kevent_mring 
 *)__get_free_page(GFP_KERNEL);
 +if (!u-pring[i])
 +break;
 +}
 +
 +if (i != KEVENT_MAX_PAGES)
 +goto err_out_free;
 
 Why dont you use goto directly ?
 
   if (!u-pring[i])
   goto err_out_free;
 
 
I used a fallback mode here which allowed to use smaller number of pages
for kevent ring buffer, but then decided to drop it.
So it is possible to use goto directly.
 
 +
 +u-pring[0]-uidx = u-pring[0]-kidx = 0;
 +
 +return 0;
 +
 +err_out_free:
 +for (i=0; iKEVENT_MAX_PAGES; ++i) {
 +if (!u-pring[i])
 +break;
 +
 +free_page((unsigned long)u-pring[i]);
 +}
 +return k;
 +}
 +
 
 
 
 
 +static int kevent_user_ctl_add(struct kevent_user *u, unsigned int num, 
 void __user *arg)
 +{
 +int err, cerr = 0, knum = 0, rnum = 0, i;
 +void __user *orig = arg;
 +struct ukevent uk;
 +
 +mutex_lock(u-ctl_mutex);
 +
 +err = -EINVAL;
 +if (num  KEVENT_MIN_BUFFS_ALLOC) {
 +struct ukevent *ukev;
 +
 +ukev = kevent_get_user(num, arg);
 +if (ukev) {
 +for (i = 0; i  num; ++i) {
 +err = kevent_user_add_ukevent(ukev[i], u);
 +if (err) {
 +kevent_stat_im(u);
 +if (i != rnum)
 +memcpy(ukev[rnum], 
 ukev[i], sizeof(struct ukevent));
 +rnum++;
 +} else
 +knum++;
 
 
 Why are you using/counting knum ?
 
It should go avay. 
 
 +}
 +if (copy_to_user(orig, ukev, rnum*sizeof(struct 
 ukevent)))
 +cerr = -EFAULT;
 +kfree(ukev);
 +goto out_setup;
 +}
 +}
 +
 +for (i = 0; i  num; ++i) {
 +if (copy_from_user(uk, arg, sizeof(struct ukevent))) {
 +cerr = -EFAULT;
 +break;
 +}
 +arg += sizeof(struct ukevent);
 +
 +err = kevent_user_add_ukevent(uk, u);
 +if (err) {
 +kevent_stat_im(u);
 +if (copy_to_user(orig, uk, sizeof(struct ukevent))) 
 {
 +cerr = -EFAULT;
 +break;
 +}
 +orig += sizeof(struct ukevent);
 +rnum++;
 +} else
 +knum++;
 +}
 +
 +out_setup:
 +if (cerr  0) {
 +err = cerr;
 +goto out_remove;
 +}
 +
 +err = rnum;
 +out_remove:
 +mutex_unlock(u-ctl_mutex);
 +
 +return err;
 +}
 -
 To unsubscribe from this list: send the line unsubscribe netdev in
 the body of a message to [EMAIL PROTECTED]
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take21 1/4] kevent: Core files.

2006-10-28 Thread Eric Dumazet

Evgeniy Polyakov a e'crit :

On Sat, Oct 28, 2006 at 12:28:12PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:


I really dont understand how you manage to queue multiple kevents in the 
'overflow list'. You just queue one kevent at most. What am I missing ?


There is no overflow list - it is a pointer to the first kevent in the
ready queue, which was not put into ring buffer. It is an optimisation, 
which allows to not search for that position each time new event should 
be placed into the buffer, when it starts to have an empty slot.


This overflow list (you may call it differently, but still it IS a list), is 
not complete. I feel you add it just to make me happy, but I am not (yet :) )


For example, you make no test at kevent_finish_user_complete() time.

Obviously, you can have a dangling pointer, and crash your box in certain 
conditions.


static void kevent_finish_user_complete(struct kevent *k, int deq)
{
struct kevent_user *u = k-user;
unsigned long flags;

if (deq)
kevent_dequeue(k);

spin_lock_irqsave(u-ready_lock, flags);
if (k-flags  KEVENT_READY) {
+   if (u-overflow_event == k) {
+   /* MUST do something to change u-overflow_kevent */
+   }
list_del(k-ready_entry);
k-flags = ~KEVENT_READY;
u-ready_num--;
}
spin_unlock_irqrestore(u-ready_lock, flags);

kevent_user_put(u);
call_rcu(k-rcu_head, kevent_free_rcu);
}

Eric
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take21 1/4] kevent: Core files.

2006-10-28 Thread Evgeniy Polyakov
On Sat, Oct 28, 2006 at 02:36:31PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
 Evgeniy Polyakov a e'crit :
 On Sat, Oct 28, 2006 at 12:28:12PM +0200, Eric Dumazet 
 ([EMAIL PROTECTED]) wrote:
 
 I really dont understand how you manage to queue multiple kevents in the 
 'overflow list'. You just queue one kevent at most. What am I missing ?
 
 There is no overflow list - it is a pointer to the first kevent in the
 ready queue, which was not put into ring buffer. It is an optimisation, 
 which allows to not search for that position each time new event should 
 be placed into the buffer, when it starts to have an empty slot.
 
 This overflow list (you may call it differently, but still it IS a list), 
 is not complete. I feel you add it just to make me happy, but I am not (yet 
 :) )

There is no overflow list.
There is ready queue, part of which (first several entries) is copied
into the ring buffer, overflow_kevent is a pointer to the first kevent which
was not copied.

 For example, you make no test at kevent_finish_user_complete() time.
 
 Obviously, you can have a dangling pointer, and crash your box in certain 
 conditions.

You are right, I did not put overflow_kevent check into all places which
can remove kevent.

Here is a patch I am about to commit into the kevent tree:

diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
index 711a8a8..ecee668 100644
--- a/kernel/kevent/kevent_user.c
+++ b/kernel/kevent/kevent_user.c
@@ -235,6 +235,36 @@ static void kevent_free_rcu(struct rcu_h
 }
 
 /*
+ * Must be called under u-ready_lock.
+ * This function removes kevent from ready queue and 
+ * tries to add new kevent into ring buffer.
+ */
+static void kevent_remove_ready(struct kevent *k)
+{
+   struct kevent_user *u = k-user;
+
+   list_del(k-ready_entry);
+   k-flags = ~KEVENT_READY;
+   u-ready_num--;
+   if (++u-pring[0]-uidx == KEVENT_MAX_EVENTS)
+   u-pring[0]-uidx = 0;
+   
+   if (u-overflow_kevent) {
+   int err;
+
+   err = kevent_user_ring_add_event(u-overflow_kevent);
+   if (!err || u-overflow_kevent == k) {
+   if (u-overflow_kevent-ready_entry.next == 
u-ready_list)
+   u-overflow_kevent = NULL;
+   else
+   u-overflow_kevent = 
+   
list_entry(u-overflow_kevent-ready_entry.next, 
+   struct kevent, 
ready_entry);
+   }
+   }
+}
+
+/*
  * Complete kevent removing - it dequeues kevent from storage list
  * if it is requested, removes kevent from ready list, drops userspace
  * control block reference counter and schedules kevent freeing through RCU.
@@ -248,11 +278,8 @@ static void kevent_finish_user_complete(
kevent_dequeue(k);
 
spin_lock_irqsave(u-ready_lock, flags);
-   if (k-flags  KEVENT_READY) {
-   list_del(k-ready_entry);
-   k-flags = ~KEVENT_READY;
-   u-ready_num--;
-   }
+   if (k-flags  KEVENT_READY)
+   kevent_remove_ready(k);
spin_unlock_irqrestore(u-ready_lock, flags);
 
kevent_user_put(u);
@@ -303,25 +330,7 @@ static struct kevent *kqueue_dequeue_rea
spin_lock_irqsave(u-ready_lock, flags);
if (u-ready_num  !list_empty(u-ready_list)) {
k = list_entry(u-ready_list.next, struct kevent, ready_entry);
-   list_del(k-ready_entry);
-   k-flags = ~KEVENT_READY;
-   u-ready_num--;
-   if (++u-pring[0]-uidx == KEVENT_MAX_EVENTS)
-   u-pring[0]-uidx = 0;
-   
-   if (u-overflow_kevent) {
-   int err;
-
-   err = kevent_user_ring_add_event(u-overflow_kevent);
-   if (!err) {
-   if (u-overflow_kevent-ready_entry.next == 
u-ready_list)
-   u-overflow_kevent = NULL;
-   else
-   u-overflow_kevent = 
-   
list_entry(u-overflow_kevent-ready_entry.next, 
-   struct kevent, 
ready_entry);
-   }
-   }
+   kevent_remove_ready(k);
}
spin_unlock_irqrestore(u-ready_lock, flags);
 

It tries to put next kevent into the ring and thus update
overflow_kevent if new kevent has been put into the 
buffer or kevent being removed is overflow kevent.
Patch depends on committed changes of returned error numbers and unused
variables cleanup, it will be included into next patchset if there are
no problems with it.

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL 

Re: [take21 1/4] kevent: Core files.

2006-10-28 Thread Eric Dumazet

Evgeniy Polyakov a e'crit :

On Sat, Oct 28, 2006 at 02:36:31PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:

Evgeniy Polyakov a e'crit :
On Sat, Oct 28, 2006 at 12:28:12PM +0200, Eric Dumazet 
([EMAIL PROTECTED]) wrote:
I really dont understand how you manage to queue multiple kevents in the 
'overflow list'. You just queue one kevent at most. What am I missing ?

There is no overflow list - it is a pointer to the first kevent in the
ready queue, which was not put into ring buffer. It is an optimisation, 
which allows to not search for that position each time new event should 
be placed into the buffer, when it starts to have an empty slot.
This overflow list (you may call it differently, but still it IS a list), 
is not complete. I feel you add it just to make me happy, but I am not (yet 
:) )


There is no overflow list.
There is ready queue, part of which (first several entries) is copied
into the ring buffer, overflow_kevent is a pointer to the first kevent which
was not copied.


For example, you make no test at kevent_finish_user_complete() time.

Obviously, you can have a dangling pointer, and crash your box in certain 
conditions.


You are right, I did not put overflow_kevent check into all places which
can remove kevent.

Here is a patch I am about to commit into the kevent tree:

diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
index 711a8a8..ecee668 100644
--- a/kernel/kevent/kevent_user.c
+++ b/kernel/kevent/kevent_user.c
@@ -235,6 +235,36 @@ static void kevent_free_rcu(struct rcu_h
 }
 
 /*

+ * Must be called under u-ready_lock.
+ * This function removes kevent from ready queue and 
+ * tries to add new kevent into ring buffer.

+ */
+static void kevent_remove_ready(struct kevent *k)
+{
+   struct kevent_user *u = k-user;
+
+   list_del(k-ready_entry);


Arg... no

You cannot call list_del() , then check overflow_kevent.

I you call list_del on what happens to be the kevent pointed by 
overflow_kevent, you loose...



+   k-flags = ~KEVENT_READY;
+   u-ready_num--;
+   if (++u-pring[0]-uidx == KEVENT_MAX_EVENTS)
+   u-pring[0]-uidx = 0;
+   
+   if (u-overflow_kevent) {
+   int err;
+
+   err = kevent_user_ring_add_event(u-overflow_kevent);
+   if (!err || u-overflow_kevent == k) {
+   if (u-overflow_kevent-ready_entry.next == 
u-ready_list)
+   u-overflow_kevent = NULL;
+   else
+u-overflow_kevent = 
+	list_entry(u-overflow_kevent-ready_entry.next, 
+			struct kevent, ready_entry);

+   }
+   }
+}
+
+/*
  * Complete kevent removing - it dequeues kevent from storage list
  * if it is requested, removes kevent from ready list, drops userspace
  * control block reference counter and schedules kevent freeing through RCU.
@@ -248,11 +278,8 @@ static void kevent_finish_user_complete(
kevent_dequeue(k);
 
 	spin_lock_irqsave(u-ready_lock, flags);

-   if (k-flags  KEVENT_READY) {
-   list_del(k-ready_entry);
-   k-flags = ~KEVENT_READY;
-   u-ready_num--;
-   }
+   if (k-flags  KEVENT_READY)
+   kevent_remove_ready(k);
spin_unlock_irqrestore(u-ready_lock, flags);
 
 	kevent_user_put(u);

@@ -303,25 +330,7 @@ static struct kevent *kqueue_dequeue_rea
spin_lock_irqsave(u-ready_lock, flags);
if (u-ready_num  !list_empty(u-ready_list)) {
k = list_entry(u-ready_list.next, struct kevent, ready_entry);
-   list_del(k-ready_entry);
-   k-flags = ~KEVENT_READY;
-   u-ready_num--;
-   if (++u-pring[0]-uidx == KEVENT_MAX_EVENTS)
-   u-pring[0]-uidx = 0;
-   
-   if (u-overflow_kevent) {
-   int err;
-
-   err = kevent_user_ring_add_event(u-overflow_kevent);
-   if (!err) {
-   if (u-overflow_kevent-ready_entry.next == 
u-ready_list)
-   u-overflow_kevent = NULL;
-   else
-	u-overflow_kevent = 
-		list_entry(u-overflow_kevent-ready_entry.next, 
-struct kevent, ready_entry);

-   }
-   }
+   kevent_remove_ready(k);
}
spin_unlock_irqrestore(u-ready_lock, flags);
 



-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take21 1/4] kevent: Core files.

2006-10-28 Thread Evgeniy Polyakov
On Sat, Oct 28, 2006 at 03:23:40PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
 diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
 index 711a8a8..ecee668 100644
 --- a/kernel/kevent/kevent_user.c
 +++ b/kernel/kevent/kevent_user.c
 @@ -235,6 +235,36 @@ static void kevent_free_rcu(struct rcu_h
  }
  
  /*
 + * Must be called under u-ready_lock.
 + * This function removes kevent from ready queue and 
 + * tries to add new kevent into ring buffer.
 + */
 +static void kevent_remove_ready(struct kevent *k)
 +{
 +struct kevent_user *u = k-user;
 +
 +list_del(k-ready_entry);
 
 Arg... no
 
 You cannot call list_del() , then check overflow_kevent.
 
 I you call list_del on what happens to be the kevent pointed by 
 overflow_kevent, you loose...

This function is always called from appropriate context, where it is
guaranteed that it is safe to call list_del:
1. when kevent is removed. It is called after check, that given kevent 
is in the ready queue.
2. when dequeued from ready queue, which means that it can be removed
from that queue.

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take21 1/4] kevent: Core files.

2006-10-28 Thread Eric Dumazet

Evgeniy Polyakov a e'crit :

On Sat, Oct 28, 2006 at 03:23:40PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:

diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
index 711a8a8..ecee668 100644
--- a/kernel/kevent/kevent_user.c
+++ b/kernel/kevent/kevent_user.c
@@ -235,6 +235,36 @@ static void kevent_free_rcu(struct rcu_h
}

/*
+ * Must be called under u-ready_lock.
+ * This function removes kevent from ready queue and 
+ * tries to add new kevent into ring buffer.

+ */
+static void kevent_remove_ready(struct kevent *k)
+{
+   struct kevent_user *u = k-user;
+
+   list_del(k-ready_entry);

Arg... no

You cannot call list_del() , then check overflow_kevent.

I you call list_del on what happens to be the kevent pointed by 
overflow_kevent, you loose...


This function is always called from appropriate context, where it is
guaranteed that it is safe to call list_del:
1. when kevent is removed. It is called after check, that given kevent 
is in the ready queue.

2. when dequeued from ready queue, which means that it can be removed
from that queue.



Could you please check the list_del() function ?

file include/linux/list.h

static inline void list_del(struct list_head *entry)
{
  __list_del(entry-prev, entry-next);
  entry-next = LIST_POISON1;
  entry-prev = LIST_POISON2;
}

So, after calling list_del(k-read_entry);
next and prev are basically destroyed.

So when you write later :

+if (!err || u-overflow_kevent == k) {
+if (u-overflow_kevent-ready_entry.next == u-ready_list)
+u-overflow_kevent = NULL;
+else
+u-overflow_kevent = + 
list_entry(u-overflow_kevent-ready_entry.next, + 
struct kevent, ready_entry);

+}


then you have a problem, since

list_entry(k-ready_entry.next, struct kevent, ready_entry);

will give you garbage.

Eric
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take21 1/4] kevent: Core files.

2006-10-28 Thread Evgeniy Polyakov
On Sat, Oct 28, 2006 at 03:34:52PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
 +  list_del(k-ready_entry);
 Arg... no
 
 You cannot call list_del() , then check overflow_kevent.
 
 I you call list_del on what happens to be the kevent pointed by 
 overflow_kevent, you loose...
 
 This function is always called from appropriate context, where it is
 guaranteed that it is safe to call list_del:
 1. when kevent is removed. It is called after check, that given kevent 
 is in the ready queue.
 2. when dequeued from ready queue, which means that it can be removed
 from that queue.
 
 
 Could you please check the list_del() function ?
 
 file include/linux/list.h
 
 static inline void list_del(struct list_head *entry)
 {
   __list_del(entry-prev, entry-next);
   entry-next = LIST_POISON1;
   entry-prev = LIST_POISON2;
 }
 
 So, after calling list_del(k-read_entry);
 next and prev are basically destroyed.
 
 So when you write later :
 
 +if (!err || u-overflow_kevent == k) {
 +if (u-overflow_kevent-ready_entry.next == u-ready_list)
 +u-overflow_kevent = NULL;
 +else
 +u-overflow_kevent = + 
 list_entry(u-overflow_kevent-ready_entry.next, + 
 struct kevent, ready_entry);
 +}
 
 
 then you have a problem, since
 
 list_entry(k-ready_entry.next, struct kevent, ready_entry);
 
 will give you garbage.

Ok, I understand you now.
To remove this issue we can delete entry from the list after all checks
with overflow_kevent pointer are completed, i.e. have something like
this:

diff --git a/kernel/kevent/kevent_user.c b/kernel/kevent/kevent_user.c
index 711a8a8..f3fec9b 100644
--- a/kernel/kevent/kevent_user.c
+++ b/kernel/kevent/kevent_user.c
@@ -235,6 +235,36 @@ static void kevent_free_rcu(struct rcu_h
 }
 
 /*
+ * Must be called under u-ready_lock.
+ * This function removes kevent from ready queue and 
+ * tries to add new kevent into ring buffer.
+ */
+static void kevent_remove_ready(struct kevent *k)
+{
+   struct kevent_user *u = k-user;
+
+   if (++u-pring[0]-uidx == KEVENT_MAX_EVENTS)
+   u-pring[0]-uidx = 0;
+
+   if (u-overflow_kevent) {
+   int err;
+
+   err = kevent_user_ring_add_event(u-overflow_kevent);
+   if (!err || u-overflow_kevent == k) {
+   if (u-overflow_kevent-ready_entry.next == 
u-ready_list)
+   u-overflow_kevent = NULL;
+   else
+   u-overflow_kevent = 
+   
list_entry(u-overflow_kevent-ready_entry.next, 
+   struct kevent, 
ready_entry);
+   }
+   }
+   list_del(k-ready_entry);
+   k-flags = ~KEVENT_READY;
+   u-ready_num--;
+}
+
+/*
  * Complete kevent removing - it dequeues kevent from storage list
  * if it is requested, removes kevent from ready list, drops userspace
  * control block reference counter and schedules kevent freeing through RCU.
@@ -248,11 +278,8 @@ static void kevent_finish_user_complete(
kevent_dequeue(k);
 
spin_lock_irqsave(u-ready_lock, flags);
-   if (k-flags  KEVENT_READY) {
-   list_del(k-ready_entry);
-   k-flags = ~KEVENT_READY;
-   u-ready_num--;
-   }
+   if (k-flags  KEVENT_READY)
+   kevent_remove_ready(k);
spin_unlock_irqrestore(u-ready_lock, flags);
 
kevent_user_put(u);
@@ -303,25 +330,7 @@ static struct kevent *kqueue_dequeue_rea
spin_lock_irqsave(u-ready_lock, flags);
if (u-ready_num  !list_empty(u-ready_list)) {
k = list_entry(u-ready_list.next, struct kevent, ready_entry);
-   list_del(k-ready_entry);
-   k-flags = ~KEVENT_READY;
-   u-ready_num--;
-   if (++u-pring[0]-uidx == KEVENT_MAX_EVENTS)
-   u-pring[0]-uidx = 0;
-   
-   if (u-overflow_kevent) {
-   int err;
-
-   err = kevent_user_ring_add_event(u-overflow_kevent);
-   if (!err) {
-   if (u-overflow_kevent-ready_entry.next == 
u-ready_list)
-   u-overflow_kevent = NULL;
-   else
-   u-overflow_kevent = 
-   
list_entry(u-overflow_kevent-ready_entry.next, 
-   struct kevent, 
ready_entry);
-   }
-   }
+   kevent_remove_ready(k);
}
spin_unlock_irqrestore(u-ready_lock, flags);
 

Thanks.

 Eric

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  

[take21 1/4] kevent: Core files.

2006-10-27 Thread Evgeniy Polyakov

Core files.

This patch includes core kevent files:
 * userspace controlling
 * kernelspace interfaces
 * initialization
 * notification state machines

Some bits of documentation can be found on project's homepage (and links from 
there):
http://tservice.net.ru/~s0mbre/old/?section=projectsitem=kevent

Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED]

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 7e639f7..a9560eb 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -318,3 +318,6 @@ ENTRY(sys_call_table)
.long sys_vmsplice
.long sys_move_pages
.long sys_getcpu
+   .long sys_kevent_get_events
+   .long sys_kevent_ctl/* 320 */
+   .long sys_kevent_wait
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index b4aa875..cf18955 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -714,8 +714,11 @@ #endif
.quad compat_sys_get_robust_list
.quad sys_splice
.quad sys_sync_file_range
-   .quad sys_tee
+   .quad sys_tee   /* 315 */
.quad compat_sys_vmsplice
.quad compat_sys_move_pages
.quad sys_getcpu
+   .quad sys_kevent_get_events
+   .quad sys_kevent_ctl/* 320 */
+   .quad sys_kevent_wait
 ia32_syscall_end:  
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index bd99870..f009677 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -324,10 +324,13 @@ #define __NR_tee  315
 #define __NR_vmsplice  316
 #define __NR_move_pages317
 #define __NR_getcpu318
+#define __NR_kevent_get_events 319
+#define __NR_kevent_ctl320
+#define __NR_kevent_wait   321
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 319
+#define NR_syscalls 322
 #include linux/err.h
 
 /*
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 6137146..c53d156 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,16 @@ #define __NR_vmsplice 278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events 280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
+#define __NR_kevent_wait   282
+__SYSCALL(__NR_kevent_wait, sys_kevent_wait)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_wait
 #include linux/err.h
 
 #ifndef __NO_STUBS
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 000..125414c
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,205 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED]
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+#include linux/types.h
+#include linux/list.h
+#include linux/rbtree.h
+#include linux/spinlock.h
+#include linux/mutex.h
+#include linux/wait.h
+#include linux/net.h
+#include linux/rcupdate.h
+#include linux/kevent_storage.h
+#include linux/ukevent.h
+
+#define KEVENT_MIN_BUFFS_ALLOC 3
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+
+struct kevent_callbacks {
+   kevent_callback_t   callback, enqueue, dequeue;
+};
+
+#define KEVENT_READY   0x1
+#define KEVENT_STORAGE 0x2
+#define KEVENT_USER0x4
+
+struct kevent
+{
+   /* Used for kevent freeing.*/
+   struct rcu_head rcu_head;
+   struct ukevent  event;
+   /* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+   spinlock_t  ulock;
+
+   /* Entry of user's tree. */
+   struct rb_node  kevent_node;
+   /* Entry of origin's queue. */
+   struct list_headstorage_entry;
+   /* Entry of user's ready. */
+   struct 

Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Chase Venters
On Tuesday 17 October 2006 00:09, Johann Borck wrote:
 Regarding mukevent I'm thinking of a event-type specific struct, that is
 filled by the originating code, and placed into a per-event-type ring
 buffer (which  requires modification of kevent_wait).

I'd personally worry about an implementation that used a per-event-type ring 
buffer, because you're still left having to hack around starvation issues in 
user-space. It is of course possible under the current model for anyone who 
wants per-event-type ring buffers to have them - just make separate kevent 
sets.

I haven't thought this through all the way yet, but why not have variable 
length event structures and have the kernel fill in a next pointer in each 
one? This could even be used to keep backwards binary compatibility while 
adding additional fields to the structures over time, though no space would 
be wasted on modern programs. You still end up with a question of what to do 
in case of overflow, but I'm thinking the thing to do in that case might be 
to start pushing overflow events onto a linked list which can be written back 
into the ring buffer when space becomes available. The appropriate behavior 
would be to throw new events on the linked list if the linked list had any 
events, so that things are delivered in order, but write to the mapped buffer 
directly otherwise.

Deciding when to do that is tricky, and I haven't thought through the 
implications fully when I say this, but what about activating a bottom half 
when more space becomes available, and let that drain overflowed events back 
into the mapped buffer? Or perhaps the time to do it would be in the next 
blocking wait, when the queue emptied? 

I think it is very important to avoid any limits that can not be adjusted on 
the fly at run-time by CAP_SYS_ADMIN or what have you. Doing it this way may 
have other problems I've ignored but at least the big one - compile-time 
capacity limits in the year 2006 - would be largely avoided :P

Nothing real solid yet, just some electrical storms in the grey matter...

Thanks,
Chase
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Evgeniy Polyakov
On Tue, Oct 17, 2006 at 07:10:14AM +0200, Johann Borck ([EMAIL PROTECTED]) 
wrote:
 Ulrich Drepper wrote:
  Evgeniy Polyakov wrote:
  Existing design does not allow overflow.
 
  And I've pointed out a number of times that this is not practical at
  best.  There are event sources which can create events which cannot be
  coalesced into one single event as it would be required with your design.
 
  Signals are one example, specifically realtime signals.  If we do not
  want the design to be limited from the start this approach has to be
  thought over.
 
 
  So zap mmap() support completely, since it is not usable at all. We
  wont discuss on it.
 
  Initial implementation did not have it.
  But I was requested to do it, and it is ready now.
  No one likes it, but no one provides an alternative implementation.
  We are stuck.
 
  We need the mapped ring buffer.  The current design (before it was
  removed) was broken but this does not mean it shouldn't be
  implemented.  We just need more time to figure out how to implement it
  correctly.
 
 Considering the if at all and if then how of ring buffer implemetation
 I'd like to throw in some ideas I had when reading the discussion and
 respective code. If I understood Ulrich Drepper right, his notion of a
 generic event handling interface is, that it has to be flexible enough
 to transport additional info from origin to userspace, and to support
 queuing of events from the same origin, so that additional
 per-event-occurrence data doesn't get lost, which would happen when
 coalescing multiple events into one until delivery. From what I read he
 says ring buffer is broken because of  insufficient space for additional
 data (mukevent) and the limited number of events that can be put into
 ring buffer. Another argument is missing notification of userspace about
 dropped events in case ring buffer limit is reached. (is that right?)

I can add such notification, but its existense _is_ the broken design.
After such condition happend, all new events will dissapear (although
they are still accessible through usual queue) from mapped buffer.

While writing this I have come to the idea on how to imrove the case of
the size of mapped buffer - we can make it with limited size, and when
it is full, some bit will be set in the shared area and obviously no new
events can be added there, but when user commits some events from that
buffer (i.e. says to kernel that appropriate kevents can be freed or
requeued according to theirs flags), new ready events from ready queue
can be copied into mapped buffer.

It still does not solve (and I do insist that it is broken behaviour)
the case when kernel is going to generate infinite number of events for
one requested by userspace (as in case of generating new 'data_has_arrived'
event when new byte has been received).

Userspace events are only marked as ready, they are not generated - it
is high-performance _feature_ of the new design, not some kind of a bug.

 I see no reason why kevent couldn't be modified to fit (all) these
 needs. While modifying the server-example and writing a client using
 kevent I came across the coalescing problem, there were more incoming
 connections than accept events, and I had to work around that. In this

Btw, accept() issue is exactly the same as with usual poll() - repeated
insertion of the same kevent will fire immediately, which requires event
to be one-shot. One of the initial implementation contained number of
ready for accept sockets as one of the returned parameters though.

 case the pure number of coalesced events would suffice, while it
 wouldn't for the example of RT-signals that Ulrich Drepper gave. So if
 coalescing can be done at all or if it is impossible depends on the type
 of event. The same goes for additional data delivered with the events.
 There might be no panacea for all possible scenarios with one fixed
 design. Either performance suffers for 'lightweight' events  which don't
 need additional data and/or coalescing is not problematic and/or ring
 buffer, or kevent is not usable for other types of events. Why not treat
 different things differently, and let the (kernel-)user decide.
 I don't know if I got all this right, but if, then ring buffer is needed
 especially for cases where coalescing is not possible and additional
 data has to be delivered for each triggered notification (so the pure
 number of events is not enough; other reasons? performance? ). To me it
 doesn't make sense to have kevent fill memory and use processor-time if
 buffer is not used at all, which is the case when using kevent_getevents.
 So here are my Ideas:
 Make usage of ring buffer optional, if not required for specific
 event-type it might be chosen by userspace-code.
 Make limit of events in ring buffer optional and controllable from
 userspace.

It is of course possible, main problem is that existing design of the
mapped buffer is not sufficient, and there are no other propositions
except that 'it 

Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Evgeniy Polyakov
On Tue, Oct 17, 2006 at 12:59:47AM -0500, Chase Venters ([EMAIL PROTECTED]) 
wrote:
 On Tuesday 17 October 2006 00:09, Johann Borck wrote:
  Regarding mukevent I'm thinking of a event-type specific struct, that is
  filled by the originating code, and placed into a per-event-type ring
  buffer (which  requires modification of kevent_wait).
 
 I'd personally worry about an implementation that used a per-event-type ring 
 buffer, because you're still left having to hack around starvation issues in 
 user-space. It is of course possible under the current model for anyone who 
 wants per-event-type ring buffers to have them - just make separate kevent 
 sets.
 
 I haven't thought this through all the way yet, but why not have variable 
 length event structures and have the kernel fill in a next pointer in each 
 one? This could even be used to keep backwards binary compatibility while 

Why do we want variable size structures in mmap ring buffer?

 adding additional fields to the structures over time, though no space would 
 be wasted on modern programs. You still end up with a question of what to do 
 in case of overflow, but I'm thinking the thing to do in that case might be 
 to start pushing overflow events onto a linked list which can be written back 
 into the ring buffer when space becomes available. The appropriate behavior 
 would be to throw new events on the linked list if the linked list had any 
 events, so that things are delivered in order, but write to the mapped buffer 
 directly otherwise.

I think in a similar way.
Kevent actually do not require such list, since it has already queue of
the ready events.

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Chase Venters
On Tuesday 17 October 2006 05:42, Evgeniy Polyakov wrote:
 On Tue, Oct 17, 2006 at 12:59:47AM -0500, Chase Venters 
([EMAIL PROTECTED]) wrote:
  On Tuesday 17 October 2006 00:09, Johann Borck wrote:
   Regarding mukevent I'm thinking of a event-type specific struct, that
   is filled by the originating code, and placed into a per-event-type
   ring buffer (which  requires modification of kevent_wait).
 
  I'd personally worry about an implementation that used a per-event-type
  ring buffer, because you're still left having to hack around starvation
  issues in user-space. It is of course possible under the current model
  for anyone who wants per-event-type ring buffers to have them - just make
  separate kevent sets.
 
  I haven't thought this through all the way yet, but why not have variable
  length event structures and have the kernel fill in a next pointer in
  each one? This could even be used to keep backwards binary compatibility
  while

 Why do we want variable size structures in mmap ring buffer?

Flexibility primarily. So when we all decide to add a new event type six 
months from now, or add more information to an existing one, we don't run the 
risk that the existing mukevent isn't big enough.

  adding additional fields to the structures over time, though no space
  would be wasted on modern programs. You still end up with a question of
  what to do in case of overflow, but I'm thinking the thing to do in that
  case might be to start pushing overflow events onto a linked list which
  can be written back into the ring buffer when space becomes available.
  The appropriate behavior would be to throw new events on the linked list
  if the linked list had any events, so that things are delivered in order,
  but write to the mapped buffer directly otherwise.

 I think in a similar way.
 Kevent actually do not require such list, since it has already queue of
 the ready events.

The current event types coalesce if there are multiple events, correct? It 
sounds like there may be other event types where coalescing multiple events 
is not the correct approach.

Thanks,
Chase
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Eric Dumazet
On Tuesday 17 October 2006 12:39, Evgeniy Polyakov wrote:

 I can add such notification, but its existense _is_ the broken design.
 After such condition happend, all new events will dissapear (although
 they are still accessible through usual queue) from mapped buffer.

 While writing this I have come to the idea on how to imrove the case of
 the size of mapped buffer - we can make it with limited size, and when
 it is full, some bit will be set in the shared area and obviously no new
 events can be added there, but when user commits some events from that
 buffer (i.e. says to kernel that appropriate kevents can be freed or
 requeued according to theirs flags), new ready events from ready queue
 can be copied into mapped buffer.

 It still does not solve (and I do insist that it is broken behaviour)
 the case when kernel is going to generate infinite number of events for
 one requested by userspace (as in case of generating new 'data_has_arrived'
 event when new byte has been received).

Behavior is not broken. It's quite usefull and works 99.% of time.

I was trying to suggest you but you missed my point.

You dont want to use a bit, but a full sequence counter, 32bits.

A program may handle XXX.XXX handles, but use a 4096 entries ring 
buffer 'only'.

The user program keeps a local copy of a special word 
named 'ring_buffer_full_counter'

Each time the kernel cannot queue an event in the ring buffer, it increase 
the ring_buffer_was_full_counter (exported to user app in the mmap view)

When the user application notice the kernel 
changed ring_buffer_was_full_counter it does a full scan of all file 
handles (preferably using poll() to get all relevant info in one syscall) :

do {
   if (read_event_from_mmap()) {handle_event(fd); continue;}
   /* ring buffer is empty, check if we missed some events */
   if (unlikely(mmap-ring_buffer_full_counter !=  
my_ring_buffer_full_counter)) {
my_ring_buffer_full_counter = mmap-ring_buffer_full_counter;
/* slow PATH */
/* can use a big poll() for example, or just a loop without poll() */
for_all_file_desc_do() {
check if some event/data is waiting on THIS fd
}
/* 
}
else  syscall_wait_for_one_available_kevent(queue)
}

This is how a program can recover. If ring buffer has a reasonable size, this 
kind of event should not happen very frequently. If it does (because events 
continue to fill ring_buffer during recovery and might hit FULL again), maybe 
a smart program is able to resize the ring_buffer, and start using it after 
yet another recovery pass.
If not, we dont care, because a big poll() give us many ready file-descriptors 
in one syscall, and maybe this is much better than kevent/epoll when XX.XXX 
events are ready.

Eric
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Evgeniy Polyakov
On Tue, Oct 17, 2006 at 08:12:04AM -0500, Chase Venters ([EMAIL PROTECTED]) 
wrote:
Regarding mukevent I'm thinking of a event-type specific struct, that
is filled by the originating code, and placed into a per-event-type
ring buffer (which  requires modification of kevent_wait).
  
   I'd personally worry about an implementation that used a per-event-type
   ring buffer, because you're still left having to hack around starvation
   issues in user-space. It is of course possible under the current model
   for anyone who wants per-event-type ring buffers to have them - just make
   separate kevent sets.
  
   I haven't thought this through all the way yet, but why not have variable
   length event structures and have the kernel fill in a next pointer in
   each one? This could even be used to keep backwards binary compatibility
   while
 
  Why do we want variable size structures in mmap ring buffer?
 
 Flexibility primarily. So when we all decide to add a new event type six 
 months from now, or add more information to an existing one, we don't run the 
 risk that the existing mukevent isn't big enough.

Do we need such flexibility, when we have unique id attached to each
event? User can store any information in own buffers, which are indexed
by that id.

   adding additional fields to the structures over time, though no space
   would be wasted on modern programs. You still end up with a question of
   what to do in case of overflow, but I'm thinking the thing to do in that
   case might be to start pushing overflow events onto a linked list which
   can be written back into the ring buffer when space becomes available.
   The appropriate behavior would be to throw new events on the linked list
   if the linked list had any events, so that things are delivered in order,
   but write to the mapped buffer directly otherwise.
 
  I think in a similar way.
  Kevent actually do not require such list, since it has already queue of
  the ready events.
 
 The current event types coalesce if there are multiple events, correct? It 
 sounds like there may be other event types where coalescing multiple events 
 is not the correct approach.

There is no events coalescing, I think that it is even incorrect to say, that
something is being coalesced in kevents.

There is 'new' (which is well forgotten old) approach - user _asks_ kernel 
about some information, and kernel says when it is ready. Kernel does not 
say: part of the info is ready, part of the info is ready and so on, it 
just marks user's request as ready - that means that it is possible that
there were zillions of events, each one could mark the _same_ userspace
request as ready, and exactly what user requested is transferred back. 
Thus it is very fast and is correct way to deal with problem of pipes of 
different diameters.

Kernel does not generate events - only user creates requests, which are
marked as ready.

I made that decision to remove _any_ kind of possible overflows from
kernel side - if user was scheduled away, or has unsufficient space or
bad mood, to not introduce any kind of ugly priorities (higher one 
could fill the whole pipe while lower could not even send a single event). 
Instead kernel does just what it was requested to do, and it can provide 
some hints on how that process happend (for example how many sockets are 
ready for accept(), or how many bytes are in the receiving queue).

And that approach does solve the problem of the cases when it looks like
it is logical to _generate_ event - for example in inotify case, where
new event is _generated_ each time requested case happens. For example
the case when new files are created in the directory - it is possible
that there will be queue overflow (btw, watch for each file in the kernel 
tree takes about 2gb of kernel mem), if many files were created, so
userspace must rescan the whole directory to check missed files, so why
is it needed at all to generate info about first two or ten files,
instead userspace asks kernel to notify it when directory has changed or
some new files were created, and kernelspace will answer when directory
has been changed or new files were created (with some hint with number
of them).

Likely request for generation of events in kernel is a workaround for 
some other problems, which in long term will hit us with new troubles -
queue length and overflows.

 Thanks,
 Chase

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Evgeniy Polyakov
On Tue, Oct 17, 2006 at 03:19:36PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
 On Tuesday 17 October 2006 12:39, Evgeniy Polyakov wrote:
 
  I can add such notification, but its existense _is_ the broken design.
  After such condition happend, all new events will dissapear (although
  they are still accessible through usual queue) from mapped buffer.
 
  While writing this I have come to the idea on how to imrove the case of
  the size of mapped buffer - we can make it with limited size, and when
  it is full, some bit will be set in the shared area and obviously no new
  events can be added there, but when user commits some events from that
  buffer (i.e. says to kernel that appropriate kevents can be freed or
  requeued according to theirs flags), new ready events from ready queue
  can be copied into mapped buffer.
 
  It still does not solve (and I do insist that it is broken behaviour)
  the case when kernel is going to generate infinite number of events for
  one requested by userspace (as in case of generating new 'data_has_arrived'
  event when new byte has been received).
 
 Behavior is not broken. It's quite usefull and works 99.% of time.

 I was trying to suggest you but you missed my point.
 
 You dont want to use a bit, but a full sequence counter, 32bits.
 
 A program may handle XXX.XXX handles, but use a 4096 entries ring 
 buffer 'only'.
 
 The user program keeps a local copy of a special word 
 named 'ring_buffer_full_counter'
 
 Each time the kernel cannot queue an event in the ring buffer, it increase 
 the ring_buffer_was_full_counter (exported to user app in the mmap view)
 
 When the user application notice the kernel 
 changed ring_buffer_was_full_counter it does a full scan of all file 
 handles (preferably using poll() to get all relevant info in one syscall) :

I.e. to scan the rest of the xxx.xxx events?

 do {
if (read_event_from_mmap()) {handle_event(fd); continue;}
/* ring buffer is empty, check if we missed some events */
if (unlikely(mmap-ring_buffer_full_counter !=  
 my_ring_buffer_full_counter)) {
   my_ring_buffer_full_counter = mmap-ring_buffer_full_counter;
   /* slow PATH */
   /* can use a big poll() for example, or just a loop without poll() */
   for_all_file_desc_do() {
   check if some event/data is waiting on THIS fd
   }
   /* 
   }
 else  syscall_wait_for_one_available_kevent(queue)
 }
 
 This is how a program can recover. If ring buffer has a reasonable size, this 
 kind of event should not happen very frequently. If it does (because events 
 continue to fill ring_buffer during recovery and might hit FULL again), maybe 
 a smart program is able to resize the ring_buffer, and start using it after 
 yet another recovery pass.
 If not, we dont care, because a big poll() give us many ready 
 file-descriptors 
 in one syscall, and maybe this is much better than kevent/epoll when XX.XXX 
 events are ready.

What about the case, which I described in other e-mail, when in case of
the full ring buffer, no new events are written there, and when
userspace commits (i.e. marks as ready to be freed or requeued by kernel) 
some events, new ones will be copied from ready queue into the buffer?

 Eric

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Eric Dumazet
On Tuesday 17 October 2006 15:42, Evgeniy Polyakov wrote:
 On Tue, Oct 17, 2006 at 03:19:36PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
  On Tuesday 17 October 2006 12:39, Evgeniy Polyakov wrote:
   I can add such notification, but its existense _is_ the broken design.
   After such condition happend, all new events will dissapear (although
   they are still accessible through usual queue) from mapped buffer.
  
   While writing this I have come to the idea on how to imrove the case of
   the size of mapped buffer - we can make it with limited size, and when
   it is full, some bit will be set in the shared area and obviously no
   new events can be added there, but when user commits some events from
   that buffer (i.e. says to kernel that appropriate kevents can be freed
   or requeued according to theirs flags), new ready events from ready
   queue can be copied into mapped buffer.
  
   It still does not solve (and I do insist that it is broken behaviour)
   the case when kernel is going to generate infinite number of events for
   one requested by userspace (as in case of generating new
   'data_has_arrived' event when new byte has been received).
 
  Behavior is not broken. It's quite usefull and works 99.% of time.
 
  I was trying to suggest you but you missed my point.
 
  You dont want to use a bit, but a full sequence counter, 32bits.
 
  A program may handle XXX.XXX handles, but use a 4096 entries ring
  buffer 'only'.
 
  The user program keeps a local copy of a special word
  named 'ring_buffer_full_counter'
 
  Each time the kernel cannot queue an event in the ring buffer, it
  increase the ring_buffer_was_full_counter (exported to user app in the
  mmap view)
 
  When the user application notice the kernel
  changed ring_buffer_was_full_counter it does a full scan of all file
  handles (preferably using poll() to get all relevant info in one syscall)
  :

 I.e. to scan the rest of the xxx.xxx events?

  do {
 if (read_event_from_mmap()) {handle_event(fd); continue;}
 /* ring buffer is empty, check if we missed some events */
 if (unlikely(mmap-ring_buffer_full_counter !=
  my_ring_buffer_full_counter)) {
  my_ring_buffer_full_counter = mmap-ring_buffer_full_counter;
  /* slow PATH */
  /* can use a big poll() for example, or just a loop without poll() */
  for_all_file_desc_do() {
  check if some event/data is waiting on THIS fd
  }
  /*
  }
  else  syscall_wait_for_one_available_kevent(queue)
  }
 
  This is how a program can recover. If ring buffer has a reasonable size,
  this kind of event should not happen very frequently. If it does (because
  events continue to fill ring_buffer during recovery and might hit FULL
  again), maybe a smart program is able to resize the ring_buffer, and
  start using it after yet another recovery pass.
  If not, we dont care, because a big poll() give us many ready
  file-descriptors in one syscall, and maybe this is much better than
  kevent/epoll when XX.XXX events are ready.

 What about the case, which I described in other e-mail, when in case of
 the full ring buffer, no new events are written there, and when
 userspace commits (i.e. marks as ready to be freed or requeued by kernel)
 some events, new ones will be copied from ready queue into the buffer?

Then, user might receive 'false events', exactly like poll()/select()/epoll() 
can do sometime. IE a 'ready' indication while there is no current event 
available on a particular fd / event_source.

This should be safe, since those programs already ignore read() 
returns -EAGAIN and other similar things.

Programmer prefers to receive two 'event available' indications than ZERO (and 
be stuck for infinite time). Of course, hot path (normal cases) should return 
one 'event' only.

In order words, being ultra fast 99.99 % of the time, but being able to block 
forever once in a while is not an option.
 
Eric

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Evgeniy Polyakov
On Tue, Oct 17, 2006 at 03:52:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
  What about the case, which I described in other e-mail, when in case of
  the full ring buffer, no new events are written there, and when
  userspace commits (i.e. marks as ready to be freed or requeued by kernel)
  some events, new ones will be copied from ready queue into the buffer?
 
 Then, user might receive 'false events', exactly like poll()/select()/epoll() 
 can do sometime. IE a 'ready' indication while there is no current event 
 available on a particular fd / event_source.

Only if user simultaneously uses oth interfaces and remove even from the
queue when it's copy was in mapped buffer, but in that case it's user's
problem (and if we do want, we can store pointer/index of the ring
buffer entry, so when event is removed from the ready queue (using 
kevent_get_events()), appropriate entry in the ring buffer will be
updated to show that it is no longer valid.

 This should be safe, since those programs already ignore read() 
 returns -EAGAIN and other similar things.
 
 Programmer prefers to receive two 'event available' indications than ZERO 
 (and 
 be stuck for infinite time). Of course, hot path (normal cases) should return 
 one 'event' only.
 
 In order words, being ultra fast 99.99 % of the time, but being able to block 
 forever once in a while is not an option.

Have I missed something? It looks like the only problematic situation is
described above when user simultaneously uses both interfaces.

 Eric

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Eric Dumazet
On Tuesday 17 October 2006 16:07, Evgeniy Polyakov wrote:
 On Tue, Oct 17, 2006 at 03:52:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
   What about the case, which I described in other e-mail, when in case of
   the full ring buffer, no new events are written there, and when
   userspace commits (i.e. marks as ready to be freed or requeued by
   kernel) some events, new ones will be copied from ready queue into the
   buffer?
 
  Then, user might receive 'false events', exactly like
  poll()/select()/epoll() can do sometime. IE a 'ready' indication while
  there is no current event available on a particular fd / event_source.

 Only if user simultaneously uses oth interfaces and remove even from the
 queue when it's copy was in mapped buffer, but in that case it's user's
 problem (and if we do want, we can store pointer/index of the ring
 buffer entry, so when event is removed from the ready queue (using
 kevent_get_events()), appropriate entry in the ring buffer will be
 updated to show that it is no longer valid.

  This should be safe, since those programs already ignore read()
  returns -EAGAIN and other similar things.
 
  Programmer prefers to receive two 'event available' indications than ZERO
  (and be stuck for infinite time). Of course, hot path (normal cases)
  should return one 'event' only.
 
  In order words, being ultra fast 99.99 % of the time, but being able to
  block forever once in a while is not an option.

 Have I missed something? It looks like the only problematic situation is
 described above when user simultaneously uses both interfaces.

In my point of view, user of the 'mmaped ring buffer' should be prepared to 
use both interfaces. Or else you are forced to presize the ring buffer to 
insane limits.

That is :
- Most of the time, we expect consuming events via mmaped ring buffer and no 
syscalls.
- In case we notice a 'mmaped ring buffer overflow', syscalls to get/consume 
events that could not be stored in mmaped buffer (but queued by kevent 
subsystem). If not stored by kevent subsystem (memory failure ?), revert to 
poll() to fetch all 'missed fds' in one row. Go back to normal mode.

- In case of empty ring buffer (or no mmap support at all, because this app 
doesnt expect lot of events per time unit, or because kevent dont have mmap 
support) : Be able to syscall and wait for an event.

Eric
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Evgeniy Polyakov
On Tue, Oct 17, 2006 at 04:25:00PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
 On Tuesday 17 October 2006 16:07, Evgeniy Polyakov wrote:
  On Tue, Oct 17, 2006 at 03:52:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
 wrote:
What about the case, which I described in other e-mail, when in case of
the full ring buffer, no new events are written there, and when
userspace commits (i.e. marks as ready to be freed or requeued by
kernel) some events, new ones will be copied from ready queue into the
buffer?
  
   Then, user might receive 'false events', exactly like
   poll()/select()/epoll() can do sometime. IE a 'ready' indication while
   there is no current event available on a particular fd / event_source.
 
  Only if user simultaneously uses oth interfaces and remove even from the
  queue when it's copy was in mapped buffer, but in that case it's user's
  problem (and if we do want, we can store pointer/index of the ring
  buffer entry, so when event is removed from the ready queue (using
  kevent_get_events()), appropriate entry in the ring buffer will be
  updated to show that it is no longer valid.
 
   This should be safe, since those programs already ignore read()
   returns -EAGAIN and other similar things.
  
   Programmer prefers to receive two 'event available' indications than ZERO
   (and be stuck for infinite time). Of course, hot path (normal cases)
   should return one 'event' only.
  
   In order words, being ultra fast 99.99 % of the time, but being able to
   block forever once in a while is not an option.
 
  Have I missed something? It looks like the only problematic situation is
  described above when user simultaneously uses both interfaces.
 
 In my point of view, user of the 'mmaped ring buffer' should be prepared to 
 use both interfaces. Or else you are forced to presize the ring buffer to 
 insane limits.
 
 That is :
 - Most of the time, we expect consuming events via mmaped ring buffer and no 
 syscalls.
 - In case we notice a 'mmaped ring buffer overflow', syscalls to get/consume 
 events that could not be stored in mmaped buffer (but queued by kevent 
 subsystem). If not stored by kevent subsystem (memory failure ?), revert to 
 poll() to fetch all 'missed fds' in one row. Go back to normal mode.

kevent uses smaller amount of memory than epoll() per event, so it is very
unlikely that it will be impossible to store new event there and epoll()
will succeed. The same can be applied to poll(), which allocates the
whole table in syscall.

 - In case of empty ring buffer (or no mmap support at all, because this app 
 doesnt expect lot of events per time unit, or because kevent dont have mmap 
 support) : Be able to syscall and wait for an event.

So the most complex case is when user is going to use both interfaces,
and it's steps when mapped ring buffer has overflow.
In that case user can either read and mark some events as ready in ring
buffer (the latter is being done through special syscall), so kevent
core will put there new ready events.
User can also get events using usual syscall, in that case events in
ring buffer must be updated - and actually I implemented mapped buffer
in the way which allows to remove events from the queue - queue is a
FIFO, and the first entry to be obtained through syscall is _always_ the
first entry in the ring buffer.

So when user reads event through syscall (no matter if we are in overflow
case or not), even being read is easily accessible in the ring buffer.

So I propose following design for ring buffer (quite simple):
kernelspace maintains two indexes - to the first and the last events in
the ring buffer (and maximum size of the buffer of course).
When new event is marked as ready, some info is being copied into ring
buffer and index of the last entry is increased.
When event is being read through syscall it is _guaranteed_ that that 
event will be at the position pointed by the index of the first
element, that index is then increased (thus opening new slot in the
buffer).
If index of the last entry reaches (with possible wrapping) index of the
first entry, that means that overflow has happend. In this case no new
events can be copied into ring buffer, so they are only placed into
ready queue (accessible through syscall kevent_get_events()).

When user calls kevent_get_events() it will obtain the first element
(pointed by index of the first element in the ring buffer), and if there
is ready event, which is not placed into the ring buffer, it is
copied (with appropriate update of the last index and new overflow
condition).

When userspace calls kevent_wait(num), it means that userspace marks as
ready first (from index of the first element) $num elements, which thus
can be removed (or requeued) and replaced by pending ready events.

Does it sound like clawing over the glass or much better?

 Eric
 -
 To unsubscribe from this list: send the line unsubscribe netdev in
 the body of a message to [EMAIL PROTECTED]
 More 

Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Eric Dumazet
On Tuesday 17 October 2006 17:09, Evgeniy Polyakov wrote:
 On Tue, Oct 17, 2006 at 04:25:00PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
  On Tuesday 17 October 2006 16:07, Evgeniy Polyakov wrote:
   On Tue, Oct 17, 2006 at 03:52:34PM +0200, Eric Dumazet
   ([EMAIL PROTECTED])
 
  wrote:
 What about the case, which I described in other e-mail, when in
 case of the full ring buffer, no new events are written there, and
 when userspace commits (i.e. marks as ready to be freed or requeued
 by kernel) some events, new ones will be copied from ready queue
 into the buffer?
   
Then, user might receive 'false events', exactly like
poll()/select()/epoll() can do sometime. IE a 'ready' indication
while there is no current event available on a particular fd /
event_source.
  
   Only if user simultaneously uses oth interfaces and remove even from
   the queue when it's copy was in mapped buffer, but in that case it's
   user's problem (and if we do want, we can store pointer/index of the
   ring buffer entry, so when event is removed from the ready queue (using
   kevent_get_events()), appropriate entry in the ring buffer will be
   updated to show that it is no longer valid.
  
This should be safe, since those programs already ignore read()
returns -EAGAIN and other similar things.
   
Programmer prefers to receive two 'event available' indications than
ZERO (and be stuck for infinite time). Of course, hot path (normal
cases) should return one 'event' only.
   
In order words, being ultra fast 99.99 % of the time, but being able
to block forever once in a while is not an option.
  
   Have I missed something? It looks like the only problematic situation
   is described above when user simultaneously uses both interfaces.
 
  In my point of view, user of the 'mmaped ring buffer' should be prepared
  to use both interfaces. Or else you are forced to presize the ring buffer
  to insane limits.
 
  That is :
  - Most of the time, we expect consuming events via mmaped ring buffer and
  no syscalls.
  - In case we notice a 'mmaped ring buffer overflow', syscalls to
  get/consume events that could not be stored in mmaped buffer (but queued
  by kevent subsystem). If not stored by kevent subsystem (memory failure
  ?), revert to poll() to fetch all 'missed fds' in one row. Go back to
  normal mode.

 kevent uses smaller amount of memory than epoll() per event, so it is very
 unlikely that it will be impossible to store new event there and epoll()
 will succeed. The same can be applied to poll(), which allocates the
 whole table in syscall.

  - In case of empty ring buffer (or no mmap support at all, because this
  app doesnt expect lot of events per time unit, or because kevent dont
  have mmap support) : Be able to syscall and wait for an event.

 So the most complex case is when user is going to use both interfaces,
 and it's steps when mapped ring buffer has overflow.
 In that case user can either read and mark some events as ready in ring
 buffer (the latter is being done through special syscall), so kevent
 core will put there new ready events.
 User can also get events using usual syscall, in that case events in
 ring buffer must be updated - and actually I implemented mapped buffer
 in the way which allows to remove events from the queue - queue is a
 FIFO, and the first entry to be obtained through syscall is _always_ the
 first entry in the ring buffer.

 So when user reads event through syscall (no matter if we are in overflow
 case or not), even being read is easily accessible in the ring buffer.

 So I propose following design for ring buffer (quite simple):
 kernelspace maintains two indexes - to the first and the last events in
 the ring buffer (and maximum size of the buffer of course).
 When new event is marked as ready, some info is being copied into ring
 buffer and index of the last entry is increased.
 When event is being read through syscall it is _guaranteed_ that that
 event will be at the position pointed by the index of the first
 element, that index is then increased (thus opening new slot in the
 buffer).
 If index of the last entry reaches (with possible wrapping) index of the
 first entry, that means that overflow has happend. In this case no new
 events can be copied into ring buffer, so they are only placed into
 ready queue (accessible through syscall kevent_get_events()).

 When user calls kevent_get_events() it will obtain the first element
 (pointed by index of the first element in the ring buffer), and if there
 is ready event, which is not placed into the ring buffer, it is
 copied (with appropriate update of the last index and new overflow
 condition).

Well, I'm not sure its good to do this 'move one event from ready list to slot 
X', one by one, because this event will likely be flushed out of processor 
cache (because we will have to consume 4096 events before reaching this one). 
I think its better to batch 

Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Hans Henrik Happe
On Tuesday 17 October 2006 16:25, Eric Dumazet wrote:
 On Tuesday 17 October 2006 16:07, Evgeniy Polyakov wrote:
  On Tue, Oct 17, 2006 at 03:52:34PM +0200, Eric Dumazet 
([EMAIL PROTECTED]) 
 wrote:
What about the case, which I described in other e-mail, when in case 
of
the full ring buffer, no new events are written there, and when
userspace commits (i.e. marks as ready to be freed or requeued by
kernel) some events, new ones will be copied from ready queue into the
buffer?
  
   Then, user might receive 'false events', exactly like
   poll()/select()/epoll() can do sometime. IE a 'ready' indication while
   there is no current event available on a particular fd / event_source.
 
  Only if user simultaneously uses oth interfaces and remove even from the
  queue when it's copy was in mapped buffer, but in that case it's user's
  problem (and if we do want, we can store pointer/index of the ring
  buffer entry, so when event is removed from the ready queue (using
  kevent_get_events()), appropriate entry in the ring buffer will be
  updated to show that it is no longer valid.
 
   This should be safe, since those programs already ignore read()
   returns -EAGAIN and other similar things.
  
   Programmer prefers to receive two 'event available' indications than 
ZERO
   (and be stuck for infinite time). Of course, hot path (normal cases)
   should return one 'event' only.
  
   In order words, being ultra fast 99.99 % of the time, but being able to
   block forever once in a while is not an option.
 
  Have I missed something? It looks like the only problematic situation is
  described above when user simultaneously uses both interfaces.
 
 In my point of view, user of the 'mmaped ring buffer' should be prepared to 
 use both interfaces. Or else you are forced to presize the ring buffer to 
 insane limits.

I don't see why overflow couldn't be handle by a syscall telling the kernel 
that the buffer is ready for new events. As mentioned most of the time 
overflow should not happend and if it does the syscall should be amortized 
nicely by the number of events.

 That is :
 - Most of the time, we expect consuming events via mmaped ring buffer and no 
 syscalls.
 - In case we notice a 'mmaped ring buffer overflow', syscalls to get/consume 
 events that could not be stored in mmaped buffer (but queued by kevent 
 subsystem). If not stored by kevent subsystem (memory failure ?), revert to 
 poll() to fetch all 'missed fds' in one row. Go back to normal mode.
 
 - In case of empty ring buffer (or no mmap support at all, because this app 
 doesnt expect lot of events per time unit, or because kevent dont have mmap 
 support) : Be able to syscall and wait for an event.

As I see it there are two main problems with a mmapped ring buffer (correct me 
if I'm wrong):

1. Overflow.
2. Handle multiple kernel event that only needs one  user event. I.e. multiple 
packet arriving at the same socket. The user should only see one IN event at 
the time he is ready to handle it.

In an earlier post I suggested a scheme that solves these issues. It was based 
on the assumption that kernel and user-space share index variables and can 
read/update them atomically without much overhead. Only in cases where the 
buffer is empty and full system call would be required.

Hans Henrik Happe
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Evgeniy Polyakov
On Tue, Oct 17, 2006 at 05:32:28PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
  So the most complex case is when user is going to use both interfaces,
  and it's steps when mapped ring buffer has overflow.
  In that case user can either read and mark some events as ready in ring
  buffer (the latter is being done through special syscall), so kevent
  core will put there new ready events.
  User can also get events using usual syscall, in that case events in
  ring buffer must be updated - and actually I implemented mapped buffer
  in the way which allows to remove events from the queue - queue is a
  FIFO, and the first entry to be obtained through syscall is _always_ the
  first entry in the ring buffer.
 
  So when user reads event through syscall (no matter if we are in overflow
  case or not), even being read is easily accessible in the ring buffer.
 
  So I propose following design for ring buffer (quite simple):
  kernelspace maintains two indexes - to the first and the last events in
  the ring buffer (and maximum size of the buffer of course).
  When new event is marked as ready, some info is being copied into ring
  buffer and index of the last entry is increased.
  When event is being read through syscall it is _guaranteed_ that that
  event will be at the position pointed by the index of the first
  element, that index is then increased (thus opening new slot in the
  buffer).
  If index of the last entry reaches (with possible wrapping) index of the
  first entry, that means that overflow has happend. In this case no new
  events can be copied into ring buffer, so they are only placed into
  ready queue (accessible through syscall kevent_get_events()).
 
  When user calls kevent_get_events() it will obtain the first element
  (pointed by index of the first element in the ring buffer), and if there
  is ready event, which is not placed into the ring buffer, it is
  copied (with appropriate update of the last index and new overflow
  condition).
 
 Well, I'm not sure its good to do this 'move one event from ready list to 
 slot 
 X', one by one, because this event will likely be flushed out of processor 
 cache (because we will have to consume 4096 events before reaching this one). 
 I think its better to batch this kind of 'push XX events' later, XX being 
 small enough not to waste CPU cache, and when ring buffer is empty again.

Ok, that's possible.

 mmap buffer is good for latency and minimum synchro between user thread and 
 kernel producer. But once we hit an 'overflow', it is better to revert to a 
 mode feeding XX events per syscall, to be sure it fits CPU caches : The user 
 thread will do the copy between kernel memory to user memory, and this thread 
 will shortly use those events in user land.

User can do both - either get events through syscall, or get them from
mapped ring buffer when it is refilled.

 BTW, maintaining coherency on mmap buffer is expensive : once a event is 
 copied to mmap buffer, kernel has to issue a smp_mb() before updating the 
 index, so that a user thread wont start to consume an event with random 
 values because its CPU see the update on index before updates on data.

There will be some tricks with barriers indeed.

 Once all the queue is flushed in efficient way, we can switch to mmap mode 
 again.
 
 Eric

Ok, there is one apologist for mmap buffer implementation, who forced me
to create first implementation, which was dropped due to absense of
remote mental reading abilities. 
Ulrich, does above approach sound good for you? 
I actually do not want to reimplement something, that will be
pointed to with words 'no matter what you say, it is broken and I do not 
want it' again :).

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Eric Dumazet
On Tuesday 17 October 2006 18:01, Evgeniy Polyakov wrote:

 Ok, there is one apologist for mmap buffer implementation, who forced me
 to create first implementation, which was dropped due to absense of
 remote mental reading abilities.
 Ulrich, does above approach sound good for you?
 I actually do not want to reimplement something, that will be
 pointed to with words 'no matter what you say, it is broken and I do not
 want it' again :).

In my humble opinion, you should first write a 'real application', to show how 
the mmap buffer and kevent syscalls would be used (fast path and 
slow/recovery paths). I am sure it would be easier for everybody to agree on 
the API *before* you start coding a *lot* of hard (kernel) stuff : It would 
certainly save your mental CPU cycles (and ours too :) )

This 'real application' could be  the event loop of a simple HTTP server, or a 
basic 'echo all' server. Adding the bits about timers events and signals 
should be done too.

Eric
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Evgeniy Polyakov
On Tue, Oct 17, 2006 at 06:26:04PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
 On Tuesday 17 October 2006 18:01, Evgeniy Polyakov wrote:
 
  Ok, there is one apologist for mmap buffer implementation, who forced me
  to create first implementation, which was dropped due to absense of
  remote mental reading abilities.
  Ulrich, does above approach sound good for you?
  I actually do not want to reimplement something, that will be
  pointed to with words 'no matter what you say, it is broken and I do not
  want it' again :).
 
 In my humble opinion, you should first write a 'real application', to show 
 how 
 the mmap buffer and kevent syscalls would be used (fast path and 
 slow/recovery paths). I am sure it would be easier for everybody to agree on 
 the API *before* you start coding a *lot* of hard (kernel) stuff : It would 
 certainly save your mental CPU cycles (and ours too :) )

 This 'real application' could be  the event loop of a simple HTTP server, or 
 a 
 basic 'echo all' server. Adding the bits about timers events and signals 
 should be done too.

I wrote one with previous ring buffer implementation - it used timers
and echoed when they fired, it was even described in details in one of the 
lwn.net articles.

I'm not going to waste others and my time implementing feature requests
without at least _some_ feedback from those who asked them.
In case when person, originally requested some feature, does not answer
and there are other opinions, only they will be get into account of
course.

 Eric

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Eric Dumazet
On Tuesday 17 October 2006 18:35, Evgeniy Polyakov wrote:
 On Tue, Oct 17, 2006 at 06:26:04PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
  On Tuesday 17 October 2006 18:01, Evgeniy Polyakov wrote:
   Ok, there is one apologist for mmap buffer implementation, who forced
   me to create first implementation, which was dropped due to absense of
   remote mental reading abilities.
   Ulrich, does above approach sound good for you?
   I actually do not want to reimplement something, that will be
   pointed to with words 'no matter what you say, it is broken and I do
   not want it' again :).
 
  In my humble opinion, you should first write a 'real application', to
  show how the mmap buffer and kevent syscalls would be used (fast path and
  slow/recovery paths). I am sure it would be easier for everybody to agree
  on the API *before* you start coding a *lot* of hard (kernel) stuff : It
  would certainly save your mental CPU cycles (and ours too :) )
 
  This 'real application' could be  the event loop of a simple HTTP server,
  or a basic 'echo all' server. Adding the bits about timers events and
  signals should be done too.

 I wrote one with previous ring buffer implementation - it used timers
 and echoed when they fired, it was even described in details in one of the
 lwn.net articles.

 I'm not going to waste others and my time implementing feature requests
 without at least _some_ feedback from those who asked them.
 In case when person, originally requested some feature, does not answer
 and there are other opinions, only they will be get into account of
 course.

I am not sure I understand what you wrote, English is not our native language.

I think many people gave you feedbacks. I feel that all feedback on this 
mailing list is constructive. Many posts/patches on this list are never 
commented at all.

Eric
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Evgeniy Polyakov
On Tue, Oct 17, 2006 at 06:45:54PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
 On Tuesday 17 October 2006 18:35, Evgeniy Polyakov wrote:
  On Tue, Oct 17, 2006 at 06:26:04PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
 wrote:
   On Tuesday 17 October 2006 18:01, Evgeniy Polyakov wrote:
Ok, there is one apologist for mmap buffer implementation, who forced
me to create first implementation, which was dropped due to absense of
remote mental reading abilities.
Ulrich, does above approach sound good for you?
I actually do not want to reimplement something, that will be
pointed to with words 'no matter what you say, it is broken and I do
not want it' again :).
  
   In my humble opinion, you should first write a 'real application', to
   show how the mmap buffer and kevent syscalls would be used (fast path and
   slow/recovery paths). I am sure it would be easier for everybody to agree
   on the API *before* you start coding a *lot* of hard (kernel) stuff : It
   would certainly save your mental CPU cycles (and ours too :) )
  
   This 'real application' could be  the event loop of a simple HTTP server,
   or a basic 'echo all' server. Adding the bits about timers events and
   signals should be done too.
 
  I wrote one with previous ring buffer implementation - it used timers
  and echoed when they fired, it was even described in details in one of the
  lwn.net articles.
 
  I'm not going to waste others and my time implementing feature requests
  without at least _some_ feedback from those who asked them.
  In case when person, originally requested some feature, does not answer
  and there are other opinions, only they will be get into account of
  course.
 
 I am not sure I understand what you wrote, English is not our native language.
 
 I think many people gave you feedbacks. I feel that all feedback on this 
 mailing list is constructive. Many posts/patches on this list are never 
 commented at all.

And I do greatly appreciate feedback from those people!

But I do not understand why I never got feedback on initial design and
implementation (and then created as far as I recall at least 10
releases) from Ulrich, who first asked for such a feture. 
So right now I'm waiting for his opinion on that problem, even if it will 
be 'it sucks' again, but at least in that case I will not waste people's time.

Ulrich, could you please comment on design notes sent couple of mail
above?

 Eric

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-17 Thread Eric Dumazet

Evgeniy Polyakov a e'crit :

On Tue, Oct 17, 2006 at 06:45:54PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:

I am not sure I understand what you wrote, English is not our native language.

I think many people gave you feedbacks. I feel that all feedback on this 
mailing list is constructive. Many posts/patches on this list are never 
commented at all.


And I do greatly appreciate feedback from those people!

But I do not understand why I never got feedback on initial design and
implementation (and then created as far as I recall at least 10
releases) from Ulrich, who first asked for such a feture. 
So right now I'm waiting for his opinion on that problem, even if it will 
be 'it sucks' again, but at least in that case I will not waste people's time.


Ulrich, could you please comment on design notes sent couple of mail
above?



Ulrich is a very busy man. We have to live with that.

rant_mode
For example, I *complained* one day, that each glibc fopen()/fread()/fclose() 
pass does a mmap()/munmap() to obtain a single 4KB of memory, without any 
cache mechanism. This badly hurts performance of multi-threaded programs as we 
know mmap()/munmap() has to down_write(mm-mmap_sem); and play VM games.


So to avoid this, I manually call setvbuf() in my own programs, to provide a 
suitable buffer to glibc, because of its suboptimal default allocation, 
vestige of an old epoch...

/rant_mode

Eric

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-16 Thread Evgeniy Polyakov
On Sun, Oct 15, 2006 at 04:22:45PM -0700, Ulrich Drepper ([EMAIL PROTECTED]) 
wrote:
 Evgeniy Polyakov wrote:
 Existing design does not allow overflow.
 
 And I've pointed out a number of times that this is not practical at 
 best.  There are event sources which can create events which cannot be 
 coalesced into one single event as it would be required with your design.
 
 Signals are one example, specifically realtime signals.  If we do not 
 want the design to be limited from the start this approach has to be 
 thought over.

The whole idea of mmap buffer seems to be broken, since those who asked
for creation do not like existing design and do not show theirs...

According to signals and possibility to overflow in existing ring buffer
implementation.
You seems to not checked the code - each event can be marked as ready 
only one time, which means only one copy and so on.
It was done _specially_. And it is not limitation, but new approach.
Queue of the same signals or any other events has fundamental flawness
(as any other ring buffer implementation, which has queue size)  -
it's size of the queue and extremely bad case of the overflow.
So, the same event may not be ready several times. Any design which
allows to create infinite number of events generated for the same case
is broken, since consumer can be in situation, when it can not handle
that flow. That is why poll() returns only POLLIN when data is ready in
network stack, but is not trying to generate some kind of a signal for 
each byte/packet/MTU/MSS received.
RT signals have design problems, and I will not repeate the same error
with similar limits in kevent.

 So zap mmap() support completely, since it is not usable at all. We wont 
 discuss on it.
 
 Initial implementation did not have it.
 But I was requested to do it, and it is ready now.
 No one likes it, but no one provides an alternative implementation.
 We are stuck.
 
 We need the mapped ring buffer.  The current design (before it was 
 removed) was broken but this does not mean it shouldn't be implemented. 
  We just need more time to figure out how to implement it correctly.

In the latest patchset it was removed. I'm waiting for your code.

Mmap implementation can be added separately, since it does not affect
kevent core.

 -- 
 ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, 
 CA ❖

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-16 Thread Ulrich Drepper

Evgeniy Polyakov wrote:

The whole idea of mmap buffer seems to be broken, since those who asked
for creation do not like existing design and do not show theirs...


What kind of argumentation is that?

   Because my attempt to implement it doesn't work and nobody right
away has a better suggestion this means the idea is broken.

Nonsense.

It just means that time should be spend on thinking about this.  You cut 
all this short by rushing out your attempt without any discussions. 
Unfortunately nobody else really looked at the approach so it lingered 
around for some weeks.  Well, now it is clear that it is not the right 
approach and we can start thinking about it again.



You seems to not checked the code - each event can be marked as ready 
only one time, which means only one copy and so on.

It was done _specially_. And it is not limitation, but new approach.


I know that it is done deliberately and I tell you that this is wrong 
and unacceptable.  Realtime signals are one event which need to have 
more than one event queued.  This is no description of what you have 
implemented, it's a description of the reality of realtime signals.


RT signals are queued.  They carry a data value (the sigval_t object) 
which can be unique for each signal delivery.  Coalescing the signal 
events therefore leads to information loss.


Therefore, at the very least for signal we need to have the ability to 
queue more than one event for each event source.  Not having this 
functionality means that signals and likely other types of events cannot 
be implemented using kevent queues.




Queue of the same signals or any other events has fundamental flawness
(as any other ring buffer implementation, which has queue size)  -
it's size of the queue and extremely bad case of the overflow.


Of course there are additional problems.  Overflows need to be handled. 
 But this is nothing which is unsolvable.




So, the same event may not be ready several times. Any design which
allows to create infinite number of events generated for the same case
is broken, since consumer can be in situation, when it can not handle
that flow.


That's complete nonsense.  Again, for RT signals it is very reasonable 
and not broken to have multiple outstanding signals.




That is why poll() returns only POLLIN when data is ready in
network stack, but is not trying to generate some kind of a signal for 
each byte/packet/MTU/MSS received.


It makes no sense to drag poll() into this discussion.  poll() is a very 
limited interface.  The new event handling is supposed to be the 
opposite, namely, usable for all kinds of events.  Arguing that because 
poll() does it like this just means you don't see what big step is 
needed to get to the goal of a unified event handling.  The shackles of 
poll() must be left behind.




RT signals have design problems, and I will not repeate the same error
with similar limits in kevent.


I don't know what to say.  You claim to be the source of all wisdom is 
OS design.  Maybe you should design your own OS, from ground up.  I 
wonder how many people would like that since all your arguments are 
squarely geared towards optimizing the implementation.  But: the 
implementation is irrelevant without users.  The functionality users (= 
programmers) want and need is what must drive the implementation.  And 
RT signals are definitely heavily used and liked by programmers.  You 
have to accept that you try to modify an OS which has that functionality 
regardless of how much you hate it and want to fight it.




Mmap implementation can be added separately, since it does not affect
kevent core.


That I doubt very much and it is why I would not want the kevent stuff 
go into any released kernel until that detail is resolved.


--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-16 Thread Evgeniy Polyakov
On Mon, Oct 16, 2006 at 03:16:15AM -0700, Ulrich Drepper ([EMAIL PROTECTED]) 
wrote:
 Evgeniy Polyakov wrote:
 The whole idea of mmap buffer seems to be broken, since those who asked
 for creation do not like existing design and do not show theirs...
 
 What kind of argumentation is that?
 
Because my attempt to implement it doesn't work and nobody right
 away has a better suggestion this means the idea is broken.
 
 Nonsense.

Ok, let's reformulate:
My attempt works, but nobody around likes it, I remove it and wait until
some other implement it.

 It just means that time should be spend on thinking about this.  You cut 
 all this short by rushing out your attempt without any discussions. 
 Unfortunately nobody else really looked at the approach so it lingered 
 around for some weeks.  Well, now it is clear that it is not the right 
 approach and we can start thinking about it again.

I talked about it in the last 13 releases of the kevent, and _noone_
said at least some comments. And now I get - 'it is broken, it does not
work, there are problems, we do not want it' and the like. I tried
hardly to show that it does work and problems shown can not happen, but
noone still hears me. Since I think it is not that interface which is
100% required for correct functionality, I removed it. When there are
better suggestions and implementation we can return to them of course.

 You seems to not checked the code - each event can be marked as ready 
 only one time, which means only one copy and so on.
 It was done _specially_. And it is not limitation, but new approach.
 
 I know that it is done deliberately and I tell you that this is wrong 
 and unacceptable.  Realtime signals are one event which need to have 
 more than one event queued.  This is no description of what you have 
 implemented, it's a description of the reality of realtime signals.
 
 RT signals are queued.  They carry a data value (the sigval_t object) 
 which can be unique for each signal delivery.  Coalescing the signal 
 events therefore leads to information loss.
 
 Therefore, at the very least for signal we need to have the ability to 
 queue more than one event for each event source.  Not having this 
 functionality means that signals and likely other types of events cannot 
 be implemented using kevent queues.

Well, my point about rt-signals is that they do not deserve to be
resurrected, but it is only my point :)
In case it is still used, each signal setup should create event - many
signals means many events, each signal can be sent with different
parameters - each event should correspond to one unique case.

 Queue of the same signals or any other events has fundamental flawness
 (as any other ring buffer implementation, which has queue size)  -
 it's size of the queue and extremely bad case of the overflow.
 
 Of course there are additional problems.  Overflows need to be handled. 
  But this is nothing which is unsolvable.

I strongly disagree that having design which allows overflows is
acceptible - do we really want rt-signals queue overflow problems in new
place? Instead some complex allocation scheme can be created.

 So, the same event may not be ready several times. Any design which
 allows to create infinite number of events generated for the same case
 is broken, since consumer can be in situation, when it can not handle
 that flow.
 
 That's complete nonsense.  Again, for RT signals it is very reasonable 
 and not broken to have multiple outstanding signals.

The same signal with different payload is acceptible, but when number of
them increases ulimit and they are started to be forgotten - that's what
I call broken design.

 That is why poll() returns only POLLIN when data is ready in
 network stack, but is not trying to generate some kind of a signal for 
 each byte/packet/MTU/MSS received.
 
 It makes no sense to drag poll() into this discussion.  poll() is a very 
 limited interface.  The new event handling is supposed to be the 
 opposite, namely, usable for all kinds of events.  Arguing that because 
 poll() does it like this just means you don't see what big step is 
 needed to get to the goal of a unified event handling.  The shackles of 
 poll() must be left behind.

Kevent is that subsystem, and for now it works quite good.

 RT signals have design problems, and I will not repeate the same error
 with similar limits in kevent.
 
 I don't know what to say.  You claim to be the source of all wisdom is 
 OS design.  Maybe you should design your own OS, from ground up.  I 
 wonder how many people would like that since all your arguments are 
 squarely geared towards optimizing the implementation.  But: the 
 implementation is irrelevant without users.  The functionality users (= 
 programmers) want and need is what must drive the implementation.  And 
 RT signals are definitely heavily used and liked by programmers.  You 
 have to accept that you try to modify an OS which has that functionality 
 regardless of how 

Re: [take19 1/4] kevent: Core files.

2006-10-16 Thread Johann Borck
Ulrich Drepper wrote:
 Evgeniy Polyakov wrote:
 Existing design does not allow overflow.

 And I've pointed out a number of times that this is not practical at
 best.  There are event sources which can create events which cannot be
 coalesced into one single event as it would be required with your design.

 Signals are one example, specifically realtime signals.  If we do not
 want the design to be limited from the start this approach has to be
 thought over.


 So zap mmap() support completely, since it is not usable at all. We
 wont discuss on it.

 Initial implementation did not have it.
 But I was requested to do it, and it is ready now.
 No one likes it, but no one provides an alternative implementation.
 We are stuck.

 We need the mapped ring buffer.  The current design (before it was
 removed) was broken but this does not mean it shouldn't be
 implemented.  We just need more time to figure out how to implement it
 correctly.

Considering the if at all and if then how of ring buffer implemetation
I'd like to throw in some ideas I had when reading the discussion and
respective code. If I understood Ulrich Drepper right, his notion of a
generic event handling interface is, that it has to be flexible enough
to transport additional info from origin to userspace, and to support
queuing of events from the same origin, so that additional
per-event-occurrence data doesn't get lost, which would happen when
coalescing multiple events into one until delivery. From what I read he
says ring buffer is broken because of  insufficient space for additional
data (mukevent) and the limited number of events that can be put into
ring buffer. Another argument is missing notification of userspace about
dropped events in case ring buffer limit is reached. (is that right?)
I see no reason why kevent couldn't be modified to fit (all) these
needs. While modifying the server-example and writing a client using
kevent I came across the coalescing problem, there were more incoming
connections than accept events, and I had to work around that. In this
case the pure number of coalesced events would suffice, while it
wouldn't for the example of RT-signals that Ulrich Drepper gave. So if
coalescing can be done at all or if it is impossible depends on the type
of event. The same goes for additional data delivered with the events.
There might be no panacea for all possible scenarios with one fixed
design. Either performance suffers for 'lightweight' events  which don't
need additional data and/or coalescing is not problematic and/or ring
buffer, or kevent is not usable for other types of events. Why not treat
different things differently, and let the (kernel-)user decide.
I don't know if I got all this right, but if, then ring buffer is needed
especially for cases where coalescing is not possible and additional
data has to be delivered for each triggered notification (so the pure
number of events is not enough; other reasons? performance? ). To me it
doesn't make sense to have kevent fill memory and use processor-time if
buffer is not used at all, which is the case when using kevent_getevents.
So here are my Ideas:
Make usage of ring buffer optional, if not required for specific
event-type it might be chosen by userspace-code.
Make limit of events in ring buffer optional and controllable from
userspace.
Regarding mukevent I'm thinking of a event-type specific struct, that is
filled by the originating code, and placed into a per-event-type ring
buffer (which  requires modification of kevent_wait). To my limited
understanding it seems that alternative or modified versions of
kevent_storage_ready, (__)kevent_requeue and kevent_user_ring_add_event
could return a void pointer to the position in buffer, and all kevent
has to know about is the size of the struct.
If coalescing doesn't hurt for a specific event-type it might just be
modified to notify userspace about the number of coalesced events. Make
it depend on type of event.

I know this doesn't address all objections that have been made, and
Evgeniy, big sorry for this being just talk again, and maybe not even
applicable for some reasons I do not overlook, but maybe it's worth
consideration. I'll gladly try to put that into code, and see where it
leads. I think kevent is great, and if things can be done to increase
it's genericity without sacrifying performance, why not.
Sorry for the length of post and repetitions,

Johann
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-15 Thread Ulrich Drepper

Evgeniy Polyakov wrote:

Existing design does not allow overflow.


And I've pointed out a number of times that this is not practical at 
best.  There are event sources which can create events which cannot be 
coalesced into one single event as it would be required with your design.


Signals are one example, specifically realtime signals.  If we do not 
want the design to be limited from the start this approach has to be 
thought over.



So zap mmap() support completely, since it is not usable at all. We wont 
discuss on it.


Initial implementation did not have it.
But I was requested to do it, and it is ready now.
No one likes it, but no one provides an alternative implementation.
We are stuck.


We need the mapped ring buffer.  The current design (before it was 
removed) was broken but this does not mean it shouldn't be implemented. 
 We just need more time to figure out how to implement it correctly.


--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-05 Thread Evgeniy Polyakov
On Wed, Oct 04, 2006 at 10:57:32AM -0700, Ulrich Drepper ([EMAIL PROTECTED]) 
wrote:
 On 10/3/06, Evgeniy Polyakov [EMAIL PROTECTED] wrote:
 http://tservice.net.ru/~s0mbre/archive/kevent/evserver_kevent.c
 http://tservice.net.ru/~s0mbre/archive/kevent/evtest.c
 
 These are simple programs which by themselves have problems.  For
 instance, I consider a very bad idea to hardcode the size of the ring
 buffer.  Specifying macros in the header file counts as hardcoding.
 Systems grow over time and so will the demand of connections.  I have
 no problem with the kernel hardcoding the value internally (or having
 a /proc entry to select it) but programs should be able to dynamically
 learn about the value so they don't have to be recompiled.

Well, it is possible to create /sys/proc entry for that, and even now 
userspace can grow mapping ring until it is forbiden by kernel, which
means limit is reached.

Actually the whole idea with global limit of kevents does not sound very
good to me, but it is required to remove overflow in mapped buffer.

 But more problematic is that I don't see how the interfaces can be
 efficiently used in multi-threaded (or multi-process) programs.  How
 would multiple threads using the same kevent queue and running in the
 same kevent_get_events() loop work out?  How do they guarantee that
 each request is only handled once?

kqueue_dequeue_ready() is atomic and this function removes kevent from
ready queue so other thread can not get it.

 From what I see now this means a second data structure is needed to
 keep track of the state of each entry.  But even then, how do we even
 recognized used ring buffer entries?
 
 For instance, assume two threads.  Both call get_events, one event is
 reported, both threads are woken up (which is another thing to
 consider, more later).  One thread uses ring buffer entry, the other
 goes back to sleep in get_events.  Now, how does the kernel know when
 the other thread is done working on the ring buffer entry?  There
 might be lots of entries coming in overflowing the entire buffer.
 Heck, you don't even need two threads for this scenario.

Are you talking about mapped buffer or syscall interface?
The former has special syscall kevent_wait(), which reports number of
'processed' events and first processed number, so kernel can remove all
appropriate events. The latter is described above -
kqueue_dequeue_ready() is atomic, so that event will be removed from the
ready queue and optionally from the whole kevent tree.

It is possible to work with both interfaces at the same time, since
mapped buffer contains a copy of the event, which is potentially freed
and processed by other thread. 

Actually I do not like idea of mapped ring anyway, since if application 
uses a lot of events, it will batch them into big chunks, so syscall 
overhead is negligible, if application uses small number of events, 
syscalls will be rare and will not hurt performance.

 When I was thinking about this (and discussing it in Ottawa) I was
 always assuming that we have a status field in the ring buffer entry
 which lets the userlevel code indicate whether the entry is free again
 or not.  This requires a writable mapping, yes, and potentially causes
 cache line ping-pong.  I think Zach mentioned he has some ideas about
 this.

As far as I can see, there are no other ideas on how to implement ring
buffer, so I did it like I wanted. It has some limitation indeed, but
since I do not see any other code, how can I say what is better or
worse?
 
 As for the multiple thread wakeup, I mentioned this before.  We have
 to avoid the trampling herd problem.  We cannot wakeup all waiters.
 But we also cannot assume that, without protocols, waking up just one
 for each available entry is sufficient.  So the first question is:
 what is the current policy?

It is a good practice to _not_ share the same queue between a lot of
threads. Currently all waiters are awakened.

 AIO was removed from patchset by request of Cristoph.
 Timers, network AIO, fs AIO, socket nortifications and poll/select
 events work well with existing structures.
 
 Well, excuse me if I don't take your word for it.  I agree, the AIO
 code should not be submitted along with this.  The same for any other
 code using the event handling.  But we need to check whether the
 interface is generic enough to accomodate them in a way which actually
 makes sense.  Again, think highly threaded processes or multiple
 processes sharing the same event queue.

You missed the point.
I implemented _all_ above and it does work.
Although it was removed from submission patchset.
You can find all patches on kevent homepage, they were posted to lkml@
and netdev@ too many times to miss them.
 
 It is even possible to create variable sized kevents - each kevent
 contain pointer to user's data, which can be considered as pointer to
 additional area (it's size kernel implementation for given kevent type
 can determine from other parameters or use 

Re: [take19 1/4] kevent: Core files.

2006-10-05 Thread Eric Dumazet
On Thursday 05 October 2006 10:57, Evgeniy Polyakov wrote:

 Well, it is possible to create /sys/proc entry for that, and even now
 userspace can grow mapping ring until it is forbiden by kernel, which
 means limit is reached.

No need for yet another /sys/proc entry.

Right now, I (for example) may have a use for Generic event handling, but for 
a program that needs XXX.XXX handles, and about XX.XXX events per second.

Right now, this program uses epoll, and reaches no limit at all, once you pass 
the ulimit -n, and other kernel wide tunes of course, not related to epoll.

With your current kevent, I cannot switch to it, because of hardcoded limits.

I may be wrong, but what is currently missing for me is :

- No hardcoded limit on the max number of events. (A process that can open 
XXX.XXX files should be allowed to open a kevent queue with at least XXX.XXX 
events). Right now thats not clear what happens IF the current limit is 
reached.

- In order to avoid touching the whole ring buffer, it might be good to be 
able to reset the indexes to the beginning when ring buffer is empty. (So if 
the user land is responsive enough to consume events, only first pages of the 
mapping would be used : that saves L1/L2 cpu caches)

A plus would be

- A working/usable mmap ring buffer implementation, but I think its not 
mandatory. System calls are not that expensive, especially if you can batch 
XX events per syscall (like epoll). Nice thing with a ring buffer is that we 
touch less cache lines than say epoll that have lot of linked structures.

About mmap, I think you might want a hybrid thing :

One writable page where userland can write its index, (and hold one or more 
futex shared by kernel) (with appropriate thread locking in case multiple 
threads want to dequeue events). In fast path, no syscalls are needed to 
maintain this user index.

XXX readonly pages (for user, but r/w for kernel), where kernel write its own 
index, and events of course.

Using separate cache lines avoid false sharing : kernel can update its own 
index and events without having to pay the price of cache line ping pongs.
It could use futex infrastructure to wakeup one thread 'only' instead of all 
threads waiting an event.


Eric
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-05 Thread Eric Dumazet
On Thursday 05 October 2006 12:55, Evgeniy Polyakov wrote:
 On Thu, Oct 05, 2006 at 12:45:03PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
 
  What is missing or not obvious is : If events are skipped because of
  overflows, What happens ? Connections stuck forever ? Hope that
  everything will restore itself ? Is kernel able to SIGNAL this problem to
  user land ?

 Exisitng  code does not overflow by design, but can consume a lot of
 memory. I talked about the case, when there will be some limit on
 number of entries put into mapped buffer.

You still dont answer my question. Please answer the question.
Recap : You have a max of  events queued. A network message come and 
kernel want to add another event. It cannot because limit is reached. How the 
User Program knows that this problem was hit ?


 It is the same.
 What if reing buffer was grown upto 3 entry, and is now empty, and we
 need to put there 4 entries? Grow it again?
 It can be done, easily, but it looks like a workaround not as solution.
 And it is highly unlikely that in situation, when there are a lot of
 event, ring can be empty.

I dont speak of re-allocation of ring buffer. I dont care to allocate at 
startup a big enough buffer.

Say you have allocated a ring buffer of 1024*1024 entries.
Then you queue 100 events per second, and dequeue them immediatly.
No need to blindly use all 1024*1024 slots in the ring buffer, doing 
index = (index+1)%(1024*1024)



 epoll() does not have mmap.
 Problem is not about how many events can be put into the kernel, but how
 many of them can be put into mapped buffer.
 There is no problem if mmap is turned off.

So zap mmap() support completely, since it is not usable at all. We wont 
discuss on it.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-05 Thread Evgeniy Polyakov
On Thu, Oct 05, 2006 at 02:09:31PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
 On Thursday 05 October 2006 12:55, Evgeniy Polyakov wrote:
  On Thu, Oct 05, 2006 at 12:45:03PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
  
   What is missing or not obvious is : If events are skipped because of
   overflows, What happens ? Connections stuck forever ? Hope that
   everything will restore itself ? Is kernel able to SIGNAL this problem to
   user land ?
 
  Exisitng  code does not overflow by design, but can consume a lot of
  memory. I talked about the case, when there will be some limit on
  number of entries put into mapped buffer.
 
 You still dont answer my question. Please answer the question.
 Recap : You have a max of  events queued. A network message come and 
 kernel want to add another event. It cannot because limit is reached. How the 
 User Program knows that this problem was hit ?

Existing design does not allow overflow.
If event was added into the queue (like user requested notification,
when new data has arrived), it is guaranteed that there will be place to
put that event into mapped buffer when it is ready.

If user wants to add anotehr event (for example after accept() user
wants to add another socket with request for notification about data
arrival into that socket), it can fail though. This limit is introduced
only because of mmap buffer.
 
  It is the same.
  What if reing buffer was grown upto 3 entry, and is now empty, and we
  need to put there 4 entries? Grow it again?
  It can be done, easily, but it looks like a workaround not as solution.
  And it is highly unlikely that in situation, when there are a lot of
  event, ring can be empty.
 
 I dont speak of re-allocation of ring buffer. I dont care to allocate at 
 startup a big enough buffer.
 
 Say you have allocated a ring buffer of 1024*1024 entries.
 Then you queue 100 events per second, and dequeue them immediatly.
 No need to blindly use all 1024*1024 slots in the ring buffer, doing 
 index = (index+1)%(1024*1024)

But what if they are not dequeued immediateyl? What if rate is high and
while one tries to dequeue, system adds another events?

  epoll() does not have mmap.
  Problem is not about how many events can be put into the kernel, but how
  many of them can be put into mapped buffer.
  There is no problem if mmap is turned off.
 
 So zap mmap() support completely, since it is not usable at all. We wont 
 discuss on it.

Initial implementation did not have it.
But I was requested to do it, and it is ready now.
No one likes it, but no one provides an alternative implementation.
We are stuck.

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-05 Thread Hans Henrik Happe
On Thursday 05 October 2006 12:21, Evgeniy Polyakov wrote:
 On Thu, Oct 05, 2006 at 11:56:24AM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
  On Thursday 05 October 2006 10:57, Evgeniy Polyakov wrote:
  
   Well, it is possible to create /sys/proc entry for that, and even now
   userspace can grow mapping ring until it is forbiden by kernel, which
   means limit is reached.
  
  No need for yet another /sys/proc entry.
  
  Right now, I (for example) may have a use for Generic event handling, but 
for 
  a program that needs XXX.XXX handles, and about XX.XXX events per second.
  
  Right now, this program uses epoll, and reaches no limit at all, once you 
pass 
  the ulimit -n, and other kernel wide tunes of course, not related to 
epoll.
  
  With your current kevent, I cannot switch to it, because of hardcoded 
limits.
  
  I may be wrong, but what is currently missing for me is :
  
  - No hardcoded limit on the max number of events. (A process that can open 
  XXX.XXX files should be allowed to open a kevent queue with at least 
XXX.XXX 
  events). Right now thats not clear what happens IF the current limit is 
  reached.
 
 This forces to overflows in fixed sized memory mapped buffer.
 If we remove memory mapped buffer or will allow to have overflows (and
 thus skipped entries) keven can easily scale to that limits (tested with
 xx.xxx events though).
 
  - In order to avoid touching the whole ring buffer, it might be good to be 
  able to reset the indexes to the beginning when ring buffer is empty. (So 
if 
  the user land is responsive enough to consume events, only first pages of 
the 
  mapping would be used : that saves L1/L2 cpu caches)
 
 And what happens when there are 3 empty at the beginning and \we need to
 put there 4 ready events?

Couldn't there be 3 areas in the mmap buffer:

- Unused: entries that the kernel can alloc from.
- Alloced: entries alloced by kernel but not yet used by user. Kernel can 
update these if new events requires that.
- Consumed: entries that the user are processing.

The user takes a set of alloced entries and make them consumed. Then it 
processes the events after which it makes them unused. 

If there are no unused entries and the kernel needs some, it has wait for free 
entries. The user has to notify when unused entries becomes available. It 
could set a flag in the mmap'ed area to avoid unnessesary wakeups.

The are some details with indexing and wakeup notification that I have left 
out, but I hope my idea is clear. I could give a more detailed description if 
requested. Also, I'm a user-level programmer so I might not get the whole 
picture.

Hans Henrik Happe
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-05 Thread Evgeniy Polyakov
On Thu, Oct 05, 2006 at 04:01:19PM +0200, Hans Henrik Happe ([EMAIL PROTECTED]) 
wrote:
  And what happens when there are 3 empty at the beginning and \we need to
  put there 4 ready events?
 
 Couldn't there be 3 areas in the mmap buffer:
 
 - Unused: entries that the kernel can alloc from.
 - Alloced: entries alloced by kernel but not yet used by user. Kernel can 
 update these if new events requires that.
 - Consumed: entries that the user are processing.
 
 The user takes a set of alloced entries and make them consumed. Then it 
 processes the events after which it makes them unused. 
 
 If there are no unused entries and the kernel needs some, it has wait for 
 free 
 entries. The user has to notify when unused entries becomes available. It 
 could set a flag in the mmap'ed area to avoid unnessesary wakeups.
 
 The are some details with indexing and wakeup notification that I have left 
 out, but I hope my idea is clear. I could give a more detailed description if 
 requested. Also, I'm a user-level programmer so I might not get the whole 
 picture.

This looks good on a picture, but how can you put it into page-based
storage without major and complex shared structures, which should be
properly locked between kernelspace and userspace?

 Hans Henrik Happe

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-04 Thread Ulrich Drepper

On 9/20/06, Evgeniy Polyakov [EMAIL PROTECTED] wrote:

This patch includes core kevent files:
[...]


I tried to look at the example programs before and failed.  I tried
again.  Where can I find up-to-date example code?

Some other points:

- I really would prefer not to rush all this into the upstream kernel.
The main problem is that the ring buffer interface is a shared data
structure.  These are always tricky.  We need to find the right
combination between size (as small as possible) and supporting all the
interfaces.

- so far only the timer and aio notification is speced out.  What
about the rest?  Are we sure all aspects can be expressed?  I am not
yet.

- we need an interface to add an event from userlevel.  I.e., we need
to be able to synthesize events.  There are events (like, for instance
the async DNS functionality) which come from userlevel code.

I would very much prefer we look at the other events before setting
the data structures in stone.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-04 Thread Evgeniy Polyakov
On Tue, Oct 03, 2006 at 11:34:02PM -0700, Ulrich Drepper ([EMAIL PROTECTED]) 
wrote:
 On 9/20/06, Evgeniy Polyakov [EMAIL PROTECTED] wrote:
 This patch includes core kevent files:
 [...]
 
 I tried to look at the example programs before and failed.  I tried
 again.  Where can I find up-to-date example code?

http://tservice.net.ru/~s0mbre/archive/kevent/evserver_kevent.c
http://tservice.net.ru/~s0mbre/archive/kevent/evtest.c

Structures were not changed from the beginning of kevent project.

 Some other points:
 
 - I really would prefer not to rush all this into the upstream kernel.
 The main problem is that the ring buffer interface is a shared data
 structure.  These are always tricky.  We need to find the right
 combination between size (as small as possible) and supporting all the
 interfaces.

mmap interface itself is in question, since it allows to create dos
since there are no rlimits for pinned memory.

 - so far only the timer and aio notification is speced out.  What
 about the rest?  Are we sure all aspects can be expressed?  I am not
 yet.

AIO was removed from patchset by request of Cristoph.
Timers, network AIO, fs AIO, socket nortifications and poll/select
events work well with existing structures.

 - we need an interface to add an event from userlevel.  I.e., we need
 to be able to synthesize events.  There are events (like, for instance
 the async DNS functionality) which come from userlevel code.
 
 I would very much prefer we look at the other events before setting
 the data structures in stone.

Signals and userspace events (hello solaris) easily fits into existing
structures.

It is even possible to create variable sized kevents - each kevent
contain pointer to user's data, which can be considered as pointer to
additional area (it's size kernel implementation for given kevent type
can determine from other parameters or use predefined one and fetch
additional data in -enqueue() callback).

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take19 1/4] kevent: Core files.

2006-10-04 Thread Ulrich Drepper

On 10/3/06, Evgeniy Polyakov [EMAIL PROTECTED] wrote:

http://tservice.net.ru/~s0mbre/archive/kevent/evserver_kevent.c
http://tservice.net.ru/~s0mbre/archive/kevent/evtest.c


These are simple programs which by themselves have problems.  For
instance, I consider a very bad idea to hardcode the size of the ring
buffer.  Specifying macros in the header file counts as hardcoding.
Systems grow over time and so will the demand of connections.  I have
no problem with the kernel hardcoding the value internally (or having
a /proc entry to select it) but programs should be able to dynamically
learn about the value so they don't have to be recompiled.

But more problematic is that I don't see how the interfaces can be
efficiently used in multi-threaded (or multi-process) programs.  How
would multiple threads using the same kevent queue and running in the
same kevent_get_events() loop work out?  How do they guarantee that
each request is only handled once?


From what I see now this means a second data structure is needed to

keep track of the state of each entry.  But even then, how do we even
recognized used ring buffer entries?

For instance, assume two threads.  Both call get_events, one event is
reported, both threads are woken up (which is another thing to
consider, more later).  One thread uses ring buffer entry, the other
goes back to sleep in get_events.  Now, how does the kernel know when
the other thread is done working on the ring buffer entry?  There
might be lots of entries coming in overflowing the entire buffer.
Heck, you don't even need two threads for this scenario.

When I was thinking about this (and discussing it in Ottawa) I was
always assuming that we have a status field in the ring buffer entry
which lets the userlevel code indicate whether the entry is free again
or not.  This requires a writable mapping, yes, and potentially causes
cache line ping-pong.  I think Zach mentioned he has some ideas about
this.


As for the multiple thread wakeup, I mentioned this before.  We have
to avoid the trampling herd problem.  We cannot wakeup all waiters.
But we also cannot assume that, without protocols, waking up just one
for each available entry is sufficient.  So the first question is:
what is the current policy?



AIO was removed from patchset by request of Cristoph.
Timers, network AIO, fs AIO, socket nortifications and poll/select
events work well with existing structures.


Well, excuse me if I don't take your word for it.  I agree, the AIO
code should not be submitted along with this.  The same for any other
code using the event handling.  But we need to check whether the
interface is generic enough to accomodate them in a way which actually
makes sense.  Again, think highly threaded processes or multiple
processes sharing the same event queue.



It is even possible to create variable sized kevents - each kevent
contain pointer to user's data, which can be considered as pointer to
additional area (it's size kernel implementation for given kevent type
can determine from other parameters or use predefined one and fetch
additional data in -enqueue() callback).


That sounds interesting and certainly helps with securing the
interface for the future.  But if there is anything we can do to avoid
unnecessary costs we should do it, even if this means investigation
all this further.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[take19 1/4] kevent: Core files.

2006-09-20 Thread Evgeniy Polyakov

Core files.

This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialization
 - notification state machines

Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED]

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..c10698e 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,6 @@ ENTRY(sys_call_table)
.long sys_tee   /* 315 */
.long sys_vmsplice
.long sys_move_pages
+   .long sys_kevent_get_events
+   .long sys_kevent_ctl
+   .long sys_kevent_wait   /* 320 */
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..a06b76f 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -710,7 +710,10 @@ #endif
.quad compat_sys_get_robust_list
.quad sys_splice
.quad sys_sync_file_range
-   .quad sys_tee
+   .quad sys_tee   /* 315 */
.quad compat_sys_vmsplice
.quad compat_sys_move_pages
+   .quad sys_kevent_get_events
+   .quad sys_kevent_ctl
+   .quad sys_kevent_wait   /* 320 */
 ia32_syscall_end:  
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..68072b5 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,13 @@ #define __NR_sync_file_range  314
 #define __NR_tee   315
 #define __NR_vmsplice  316
 #define __NR_move_pages317
+#define __NR_kevent_get_events 318
+#define __NR_kevent_ctl319
+#define __NR_kevent_wait   320
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 321
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..ee907ad 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,16 @@ #define __NR_vmsplice 278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events 280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
+#define __NR_kevent_wait   282
+__SYSCALL(__NR_kevent_wait, sys_kevent_wait)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_wait
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 000..24ced10
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,195 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED]
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+#include linux/types.h
+#include linux/list.h
+#include linux/rbtree.h
+#include linux/spinlock.h
+#include linux/mutex.h
+#include linux/wait.h
+#include linux/net.h
+#include linux/rcupdate.h
+#include linux/kevent_storage.h
+#include linux/ukevent.h
+
+#define KEVENT_MIN_BUFFS_ALLOC 3
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+
+struct kevent_callbacks {
+   kevent_callback_t   callback, enqueue, dequeue;
+};
+
+#define KEVENT_READY   0x1
+#define KEVENT_STORAGE 0x2
+#define KEVENT_USER0x4
+
+struct kevent
+{
+   /* Used for kevent freeing.*/
+   struct rcu_head rcu_head;
+   struct ukevent  event;
+   /* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+   spinlock_t  ulock;
+
+   /* Entry of user's tree. */
+   struct rb_node  kevent_node;
+   /* Entry of origin's queue. */
+   struct list_headstorage_entry;
+   /* Entry of user's ready. */
+   struct list_headready_entry;
+
+   u32 flags;
+
+   /* User who requested this kevent. */
+   

[take18 1/4] kevent: Core files.

2006-09-12 Thread Evgeniy Polyakov

Core files.

This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialization
 - notification state machines

Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED]

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..c10698e 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,6 @@ ENTRY(sys_call_table)
.long sys_tee   /* 315 */
.long sys_vmsplice
.long sys_move_pages
+   .long sys_kevent_get_events
+   .long sys_kevent_ctl
+   .long sys_kevent_wait   /* 320 */
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..a06b76f 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -710,7 +710,10 @@ #endif
.quad compat_sys_get_robust_list
.quad sys_splice
.quad sys_sync_file_range
-   .quad sys_tee
+   .quad sys_tee   /* 315 */
.quad compat_sys_vmsplice
.quad compat_sys_move_pages
+   .quad sys_kevent_get_events
+   .quad sys_kevent_ctl
+   .quad sys_kevent_wait   /* 320 */
 ia32_syscall_end:  
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..68072b5 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,13 @@ #define __NR_sync_file_range  314
 #define __NR_tee   315
 #define __NR_vmsplice  316
 #define __NR_move_pages317
+#define __NR_kevent_get_events 318
+#define __NR_kevent_ctl319
+#define __NR_kevent_wait   320
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 321
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..ee907ad 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,16 @@ #define __NR_vmsplice 278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events 280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
+#define __NR_kevent_wait   282
+__SYSCALL(__NR_kevent_wait, sys_kevent_wait)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_wait
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 000..867820b
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,195 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED]
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+#include linux/types.h
+#include linux/list.h
+#include linux/rbtree.h
+#include linux/spinlock.h
+#include linux/mutex.h
+#include linux/wait.h
+#include linux/net.h
+#include linux/rcupdate.h
+#include linux/kevent_storage.h
+#include linux/ukevent.h
+
+#define KEVENT_MIN_BUFFS_ALLOC 3
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+
+struct kevent_callbacks {
+   kevent_callback_t   callback, enqueue, dequeue;
+};
+
+#define KEVENT_READY   0x1
+#define KEVENT_STORAGE 0x2
+#define KEVENT_USER0x4
+
+struct kevent
+{
+   /* Used for kevent freeing.*/
+   struct rcu_head rcu_head;
+   struct ukevent  event;
+   /* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+   spinlock_t  ulock;
+
+   /* Entry of user's tree. */
+   struct rb_node  kevent_node;
+   /* Entry of origin's queue. */
+   struct list_headstorage_entry;
+   /* Entry of user's ready. */
+   struct list_headready_entry;
+
+   u32 flags;
+
+   /* User who requested this kevent. */
+   

Re: [take17 1/4] kevent: Core files.

2006-09-08 Thread shawvrana
I stand corrected.

On Thursday 07 September 2006 23:38, Evgeniy Polyakov wrote:
 On Thu, Sep 07, 2006 at 09:05:16PM -0700, [EMAIL PROTECTED] ([EMAIL 
 PROTECTED]) 
wrote:
   +static int __devinit kevent_user_init(void)
   +{
   + int err = 0;
   +
   + kevent_cache = kmem_cache_create(kevent_cache,
   + sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
   +
   + err = misc_register(kevent_miscdev);
   + if (err) {
   + printk(KERN_ERR Failed to register kevent miscdev: err=%d.\n,
   err); +   goto err_out_exit;
   + }
   +
   + printk(KEVENT subsystem has been successfully registered.\n);
   +
   + return 0;
   +
   +err_out_exit:
   + kmem_cache_destroy(kevent_cache);
   + return err;
   +}
 
  It's probably best to treat kmem_cache_create like a black box and check
  for it returning null.

 It can not return NULL, it will panic instead since I use SLAB_PANIC
 flag.

  Thanks,
  Shaw
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take17 1/4] kevent: Core files.

2006-09-08 Thread Evgeniy Polyakov
On Thu, Sep 07, 2006 at 09:05:16PM -0700, [EMAIL PROTECTED] ([EMAIL PROTECTED]) 
wrote:
  +static int __devinit kevent_user_init(void)
  +{
  +   int err = 0;
  +
  +   kevent_cache = kmem_cache_create(kevent_cache,
  +   sizeof(struct kevent), 0, SLAB_PANIC, NULL, NULL);
  +
  +   err = misc_register(kevent_miscdev);
  +   if (err) {
  +   printk(KERN_ERR Failed to register kevent miscdev: err=%d.\n, 
  err);
  +   goto err_out_exit;
  +   }
  +
  +   printk(KEVENT subsystem has been successfully registered.\n);
  +
  +   return 0;
  +
  +err_out_exit:
  +   kmem_cache_destroy(kevent_cache);
  +   return err;
  +}
 
 It's probably best to treat kmem_cache_create like a black box and check for 
 it returning null.

It can not return NULL, it will panic instead since I use SLAB_PANIC
flag.

 Thanks,
 Shaw

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[take17 1/4] kevent: Core files.

2006-09-07 Thread Evgeniy Polyakov

Core files.

This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialization
 - notification state machines

Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED]

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..c10698e 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,6 @@ ENTRY(sys_call_table)
.long sys_tee   /* 315 */
.long sys_vmsplice
.long sys_move_pages
+   .long sys_kevent_get_events
+   .long sys_kevent_ctl
+   .long sys_kevent_wait   /* 320 */
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..a06b76f 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -710,7 +710,10 @@ #endif
.quad compat_sys_get_robust_list
.quad sys_splice
.quad sys_sync_file_range
-   .quad sys_tee
+   .quad sys_tee   /* 315 */
.quad compat_sys_vmsplice
.quad compat_sys_move_pages
+   .quad sys_kevent_get_events
+   .quad sys_kevent_ctl
+   .quad sys_kevent_wait   /* 320 */
 ia32_syscall_end:  
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..68072b5 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,13 @@ #define __NR_sync_file_range  314
 #define __NR_tee   315
 #define __NR_vmsplice  316
 #define __NR_move_pages317
+#define __NR_kevent_get_events 318
+#define __NR_kevent_ctl319
+#define __NR_kevent_wait   320
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 321
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..ee907ad 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,16 @@ #define __NR_vmsplice 278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events 280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
+#define __NR_kevent_wait   282
+__SYSCALL(__NR_kevent_wait, sys_kevent_wait)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_wait
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 000..67007f2
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,196 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED]
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+#include linux/types.h
+#include linux/list.h
+#include linux/spinlock.h
+#include linux/mutex.h
+#include linux/wait.h
+#include linux/net.h
+#include linux/rcupdate.h
+#include linux/kevent_storage.h
+#include linux/ukevent.h
+
+#define KEVENT_MIN_BUFFS_ALLOC 3
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+
+struct kevent_callbacks {
+   kevent_callback_t   callback, enqueue, dequeue;
+};
+
+#define KEVENT_READY   0x1
+#define KEVENT_STORAGE 0x2
+#define KEVENT_USER0x4
+
+struct kevent
+{
+   /* Used for kevent freeing.*/
+   struct rcu_head rcu_head;
+   struct ukevent  event;
+   /* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+   spinlock_t  ulock;
+
+   /* Entry of user's queue. */
+   struct list_headkevent_entry;
+   /* Entry of origin's queue. */
+   struct list_headstorage_entry;
+   /* Entry of user's ready. */
+   struct list_headready_entry;
+
+   u32 flags;
+
+   /* User who requested this kevent. */
+   struct kevent_user  

Re: [take16 1/4] kevent: Core files.

2006-09-07 Thread Evgeniy Polyakov
On Wed, Sep 06, 2006 at 09:23:56AM -0500, Chase Venters ([EMAIL PROTECTED]) 
wrote:
 On Wed, 6 Sep 2006, Evgeniy Polyakov wrote:
 +struct kevent_user
 +{
 
 These structure names get a little dicey (kevent, kevent_user, ukevent,
 mukevent)... might there be slightly different names that could be
 selected to better distinguish the purpose of each?
 
 Like what?
 ukevent means userspace_kevent, but ukevent is much smaller.
 mukevent is mapped userspace kevent, mukevent is again much smaller.
 
 
 Hmm, well, kevent_user and ukevent are perhaps the only ones I'm concerned 
 about. What about calling kevent_user a kevent_queue, kevent_fd or 
 kevent_set?

kevent_user is kernel side representation of, guess what? Yes, kevent
user :)

 I decided to use queue length for mmaped buffer, using size of the
 mmapped buffer as queue length is possible too.
 But in any case it is very broken behaviour to introduce any kind of
 overflow and special marking for that - rt signals already have it, no
 need to create additional headache.
 
 
 Hmm. The concern here is pinned memory, is it not? I'm trying to think of 
 the best way to avoid compile-time limits. select() has a rather 
 (infamous) compile-time limit of 1,024 thanks to libc (and thanks to the 
 bit vector, a glass ceiling). Now, you'd be a fool to use select() on that 
 many
 fd's in modern code meant to run on modern UNIXes. But kevent is a new 
 system, the grand unified event loop all of us userspace programmers have 
 been begging for since many years ago. Glass ceilings tend to hurt when 
 you run into them :)
 
 Using the size of the memory mapped buffer as queue length sounds like a 
 sane simplification.

Pinned memory is not the _main_ issue in a real world application - only
if it is some kind of a DoS or really broken behaviour where tons of
event queues are going to be created (like many epoll control
descriptors).
Memory mapped buffer actually can even not exist, if application is not
going to use mmap interface.

 +static int kevent_user_ring_init(struct kevent_user *u)
 +{
 +  int pnum;
 +
 +  pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) +
 sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
 
 This calculation works with the current constants, but it comes up a page
 short if, say, KEVENT_MAX_EVENTS were 4095. It also looks incorrect
 visually since the 'sizeof(unsigned int)' is only factored in once (rather
 than once per page). I suggest a static / inline __max_kevent_pages()
 function that either does:
 
 return KEVENT_MAX_EVENTS / KEVENTS_ON_PAGE + 1;
 
 or
 
 int pnum = KEVENT_MAX_EVENTS / KEVENTS_ON_PAGE;
 if (KEVENT_MAX_EVENTS % KEVENTS_ON_PAGE)
 pnum++;
 return pnum;
 
 Both should be optimized away by the compiler and will give correct
 answers regardless of the constant values.
 
 Above pnum calculation aligns number of mukevents to pages size with
 appropriate check for (unsigned int), although it is not stated in that
 comment (more clear commant can be found around KEVENTS_ON_PAGE).
 You propose esentially the same calcualtion in the seconds case, while
 first one requires additional page in some cases.
 
 
 You are right about my first suggestion sometimes coming up a page extra. 
 What I'm worried about is that the current ALIGN() based calculation comes 
 up a page short if KEVENT_MAX_EVENTS is certain values (say 4095). This is 
 because the unsigned int index is inside kevent_mring for every page, 
 though the ALIGN() calculation just factors in room for one of them. In 
 these boundary cases (KEVENT_MAX_EVENTS == 4095), your calculation thinks 
 it can fit one last mukevent on a page because it didn't factor in room 
 for unsigned int index at the start of every page; rather just for one 
 page. In this case, the modulus should always come up non-zero, giving us 
 the extra required page.

Comment about KEVENTS_ON_PAGE celarly says what must be taken into
account when size is calculated, but you are right, I should use there
better macros, which should take sizeof(struct kevent_mring).
I will update it.

 It is unused, but I'm still waiting on comments if we need
 kevent_get_events() at all - some people wanted to completely eliminate
 that function in favour of total mmap domination.
 
 
 Interesting idea. It would certainly simplify the interface.

Only for those who really wants to use additional mmap interface.

 
 I have no strong opinion on how to behave in this situation.
 kevent can panic, can free cache, can go to infinite loop or screw up
 the hard drive. Everything is (almost) the same.
 
 
 Obviously it's not a huge deal :)
 
 If kevent is to screw up the hard drive, though, we must put in an 
 exception for it to avoid my music directory.

Care to send a patch for kernel command line? :)


-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take15 1/4] kevent: Core files.

2006-09-06 Thread Evgeniy Polyakov
On Tue, Sep 05, 2006 at 03:28:17PM +0200, Arnd Bergmann ([EMAIL PROTECTED]) 
wrote:
 On Monday 04 September 2006 12:14, Evgeniy Polyakov wrote:
 
  +asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr,
  unsigned int max_nr, __u64 timeout, void __user *buf,
  unsigned flags) 
  +asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num,
  void __user *arg) 
 
 'void __user *arg' in both of these always points to a struct ukevent,
 according to your documentation. Shouldn't it be a 
 'struct ukevent __user *arg' then?

Yep. I will update it in the next patchset.
Thank you.

   Arnd 

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take16 1/4] kevent: Core files.

2006-09-06 Thread Chase Venters

Evgeniy,
	Sorry about the radio silence later. Some reviewer commentary 
follows.


On Wed, 6 Sep 2006, Evgeniy Polyakov wrote:



Core files.

This patch includes core kevent files:
- userspace controlling
- kernelspace interfaces
- initialization
- notification state machines

Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED]

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..c10698e 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,6 @@ ENTRY(sys_call_table)
.long sys_tee   /* 315 */
.long sys_vmsplice
.long sys_move_pages
+   .long sys_kevent_get_events
+   .long sys_kevent_ctl
+   .long sys_kevent_wait   /* 320 */
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..a06b76f 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -710,7 +710,10 @@ #endif
.quad compat_sys_get_robust_list
.quad sys_splice
.quad sys_sync_file_range
-   .quad sys_tee
+   .quad sys_tee   /* 315 */
.quad compat_sys_vmsplice
.quad compat_sys_move_pages
+   .quad sys_kevent_get_events
+   .quad sys_kevent_ctl
+   .quad sys_kevent_wait   /* 320 */
ia32_syscall_end:
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..68072b5 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,13 @@ #define __NR_sync_file_range  314
#define __NR_tee315
#define __NR_vmsplice   316
#define __NR_move_pages 317
+#define __NR_kevent_get_events 318
+#define __NR_kevent_ctl319
+#define __NR_kevent_wait   320

#ifdef __KERNEL__

-#define NR_syscalls 318
+#define NR_syscalls 321

/*
 * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..ee907ad 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,16 @@ #define __NR_vmsplice 278
__SYSCALL(__NR_vmsplice, sys_vmsplice)
#define __NR_move_pages 279
__SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events 280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
+#define __NR_kevent_wait   282
+__SYSCALL(__NR_kevent_wait, sys_kevent_wait)

#ifdef __KERNEL__

-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_wait

#ifndef __NO_STUBS

diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 000..67007f2
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,196 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED]
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+#include linux/types.h
+#include linux/list.h
+#include linux/spinlock.h
+#include linux/mutex.h
+#include linux/wait.h
+#include linux/net.h
+#include linux/rcupdate.h
+#include linux/kevent_storage.h
+#include linux/ukevent.h
+
+#define KEVENT_MIN_BUFFS_ALLOC 3
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+
+struct kevent_callbacks {
+   kevent_callback_t   callback, enqueue, dequeue;
+};
+
+#define KEVENT_READY   0x1
+#define KEVENT_STORAGE 0x2
+#define KEVENT_USER0x4
+
+struct kevent
+{
+   /* Used for kevent freeing.*/
+   struct rcu_head rcu_head;
+   struct ukevent  event;
+   /* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+   spinlock_t  ulock;
+
+   /* Entry of user's queue. */
+   struct list_headkevent_entry;
+   /* Entry of origin's queue. */
+   struct list_headstorage_entry;
+   /* Entry of user's ready. */
+   struct list_headready_entry;
+
+   u32 

Re: [take16 1/4] kevent: Core files.

2006-09-06 Thread Chase Venters

On Wed, 6 Sep 2006, Chase Venters wrote:


 +  if (start + num = KEVENT_MAX_EVENTS ||
 +  start = KEVENT_MAX_EVENTS ||
 +  num = KEVENT_MAX_EVENTS)


Since start and num are unsigned, the last two checks are redundant. If start 
or num is individually = KEVENT_MAX_EVENTS, start + num must be.




Actually, my early-morning brain code optimizer is apparently broken, 
because it forgot all about integer wraparound. Disregard please.




Thanks,
Chase


-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take16 1/4] kevent: Core files.

2006-09-06 Thread Evgeniy Polyakov
On Wed, Sep 06, 2006 at 08:40:21AM -0500, Chase Venters ([EMAIL PROTECTED]) 
wrote:
 Evgeniy,
   Sorry about the radio silence later. Some reviewer commentary 
 follows.


 +struct kevent
 +{
 +/* Used for kevent freeing.*/
 +struct rcu_head rcu_head;
 +struct ukevent  event;
 +/* This lock protects ukevent manipulations, e.g. ret_flags changes. 
 */
 +spinlock_t  ulock;
 +
 +/* Entry of user's queue. */
 +struct list_headkevent_entry;
 +/* Entry of origin's queue. */
 +struct list_headstorage_entry;
 +/* Entry of user's ready. */
 +struct list_headready_entry;
 +
 +u32 flags;
 +
 +/* User who requested this kevent. */
 +struct kevent_user  *user;
 +/* Kevent container. */
 +struct kevent_storage   *st;
 +
 +struct kevent_callbacks callbacks;
 +
 +/* Private data for different storages.
 + * poll()/select storage has a list of wait_queue_t containers
 + * for each -poll() { poll_wait()' } here.
 + */
 +void*priv;
 +};
 +
 +#define KEVENT_HASH_MASK0xff
 +
 +struct kevent_user
 +{
 
 These structure names get a little dicey (kevent, kevent_user, ukevent, 
 mukevent)... might there be slightly different names that could be 
 selected to better distinguish the purpose of each?

Like what?
ukevent means userspace_kevent, but ukevent is much smaller.
mukevent is mapped userspace kevent, mukevent is again much smaller.

 +struct list_headkevent_list[KEVENT_HASH_MASK+1];
 +spinlock_t  kevent_lock;
 +/* Number of queued kevents. */
 +unsigned intkevent_num;
 +
 +/* List of ready kevents. */
 +struct list_headready_list;
 +/* Number of ready kevents. */
 +unsigned intready_num;
 +/* Protects all manipulations with ready queue. */
 +spinlock_t  ready_lock;
 +
 +/* Protects against simultaneous kevent_user control manipulations. 
 */
 +struct mutexctl_mutex;
 +/* Wait until some events are ready. */
 +wait_queue_head_t   wait;
 +
 +/* Reference counter, increased for each new kevent. */
 +atomic_trefcnt;
 +
 +unsigned intpages_in_use;
 +/* Array of pages forming mapped ring buffer */
 +struct kevent_mring **pring;
 +
 +#ifdef CONFIG_KEVENT_USER_STAT
 +unsigned long   im_num;
 +unsigned long   wait_num;
 +unsigned long   total;
 +#endif
 +};
 +#define KEVENT_MAX_EVENTS   4096
 +
 
 This limit governs how many simultaneous kevents you can be waiting on / 
 for at once, correct? Would it be possible to drop the hard limit and 
 limit instead, say, the maximum number of kevents you can have pending in 
 the mmap ring-buffer? After the number is exceeded, additional events 
 could get dropped, or some magic number could be put in the 
 kevent_mring-index field to let the process know that it must hit another 
 syscall to drain the rest of the events.

I decided to use queue length for mmaped buffer, using size of the
mmapped buffer as queue length is possible too.
But in any case it is very broken behaviour to introduce any kind of
overflow and special marking for that - rt signals already have it, no
need to create additional headache.


 +static struct kevent_callbacks kevent_registered_callbacks[KEVENT_MAX];
 
 __read_mostly?

Yep, I was told already that some structures can be marked as such.
Such change is not 100% requirement though.

 +
 +int kevent_add_callbacks(const struct kevent_callbacks *cb, int pos)
 +{
 +struct kevent_callbacks *p;
 +
 +if (pos = KEVENT_MAX)
 +return -EINVAL;
 +
 +p = kevent_registered_callbacks[pos];
 +
 +p-enqueue = (cb-enqueue) ? cb-enqueue : kevent_break;
 +p-dequeue = (cb-dequeue) ? cb-dequeue : kevent_break;
 +p-callback = (cb-callback) ? cb-callback : kevent_break;
 
 Curious... why are these callbacks copied, rather than just retaining a 
 pointer to a const/static ops structure?

It simplifies callers of that callbacks to just call a function instead
of dereferencing and check for various pointers.

 +
 +printk(KERN_INFO KEVENT: Added callbacks for type %d.\n, pos);
 
 Is this printk() chatter necessary?

As any other information printk in kernel it is not neccessary, but it
allows user to know which kevent kernel users are enabled.

 +static char kevent_name[] = kevent;
 
 const?

Yep.

 +/*
 + * Initialize mmap ring buffer.
 + * It will store ready kevents, so userspace could get them directly 
 instead
 + * of using syscall. Esentially syscall becomes just a waiting point.
 + */
 +static int kevent_user_ring_init(struct kevent_user *u)
 +{
 +int pnum;
 +
 +pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) + 
 sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;
 
 This calculation works with the current constants, but it comes 

Re: [take16 1/4] kevent: Core files.

2006-09-06 Thread Chase Venters

On Wed, 6 Sep 2006, Evgeniy Polyakov wrote:


+
+struct kevent_user
+{


These structure names get a little dicey (kevent, kevent_user, ukevent,
mukevent)... might there be slightly different names that could be
selected to better distinguish the purpose of each?


Like what?
ukevent means userspace_kevent, but ukevent is much smaller.
mukevent is mapped userspace kevent, mukevent is again much smaller.



Hmm, well, kevent_user and ukevent are perhaps the only ones I'm concerned 
about. What about calling kevent_user a kevent_queue, kevent_fd or 
kevent_set?




I decided to use queue length for mmaped buffer, using size of the
mmapped buffer as queue length is possible too.
But in any case it is very broken behaviour to introduce any kind of
overflow and special marking for that - rt signals already have it, no
need to create additional headache.



Hmm. The concern here is pinned memory, is it not? I'm trying to think of 
the best way to avoid compile-time limits. select() has a rather 
(infamous) compile-time limit of 1,024 thanks to libc (and thanks to the 
bit vector, a glass ceiling). Now, you'd be a fool to use select() on that many
fd's in modern code meant to run on modern UNIXes. But kevent is a new 
system, the grand unified event loop all of us userspace programmers have 
been begging for since many years ago. Glass ceilings tend to hurt when 
you run into them :)


Using the size of the memory mapped buffer as queue length sounds like a 
sane simplification.



+static int kevent_user_ring_init(struct kevent_user *u)
+{
+   int pnum;
+
+   pnum = ALIGN(KEVENT_MAX_EVENTS*sizeof(struct mukevent) +
sizeof(unsigned int), PAGE_SIZE)/PAGE_SIZE;


This calculation works with the current constants, but it comes up a page
short if, say, KEVENT_MAX_EVENTS were 4095. It also looks incorrect
visually since the 'sizeof(unsigned int)' is only factored in once (rather
than once per page). I suggest a static / inline __max_kevent_pages()
function that either does:

return KEVENT_MAX_EVENTS / KEVENTS_ON_PAGE + 1;

or

int pnum = KEVENT_MAX_EVENTS / KEVENTS_ON_PAGE;
if (KEVENT_MAX_EVENTS % KEVENTS_ON_PAGE)
pnum++;
return pnum;

Both should be optimized away by the compiler and will give correct
answers regardless of the constant values.


Above pnum calculation aligns number of mukevents to pages size with
appropriate check for (unsigned int), although it is not stated in that
comment (more clear commant can be found around KEVENTS_ON_PAGE).
You propose esentially the same calcualtion in the seconds case, while
first one requires additional page in some cases.



You are right about my first suggestion sometimes coming up a page extra. 
What I'm worried about is that the current ALIGN() based calculation comes 
up a page short if KEVENT_MAX_EVENTS is certain values (say 4095). This is 
because the unsigned int index is inside kevent_mring for every page, 
though the ALIGN() calculation just factors in room for one of them. In 
these boundary cases (KEVENT_MAX_EVENTS == 4095), your calculation thinks 
it can fit one last mukevent on a page because it didn't factor in room 
for unsigned int index at the start of every page; rather just for one 
page. In this case, the modulus should always come up non-zero, giving us 
the extra required page.




It is unused, but I'm still waiting on comments if we need
kevent_get_events() at all - some people wanted to completely eliminate
that function in favour of total mmap domination.



Interesting idea. It would certainly simplify the interface.



I have no strong opinion on how to behave in this situation.
kevent can panic, can free cache, can go to infinite loop or screw up
the hard drive. Everything is (almost) the same.



Obviously it's not a huge deal :)

If kevent is to screw up the hard drive, though, we must put in an 
exception for it to avoid my music directory.



Looking pretty good. This is my first pass of comments, and I'll probably
have questions that follow, but I'm trying to get a really good picture of
what is going on here for documentation purposes.


Thank you, Chase.
I will definitely get your comments into account and change related
bits.


Thanks again!
Chase
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[take16 1/4] kevent: Core files.

2006-09-06 Thread Evgeniy Polyakov

Core files.

This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialization
 - notification state machines

Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED]

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..c10698e 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,6 @@ ENTRY(sys_call_table)
.long sys_tee   /* 315 */
.long sys_vmsplice
.long sys_move_pages
+   .long sys_kevent_get_events
+   .long sys_kevent_ctl
+   .long sys_kevent_wait   /* 320 */
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..a06b76f 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -710,7 +710,10 @@ #endif
.quad compat_sys_get_robust_list
.quad sys_splice
.quad sys_sync_file_range
-   .quad sys_tee
+   .quad sys_tee   /* 315 */
.quad compat_sys_vmsplice
.quad compat_sys_move_pages
+   .quad sys_kevent_get_events
+   .quad sys_kevent_ctl
+   .quad sys_kevent_wait   /* 320 */
 ia32_syscall_end:  
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..68072b5 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,13 @@ #define __NR_sync_file_range  314
 #define __NR_tee   315
 #define __NR_vmsplice  316
 #define __NR_move_pages317
+#define __NR_kevent_get_events 318
+#define __NR_kevent_ctl319
+#define __NR_kevent_wait   320
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 321
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..ee907ad 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,16 @@ #define __NR_vmsplice 278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events 280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
+#define __NR_kevent_wait   282
+__SYSCALL(__NR_kevent_wait, sys_kevent_wait)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_wait
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 000..67007f2
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,196 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED]
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+#include linux/types.h
+#include linux/list.h
+#include linux/spinlock.h
+#include linux/mutex.h
+#include linux/wait.h
+#include linux/net.h
+#include linux/rcupdate.h
+#include linux/kevent_storage.h
+#include linux/ukevent.h
+
+#define KEVENT_MIN_BUFFS_ALLOC 3
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+
+struct kevent_callbacks {
+   kevent_callback_t   callback, enqueue, dequeue;
+};
+
+#define KEVENT_READY   0x1
+#define KEVENT_STORAGE 0x2
+#define KEVENT_USER0x4
+
+struct kevent
+{
+   /* Used for kevent freeing.*/
+   struct rcu_head rcu_head;
+   struct ukevent  event;
+   /* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+   spinlock_t  ulock;
+
+   /* Entry of user's queue. */
+   struct list_headkevent_entry;
+   /* Entry of origin's queue. */
+   struct list_headstorage_entry;
+   /* Entry of user's ready. */
+   struct list_headready_entry;
+
+   u32 flags;
+
+   /* User who requested this kevent. */
+   struct kevent_user  

Re: [take15 1/4] kevent: Core files.

2006-09-05 Thread Arnd Bergmann
On Monday 04 September 2006 12:14, Evgeniy Polyakov wrote:

 +asmlinkage long sys_kevent_get_events(int ctl_fd, unsigned int min_nr,
   unsigned int max_nr, __u64 timeout, void __user *buf,
   unsigned flags) 
 +asmlinkage long sys_kevent_ctl(int fd, unsigned int cmd, unsigned int num,
   void __user *arg) 

'void __user *arg' in both of these always points to a struct ukevent,
according to your documentation. Shouldn't it be a 
'struct ukevent __user *arg' then?

Arnd 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[take15 1/4] kevent: Core files.

2006-09-04 Thread Evgeniy Polyakov

Core files.

This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialization
 - notification state machines

Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED]

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..c10698e 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,6 @@ ENTRY(sys_call_table)
.long sys_tee   /* 315 */
.long sys_vmsplice
.long sys_move_pages
+   .long sys_kevent_get_events
+   .long sys_kevent_ctl
+   .long sys_kevent_wait   /* 320 */
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..a06b76f 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -710,7 +710,10 @@ #endif
.quad compat_sys_get_robust_list
.quad sys_splice
.quad sys_sync_file_range
-   .quad sys_tee
+   .quad sys_tee   /* 315 */
.quad compat_sys_vmsplice
.quad compat_sys_move_pages
+   .quad sys_kevent_get_events
+   .quad sys_kevent_ctl
+   .quad sys_kevent_wait   /* 320 */
 ia32_syscall_end:  
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..68072b5 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,13 @@ #define __NR_sync_file_range  314
 #define __NR_tee   315
 #define __NR_vmsplice  316
 #define __NR_move_pages317
+#define __NR_kevent_get_events 318
+#define __NR_kevent_ctl319
+#define __NR_kevent_wait   320
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 321
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..ee907ad 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,16 @@ #define __NR_vmsplice 278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_kevent_get_events 280
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl281
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
+#define __NR_kevent_wait   282
+__SYSCALL(__NR_kevent_wait, sys_kevent_wait)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_wait
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 000..67007f2
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,196 @@
+/*
+ * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED]
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+#include linux/types.h
+#include linux/list.h
+#include linux/spinlock.h
+#include linux/mutex.h
+#include linux/wait.h
+#include linux/net.h
+#include linux/rcupdate.h
+#include linux/kevent_storage.h
+#include linux/ukevent.h
+
+#define KEVENT_MIN_BUFFS_ALLOC 3
+
+struct kevent;
+struct kevent_storage;
+typedef int (* kevent_callback_t)(struct kevent *);
+
+/* @callback is called each time new event has been caught. */
+/* @enqueue is called each time new event is queued. */
+/* @dequeue is called each time event is dequeued. */
+
+struct kevent_callbacks {
+   kevent_callback_t   callback, enqueue, dequeue;
+};
+
+#define KEVENT_READY   0x1
+#define KEVENT_STORAGE 0x2
+#define KEVENT_USER0x4
+
+struct kevent
+{
+   /* Used for kevent freeing.*/
+   struct rcu_head rcu_head;
+   struct ukevent  event;
+   /* This lock protects ukevent manipulations, e.g. ret_flags changes. */
+   spinlock_t  ulock;
+
+   /* Entry of user's queue. */
+   struct list_headkevent_entry;
+   /* Entry of origin's queue. */
+   struct list_headstorage_entry;
+   /* Entry of user's ready. */
+   struct list_headready_entry;
+
+   u32 flags;
+
+   /* User who requested this kevent. */
+   struct kevent_user  

[take5 1/4] kevent: Core files.

2006-08-08 Thread Evgeniy Polyakov

Core files.

This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialization
 - notification state machines

It might also inlclude parts from other subsystem (like network related
syscalls, so it is possible that it will not compile without other
patches applied).

Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED]

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..0af988a 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,7 @@ ENTRY(sys_call_table)
.long sys_tee   /* 315 */
.long sys_vmsplice
.long sys_move_pages
+   .long sys_aio_recv
+   .long sys_aio_send
+   .long sys_kevent_get_events
+   .long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..e157ad4 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -713,4 +713,8 @@ #endif
.quad sys_tee
.quad compat_sys_vmsplice
.quad compat_sys_move_pages
+   .quad sys_aio_recv
+   .quad sys_aio_send
+   .quad sys_kevent_get_events
+   .quad sys_kevent_ctl
 ia32_syscall_end:  
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..a76e50d 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,14 @@ #define __NR_sync_file_range  314
 #define __NR_tee   315
 #define __NR_vmsplice  316
 #define __NR_move_pages317
+#define __NR_aio_recv  318
+#define __NR_aio_send  319
+#define __NR_kevent_get_events 320
+#define __NR_kevent_ctl321
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 322
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..9a0b581 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,18 @@ #define __NR_vmsplice 278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_aio_recv  280
+__SYSCALL(__NR_aio_recv, sys_aio_recv)
+#define __NR_aio_send  281
+__SYSCALL(__NR_aio_send, sys_aio_send)
+#define __NR_kevent_get_events 282
+__SYSCALL(__NR_kevent_get_events, sys_kevent_get_events)
+#define __NR_kevent_ctl283
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_ctl
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 000..c32f3bd
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,296 @@
+/*
+ * kevent.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED]
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT 0x1 /* Process this event only once 
and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN  0x1 /* Kevent is broken. */
+#define KEVENT_RET_DONE0x2 /* Kevent processing 
was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET  0
+#define KEVENT_INODE   1
+#define KEVENT_TIMER   2
+#define KEVENT_POLL3
+#define KEVENT_NAIO4
+#define KEVENT_AIO 5
+#defineKEVENT_MAX  6
+
+/*
+ * Per-type event sets.
+ * Number of per-event sets should be exactly as number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#defineKEVENT_TIMER_FIRED  0x1
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+#defineKEVENT_SOCKET_RECV  0x1
+#defineKEVENT_SOCKET_ACCEPT0x2
+#defineKEVENT_SOCKET_SEND  0x4
+
+/*
+ * Inode events.
+ */
+#defineKEVENT_INODE_CREATE 0x1
+#defineKEVENT_INODE_REMOVE 0x2
+
+/*
+ * Poll events.
+ */
+#defineKEVENT_POLL_POLLIN  0x0001
+#define

Re: [take5 1/4] kevent: Core files.

2006-08-08 Thread Zach Brown

 +++ b/include/linux/kevent.h

 ...

 +#ifdef CONFIG_KEVENT_SOCKET
 +
 +extern struct file_operations socket_file_ops;

This doesn't build because socket_file_ops was left static in net/socket.c.

In any case, kevent.h has no business exposing socket_file_ops to users
of the kevent api just so the kevent core can test files as being backed
by sockets.  It'd be more appropriate to call into the socket layer with
the filp and let it return -EINVAL or -ESOCKNOOPT instead of trying to
do that in the kevent layer.

- z
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take5 1/4] kevent: Core files.

2006-08-08 Thread Evgeniy Polyakov
On Tue, Aug 08, 2006 at 03:02:59PM -0700, Zach Brown ([EMAIL PROTECTED]) wrote:
 
  +++ b/include/linux/kevent.h
 
  ...
 
  +#ifdef CONFIG_KEVENT_SOCKET
  +
  +extern struct file_operations socket_file_ops;
 
 This doesn't build because socket_file_ops was left static in net/socket.c.

I exported it. It just sneaked out of patchset.

 In any case, kevent.h has no business exposing socket_file_ops to users
 of the kevent api just so the kevent core can test files as being backed
 by sockets.  It'd be more appropriate to call into the socket layer with
 the filp and let it return -EINVAL or -ESOCKNOOPT instead of trying to
 do that in the kevent layer.

Ok, I will move to use some functions from socket code without exporting
socket_file_ops.

 - z

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take4 1/4] kevent: Core files.

2006-08-05 Thread Greg KH
On Sat, Aug 05, 2006 at 05:02:38PM +0400, Evgeniy Polyakov wrote:
 +static int __devinit kevent_user_init(void)
 +{
 + struct class_device *dev;
 + int err = 0;
 + 
 + err = register_filesystem(kevent_fs_type);
 + if (err)
 + panic(%s: failed to register filesystem: err=%d.\n,
 +kevent_name, err);
 +
 + kevent_mnt = kern_mount(kevent_fs_type);
 + if (IS_ERR(kevent_mnt))
 + panic(%s: failed to mount silesystem: err=%ld.\n, 
 + kevent_name, PTR_ERR(kevent_mnt));
 + 
 + kevent_user_major = register_chrdev(0, kevent_name, kevent_user_fops);
 + if (kevent_user_major  0) {
 + printk(KERN_ERR Failed to register \%s\ char device: 
 err=%d.\n, 
 + kevent_name, kevent_user_major);
 + return -ENODEV;
 + }
 +
 + kevent_user_class = class_create(THIS_MODULE, kevent);
 + if (IS_ERR(kevent_user_class)) {
 + printk(KERN_ERR Failed to register \%s\ class: err=%ld.\n, 
 + kevent_name, PTR_ERR(kevent_user_class));
 + err = PTR_ERR(kevent_user_class);
 + goto err_out_unregister;
 + }
 +
 + dev = class_device_create(kevent_user_class, NULL, 
 + MKDEV(kevent_user_major, 0), NULL, kevent_name);
 + if (IS_ERR(dev)) {
 + printk(KERN_ERR Failed to create %d.%d class device in \%s\ 
 class: err=%ld.\n, 
 + kevent_user_major, 0, kevent_name, 
 PTR_ERR(dev));
 + err = PTR_ERR(dev);
 + goto err_out_class_destroy;
 + }

As you are only using 1 minor number in this code, why not just use a
miscdevice instead?  It saves a bit of overhead and makes the code a
tiny bit smaller :)

thanks,

greg k-h
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take4 1/4] kevent: Core files.

2006-08-05 Thread Evgeniy Polyakov
On Sat, Aug 05, 2006 at 10:57:02AM -0700, GregKH ([EMAIL PROTECTED]) wrote:
  +   dev = class_device_create(kevent_user_class, NULL, 
  +   MKDEV(kevent_user_major, 0), NULL, kevent_name);
  +   if (IS_ERR(dev)) {
  +   printk(KERN_ERR Failed to create %d.%d class device in \%s\ 
  class: err=%ld.\n, 
  +   kevent_user_major, 0, kevent_name, 
  PTR_ERR(dev));
  +   err = PTR_ERR(dev);
  +   goto err_out_class_destroy;
  +   }
 
 As you are only using 1 minor number in this code, why not just use a
 miscdevice instead?  It saves a bit of overhead and makes the code a
 tiny bit smaller :)

No problem. I will move it to miscdevice instead of full chardev.

 thanks,
 
 greg k-h

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take3 1/4] kevent: Core files.

2006-08-03 Thread Eric Dumazet
On Thursday 03 August 2006 11:46, Evgeniy Polyakov wrote:
 Core files.

 This patch includes core kevent files:
  - userspace controlling
  - kernelspace interfaces
  - initialization
  - notification state machines


 +static int kevent_user_wait(struct file *file, struct kevent_user *u,
 + unsigned int min_nr, unsigned int max_nr, unsigned int timeout,
 + void __user *buf)
 +{


 + mutex_lock(u-ctl_mutex);
 + while (num  max_nr  ((k = kqueue_dequeue_ready(u)) != NULL)) {
 + if (copy_to_user(buf + num*sizeof(struct ukevent),
 + k-event, sizeof(struct ukevent))) {
 + cerr = -EINVAL;
 + break;
 + }


It seems quite wrong to hold ctl_mutex while doing a copy_to_user() (of 
possibly a large amount of data) : A thread can sleep on a page fault and 
other threads cannot make progress.

Eric
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take3 1/4] kevent: Core files.

2006-08-03 Thread Evgeniy Polyakov
On Thu, Aug 03, 2006 at 04:40:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
  +   mutex_lock(u-ctl_mutex);
  +   while (num  max_nr  ((k = kqueue_dequeue_ready(u)) != NULL)) {
  +   if (copy_to_user(buf + num*sizeof(struct ukevent),
  +   k-event, sizeof(struct ukevent))) {
  +   cerr = -EINVAL;
  +   break;
  +   }
 
 
 It seems quite wrong to hold ctl_mutex while doing a copy_to_user() (of 
 possibly a large amount of data) : A thread can sleep on a page fault and 
 other threads cannot make progress.

I would not call that wrong - system prevents some threads from removing 
kevents which are counted to be transfered to the userspace, i.e. when 
dequeuing was awakened and it had seen some events it is possible, that 
when it will dequeue them part will be removed by other thread, so I 
prevent this.

 Eric

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take3 1/4] kevent: Core files.

2006-08-03 Thread Eric Dumazet
On Thursday 03 August 2006 16:55, Evgeniy Polyakov wrote:
 On Thu, Aug 03, 2006 at 04:40:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
   + mutex_lock(u-ctl_mutex);
   + while (num  max_nr  ((k = kqueue_dequeue_ready(u)) != NULL)) {
   + if (copy_to_user(buf + num*sizeof(struct ukevent),
   + k-event, sizeof(struct ukevent))) {
   + cerr = -EINVAL;
   + break;
   + }
 
  It seems quite wrong to hold ctl_mutex while doing a copy_to_user() (of
  possibly a large amount of data) : A thread can sleep on a page fault and
  other threads cannot make progress.

 I would not call that wrong - system prevents some threads from removing
 kevents which are counted to be transfered to the userspace, i.e. when
 dequeuing was awakened and it had seen some events it is possible, that
 when it will dequeue them part will be removed by other thread, so I
 prevent this.

Hum, wrong was maybe not the good word but kqueue_dequeue_ready() uses a 
spinlock (ready_lock) to protect ready_list. One particular struct kevent is 
given to one thread, one at a time.

If you look at fs/eventpoll.c, you can see how carefull is ep_send_events() so 
that multiple threads can in the same time transfer different items to user 
memory.

In a model where several threads are servicing events collected by a single 
point (epoll, or kevent), this is important to not block all threads because 
of a single thread waiting a swapin (trigered by copy_to_user() )

Eric
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take3 1/4] kevent: Core files.

2006-08-03 Thread Evgeniy Polyakov
On Thu, Aug 03, 2006 at 05:11:58PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
wrote:
 On Thursday 03 August 2006 16:55, Evgeniy Polyakov wrote:
  On Thu, Aug 03, 2006 at 04:40:34PM +0200, Eric Dumazet ([EMAIL PROTECTED]) 
 wrote:
+   mutex_lock(u-ctl_mutex);
+   while (num  max_nr  ((k = kqueue_dequeue_ready(u)) != NULL)) 
{
+   if (copy_to_user(buf + num*sizeof(struct ukevent),
+   k-event, sizeof(struct 
ukevent))) {
+   cerr = -EINVAL;
+   break;
+   }
  
   It seems quite wrong to hold ctl_mutex while doing a copy_to_user() (of
   possibly a large amount of data) : A thread can sleep on a page fault and
   other threads cannot make progress.
 
  I would not call that wrong - system prevents some threads from removing
  kevents which are counted to be transfered to the userspace, i.e. when
  dequeuing was awakened and it had seen some events it is possible, that
  when it will dequeue them part will be removed by other thread, so I
  prevent this.
 
 Hum, wrong was maybe not the good word but kqueue_dequeue_ready() uses 
 a 
 spinlock (ready_lock) to protect ready_list. One particular struct kevent is 
 given to one thread, one at a time.

I mean that wait_event logic will see that there are requested number of
events, and when it starts to get them, it is possible that there will
be no events at all. 

 If you look at fs/eventpoll.c, you can see how carefull is ep_send_events() 
 so 
 that multiple threads can in the same time transfer different items to user 
 memory.

It is done under the same logic under ep-sem semaphore, which is being
held for del and read operations.
Or do you mean to have rw semahore instead of mutex here?

 In a model where several threads are servicing events collected by a single 
 point (epoll, or kevent), this is important to not block all threads because 
 of a single thread waiting a swapin (trigered by copy_to_user() )
 
 Eric

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take3 1/4] kevent: Core files.

2006-08-03 Thread David Miller
From: Evgeniy Polyakov [EMAIL PROTECTED]
Date: Thu, 3 Aug 2006 18:55:57 +0400

 I would not call that wrong - system prevents some threads from removing 
 kevents which are counted to be transfered to the userspace, i.e. when 
 dequeuing was awakened and it had seen some events it is possible, that 
 when it will dequeue them part will be removed by other thread, so I 
 prevent this.

Queue is all that matters to be synchronized, so it seems
better to have a mutex on the queue rather than a global
one.  That way, user can only hurt himself.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take2 1/4] kevent: core files.

2006-08-02 Thread Evgeniy Polyakov
On Tue, Aug 01, 2006 at 04:56:59PM -0700, Zach Brown ([EMAIL PROTECTED]) wrote:
 
 OK, here's some of my reactions to the core part.

Thanks.

  +#define KEVENT_SOCKET  0
  +#define KEVENT_INODE   1
  +#define KEVENT_TIMER   2
  +#define KEVENT_POLL3
  +#define KEVENT_NAIO4
  +#define KEVENT_AIO 5
 
 I guess we can't really avoid some form of centralized list of the
 constants in the API if we're going for a flat constant namespace.
 It'll be irritating to manage this list over time, just like it's
 irritating to manage syscall numbers now.
 
  +/*
  + * Socket/network asynchronous IO events.
  + */
  +#defineKEVENT_SOCKET_RECV  0x1
  +#defineKEVENT_SOCKET_ACCEPT0x2
  +#defineKEVENT_SOCKET_SEND  0x4
 
 I wonder if these shouldn't live in the subsystems instead of in kevent.h.

Yes it could, but it requires including those files in kevent.h, which
is exported to userspace, and it is not always possible to publish
included file there.

  +/*
  + * Poll events.
  + */
  +#defineKEVENT_POLL_POLLIN  0x0001
  +#defineKEVENT_POLL_POLLPRI 0x0002
  +#defineKEVENT_POLL_POLLOUT 0x0004
  +#defineKEVENT_POLL_POLLERR 0x0008
  +#defineKEVENT_POLL_POLLHUP 0x0010
  +#defineKEVENT_POLL_POLLNVAL0x0020
  +
  +#defineKEVENT_POLL_POLLRDNORM  0x0040
  +#defineKEVENT_POLL_POLLRDBAND  0x0080
  +#defineKEVENT_POLL_POLLWRNORM  0x0100
  +#defineKEVENT_POLL_POLLWRBAND  0x0200
  +#defineKEVENT_POLL_POLLMSG 0x0400
  +#defineKEVENT_POLL_POLLREMOVE  0x1000
 
 And couldn't we just use the existing poll bit definitions for this?

asm/poll.h I expect.
linux/poll.h is too heavy or not?

  +struct kevent_id
  +{
  +   __u32   raw[2];
  +};
 
 Why not a simple u64?  Users can play games with packing it into other
 types if they want.
 
  +   __u32   user[2];/* User's data. It is 
  not used, just copied to/from user. */
  +   void*ptr;
  +   };
 
 Again just a u64 seems like it would be simpler.  userspace library
 wrappers can help massage it, but the kernel is just treating it as an
 opaque data blob.

u64 is not aligned, so I prefer to use u32 as much as possible.

  +};
  +
  +#defineKEVENT_CTL_ADD  0
  +#defineKEVENT_CTL_REMOVE   1
  +#defineKEVENT_CTL_MODIFY   2
  +#defineKEVENT_CTL_INIT 3
  +
  +struct kevent_user_control
  +{
  +   unsigned intcmd;/* Control command, 
  e.g. KEVENT_ADD, KEVENT_REMOVE... */
  +   unsigned intnum;/* Number of ukevents 
  this strucutre controls. */
  +   unsigned inttimeout;/* Timeout in 
  milliseconds waiting for num events to become ready. */
  +};
 
 Even if we only have one syscall with a cmd multiplexer (which I'm not
 thrilled with), we should at least make these arguments explicit in the
 system call.  It's weird to hide them in a struct.  We could also think
 about making them u32 or u64 so that we don't need compat wrappers, but
 maybe that's overkill.

Ok.

 Also, can we please use a struct timespec for the timeout?  Then the
 kernel will have the luxury of using whatever mechanism it wants to
 satisfy the user's precision desires.  Just like sys_nanosleep() uses
 timespec and so can be implemented with hrtimers.

It has variable size, I strongly against such things between kernel and
userspace.

  +struct kevent
  +{
 
 (trivial nit, struct kevent { is the preferred form.)

Ok.

  +   struct ukevent  event;
  +   spinlock_t  lock;   /* This lock protects 
  ukevent manipulations, e.g. ret_flags changes. */
 
 
 It'd be great if these struct members could get a prefix (ala: inode -
 i_, socket - sk_) so that it's less painful getting tags helpers to
 look up instances for us.  Asking for 'lock' is hilarious.

But it requires much less typing :)
Will update.

  +struct kevent_list
  +{
  +   struct list_headkevent_list;/* List of all kevents. 
  */
  +   spinlock_t  kevent_lock;/* Protects all 
  manipulations with queue of kevents. */
  +};
  +
  +struct kevent_user
  +{
  +   struct kevent_list  kqueue[KEVENT_HASH_MASK+1];
 
 Hmm.  I think the current preference is not to have a lock per bucket.
 It doesn't scale nearly as well as it seems like it should as the cache
 footprint is higher and as cacheline contention hits as there are
 multiple buckets per cacheline.  For now I'd simplify the hash into a
 single lock and an array of struct hlist_head.  In the future it could
 be another user of some kind of relatively-generic hash implementation
 based on rcu that has been talked about for a while.

Well, it scales better than one lock per the whole queue, but we can
see how it looks with one lock.

I used RCU hash table in kevents, but it scales very 

Re: [take2 1/4] kevent: core files.

2006-08-02 Thread Evgeniy Polyakov
On Tue, Aug 01, 2006 at 05:01:38PM -0700, David Miller ([EMAIL PROTECTED]) 
wrote:
 From: Zach Brown [EMAIL PROTECTED]
 Date: Tue, 01 Aug 2006 16:56:59 -0700
 
  Even if we only have one syscall with a cmd multiplexer (which I'm not
  thrilled with), we should at least make these arguments explicit in the
  system call.  It's weird to hide them in a struct.  We could also think
  about making them u32 or u64 so that we don't need compat wrappers, but
  maybe that's overkill.
 
 I think making the userspace data structure not require any compat
 handling is a must, thanks for pointing this out Zach.

It does not require compat macros, since unsigned int has the same size
on all normal machines where Linux runs, although it can be different.
Anyway, I will replace it with explicit syscall parameters.

  It'd be great if these struct members could get a prefix (ala: inode -
  i_, socket - sk_) so that it's less painful getting tags helpers to
  look up instances for us.  Asking for 'lock' is hilarious.
 
 Agreed.

Heh, it was so much less typing...

  Hmm.  I think the current preference is not to have a lock per bucket.
 
 Yes, it loses badly, that's why we undid this in the routing cache
 and just have a fixed sized array of locks which is hashed into.
 
 For kevents, I think a single spinlock initially is fine and
 if we hit performance problems on SMP we can fix it.  We should
 not implement complexity we have no proof of needing yet :)

Ok, let's see how it will behave.

   +#define KEVENT_MAX_REQUESTS  PAGE_SIZE/sizeof(struct kevent)
  
  This is unused?
 
 It is probably groundwork for the mmap() ring buffer... :)

A lot of work, isn't it? :)

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take2 1/4] kevent: core files.

2006-08-02 Thread David Miller
From: Evgeniy Polyakov [EMAIL PROTECTED]
Date: Wed, 2 Aug 2006 10:39:18 +0400

 u64 is not aligned, so I prefer to use u32 as much as possible.

We have aligned_u64 exactly for this purpose, netfilter makes
use of it to avoid the x86_64 vs. x86 u64 alignment discrepency.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take2 1/4] kevent: core files.

2006-08-02 Thread Evgeniy Polyakov
On Wed, Aug 02, 2006 at 12:25:05AM -0700, David Miller ([EMAIL PROTECTED]) 
wrote:
 From: Evgeniy Polyakov [EMAIL PROTECTED]
 Date: Wed, 2 Aug 2006 10:39:18 +0400
 
  u64 is not aligned, so I prefer to use u32 as much as possible.
 
 We have aligned_u64 exactly for this purpose, netfilter makes
 use of it to avoid the x86_64 vs. x86 u64 alignment discrepency.

Ok, I will use that type.

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-08-01 Thread Ulrich Drepper
Herbert Xu wrote:
 The other to consider is that events don't come from the hardware.
 Events are written by the kernel.  So if user-space is just reading
 the events that we've written, then there are no cache misses at all.

Not quite true.  The ring buffer can be written to from another
processor.  The kernel thread responsible for generating the event
(receiving data from network or disk, expired timer) can run
independently on another CPU.

This is the case to keep in mind here.  I thought Zach and the other
involved in the discussions in Ottawa said this has been shown to be a
problem and that a ring buffer implementation with something other than
simple front and back pointers is preferable.

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖



signature.asc
Description: OpenPGP digital signature


Re: [RFC 1/4] kevent: core files.

2006-08-01 Thread David Miller
From: Ulrich Drepper [EMAIL PROTECTED]
Date: Tue, 01 Aug 2006 00:53:10 -0700

 This is the case to keep in mind here.  I thought Zach and the other
 involved in the discussions in Ottawa said this has been shown to be a
 problem and that a ring buffer implementation with something other than
 simple front and back pointers is preferable.

This is part of why I suggested VJ style channel data
structure.  At worst, the cachelines for the entries get
into shared modified state when the remove userland cpu
reads the slot.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[take2 1/4] kevent: core files.

2006-08-01 Thread Evgeniy Polyakov

This patch includes core kevent files:
 - userspace controlling
 - kernelspace interfaces
 - initialization
 - notification state machines

It might also inlclude parts from other subsystem (like network related
syscalls, so it is possible that it will not compile without other
patches applied).

Signed-off-by: Evgeniy Polyakov [EMAIL PROTECTED]


diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d47..0af988a 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,7 @@ ENTRY(sys_call_table)
.long sys_tee   /* 315 */
.long sys_vmsplice
.long sys_move_pages
+   .long sys_aio_recv
+   .long sys_aio_send
+   .long sys_kevent_get_events
+   .long sys_kevent_ctl
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 5d4a7d1..e157ad4 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -713,4 +713,8 @@ #endif
.quad sys_tee
.quad compat_sys_vmsplice
.quad compat_sys_move_pages
+   .quad sys_aio_recv
+   .quad sys_aio_send
+   .quad sys_kevent_get_events
+   .quad sys_kevent_ctl
 ia32_syscall_end:  

diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fc1c8dd..a76e50d 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -323,10 +323,14 @@ #define __NR_sync_file_range  314
 #define __NR_tee   315
 #define __NR_vmsplice  316
 #define __NR_move_pages317
+#define __NR_aio_recv  318
+#define __NR_aio_send  319
+#define __NR_kevent_get_events 320
+#define __NR_kevent_ctl321
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 322
 
 /*
  * user-visible error numbers are in the range -1 - -128: see

diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 94387c9..9e61299 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,10 +619,18 @@ #define __NR_vmsplice 278
 __SYSCALL(__NR_vmsplice, sys_vmsplice)
 #define __NR_move_pages279
 __SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_aio_recv  280
+__SYSCALL(__NR_aio_recv, sys_aio_recv)
+#define __NR_aio_send  281
+__SYSCALL(__NR_aio_send, sys_aio_send)
+#define __NR_aio_sendfile  282
+__SYSCALL(__NR_aio_sendfile, sys_kevent_get_events)
+#define __NR_kevent_ctl283
+__SYSCALL(__NR_kevent_ctl, sys_kevent_ctl)
 
 #ifdef __KERNEL__
 
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_kevent_ctl
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/kevent.h b/include/linux/kevent.h
new file mode 100644
index 000..6c36f3f
--- /dev/null
+++ b/include/linux/kevent.h
@@ -0,0 +1,259 @@
+/*
+ * kevent.h
+ * 
+ * 2006 Copyright (c) Evgeniy Polyakov [EMAIL PROTECTED]
+ * All rights reserved.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef __KEVENT_H
+#define __KEVENT_H
+
+/*
+ * Kevent request flags.
+ */
+
+#define KEVENT_REQ_ONESHOT 0x1 /* Process this event only once 
and then dequeue. */
+
+/*
+ * Kevent return flags.
+ */
+#define KEVENT_RET_BROKEN  0x1 /* Kevent is broken. */
+#define KEVENT_RET_DONE0x2 /* Kevent processing 
was finished successfully. */
+
+/*
+ * Kevent type set.
+ */
+#define KEVENT_SOCKET  0
+#define KEVENT_INODE   1
+#define KEVENT_TIMER   2
+#define KEVENT_POLL3
+#define KEVENT_NAIO4
+#define KEVENT_AIO 5
+#defineKEVENT_MAX  6
+
+/*
+ * Per-type event sets.
+ * Number of per-event sets should be exactly as number of kevent types.
+ */
+
+/*
+ * Timer events.
+ */
+#defineKEVENT_TIMER_FIRED  0x1
+
+/*
+ * Socket/network asynchronous IO events.
+ */
+#defineKEVENT_SOCKET_RECV  0x1
+#defineKEVENT_SOCKET_ACCEPT0x2
+#defineKEVENT_SOCKET_SEND  0x4
+
+/*
+ * Inode events.
+ */
+#defineKEVENT_INODE_CREATE 0x1
+#defineKEVENT_INODE_REMOVE 0x2
+
+/*
+ * Poll events.
+ */
+#defineKEVENT_POLL_POLLIN  0x0001
+#defineKEVENT_POLL_POLLPRI 0x0002

Re: [take2 1/4] kevent: core files.

2006-08-01 Thread James Morris
On Tue, 1 Aug 2006, Evgeniy Polyakov wrote:

 + u-ready_num = 0;
 +#ifdef CONFIG_KEVENT_USER_STAT
 + u-wait_num = u-im_num = u-total = 0;
 +#endif

Generally, #ifdefs in the body of the kernel code are discouraged.  Can 
you abstract these out as static inlines?


- James
-- 
James Morris
[EMAIL PROTECTED]
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take2 1/4] kevent: core files.

2006-08-01 Thread Evgeniy Polyakov
On Tue, Aug 01, 2006 at 09:46:58AM -0400, James Morris ([EMAIL PROTECTED]) 
wrote:
 On Tue, 1 Aug 2006, Evgeniy Polyakov wrote:
 
  +   u-ready_num = 0;
  +#ifdef CONFIG_KEVENT_USER_STAT
  +   u-wait_num = u-im_num = u-total = 0;
  +#endif
 
 Generally, #ifdefs in the body of the kernel code are discouraged.  Can 
 you abstract these out as static inlines?

Yes, it is possible.
I would ask is it needed at all? It contains number of immediately fired
events (i.e. those which were ready when event was added and thus
syscall returned immediately showing that it is ready), total number of
events, which were inserted in the given queue and number of events
which were marked as ready after they were inserted.
Currently it is compilation option which ends up in printk with above
info when kevent queue is removed.
 
 - James
 -- 
 James Morris
 [EMAIL PROTECTED]

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take2 1/4] kevent: core files.

2006-08-01 Thread James Morris
On Tue, 1 Aug 2006, Evgeniy Polyakov wrote:

 On Tue, Aug 01, 2006 at 09:46:58AM -0400, James Morris ([EMAIL PROTECTED]) 
 wrote:
  On Tue, 1 Aug 2006, Evgeniy Polyakov wrote:
  
   + u-ready_num = 0;
   +#ifdef CONFIG_KEVENT_USER_STAT
   + u-wait_num = u-im_num = u-total = 0;
   +#endif
  
  Generally, #ifdefs in the body of the kernel code are discouraged.  Can 
  you abstract these out as static inlines?
 
 Yes, it is possible.
 I would ask is it needed at all?

Yes, please, it is standard kernel development practice.

Otherwise, the kernel will turn into an unmaintainable #ifdef jungle.

 It contains number of immediately fired
 events (i.e. those which were ready when event was added and thus
 syscall returned immediately showing that it is ready), total number of
 events, which were inserted in the given queue and number of events
 which were marked as ready after they were inserted.
 Currently it is compilation option which ends up in printk with above
 info when kevent queue is removed.

Fine, make 

static inline void kevent_user_stat_reset(u);

etc.

which compile to nothing when it's not confifgured.


-- 
James Morris
[EMAIL PROTECTED]
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take2 1/4] kevent: core files.

2006-08-01 Thread Evgeniy Polyakov
On Tue, Aug 01, 2006 at 10:27:36AM -0400, James Morris ([EMAIL PROTECTED]) 
wrote:
+   u-ready_num = 0;
+#ifdef CONFIG_KEVENT_USER_STAT
+   u-wait_num = u-im_num = u-total = 0;
+#endif
   
   Generally, #ifdefs in the body of the kernel code are discouraged.  Can 
   you abstract these out as static inlines?
  
  Yes, it is possible.
  I would ask is it needed at all?
 
 Yes, please, it is standard kernel development practice.

Will do.
Thanks, James.

 -- 
 James Morris
 [EMAIL PROTECTED]

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-08-01 Thread Zach Brown

 I do not think if we do a ring buffer that events should be obtainable
 via a syscall at all.  Rather, I think this system call should be
 purely sleep until ring is not empty.

Mmm, yeah, of course.  That's much simpler.  I'm looking forward to
Evgeniy's next patch set.

 The ring buffer size, as Evgeniy also tried to describe, is bounded
 purely by the number of registered events.

Yeah.  fwiw, fs/aio.c has this property today.

- z
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take2 1/4] kevent: core files.

2006-08-01 Thread David Miller
From: Zach Brown [EMAIL PROTECTED]
Date: Tue, 01 Aug 2006 16:56:59 -0700

 Even if we only have one syscall with a cmd multiplexer (which I'm not
 thrilled with), we should at least make these arguments explicit in the
 system call.  It's weird to hide them in a struct.  We could also think
 about making them u32 or u64 so that we don't need compat wrappers, but
 maybe that's overkill.

I think making the userspace data structure not require any compat
handling is a must, thanks for pointing this out Zach.

 It'd be great if these struct members could get a prefix (ala: inode -
 i_, socket - sk_) so that it's less painful getting tags helpers to
 look up instances for us.  Asking for 'lock' is hilarious.

Agreed.

 Hmm.  I think the current preference is not to have a lock per bucket.

Yes, it loses badly, that's why we undid this in the routing cache
and just have a fixed sized array of locks which is hashed into.

For kevents, I think a single spinlock initially is fine and
if we hit performance problems on SMP we can fix it.  We should
not implement complexity we have no proof of needing yet :)

  +#define KEVENT_MAX_REQUESTSPAGE_SIZE/sizeof(struct kevent)
 
 This is unused?

It is probably groundwork for the mmap() ring buffer... :)

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [take2 1/4] kevent: core files.

2006-08-01 Thread Zach Brown

OK, here's some of my reactions to the core part.

 +#define KEVENT_SOCKET0
 +#define KEVENT_INODE 1
 +#define KEVENT_TIMER 2
 +#define KEVENT_POLL  3
 +#define KEVENT_NAIO  4
 +#define KEVENT_AIO   5

I guess we can't really avoid some form of centralized list of the
constants in the API if we're going for a flat constant namespace.
It'll be irritating to manage this list over time, just like it's
irritating to manage syscall numbers now.

 +/*
 + * Socket/network asynchronous IO events.
 + */
 +#define  KEVENT_SOCKET_RECV  0x1
 +#define  KEVENT_SOCKET_ACCEPT0x2
 +#define  KEVENT_SOCKET_SEND  0x4

I wonder if these shouldn't live in the subsystems instead of in kevent.h.

 +/*
 + * Poll events.
 + */
 +#define  KEVENT_POLL_POLLIN  0x0001
 +#define  KEVENT_POLL_POLLPRI 0x0002
 +#define  KEVENT_POLL_POLLOUT 0x0004
 +#define  KEVENT_POLL_POLLERR 0x0008
 +#define  KEVENT_POLL_POLLHUP 0x0010
 +#define  KEVENT_POLL_POLLNVAL0x0020
 +
 +#define  KEVENT_POLL_POLLRDNORM  0x0040
 +#define  KEVENT_POLL_POLLRDBAND  0x0080
 +#define  KEVENT_POLL_POLLWRNORM  0x0100
 +#define  KEVENT_POLL_POLLWRBAND  0x0200
 +#define  KEVENT_POLL_POLLMSG 0x0400
 +#define  KEVENT_POLL_POLLREMOVE  0x1000

And couldn't we just use the existing poll bit definitions for this?

 +struct kevent_id
 +{
 + __u32   raw[2];
 +};

Why not a simple u64?  Users can play games with packing it into other
types if they want.

 + __u32   user[2];/* User's data. It is 
 not used, just copied to/from user. */
 + void*ptr;
 + };

Again just a u64 seems like it would be simpler.  userspace library
wrappers can help massage it, but the kernel is just treating it as an
opaque data blob.

 +};
 +
 +#define  KEVENT_CTL_ADD  0
 +#define  KEVENT_CTL_REMOVE   1
 +#define  KEVENT_CTL_MODIFY   2
 +#define  KEVENT_CTL_INIT 3
 +
 +struct kevent_user_control
 +{
 + unsigned intcmd;/* Control command, 
 e.g. KEVENT_ADD, KEVENT_REMOVE... */
 + unsigned intnum;/* Number of ukevents 
 this strucutre controls. */
 + unsigned inttimeout;/* Timeout in 
 milliseconds waiting for num events to become ready. */
 +};

Even if we only have one syscall with a cmd multiplexer (which I'm not
thrilled with), we should at least make these arguments explicit in the
system call.  It's weird to hide them in a struct.  We could also think
about making them u32 or u64 so that we don't need compat wrappers, but
maybe that's overkill.

Also, can we please use a struct timespec for the timeout?  Then the
kernel will have the luxury of using whatever mechanism it wants to
satisfy the user's precision desires.  Just like sys_nanosleep() uses
timespec and so can be implemented with hrtimers.

 +struct kevent
 +{

(trivial nit, struct kevent { is the preferred form.)

 + struct ukevent  event;
 + spinlock_t  lock;   /* This lock protects 
 ukevent manipulations, e.g. ret_flags changes. */


It'd be great if these struct members could get a prefix (ala: inode -
i_, socket - sk_) so that it's less painful getting tags helpers to
look up instances for us.  Asking for 'lock' is hilarious.

 +struct kevent_list
 +{
 + struct list_headkevent_list;/* List of all kevents. 
 */
 + spinlock_t  kevent_lock;/* Protects all 
 manipulations with queue of kevents. */
 +};
 +
 +struct kevent_user
 +{
 + struct kevent_list  kqueue[KEVENT_HASH_MASK+1];

Hmm.  I think the current preference is not to have a lock per bucket.
It doesn't scale nearly as well as it seems like it should as the cache
footprint is higher and as cacheline contention hits as there are
multiple buckets per cacheline.  For now I'd simplify the hash into a
single lock and an array of struct hlist_head.  In the future it could
be another user of some kind of relatively-generic hash implementation
based on rcu that has been talked about for a while.

 +#define KEVENT_MAX_REQUESTS  PAGE_SIZE/sizeof(struct kevent)

This is unused?

 +#define list_for_each_entry_reverse_safe(pos, n, head, member)   
 \
 + for (pos = list_entry((head)-prev, typeof(*pos), member),  \
 + n = list_entry(pos-member.prev, typeof(*pos), member); \
 +  prefetch(pos-member.prev), pos-member != (head);\
 +  pos = n, n = list_entry(pos-member.prev, typeof(*pos), member))

If anyone was calling this they could use
list_for_each_entry_safe_reverse() in list.h but nothing is calling it?
 Either way, it should be removed :).

 +#define sock_async(__sk) 0

It's a minor complaint, but these kinds of ifdefs that drop arguments
can cause unused 

Re: [RFC 1/4] kevent: core files.

2006-07-31 Thread Evgeniy Polyakov
On Sat, Jul 29, 2006 at 09:18:47AM -0700, Ulrich Drepper ([EMAIL PROTECTED]) 
wrote:
 Evgeniy Polyakov wrote:
  Btw, why do we want mapped ring of ready events?
  If user requestd some event, he definitely wants to get them back when
  they are ready, and not to check and then get them?
  Could you please explain more on this issue?
 
 If of course makes no sense to enter the kernel to actually get the
 event.  This should be done by storing the event in the ring buffer.
 I.e., there are two ways to get an event:
 
 - with a syscall.  This can report as many events at once as the caller
   provides space for.  And no event which is reported in the run buffer
   should be reported this way
 
 - if there is space, report it in the ring buffer.  Yes, the buffer
   can be optional, then all events are reported by the system call.

That requires a copy, which can neglect syscall overhead.
Do we really want it to be done?
 
 So the use case would be like this:
 
 
 wait_and_get_event:
 
   is buffer empty ?
 
 yes - make syscall
 
 no - get event from buffer
 
 
 To avoid races, the syscall needs to take a parameter indicating the
 last event checked out from the buffer.  If in the meantime the kernel
 put another event in the buffer the syscall immediately returns.
 Similar to what we do in the futex syscall.

And how misordering between queue and buffer is going to be managed?
I.e. when buffer is full and events are placed into queue, so syscall
could get them, and then syscall is called to get events from the queue
but not from the buffer - we can endup taking events from buffer while
old are placed in the queue.
And how waiting will be done without syscalls? Will glibc take care of
it?

 The question is how to best represent the ring buffer.  Zach and some
 others had some ready responses in Ottawa.  The important thing is to
 avoid cache line ping pong when possible.
 
 Is the ring buffer absolutely necessary?  Probably not.  But it has the
 potential to help quite a bit.  Don't look at the problem to solve in
 the context of heavy I/O operations when another syscall here and there
 doesn't matter.  With this single event mechanism for every possible
 event the kernel can generate programming can look quite different.
 E.g., every read() call can implicitly we changed into an async read
 call followed by a user-level reschedule.  This rescheduling allows
 another thread of execution to run while the read request is processed.
  I.e., it's basically a setjmp() followed by a goto into the inner loop
 to get the next event.  And now suddenly the event notification
 mechanism really should be as fast as possible.  If we submit basically
 every request asynchronously and are not creating dedicated threads for
 specific tasks anymore we
 
 a) have a lot more event notifications
 
 b) the probability of an event being reported when we want the receive
the next one if higher (i.e., the case where no syscall vs syscall
makes a difference)
 
 Yes, all this will require changes in the way programs a written but we
 shouldn't limit the way we can write programs unnecessarily.  I think
 that given increasing discrepancies in relative speed/latency of the
 peripherals and the CPU this is one possible solution to keep the CPUs
 busy without resorting to a gazillion separate threads in each program.

Ok, let's do it in the following way:
I present new version of kevent with new syscalls and fixed issues mentioned
before, while people look at it we can end up with mapped buffer design.
Is it ok?

 -- 
 ➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
 



-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-31 Thread Herbert Xu
Evgeniy Polyakov [EMAIL PROTECTED] wrote:

 - if there is space, report it in the ring buffer.  Yes, the buffer
   can be optional, then all events are reported by the system call.
 
 That requires a copy, which can neglect syscall overhead.
 Do we really want it to be done?

Please note that we're talking about events here, not actual data.  So
only the event is being copied, which is presumably rather small compared
to the data.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-31 Thread Evgeniy Polyakov
On Mon, Jul 31, 2006 at 08:35:55PM +1000, Herbert Xu ([EMAIL PROTECTED]) wrote:
 Evgeniy Polyakov [EMAIL PROTECTED] wrote:
 
  - if there is space, report it in the ring buffer.  Yes, the buffer
can be optional, then all events are reported by the system call.
  
  That requires a copy, which can neglect syscall overhead.
  Do we really want it to be done?
 
 Please note that we're talking about events here, not actual data.  So
 only the event is being copied, which is presumably rather small compared
 to the data.

In syscall time kevents copy 40bytes for each event + 12 bytes of header 
(number of events, timeout and command number). That's likely two cache
lines if only one event is reported.

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-31 Thread David Miller
From: Evgeniy Polyakov [EMAIL PROTECTED]
Date: Mon, 31 Jul 2006 14:50:37 +0400

 In syscall time kevents copy 40bytes for each event + 12 bytes of header 
 (number of events, timeout and command number). That's likely two cache
 lines if only one event is reported.

Do you know how many cachelines are dirtied by system call
entry and exit on typical system?

On sparc64 it is a minimum of 3 64-byte cachelines just to save and
restore the system call time cpu register state.  If application is
deep in a call chain, register windows might spill and each such
register window will dirty 2 more cachelines as they are dumped to the
stack.

I am not even talking about the other basic necessities of doing
a system call such as touching various task_struct and thread_info
state to check for pending signals etc.

System call overhead is non-trivial especially when you are using
it to move only a few small objects into and out of the kernel.

So I would say for up to 4 or 5 events, system call overhead alone
touches as many cache lines as the events themselves.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-31 Thread Herbert Xu
On Mon, Jul 31, 2006 at 03:57:16AM -0700, David Miller wrote:
 
 So I would say for up to 4 or 5 events, system call overhead alone
 touches as many cache lines as the events themselves.

Absolutely.

The other to consider is that events don't come from the hardware.
Events are written by the kernel.  So if user-space is just reading
the events that we've written, then there are no cache misses at all.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-31 Thread Evgeniy Polyakov
On Mon, Jul 31, 2006 at 02:33:22PM +0400, Evgeniy Polyakov ([EMAIL PROTECTED]) 
wrote:
 Ok, let's do it in the following way:
 I present new version of kevent with new syscalls and fixed issues mentioned
 before, while people look at it we can end up with mapped buffer design.
 Is it ok?

Since kevents are never generated by kernel, but only marked as ready,
length of the main queue performs as flow control, so we can create a
mapped buffer which will have space equal to the main queue length
multiplied by size of the copied to userspace structure plus 16 bits for
the start index of the kernel writing side, i.e. it will store offset
where the oldest event was placed.
Since queue length is a limited factor and thus no new events can be added
when queue is full, that means that buffer is full too and userspace
must read events. When syscall is called to add new kevent and provided 
there offset differs from what kernel stored, that means that all events 
from kernel to provided index have been read and new events can be added.
Thus we can even allow read-only mapping. Kernel's index is incremented
modulo queue length. If kevent was removed after it was marked as
ready, it's copy stays in the mapped buffer, but special flag can be
assigned to show that kevent is no longer valid.


-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-31 Thread David Miller
From: Evgeniy Polyakov [EMAIL PROTECTED]
Date: Mon, 31 Jul 2006 23:41:43 +0400

 Since kevents are never generated by kernel, but only marked as ready,
 length of the main queue performs as flow control, so we can create a
 mapped buffer which will have space equal to the main queue length
 multiplied by size of the copied to userspace structure plus 16 bits for
 the start index of the kernel writing side, i.e. it will store offset
 where the oldest event was placed.

 Since queue length is a limited factor and thus no new events can be added
 when queue is full, that means that buffer is full too and userspace
 must read events. When syscall is called to add new kevent and provided 
 there offset differs from what kernel stored, that means that all events 
 from kernel to provided index have been read and new events can be added.
 Thus we can even allow read-only mapping. Kernel's index is incremented
 modulo queue length. If kevent was removed after it was marked as
 ready, it's copy stays in the mapped buffer, but special flag can be
 assigned to show that kevent is no longer valid.

This sounds reasonable.

However we must be mindful that the thread of control trying to
add a new event might not be in a position to drain the queue
of pending events when the queue is full.  Usually he will be
trying to add an event in response to handling another event.

So we'd have cases like this, assume we start with a full event
queue:

thread Athread B

dequeue event
aha, new connection
accept()
register new kevent
queue is now full again
add kevent on new
connection

At this point thread A doesn't have very many options when the kevent
add fails.  You cannot force this thread to read more events, since he
may not be in a state where he is easily able to do so.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-31 Thread Brent Cook
On Monday 31 July 2006 17:00, David Miller wrote:

 So we'd have cases like this, assume we start with a full event
 queue:

   thread Athread B

   dequeue event
   aha, new connection
   accept()
   register new kevent
   queue is now full again
   add kevent on new
   connection

 At this point thread A doesn't have very many options when the kevent
 add fails.  You cannot force this thread to read more events, since he
 may not be in a state where he is easily able to do so.

There has to be some thread that is responsible for reading events. Perhaps a 
reasonable thing for a blocked thread that cannot process events to do is to 
yield to one that can?

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-31 Thread David Miller
From: Brent Cook [EMAIL PROTECTED]
Date: Mon, 31 Jul 2006 17:16:48 -0500

 There has to be some thread that is responsible for reading
 events. Perhaps a reasonable thing for a blocked thread that cannot
 process events to do is to yield to one that can?

The reason one decentralizes event processing into threads is so that
once they are tasked to process some event they need not be concerned
with event state.

They are designed to process their event through to the end, then
return to the top level and say any more work for me?
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-31 Thread David Miller
From: Zach Brown [EMAIL PROTECTED]
Date: Thu, 27 Jul 2006 12:18:42 -0700

[ I kept this thread around in my inbox because I wanted to give it
  some deep thought, so sorry for replying to old bits... ]

 So as the kernel generates events in the ring it only produces an event
 if the ownership field says that userspace has consumed it and in doing
 so it sets the ownership field to tell userspace that an event is
 waiting.  userspace and the kernel now each follow their index around
 the ring as the ownership field lets them produce or consume the event
 at their index.  Can someone tell me if the cache coherence costs of
 this are extreme?  I'm hoping they're not.

No need for an owner field, we can use something like a VJ
netchannel datastructure for this.  Kernel only writes to
producer index and user only writes to consumer index.

 So, great, glibc can now find pending events very quickly if they're
 waiting in the ring and can fall back to the collection syscall if it
 wants to wait and the ring is empty.  If it consumes events via the
 syscall it increases its ring index by the number the syscall returned.

I do not think if we do a ring buffer that events should be obtainable
via a syscall at all.  Rather, I think this system call should be
purely sleep until ring is not empty.

This is actually reasonably simple stuff to implement as Evgeniy
has tried to explain.

Events in kevent live on a ready list when they have triggered.
Existence on a list determined the state, and I think this design
btw invalidates some of the arguments against using netlink that
Ulrich mentions in his paper.  If netlink socket queuing fails,
well then kevent stays on ready list and that is all until the
kevent can be successfully published to the user.

I am not advocating netlink at all for this, as the ring buffer idea
is much better.

The ring buffer size, as Evgeniy also tried to describe, is bounded
purely by the number of registered events.  So event loop of
application might look something like this:

struct ukevent cur_event;
struct timeval timeo;

setup_timeout(timeo);
for (;;) {
int err;
while(!(err = ukevent_dequeue(evt_fd, evt_ring,
  cur_event, timeo))) {
struct my_event_object *o =
event_to_object(cur_event);
o-dispatch(o, cur_event);
setup_timeout(timeo);
}
if (err == -ETIMEDOUT)
timeout_processing();
else
event_error_processing(err);
}

ukevent_dequeue() is perhaps some GLIBC implemented routine which does
something like:

int err;

for (;;) {
if (!evt_ring_empty(evt_ring)) {
struct ukevent *p = evt_ring_consume(evt_ring);
memcpy(event_p, p, sizeof(struct ukevent));
return 0;
}
err = kevent_wait(evt_fd, timeo_p);
if (err  0)
break;
}
return err;

It's just some stupid ideas... we could also choose to expose the ring
buffer layout directly to the user event loop and let it perform the
dequeue operation and kevent_wait() calls directly.  I don't see why
not to allow that.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-31 Thread David Miller
From: Evgeniy Polyakov [EMAIL PROTECTED]
Date: Fri, 28 Jul 2006 09:23:12 +0400

 I completely agree that existing kevent interface is not the best, so
 I'm opened for any suggestions.
 Should kevent creation/removing/modification be separated too?

I do not think so, object for these 3 operations are the same,
so there are no typing issues.
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-30 Thread Ulrich Drepper
Nicholas Miell wrote:
 [...] and was wondering
 if you were familiar with the Solaris port APIs* and,

I wasn't.


 if so, you could
 please comment on how your proposed event channels are different/better.

There indeed is not much difference.  The differences are in the
details.  The way those ports are specified doesn't allow much room for
further optimizations.  E.g., the userlevel ring buffer isn't possible.
 But mostly it's the same semantics.  The ec_t type in my text is also
better a file descriptor since otherwise it cannot be transported via
Unix stream sockets.

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖



signature.asc
Description: OpenPGP digital signature


Re: [RFC 1/4] kevent: core files.

2006-07-29 Thread Evgeniy Polyakov
On Fri, Jul 28, 2006 at 08:38:02PM -0700, Ulrich Drepper ([EMAIL PROTECTED]) 
wrote:
 Zach Brown wrote:
  Ulrich, would you be satisfied if we didn't
  have the userspace mapped ring on the first pass and only had a
  collection syscall?
 
 I'm not the one to make a call but why rush things?  Let's do it right
 from the start.  Later changes can only lead to problems with users of
 the earlier interface.

Btw, why do we want mapped ring of ready events?
If user requestd some event, he definitely wants to get them back when
they are ready, and not to check and then get them?
Could you please explain more on this issue?

 -- 
 ??? Ulrich Drepper ??? Red Hat, Inc. ??? 444 Castro St ??? Mountain View, CA 
 ???
 



-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-29 Thread Evgeniy Polyakov
On Fri, Jul 28, 2006 at 09:32:42PM -0700, Nicholas Miell ([EMAIL PROTECTED]) 
wrote:
 Speaking of API design choices, I saw your OLS paper and was wondering
 if you were familiar with the Solaris port APIs* and, if so, you could
 please comment on how your proposed event channels are different/better.

As far as it concerns kevents - userspace ports are just usual users
of kevents, like timer notifications. Add another syscall to complete
requested kevents and you get exactly Solaris ports.
It is fairly simple to implement on top of kevents, I just do not see
immediate benefits from that.

 -- 
 Nicholas Miell [EMAIL PROTECTED]

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-29 Thread Ulrich Drepper
Evgeniy Polyakov wrote:
 Btw, why do we want mapped ring of ready events?
 If user requestd some event, he definitely wants to get them back when
 they are ready, and not to check and then get them?
 Could you please explain more on this issue?

If of course makes no sense to enter the kernel to actually get the
event.  This should be done by storing the event in the ring buffer.
I.e., there are two ways to get an event:

- with a syscall.  This can report as many events at once as the caller
  provides space for.  And no event which is reported in the run buffer
  should be reported this way

- if there is space, report it in the ring buffer.  Yes, the buffer
  can be optional, then all events are reported by the system call.


So the use case would be like this:


wait_and_get_event:

  is buffer empty ?

yes - make syscall

no - get event from buffer


To avoid races, the syscall needs to take a parameter indicating the
last event checked out from the buffer.  If in the meantime the kernel
put another event in the buffer the syscall immediately returns.
Similar to what we do in the futex syscall.

The question is how to best represent the ring buffer.  Zach and some
others had some ready responses in Ottawa.  The important thing is to
avoid cache line ping pong when possible.


Is the ring buffer absolutely necessary?  Probably not.  But it has the
potential to help quite a bit.  Don't look at the problem to solve in
the context of heavy I/O operations when another syscall here and there
doesn't matter.  With this single event mechanism for every possible
event the kernel can generate programming can look quite different.
E.g., every read() call can implicitly we changed into an async read
call followed by a user-level reschedule.  This rescheduling allows
another thread of execution to run while the read request is processed.
 I.e., it's basically a setjmp() followed by a goto into the inner loop
to get the next event.  And now suddenly the event notification
mechanism really should be as fast as possible.  If we submit basically
every request asynchronously and are not creating dedicated threads for
specific tasks anymore we

a) have a lot more event notifications

b) the probability of an event being reported when we want the receive
   the next one if higher (i.e., the case where no syscall vs syscall
   makes a difference)

Yes, all this will require changes in the way programs a written but we
shouldn't limit the way we can write programs unnecessarily.  I think
that given increasing discrepancies in relative speed/latency of the
peripherals and the CPU this is one possible solution to keep the CPUs
busy without resorting to a gazillion separate threads in each program.

-- 
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖



signature.asc
Description: OpenPGP digital signature


Re: [RFC 1/4] kevent: core files.

2006-07-29 Thread Hans Henrik Happe
On Saturday 29 July 2006 18:18, Ulrich Drepper wrote:
 Evgeniy Polyakov wrote:
  Btw, why do we want mapped ring of ready events?
  If user requestd some event, he definitely wants to get them back when
  they are ready, and not to check and then get them?
  Could you please explain more on this issue?
 
 If of course makes no sense to enter the kernel to actually get the
 event.  This should be done by storing the event in the ring buffer.
 I.e., there are two ways to get an event:
 
 - with a syscall.  This can report as many events at once as the caller
   provides space for.  And no event which is reported in the run buffer
   should be reported this way
 
 - if there is space, report it in the ring buffer.  Yes, the buffer
   can be optional, then all events are reported by the system call.
 
 
 So the use case would be like this:
 
 
 wait_and_get_event:
 
   is buffer empty ?
 
 yes - make syscall
 
 no - get event from buffer
 
 
 To avoid races, the syscall needs to take a parameter indicating the
 last event checked out from the buffer.  If in the meantime the kernel
 put another event in the buffer the syscall immediately returns.
 Similar to what we do in the futex syscall.

Couldn't this be done in a general way: Given a fd that supports streaming 
input, map some user-mem as a ring buffer for input. Maybe the kernel should 
control the buffer in order to make resizing possible (i.e., TCP zero-copy 
and window scaling).

Hans Henrik 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-29 Thread Nicholas Miell
On Sat, 2006-07-29 at 19:48 +0400, Evgeniy Polyakov wrote:
 On Fri, Jul 28, 2006 at 09:32:42PM -0700, Nicholas Miell ([EMAIL PROTECTED]) 
 wrote:
  Speaking of API design choices, I saw your OLS paper and was wondering
  if you were familiar with the Solaris port APIs* and, if so, you could
  please comment on how your proposed event channels are different/better.
 
 As far as it concerns kevents - userspace ports are just usual users
 of kevents, like timer notifications. Add another syscall to complete
 requested kevents and you get exactly Solaris ports.
 It is fairly simple to implement on top of kevents, I just do not see
 immediate benefits from that.
 

Sorry, I wasn't talking about kevent, I was talking about the interfaces
described in The Need for Asynchronous, Zero-Copy Network I/O by
Ulrich Drepper -- specifically the ec_t type and related functions and
the modifications to struct sigevent.

-- 
Nicholas Miell [EMAIL PROTECTED]

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-28 Thread Zach Brown

 I completely agree that existing kevent interface is not the best, so
 I'm opened for any suggestions.
 Should kevent creation/removing/modification be separated too?

Yeah, I think so.

 Hmm, it looks like I'm lost here...
 Yeah, it seems my description might not have sunk in :).  We're giving
 userspace a way to collect events without performing a system call.
 
 And why do we want this?

So that event collection can be very efficient.

 How glibc is supposed to determine, that some events already fired and
 such requests will return immediately, or for example how timer events
 will be managed?

...

That was what my previous mail was all about!

- z
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC 1/4] kevent: core files.

2006-07-28 Thread Evgeniy Polyakov
On Fri, Jul 28, 2006 at 11:33:16AM -0700, Zach Brown ([EMAIL PROTECTED]) wrote:
 
  I completely agree that existing kevent interface is not the best, so
  I'm opened for any suggestions.
  Should kevent creation/removing/modification be separated too?
 
 Yeah, I think so.

So, I'm going to create kevent_create/destroy/control and kevent_get_events()
Or any better names?

  Hmm, it looks like I'm lost here...
  Yeah, it seems my description might not have sunk in :).  We're giving
  userspace a way to collect events without performing a system call.
  
  And why do we want this?
 
 So that event collection can be very efficient.
 
  How glibc is supposed to determine, that some events already fired and
  such requests will return immediately, or for example how timer events
  will be managed?
 
 ...
 
 That was what my previous mail was all about!

Some events are impossible to create in userspace (like timer
notification, which requires timer start and check when timer
completed).
Actually all events are part of the kernel, since glibc does not have
any knowledge about in-kernel state machines which are bound to
appropriate kevents, so each kevent takes at least two syscall (create
and get ready), and I do not see how, for exmple, glibc can avoid them
when user requested POLLIN or similar event for network dataflow?

According to syscall speed on Linux, last time I checked empty syscall 
took about 100ns on AMD Athlon 3500+.

 - z

-- 
Evgeniy Polyakov
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   >