* Paul E. McKenney ([email protected]) wrote:
> On Mon, Jun 06, 2011 at 01:46:00PM -0400, Mathieu Desnoyers wrote:
> > If I remove the URCU_CALL_RCU_RT flag from the rbtree single writer
> > test, thus using the pthread_cond_signal mechanism, there is a huge
> > slowdown: without cpu affinity for the worker threads, it crawls to 129
> > updates/s (looks like mutex contention between the thread calling
> > call_rcu and the call_rcu thread). Adding CPU affinity to the per-cpu
> > call_rcu threads, I get 546 updates/s, which is slightly better (better
> > cache locality, and maybe the mutex contention is not as bad thanks to
> > the two threads sharing the same CPU).
> > 
> > So I decided to try replacing pthread_cond_wait/signal with my
> > futex-based implementation I use for the rest of the urcu lib: it has
> > the advantage of removing the mutex from the call_rcu() execution
> > entirely, sampling the "futex" variable without any mutex whatsoever for
> > the case where no wakeup is needed.
> > 
> > Disabling URCU_CALL_RCU_RT flag, with per-cpu affined call_rcu threads,
> > with my futex-based wakeup implementation, I get 55754 updates/s (even
> > better than with URCU_CALL_RCU_RT flag!).
> 
> Assuming that compat_futex.c handles the non-Linux case, works for me!

OK, patch merged. Thanks!

Mathieu

> 						Thanx, Paul
> 
> > Signed-off-by: Mathieu Desnoyers <[email protected]>
> > ---
> >  urcu-call-rcu-impl.h |   56 +++++++++++++++++++++++++--------------------------
> >  1 file changed, 28 insertions(+), 28 deletions(-)
> > 
> > Index: userspace-rcu/urcu-call-rcu-impl.h
> > ===================================================================
> > --- userspace-rcu.orig/urcu-call-rcu-impl.h
> > +++ userspace-rcu/urcu-call-rcu-impl.h
> > @@ -39,6 +39,7 @@
> >  #include "urcu-call-rcu.h"
> >  #include "urcu-pointer.h"
> >  #include "urcu/list.h"
> > +#include "urcu/urcu-futex.h"
> >  
> >  /* Data structure that identifies a call_rcu thread. */
> >  
> > @@ -46,7 +47,7 @@ struct call_rcu_data {
> >  	struct cds_wfq_queue cbs;
> >  	unsigned long flags;
> >  	pthread_mutex_t mtx;
> > -	pthread_cond_t cond;
> > +	int futex;
> >  	unsigned long qlen;
> >  	pthread_t tid;
> >  	int cpu_affinity;
> > @@ -88,6 +89,26 @@ static struct call_rcu_data *default_cal
> >  static struct call_rcu_data **per_cpu_call_rcu_data;
> >  static long maxcpus;
> >  
> > +static void call_rcu_wait(struct call_rcu_data *crdp)
> > +{
> > +	/* Read call_rcu list before read futex */
> > +	cmm_smp_mb();
> > +	if (uatomic_read(&crdp->futex) == -1)
> > +		futex_async(&crdp->futex, FUTEX_WAIT, -1,
> > +			NULL, NULL, 0);
> > +}
> > +
> > +static void call_rcu_wake_up(struct call_rcu_data *crdp)
> > +{
> > +	/* Write to call_rcu list before reading/writing futex */
> > +	cmm_smp_mb();
> > +	if (unlikely(uatomic_read(&crdp->futex) == -1)) {
> > +		uatomic_set(&crdp->futex, 0);
> > +		futex_async(&crdp->futex, FUTEX_WAKE, 1,
> > +			NULL, NULL, 0);
> > +	}
> > +}
> > +
> >  /* Allocate the array if it has not already been allocated. */
> >  
> >  static void alloc_cpu_call_rcu_data(void)
> > @@ -219,19 +240,9 @@ static void *call_rcu_thread(void *arg)
> >  		if (crdp->flags & URCU_CALL_RCU_RT)
> >  			poll(NULL, 0, 10);
> >  		else {
> > -			call_rcu_lock(&crdp->mtx);
> > -			_CMM_STORE_SHARED(crdp->flags,
> > -				  crdp->flags & ~URCU_CALL_RCU_RUNNING);
> > -			if (&crdp->cbs.head ==
> > -			    _CMM_LOAD_SHARED(crdp->cbs.tail) &&
> > -			    pthread_cond_wait(&crdp->cond, &crdp->mtx) != 0) {
> > -				perror("pthread_cond_wait");
> > -				exit(-1);
> > -			}
> > -			_CMM_STORE_SHARED(crdp->flags,
> > -				  crdp->flags | URCU_CALL_RCU_RUNNING);
> > +			if (&crdp->cbs.head == _CMM_LOAD_SHARED(crdp->cbs.tail))
> > +				call_rcu_wait(crdp);
> >  			poll(NULL, 0, 10);
> > -			call_rcu_unlock(&crdp->mtx);
> >  		}
> >  	}
> >  	call_rcu_lock(&crdp->mtx);
> > @@ -264,11 +275,8 @@ static void call_rcu_data_init(struct ca
> >  		perror("pthread_mutex_init");
> >  		exit(-1);
> >  	}
> > -	if (pthread_cond_init(&crdp->cond, NULL) != 0) {
> > -		perror("pthread_cond_init");
> > -		exit(-1);
> > -	}
> > -	crdp->flags = flags | URCU_CALL_RCU_RUNNING;
> > +	crdp->futex = 0;
> > +	crdp->flags = flags;
> >  	cds_list_add(&crdp->list, &call_rcu_data_list);
> >  	crdp->cpu_affinity = cpu_affinity;
> >  	cmm_smp_mb();  /* Structure initialized before pointer is planted. */
> > @@ -492,16 +500,8 @@ int create_all_cpu_call_rcu_data(unsigne
> >   */
> >  static void wake_call_rcu_thread(struct call_rcu_data *crdp)
> >  {
> > -	if (!(_CMM_LOAD_SHARED(crdp->flags) & URCU_CALL_RCU_RT)) {
> > -		call_rcu_lock(&crdp->mtx);
> > -		if (!(_CMM_LOAD_SHARED(crdp->flags) & URCU_CALL_RCU_RUNNING)) {
> > -			if (pthread_cond_signal(&crdp->cond) != 0) {
> > -				perror("pthread_cond_signal");
> > -				exit(-1);
> > -			}
> > -		}
> > -		call_rcu_unlock(&crdp->mtx);
> > -	}
> > +	if (!(_CMM_LOAD_SHARED(crdp->flags) & URCU_CALL_RCU_RT))
> > +		call_rcu_wake_up(crdp);
> >  }
> >  
> >  /*
> > 
> > 
> > _______________________________________________
> > rp mailing list
> > [email protected]
> > http://svcs.cs.pdx.edu/mailman/listinfo/rp

-- 
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

_______________________________________________
ltt-dev mailing list
[email protected]
http://lists.casi.polymtl.ca/cgi-bin/mailman/listinfo/ltt-dev
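
For readers who want to try the wakeup scheme outside the liburcu tree, below is a
minimal stand-alone sketch of the same futex-based wait/wake-up protocol. It uses C11
atomics and the raw futex(2) syscall in place of liburcu's uatomic_*(), cmm_smp_mb()
and futex_async() wrappers, and the names struct worker, worker_wait(), worker_wake_up()
and queue_empty() are illustrative placeholders, not liburcu API.

/* Sketch only: C11 atomics + raw futex(2); not the liburcu implementation. */
#include <linux/futex.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

struct worker {
	_Atomic int32_t futex;	/* 0: running, -1: about to sleep / sleeping */
};

static long sys_futex(_Atomic int32_t *uaddr, int op, int32_t val)
{
	/* futex(2): remaining arguments (timeout, uaddr2, val3) unused here. */
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

/*
 * Placeholder: a real implementation checks its callback queue here, and must
 * order that check against the futex word (the patch uses cmm_smp_mb(); this
 * sketch relies on the seq_cst defaults of <stdatomic.h>).
 */
static int queue_empty(struct worker *w)
{
	(void)w;
	return 1;
}

/* Worker side: announce intent to sleep, re-check for work, then wait. */
static void worker_wait(struct worker *w)
{
	atomic_store(&w->futex, -1);
	if (queue_empty(w)) {
		/* FUTEX_WAIT blocks only while the word is still -1, so a
		 * concurrent wake-up that already reset it to 0 is not lost. */
		sys_futex(&w->futex, FUTEX_WAIT, -1);
	}
	atomic_store(&w->futex, 0);
}

/* Enqueuer side: after pushing work, sample the word without any mutex and
 * issue a wake-up only if the worker declared itself (about to be) asleep. */
static void worker_wake_up(struct worker *w)
{
	if (atomic_load(&w->futex) == -1) {
		atomic_store(&w->futex, 0);
		sys_futex(&w->futex, FUTEX_WAKE, 1);
	}
}

The point the numbers above illustrate is on the enqueuer side: when the worker is busy,
wake-up degenerates to a single lock-free load of the futex word, which is why dropping
the pthread_cond_signal/mutex pair removes the contention seen in the slow cases.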
