-----Original Message-----
>From: Boqun Feng [mailto:boqun.f...@gmail.com] 
>Sent: 25 January 2018 15:31
>To: Paul E. McKenney <paul...@linux.vnet.ibm.com>
>Cc: liangli...@huawei.com; Guohanjun (Hanjun Guo) <guohan...@huawei.com>; 
>zhangheng (AC) <hen...@huawei.com>; Chenhaibo (Haibo, OS Lab) 
><hb.c...@huawei.com>; lihao.li...@gmail.com; linux-kernel@vger.kernel.org
>Subject: Re: [PATCH RFC 01/16] prcu: Add PRCU implementation
>
>On Wed, Jan 24, 2018 at 10:16:18PM -0800, Paul E. McKenney wrote:
>> On Tue, Jan 23, 2018 at 03:59:26PM +0800, liangli...@huawei.com wrote:
>> > From: Heng Zhang <hen...@huawei.com>
>> > 
>> > This RCU implementation (PRCU) is based on a fast consensus protocol 
>> > published in the following paper:
>> > 
>> > Fast Consensus Using Bounded Staleness for Scalable Read-mostly 
>> > Synchronization.
>> > Haibo Chen, Heng Zhang, Ran Liu, Binyu Zang, and Haibing Guan.
>> > IEEE Transactions on Parallel and Distributed Systems (TPDS), 2016.
>> > https://dl.acm.org/citation.cfm?id=3024114.3024143
>> > 
>> > Signed-off-by: Heng Zhang <hen...@huawei.com>
>> > Signed-off-by: Lihao Liang <liangli...@huawei.com>
>> 
>> A few comments and questions interspersed.
>> 
>>                                                      Thanx, Paul
>> 
>> > ---
>> >  include/linux/prcu.h |  37 +++++++++++++++
>> >  kernel/rcu/Makefile  |   2 +-
>> >  kernel/rcu/prcu.c    | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++
>> >  kernel/sched/core.c  |   2 +
>> >  4 files changed, 165 insertions(+), 1 deletion(-)
>> >  create mode 100644 include/linux/prcu.h
>> >  create mode 100644 kernel/rcu/prcu.c
>> > 
>> > diff --git a/include/linux/prcu.h b/include/linux/prcu.h
>> > new file mode 100644
>> > index 00000000..653b4633
>> > --- /dev/null
>> > +++ b/include/linux/prcu.h
>> > @@ -0,0 +1,37 @@
>> > +#ifndef __LINUX_PRCU_H
>> > +#define __LINUX_PRCU_H
>> > +
>> > +#include <linux/atomic.h>
>> > +#include <linux/mutex.h>
>> > +#include <linux/wait.h>
>> > +
>> > +#define CONFIG_PRCU
>> > +
>> > +struct prcu_local_struct {
>> > +  unsigned int locked;
>> > +  unsigned int online;
>> > +  unsigned long long version;
>> > +};
>> > +
>> > +struct prcu_struct {
>> > +  atomic64_t global_version;
>> > +  atomic_t active_ctr;
>> > +  struct mutex mtx;
>> > +  wait_queue_head_t wait_q;
>> > +};
>> > +
>> > +#ifdef CONFIG_PRCU
>> > +void prcu_read_lock(void);
>> > +void prcu_read_unlock(void);
>> > +void synchronize_prcu(void);
>> > +void prcu_note_context_switch(void);
>> > +
>> > +#else /* #ifdef CONFIG_PRCU */
>> > +
>> > +#define prcu_read_lock() do {} while (0)
>> > +#define prcu_read_unlock() do {} while (0)
>> > +#define synchronize_prcu() do {} while (0)
>> > +#define prcu_note_context_switch() do {} while (0)
>> 
>> If CONFIG_PRCU=n and some code is built that uses PRCU, shouldn't you 
>> get a build error rather than an error-free but inoperative PRCU?
>> 
>> Of course, Peter's question about purpose of the patch set applies 
>> here as well.
>> 
>> > +
>> > +#endif /* #ifdef CONFIG_PRCU */
>> > +#endif /* __LINUX_PRCU_H */
>> > diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
>> > index 23803c7d..8791419c 100644
>> > --- a/kernel/rcu/Makefile
>> > +++ b/kernel/rcu/Makefile
>> > @@ -2,7 +2,7 @@
>> >  # and is generally not a function of system call inputs.
>> >  KCOV_INSTRUMENT := n
>> > 
>> > -obj-y += update.o sync.o
>> > +obj-y += update.o sync.o prcu.o
>> >  obj-$(CONFIG_CLASSIC_SRCU) += srcu.o
>> >  obj-$(CONFIG_TREE_SRCU) += srcutree.o
>> >  obj-$(CONFIG_TINY_SRCU) += srcutiny.o
>> > diff --git a/kernel/rcu/prcu.c b/kernel/rcu/prcu.c
>> > new file mode 100644
>> > index 00000000..a00b9420
>> > --- /dev/null
>> > +++ b/kernel/rcu/prcu.c
>> > @@ -0,0 +1,125 @@
>> > +#include <linux/smp.h>
>> > +#include <linux/prcu.h>
>> > +#include <linux/percpu.h>
>> > +#include <linux/compiler.h>
>> > +#include <linux/sched.h>
>> > +
>> > +#include <asm/barrier.h>
>> > +
>> > +DEFINE_PER_CPU_SHARED_ALIGNED(struct prcu_local_struct, prcu_local);
>> > +
>> > +struct prcu_struct global_prcu = {
>> > +  .global_version = ATOMIC64_INIT(0),
>> > +  .active_ctr = ATOMIC_INIT(0),
>> > +  .mtx = __MUTEX_INITIALIZER(global_prcu.mtx),
>> > +  .wait_q = __WAIT_QUEUE_HEAD_INITIALIZER(global_prcu.wait_q)
>> > +};
>> > +struct prcu_struct *prcu = &global_prcu;
>> > +
>> > +static inline void prcu_report(struct prcu_local_struct *local)
>> > +{
>> > +  unsigned long long global_version;
>> > +  unsigned long long local_version;
>> > +
>> > +  global_version = atomic64_read(&prcu->global_version);
>> > +  local_version = local->version;
>> > +  if (global_version > local_version)
>> > +          cmpxchg(&local->version, local_version, global_version);
>> > +}
>> > +
>> > +void prcu_read_lock(void)
>> > +{
>> > +  struct prcu_local_struct *local;
>> > +
>> > +  local = get_cpu_ptr(&prcu_local);
>> > +  if (!local->online) {
>> > +          WRITE_ONCE(local->online, 1);
>> > +          smp_mb();
>> > +  }
>> > +
>> > +  local->locked++;
>> > +  put_cpu_ptr(&prcu_local);
>> > +}
>> > +EXPORT_SYMBOL(prcu_read_lock);
>> > +
>> > +void prcu_read_unlock(void)
>> > +{
>> > +  int locked;
>> > +  struct prcu_local_struct *local;
>> > +
>> > +  barrier();
>> > +  local = get_cpu_ptr(&prcu_local);
>> > +  locked = local->locked;
>> > +  if (locked) {
>> > +          local->locked--;
>> > +          if (locked == 1)
>> > +                  prcu_report(local);
>> 
>> Is ordering important here?  It looks to me that the compiler could 
>> rearrange some of the accesses within prcu_report() with the 
>> local->locked decrement.  There appears to be some potential for load 
>> and store tearing, though perhaps you have verified that your compiler 
>> avoids this on the architecture that you are using.
>> 
>> > +          put_cpu_ptr(&prcu_local);
>> > +  } else {
>> 
>> Hmmm...  We get here if the RCU read-side critical section was preempted.
>> If none of them are preempted, ->active_ctr remains zero.
>> 
>> > +          put_cpu_ptr(&prcu_local);
>> > +          if (!atomic_dec_return(&prcu->active_ctr))
>> > +                  wake_up(&prcu->wait_q);
>> > +  }
>> > +}
>> > +EXPORT_SYMBOL(prcu_read_unlock);
>> > +
>> > +static void prcu_handler(void *info)
>> > +{
>> > +  struct prcu_local_struct *local;
>> > +
>> > +  local = this_cpu_ptr(&prcu_local);
>> > +  if (!local->locked)
>
>And I think a smp_mb() is needed here, because in the following case:
>
>       CPU 0                                   CPU 1
>       ==================                      ==========================
>       {X is initially 0}
>
>       WRITE_ONCE(X, 1);
>
>       prcu_read_unlock(void):
>         if (locked) {
>                                               synchronize_prcu(void):
>                                                 ...
>                                                 <send IPI to CPU 0>
>           local->locked--;
>         # switch to IPI
>         WRITE_ONCE(local->version,....)
>                                                 <read CPU 0 version to be latest>
>                                                 <return>
>
>                                               r1 = READ_ONCE(X);
>
>r1 could be 0, which breaks RCU guarantees.
>

Thank you.
As far as I know, x86 guarantees that an interrupt is handled only after all
write instructions issued before it have completed, so the smp_mb() would be
redundant on x86.
But I am not sure whether other architectures provide this guarantee. If they
do not, we do need an smp_mb() here.
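
For illustration only, here is a minimal sketch of what that could look like.
This is not part of the posted patch, and the exact barrier placement is my
assumption based on Boqun's scenario above:

static void prcu_handler(void *info)
{
        struct prcu_local_struct *local;

        local = this_cpu_ptr(&prcu_local);
        if (!local->locked) {
                /*
                 * Hypothetical barrier: make the interrupted reader's prior
                 * accesses (e.g. the WRITE_ONCE(X, 1) in the example above)
                 * globally visible before the version update can be observed
                 * by the CPU running synchronize_prcu().
                 */
                smp_mb();
                WRITE_ONCE(local->version,
                           atomic64_read(&prcu->global_version));
        }
}

A matching barrier would presumably also be needed on the synchronize_prcu()
side, as Paul notes further down regarding the busy-wait loop.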

>> > +          WRITE_ONCE(local->version, atomic64_read(&prcu->global_version));
>> > +}
>> > +
>> > +void synchronize_prcu(void)
>> > +{
>> > +  int cpu;
>> > +  cpumask_t cpus;
>> > +  unsigned long long version;
>> > +  struct prcu_local_struct *local;
>> > +
>> > +  version = atomic64_add_return(1, &prcu->global_version);
>> > +  mutex_lock(&prcu->mtx);
>> > +
>> > +  local = get_cpu_ptr(&prcu_local);
>> > +  local->version = version;
>> > +  put_cpu_ptr(&prcu_local);
>> > +
>> > +  cpumask_clear(&cpus);
>> > +  for_each_possible_cpu(cpu) {
>> > +          local = per_cpu_ptr(&prcu_local, cpu);
>> > +          if (!READ_ONCE(local->online))
>> > +                  continue;
>> > +          if (READ_ONCE(local->version) < version) {
>> 
>> On 32-bit systems, given that ->version is long long, you might see 
>> load tearing.  And on some 32-bit systems, the cmpxchg() in 
>> prcu_handler() might not build.
>> 
>
>/me curious about why an atomic64_t is used here for global version. I think 
>maybe 32bit global version still suffices.
>
>Regards,
>Boqun

Because the synchronization latency is low, PRCU can sustain a higher
grace-period frequency. A 32-bit counter would only work correctly for several
years if there are 20+ grace periods per second.
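(For reference, the arithmetic behind that estimate: a 32-bit counter holds
about 4.3 * 10^9 grace periods, and at 20 grace periods per second that is
roughly 2.1 * 10^8 seconds, i.e. about 6.8 years, before it wraps.)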

>
>> Or is the idea that only prcu_handler() updates ->version?  But in 
>> that case, you wouldn't need the READ_ONCE() above.  What am I missing here?
>> 
>> > +                  smp_call_function_single(cpu, prcu_handler, NULL, 0);
>> > +                  cpumask_set_cpu(cpu, &cpus);
>> > +          }
>> > +  }
>> > +
>> > +  for_each_cpu(cpu, &cpus) {
>> > +          local = per_cpu_ptr(&prcu_local, cpu);
>> > +          while (READ_ONCE(local->version) < version)
>> 
>> This ->version read can also tear on some 32-bit systems, and this one 
>> most definitely can race with the prcu_handler() above.  Does the 
>> algorithm operate correctly in that case?  (It doesn't look that way 
>> to me, but I might be missing something.) Or are 32-bit systems excluded?
>> 
>> > +                  cpu_relax();
>> > +  }
>> 
>> I might be missing something, but I believe we need a memory barrier 
>> here on non-TSO systems.  Without that, couldn't we miss a preemption?
>> 
>> > +
>> > +  if (atomic_read(&prcu->active_ctr))
>> > +          wait_event(prcu->wait_q, !atomic_read(&prcu->active_ctr));
>> > +
>> > +  mutex_unlock(&prcu->mtx);
>> > +}
>> > +EXPORT_SYMBOL(synchronize_prcu);
>> > +
>> > +void prcu_note_context_switch(void)
>> > +{
>> > +  struct prcu_local_struct *local;
>> > +
>> > +  local = get_cpu_ptr(&prcu_local);
>> > +  if (local->locked) {
>> > +          atomic_add(local->locked, &prcu->active_ctr);
>> > +          local->locked = 0;
>> > +  }
>> > +  local->online = 0;
>> > +  prcu_report(local);
>> > +  put_cpu_ptr(&prcu_local);
>> > +}
>> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> > index 326d4f88..a308581b 100644
>> > --- a/kernel/sched/core.c
>> > +++ b/kernel/sched/core.c
>> > @@ -15,6 +15,7 @@
>> >  #include <linux/init_task.h>
>> >  #include <linux/context_tracking.h>
>> >  #include <linux/rcupdate_wait.h>
>> > +#include <linux/prcu.h>
>> > 
>> >  #include <linux/blkdev.h>
>> >  #include <linux/kprobes.h>
>> > @@ -3383,6 +3384,7 @@ static void __sched notrace __schedule(bool preempt)
>> > 
>> >    local_irq_disable();
>> >    rcu_note_context_switch(preempt);
>> > +  prcu_note_context_switch();
>> > 
>> >    /*
>> >     * Make sure that signal_pending_state()->signal_pending() below
>> > --
>> > 2.14.1.729.g59c0ea183
>> > 
>> 
>
