Re: [Xen-devel] [RFC v3 2/2] x86/xen: allow privcmd hypercalls to be preempted

2015-01-22 Thread Paul E. McKenney
On Thu, Jan 22, 2015 at 03:16:57PM -0500, Steven Rostedt wrote:
> 
> [ Added Paul McKenney ]
> 
> On Thu, 22 Jan 2015 19:39:13 +0100
> "Luis R. Rodriguez"  wrote:
> 
> > > Why not make this a tracepoint? Then you can enable it only when you
> > > want to. As tracepoints are also hooks, you could add you own code that
> > > hooks to it and does a printk as well. The advantage of doing it via a
> > > tracepoint is that you can turn it on and off regardless of what the
> > > loglevel is set at.
> > 
> > This uses NOKPROBE_SYMBOL and notrace since based on Andy's advice
> > we are not confident that tracing and kprobes are safe to use in what
> > might be an extended RCU quiescent state (i.e. where we're outside
> > irq_enter and irq_exit).
> 
> We have trace_*_rcuidle() for such cases.
> 
> That is, you create the tracepoint just the same, and instead of having
> trace_foo(), if you are in a known area that is outside of rcu viewing,
> you use trace_foo_rcuidle() and it will tell RCU "hey, there's something
> here that may need RCU, so look at me!"

What Steve said!

Also, there is an rcu_is_watching() API member that can tell you
whether or not RCU is paying attention at a given point.  Or test with
CONFIG_PROVE_RCU, in which case lockdep will yell at you if you should
have used the _rcuidle() form of the tracing hooks.  ;-)

Thanx, Paul

> Also, please remove the "notrace", because function tracing goes an
> extra step to not require RCU being visible. The only thing you get
> with notrace is not being able to trace an otherwise traceable function.
> 
> -- Steve
> 
> > 
> > > That is, if there is any practical use for that message. Tracing just
> > > sched_switch will give you the same info.
> > 
> > IMHO it may be more useful if we knew exactly what hypercalls were
> > being preempted but perhaps all that can be left as a secondary
> > exercise and for now I'll just nuke the print.
> 
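For illustration, a minimal sketch of the pattern described above (the
tracepoint name "xen_hypercall_preempt" is hypothetical and would need a
matching TRACE_EVENT() declaration; trace_*_rcuidle() and
rcu_is_watching() are the real APIs):

	/* Hedged sketch only; the tracepoint name is invented. */
	static void note_hypercall_preempt(unsigned long op)
	{
		/*
		 * rcu_is_watching() reports whether RCU is paying
		 * attention at this point.  If it might not be -- for
		 * example, outside irq_enter()/irq_exit() -- use the
		 * _rcuidle() form, which tells RCU to watch for the
		 * duration of the tracepoint.
		 */
		if (rcu_is_watching())
			trace_xen_hypercall_preempt(op);
		else
			trace_xen_hypercall_preempt_rcuidle(op);
	}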




[Xen-devel] [PATCH tip/core/rcu 02/20] x86: Use common outgoing-CPU-notification code

2015-03-03 Thread Paul E. McKenney
From: "Paul E. McKenney" 

This commit replaces the open-coded CPU-offline notification with new
common code.  Among other things, this change avoids calling scheduler
code using RCU from an offline CPU that RCU is ignoring.  It also allows
Xen to notice at online time that the CPU did not go offline correctly.
Note that Xen has the surviving CPU carry out some cleanup operations,
so if the surviving CPU times out, these cleanup operations might have
been carried out while the outgoing CPU was still running.  It might
therefore be unwise to bring this CPU back online, and this commit
avoids doing so.

Signed-off-by: Paul E. McKenney 
Cc: 
Cc: Konrad Rzeszutek Wilk 
Cc: Boris Ostrovsky 
Cc: David Vrabel 
Cc: 
---
 arch/x86/include/asm/cpu.h |  2 --
 arch/x86/include/asm/smp.h |  1 -
 arch/x86/kernel/smpboot.c  | 29 -
 arch/x86/xen/smp.c | 18 --
 4 files changed, 16 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index d2b12988d2ed..bf2caa1dedc5 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action);
 #endif
 #endif
 
-DECLARE_PER_CPU(int, cpu_state);
-
 int mwait_usable(const struct cpuinfo_x86 *);
 
 #endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd1cc3bc835..8cd27e08e23c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,7 +150,6 @@ static inline void arch_send_call_function_ipi_mask(const 
struct cpumask *mask)
 }
 
 void cpu_disable_common(void);
-void cpu_die_common(unsigned int cpu);
 void native_smp_prepare_boot_cpu(void);
 void native_smp_prepare_cpus(unsigned int max_cpus);
 void native_smp_cpus_done(unsigned int max_cpus);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aabc72e..ff24fbd17fe7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,9 +77,6 @@
 #include 
 #include 
 
-/* State of each CPU */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
 EXPORT_SYMBOL(smp_num_siblings);
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused)
lock_vector_lock();
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
-   per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+   cpu_set_state_online(smp_processor_id());
x86_platform.nmi_init();
 
/* enable local interrupts */
@@ -948,7 +945,10 @@ int native_cpu_up(unsigned int cpu, struct task_struct 
*tidle)
 */
mtrr_save_state();
 
-   per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+   /* x86 CPUs take themselves offline, so delayed offline is OK. */
+   err = cpu_check_up_prepare(cpu);
+   if (err && err != -EBUSY)
+   return err;
 
/* the FPU context is blank, nobody can own it */
__cpu_disable_lazy_restore(cpu);
@@ -1191,7 +1191,7 @@ void __init native_smp_prepare_boot_cpu(void)
switch_to_new_gdt(me);
/* already set me in cpu_online_mask in boot_cpu_init() */
cpumask_set_cpu(me, cpu_callout_mask);
-   per_cpu(cpu_state, me) = CPU_ONLINE;
+   cpu_set_state_online(me);
 }
 
 void __init native_smp_cpus_done(unsigned int max_cpus)
@@ -1318,14 +1318,10 @@ static void __ref remove_cpu_from_maps(int cpu)
numa_remove_cpu(cpu);
 }
 
-static DEFINE_PER_CPU(struct completion, die_complete);
-
 void cpu_disable_common(void)
 {
int cpu = smp_processor_id();
 
-   init_completion(&per_cpu(die_complete, smp_processor_id()));
-
remove_siblinginfo(cpu);
 
/* It's now safe to remove this processor from the online map */
@@ -1349,19 +1345,12 @@ int native_cpu_disable(void)
return 0;
 }
 
-void cpu_die_common(unsigned int cpu)
-{
-   wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
-}
-
 void native_cpu_die(unsigned int cpu)
 {
/* We don't do anything here: idle task is faking death itself. */
 
-   cpu_die_common(cpu);
-
/* They ack this in play_dead() by setting CPU_DEAD */
-   if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+   if (cpu_wait_death(cpu, 5)) {
if (system_state == SYSTEM_RUNNING)
pr_info("CPU %u is now offline\n", cpu);
} else {
@@ -1375,10 +1364,8 @@ void play_dead_common(void)
reset_lazy_tlbstate();
amd_e400_remove_cpu(raw_smp_processor_id());
 
-   mb();
/* Ack it */
-   __this_cpu_write(cpu_state, CPU_DEAD);
-   complete(&per_cpu(die_complete, smp_processor_id()));
+   (void)cpu_report_death();
 
/*
 * With physical CPU hotplug, we should halt the cpu
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 08e8489c47f1..e2c7389c58c5 100644
--- a/arch/x86/xen/smp.c
+

Re: [Xen-devel] [PATCH tip/core/rcu 02/20] x86: Use common outgoing-CPU-notification code

2015-03-03 Thread Paul E. McKenney
On Tue, Mar 03, 2015 at 02:17:24PM -0500, Boris Ostrovsky wrote:
> On 03/03/2015 12:42 PM, Paul E. McKenney wrote:
> >  }
> >@@ -511,7 +508,8 @@ static void xen_cpu_die(unsigned int cpu)
> > schedule_timeout(HZ/10);
> > }
> >-cpu_die_common(cpu);
> >+(void)cpu_wait_death(cpu, 5);
> >+/* FIXME: Are the below calls really safe in case of timeout? */
> 
> 
> Not for HVM guests (PV guests will only reach this point after
> target cpu has been marked as down by the hypervisor).
> 
> We need at least to have a message similar to what native_cpu_die()
> prints on cpu_wait_death() failure. And I think we should not call
> the two routines below (three, actually --- there is also
> xen_teardown_timer() below, which is not part of the diff).
> 
> -boris
> 
> 
> > xen_smp_intr_free(cpu);
> > xen_uninit_lock_cpu(cpu);

So something like this, then?

if (cpu_wait_death(cpu, 5)) {
xen_smp_intr_free(cpu);
xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu);
}

Easy change for me to make if so!

Or do I need some other check for HVM-vs.-PV guests, and, if so, what
would that check be?  And also if so, is it OK to online a PV guest's
CPU that timed out during its previous offline?

Thanx, Paul




Re: [Xen-devel] [PATCH tip/core/rcu 02/20] x86: Use common outgoing-CPU-notification code

2015-03-03 Thread Paul E. McKenney
On Tue, Mar 03, 2015 at 03:13:07PM -0500, Boris Ostrovsky wrote:
> On 03/03/2015 02:42 PM, Paul E. McKenney wrote:
> >On Tue, Mar 03, 2015 at 02:17:24PM -0500, Boris Ostrovsky wrote:
> >>On 03/03/2015 12:42 PM, Paul E. McKenney wrote:
> >>>  }
> >>>@@ -511,7 +508,8 @@ static void xen_cpu_die(unsigned int cpu)
> >>>   schedule_timeout(HZ/10);
> >>>   }
> >>>-  cpu_die_common(cpu);
> >>>+  (void)cpu_wait_death(cpu, 5);
> >>>+  /* FIXME: Are the below calls really safe in case of timeout? */
> >>
> >>
> >>Not for HVM guests (PV guests will only reach this point after
> >>target cpu has been marked as down by the hypervisor).
> >>
> >>We need at least to have a message similar to what native_cpu_die()
> >>prints on cpu_wait_death() failure. And I think we should not call
> >>the two routines below (three, actually --- there is also
> >>xen_teardown_timer() below, which is not part of the diff).
> >>
> >>-boris
> >>
> >>
> >>>   xen_smp_intr_free(cpu);
> >>>   xen_uninit_lock_cpu(cpu);
> >
> >So something like this, then?
> >
> > if (cpu_wait_death(cpu, 5)) {
> > xen_smp_intr_free(cpu);
> > xen_uninit_lock_cpu(cpu);
> > xen_teardown_timer(cpu);
> > }
> 
>   else
>   pr_err("CPU %u didn't die...\n", cpu);
> 
> 
> >
> >Easy change for me to make if so!
> >
> >Or do I need some other check for HVM-vs.-PV guests, and, if so, what
> >would that check be?  And also if so, is it OK to online a PV guest's
> >CPU that timed out during its previous offline?
> 
> 
> I believe PV VCPUs will always be CPU_DEAD by the time we get here
> since we are (indirectly) waiting for this in the loop at the
> beginning of xen_cpu_die():
> 
> 'while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu,
> NULL))' will exit only after 'HYPERVISOR_vcpu_op(VCPUOP_down,
> smp_processor_id()' in xen_play_dead(). Which happens after
> play_dead_common() has marked the cpu as CPU_DEAD.
> 
> So no test is needed.

OK, so I have the following patch on top of my previous patch, which
I will merge if testing goes well.  So if a CPU times out going offline,
the above three functions will not be called, the "didn't die" message
will be printed, and any future attempt to online that CPU will fail.
Is that the correct semantics?

Thanx, Paul



diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index e2c7389c58c5..f2a06ff0614d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -508,12 +508,13 @@ static void xen_cpu_die(unsigned int cpu)
schedule_timeout(HZ/10);
}
 
-   (void)cpu_wait_death(cpu, 5);
-   /* FIXME: Are the below calls really safe in case of timeout? */
-
-   xen_smp_intr_free(cpu);
-   xen_uninit_lock_cpu(cpu);
-   xen_teardown_timer(cpu);
+   if (cpu_wait_death(cpu, 5)) {
+   xen_smp_intr_free(cpu);
+   xen_uninit_lock_cpu(cpu);
+   xen_teardown_timer(cpu);
+   } else {
+   pr_err("CPU %u didn't die...\n", cpu);
+   }
 }
 
 static void xen_play_dead(void) /* used only with HOTPLUG_CPU */




Re: [Xen-devel] [PATCH tip/core/rcu 02/20] x86: Use common outgoing-CPU-notification code

2015-03-03 Thread Paul E. McKenney
On Tue, Mar 03, 2015 at 05:06:50PM -0500, Boris Ostrovsky wrote:
> On 03/03/2015 04:26 PM, Paul E. McKenney wrote:
> >On Tue, Mar 03, 2015 at 03:13:07PM -0500, Boris Ostrovsky wrote:
> >>On 03/03/2015 02:42 PM, Paul E. McKenney wrote:
> >>>On Tue, Mar 03, 2015 at 02:17:24PM -0500, Boris Ostrovsky wrote:
> >>>>On 03/03/2015 12:42 PM, Paul E. McKenney wrote:
> >>>>>  }
> >>>>>@@ -511,7 +508,8 @@ static void xen_cpu_die(unsigned int cpu)
> >>>>> schedule_timeout(HZ/10);
> >>>>> }
> >>>>>-cpu_die_common(cpu);
> >>>>>+(void)cpu_wait_death(cpu, 5);
> >>>>>+/* FIXME: Are the below calls really safe in case of timeout? */
> >>>>
> >>>>Not for HVM guests (PV guests will only reach this point after
> >>>>target cpu has been marked as down by the hypervisor).
> >>>>
> >>>>We need at least to have a message similar to what native_cpu_die()
> >>>>prints on cpu_wait_death() failure. And I think we should not call
> >>>>the two routines below (three, actually --- there is also
> >>>>xen_teardown_timer() below, which is not part of the diff).
> >>>>
> >>>>-boris
> >>>>
> >>>>
> >>>>> xen_smp_intr_free(cpu);
> >>>>> xen_uninit_lock_cpu(cpu);
> >>>So something like this, then?
> >>>
> >>>   if (cpu_wait_death(cpu, 5)) {
> >>>   xen_smp_intr_free(cpu);
> >>>   xen_uninit_lock_cpu(cpu);
> >>>   xen_teardown_timer(cpu);
> >>>   }
> >>else
> >>pr_err("CPU %u didn't die...\n", cpu);
> >>
> >>
> >>>Easy change for me to make if so!
> >>>
> >>>Or do I need some other check for HVM-vs.-PV guests, and, if so, what
> >>>would that check be?  And also if so, is it OK to online a PV guest's
> >>>CPU that timed out during its previous offline?
> >>
> >>I believe PV VCPUs will always be CPU_DEAD by the time we get here
> >>since we are (indirectly) waiting for this in the loop at the
> >>beginning of xen_cpu_die():
> >>
> >>'while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu,
> >>NULL))' will exit only after 'HYPERVISOR_vcpu_op(VCPUOP_down,
> >>smp_processor_id()' in xen_play_dead(). Which happens after
> >>play_dead_common() has marked the cpu as CPU_DEAD.
> >>
> >>So no test is needed.
> >OK, so I have the following patch on top of my previous patch, which
> >I will merge if testing goes well.  So if a CPU times out going offline,
> >the above three functions will not be called, the "didn't die" message
> >will be printed, and any future attempt to online that CPU will fail.
> >Is that the correct semantics?
> 
> Yes.
> 
> I am not sure whether not ever onlining the CPU is the best outcome
> but then I don't think trying to online it again with all interrupts
> and such still set up will work well. And it's an improvement over
> what we have now anyway (with current code we may clean up things
> for a non-dead cpu).

Another strategy is to key off of the return value of cpu_check_up_prepare().
If it returns -EBUSY, then the outgoing CPU finished up after the
surviving CPU timed out.  The CPU trying to bring the new CPU online
could (in theory, anyway) do the xen_smp_intr_free(), xen_uninit_lock_cpu(),
and xen_teardown_timer() at that point.

But I must defer to you on this sort of thing.

Thanx, Paul

> Thanks.
> -boris
> 
> 
> >
> > Thanx, Paul
> >
> >
> >
> >diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
> >index e2c7389c58c5..f2a06ff0614d 100644
> >--- a/arch/x86/xen/smp.c
> >+++ b/arch/x86/xen/smp.c
> >@@ -508,12 +508,13 @@ static void xen_cpu_die(unsigned int cpu)
> > schedule_timeout(HZ/10);
> > }
> >-(void)cpu_wait_death(cpu, 5);
> >-/* FIXME: Are the below calls really safe in case of timeout? */
> >-
> >-xen_smp_intr_free(cpu);
> >-xen_uninit_lock_cpu(cpu);
> >-xen_teardown_timer(cpu);
> >+if (cpu_wait_death(cpu, 5)) {
> >+xen_smp_intr_free(cpu);
> >+xen_uninit_lock_cpu(cpu);
> >+xen_teardown_timer(cpu);
> >+} else {
> >+pr_err("CPU %u didn't die...\n", cpu);
> >+}
> >  }
> >  static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
> >
> 




Re: [Xen-devel] [PATCH tip/core/rcu 02/20] x86: Use common outgoing-CPU-notification code

2015-03-04 Thread Paul E. McKenney
On Tue, Mar 03, 2015 at 02:31:51PM -0800, Paul E. McKenney wrote:
> On Tue, Mar 03, 2015 at 05:06:50PM -0500, Boris Ostrovsky wrote:
> > On 03/03/2015 04:26 PM, Paul E. McKenney wrote:
> > >On Tue, Mar 03, 2015 at 03:13:07PM -0500, Boris Ostrovsky wrote:
> > >>On 03/03/2015 02:42 PM, Paul E. McKenney wrote:
> > >>>On Tue, Mar 03, 2015 at 02:17:24PM -0500, Boris Ostrovsky wrote:
> > >>>>On 03/03/2015 12:42 PM, Paul E. McKenney wrote:
> > >>>>>  }
> > >>>>>@@ -511,7 +508,8 @@ static void xen_cpu_die(unsigned int cpu)
> > >>>>>   schedule_timeout(HZ/10);
> > >>>>>   }
> > >>>>>-  cpu_die_common(cpu);
> > >>>>>+  (void)cpu_wait_death(cpu, 5);
> > >>>>>+  /* FIXME: Are the below calls really safe in case of timeout? */
> > >>>>
> > >>>>Not for HVM guests (PV guests will only reach this point after
> > >>>>target cpu has been marked as down by the hypervisor).
> > >>>>
> > >>>>We need at least to have a message similar to what native_cpu_die()
> > >>>>prints on cpu_wait_death() failure. And I think we should not call
> > >>>>the two routines below (three, actually --- there is also
> > >>>>xen_teardown_timer() below, which is not part of the diff).
> > >>>>
> > >>>>-boris
> > >>>>
> > >>>>
> > >>>>>   xen_smp_intr_free(cpu);
> > >>>>>   xen_uninit_lock_cpu(cpu);
> > >>>So something like this, then?
> > >>>
> > >>> if (cpu_wait_death(cpu, 5)) {
> > >>> xen_smp_intr_free(cpu);
> > >>> xen_uninit_lock_cpu(cpu);
> > >>> xen_teardown_timer(cpu);
> > >>> }
> > >>  else
> > >>  pr_err("CPU %u didn't die...\n", cpu);
> > >>
> > >>
> > >>>Easy change for me to make if so!
> > >>>
> > >>>Or do I need some other check for HVM-vs.-PV guests, and, if so, what
> > >>>would that check be?  And also if so, is it OK to online a PV guest's
> > >>>CPU that timed out during its previous offline?
> > >>
> > >>I believe PV VCPUs will always be CPU_DEAD by the time we get here
> > >>since we are (indirectly) waiting for this in the loop at the
> > >>beginning of xen_cpu_die():
> > >>
> > >>'while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu,
> > >>NULL))' will exit only after 'HYPERVISOR_vcpu_op(VCPUOP_down,
> > >>smp_processor_id()' in xen_play_dead(). Which happens after
> > >>play_dead_common() has marked the cpu as CPU_DEAD.
> > >>
> > >>So no test is needed.
> > >OK, so I have the following patch on top of my previous patch, which
> > >I will merge if testing goes well.  So if a CPU times out going offline,
> > >the above three functions will not be called, the "didn't die" message
> > >will be printed, and any future attempt to online that CPU will fail.
> > >Is that the correct semantics?
> > 
> > Yes.
> > 
> > I am not sure whether not ever onlining the CPU is the best outcome
> > but then I don't think trying to online it again with all interrupts
> > and such still set up will work well. And it's an improvement over
> > what we have now anyway (with current code we may clean up things
> > for a non-dead cpu).
> 
> Another strategy is to key off of the return value of cpu_check_up_prepare().
> If it returns -EBUSY, then the outgoing CPU finished up after the
> surviving CPU timed out.  The CPU trying to bring the new CPU online
> could (in theory, anyway) do the xen_smp_intr_free(), xen_uninit_lock_cpu(),
> and xen_teardown_timer() at that point.

And the code for this, in xen_cpu_up(), might look something like the
following:

rc = cpu_check_up_prepare(cpu);
if (rc && rc != -EBUSY)
return rc;
if (rc == -EBUSY) {
xen_smp_intr_free(cpu);
xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu);
}

The idea is that we detect when the CPU eventually took itself offline,
but only did so after the surviving CPU timed out.  (Of course, it
would probably be best to put those three statements into a small
function that is called from both places.)
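A hedged sketch of that small shared function (the helper name is
invented here; the three calls are the ones discussed in this thread):

	/*
	 * Hypothetical helper, name made up for illustration.  Call it
	 * from xen_cpu_die() when the CPU died in time, and from the
	 * CPU-online path when cpu_check_up_prepare() returns -EBUSY,
	 * i.e. when the outgoing CPU finally died only after the
	 * surviving CPU had timed out.
	 */
	static void xen_cpu_dead_cleanup(unsigned int cpu)
	{
		xen_smp_intr_free(cpu);
		xen_uninit_lock_cpu(cpu);
		xen_teardown_timer(cpu);
	}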

I have no idea wh

Re: [Xen-devel] [PATCH tip/core/rcu 02/20] x86: Use common outgoing-CPU-notification code

2015-03-04 Thread Paul E. McKenney
On Wed, Mar 04, 2015 at 09:55:11AM -0500, Boris Ostrovsky wrote:
> On 03/04/2015 09:43 AM, Paul E. McKenney wrote:
> >On Tue, Mar 03, 2015 at 02:31:51PM -0800, Paul E. McKenney wrote:
> >>On Tue, Mar 03, 2015 at 05:06:50PM -0500, Boris Ostrovsky wrote:
> >>>On 03/03/2015 04:26 PM, Paul E. McKenney wrote:
> >>>>On Tue, Mar 03, 2015 at 03:13:07PM -0500, Boris Ostrovsky wrote:
> >>>>>On 03/03/2015 02:42 PM, Paul E. McKenney wrote:
> >>>>>>On Tue, Mar 03, 2015 at 02:17:24PM -0500, Boris Ostrovsky wrote:
> >>>>>>>On 03/03/2015 12:42 PM, Paul E. McKenney wrote:
> >>>>>>>>  }
> >>>>>>>>@@ -511,7 +508,8 @@ static void xen_cpu_die(unsigned int cpu)
> >>>>>>>>  schedule_timeout(HZ/10);
> >>>>>>>>  }
> >>>>>>>>- cpu_die_common(cpu);
> >>>>>>>>+ (void)cpu_wait_death(cpu, 5);
> >>>>>>>>+ /* FIXME: Are the below calls really safe in case of timeout? */
> >>>>>>>Not for HVM guests (PV guests will only reach this point after
> >>>>>>>target cpu has been marked as down by the hypervisor).
> >>>>>>>
> >>>>>>>We need at least to have a message similar to what native_cpu_die()
> >>>>>>>prints on cpu_wait_death() failure. And I think we should not call
> >>>>>>>the two routines below (three, actually --- there is also
> >>>>>>>xen_teardown_timer() below, which is not part of the diff).
> >>>>>>>
> >>>>>>>-boris
> >>>>>>>
> >>>>>>>
> >>>>>>>>  xen_smp_intr_free(cpu);
> >>>>>>>>  xen_uninit_lock_cpu(cpu);
> >>>>>>So something like this, then?
> >>>>>>
> >>>>>>if (cpu_wait_death(cpu, 5)) {
> >>>>>>xen_smp_intr_free(cpu);
> >>>>>>xen_uninit_lock_cpu(cpu);
> >>>>>>xen_teardown_timer(cpu);
> >>>>>>}
> >>>>> else
> >>>>> pr_err("CPU %u didn't die...\n", cpu);
> >>>>>
> >>>>>
> >>>>>>Easy change for me to make if so!
> >>>>>>
> >>>>>>Or do I need some other check for HVM-vs.-PV guests, and, if so, what
> >>>>>>would that check be?  And also if so, is it OK to online a PV guest's
> >>>>>>CPU that timed out during its previous offline?
> >>>>>I believe PV VCPUs will always be CPU_DEAD by the time we get here
> >>>>>since we are (indirectly) waiting for this in the loop at the
> >>>>>beginning of xen_cpu_die():
> >>>>>
> >>>>>'while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu,
> >>>>>NULL))' will exit only after 'HYPERVISOR_vcpu_op(VCPUOP_down,
> >>>>>smp_processor_id()' in xen_play_dead(). Which happens after
> >>>>>play_dead_common() has marked the cpu as CPU_DEAD.
> >>>>>
> >>>>>So no test is needed.
> >>>>OK, so I have the following patch on top of my previous patch, which
> >>>>I will merge if testing goes well.  So if a CPU times out going offline,
> >>>>the above three functions will not be called, the "didn't die" message
> >>>>will be printed, and any future attempt to online that CPU will fail.
> >>>>Is that the correct semantics?
> >>>Yes.
> >>>
> >>>I am not sure whether not ever onlining the CPU is the best outcome
> >>>but then I don't think trying to online it again with all interrupts
> >>>and such still set up will work well. And it's an improvement over
> >>>what we have now anyway (with current code we may clean up things
> >>>for a non-dead cpu).
> >>Another strategy is to key off of the return value of 
> >>cpu_check_up_prepare().
> >>If it returns -EBUSY, then the outgoing CPU finished up after the
> >>surviving CPU timed out.  The CPU trying to bring the new CPU online
> >>could (in theory, anyway) do the xen_smp_intr_free(), xen_uninit_lock_cpu(),
> >>and xen_teardown_timer() at that point.
> >And the co

Re: [Xen-devel] [PATCH tip/core/rcu 02/20] x86: Use common outgoing-CPU-notification code

2015-03-05 Thread Paul E. McKenney
On Thu, Mar 05, 2015 at 04:17:59PM -0500, Boris Ostrovsky wrote:
> On 03/04/2015 10:25 AM, Paul E. McKenney wrote:
> >On Wed, Mar 04, 2015 at 09:55:11AM -0500, Boris Ostrovsky wrote:
> 
> >>The simple solution is to stop calling native_cpu_die() above but
> >>I'd like to use common code in native_cpu_die(). I'll see if I can
> >>carve it out without too much damage to x86.
> >
> >Very good, thank you!  I look forward to seeing your patch.
> 
> How about something like this, on top of your original 02/20 patch
> (this is copy-paste but hopefully it can be applied):

Unfortunately, no joy.  :-(

I was able to fix a couple of whitespace problems, but it still didn't
like this hunk: "@@ -766,12 +778,6 @@".

Could you please send the patch as an attachment, post it on the web
somewhere, or otherwise get me a clean copy of it?

Thanx, Paul

>  arch/x86/include/asm/smp.h |1 +
>  arch/x86/kernel/smpboot.c  |   12 +++-
>  arch/x86/xen/smp.c |   34 --
>  3 files changed, 32 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> index 8cd27e0..a5cb4f6 100644
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -155,6 +155,7 @@ void native_smp_prepare_cpus(unsigned int max_cpus);
>  void native_smp_cpus_done(unsigned int max_cpus);
>  int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
>  int native_cpu_disable(void);
> +int common_cpu_die(unsigned int cpu);
>  void native_cpu_die(unsigned int cpu);
>  void native_play_dead(void);
>  void play_dead_common(void);
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index ff24fbd..c8fa349 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -1345,8 +1345,10 @@ int native_cpu_disable(void)
>   return 0;
>  }
> 
> -void native_cpu_die(unsigned int cpu)
> +int common_cpu_die(unsigned int cpu)
>  {
> + int ret = 0;
> +
>   /* We don't do anything here: idle task is faking death itself. */
> 
>   /* They ack this in play_dead() by setting CPU_DEAD */
> @@ -1355,7 +1357,15 @@ void native_cpu_die(unsigned int cpu)
>   pr_info("CPU %u is now offline\n", cpu);
>   } else {
>   pr_err("CPU %u didn't die...\n", cpu);
> + ret = -1;
>   }
> +
> + return ret;
> +}
> +
> +void native_cpu_die(unsigned int cpu)
> +{
> + common_cpu_die(cpu);
>  }
> 
>  void play_dead_common(void)
> diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
> index e2c7389..1c5e760 100644
> --- a/arch/x86/xen/smp.c
> +++ b/arch/x86/xen/smp.c
> @@ -455,7 +455,10 @@ static int xen_cpu_up(unsigned int cpu, struct
> task_struct *idle)
>   xen_setup_timer(cpu);
>   xen_init_lock_cpu(cpu);
> 
> - /* Xen outgoing CPUs need help cleaning up, so -EBUSY is an error. */
> + /*
> +  * PV VCPUs are always successfully taken down (see 'while' loop
> +  * in xen_cpu_die()), so -EBUSY is an error.
> +  */
>   rc = cpu_check_up_prepare(cpu);
>   if (rc)
>   return rc;
> @@ -508,12 +511,11 @@ static void xen_cpu_die(unsigned int cpu)
>   schedule_timeout(HZ/10);
>   }
> 
> - (void)cpu_wait_death(cpu, 5);
> - /* FIXME: Are the below calls really safe in case of timeout? */
> -
> - xen_smp_intr_free(cpu);
> - xen_uninit_lock_cpu(cpu);
> - xen_teardown_timer(cpu);
> + if (common_cpu_die(cpu) == 0) {
> + xen_smp_intr_free(cpu);
> + xen_uninit_lock_cpu(cpu);
> + xen_teardown_timer(cpu);
> + }
>  }
> 
>  static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
> @@ -745,6 +747,16 @@ static void __init
> xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
>  static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
>  {
>   int rc;
> +
> + /*
> +  * This can happen if CPU was offlined earlier and
> +  * offlining timed out in common_cpu_die().
> +  */
> + if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
> + xen_smp_intr_free(cpu);
> + xen_uninit_lock_cpu(cpu);
> + }
> +
>   /*
>* xen_smp_intr_init() needs to run before native_cpu_up()
>* so that IPI vectors are set up on the booting CPU before
> @@ -766,12 +778,6 @@ static int xen_hvm_cpu_up(unsigned int cpu,
> struct task_struct *tidle)
>   return rc;
>  }
> 
> -static void xen_hvm_cpu_die(unsigned int cpu)
>

[Xen-devel] [PATCH v2 tip/core/rcu 02/22] x86: Use common outgoing-CPU-notification code

2015-03-16 Thread Paul E. McKenney
From: "Paul E. McKenney" 

This commit replaces the open-coded CPU-offline notification with new
common code.  Among other things, this change avoids calling scheduler
code using RCU from an offline CPU that RCU is ignoring.  It also allows
Xen to notice at online time that the CPU did not go offline correctly.
Note that Xen has the surviving CPU carry out some cleanup operations,
so if the surviving CPU times out, these cleanup operations might have
been carried out while the outgoing CPU was still running.  It might
therefore be unwise to bring this CPU back online, and this commit
avoids doing so.

Signed-off-by: Boris Ostrovsky 
Signed-off-by: Paul E. McKenney 
Cc: 
Cc: Konrad Rzeszutek Wilk 
Cc: David Vrabel 
Cc: 
---
 arch/x86/include/asm/cpu.h |  2 --
 arch/x86/include/asm/smp.h |  2 +-
 arch/x86/kernel/smpboot.c  | 39 ++-
 arch/x86/xen/smp.c | 46 +-
 4 files changed, 44 insertions(+), 45 deletions(-)

diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index d2b12988d2ed..bf2caa1dedc5 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action);
 #endif
 #endif
 
-DECLARE_PER_CPU(int, cpu_state);
-
 int mwait_usable(const struct cpuinfo_x86 *);
 
 #endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd1cc3bc835..a5cb4f6e9492 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,12 +150,12 @@ static inline void arch_send_call_function_ipi_mask(const 
struct cpumask *mask)
 }
 
 void cpu_disable_common(void);
-void cpu_die_common(unsigned int cpu);
 void native_smp_prepare_boot_cpu(void);
 void native_smp_prepare_cpus(unsigned int max_cpus);
 void native_smp_cpus_done(unsigned int max_cpus);
 int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_cpu_disable(void);
+int common_cpu_die(unsigned int cpu);
 void native_cpu_die(unsigned int cpu);
 void native_play_dead(void);
 void play_dead_common(void);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aabc72e..c8fa34963ead 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,9 +77,6 @@
 #include 
 #include 
 
-/* State of each CPU */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
 EXPORT_SYMBOL(smp_num_siblings);
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused)
lock_vector_lock();
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
-   per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+   cpu_set_state_online(smp_processor_id());
x86_platform.nmi_init();
 
/* enable local interrupts */
@@ -948,7 +945,10 @@ int native_cpu_up(unsigned int cpu, struct task_struct 
*tidle)
 */
mtrr_save_state();
 
-   per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+   /* x86 CPUs take themselves offline, so delayed offline is OK. */
+   err = cpu_check_up_prepare(cpu);
+   if (err && err != -EBUSY)
+   return err;
 
/* the FPU context is blank, nobody can own it */
__cpu_disable_lazy_restore(cpu);
@@ -1191,7 +1191,7 @@ void __init native_smp_prepare_boot_cpu(void)
switch_to_new_gdt(me);
/* already set me in cpu_online_mask in boot_cpu_init() */
cpumask_set_cpu(me, cpu_callout_mask);
-   per_cpu(cpu_state, me) = CPU_ONLINE;
+   cpu_set_state_online(me);
 }
 
 void __init native_smp_cpus_done(unsigned int max_cpus)
@@ -1318,14 +1318,10 @@ static void __ref remove_cpu_from_maps(int cpu)
numa_remove_cpu(cpu);
 }
 
-static DEFINE_PER_CPU(struct completion, die_complete);
-
 void cpu_disable_common(void)
 {
int cpu = smp_processor_id();
 
-   init_completion(&per_cpu(die_complete, smp_processor_id()));
-
remove_siblinginfo(cpu);
 
/* It's now safe to remove this processor from the online map */
@@ -1349,24 +1345,27 @@ int native_cpu_disable(void)
return 0;
 }
 
-void cpu_die_common(unsigned int cpu)
+int common_cpu_die(unsigned int cpu)
 {
-   wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
-}
+   int ret = 0;
 
-void native_cpu_die(unsigned int cpu)
-{
/* We don't do anything here: idle task is faking death itself. */
 
-   cpu_die_common(cpu);
-
/* They ack this in play_dead() by setting CPU_DEAD */
-   if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+   if (cpu_wait_death(cpu, 5)) {
if (system_state == SYSTEM_RUNNING)
pr_info("CPU %u is now offline\n", cpu);
} else {
pr_err("CPU %u didn't die...\n", cpu);
+   ret = -1;
}
+
+   return ret;
+}
+
+void native_cpu_die

Re: [Xen-devel] [PATCH v3 01/41] lcoking/barriers, arch: Use smp barriers in smp_store_release()

2016-01-12 Thread Paul E. McKenney
On Sun, Jan 10, 2016 at 04:16:32PM +0200, Michael S. Tsirkin wrote:
> From: Davidlohr Bueso 
> 
> With commit b92b8b35a2e ("locking/arch: Rename set_mb() to smp_store_mb()")
> it was made clear that the context of this call (and thus set_mb)
> is strictly for CPU ordering, as opposed to IO. As such all archs
> should use the smp variant of mb(), respecting the semantics and
> saving a mandatory barrier on UP.
> 
> Signed-off-by: Davidlohr Bueso 
> Signed-off-by: Peter Zijlstra (Intel) 
> Cc: 
> Cc: Andrew Morton 
> Cc: Benjamin Herrenschmidt 
> Cc: Heiko Carstens 
> Cc: Linus Torvalds 
> Cc: Paul E. McKenney 
> Cc: Peter Zijlstra 
> Cc: Thomas Gleixner 
> Cc: Tony Luck 
> Cc: d...@stgolabs.net
> Link: 
> http://lkml.kernel.org/r/1445975631-17047-3-git-send-email-d...@stgolabs.net
> Signed-off-by: Ingo Molnar 

Aside from a need for s/lcoking/locking/ in the subject line:

Reviewed-by: Paul E. McKenney 
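
For context, an illustrative sketch (not taken from this patch) of the
canonical smp_store_mb() user, the sleep/wake idiom; only CPU-vs-CPU
ordering is needed there, which is why the smp_mb()-based definition
suffices and saves a mandatory barrier on !SMP builds.  "wake_condition"
is a placeholder for whatever the waiter is checking:

	/*
	 * set_current_state() is defined in terms of smp_store_mb():
	 * store the new task state, then a full CPU barrier so the
	 * condition check below cannot be reordered before the store.
	 * No device I/O is involved, so smp_mb() is sufficient.
	 */
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (wake_condition)
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);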

> ---
>  arch/ia64/include/asm/barrier.h| 2 +-
>  arch/powerpc/include/asm/barrier.h | 2 +-
>  arch/s390/include/asm/barrier.h| 2 +-
>  include/asm-generic/barrier.h  | 2 +-
>  4 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h
> index df896a1..209c4b8 100644
> --- a/arch/ia64/include/asm/barrier.h
> +++ b/arch/ia64/include/asm/barrier.h
> @@ -77,7 +77,7 @@ do {
> \
>   ___p1;  \
>  })
> 
> -#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } 
> while (0)
> +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } 
> while (0)
> 
>  /*
>   * The group barrier in front of the rsm & ssm are necessary to ensure
> diff --git a/arch/powerpc/include/asm/barrier.h 
> b/arch/powerpc/include/asm/barrier.h
> index 0eca6ef..a7af5fb 100644
> --- a/arch/powerpc/include/asm/barrier.h
> +++ b/arch/powerpc/include/asm/barrier.h
> @@ -34,7 +34,7 @@
>  #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
>  #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
> 
> -#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } 
> while (0)
> +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } 
> while (0)
> 
>  #ifdef __SUBARCH_HAS_LWSYNC
>  #define SMPWMB  LWSYNC
> diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
> index d68e11e..7ffd0b1 100644
> --- a/arch/s390/include/asm/barrier.h
> +++ b/arch/s390/include/asm/barrier.h
> @@ -36,7 +36,7 @@
>  #define smp_mb__before_atomic()  smp_mb()
>  #define smp_mb__after_atomic()   smp_mb()
> 
> -#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); 
> mb(); } while (0)
> +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); 
> } while (0)
> 
>  #define smp_store_release(p, v)  
> \
>  do { \
> diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
> index b42afad..0f45f93 100644
> --- a/include/asm-generic/barrier.h
> +++ b/include/asm-generic/barrier.h
> @@ -93,7 +93,7 @@
>  #endif   /* CONFIG_SMP */
> 
>  #ifndef smp_store_mb
> -#define smp_store_mb(var, value)  do { WRITE_ONCE(var, value); mb(); } while 
> (0)
> +#define smp_store_mb(var, value)  do { WRITE_ONCE(var, value); smp_mb(); } 
> while (0)
>  #endif
> 
>  #ifndef smp_mb__before_atomic
> -- 
> MST
> 




Re: [Xen-devel] [PATCH v3 05/41] powerpc: reuse asm-generic/barrier.h

2016-01-12 Thread Paul E. McKenney
On Sun, Jan 10, 2016 at 04:17:09PM +0200, Michael S. Tsirkin wrote:
> On powerpc read_barrier_depends, smp_read_barrier_depends
> smp_store_mb(), smp_mb__before_atomic and smp_mb__after_atomic match the
> asm-generic variants exactly. Drop the local definitions and pull in
> asm-generic/barrier.h instead.
> 
> This is in preparation to refactoring this code area.
> 
> Signed-off-by: Michael S. Tsirkin 
> Acked-by: Arnd Bergmann 

Looks sane to me.

Reviewed-by: Paul E. McKenney 

> ---
>  arch/powerpc/include/asm/barrier.h | 9 ++---
>  1 file changed, 2 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/barrier.h 
> b/arch/powerpc/include/asm/barrier.h
> index a7af5fb..980ad0c 100644
> --- a/arch/powerpc/include/asm/barrier.h
> +++ b/arch/powerpc/include/asm/barrier.h
> @@ -34,8 +34,6 @@
>  #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
>  #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
> 
> -#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } 
> while (0)
> -
>  #ifdef __SUBARCH_HAS_LWSYNC
>  #define SMPWMB  LWSYNC
>  #else
> @@ -60,9 +58,6 @@
>  #define smp_wmb()barrier()
>  #endif /* CONFIG_SMP */
> 
> -#define read_barrier_depends()   do { } while (0)
> -#define smp_read_barrier_depends()   do { } while (0)
> -
>  /*
>   * This is a barrier which prevents following instructions from being
>   * started until the value of the argument x is known.  For example, if
> @@ -87,8 +82,8 @@ do {
> \
>   ___p1;  \
>  })
> 
> -#define smp_mb__before_atomic() smp_mb()
> -#define smp_mb__after_atomic()  smp_mb()
>  #define smp_mb__before_spinlock()   smp_mb()
> 
> +#include 
> +
>  #endif /* _ASM_POWERPC_BARRIER_H */
> -- 
> MST
> 




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-14 Thread Paul E. McKenney
On Thu, Jan 14, 2016 at 12:04:45PM +, Will Deacon wrote:
> On Wed, Jan 13, 2016 at 12:58:22PM -0800, Leonid Yegoshin wrote:
> > On 01/13/2016 12:48 PM, Peter Zijlstra wrote:
> > >On Wed, Jan 13, 2016 at 11:02:35AM -0800, Leonid Yegoshin wrote:
> > >
> > >>I ask HW team about it but I have a question - has it any relationship 
> > >>with
> > >>replacing MIPS SYNC with lightweight SYNCs (SYNC_WMB etc)?
> > >Of course. If you cannot explain the semantics of the primitives you
> > >introduce, how can we judge the patch.
> > >
> > >
> > You missed a point - it is a question about replacement of SYNC with
> > lightweight primitives. It is NOT a question about multithread system
> > behavior without any SYNC. The answer on a latest Will's question lies in
> > different area.
> 
> The reason we (Peter and I) care about this isn't because we enjoy being
> obstructive. It's because there is a whole load of core (i.e. portable)
> kernel code that is written to the *kernel* memory model. For example,
> the scheduler, RCU, mutex implementations, perf, drivers, you name it.
> 
> Consequently, it's important that the architecture back-ends implement
> these portable primitives (e.g. smp_mb()) in a way that satisfies the
> kernel memory model so that core code doesn't need to worry about the
> underlying architecture for synchronisation purposes. You could turn
> around and say "but if MIPS gets it wrong, then that's MIPS's problem",
> but actually not having a general understanding of the ordering guarantees
> provided by each architecture makes it very difficult for us to extend
> the kernel memory model in such a way that it can be implemented
> efficiently across the board *and* relied upon by core code.

What Will said!

Yes, you can cut corners within MIPS architecture-specific code,
but primitives that are used in the core kernel really do need to
work as expected.

Thanx, Paul

> The virtio patch at the start of the thread doesn't particularly concern
> me. It's the other patches you linked to that implement acquire/release
> that have me worried.
> 
> Will
> 




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-14 Thread Paul E. McKenney
On Thu, Jan 14, 2016 at 11:28:18AM -0800, Leonid Yegoshin wrote:
> On 01/14/2016 04:14 AM, Will Deacon wrote:
> >On Wed, Jan 13, 2016 at 02:26:16PM -0800, Leonid Yegoshin wrote:
> >
> >> Moreover, there are voices against guarantee that it will be in future
> >>and that voices point me to Documentation/memory-barriers.txt section "DATA
> >>DEPENDENCY BARRIERS" examples which require SYNC_RMB between loading
> >>address/index and using that for loading data based on that address or index
> >>for shared data (look on CPU2 pseudo-code):
> >>>To deal with this, a data dependency barrier or better must be inserted
> >>>between the address load and the data load:
> >>>
> >>>CPU 1 CPU 2
> >>>===   ===
> >>>{ A == 1, B == 2, C = 3, P == &A, Q == &C }
> >>>B = 4;
> >>>
> >>>WRITE_ONCE(P, &B);
> >>>  Q = READ_ONCE(P);
> >>>   <---
> >>>SYNC_RMB is here
> >>>  D = *Q;
> >>...
> >>>Another example of where data dependency barriers might be required is
> >>>where a
> >>>number is read from memory and then used to calculate the index for an
> >>>array
> >>>access:
> >>>
> >>>CPU 1 CPU 2
> >>>===   ===
> >>>{ M[0] == 1, M[1] == 2, M[3] = 3, P == 0, Q == 3 }
> >>>M[1] = 4;
> >>>
> >>>WRITE_ONCE(P, 1);
> >>>  Q = READ_ONCE(P);
> >>>   <
> >>>SYNC_RMB is here
> >>>  D = M[Q];
> >>That voices say that there is a legitimate reason to relax HW here for
> >>performance if SYNC_RMB is needed anyway to work with this sequence of
> >>shared data.
> >Are you saying that MIPS needs to implement [smp_]read_barrier_depends?
> 
> It is not me, it is Documentation/memory-barriers.txt from kernel sources.
> 
> HW team can't work on voice statements, it should do a work on
> written documents. If that is written (see above the lines which I
> marked by "SYNC_RMB") then anybody should use it and never mind how
> many CPUs/Threads are in play. This examples explicitly requires to
> insert "data dependency barrier" between reading a shared
> pointer/index and using it to fetch a shared data. So, your
> WRC+addr+addr test is a violation of that recommendation.

Perhaps Documentation/memory-barriers.txt needs additional clarification.
It would not be the first time.

If your CPU implicitly maintains ordering based on address and
data dependencies, then you don't need any instructions for
read_barrier_depends() or smp_read_barrier_depends().

The WRC+addr+addr is OK because data dependencies are not required to be
transitive, in other words, they are not required to flow from one CPU to
another without the help of an explicit memory barrier.  Transitivity is
instead supplied by smp_mb() and by smp_store_release()-smp_load_acquire()
chains.  Here is the Linux kernel code for WRC+addr+addr, give or take
(and no, I have no idea why anyone would want to write code like this):

struct foo {
struct foo **a;
};
struct foo b;
struct foo c;
struct foo d;
struct foo e;
struct foo f = { &d };
struct foo g = { &e };
struct foo *x = &b;

void cpu0(void)
{
WRITE_ONCE(x, &f);
}

void cpu1(void)
{
struct foo *p;

p = lockless_dereference(x);
WRITE_ONCE(p->a, &x);
}

void cpu2(void)
{
r1 = lockless_dereference(f.a);
WRITE_ONCE(*r1, &c);
}

It is legal to end the run with x==&f and r1==&x.  To prevent this outcome,
we do the following:

struct foo {
struct foo **a;
};
struct foo b;
struct foo c;
struct foo d;
struct foo e;
struct foo f = { &d };
struct foo g = { &e };
struct foo *x = &b;

void cpu0(void)
{
WRITE_ONCE(x, &f);
}

void cpu1(void)
{
struct foo *p;

p = lockless_dereference(x);
smp_store_release(&p->a, &x); /* Additional ordering. */
}

void cpu2(void)
{
r1 = lockless_dereference(f.a);
WRITE_ONCE(*r1, &c);
}

And I still don't know why anyone would need this sort of code.  ;-)

Alternatively, we pull cpu2() into cpu1():

struct foo {
struct foo **a;
};
struct foo b;
struct foo c;
struct foo d;
struct foo e;
struct foo f = { &d };
struct foo g = { &e };
struct foo *x = &b;

void cpu0(void)
{
WRITE_ONCE(x, &f);
}

void cpu1(void)
{
struct foo *p;

p = lockless

Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-14 Thread Paul E. McKenney
On Thu, Jan 14, 2016 at 09:15:13PM +0100, Peter Zijlstra wrote:
> On Thu, Jan 14, 2016 at 11:42:02AM -0800, Leonid Yegoshin wrote:
> > An the only point - please use an appropriate SYNC_* barriers instead of
> > heavy bold hammer. That stuff was design explicitly to support the
> > requirements of Documentation/memory-barriers.txt
> 
> That's madness. That document changes from version to version as to what
> we _think_ the actual hardware does. It is _NOT_ a specification.

There is work in progress on a specification, but please don't hold
your breath.  And I am not as optimistic as I might be about any formal
specification keeping up with the Linux kernel or with the hardware that
it supports.  But it seems worth a good try.

> You cannot design hardware from that. Its incomplete and fails to
> specify a bunch of things. It not a mathematically sound definition of a
> memory model.
> 
> Please stop referring to that document for what a particular barrier
> _should_ do.  Explain what MIPS does, so we can attempt to integrate
> this knowledge with our knowledge of PPC/ARM/Alpha/x86/etc. and improve
> upon our understanding of hardware and improve the Linux memory model.

Please!

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-14 Thread Paul E. McKenney
On Thu, Jan 14, 2016 at 12:12:53PM -0800, Leonid Yegoshin wrote:
> On 01/14/2016 04:04 AM, Will Deacon wrote:
> >Consequently, it's important that the architecture back-ends
> >implement these portable primitives (e.g. smp_mb()) in a way that
> >satisfies the kernel memory model so that core code doesn't need
> >to worry about the underlying architecture for synchronisation
> >purposes.
> 
> It seems you don't listen me. I said multiple times - MIPS
> implementation of
> SYNC_RMB/SYNC_WMB/SYNC_MB/SYNC_ACQUIRE/SYNC_RELEASE instructions
> matches the description of
> smp_rmb/smp_wmb/smp_mb/sync_acquire/sync_release from
> Documentation/memory-barriers.txt file.
> 
> What else do you want from me - RTL or microArch design for that?

I suspect that it is more likely that we are talking past each other.
This stuff is subtle and although we have better ways of talking about
it than (say) ten years ago, it is subtle.  Two ways of talking about
it are herd and ppcmem.

The overview of ppcmem (AKA armmem and cppmem) is here:
https://www.cl.cam.ac.uk/~pes20/ppcmem/help.html

The intro to herd is here: http://arxiv.org/pdf/1308.6810v5.pdf
It may be downloaded here: http://diy.inria.fr/herd/

As a very rough rule of thumb, herd is faster and easier to use
and ppcmem is more precise.

So SYNC_RMB is intended to implement smp_rmb(), correct?

You could use SYNC_ACQUIRE() to implement read_barrier_depends() and
smp_read_barrier_depends(), but SYNC_RMB probably does not suffice.
The reason for this is that smp_read_barrier_depends() must order the
pointer load against any subsequent read or write through a dereference
of that pointer.  For example:

p = READ_ONCE(gp);
smp_rmb();
r1 = p->a; /* ordered by smp_rmb(). */
p->b = 42; /* NOT ordered by smp_rmb(), BUG!!! */
r2 = x; /* ordered by smp_rmb(), but doesn't need to be. */

In contrast:

p = READ_ONCE(gp);
smp_read_barrier_depends();
r1 = p->a; /* ordered by smp_read_barrier_depends(). */
p->b = 42; /* ordered by smp_read_barrier_depends(). */
r2 = x; /* not ordered by smp_read_barrier_depends(), which is OK. */

Again, if your hardware maintains local ordering for address
and data dependencies, you can have read_barrier_depends() and
smp_read_barrier_depends() be no-ops like they are for most
architectures.

Does that help?

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-14 Thread Paul E. McKenney
On Thu, Jan 14, 2016 at 12:46:43PM -0800, Leonid Yegoshin wrote:
> On 01/14/2016 12:15 PM, Peter Zijlstra wrote:
> >On Thu, Jan 14, 2016 at 11:42:02AM -0800, Leonid Yegoshin wrote:
> >>An the only point - please use an appropriate SYNC_* barriers instead of
> >>heavy bold hammer. That stuff was design explicitly to support the
> >>requirements of Documentation/memory-barriers.txt
> >That's madness. That document changes from version to version as to what
> >we _think_ the actual hardware does. It is _NOT_ a specification.
> >
> >You cannot design hardware from that. Its incomplete and fails to
> >specify a bunch of things. It not a mathematically sound definition of a
> >memory model.
> >
> >Please stop referring to that document for what a particular barrier
> >_should_ do.  Explain what MIPS does, so we can attempt to integrate
> >this knowledge with our knowledge of PPC/ARM/Alpha/x86/etc. and improve
> >upon our understanding of hardware and improve the Linux memory model.
> 
> I am afraid I can't help you here. It is very complicated stuff and
> a model is actually doesn't fit your assumptions about CPUs well
> without some simplifications which are based on what you want to
> have.
> 
> I say that SYNC_ACQUIRE/etc follows what you expect for smp_acquire
> etc (basing on that document). And at least two CPU models were
> tested with my patches (see it in LMO) for that last year and that
> instructions are implemented now in engineering kernel.
> 
> If you have something else in mind, you can ask me. But I prefer to
> do not deviate too much from Documentation/memory-barriers.txt, for
> exam - if it asks to have memory barrier somewhere, then I assume
> the code should have it, and please - don't ask me a test which
> violates the current version of document recommendations.
> 
> For a moment I don't see a significant changes in this document for
> MIPS Arch at least 1.5 year, and the only significant point is that
> MIPS CPU Arch doesn't have yet smp_read_barrier_depends() and
> smp_rmb() should be used instead.

Is SYNC_ACQUIRE a memory-barrier instruction that orders prior loads
against later loads and stores?  If so, and if MIPS does not do
ordering based on address and data dependencies, I suggest making
read_barrier_depends() be a SYNC_ACQUIRE rather than SYNC_RMB.

Thanx, Paul
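
A hedged sketch of the mapping suggested above (not the actual
arch/mips/include/asm/barrier.h code; the SYNC stype encoding is an
assumption taken from the MIPS manual cited later in this thread, and on
CPUs that do honor address/data dependencies these can remain no-ops):

	/*
	 * Illustration only; assumes the MIPS R2+ lightweight SYNC
	 * encodings, with stype 0x11 being SYNC_ACQUIRE.  SYNC_ACQUIRE
	 * orders prior loads against later loads *and* stores, which is
	 * what read_barrier_depends() needs if the hardware does not
	 * order dependent accesses by itself.
	 */
	#define __sync_acquire() \
		__asm__ __volatile__("sync 0x11" : : : "memory")

	#define read_barrier_depends()		__sync_acquire()
	#define smp_read_barrier_depends()	__sync_acquire()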




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-14 Thread Paul E. McKenney
On Thu, Jan 14, 2016 at 01:01:05PM -0800, Leonid Yegoshin wrote:
> I need some time to understand your test examples. However,

Understood.

> On 01/14/2016 12:34 PM, Paul E. McKenney wrote:
> >
> >
> >The WRC+addr+addr is OK because data dependencies are not required to be
> >transitive, in other words, they are not required to flow from one CPU to
> >another without the help of an explicit memory barrier.
> 
> I don't see any reliable way to fit WRC+addr+addr into "DATA
> DEPENDENCY BARRIERS" section recommendation to have data dependency
> barrier between read of a shared pointer/index and read the shared
> data based on that pointer. If you have this two reads, it doesn't
> matter the rest of scenario, you should put the dependency barrier
> in code anyway. If you don't do it in WRC+addr+addr scenario then
> after years it can be easily changed to different scenario which
> fits some of scenario in "DATA DEPENDENCY BARRIERS" section and
> fails.

The trick is that lockless_dereference() contains an
smp_read_barrier_depends():

#define lockless_dereference(p) \
({ \
typeof(p) _p1 = READ_ONCE(p); \
smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
(_p1); \
})

Or am I missing your point?

> >   Transitivity is
> 
> Peter Zijlstra recently wrote: "In particular we're very much all
> 'confused' about the various notions of transitivity". I am confused
> too, so - please use some more simple way to explain your words.
> Sorry, but we need a common ground first.

OK, how about an example?  (Z6.3 in the ppcmem naming scheme.)

int x, y, z;

void cpu0(void)
{
WRITE_ONCE(x, 1);
smp_wmb();
WRITE_ONCE(y, 1);
}

void cpu1(void)
{
WRITE_ONCE(y, 2);
smp_wmb();
WRITE_ONCE(z, 1);
}

void cpu2(void)
{
r1 = READ_ONCE(z);
smp_rmb();
r2 = READ_ONCE(x);
}

If smp_rmb() and smp_wmb() provided transitive ordering, then cpu2()
would see cpu0()'s ordering.  But they do not, so the ordering is
visible at best to the adjacent CPU.  This means that the final value
of y can be 2, while at the same time r1==1 && r2==0.

Now the full barrier, smp_mb(), does provide transitive ordering,
so if the three barriers in the above example are replaced with
smp_mb() the y==2 && r1==1 && r2==0 outcome will be prohibited.

So smp_mb() provides transitivity, as do pairs of smp_store_release()
and smp_load_acquire(), as do RCU grace periods.  The exact interactions
between transitive and non-transitive ordering is a work in progress.
That said, if a series of transitive segments ends in a write, which
connects to a single non-transitive segment starting with a read,
you should be good.  And in fact in the example above, you can replace
the smp_wmb()s with smp_mb() and leave the smp_rmb() and still
prohibit the "cyclic" outcome.
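
To make the release/acquire point concrete, here is a hedged sketch (not
from the original message) of a release-acquire chain; because such
chains are transitive, the outcome r1==1 && r2==0 is forbidden -- once
cpu2() observes cpu1()'s store to z, it must also observe cpu0()'s store
to x:

	int x, y, z;

	void cpu0(void)
	{
		WRITE_ONCE(x, 1);
		smp_store_release(&y, 1);
	}

	void cpu1(void)
	{
		int r0 = smp_load_acquire(&y);

		smp_store_release(&z, r0);
	}

	void cpu2(void)
	{
		int r1 = smp_load_acquire(&z);
		int r2 = READ_ONCE(x);	/* guaranteed 1 whenever r1 == 1 */
	}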

If you want a more formal definition, I must refer you back to the
ppcmem and herd references.

Does that help?

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-14 Thread Paul E. McKenney
On Thu, Jan 14, 2016 at 01:24:34PM -0800, Leonid Yegoshin wrote:
> On 01/14/2016 12:48 PM, Paul E. McKenney wrote:
> >
> >So SYNC_RMB is intended to implement smp_rmb(), correct?
> Yes.
> >
> >You could use SYNC_ACQUIRE() to implement read_barrier_depends() and
> >smp_read_barrier_depends(), but SYNC_RMB probably does not suffice.
> 
> If smp_read_barrier_depends() is used to separate not only two reads
> but read pointer and WRITE basing on that pointer (example below) -
> yes. I just doesn't see any example of this in famous
> Documentation/memory-barriers.txt and had no chance to know what you
> use it in this way too.

Well, Documentation/memory-barriers.txt was intended as a guide for Linux
kernel hackers, and not for hardware architects.  The need for something
more precise has become clear over the past year or two, and I am working
on it with some heavy-duty memory-model folks.  But all previous memory
models have been for a specific CPU architecture, so doing one for the
intersection of several is offering up some complications.  I therefore
cannot yet provide a completion date.

That said, I still suggest use of SYNC_ACQUIRE for read_barrier_depends().

> >The reason for this is that smp_read_barrier_depends() must order the
> >pointer load against any subsequent read or write through a dereference
> >of that pointer.
> 
> I can't see that requirement anywhere in Documents directory. I mean
> - the words "write through a dereference of that pointer" or similar
> for smp_read_barrier_depends.

No worries, I will add one.  Please see the end of this message for an
initial patch.

Please understand that Documentation/memory-barriers.txt is a living
document:

v4.4: Two changes
v4.3: Three changes
v4.2: Six changes
v4.1: Three changes
v4.0: Two changes

It tends to change as we locate corner cases either in hardware or
in software use cases/APIs.

> >   For example:
> >
> > p = READ_ONCE(gp);
> > smp_rmb();
> > r1 = p->a; /* ordered by smp_rmb(). */
> > p->b = 42; /* NOT ordered by smp_rmb(), BUG!!! */
> > r2 = x; /* ordered by smp_rmb(), but doesn't need to be. */
> >
> >In contrast:
> >
> > p = READ_ONCE(gp);
> > smp_read_barrier_depends();
> > r1 = p->a; /* ordered by smp_read_barrier_depends(). */
> > p->b = 42; /* ordered by smp_read_barrier_depends(). */
> > r2 = x; /* not ordered by smp_read_barrier_depends(), which is OK. */
> >
> >Again, if your hardware maintains local ordering for address
> >and data dependencies, you can have read_barrier_depends() and
> >smp_read_barrier_depends() be no-ops like they are for most
> >architectures.
> 
> It is not so simple, I mean "local ordering for address and data
> dependencies". Local ordering is NOT enough. It happens that current
> MIPS R6 doesn't require in your example smp_read_barrier_depends()
> but in discussion it comes out that it may not. Because without
> smp_read_barrier_depends() your example can be a part of Will's
> WRC+addr+addr and we found some design which easily can bump into
> this test. And that design actually performs "local ordering for
> address and data dependencies" too.

As noted in another email in this thread, I do not believe that
WRC+addr+addr needs to be prohibited.  Sounds like Will and I need to
get our story straight, though.

Will?

Thanx, Paul



commit 955720966e216b00613fcf60188d507c103f0e80
Author: Paul E. McKenney 
Date:   Thu Jan 14 14:17:04 2016 -0800

documentation: Subsequent writes ordered by rcu_dereference()

The current memory-barriers.txt does not address the possibility of
a write to a dereferenced pointer.  This should be rare, but when it
happens, we need that write -not- to be clobbered by the initialization.
This commit therefore adds an example showing a data dependency ordering
a later data-dependent write.

Reported-by: Leonid Yegoshin 
Signed-off-by: Paul E. McKenney 

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index f49c15f7864f..c66ba46d8079 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -555,6 +555,30 @@ between the address load and the data load:
 This enforces the occurrence of one of the two implications, and prevents the
 third possibility from arising.
 
+A data-dependency barrier must also order against dependent writes:
+
+   CPU 1 CPU 2
+   ===   ===
+   { A == 1, B == 2, C = 3, P == &A, Q == &C }
+   B = 4;
+   
+   WRITE_

Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-14 Thread Paul E. McKenney
On Thu, Jan 14, 2016 at 01:45:44PM -0800, Leonid Yegoshin wrote:
> On 01/14/2016 01:34 PM, Paul E. McKenney wrote:
> >On Thu, Jan 14, 2016 at 12:46:43PM -0800, Leonid Yegoshin wrote:
> >>On 01/14/2016 12:15 PM, Peter Zijlstra wrote:
> >>>On Thu, Jan 14, 2016 at 11:42:02AM -0800, Leonid Yegoshin wrote:
> >>>>An the only point - please use an appropriate SYNC_* barriers instead of
> >>>>heavy bold hammer. That stuff was design explicitly to support the
> >>>>requirements of Documentation/memory-barriers.txt
> >>>That's madness. That document changes from version to version as to what
> >>>we _think_ the actual hardware does. It is _NOT_ a specification.
> >>>
> >>>You cannot design hardware from that. Its incomplete and fails to
> >>>specify a bunch of things. It not a mathematically sound definition of a
> >>>memory model.
> >>>
> >>>Please stop referring to that document for what a particular barrier
> >>>_should_ do.  Explain what MIPS does, so we can attempt to integrate
> >>>this knowledge with our knowledge of PPC/ARM/Alpha/x86/etc. and improve
> >>>upon our understanding of hardware and improve the Linux memory model.
> >>I am afraid I can't help you here. It is very complicated stuff and
> >>a model is actually doesn't fit your assumptions about CPUs well
> >>without some simplifications which are based on what you want to
> >>have.
> >>
> >>I say that SYNC_ACQUIRE/etc follows what you expect for smp_acquire
> >>etc (basing on that document). And at least two CPU models were
> >>tested with my patches (see it in LMO) for that last year and that
> >>instructions are implemented now in engineering kernel.
> >>
> >>If you have something else in mind, you can ask me. But I prefer to
> >>do not deviate too much from Documentation/memory-barriers.txt, for
> >>exam - if it asks to have memory barrier somewhere, then I assume
> >>the code should have it, and please - don't ask me a test which
> >>violates the current version of document recommendations.
> >>
> >>For a moment I don't see a significant changes in this document for
> >>MIPS Arch at least 1.5 year, and the only significant point is that
> >>MIPS CPU Arch doesn't have yet smp_read_barrier_depends() and
> >>smp_rmb() should be used instead.
> 
> >Is SYNC_ACQUIRE a memory-barrier instruction that orders prior loads
> >against later loads and stores?
> 
> Yes, it is in MD00087 (table 6.6 of document Ver 6.04) -
> https://imgtec.com/?do-download=4302

OK, it does look like it should work.  Of course, if you can rely
on straight address/data dependencies, that would be even better.

> >   If so, and if MIPS does not do
> >ordering based on address and data dependencies, I suggest making
> >read_barrier_depends() be a SYNC_ACQUIRE rather than SYNC_RMB.
> 
> I understood that, after I see the example of using it.
> Please consider to add that into Documentation/memory-barriers.txt
> (it is not easy to find that this barrier is used for shared WRITE
> basing on shared pointer), it would be helpful.

Actually, the Linux kernel doesn't have an acquire barrier, just an
smp_load_acquire().  Or did someone sneak one in while I wasn't looking?  ;-)
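
(For reference, the asm-generic fallbacks are just a plain access plus a
full smp_mb() -- roughly the following, omitting the type-checking
assertions.  Architectures with real acquire/release instructions of
course override these.)

	#define smp_store_release(p, v)					\
	do {								\
		smp_mb();						\
		WRITE_ONCE(*p, v);					\
	} while (0)

	#define smp_load_acquire(p)					\
	({								\
		typeof(*p) ___p1 = READ_ONCE(*p);			\
		smp_mb();						\
		___p1;							\
	})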

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-14 Thread Paul E. McKenney
On Thu, Jan 14, 2016 at 01:36:50PM -0800, Leonid Yegoshin wrote:
> On 01/14/2016 01:29 PM, Paul E. McKenney wrote:
> >
> >>On 01/14/2016 12:34 PM, Paul E. McKenney wrote:
> >>>
> >>>The WRC+addr+addr is OK because data dependencies are not required to be
> >>>transitive, in other words, they are not required to flow from one CPU to
> >>>another without the help of an explicit memory barrier.
> >>I don't see any reliable way to fit WRC+addr+addr into "DATA
> >>DEPENDENCY BARRIERS" section recommendation to have data dependency
> >>barrier between read of a shared pointer/index and read the shared
> >>data based on that pointer. If you have this two reads, it doesn't
> >>matter the rest of scenario, you should put the dependency barrier
> >>in code anyway. If you don't do it in WRC+addr+addr scenario then
> >>after years it can be easily changed to different scenario which
> >>fits some of scenario in "DATA DEPENDENCY BARRIERS" section and
> >>fails.
> >The trick is that lockless_dereference() contains an
> >smp_read_barrier_depends():
> >
> >#define lockless_dereference(p) \
> >({ \
> > typeof(p) _p1 = READ_ONCE(p); \
> > smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
> > (_p1); \
> >})
> >
> >Or am I missing your point?
> 
> WRC+addr+addr has no any barrier. lockless_dereference() has a
> barrier. I don't see a common points between this and that in your
> answer, sorry.

Me, I am wondering what WRC+addr+addr has to do with anything at all.



OK, so it looks like Will was asking not about WRC+addr+addr, but instead
about WRC+sync+addr.  This would drop an smp_mb() into cpu2() in my
earlier example, which needs to provide ordering.

I am guessing that the manual's "Older instructions which must be globally
performed when the SYNC instruction completes" provides the equivalent
of ARM/Power A-cumulativity, which can be thought of as transitivity
backwards in time.  This leads me to believe that your smp_mb() needs
to use SYNC rather than SYNC_MB, as was the subject of earlier spirited
discussion in this thread.

Suppose you have something like this:

void cpu0(void)
{
WRITE_ONCE(a, 1);
SYNC_MB();
r0 = READ_ONCE(b);
}

void cpu1(void)
{
WRITE_ONCE(b, 1);
SYNC_MB();
r1 = READ_ONCE(c);
}

void cpu2(void)
{
WRITE_ONCE(c, 1);
SYNC_MB();
r2 = READ_ONCE(d);
}

void cpu3(void)
{
WRITE_ONCE(d, 1);
SYNC_MB();
r3 = READ_ONCE(a);
}

Does your hardware guarantee that it is not possible for all of r0,
r1, r2, and r3 to be equal to zero at the end of the test, assuming
that a, b, c, and d are all initially zero, and the four functions
above run concurrently?  There are many similar litmus tests for other
combinations of reads and writes, but this is perhaps the nastiest from
a hardware viewpoint.  Does SYNC_MB() provide sufficient ordering for
this sort of situation?

Another (more academic) case is this one, with x and y initially zero:

void cpu0(void)
{
WRITE_ONCE(x, 1);
}

void cpu1(void)
{
WRITE_ONCE(y, 1);
}

void cpu2(void)
{
r1 = READ_ONCE(x);
SYNC_MB();
r2 = READ_ONCE(y);
}

void cpu3(void)
{
r3 = READ_ONCE(y);
SYNC_MB();
r4 = READ_ONCE(x);
}

Does SYNC_MB() prohibit r1 == 1 && r2 == 0 && r3 == 1 && r4 == 0?

Now, I don't know of any specific use cases for this pattern, but it
is greatly beloved of some of the old-school concurrency community,
so it is likely to crop up at some point, despite my best efforts.  :-/

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-14 Thread Paul E. McKenney
On Thu, Jan 14, 2016 at 03:33:40PM -0800, Leonid Yegoshin wrote:
> On 01/14/2016 02:55 PM, Paul E. McKenney wrote:
> >OK, so it looks like Will was asking not about WRC+addr+addr, but instead
> >about WRC+sync+addr.
> (He actually asked twice about this and that too but skip this)

Fair enough!  ;-)

> >I am guessing that the manual's "Older instructions which must be globally
> >performed when the SYNC instruction completes" provides the equivalent
> >of ARM/Power A-cumulativity, which can be thought of as transitivity
> >backwards in time.  This leads me to believe that your smp_mb() needs
> >to use SYNC rather than SYNC_MB, as was the subject of earlier spirited
> >discussion in this thread.
> 
> Don't be fooled here by the words "ordered" and "completed" - these are
> HW design terms, and that text is actually written poorly.
> Just assume that SYNC_MB is absolutely the same as SYNC for any CPU
> and coherent device (besides performance). The difference can be in
> non-coherent devices, because SYNC actually tries to make a barrier
> for them too. In some SoCs it is just the same because there is no
> need to barrier a non-coherent device (device register access is
> usually strictly ordered... if there is no bridge in between).

So smp_mb() can be SYNC_MB.  However, mb() needs to be SYNC for MMIO
purposes, correct?
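
In other words, something like the following mapping -- a sketch only,
with the stype encodings (0x10 for SYNC_MB, 0x13 for SYNC_RMB, 0x4 for
SYNC_WMB) taken from my reading of the manual rather than from a tested
patch:

	/* Device/MMIO ordering: full SYNC (stype 0). */
	#define mb()		__asm__ __volatile__("sync" : : : "memory")

	/* CPU-to-CPU ordering only: lightweight SYNCs. */
	#define smp_mb()	__asm__ __volatile__("sync 0x10" : : : "memory")
	#define smp_rmb()	__asm__ __volatile__("sync 0x13" : : : "memory")
	#define smp_wmb()	__asm__ __volatile__("sync 0x4" : : : "memory")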

> >Suppose you have something like this:
> >...
> >Does your hardware guarantee that it is not possible for all of r0,
> >r1, r2, and r3 to be equal to zero at the end of the test, assuming
> >that a, b, c, and d are all initially zero, and the four functions
> >above run concurrently?
> 
> It is assumed to be so from Arch point of view. HW bugs are
> possible, of course.

Indeed!

> >Another (more academic) case is this one, with x and y initially zero:
> >
> >...
> >Does SYNC_MB() prohibit r1 == 1 && r2 == 0 && r3 == 1 && r4 == 0?
> 
> It is assumed to be so from Arch point of view. HW bugs are
> possible, of course.

Looks to me like smp_mb() can be SYNC_MB, then.

> Note: I am not sure about ANY past MIPS R2 CPU because that stuff is
> implemented some time but nobody made it in Linux kernel (it was
> used by some vendor for non-Linux system). For that reason my patch
> for lightweight SYNCs has an option - implement it or implement a
> generic SYNC. It is possible that some vendor did it in different
> way but nobody knows or test it. But as a minimum - SYNC must be
> implemented in spinlocks/atomics/bitops, in recent P5600 it is
> proven that read can pass write in atomics.
> 
> MIPS R6 is a different story, I verified lightweight SYNCs from the
> beginning and it also should use SYNCs.

So you need to build a different kernel for some types of MIPS systems?
Or do you do boot-time rewriting, like a number of other arches do?

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-15 Thread Paul E. McKenney
On Fri, Jan 15, 2016 at 09:55:54AM +0100, Peter Zijlstra wrote:
> On Thu, Jan 14, 2016 at 01:29:13PM -0800, Paul E. McKenney wrote:
> > So smp_mb() provides transitivity, as do pairs of smp_store_release()
> > and smp_load_acquire(),
> 
> But they provide different grades of transitivity, which is where all
> the confusion lays.
> 
> smp_mb() is strongly/globally transitive, all CPUs will agree on the order.
> 
> Whereas the RCpc release+acquire is weakly so, only the two cpus
> involved in the handover will agree on the order.

Good point!

Using grace periods in place of smp_mb() also provides strong/global
transitivity, but also insanely high latencies.  ;-)

The patch below updates Documentation/memory-barriers.txt to define
local vs. global transitivity.  The corresponding ppcmem litmus test
is included below as well.

Should we start putting litmus tests for the various examples
somewhere, perhaps in a litmus-tests directory within each participating
architecture?  I have a pile of powerpc-related litmus tests on my laptop,
but they probably aren't doing all that much good there.

Thanx, Paul



PPC local-transitive
""
{
0:r1=1; 0:r2=u; 0:r3=v; 0:r4=x; 0:r5=y; 0:r6=z;
1:r1=1; 1:r2=u; 1:r3=v; 1:r4=x; 1:r5=y; 1:r6=z;
2:r1=1; 2:r2=u; 2:r3=v; 2:r4=x; 2:r5=y; 2:r6=z;
3:r1=1; 3:r2=u; 3:r3=v; 3:r4=x; 3:r5=y; 3:r6=z;
}
 P0   | P1   | P2   | P3   ;
 lwz r9,0(r4) | lwz r9,0(r5) | lwz r9,0(r6) | stw r1,0(r3) ;
 lwsync   | lwsync   | lwsync   | sync ;
 stw r1,0(r2) | lwz r8,0(r3) | stw r1,0(r7) | lwz r9,0(r2) ;
 lwsync   | lwz r7,0(r2) |  |  ;
 stw r1,0(r5) | lwsync   |  |  ;
  | stw r1,0(r6) |  |  ;
exists
(* (0:r9=0 /\ 1:r9=1 /\ 2:r9=1 /\ 1:r8=0 /\ 3:r9=0) *)
(* (0:r9=1 /\ 1:r9=1 /\ 2:r9=1) *)
(* (0:r9=0 /\ 1:r9=1 /\ 2:r9=1 /\ 1:r7=0) *)
(0:r9=0 /\ 1:r9=1 /\ 2:r9=1 /\ 1:r7=0)



commit 2cb4e83a1b5c89c8e39b8a64bd89269d05913e41
Author: Paul E. McKenney 
Date:   Fri Jan 15 09:30:42 2016 -0800

documentation: Distinguish between local and global transitivity

The introduction of smp_load_acquire() and smp_store_release() had
the side effect of introducing a weaker notion of transitivity:
The transitivity of full smp_mb() barriers is global, but that
of smp_store_release()/smp_load_acquire() chains is local.  This
commit therefore introduces the notion of local transitivity and
gives an example.

Reported-by: Peter Zijlstra 
    Reported-by: Will Deacon 
Signed-off-by: Paul E. McKenney 

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index c66ba46d8079..d8109ed99342 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1318,8 +1318,82 @@ or a level of cache, CPU 2 might have early access to CPU 1's writes.
 General barriers are therefore required to ensure that all CPUs agree
 on the combined order of CPU 1's and CPU 2's accesses.
 
-To reiterate, if your code requires transitivity, use general barriers
-throughout.
+General barriers provide "global transitivity", so that all CPUs will
+agree on the order of operations.  In contrast, a chain of release-acquire
+pairs provides only "local transitivity", so that only those CPUs on
+the chain are guaranteed to agree on the combined order of the accesses.
+For example, switching to C code in deference to Herman Hollerith:
+
+   int u, v, x, y, z;
+
+   void cpu0(void)
+   {
+   r0 = smp_load_acquire(&x);
+   WRITE_ONCE(u, 1);
+   smp_store_release(&y, 1);
+   }
+
+   void cpu1(void)
+   {
+   r1 = smp_load_acquire(&y);
+   r4 = READ_ONCE(v);
+   r5 = READ_ONCE(u);
+   smp_store_release(&z, 1);
+   }
+
+   void cpu2(void)
+   {
+   r2 = smp_load_acquire(&z);
+   smp_store_release(&x, 1);
+   }
+
+   void cpu3(void)
+   {
+   WRITE_ONCE(v, 1);
+   smp_mb();
+   r3 = READ_ONCE(u);
+   }
+
+Because cpu0(), cpu1(), and cpu2() participate in a local transitive
+chain of smp_store_release()/smp_load_acquire() pairs, the following
+outcome is prohibited:
+
+   r0 == 1 && r1 == 1 && r2 == 1
+
+Furthermore, because of the release-acquire relationship between cpu0()
+and cpu1(), cpu1() must see cpu0()'s writes, so that the following
+outcome is prohibited:
+
+   r1 == 1 && r5 == 0
+
+However, the transitivity of release-acquire is local to the participating
+CPUs and does not a

Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-15 Thread Paul E. McKenney
On Fri, Jan 15, 2016 at 10:13:48AM +0100, Peter Zijlstra wrote:
> On Fri, Jan 15, 2016 at 09:55:54AM +0100, Peter Zijlstra wrote:
> > On Thu, Jan 14, 2016 at 01:29:13PM -0800, Paul E. McKenney wrote:
> > > So smp_mb() provides transitivity, as do pairs of smp_store_release()
> > > and smp_load_acquire(),
> > 
> > But they provide different grades of transitivity, which is where all
> > the confusion lays.
> > 
> > smp_mb() is strongly/globally transitive, all CPUs will agree on the order.
> > 
> > Whereas the RCpc release+acquire is weakly so, only the two cpus
> > involved in the handover will agree on the order.
> 
> And the stuff we're confused about is how best to express the difference
> and guarantees of these two forms of transitivity and how exactly they
> interact.

Hoping my memory-barrier.txt patch helps here...

> And smp_load_acquire()/smp_store_release() are RCpc because TSO archs
> and PPC. the atomic*_{acquire,release}() are RCpc because PPC and
> LOCK,UNLOCK are similarly RCpc because of PPC.
> 
> Now we'd like PPC to stick a SYNC in either LOCK or UNLOCK so at least
> the locks are RCsc again, but they resist for performance reasons but
> waver because they don't want to be the ones finding all the nasty bugs
> because they're the only one.

I believe that the relevant proverb said something about starving to
death between two bales of hay...  ;-)

> Now the thing I worry about, and still have not had an answer to is if
> weakly ordered MIPS will end up being RCsc or RCpc for their locks if
> they get implemented with SYNC_ACQUIRE and SYNC_RELEASE instead of the
> current SYNC.

It would be good to have better clarity on this, no two ways about it.

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-15 Thread Paul E. McKenney
On Fri, Jan 15, 2016 at 10:24:32AM +, Will Deacon wrote:
> On Thu, Jan 14, 2016 at 02:55:10PM -0800, Paul E. McKenney wrote:
> > On Thu, Jan 14, 2016 at 01:36:50PM -0800, Leonid Yegoshin wrote:
> > > On 01/14/2016 01:29 PM, Paul E. McKenney wrote:
> > > >
> > > >>On 01/14/2016 12:34 PM, Paul E. McKenney wrote:
> > > >>>
> > > >>>The WRC+addr+addr is OK because data dependencies are not required to 
> > > >>>be
> > > >>>transitive, in other words, they are not required to flow from one CPU 
> > > >>>to
> > > >>>another without the help of an explicit memory barrier.
> > > >>I don't see any reliable way to fit WRC+addr+addr into "DATA
> > > >>DEPENDENCY BARRIERS" section recommendation to have data dependency
> > > >>barrier between read of a shared pointer/index and read the shared
> > > >>data based on that pointer. If you have this two reads, it doesn't
> > > >>matter the rest of scenario, you should put the dependency barrier
> > > >>in code anyway. If you don't do it in WRC+addr+addr scenario then
> > > >>after years it can be easily changed to different scenario which
> > > >>fits some of scenario in "DATA DEPENDENCY BARRIERS" section and
> > > >>fails.
> > > >The trick is that lockless_dereference() contains an
> > > >smp_read_barrier_depends():
> > > >
> > > >#define lockless_dereference(p) \
> > > >({ \
> > > > typeof(p) _p1 = READ_ONCE(p); \
> > > > smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
> > > > (_p1); \
> > > >})
> > > >
> > > >Or am I missing your point?
> > > 
> > > WRC+addr+addr has no any barrier. lockless_dereference() has a
> > > barrier. I don't see a common points between this and that in your
> > > answer, sorry.
> > 
> > Me, I am wondering what WRC+addr+addr has to do with anything at all.
> 
> See my earlier reply [1] (but also, your WRC Linux example looks more
> like a variant on WWC and I couldn't really follow it).

I will revisit my WRC Linux example.  And yes, creating litmus tests
that use non-fake dependencies is still a bit of an undertaking.  :-/
I am sure that it will seem more natural with time and experience...

> > 
> > 
> > OK, so it looks like Will was asking not about WRC+addr+addr, but instead
> > about WRC+sync+addr.  This would drop an smp_mb() into cpu2() in my
> > earlier example, which needs to provide ordering.
> > 
> > I am guessing that the manual's "Older instructions which must be globally
> > performed when the SYNC instruction completes" provides the equivalent
> > of ARM/Power A-cumulativity, which can be thought of as transitivity
> > backwards in time. 
> 
> I couldn't make that leap. In particular, the manual's "Detailed
> Description" sections explicitly refer to program-order:
> 
>   Every synchronizable specified memory instruction (loads or stores or
>   both) that occurs in the instruction stream before the SYNC
>   instruction must reach a stage in the load/store datapath after which
>   no instruction re-ordering is possible before any synchronizable
>   specified memory instruction which occurs after the SYNC instruction
>   in the instruction stream reaches the same stage in the load/store
>   datapath.
> 
> Will
> 
> [1] 
> http://lists.infradead.org/pipermail/linux-arm-kernel/2016-January/399765.html

All good points.  I think we all agree that the MIPS documentation could
use significant help.  And given that I work for the company that produced
the analogous documentation for PowerPC, that is saying something.  ;-)

We simply can't know if MIPS's memory ordering is sufficient for the
Linux kernel given its current implementation of the ordering primitives
and its current documentation.

I feel a bit better than I did earlier due to Leonid's response to my
earlier litmus-test examples.  But I do recommend some serious stress
testing of MIPS on a good set of litmus tests.  Much nicer finding issues
that way than as random irreproducible strange behavior!

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-15 Thread Paul E. McKenney
On Fri, Jan 15, 2016 at 09:54:01AM -0800, Paul E. McKenney wrote:
> On Fri, Jan 15, 2016 at 10:24:32AM +, Will Deacon wrote:
> > On Thu, Jan 14, 2016 at 02:55:10PM -0800, Paul E. McKenney wrote:
> > > On Thu, Jan 14, 2016 at 01:36:50PM -0800, Leonid Yegoshin wrote:
> > > > On 01/14/2016 01:29 PM, Paul E. McKenney wrote:
> > > > >
> > > > >>On 01/14/2016 12:34 PM, Paul E. McKenney wrote:
> > > > >>>
> > > > >>>The WRC+addr+addr is OK because data dependencies are not required 
> > > > >>>to be
> > > > >>>transitive, in other words, they are not required to flow from one 
> > > > >>>CPU to
> > > > >>>another without the help of an explicit memory barrier.
> > > > >>I don't see any reliable way to fit WRC+addr+addr into "DATA
> > > > >>DEPENDENCY BARRIERS" section recommendation to have data dependency
> > > > >>barrier between read of a shared pointer/index and read the shared
> > > > >>data based on that pointer. If you have this two reads, it doesn't
> > > > >>matter the rest of scenario, you should put the dependency barrier
> > > > >>in code anyway. If you don't do it in WRC+addr+addr scenario then
> > > > >>after years it can be easily changed to different scenario which
> > > > >>fits some of scenario in "DATA DEPENDENCY BARRIERS" section and
> > > > >>fails.
> > > > >The trick is that lockless_dereference() contains an
> > > > >smp_read_barrier_depends():
> > > > >
> > > > >#define lockless_dereference(p) \
> > > > >({ \
> > > > >   typeof(p) _p1 = READ_ONCE(p); \
> > > > >   smp_read_barrier_depends(); /* Dependency order vs. p above. */ 
> > > > > \
> > > > >   (_p1); \
> > > > >})
> > > > >
> > > > >Or am I missing your point?
> > > > 
> > > > WRC+addr+addr has no any barrier. lockless_dereference() has a
> > > > barrier. I don't see a common points between this and that in your
> > > > answer, sorry.
> > > 
> > > Me, I am wondering what WRC+addr+addr has to do with anything at all.
> > 
> > See my earlier reply [1] (but also, your WRC Linux example looks more
> > like a variant on WWC and I couldn't really follow it).
> 
> I will revisit my WRC Linux example.  And yes, creating litmus tests
> that use non-fake dependencies is still a bit of an undertaking.  :-/
> I am sure that it will seem more natural with time and experience...

Hmmm...  You are quite right, I did do WWC.  I need to change cpu2()'s
last access from a store to a load to get WRC.  Plus the levels of
indirection definitely didn't match up, did they?

struct foo {
struct foo *next;
};
struct foo a;
struct foo b;
struct foo c = { &a };
struct foo d = { &b };
struct foo x = { &c };
struct foo y = { &d };
struct foo *r1, *r2, *r3;

void cpu0(void)
{
WRITE_ONCE(x.next, &y);
}

void cpu1(void)
{
r1 = lockless_dereference(x.next);
WRITE_ONCE(r1->next, &x);
}

void cpu2(void)
{
r2 = lockless_dereference(y.next);
r3 = READ_ONCE(r2->next);
}

In this case, it is legal to end the run with:

r1 == &y && r2 == &x && r3 == &c

Please see below for a ppcmem litmus test.

So, did I get it right this time?  ;-)

Thanx, Paul

PS.  And yes, working through this does help me understand the
 benefits of fake dependencies.  Why do you ask?  ;-)



PPC WRCnf+addrs
""
{
0:r2=x; 0:r3=y;
1:r2=x; 1:r3=y;
2:r2=x; 2:r3=y;
c=a; d=b; x=c; y=d;
}
 P0   | P1| P2;
 stw r3,0(r2) | lwz r8,0(r2)  | lwz r8,0(r3)  ;
  | stw r2,0(r3)  | lwz r9,0(r8)  ;
exists
(1:r8=y /\ 2:r8=x /\ 2:r9=c)




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-15 Thread Paul E. McKenney
On Fri, Jan 15, 2016 at 10:27:14PM +0100, Peter Zijlstra wrote:
> On Fri, Jan 15, 2016 at 09:46:12AM -0800, Paul E. McKenney wrote:
> > On Fri, Jan 15, 2016 at 10:13:48AM +0100, Peter Zijlstra wrote:
> 
> > > And the stuff we're confused about is how best to express the difference
> > > and guarantees of these two forms of transitivity and how exactly they
> > > interact.
> > 
> > Hoping my memory-barrier.txt patch helps here...
> 
> Yes, that seems a good start. But yesterday you raised the 'fun' point
> of two globally ordered sequences connected by a single local link.

The conclusion that I am slowly coming to is that litmus tests should
not be thought of as linear chains, but rather as cycles.  If you think
of it as a cycle, then it doesn't matter where the local link is, just
how many of them and how they are connected.

But I will admit that there are some rather strange litmus tests that
challenge this cycle-centric view, for example, the one shown below.
It turns out that herd and ppcmem disagree on the outcome.  (The Power
architects side with ppcmem.)

> And I think I'm still confused on LWSYNC (in the smp_wmb case) when one
> of the stores looses a conflict, and if that scenario matters. If it
> does, we should inspect the same case for other barriers.

Indeed.  I am still working on how these should be described.  My
current thought is to be quite conservative on what ordering is
actually respected, however, the current task is formalizing how
RCU plays with the rest of the memory model.

Thanx, Paul



PPC Overlapping Group-B sets version 4
""
(* When the Group-B sets from two different barriers involve instructions in
   the same thread, within that thread one set must contain the other.

P0          P1          P2
Rx=1        Wy=1        Wz=2
dep.        lwsync      lwsync
Ry=0        Wz=1        Wx=1
Rz=1

assert(!(z=2))

   Forbidden by ppcmem, allowed by herd.
*)
{
0:r1=x; 0:r2=y; 0:r3=z;
1:r1=x; 1:r2=y; 1:r3=z; 1:r4=1;
2:r1=x; 2:r2=y; 2:r3=z; 2:r4=1; 2:r5=2;
}
 P0 | P1| P2;
 lwz r6,0(r1)   | stw r4,0(r2)  | stw r5,0(r3)  ;
 xor r7,r6,r6   | lwsync| lwsync;
 lwzx r7,r7,r2  | stw r4,0(r3)  | stw r4,0(r1)  ;
 lwz r8,0(r3)   |   |   ;

exists
(z=2 /\ 0:r6=1 /\ 0:r7=0 /\ 0:r8=1)




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-15 Thread Paul E. McKenney
On Fri, Jan 15, 2016 at 10:29:12PM +0100, Peter Zijlstra wrote:
> On Fri, Jan 15, 2016 at 09:39:12AM -0800, Paul E. McKenney wrote:
> > Should we start putting litmus tests for the various examples
> > somewhere, perhaps in a litmus-tests directory within each participating
> > architecture?  I have a pile of powerpc-related litmus tests on my laptop,
> > but they probably aren't doing all that much good there.
> 
> Yeah, or a version of them in C that we can 'compile'?

That would be good as well.  I am guessing that architecture-specific
litmus tests will also be needed, but you are right that
architecture-independent versions are higher priority.
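
For example, one possible shape for an architecture-independent version of
the classic message-passing test -- this assumes a C-language frontend to
herd along the lines of the existing PPC one, so the exact syntax below is
illustrative rather than something the current tools accept:

C MP+rel+acq

{}

P0(int *x, int *y)
{
	WRITE_ONCE(*x, 1);
	smp_store_release(y, 1);
}

P1(int *x, int *y)
{
	int r0;
	int r1;

	r0 = smp_load_acquire(y);
	r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 1:r1=0)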

> > commit 2cb4e83a1b5c89c8e39b8a64bd89269d05913e41
> > Author: Paul E. McKenney 
> > Date:   Fri Jan 15 09:30:42 2016 -0800
> > 
> > documentation: Distinguish between local and global transitivity
> > 
> > The introduction of smp_load_acquire() and smp_store_release() had
> > the side effect of introducing a weaker notion of transitivity:
> > The transitivity of full smp_mb() barriers is global, but that
> > of smp_store_release()/smp_load_acquire() chains is local.  This
> > commit therefore introduces the notion of local transitivity and
> > gives an example.
> > 
> > Reported-by: Peter Zijlstra 
> > Reported-by: Will Deacon 
> > Signed-off-by: Paul E. McKenney 
> 
> I think it fails to mention smp_mb__after_release_acquire(), although I
> suspect we didn't actually introduce the primitive yet, which raises the
> point, do we want to?

Well, it is not in v4.4.  I believe that we need good use cases before
we add it.

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-18 Thread Paul E. McKenney
On Mon, Jan 18, 2016 at 04:19:29PM +0800, Herbert Xu wrote:
> Paul E. McKenney  wrote:
> >
> > You could use SYNC_ACQUIRE() to implement read_barrier_depends() and
> > smp_read_barrier_depends(), but SYNC_RMB probably does not suffice.
> > The reason for this is that smp_read_barrier_depends() must order the
> > pointer load against any subsequent read or write through a dereference
> > of that pointer.  For example:
> > 
> >p = READ_ONCE(gp);
> >smp_rmb();
> >r1 = p->a; /* ordered by smp_rmb(). */
> >p->b = 42; /* NOT ordered by smp_rmb(), BUG!!! */
> >r2 = x; /* ordered by smp_rmb(), but doesn't need to be. */
> > 
> > In contrast:
> > 
> >p = READ_ONCE(gp);
> >smp_read_barrier_depends();
> >r1 = p->a; /* ordered by smp_read_barrier_depends(). */
> >p->b = 42; /* ordered by smp_read_barrier_depends(). */
> >r2 = x; /* not ordered by smp_read_barrier_depends(), which is OK. */
> > 
> > Again, if your hardware maintains local ordering for address
> > and data dependencies, you can have read_barrier_depends() and
> > smp_read_barrier_depends() be no-ops like they are for most
> > architectures.
> > 
> > Does that help?
> 
> This is crazy! smp_rmb started out being strictly stronger than
> smp_read_barrier_depends, when did this stop being the case?

Hello, Herbert!

It is true that most Linux kernel code relies only on the read-read
properties of dependencies, but the read-write properties are useful.
Admittedly relatively rarely, but useful.

The better comparison for smp_read_barrier_depends(), especially in
its rcu_dereference*() form, is smp_load_acquire().
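
That is, reusing the gp example from earlier in this thread, roughly:

	/* Dependency ordering: only accesses through p are ordered. */
	p = rcu_dereference(gp);
	r1 = p->a;	/* ordered */
	r2 = x;		/* NOT ordered, and doesn't need to be */

	/* Acquire ordering: all later accesses are ordered. */
	p = smp_load_acquire(&gp);
	r1 = p->a;	/* ordered */
	r2 = x;		/* also ordered, whether you wanted that or not */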

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-25 Thread Paul E. McKenney
On Mon, Jan 25, 2016 at 02:41:34PM +, Will Deacon wrote:
> On Fri, Jan 15, 2016 at 11:28:45AM -0800, Paul E. McKenney wrote:
> > On Fri, Jan 15, 2016 at 09:54:01AM -0800, Paul E. McKenney wrote:
> > > On Fri, Jan 15, 2016 at 10:24:32AM +, Will Deacon wrote:
> > > > See my earlier reply [1] (but also, your WRC Linux example looks more
> > > > like a variant on WWC and I couldn't really follow it).
> > > 
> > > I will revisit my WRC Linux example.  And yes, creating litmus tests
> > > that use non-fake dependencies is still a bit of an undertaking.  :-/
> > > I am sure that it will seem more natural with time and experience...
> > 
> > Hmmm...  You are quite right, I did do WWC.  I need to change cpu2()'s
> > last access from a store to a load to get WRC.  Plus the levels of
> > indirection definitely didn't match up, did they?
> 
> Nope, it was pretty baffling!

"It is a service that I provide."  ;-)

> > struct foo {
> > struct foo *next;
> > };
> > struct foo a;
> > struct foo b;
> > struct foo c = { &a };
> > struct foo d = { &b };
> > struct foo x = { &c };
> > struct foo y = { &d };
> > struct foo *r1, *r2, *r3;
> > 
> > void cpu0(void)
> > {
> > WRITE_ONCE(x.next, &y);
> > }
> > 
> > void cpu1(void)
> > {
> > r1 = lockless_dereference(x.next);
> > WRITE_ONCE(r1->next, &x);
> > }
> > 
> > void cpu2(void)
> > {
> > r2 = lockless_dereference(y.next);
> > r3 = READ_ONCE(r2->next);
> > }
> > 
> > In this case, it is legal to end the run with:
> > 
> > r1 == &y && r2 == &x && r3 == &c
> > 
> > Please see below for a ppcmem litmus test.
> > 
> > So, did I get it right this time?  ;-)
> 
> The code above looks correct to me (in that it matches WRC+addrs),
> but your litmus test:
> 
> > PPC WRCnf+addrs
> > ""
> > {
> > 0:r2=x; 0:r3=y;
> > 1:r2=x; 1:r3=y;
> > 2:r2=x; 2:r3=y;
> > c=a; d=b; x=c; y=d;
> > }
> >  P0   | P1| P2;
> >  stw r3,0(r2) | lwz r8,0(r2)  | lwz r8,0(r3)  ;
> >   | stw r2,0(r3)  | lwz r9,0(r8)  ;
> > exists
> > (1:r8=y /\ 2:r8=x /\ 2:r9=c)
> 
> Seems to be missing the address dependency on P1.

You are quite correct!  How about the following?

As before, both herd and ppcmem say that the cycle is allowed, as
expected, given non-transitive ordering.  To prohibit the cycle, P1
needs a suitable memory-barrier instruction.

Thanx, Paul



PPC WRCnf+addrs
""
{
0:r2=x; 0:r3=y;
1:r2=x; 1:r3=y;
2:r2=x; 2:r3=y;
c=a; d=b; x=c; y=d;
}
 P0   | P1| P2;
 stw r3,0(r2) | lwz r8,0(r2)  | lwz r8,0(r3)  ;
  | stw r2,0(r8)  | lwz r9,0(r8)  ;
exists
(1:r8=y /\ 2:r8=x /\ 2:r9=c)
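
For comparison, a sketch of the corresponding WRC+sync+addr, with a full
sync added to P1 -- which should forbid the cycle, though I have not run
this particular variant through ppcmem or herd:

PPC WRC+sync+addr
""
{
0:r2=x; 0:r3=y;
1:r2=x; 1:r3=y;
2:r2=x; 2:r3=y;
c=a; d=b; x=c; y=d;
}
 P0           | P1           | P2           ;
 stw r3,0(r2) | lwz r8,0(r2) | lwz r8,0(r3) ;
              | sync         | lwz r9,0(r8) ;
              | stw r2,0(r8) |              ;
exists
(1:r8=y /\ 2:r8=x /\ 2:r9=c)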



Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-25 Thread Paul E. McKenney
On Mon, Jan 25, 2016 at 04:42:43PM +, Will Deacon wrote:
> On Fri, Jan 15, 2016 at 01:58:53PM -0800, Paul E. McKenney wrote:
> > On Fri, Jan 15, 2016 at 10:27:14PM +0100, Peter Zijlstra wrote:
> > > On Fri, Jan 15, 2016 at 09:46:12AM -0800, Paul E. McKenney wrote:
> > > > On Fri, Jan 15, 2016 at 10:13:48AM +0100, Peter Zijlstra wrote:
> > > 
> > > > > And the stuff we're confused about is how best to express the 
> > > > > difference
> > > > > and guarantees of these two forms of transitivity and how exactly they
> > > > > interact.
> > > > 
> > > > Hoping my memory-barrier.txt patch helps here...
> > > 
> > > Yes, that seems a good start. But yesterday you raised the 'fun' point
> > > of two globally ordered sequences connected by a single local link.
> > 
> > The conclusion that I am slowly coming to is that litmus tests should
> > not be thought of as linear chains, but rather as cycles.  If you think
> > of it as a cycle, then it doesn't matter where the local link is, just
> > how many of them and how they are connected.
> 
> Do you have some examples of this? I'm struggling to make it work in my
> mind, or are you talking specifically in the context of the kernel
> memory model?

Now that you mention it, maybe it would be best to keep the transitive
and non-transitive separate for the time being anyway.  Just because it
might be possible to deal with does not necessarily mean that we should
be encouraging it.  ;-)

> > But I will admit that there are some rather strange litmus tests that
> > challenge this cycle-centric view, for example, the one shown below.
> > It turns out that herd and ppcmem disagree on the outcome.  (The Power
> > architects side with ppcmem.)
> > 
> > > And I think I'm still confused on LWSYNC (in the smp_wmb case) when one
> > > of the stores looses a conflict, and if that scenario matters. If it
> > > does, we should inspect the same case for other barriers.
> > 
> > Indeed.  I am still working on how these should be described.  My
> > current thought is to be quite conservative on what ordering is
> > actually respected, however, the current task is formalizing how
> > RCU plays with the rest of the memory model.
> > 
> > Thanx, Paul
> > 
> > 
> > 
> > PPC Overlapping Group-B sets version 4
> > ""
> > (* When the Group-B sets from two different barriers involve instructions in
> >the same thread, within that thread one set must contain the other.
> > 
> > P0          P1          P2
> > Rx=1        Wy=1        Wz=2
> > dep.        lwsync      lwsync
> > Ry=0        Wz=1        Wx=1
> > Rz=1
> > 
> > assert(!(z=2))
> > 
> >Forbidden by ppcmem, allowed by herd.
> > *)
> > {
> > 0:r1=x; 0:r2=y; 0:r3=z;
> > 1:r1=x; 1:r2=y; 1:r3=z; 1:r4=1;
> > 2:r1=x; 2:r2=y; 2:r3=z; 2:r4=1; 2:r5=2;
> > }
> >  P0 | P1| P2;
> >  lwz r6,0(r1)   | stw r4,0(r2)  | stw r5,0(r3)  ;
> >  xor r7,r6,r6   | lwsync| lwsync;
> >  lwzx r7,r7,r2  | stw r4,0(r3)  | stw r4,0(r1)  ;
> >  lwz r8,0(r3)   |   |   ;
> > 
> > exists
> > (z=2 /\ 0:r6=1 /\ 0:r7=0 /\ 0:r8=1)
> 
> That really hurts. Assuming that the "assert(!(z=2))" is actually there
> to constrain the coherence order of z to be {0->1->2}, then I think that
> this test is forbidden on arm using dmb instead of lwsync. That said, I
> also don't think the Rz=1 in P0 changes anything.

What about the smp_wmb() variant of dmb that orders only stores?

> The double negatives don't help here! (it is forbidden to guarantee that
> z is not always 2).

Yes, this is a weird one, and I don't know of any use of it.

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-25 Thread Paul E. McKenney
On Mon, Jan 25, 2016 at 06:02:34PM +, Will Deacon wrote:
> Hi Paul,
> 
> On Fri, Jan 15, 2016 at 09:39:12AM -0800, Paul E. McKenney wrote:
> > On Fri, Jan 15, 2016 at 09:55:54AM +0100, Peter Zijlstra wrote:
> > > On Thu, Jan 14, 2016 at 01:29:13PM -0800, Paul E. McKenney wrote:
> > > > So smp_mb() provides transitivity, as do pairs of smp_store_release()
> > > > and smp_load_acquire(),
> > > 
> > > But they provide different grades of transitivity, which is where all
> > > the confusion lays.
> > > 
> > > smp_mb() is strongly/globally transitive, all CPUs will agree on the 
> > > order.
> > > 
> > > Whereas the RCpc release+acquire is weakly so, only the two cpus
> > > involved in the handover will agree on the order.
> > 
> > Good point!
> > 
> > Using grace periods in place of smp_mb() also provides strong/global
> > transitivity, but also insanely high latencies.  ;-)
> > 
> > The patch below updates Documentation/memory-barriers.txt to define
> > local vs. global transitivity.  The corresponding ppcmem litmus test
> > is included below as well.
> > 
> > Should we start putting litmus tests for the various examples
> > somewhere, perhaps in a litmus-tests directory within each participating
> > architecture?  I have a pile of powerpc-related litmus tests on my laptop,
> > but they probably aren't doing all that much good there.
> 
> I too would like to have the litmus tests in the kernel so that we can
> refer to them from memory-barriers.txt. Ideally they wouldn't be targetted
> to a particular arch, however.

Agreed.  Working on it...

> > PPC local-transitive
> > ""
> > {
> > 0:r1=1; 0:r2=u; 0:r3=v; 0:r4=x; 0:r5=y; 0:r6=z;
> > 1:r1=1; 1:r2=u; 1:r3=v; 1:r4=x; 1:r5=y; 1:r6=z;
> > 2:r1=1; 2:r2=u; 2:r3=v; 2:r4=x; 2:r5=y; 2:r6=z;
> > 3:r1=1; 3:r2=u; 3:r3=v; 3:r4=x; 3:r5=y; 3:r6=z;
> > }
> >  P0   | P1   | P2   | P3   ;
> >  lwz r9,0(r4) | lwz r9,0(r5) | lwz r9,0(r6) | stw r1,0(r3) ;
> >  lwsync   | lwsync   | lwsync   | sync ;
> >  stw r1,0(r2) | lwz r8,0(r3) | stw r1,0(r7) | lwz r9,0(r2) ;
> >  lwsync   | lwz r7,0(r2) |  |  ;
> >  stw r1,0(r5) | lwsync   |  |  ;
> >   | stw r1,0(r6) |  |  ;
> > exists
> > (* (0:r9=0 /\ 1:r9=1 /\ 2:r9=1 /\ 1:r8=0 /\ 3:r9=0) *)
> > (* (0:r9=1 /\ 1:r9=1 /\ 2:r9=1) *)
> > (* (0:r9=0 /\ 1:r9=1 /\ 2:r9=1 /\ 1:r7=0) *)
> > (0:r9=0 /\ 1:r9=1 /\ 2:r9=1 /\ 1:r7=0)
> 
> i.e. we should rewrite this using READ_ONCE/WRITE_ONCE and smp_mb() etc.

Yep!

> > 
> > 
> > commit 2cb4e83a1b5c89c8e39b8a64bd89269d05913e41
> > Author: Paul E. McKenney 
> > Date:   Fri Jan 15 09:30:42 2016 -0800
> > 
> > documentation: Distinguish between local and global transitivity
> > 
> > The introduction of smp_load_acquire() and smp_store_release() had
> > the side effect of introducing a weaker notion of transitivity:
> > The transitivity of full smp_mb() barriers is global, but that
> > of smp_store_release()/smp_load_acquire() chains is local.  This
> > commit therefore introduces the notion of local transitivity and
> > gives an example.
> > 
> > Reported-by: Peter Zijlstra 
> > Reported-by: Will Deacon 
> > Signed-off-by: Paul E. McKenney 
> > 
> > diff --git a/Documentation/memory-barriers.txt 
> > b/Documentation/memory-barriers.txt
> > index c66ba46d8079..d8109ed99342 100644
> > --- a/Documentation/memory-barriers.txt
> > +++ b/Documentation/memory-barriers.txt
> > @@ -1318,8 +1318,82 @@ or a level of cache, CPU 2 might have early access 
> > to CPU 1's writes.
> >  General barriers are therefore required to ensure that all CPUs agree
> >  on the combined order of CPU 1's and CPU 2's accesses.
> >  
> > -To reiterate, if your code requires transitivity, use general barriers
> > -throughout.
> > +General barriers provide "global transitivity", so that all CPUs will
> > +agree on the order of operations.  In contrast, a chain of release-acquire
> > +pairs provides only "local transitivity", so that only those CPUs on
> > +the chain are guaranteed to agree on the combined order of the accesses.
> 
> Thanks for having a go at this. I tried defining something axiomatically,
> but got stuck pretty quickly. 

Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-26 Thread Paul E. McKenney
On Tue, Jan 26, 2016 at 11:19:27AM +0100, Peter Zijlstra wrote:
> On Mon, Jan 25, 2016 at 10:03:22PM -0800, Paul E. McKenney wrote:
> > On Mon, Jan 25, 2016 at 04:42:43PM +, Will Deacon wrote:
> > > On Fri, Jan 15, 2016 at 01:58:53PM -0800, Paul E. McKenney wrote:
> > > > On Fri, Jan 15, 2016 at 10:27:14PM +0100, Peter Zijlstra wrote:
> 
> > > > > Yes, that seems a good start. But yesterday you raised the 'fun' point
> > > > > of two globally ordered sequences connected by a single local link.
> > > > 
> > > > The conclusion that I am slowly coming to is that litmus tests should
> > > > not be thought of as linear chains, but rather as cycles.  If you think
> > > > of it as a cycle, then it doesn't matter where the local link is, just
> > > > how many of them and how they are connected.
> > > 
> > > Do you have some examples of this? I'm struggling to make it work in my
> > > mind, or are you talking specifically in the context of the kernel
> > > memory model?
> > 
> > Now that you mention it, maybe it would be best to keep the transitive
> > and non-transitive separate for the time being anyway.  Just because it
> > might be possible to deal with does not necessarily mean that we should
> > be encouraging it.  ;-)
> 
> So isn't smp_mb__after_unlock_lock() exactly such a scenario? And would
> not someone trying to implement RCsc locks using locally transitive
> RELEASE/ACQUIRE operations need exactly this stuff?
> 
> That is, I am afraid we need to cover the mix of local and global
> transitive operations at least in overview.

True, but we haven't gotten to locking yet.  That said, I would argue
that smp_mb__after_unlock_lock() upgrades locks to transitive, and
thus would not be an exception to the "no combining transitive and
non-transitive steps in cycles" rule.
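
For example, the usage pattern is roughly the following sketch (with a
placeholder lock name): the barrier immediately follows a lock acquisition
and upgrades the prior unlock plus this lock to a full smp_mb(), even when
that unlock ran on some other CPU:

	spin_lock(&some_lock);
	smp_mb__after_unlock_lock();	/* prior unlock + this lock now act as smp_mb() */
	/* Accesses here are fully ordered against the earlier critical section. */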

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-26 Thread Paul E. McKenney
On Tue, Jan 26, 2016 at 11:24:02AM +0100, Peter Zijlstra wrote:
> On Thu, Jan 14, 2016 at 02:20:46PM -0800, Paul E. McKenney wrote:
> > On Thu, Jan 14, 2016 at 01:24:34PM -0800, Leonid Yegoshin wrote:
> > > On 01/14/2016 12:48 PM, Paul E. McKenney wrote:
> > > >
> > > >So SYNC_RMB is intended to implement smp_rmb(), correct?
> > > Yes.
> > > >
> > > >You could use SYNC_ACQUIRE() to implement read_barrier_depends() and
> > > >smp_read_barrier_depends(), but SYNC_RMB probably does not suffice.
> > > 
> > > If smp_read_barrier_depends() is used to separate not only two reads
> > > but also a read of a pointer and a WRITE based on that pointer (example
> > > below) - yes. I just don't see any example of this in the famous
> > > Documentation/memory-barriers.txt and had no way of knowing that you
> > > use it in this way too.
> > 
> > Well, Documentation/memory-barriers.txt was intended as a guide for Linux
> > kernel hackers, and not for hardware architects.
> 
> Yeah, this goes under the header: memory-barriers.txt is _NOT_ a
> specification (I seem to keep repeating this).
> 
> > 
> > 
> > commit 955720966e216b00613fcf60188d507c103f0e80
> > Author: Paul E. McKenney 
> > Date:   Thu Jan 14 14:17:04 2016 -0800
> > 
> > documentation: Subsequent writes ordered by rcu_dereference()
> > 
> > The current memory-barriers.txt does not address the possibility of
> > a write to a dereferenced pointer.  This should be rare, 
> 
> How are these rare? Isn't:
> 
>   rcu_read_lock()
>   obj = rcu_dereference(ptr);
>   if (!atomic_inc_not_zero(&obj->ref))
>   obj = NULL;
>   rcu_read_unlock();
> 
> a _very_ common thing to do?

It is, but it provides its own barriers, so does not need to rely on
dependency ordering.

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-26 Thread Paul E. McKenney
On Tue, Jan 26, 2016 at 11:44:46AM -0800, Linus Torvalds wrote:
> On Tue, Jan 26, 2016 at 9:22 AM, Peter Zijlstra  wrote:
> >
> > This is distinct from:
> 
> That may be distinct, but:
> 
> > struct foo *x = READ_ONCE(*ptr);
> > smp_read_barrier_depends();
> > x->bar = 5;
> 
> This case is complete BS. Stop perpetuating it. I already removed a
> number of bogus cases of it, and I removed the incorrect documentation
> that had this crap.

If I understand your objection correctly, you want the above pattern
expressed either like this:

struct foo *x = rcu_dereference(*ptr);
x->bar = 5;

Or like this:

struct foo *x = lockless_dereference(*ptr);
x->bar = 5;

Or am I missing your point?

> It's called "smp_READ_barrier_depends()" for a reason.
> 
> Alpha is the only one that needs it, and alpha needs it only for
> dependent READS.
> 
> It's not called smp_read_write_barrier_depends(). It's not called
> "smp_mb_depends()". It's a weaker form of "smp_rmb()", nothing else.
> 
> So alpha does have an implied dependency chain from a read to a
> subsequent dependent write, and does not need any extra barriers.
> 
> Alpha does *not* have a dependency chain from a read to a subsequent
> read, which is why we need that horrible crappy
> smp_read_barrier_depends(). But it's the only reason.
> 
> This is the alpha reference manual wrt read-to-write dependency:
> 
>   5.6.1.7 Definition of Dependence Constraint
> 
> The depends relation (DP) is defined as follows. Given u and v
> issued by processor Pi, where u
> is a read or an instruction fetch and v is a write, u precedes v
> in DP order (written u DP v, that
> is, v depends on u) in either of the following situations:
> 
>  • u determines the execution of v, the location accessed by v, or
> the value written by v.
>  • u determines the execution or address or value of another
> memory access z that precedes
> 
> v or might precede v (that is, would precede v in some execution
> path depending
> on the value read by u) by processor issue constraint (see Section 
> 5.6.1.3).
> 
> Note that the dependence barrier honors not only control flow, but
> address and data values too.  This is a different syntax than we use,
> but 'u' is the READ_ONCE, and 'v' is the write. Any data, address or
> conditional dependency between the two implies an ordering.
> 
> So no, "smp_read_barrier_depends()" is *ONLY* about two reads, where
> the second read is data-dependent on the first. Nothing else.
> 
> So if you _ever_ see a "smp_read_barrier_depends()" that isn't about a
> barrier between two reads, then that is a bug.

And the smp_read_barrier_depends() in both rcu_dereference() and
in lockless_dereference() is ordering the read-to-read case and the
underlying hardware is ordering the read-to-write case on weakly ordered
hardware.

Or, again, am I missing your point?

Thanx, Paul

> The above code is crap.  It's exactly as much crap as
> 
>a = READ_ONCE(x);
>smp_rmb();
>WRITE_ONCE(b, y);
> 
> because a "rmb()" simply doesn't have anything to do with
> read-vs-subsequent-write ordering.
> 
>  Linus
> 




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-26 Thread Paul E. McKenney
On Tue, Jan 26, 2016 at 12:16:09PM +, Will Deacon wrote:
> On Mon, Jan 25, 2016 at 10:03:22PM -0800, Paul E. McKenney wrote:
> > On Mon, Jan 25, 2016 at 04:42:43PM +, Will Deacon wrote:
> > > On Fri, Jan 15, 2016 at 01:58:53PM -0800, Paul E. McKenney wrote:
> > > > PPC Overlapping Group-B sets version 4
> > > > ""
> > > > (* When the Group-B sets from two different barriers involve 
> > > > instructions in
> > > >the same thread, within that thread one set must contain the other.
> > > > 
> > > > P0          P1          P2
> > > > Rx=1        Wy=1        Wz=2
> > > > dep.        lwsync      lwsync
> > > > Ry=0        Wz=1        Wx=1
> > > > Rz=1
> > > > 
> > > > assert(!(z=2))
> > > > 
> > > >Forbidden by ppcmem, allowed by herd.
> > > > *)
> > > > {
> > > > 0:r1=x; 0:r2=y; 0:r3=z;
> > > > 1:r1=x; 1:r2=y; 1:r3=z; 1:r4=1;
> > > > 2:r1=x; 2:r2=y; 2:r3=z; 2:r4=1; 2:r5=2;
> > > > }
> > > >  P0 | P1| P2;
> > > >  lwz r6,0(r1)   | stw r4,0(r2)  | stw r5,0(r3)  ;
> > > >  xor r7,r6,r6   | lwsync| lwsync;
> > > >  lwzx r7,r7,r2  | stw r4,0(r3)  | stw r4,0(r1)  ;
> > > >  lwz r8,0(r3)   |   |   ;
> > > > 
> > > > exists
> > > > (z=2 /\ 0:r6=1 /\ 0:r7=0 /\ 0:r8=1)
> > > 
> > > That really hurts. Assuming that the "assert(!(z=2))" is actually there
> > > to constrain the coherence order of z to be {0->1->2}, then I think that
> > > this test is forbidden on arm using dmb instead of lwsync. That said, I
> > > also don't think the Rz=1 in P0 changes anything.
> > 
> > What about the smp_wmb() variant of dmb that orders only stores?
> 
> Tricky, but I think it still works out if the coherence order of z is as
> I described above. The line of reasoning is weird though -- I ended up
> considering the two cases where P0 reads z before and after it reads x
> and what that means for the read of y.

By "works out" you mean that ARM prohibits the outcome?

BTW, I never have seen a real-world use for this case.  At the moment
it is mostly a cautionary tale about memory-model corner cases and
tools.

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-26 Thread Paul E. McKenney
On Wed, Jan 27, 2016 at 12:52:07AM +0800, Boqun Feng wrote:
> Hi Paul,
> 
> On Mon, Jan 18, 2016 at 07:46:29AM -0800, Paul E. McKenney wrote:
> > On Mon, Jan 18, 2016 at 04:19:29PM +0800, Herbert Xu wrote:
> > > Paul E. McKenney  wrote:
> > > >
> > > > You could use SYNC_ACQUIRE() to implement read_barrier_depends() and
> > > > smp_read_barrier_depends(), but SYNC_RMB probably does not suffice.
> > > > The reason for this is that smp_read_barrier_depends() must order the
> > > > pointer load against any subsequent read or write through a dereference
> > > > of that pointer.  For example:
> > > > 
> > > >p = READ_ONCE(gp);
> > > >smp_rmb();
> > > >r1 = p->a; /* ordered by smp_rmb(). */
> > > >p->b = 42; /* NOT ordered by smp_rmb(), BUG!!! */
> > > >r2 = x; /* ordered by smp_rmb(), but doesn't need to be. */
> > > > 
> > > > In contrast:
> > > > 
> > > >p = READ_ONCE(gp);
> > > >smp_read_barrier_depends();
> > > >r1 = p->a; /* ordered by smp_read_barrier_depends(). */
> > > >p->b = 42; /* ordered by smp_read_barrier_depends(). */
> > > >r2 = x; /* not ordered by smp_read_barrier_depends(), which is 
> > > > OK. */
> > > > 
> > > > Again, if your hardware maintains local ordering for address
> > > > and data dependencies, you can have read_barrier_depends() and
> > > > smp_read_barrier_depends() be no-ops like they are for most
> > > > architectures.
> > > > 
> > > > Does that help?
> > > 
> > > This is crazy! smp_rmb started out being strictly stronger than
> > > smp_read_barrier_depends, when did this stop being the case?
> > 
> > Hello, Herbert!
> > 
> > It is true that most Linux kernel code relies only on the read-read
> > properties of dependencies, but the read-write properties are useful.
> > Admittedly relatively rarely, but useful.
> > 
> > The better comparison for smp_read_barrier_depends(), especially in
> > its rcu_dereference*() form, is smp_load_acquire().
> 
> Confused..
> 
> I recall that last time you and Linus came into a conclusion that even
> on Alpha, a barrier for read->write with data dependency is unnecessary:
> 
> http://article.gmane.org/gmane.linux.kernel/2077661
> 
> And in an earlier mail of that thread, Linus made his point that
> smp_read_barrier_depends() should only be used to order read->read.

Those examples involved read-to-write with conditionals, as in:

	if (READ_ONCE(a))
		WRITE_ONCE(b, 1);

Without the "if", no ordering is guaranteed on weakly ordered CPUs.
(The volatile accesses keep ordering within the compiler for once...)

> So right now, are we going to extend the semantics of
> smp_read_barrier_depends()? Can we just make smp_read_barrier_depends()
> still only work for read->read, and assume all the architectures won't
> reorder read->write with data dependency, so that the code above having
> a smp_rmb() also works?

The semantics of smp_read_barrier_depends() has been both read-to-write
and read-to-read for some time now, this patch just catches the
documentation up with reality.

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-26 Thread Paul E. McKenney
On Tue, Jan 26, 2016 at 11:09:27AM +, Will Deacon wrote:
> On Tue, Jan 26, 2016 at 11:32:00AM +0100, Peter Zijlstra wrote:
> > On Tue, Jan 26, 2016 at 11:24:02AM +0100, Peter Zijlstra wrote:
> > 
> > > Yeah, this goes under the header: memory-barriers.txt is _NOT_ a
> > > specification (I seem to keep repeating this).
> > 
> > Do we want this ?

Seems likely to me.  ;-)

> > ---
> >  Documentation/memory-barriers.txt | 17 +
> >  1 file changed, 17 insertions(+)
> > 
> > diff --git a/Documentation/memory-barriers.txt 
> > b/Documentation/memory-barriers.txt
> > index a61be39c7b51..433326ebdc26 100644
> > --- a/Documentation/memory-barriers.txt
> > +++ b/Documentation/memory-barriers.txt
> > @@ -1,3 +1,4 @@
> > +
> >  
> >  LINUX KERNEL MEMORY BARRIERS
> >  
> > @@ -5,6 +6,22 @@
> >  By: David Howells 
> >  Paul E. McKenney 
> >  
> > +==========
> > +DISCLAIMER
> > +==========
> > +
> > +This document is not a specification; it is intentionally (for the sake of
> > +brevity) and unintentionally (due to being human) incomplete. This 
> > document is
> > +meant as a guide to using the various memory barriers provided by Linux, 
> > but
> > +in case of any doubt (and there are many) please ask.
> 
> It might be worth adding you and me to the top of the file, to save Paul
> Cc'ing us on questions (get_maintainer.pl points at poor old Corbet for
> this file).
> 
> But yes, it seems that something like this is required.

So Peter, would you like to update your patch to include yourself
and Will as authors?

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-26 Thread Paul E. McKenney
On Tue, Jan 26, 2016 at 02:33:40PM -0800, Linus Torvalds wrote:
> On Tue, Jan 26, 2016 at 2:15 PM, Linus Torvalds
>  wrote:
> >
> > You might as well just write it as
> >
> > struct foo x = READ_ONCE(*ptr);
> > x->bar = 5;
> >
> > because that "smp_read_barrier_depends()" does NOTHING wrt the second write.
> 
> Just to clarify: on alpha it adds a memory barrier, but that memory
> barrier is useless.

No trailing data-dependent read, so agreed, no smp_read_barrier_depends()
needed.  That said, I believe that we should encourage rcu_dereference*()
or lockless_dereference() instead of READ_ONCE() for documentation
reasons, though.

> On non-alpha, it is a no-op, and obviously does nothing simply because
> it generates no code.
> 
> So if anybody believes that the "smp_read_barrier_depends()" does
> something, they are *wrong*.

The other problem with smp_read_barrier_depends() is that it is often
a pain figuring out which prior load it is supposed to apply to.
Hence my preference for rcu_dereference*() and lockless_dereference().

> And if anybody sends out an email with that smp_read_barrier_depends()
> in an example, they are actively just confusing other people, which is
> even worse than just being wrong. Which is why I jumped in.
> 
> So stop perpetuating the myth that smp_read_barrier_depends() does
> something here. It does not. It's a bug, and it has become this "mind
> virus" for some people that seem to believe that it does something.

It looks like I should add words to memory-barriers.txt de-emphasizing
smp_read_barrier_depends().  I will take a look at that.

> I had to remove this crap once from the kernel already, see commit
> 105ff3cbf225 ("atomic: remove all traces of READ_ONCE_CTRL() and
> atomic*_read_ctrl()").
> 
> I don't want to ever see that broken construct again. And I want to
> make sure that everybody is educated about how broken it was. I'm
> extremely unhappy that it came up again.

Well, if it makes you feel better, that was control dependencies and this
was data dependencies.  So it was not -exactly- the same.  ;-)

(Sorry, couldn't resist...)

> If it turns out that some architecture does actually need a barrier
> between a read and a dependent write, then that will mean that
> 
>  (a) we'll have to make up a _new_ barrier, because
> "smp_read_barrier_depends()" is not that barrier. We'll presumably
> then have to make that new barrier part of "rcu_dereference()" and
> friends.

Agreed.  We can worry about whether or not we replace the current
smp_read_barrier_depends() with that new barrier when and if such
hardware appears.

>  (b) we will have found an architecture with even worse memory
> ordering semantics than alpha, and we'll have to stop castigating
> alpha for being the worst memory ordering ever.

;-) ;-) ;-)

> but I sincerely hope that we'll never find that kind of broken architecture.

Apparently at least some hardware vendors are reading memory-barriers.txt,
so perhaps the odds of that kind of breakage have reduced.

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-26 Thread Paul E. McKenney
On Tue, Jan 26, 2016 at 12:10:10PM +, Will Deacon wrote:
> On Mon, Jan 25, 2016 at 05:06:46PM -0800, Paul E. McKenney wrote:
> > On Mon, Jan 25, 2016 at 02:41:34PM +, Will Deacon wrote:
> > > On Fri, Jan 15, 2016 at 11:28:45AM -0800, Paul E. McKenney wrote:
> > > > On Fri, Jan 15, 2016 at 09:54:01AM -0800, Paul E. McKenney wrote:
> > > > > On Fri, Jan 15, 2016 at 10:24:32AM +, Will Deacon wrote:
> > > > > > See my earlier reply [1] (but also, your WRC Linux example looks 
> > > > > > more
> > > > > > like a variant on WWC and I couldn't really follow it).
> > > > > 
> > > > > I will revisit my WRC Linux example.  And yes, creating litmus tests
> > > > > that use non-fake dependencies is still a bit of an undertaking.  :-/
> > > > > I am sure that it will seem more natural with time and experience...
> > > > 
> > > > Hmmm...  You are quite right, I did do WWC.  I need to change cpu2()'s
> > > > last access from a store to a load to get WRC.  Plus the levels of
> > > > indirection definitely didn't match up, did they?
> > > 
> > > Nope, it was pretty baffling!
> > 
> > "It is a service that I provide."  ;-)
> > 
> > > > struct foo {
> > > > struct foo *next;
> > > > };
> > > > struct foo a;
> > > > struct foo b;
> > > > struct foo c = { &a };
> > > > struct foo d = { &b };
> > > > struct foo x = { &c };
> > > > struct foo y = { &d };
> > > > struct foo *r1, *r2, *r3;
> > > > 
> > > > void cpu0(void)
> > > > {
> > > > WRITE_ONCE(x.next, &y);
> > > > }
> > > > 
> > > > void cpu1(void)
> > > > {
> > > > r1 = lockless_dereference(x.next);
> > > > WRITE_ONCE(r1->next, &x);
> > > > }
> > > > 
> > > > void cpu2(void)
> > > > {
> > > > r2 = lockless_dereference(y.next);
> > > > r3 = READ_ONCE(r2->next);
> > > > }
> > > > 
> > > > In this case, it is legal to end the run with:
> > > > 
> > > > r1 == &y && r2 == &x && r3 == &c
> > > > 
> > > > Please see below for a ppcmem litmus test.
> > > > 
> > > > So, did I get it right this time?  ;-)
> > > 
> > > The code above looks correct to me (in that it matches WRC+addrs),
> > > but your litmus test:
> > > 
> > > > PPC WRCnf+addrs
> > > > ""
> > > > {
> > > > 0:r2=x; 0:r3=y;
> > > > 1:r2=x; 1:r3=y;
> > > > 2:r2=x; 2:r3=y;
> > > > c=a; d=b; x=c; y=d;
> > > > }
> > > >  P0           | P1            | P2            ;
> > > >  stw r3,0(r2) | lwz r8,0(r2)  | lwz r8,0(r3)  ;
> > > >               | stw r2,0(r3)  | lwz r9,0(r8)  ;
> > > > exists
> > > > (1:r8=y /\ 2:r8=x /\ 2:r9=c)
> > > 
> > > Seems to be missing the address dependency on P1.
> > 
> > You are quite correct!  How about the following?
> 
> I think that's it!
> 
> > As before, both herd and ppcmem say that the cycle is allowed, as
> > expected, given non-transitive ordering.  To prohibit the cycle, P1
> > needs a suitable memory-barrier instruction.
> > 
> > 
> > 
> > PPC WRCnf+addrs
> > ""
> > {
> > 0:r2=x; 0:r3=y;
> > 1:r2=x; 1:r3=y;
> > 2:r2=x; 2:r3=y;
> > c=a; d=b; x=c; y=d;
> > }
> >  P0           | P1            | P2            ;
> >  stw r3,0(r2) | lwz r8,0(r2)  | lwz r8,0(r3)  ;
> >               | stw r2,0(r8)  | lwz r9,0(r8)  ;
> > exists
> > (1:r8=y /\ 2:r8=x /\ 2:r9=c)
> 
> Agreed.

OK, thank you!  Would you agree that it would be good to replace the
current xor-based fake-dependency litmus tests with tests having real
dependencies?
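
By way of illustration, a sketch that is not from the thread: the xor trick
fakes an address dependency whose value can never matter, and it only survives
in hand-written assembly because a C compiler is free to fold the xor to zero
and destroy the dependency.  The struct foo and x below are the ones from the
WRCnf+addrs example above; a and b are made-up placeholders:

	int *a, *b, r1;
	struct foo *p;

	/* Fake (xor-based) address dependency on the prior load: */
	r1 = READ_ONCE(*a);
	WRITE_ONCE(*(b + (r1 ^ r1)), 1);

	/* Real address dependency, as in cpu1() of the WRCnf+addrs example: */
	p = lockless_dereference(x.next);
	WRITE_ONCE(p->next, &x);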

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-26 Thread Paul E. McKenney
On Tue, Jan 26, 2016 at 03:45:23PM -0800, Linus Torvalds wrote:
> On Tue, Jan 26, 2016 at 3:29 PM, Paul E. McKenney
>  wrote:
> >
> > No trailing data-dependent read, so agreed, no smp_read_barrier_depends()
> > needed.  That said, I believe that we should encourage rcu_dereference*()
> > or lockless_dereference() instead of READ_ONCE() for documentation
> > reasons, though.
> 
> I agree that that is likely the right thing to do in pretty much all 
> situations.
> 
> In theory, there might be performance situations where we'd want to
> actively avoid the smp_read_barrier_depends() inherent in those, but
> considering that it's only a performance issue on alpha, and we
> probably have all of two or three people using Linux on alpha, it's a
> pretty theoretical performance worry.

Agreed!

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-27 Thread Paul E. McKenney
On Wed, Jan 27, 2016 at 10:25:46AM +, Will Deacon wrote:
> On Tue, Jan 26, 2016 at 11:58:20AM -0800, Paul E. McKenney wrote:
> > On Tue, Jan 26, 2016 at 12:16:09PM +, Will Deacon wrote:
> > > On Mon, Jan 25, 2016 at 10:03:22PM -0800, Paul E. McKenney wrote:
> > > > On Mon, Jan 25, 2016 at 04:42:43PM +, Will Deacon wrote:
> > > > > On Fri, Jan 15, 2016 at 01:58:53PM -0800, Paul E. McKenney wrote:
> > > > > > PPC Overlapping Group-B sets version 4
> > > > > > ""
> > > > > > (* When the Group-B sets from two different barriers involve 
> > > > > > instructions in
> > > > > >the same thread, within that thread one set must contain the 
> > > > > > other.
> > > > > > 
> > > > > > P0      P1      P2
> > > > > > Rx=1    Wy=1    Wz=2
> > > > > > dep.    lwsync  lwsync
> > > > > > Ry=0    Wz=1    Wx=1
> > > > > > Rz=1
> > > > > > 
> > > > > > assert(!(z=2))
> > > > > > 
> > > > > >Forbidden by ppcmem, allowed by herd.
> > > > > > *)
> > > > > > {
> > > > > > 0:r1=x; 0:r2=y; 0:r3=z;
> > > > > > 1:r1=x; 1:r2=y; 1:r3=z; 1:r4=1;
> > > > > > 2:r1=x; 2:r2=y; 2:r3=z; 2:r4=1; 2:r5=2;
> > > > > > }
> > > > > >  P0             | P1            | P2            ;
> > > > > >  lwz r6,0(r1)   | stw r4,0(r2)  | stw r5,0(r3)  ;
> > > > > >  xor r7,r6,r6   | lwsync        | lwsync        ;
> > > > > >  lwzx r7,r7,r2  | stw r4,0(r3)  | stw r4,0(r1)  ;
> > > > > >  lwz r8,0(r3)   |               |               ;
> > > > > > 
> > > > > > exists
> > > > > > (z=2 /\ 0:r6=1 /\ 0:r7=0 /\ 0:r8=1)
> > > > > 
> > > > > That really hurts. Assuming that the "assert(!(z=2))" is actually 
> > > > > there
> > > > > to constrain the coherence order of z to be {0->1->2}, then I think 
> > > > > that
> > > > > this test is forbidden on arm using dmb instead of lwsync. That said, 
> > > > > I
> > > > > also don't think the Rz=1 in P0 changes anything.
> > > > 
> > > > What about the smp_wmb() variant of dmb that orders only stores?
> > > 
> > > Tricky, but I think it still works out if the coherence order of z is as
> > > I described above. The line of reasoning is weird though -- I ended up
> > > considering the two cases where P0 reads z before and after it reads x
> > > and what that means for the read of y.
> > 
> > By "works out" you mean that ARM prohibits the outcome?
> 
> Yes, that's my understanding.

Very good, we have agreement between the two architectures, then.  ;-)

Thanx, Paul




Re: [Xen-devel] [PATCH] documentation: Add disclaimer

2016-01-27 Thread Paul E. McKenney
On Wed, Jan 27, 2016 at 02:57:07PM +, David Howells wrote:
> Peter Zijlstra  wrote:
> 
> > +==
> > +DISCLAIMER
> > +==
> > +
> > +This document is not a specification; it is intentionally (for the sake of
> > +brevity) and unintentionally (due to being human) incomplete. This 
> > document is
> > +meant as a guide to using the various memory barriers provided by Linux, 
> > but
> > +in case of any doubt (and there are many) please ask.
> > +
> > +I repeat, this document is not a specification of what Linux expects from
> > +hardware.
> 
> The purpose of this document is twofold:
> 
>  (1) to specify the minimum functionality that one can rely on for any
>  particular barrier, and
> 
>  (2) to provide a guide as to how to use the barriers that are available.
> 
> Note that an architecture can provide more than the minimum requirement for
> any particular barrier, but if the barrier provides less than that, it is
> incorrect.
> 
> Note also that it is possible that a barrier may be a no-op for an
> architecture because the way that arch works renders an explicit barrier
> unnecessary in that case.
> 
> > +
> 
> Can you bung an extra blank line in here if you have to redo this at all?
> 
> > +
> > +CONTENTS
> > +
> >  
> >   (*) Abstract memory access model.

Good point!  Would you be willing to add a Signed-off-by so I
can take the combined change, assuming Peter and Will are good
with it?

Thanx, Paul




Re: [Xen-devel] [v3,11/41] mips: reuse asm-generic/barrier.h

2016-01-27 Thread Paul E. McKenney
On Wed, Jan 27, 2016 at 10:04:47AM +0800, Boqun Feng wrote:
> On Tue, Jan 26, 2016 at 03:29:21PM -0800, Paul E. McKenney wrote:
> > On Tue, Jan 26, 2016 at 02:33:40PM -0800, Linus Torvalds wrote:
> > > On Tue, Jan 26, 2016 at 2:15 PM, Linus Torvalds
> > >  wrote:
> > > >
> > > > You might as well just write it as
> > > >
> > > > struct foo x = READ_ONCE(*ptr);
> > > > x->bar = 5;
> > > >
> > > > because that "smp_read_barrier_depends()" does NOTHING wrt the second 
> > > > write.
> > > 
> > > Just to clarify: on alpha it adds a memory barrier, but that memory
> > > barrier is useless.
> > 
> > No trailing data-dependent read, so agreed, no smp_read_barrier_depends()
> > needed.  That said, I believe that we should encourage rcu_dereference*()
> > or lockless_dereference() instead of READ_ONCE() for documentation
> > reasons, though.
> > 
> > > On non-alpha, it is a no-op, and obviously does nothing simply because
> > > it generates no code.
> > > 
> > > So if anybody believes that the "smp_read_barrier_depends()" does
> > > something, they are *wrong*.
> > 
> > The other problem with smp_read_barrier_depends() is that it is often
> > a pain figuring out which prior load it is supposed to apply to.
> > Hence my preference for rcu_dereference*() and lockless_dereference().
> > 
> 
> Because semantically speaking, rcu_derefence*() and
> lockless_dereference() are CONSUME(i.e. data/address dependent
> read->read and read->write pairs are ordered), whereas
> smp_read_barrier_depends() only guarantees read->read pairs with data
> dependency are ordered, right?
> 
> If so, maybe we need to call it out in memory-barriers.txt, for example:
> 
> diff --git a/Documentation/memory-barriers.txt 
> b/Documentation/memory-barriers.txt
> index 904ee42..6b262c2 100644
> --- a/Documentation/memory-barriers.txt
> +++ b/Documentation/memory-barriers.txt
> @@ -1703,8 +1703,8 @@ There are some more advanced barrier functions:
>  
>  
>   (*) lockless_dereference();
> - This can be thought of as a pointer-fetch wrapper around the
> - smp_read_barrier_depends() data-dependency barrier.
> + This is a load, and any load or store that has a data dependency on the
> + value returned by this load won't be reordered before this load.

This is a good start, but more is needed to warn people off of
smp_read_barrier_depends().  But yes, better explanation would be good.

Thanx, Paul

>   This is also similar to rcu_dereference(), but in cases where
>   object lifetime is handled by some mechanism other than RCU, for
> 
> 
> Regards,
> Boqun
> 
> > > And if anybody sends out an email with that smp_read_barrier_depends()
> > > in an example, they are actively just confusing other people, which is
> > > even worse than just being wrong. Which is why I jumped in.
> > > 
> > > So stop perpetuating the myth that smp_read_barrier_depends() does
> > > something here. It does not. It's a bug, and it has become this "mind
> > > virus" for some people that seem to believe that it does something.
> > 
> > It looks like I should add words to memory-barriers.txt de-emphasizing
> > smp_read_barrier_depends().  I will take a look at that.
> > 
> > > I had to remove this crap once from the kernel already, see commit
> > > 105ff3cbf225 ("atomic: remove all traces of READ_ONCE_CTRL() and
> > > atomic*_read_ctrl()").
> > > 
> > > I don't want to ever see that broken construct again. And I want to
> > > make sure that everybody is educated about how broken it was. I'm
> > > extremely unhappy that it came up again.
> > 
> > Well, if it makes you feel better, that was control dependencies and this
> > was data dependencies.  So it was not -exactly- the same.  ;-)
> > 
> > (Sorry, couldn't resist...)
> > 
> > > If it turns out that some architecture does actually need a barrier
> > > between a read and a dependent write, then that will mean that
> > > 
> > >  (a) we'll have to make up a _new_ barrier, because
> > > "smp_read_barrier_depends()" is not that barrier. We'll presumably
> > > then have to make that new barrier part of "rcu_dereference()" and
> > > friends.
> > 
> > Agreed.  We can worry about whether or not we replace the current
> > smp_read_barrier_depends() with that new barrier when and if such
> > hardware appears.

Re: [Xen-devel] [PATCH] smpboot: Add smpboot state variables instead of reusing CPU hotplug states

2015-11-06 Thread Paul E. McKenney
On Thu, Oct 15, 2015 at 01:32:44PM +0200, Daniel Wagner wrote:
> The cpu hotplug state machine in smpboot.c is reusing the states from
> cpu.h. That is confusing when it comes to the CPU_DEAD_FROZEN usage.
> Paul explained to me that he was in need of an additional state
> for distinguishing between CPU error states. For this he just
> picked CPU_DEAD_FROZEN.
> 
> 8038dad7e888581266c76df15d70ca457a3c5910 smpboot: Add common code for 
> notification from dying CPU
> 2a442c9c6453d3d043dfd89f2e03a1deff8a6f06 x86: Use common 
> outgoing-CPU-notification code
> 
> Instead of reusing the states, let's add new definitions inside
> the smpboot.c file with an explanation of what those states
> mean. Thanks, Paul, for providing them.
> 
> Signed-off-by: Daniel Wagner 

Apologies for the delay, I didn't realize that you were waiting on me.

Reviewed-by: Paul E. McKenney 

> Cc: Thomas Gleixner 
> Cc: "Paul E. McKenney" 
> Cc: Peter Zijlstra 
> Cc: xen-de...@lists.xenproject.org
> Cc: linux-ker...@vger.kernel.org
> ---
>  arch/x86/xen/smp.c  |  4 +--
>  include/linux/cpu.h |  3 +-
>  kernel/smpboot.c| 82 
> -
>  3 files changed, 67 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
> index 3f4ebf0..804bf5c 100644
> --- a/arch/x86/xen/smp.c
> +++ b/arch/x86/xen/smp.c
> @@ -495,7 +495,7 @@ static int xen_cpu_up(unsigned int cpu, struct 
> task_struct *idle)
>   rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
>   BUG_ON(rc);
> 
> - while (cpu_report_state(cpu) != CPU_ONLINE)
> + while (!cpu_check_online(cpu))
>   HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
> 
>   return 0;
> @@ -767,7 +767,7 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct 
> task_struct *tidle)
>* This can happen if CPU was offlined earlier and
>* offlining timed out in common_cpu_die().
>*/
> - if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
> + if (cpu_check_timeout(cpu)) {
>   xen_smp_intr_free(cpu);
>   xen_uninit_lock_cpu(cpu);
>   }
> diff --git a/include/linux/cpu.h b/include/linux/cpu.h
> index 23c30bd..f78ab46 100644
> --- a/include/linux/cpu.h
> +++ b/include/linux/cpu.h
> @@ -284,7 +284,8 @@ void arch_cpu_idle_dead(void);
> 
>  DECLARE_PER_CPU(bool, cpu_dead_idle);
> 
> -int cpu_report_state(int cpu);
> +int cpu_check_online(int cpu);
> +int cpu_check_timeout(int cpu);
>  int cpu_check_up_prepare(int cpu);
>  void cpu_set_state_online(int cpu);
>  #ifdef CONFIG_HOTPLUG_CPU
> diff --git a/kernel/smpboot.c b/kernel/smpboot.c
> index a818cbc..75e5724 100644
> --- a/kernel/smpboot.c
> +++ b/kernel/smpboot.c
> @@ -371,19 +371,63 @@ int smpboot_update_cpumask_percpu_thread(struct 
> smp_hotplug_thread *plug_thread,
>  }
>  EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
> 
> +/* The CPU is offline, and its last offline operation was
> + * successful and proceeded normally.  (Or, alternatively, the
> + * CPU never has come online, as this is the initial state.)
> + */
> +#define CPUHP_POST_DEAD  0x01
> +
> +/* The CPU is in the process of coming online.
> + * Simple architectures can skip this state, and just invoke
> + * cpu_set_state_online() unconditionally instead.
> + */
> +#define CPUHP_UP_PREPARE 0x02
> +
> +/* The CPU is now online.  Simple architectures can skip this
> + * state, and just invoke cpu_wait_death() and cpu_report_death()
> + * unconditionally instead.
> + */
> +#define CPUHP_ONLINE 0x03
> +
> +/* The CPU has gone offline, so that it may now be safely
> + * powered off (or whatever the architecture needs to do to it).
> + */
> +#define CPUHP_DEAD   0x04
> +
> +/* The CPU did not go offline in a timely fashion, if at all,
> + * so it might need special processing at the next online (for
> + * example, simply refusing to bring it online).
> + */
> +#define CPUHP_BROKEN 0x05
> +
> +/* The CPU eventually did go offline, but not in a timely
> + * fashion.  If some sort of reset operation is required before it
> + * can be brought online, that reset operation needs to be carried
> + * out at online time.  (Or, again, the architecture might simply
> + * refuse to bring it online.)
> + */
> +#define CPUHP_TIMEOUT0x06
> +#define CPUHP_TIMEOUT  0x06
> +
>  static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = 
> ATOMIC_INIT(CPU_POST_DEAD);
> 
>  /*
>   * Called to poll specified CPU's state, for example, when waiting for
>   * a CPU to come online.
>   */
> -int cpu_report_state(int cpu)
> +int cpu_check_online(int cpu)
> +{
> + 
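
For reference, a sketch (not part of the patch; some_arch_cpu_die() and
arch_power_off_cpu() are made-up names) of how the surviving-CPU side
typically consumes these states, modeled on common_cpu_die()'s existing use
of cpu_wait_death():

	void some_arch_cpu_die(unsigned int cpu)
	{
		if (cpu_wait_death(cpu, 5)) {
			/* The dying CPU reached CPUHP_DEAD in time: safe to power it off. */
			arch_power_off_cpu(cpu);
		} else {
			/*
			 * Timeout: the state becomes CPUHP_BROKEN (or CPUHP_TIMEOUT
			 * if the CPU dies later), and the next online attempt can
			 * refuse the CPU or reset it after cpu_check_timeout().
			 */
			pr_err("CPU %u did not go offline in time\n", cpu);
		}
	}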

Re: [Xen-devel] [PATCH] documentation: Add disclaimer

2016-04-14 Thread Paul E. McKenney
On Wed, Jan 27, 2016 at 09:35:46AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 26, 2016 at 12:11:43PM -0800, Paul E. McKenney wrote:
> > So Peter, would you like to update your patch to include yourself
> > and Will as authors?
> 
> Sure, here goes.
> 
> ---
> Subject: documentation: Add disclaimer
> 
> It appears people are reading this document as a requirements list for
> building hardware. This is not the intent of this document. Nor is it
> particularly suited for this purpose.
> 
> The primary purpose of this document is our collective attempt to define
> a set of primitives that (hopefully) allow us to write correct code on
> the myriad of SMP platforms Linux supports.
> 
> It's a definite work in progress as our understanding of these platforms,
> and memory ordering in general, progresses.
> 
> Nor does being mentioned in this document mean we think it's a
> particularly good idea; the data dependency barrier required by Alpha
> being a prime example. Yes we have it, no you're insane to require it
> when building new hardware.
> 
> Signed-off-by: Peter Zijlstra (Intel) 

Rather belatedly queued and pushed to -rcu, apologies for the delay.
One minor edit noted below.

Thanx, Paul

> ---
>  Documentation/memory-barriers.txt | 18 +-
>  1 file changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/memory-barriers.txt 
> b/Documentation/memory-barriers.txt
> index a61be39c7b51..98626125f484 100644
> --- a/Documentation/memory-barriers.txt
> +++ b/Documentation/memory-barriers.txt
> @@ -4,8 +4,24 @@
> 
>  By: David Howells 
>  Paul E. McKenney 
> +Will Deacon 
> +Peter Zijlstra 
> 
> -Contents:
> +==
> +DISCLAIMER
> +==
> +
> +This document is not a specification; it is intentionally (for the sake of
> +brevity) and unintentionally (due to being human) incomplete. This document 
> is
> +meant as a guide to using the various memory barriers provided by Linux, but
> +in case of any doubt (and there are many) please ask.
> +
> +I repeat, this document is not a specification of what Linux expects from

s/I/To/ because there is more than one author.

> +hardware.
> +
> +
> +CONTENTS
> +
> 
>   (*) Abstract memory access model.
> 
> 




Re: [Xen-devel] [PATCH] documentation: Add disclaimer

2016-04-14 Thread Paul E. McKenney
On Wed, Jan 27, 2016 at 02:57:07PM +, David Howells wrote:
> Peter Zijlstra  wrote:
> 
> > +==
> > +DISCLAIMER
> > +==
> > +
> > +This document is not a specification; it is intentionally (for the sake of
> > +brevity) and unintentionally (due to being human) incomplete. This 
> > document is
> > +meant as a guide to using the various memory barriers provided by Linux, 
> > but
> > +in case of any doubt (and there are many) please ask.
> > +
> > +I repeat, this document is not a specification of what Linux expects from
> > +hardware.
> 
> The purpose of this document is twofold:
> 
>  (1) to specify the minimum functionality that one can rely on for any
>  particular barrier, and
> 
>  (2) to provide a guide as to how to use the barriers that are available.
> 
> Note that an architecture can provide more than the minimum requirement for
> any particular barrier, but if the barrier provides less than that, it is
> incorrect.
> 
> Note also that it is possible that a barrier may be a no-op for an
> architecture because the way that arch works renders an explicit barrier
> unnecessary in that case.
> 
> > +
> 
> Can you bung an extra blank line in here if you have to redo this at all?

Done as part of your patch.  Again, apologies for the delay.

Thanx, Paul

> > +
> > +CONTENTS
> > +
> >  
> >   (*) Abstract memory access model.
> >  
> 
> David
> 




[Xen-devel] [tip:core/rcu] x86: Use common outgoing-CPU-notification code

2015-03-27 Thread tip-bot for Paul E. McKenney
Commit-ID:  2a442c9c6453d3d043dfd89f2e03a1deff8a6f06
Gitweb: http://git.kernel.org/tip/2a442c9c6453d3d043dfd89f2e03a1deff8a6f06
Author: Paul E. McKenney 
AuthorDate: Wed, 25 Feb 2015 11:42:15 -0800
Committer:  Paul E. McKenney 
CommitDate: Wed, 11 Mar 2015 13:22:35 -0700

x86: Use common outgoing-CPU-notification code

This commit removes the open-coded CPU-offline notification with new
common code.  Among other things, this change avoids calling scheduler
code using RCU from an offline CPU that RCU is ignoring.  It also allows
Xen to notice at online time that the CPU did not go offline correctly.
Note that Xen has the surviving CPU carry out some cleanup operations,
so if the surviving CPU times out, these cleanup operations might have
been carried out while the outgoing CPU was still running.  It might
therefore be unwise to bring this CPU back online, and this commit
avoids doing so.

Signed-off-by: Boris Ostrovsky 
Signed-off-by: Paul E. McKenney 
Cc: 
Cc: Konrad Rzeszutek Wilk 
Cc: David Vrabel 
Cc: 
---
 arch/x86/include/asm/cpu.h |  2 --
 arch/x86/include/asm/smp.h |  2 +-
 arch/x86/kernel/smpboot.c  | 39 ++-
 arch/x86/xen/smp.c | 46 +-
 4 files changed, 44 insertions(+), 45 deletions(-)

diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index d2b1298..bf2caa1 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action);
 #endif
 #endif
 
-DECLARE_PER_CPU(int, cpu_state);
-
 int mwait_usable(const struct cpuinfo_x86 *);
 
 #endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd1cc3..a5cb4f6 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,12 +150,12 @@ static inline void arch_send_call_function_ipi_mask(const 
struct cpumask *mask)
 }
 
 void cpu_disable_common(void);
-void cpu_die_common(unsigned int cpu);
 void native_smp_prepare_boot_cpu(void);
 void native_smp_prepare_cpus(unsigned int max_cpus);
 void native_smp_cpus_done(unsigned int max_cpus);
 int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_cpu_disable(void);
+int common_cpu_die(unsigned int cpu);
 void native_cpu_die(unsigned int cpu);
 void native_play_dead(void);
 void play_dead_common(void);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aa..c8fa349 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,9 +77,6 @@
 #include 
 #include 
 
-/* State of each CPU */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
 EXPORT_SYMBOL(smp_num_siblings);
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused)
lock_vector_lock();
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
-   per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+   cpu_set_state_online(smp_processor_id());
x86_platform.nmi_init();
 
/* enable local interrupts */
@@ -948,7 +945,10 @@ int native_cpu_up(unsigned int cpu, struct task_struct 
*tidle)
 */
mtrr_save_state();
 
-   per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+   /* x86 CPUs take themselves offline, so delayed offline is OK. */
+   err = cpu_check_up_prepare(cpu);
+   if (err && err != -EBUSY)
+   return err;
 
/* the FPU context is blank, nobody can own it */
__cpu_disable_lazy_restore(cpu);
@@ -1191,7 +1191,7 @@ void __init native_smp_prepare_boot_cpu(void)
switch_to_new_gdt(me);
/* already set me in cpu_online_mask in boot_cpu_init() */
cpumask_set_cpu(me, cpu_callout_mask);
-   per_cpu(cpu_state, me) = CPU_ONLINE;
+   cpu_set_state_online(me);
 }
 
 void __init native_smp_cpus_done(unsigned int max_cpus)
@@ -1318,14 +1318,10 @@ static void __ref remove_cpu_from_maps(int cpu)
numa_remove_cpu(cpu);
 }
 
-static DEFINE_PER_CPU(struct completion, die_complete);
-
 void cpu_disable_common(void)
 {
int cpu = smp_processor_id();
 
-   init_completion(&per_cpu(die_complete, smp_processor_id()));
-
remove_siblinginfo(cpu);
 
/* It's now safe to remove this processor from the online map */
@@ -1349,24 +1345,27 @@ int native_cpu_disable(void)
return 0;
 }
 
-void cpu_die_common(unsigned int cpu)
+int common_cpu_die(unsigned int cpu)
 {
-   wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
-}
+   int ret = 0;
 
-void native_cpu_die(unsigned int cpu)
-{
/* We don't do anything here: idle task is faking death itself. */
 
-   cpu_die_common(cpu);
-
/* They ack this in play_dead() by setting CPU_DEAD */
-   if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+   if (cpu_wait_death(cpu, 5)) {
if (syst