Re: [PATCH] powerpc/powernv: move OPAL call wrapper tracing and interrupt handling to C

2019-02-26 Thread Stewart Smith
Nicholas Piggin  writes:
> The OPAL call wrapper gets interrupt disabling wrong. It disables
> interrupts just by clearing MSR[EE], which has two problems:
>
> - It doesn't call into the IRQ tracing subsystem, which means tracing
>   across OPAL calls does not always notice IRQs have been disabled.
>
> - It doesn't go through the IRQ soft-mask code, which causes a minor
>   bug. MSR[EE] can not be restored by saving the MSR then clearing
>   MSR[EE], because a racing interrupt while soft-masked could clear
>   MSR[EE] between the two steps. This can cause MSR[EE] to be
>   incorrectly enabled when the OPAL call returns. Fortunately that
>   should only result in another masked interrupt being taken to
>   disable MSR[EE] again, but it's a bit sloppy.
>
> The existing code also saves MSR to PACA, which is not re-entrant if
> there is a nested OPAL call from different MSR contexts, which can
> happen these days with SRESET interrupts on bare metal.
>
> To fix these issues, move the tracing and IRQ handling code to C, and
> call into asm just for the low level call when everything is ready to
> go. Save the MSR on stack rather than PACA.
>
> Performance cost is kept to a minimum with a few optimisations:
>
> - The endian switch upon return is combined with the MSR restore,
>   which avoids an expensive context synchronizing operation for LE
>   kernels. This makes up for the additional mtmsrd to enable
>   interrupts with local_irq_enable().
>
> - blr is now used to return from the opal_* functions that are called
>   as C functions, to avoid link stack corruption. This requires a
>   skiboot fix as well to keep the call stack balanced.
>
> A NULL call is more costly after this, (410ns->430ns on POWER9), but
> OPAL calls are generally not performance critical at this scale.

At least with this patch we can start to better measure things, and
that's a big plus. Also, When I get to start worrying about <1ms OPAL
calls, I think we can revisit optimising this path :)


-- 
Stewart Smith
OPAL Architect, IBM.



[PATCH] powerpc/powernv: move OPAL call wrapper tracing and interrupt handling to C

2019-02-26 Thread Nicholas Piggin
The OPAL call wrapper gets interrupt disabling wrong. It disables
interrupts just by clearing MSR[EE], which has two problems:

- It doesn't call into the IRQ tracing subsystem, which means tracing
  across OPAL calls does not always notice IRQs have been disabled.

- It doesn't go through the IRQ soft-mask code, which causes a minor
  bug. MSR[EE] can not be restored by saving the MSR then clearing
  MSR[EE], because a racing interrupt while soft-masked could clear
  MSR[EE] between the two steps. This can cause MSR[EE] to be
  incorrectly enabled when the OPAL call returns. Fortunately that
  should only result in another masked interrupt being taken to
  disable MSR[EE] again, but it's a bit sloppy.

The existing code also saves MSR to PACA, which is not re-entrant if
there is a nested OPAL call from different MSR contexts, which can
happen these days with SRESET interrupts on bare metal.

To fix these issues, move the tracing and IRQ handling code to C, and
call into asm just for the low level call when everything is ready to
go. Save the MSR on stack rather than PACA.

Performance cost is kept to a minimum with a few optimisations:

- The endian switch upon return is combined with the MSR restore,
  which avoids an expensive context synchronizing operation for LE
  kernels. This makes up for the additional mtmsrd to enable
  interrupts with local_irq_enable().

- blr is now used to return from the opal_* functions that are called
  as C functions, to avoid link stack corruption. This requires a
  skiboot fix as well to keep the call stack balanced.

A NULL call is more costly after this, (410ns->430ns on POWER9), but
OPAL calls are generally not performance critical at this scale.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/asm-prototypes.h |  10 +-
 arch/powerpc/platforms/powernv/Makefile   |   5 +-
 arch/powerpc/platforms/powernv/opal-call.c| 283 +++
 .../powerpc/platforms/powernv/opal-wrappers.S | 343 ++
 4 files changed, 328 insertions(+), 313 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-call.c

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index 1d911f68a23b..6c67b4bef854 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -37,13 +37,11 @@ void kexec_copy_flush(struct kimage *image);
 extern struct static_key hcall_tracepoint_key;
 void __trace_hcall_entry(unsigned long opcode, unsigned long *args);
 void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf);
-/* OPAL tracing */
-#ifdef CONFIG_JUMP_LABEL
-extern struct static_key opal_tracepoint_key;
-#endif
 
-void __trace_opal_entry(unsigned long opcode, unsigned long *args);
-void __trace_opal_exit(long opcode, unsigned long retval);
+/* OPAL */
+int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
+   int64_t a4, int64_t a5, int64_t a6, int64_t a7,
+   int64_t opcode, uint64_t msr);
 
 /* VMX copying */
 int enter_vmx_usercopy(void);
diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index b540ce8eec55..da2e99efbd04 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
-obj-y  += setup.o opal-wrappers.o opal.o opal-async.o idle.o
-obj-y  += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
+obj-y  += setup.o opal-call.o opal-wrappers.o opal.o 
opal-async.o
+obj-y  += idle.o opal-rtc.o opal-nvram.o opal-lpc.o 
opal-flash.o
 obj-y  += rng.o opal-elog.o opal-dump.o opal-sysparam.o 
opal-sensor.o
 obj-y  += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 obj-y  += opal-kmsg.o opal-powercap.o opal-psr.o 
opal-sensor-groups.o
@@ -11,7 +11,6 @@ obj-$(CONFIG_CXL_BASE)+= pci-cxl.o
 obj-$(CONFIG_EEH)  += eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)   += opal-memory-errors.o
-obj-$(CONFIG_TRACEPOINTS)  += opal-tracepoints.o
 obj-$(CONFIG_OPAL_PRD) += opal-prd.o
 obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
 obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
diff --git a/arch/powerpc/platforms/powernv/opal-call.c 
b/arch/powerpc/platforms/powernv/opal-call.c
new file mode 100644
index ..578757d403ab
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Since the tracing code might execute OPAL calls we need to guard against
+ * recursion.
+ */
+static DEFINE_PER_CPU(unsigned int, opal_trace_depth);
+
+static void __trace_opal_entry(s64 a0, s64 a1, s64 a2, s64 a3,
+  s64 a4, s64 a5, s64 a6, s64 a7,
+

[RFC PATCH] powerpc/powernv: move OPAL call wrapper tracing and interrupt handling to C

2018-12-21 Thread Nicholas Piggin
The OPAL call wrapper gets interrupt disabling wrong. It disables
interrupts just by clearing MSR[EE], which has two problems:

- It doesn't call into the IRQ tracing subsystem, which means tracing
  across OPAL calls does not always notice IRQs have been disabled.

- It doesn't go through the IRQ soft-mask code, which causes a minor
  bug. MSR[EE] can not be restored by saving the MSR then clearing
  MSR[EE], because a racing interrupt while soft-masked because a
  masked interrupt could race and clear MSR[EE] between the two steps.
  This can cause MSR[EE] to be incorrectly enabled when the OPAL call
  returns. Fortunately that should just take an interrupt and re-run
  the masked handler to fix things, but it's sloppy.

The wapper also saves MSR to PACA, which is not re-entrant if the
nested MSRs does not match, which could be a problem if a SRESET
interrupts a real-mode call, for example.

To fix this, move the tracing and IRQ handling code to C, and call
into asm when everything is ready to go. Save MSR on stack.

Performance cost is kept to a minimum with a few optimisations,

- The endian switch upon return is combined with the MSR restore,
  which avoids an expensive context synchronizing operation for LE
  kernels. This makes up for the additional mtmsrd to enable
  interrupts with local_irq_enable().

- The bl/blr branches are balanced to avoid link stack corruption and
  reduce mispredicts. This requires a skiboot link stack fix as well.

Even so, a NULL call goes from 410ns to 430ns (POWER9) after this.
I would have expected it to come out much closer or even a little
ahead, can't see why it's ~70 cycles slower. Might have to do some
trace analysis.
---
 arch/powerpc/include/asm/asm-prototypes.h |  10 +-
 arch/powerpc/platforms/powernv/Makefile   |   5 +-
 arch/powerpc/platforms/powernv/opal-call.c| 277 ++
 .../platforms/powernv/opal-tracepoints.c  |  88 -
 .../powerpc/platforms/powernv/opal-wrappers.S | 344 ++
 5 files changed, 322 insertions(+), 402 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-call.c
 delete mode 100644 arch/powerpc/platforms/powernv/opal-tracepoints.c

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index ec691d489656..4c0599e5bfaf 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -37,13 +37,11 @@ void kexec_copy_flush(struct kimage *image);
 extern struct static_key hcall_tracepoint_key;
 void __trace_hcall_entry(unsigned long opcode, unsigned long *args);
 void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf);
-/* OPAL tracing */
-#ifdef HAVE_JUMP_LABEL
-extern struct static_key opal_tracepoint_key;
-#endif
 
-void __trace_opal_entry(unsigned long opcode, unsigned long *args);
-void __trace_opal_exit(long opcode, unsigned long retval);
+/* OPAL */
+int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
+   int64_t a4, int64_t a5, int64_t a6, int64_t a7,
+   int64_t opcode, uint64_t msr);
 
 /* VMX copying */
 int enter_vmx_usercopy(void);
diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index b540ce8eec55..da2e99efbd04 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
-obj-y  += setup.o opal-wrappers.o opal.o opal-async.o idle.o
-obj-y  += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
+obj-y  += setup.o opal-call.o opal-wrappers.o opal.o 
opal-async.o
+obj-y  += idle.o opal-rtc.o opal-nvram.o opal-lpc.o 
opal-flash.o
 obj-y  += rng.o opal-elog.o opal-dump.o opal-sysparam.o 
opal-sensor.o
 obj-y  += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 obj-y  += opal-kmsg.o opal-powercap.o opal-psr.o 
opal-sensor-groups.o
@@ -11,7 +11,6 @@ obj-$(CONFIG_CXL_BASE)+= pci-cxl.o
 obj-$(CONFIG_EEH)  += eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)   += opal-memory-errors.o
-obj-$(CONFIG_TRACEPOINTS)  += opal-tracepoints.o
 obj-$(CONFIG_OPAL_PRD) += opal-prd.o
 obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
 obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
diff --git a/arch/powerpc/platforms/powernv/opal-call.c 
b/arch/powerpc/platforms/powernv/opal-call.c
new file mode 100644
index ..172fea114a92
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -0,0 +1,277 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Since the tracing code might execute OPAL calls we need to guard against
+ * recursion.
+ */
+static DEFINE_PER_CPU(unsigned int, opal_trace_depth);
+
+static void __trace_opal_entry(s64 a0, s64 a1, s64 a2, s64 a3,
+