On Tue, Feb 09, 2016 at 12:23:58PM +0100, Ingo Molnar wrote:
> * Will Deacon <will.dea...@arm.com> wrote:
> > On Wed, Feb 03, 2016 at 01:32:10PM +0000, Will Deacon wrote:
> > > On Wed, Feb 03, 2016 at 09:33:39AM +0100, Ingo Molnar wrote:
> > > > In fact I'd suggest to test this via a quick runtime hack like this in 
> > > > rcupdate.h:
> > > > 
> > > >         extern int panic_timeout;
> > > > 
> > > >         ...
> > > > 
> > > >         if (panic_timeout)
> > > >                 smp_load_acquire(p);
> > > >         else
> > > >                 typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p);
> > > > 
> > > > (or so)
> > > 
> > > So the problem with this is that a LOAD <ctrl> LOAD sequence isn't an
> > > ordering hazard on ARM, so you're potentially at the mercy of the branch
> > > predictor as to whether you get an acquire. That's not to say it won't
> > > be discarded as soon as the conditional is resolved, but it could
> > > screw up the benchmarking.
> > > 
> > > I'd be better off doing some runtime patching, but that's not something
> > > I can knock up in a couple of minutes (so I'll add it to my list).
> > 
> > ... so I actually got that up and running, believe it or not. Filthy stuff.
> 
> Wow!
> 
> I tried to implement the simpler solution by hacking rcupdate.h, but got drowned
> in nasty circular header file dependencies and gave up...

I hacked linux/compiler.h, but got similar problems and ended up copying
what I need under a new namespace (posh way of saying I added _WILL to
everything until it built).

> If you are not overly embarrassed by posting hacky patches, mind posting your 
> solution?

Ok, since you asked! I'm thoroughly ashamed of the hacks, but maybe it
will keep those spammy Microsoft recruiters at bay ;)

Note that the "trigger" is changing the loglevel, but you need to do this
by echoing to /proc/sysrq-trigger, otherwise the whole thing gets invoked
in irq context which ends badly. It's also heavily arm64-specific.

> > The good news is that you're right, and I'm now seeing ~1% difference between
> > the runs with ~0.3% noise for either of them. I still think that's significant,
> > but it's a lot more reassuring than 4%.
> 
> hm, so for such marginal effects I think we could improve the testing method a
> bit: we could improve 'perf bench sched messaging' to allow 'steady state
> testing': to not exit+restart all the processes between test iterations, but to
> continuously measure and print out current performance figures.
> 
> I.e. every 10 seconds it could print a decaying running average of current 
> throughput.
> 
> That way you could patch/unpatch the instructions without having to restart the
> tasks. If you still see an effect (in the numbers reported every 10 seconds), then
> that's a guaranteed result.
> 
> [ We have such functionality in 'perf bench numa' (the --show-convergence option),
>   for similar reasons, to allow runtime monitoring and tweaking of kernel
>   parameters. ]

That sounds handy.

Will

--->8

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 8f271b83f910..1a1e353983be 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -30,8 +30,8 @@
 #define ARM64_HAS_LSE_ATOMICS                  5
 #define ARM64_WORKAROUND_CAVIUM_23154          6
 #define ARM64_WORKAROUND_834220                        7
-
-#define ARM64_NCAPS                            8
+#define ARM64_RCU_USES_ACQUIRE                 8
+#define ARM64_NCAPS                            9
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index d2ee1b21a10d..edbf188a8541 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -143,6 +143,66 @@ static int __apply_alternatives_multi_stop(void *unused)
        return 0;
 }
 
+static void __apply_alternatives_rcu(void *alt_region)
+{
+       struct alt_instr *alt;
+       struct alt_region *region = alt_region;
+       u32 *origptr, *replptr;
+
+       for (alt = region->begin; alt < region->end; alt++) {
+               u32 insn, orig_insn;
+               int nr_inst;
+
+               if (alt->cpufeature != ARM64_RCU_USES_ACQUIRE)
+                       continue;
+
+               BUG_ON(alt->alt_len != alt->orig_len);
+
+               origptr = ALT_ORIG_PTR(alt);
+               replptr = ALT_REPL_PTR(alt);
+               nr_inst = alt->alt_len / sizeof(insn);
+
+               BUG_ON(nr_inst != 1);
+
+               insn = le32_to_cpu(*replptr);
+               orig_insn = le32_to_cpu(*origptr);
+               *(origptr) = cpu_to_le32(insn);
+               *replptr = cpu_to_le32(orig_insn);
+
+               flush_icache_range((uintptr_t)origptr, (uintptr_t)(origptr + 1));
+               pr_info_ratelimited("%p: 0x%x => 0x%x\n", origptr, orig_insn, insn);
+       }
+}
+
+static int will_patched;
+
+static int __apply_alternatives_rcu_multi_stop(void *unused)
+{
+       struct alt_region region = {
+               .begin  = __alt_instructions,
+               .end    = __alt_instructions_end,
+       };
+
+       /* We always have a CPU 0 at this point (__init) */
+       if (smp_processor_id()) {
+               while (!READ_ONCE(will_patched))
+                       cpu_relax();
+               isb();
+       } else {
+               __apply_alternatives_rcu(&region);
+               /* Barriers provided by the cache flushing */
+               WRITE_ONCE(will_patched, 1);
+       }
+
+       return 0;
+}
+
+void apply_alternatives_rcu(void)
+{
+       will_patched = 0;
+       stop_machine(__apply_alternatives_rcu_multi_stop, NULL, cpu_online_mask);
+}
+
 void __init apply_alternatives_all(void)
 {
        /* better not try code patching on a live SMP system */
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index e3928f578891..98c132c3b018 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -141,7 +141,10 @@ SECTIONS
 
        PERCPU_SECTION(L1_CACHE_BYTES)
 
-       . = ALIGN(4);
+       __init_end = .;
+       . = ALIGN(PAGE_SIZE);
+
+
        .altinstructions : {
                __alt_instructions = .;
                *(.altinstructions)
@@ -151,8 +154,7 @@ SECTIONS
                *(.altinstr_replacement)
        }
 
-       . = ALIGN(PAGE_SIZE);
-       __init_end = .;
+       . = ALIGN(4);
 
        _data = .;
        _sdata = .;
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index e5139402e7f8..3eb7193fcf88 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -80,6 +80,7 @@ static int __init sysrq_always_enabled_setup(char *str)
 
 __setup("sysrq_always_enabled", sysrq_always_enabled_setup);
 
+extern void apply_alternatives_rcu(void);
 
 static void sysrq_handle_loglevel(int key)
 {
@@ -89,6 +90,7 @@ static void sysrq_handle_loglevel(int key)
        console_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
        pr_info("Loglevel set to %d\n", i);
        console_loglevel = i;
+       apply_alternatives_rcu();
 }
 static struct sysrq_key_op sysrq_loglevel_op = {
        .handler        = sysrq_handle_loglevel,
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 00b042c49ccd..75e29d61fedd 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -218,6 +218,61 @@ void __read_once_size(const volatile void *p, void *res, int size)
        __READ_ONCE_SIZE;
 }
 
+extern void panic(const char *fmt, ...);
+
+#define __stringify_1_will(x...)       #x
+#define __stringify_will(x...) __stringify_1_will(x)
+
+#define ALTINSTR_ENTRY_WILL(feature)                                           \
+       " .word 661b - .\n"                             /* label           */ \
+       " .word 663f - .\n"                             /* new instruction */ \
+       " .hword " __stringify_will(feature) "\n"               /* feature bit     */ \
+       " .byte 662b-661b\n"                            /* source len      */ \
+       " .byte 664f-663f\n"                            /* replacement len */
+
+#define __ALTERNATIVE_CFG_WILL(oldinstr, newinstr, feature, cfg_enabled)       \
+       ".if "__stringify_will(cfg_enabled)" == 1\n"                            \
+       "661:\n\t"                                                      \
+       oldinstr "\n"                                                   \
+       "662:\n"                                                        \
+       ".pushsection .altinstructions,\"a\"\n"                         \
+       ALTINSTR_ENTRY_WILL(feature)                                            \
+       ".popsection\n"                                                 \
+       ".pushsection .altinstr_replacement, \"a\"\n"                   \
+       "663:\n\t"                                                      \
+       newinstr "\n"                                                   \
+       "664:\n\t"                                                      \
+       ".popsection\n\t"                                               \
+       ".org   . - (664b-663b) + (662b-661b)\n\t"                      \
+       ".org   . - (662b-661b) + (664b-663b)\n"                        \
+       ".endif\n"
+
+#define _ALTERNATIVE_CFG_WILL(oldinstr, newinstr, feature, cfg, ...)   \
+       __ALTERNATIVE_CFG_WILL(oldinstr, newinstr, feature, 1)
+
+#define ALTERNATIVE_WILL(oldinstr, newinstr, ...)   \
+       _ALTERNATIVE_CFG_WILL(oldinstr, newinstr, __VA_ARGS__, 1)
+
+#define __READ_ONCE_SIZE_WILL                                          \
+({                                                                     \
+       __u64 tmp;                                                      \
+                                                                       \
+       switch (size) {                                                 \
+       case 8: asm volatile(                                           \
+               ALTERNATIVE_WILL("ldr %0, %1", "ldar %0, %1", 8)        \
+               : "=r" (tmp) : "Q" (*(volatile __u64 *)p));             \
+               *(__u64 *)res = tmp; break;                             \
+       default:                                                        \
+               panic("that's no pointer, yo");                         \
+       }                                                               \
+})
+
+static __always_inline
+void __read_once_size_will(const volatile void *p, void *res, int size)
+{
+       __READ_ONCE_SIZE_WILL;
+}
+
 #ifdef CONFIG_KASAN
 /*
  * This function is not 'inline' because __no_sanitize_address confilcts
@@ -537,10 +592,17 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * object's lifetime is managed by something other than RCU.  That
  * "something other" might be reference counting or simple immortality.
  */
+
+#define READ_ONCE_WILL(x)                                              \
+({                                                                     \
+       union { typeof(x) __val; char __c[1]; } __u;                    \
+       __read_once_size_will(&(x), __u.__c, sizeof(x));                \
+       __u.__val;                                                      \
+})
+
 #define lockless_dereference(p) \
 ({ \
-       typeof(p) _________p1 = READ_ONCE(p); \
-       smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
+       typeof(p) _________p1 = READ_ONCE_WILL(p); \
        (_________p1); \
 })
 

Reply via email to