On 1/23/19 6:15 AM, Masami Hiramatsu wrote:
> Hi Daniel,
> 
> On Fri, 21 Dec 2018 11:27:32 +0100
> Daniel Bristot de Oliveira <[email protected]> wrote:
> 
>> Currently, the patch of an address is done in three steps:
>>
>> -- Pseudo-code #1 - Current implementation ---
>>         1) add an int3 trap to the address that will be patched
>>             sync cores (send IPI to all other CPUs)
>>         2) update all but the first byte of the patched range
>>             sync cores (send IPI to all other CPUs)
>>         3) replace the first byte (int3) by the first byte of replacing 
>> opcode
>>             sync cores (send IPI to all other CPUs)
>> -- Pseudo-code #1 ---
>>
>> When a static key has more than one entry, these steps are called once for
>> each entry. The number of IPIs then is linear with regard to the number 'n' 
>> of
>> entries of a key: O(n*3), which is O(n).
>>
>> This algorithm works fine for the update of a single key. But we think
>> it is possible to optimize the case in which a static key has more than
>> one entry. For instance, the sched_schedstats jump label has 56 entries
>> in my (updated) fedora kernel, resulting in 168 IPIs for each CPU in
>> which the thread that is enabling the key is _not_ running.
>>
>> With this patch, rather than receiving a single patch to be processed, a 
>> vector
>> of patches is passed, enabling the rewrite of the pseudo-code #1 in this
>> way:
>>
>> -- Pseudo-code #2 - This patch  ---
>> 1)  for each patch in the vector:
>>         add an int3 trap to the address that will be patched
>>
>>     sync cores (send IPI to all other CPUs)
>>
>> 2)  for each patch in the vector:
>>         update all but the first byte of the patched range
>>
>>     sync cores (send IPI to all other CPUs)
>>
>> 3)  for each patch in the vector:
>>         replace the first byte (int3) by the first byte of replacing opcode
>>
>>     sync cores (send IPI to all other CPUs)
>> -- Pseudo-code #2 - This patch  ---
>>
>> Doing the update in this way, the number of IPI becomes O(3) with regard
>> to the number of keys, which is O(1).
>>
>> The batch mode is done with the function text_poke_bp_batch(), that receives
>> two arguments: a vector of "struct text_to_poke", and the number of entries
>> in the vector.
>>
>> The vector must be sorted by the addr field of the text_to_poke structure,
>> enabling the binary search of a handler in the poke_int3_handler function
>> (a fast path).
>>
>> Signed-off-by: Daniel Bristot de Oliveira <[email protected]>
>> Cc: Thomas Gleixner <[email protected]>
>> Cc: Ingo Molnar <[email protected]>
>> Cc: Borislav Petkov <[email protected]>
>> Cc: "H. Peter Anvin" <[email protected]>
>> Cc: Greg Kroah-Hartman <[email protected]>
>> Cc: Masami Hiramatsu <[email protected]>
>> Cc: "Steven Rostedt (VMware)" <[email protected]>
>> Cc: Jiri Kosina <[email protected]>
>> Cc: Josh Poimboeuf <[email protected]>
>> Cc: "Peter Zijlstra (Intel)" <[email protected]>
>> Cc: Chris von Recklinghausen <[email protected]>
>> Cc: Jason Baron <[email protected]>
>> Cc: Scott Wood <[email protected]>
>> Cc: Marcelo Tosatti <[email protected]>
>> Cc: Clark Williams <[email protected]>
>> Cc: [email protected]
>> Cc: [email protected]
>> ---
>>  arch/x86/include/asm/text-patching.h |  15 ++++
>>  arch/x86/kernel/alternative.c        | 108 +++++++++++++++++++++++++--
>>  2 files changed, 117 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/text-patching.h 
>> b/arch/x86/include/asm/text-patching.h
>> index e85ff65c43c3..42ea7846df33 100644
>> --- a/arch/x86/include/asm/text-patching.h
>> +++ b/arch/x86/include/asm/text-patching.h
>> @@ -18,6 +18,20 @@ static inline void apply_paravirt(struct 
>> paravirt_patch_site *start,
>>  #define __parainstructions_end      NULL
>>  #endif
>>  
>> +/*
>> + * Currently, the max observed size in the kernel code is
>> + * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5.
>> + * Raise it if needed.
>> + */
>> +#define POKE_MAX_OPCODE_SIZE        5
>> +
>> +struct text_to_poke {
>> +    void *handler;
>> +    void *addr;
>> +    size_t len;
>> +    const char opcode[POKE_MAX_OPCODE_SIZE];
>> +};
>> +
>>  extern void *text_poke_early(void *addr, const void *opcode, size_t len);
>>  
>>  /*
>> @@ -37,6 +51,7 @@ extern void *text_poke_early(void *addr, const void 
>> *opcode, size_t len);
>>  extern void *text_poke(void *addr, const void *opcode, size_t len);
>>  extern int poke_int3_handler(struct pt_regs *regs);
>>  extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void 
>> *handler);
>> +extern void text_poke_bp_batch(struct text_to_poke *tp, unsigned int 
>> nr_entries);
>>  extern int after_bootmem;
>>  
>>  #endif /* _ASM_X86_TEXT_PATCHING_H */
>> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
>> index 6f5ad8587de0..8fa47e5ec709 100644
>> --- a/arch/x86/kernel/alternative.c
>> +++ b/arch/x86/kernel/alternative.c
>> @@ -21,6 +21,7 @@
>>  #include <asm/tlbflush.h>
>>  #include <asm/io.h>
>>  #include <asm/fixmap.h>
>> +#include <linux/bsearch.h>
>>  
>>  int __read_mostly alternatives_patched;
>>  
>> @@ -738,10 +739,32 @@ static void do_sync_core(void *info)
>>  }
>>  
>>  static bool bp_patching_in_progress;
>> +/*
>> + * Single poke.
>> + */
>>  static void *bp_int3_handler, *bp_int3_addr;
>> +/*
>> + * Batching poke.
>> + */
>> +static struct text_to_poke *bp_int3_tpv;
>> +static unsigned int bp_int3_tpv_nr;
>> +
>> +static int text_bp_batch_bsearch(const void *key, const void *elt)
>> +{
>> +    struct text_to_poke *tp = (struct text_to_poke *) elt;
>> +
>> +    if (key < tp->addr)
>> +            return -1;
>> +    if (key > tp->addr)
>> +            return 1;
>> +    return 0;
>> +}
>>  
>>  int poke_int3_handler(struct pt_regs *regs)
>>  {
>> +    void *ip;
>> +    struct text_to_poke *tp;
>> +
>>      /*
>>       * Having observed our INT3 instruction, we now must observe
>>       * bp_patching_in_progress.
>> @@ -757,21 +780,41 @@ int poke_int3_handler(struct pt_regs *regs)
>>      if (likely(!bp_patching_in_progress))
>>              return 0;
>>  
>> -    if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
>> +    if (user_mode(regs))
>>              return 0;
>>  
>> -    /* set up the specified breakpoint handler */
>> -    regs->ip = (unsigned long) bp_int3_handler;
>> +    /*
>> +     * Single poke first.
>> +     */
> 
> I wonder why would you separate single poke and batch poke?
> It seems a single poke is just a case that bp_int3_tpv_nr == 1.

Hi Masami!

The single poke is used only at boot time, before the system is able to
allocate memory. After that, the batch mode becomes the default.

I was thinking of making one function for each method, but then I would have to
change do_int3() and manage how to switch between one and the other without
further overhead. I was planning to do this in a second round of improvements.

> If so, you can remove bp_int3_addr and this block.
> 
>> +    if (bp_int3_addr) {
>> +            if (regs->ip == (unsigned long) bp_int3_addr) {
>> +                    regs->ip = (unsigned long) bp_int3_handler;
>> +                    return 1;
>> +            }
>> +            return 0;
>> +    }
>>
>> -    return 1;
>> +    /*
>> +     * Batch mode.
>> +     */
>> +    if (bp_int3_tpv_nr) {
> 
> if (unlikely(bp_int3_tpv_nr))
> 
> Sorry about interrupting, but this is a "hot-path" when we use kprobes.

No problem at all! :-)

I will change this function to better deal with the hot-path (the default mode
after the system boots up).

how about something like this:
------------------ %< ------------------
/*
 * poke_int3_handler - redirect a CPU that trapped on an int3 planted by
 * text poking.
 * @regs: register state at the time of the trap
 *
 * Returns 1 after pointing regs->ip at the handler registered for the
 * trapping address. Returns 0 when the trap is not ours: no patching is in
 * progress, the trap came from user mode, or the address matches no pending
 * poke.
 */
int poke_int3_handler(struct pt_regs *regs)
{
        void *ip;
        struct text_to_poke *tp;

        /*
         * Having observed our INT3 instruction, we now must observe
         * bp_patching_in_progress.
         *
         *      in_progress = TRUE              INT3
         *      WMB                             RMB
         *      write INT3                      if (in_progress)
         *
         * Idem for bp_int3_handler.
         */
        smp_rmb();

        /* Common case: nobody is patching, so this int3 belongs to someone else. */
        if (likely(!bp_patching_in_progress))
                return 0;

        /* Only kernel-mode traps are redirected. */
        if (user_mode(regs))
                return 0;

        /*
         * Single poke is only used at boot, while no batch vector is
         * installed; keep that rare case out of the straight-line path.
         */
        if (unlikely(!bp_int3_tpv))
                goto single_poke;

        /*
         * Batch mode: look the trapping address up in the vector.
         * bsearch() relies on the vector being sorted by ->addr.
         *
         * NOTE(review): regs->ip presumably points just past the one-byte
         * int3, hence the step back to the patched address - confirm.
         */
        ip = (void *) regs->ip - sizeof(unsigned char);
        tp = bsearch(ip, bp_int3_tpv, bp_int3_tpv_nr,
                     sizeof(struct text_to_poke),
                     text_bp_batch_bsearch);
        if (tp) {
                /* set up the specified breakpoint handler */
                regs->ip = (unsigned long) tp->handler;
                return 1;
        }

        /* In-kernel int3 at an address that is not being patched. */
        return 0;

single_poke:
        /*
         * NOTE(review): regs->ip is compared here without the -1 adjustment
         * used in batch mode; presumably bp_int3_addr already holds the
         * address following the int3 - confirm against text_poke_bp().
         */
        if (regs->ip == (unsigned long) bp_int3_addr) {
                regs->ip = (unsigned long) bp_int3_handler;
                return 1;
        }

        return 0;
}
------------- >% ----------

In this way the default (batch) code path comes first, and the only 'if' I am
using tests a variable of the batch mode (which will be used later anyway). If
we are still at boot, we jump to the end of the function.

look better?

> 
> Also, could you add NOKPROBE_SYMBOL(); for all symbols involved in this
> process?
> Recently I found I missed it for poke_int3_handler and sent a fix.
> ( https://www.mail-archive.com/[email protected]/msg1898241.html )
> If this increase the function-call-chain from poke_int3_handler, those
> must be marked as NOKPROBE_SYMBOL().

Ack! Doing that!

Thanks!

Reply via email to