Re: [PATCH v2 1/4] powerpc/code-patching: introduce patch_instructions()

2023-03-11 Thread Christophe Leroy


Le 09/03/2023 à 19:02, Hari Bathini a écrit :
> patch_instruction() entails setting up pte, patching the instruction,
> clearing the pte and flushing the tlb. If multiple instructions need
> to be patched, every instruction would have to go through the above
> drill unnecessarily. Instead, introduce function patch_instructions()
> that patches multiple instructions at one go while setting up the pte,
> clearing the pte and flushing the tlb only once per page range of
> instructions. Observed ~5X improvement in speed of execution using
> patch_instructions() over patch_instruction(), when more instructions
> are to be patched.
> 
> Signed-off-by: Hari Bathini 
> ---
>   arch/powerpc/include/asm/code-patching.h |   1 +
>   arch/powerpc/lib/code-patching.c | 151 ---
>   2 files changed, 106 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/code-patching.h 
> b/arch/powerpc/include/asm/code-patching.h
> index 3f881548fb61..059fc4fe700e 100644
> --- a/arch/powerpc/include/asm/code-patching.h
> +++ b/arch/powerpc/include/asm/code-patching.h
> @@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
>   int patch_branch(u32 *addr, unsigned long target, int flags);
>   int patch_instruction(u32 *addr, ppc_inst_t instr);
>   int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
> +int patch_instructions(u32 *addr, u32 *code, bool fill_inst, size_t len);
>   
>   static inline unsigned long patch_site_addr(s32 *site)
>   {
> diff --git a/arch/powerpc/lib/code-patching.c 
> b/arch/powerpc/lib/code-patching.c
> index b00112d7ad46..33857b9b53de 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -278,77 +278,117 @@ static void unmap_patch_area(unsigned long addr)
>   flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>   }
>   
> -static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr)
> +static int __do_patch_instructions_mm(u32 *addr, u32 *code, bool fill_inst, 
> size_t len)

Some time ago we did a huge amount of work to clean up the use of u32 as code versus 
ppc_inst_t.

Please carefully audit all places where you use u32 instead of 
ppc_inst_t. If you do so, don't use anymore the word 'instruction' in 
the function name.

>   {
> - int err;
> - u32 *patch_addr;
> - unsigned long text_poke_addr;
> - pte_t *pte;
> - unsigned long pfn = get_patch_pfn(addr);
> - struct mm_struct *patching_mm;
> - struct mm_struct *orig_mm;
> + struct mm_struct *patching_mm, *orig_mm;

This change is cosmetic and not functionally linked to the patch.

> + unsigned long text_poke_addr, pfn;
> + u32 *patch_addr, *end, *pend;
> + ppc_inst_t instr;
>   spinlock_t *ptl;
> + int ilen, err;
> + pte_t *pte;

Why move this declaration ?

>   
>   patching_mm = __this_cpu_read(cpu_patching_context.mm);
>   text_poke_addr = __this_cpu_read(cpu_patching_context.addr);
> - patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
>   
>   pte = get_locked_pte(patching_mm, text_poke_addr, &ptl);
>   if (!pte)
>   return -ENOMEM;
>   
> - __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, 
> PAGE_KERNEL), 0);
> + end = (void *)addr + len;
> + do {
> + pfn = get_patch_pfn(addr);
> + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, 
> PAGE_KERNEL), 0);
>   
> - /* order PTE update before use, also serves as the hwsync */
> - asm volatile("ptesync": : :"memory");
> -
> - /* order context switch after arbitrary prior code */
> - isync();
> -
> - orig_mm = start_using_temp_mm(patching_mm);
> -
> - err = __patch_instruction(addr, instr, patch_addr);
> + /* order PTE update before use, also serves as the hwsync */
> + asm volatile("ptesync": : :"memory");
>   
> - /* hwsync performed by __patch_instruction (sync) if successful */
> - if (err)
> - mb();  /* sync */
> + /* order context switch after arbitrary prior code */
> + isync();
> +
> + orig_mm = start_using_temp_mm(patching_mm);
> +
> + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
> + pend = (void *)addr + PAGE_SIZE - offset_in_page(addr);
> + if (end < pend)
> + pend = end;
> +
> + while (addr < pend) {
> + instr = ppc_inst_read(code);
> + ilen = ppc_inst_len(instr);
> + err = __patch_instruction(addr, instr, patch_addr);
> + /* hwsync performed by __patch_instruction (sync) if 
> successful */

As you aim at optimising the loops, you should move the cache flushes 
outside this loop, with flush_dcache_range() and invalidate_dcache_range().

> + if (err) {
> + mb();  /* sync */
> + break;
> +

Re: [PATCH v2 1/4] powerpc/code-patching: introduce patch_instructions()

2023-03-10 Thread Christophe Leroy


Le 10/03/2023 à 19:26, Christophe Leroy a écrit :
> 
> 
> Le 09/03/2023 à 19:02, Hari Bathini a écrit :
>> patch_instruction() entails setting up pte, patching the instruction,
>> clearing the pte and flushing the tlb. If multiple instructions need
>> to be patched, every instruction would have to go through the above
>> drill unnecessarily. Instead, introduce function patch_instructions()
>> that patches multiple instructions at one go while setting up the pte,
>> clearing the pte and flushing the tlb only once per page range of
>> instructions. Observed ~5X improvement in speed of execution using
>> patch_instructions() over patch_instruction(), when more instructions
>> are to be patched.
> 
> I get a 13% degradation on the time needed to activate ftrace on a 
> powerpc 8xx.
> 
> Before your patch, activating ftrace takes 550k timebase ticks. After 
> your patch it takes 620k timebase ticks.
> 

More details about the problem:

Before your patch, patch_instruction() is a simple, stackless function 
(Note that the first branch is noped out after startup).

0254 :
  254:  48 00 00 6c b   2c0 
  258:  7c e0 00 a6 mfmsr   r7
  25c:  7c 51 13 a6 mtspr   81,r2
  260:  3d 40 00 00 lis r10,0
262: R_PPC_ADDR16_HA.data
  264:  39 4a 00 00 addir10,r10,0
266: R_PPC_ADDR16_LO.data
  268:  7c 69 1b 78 mr  r9,r3
  26c:  3d 29 40 00 addis   r9,r9,16384
  270:  81 0a 00 08 lwz r8,8(r10)
  274:  55 29 00 26 rlwinm  r9,r9,0,0,19
  278:  81 4a 00 04 lwz r10,4(r10)
  27c:  61 29 01 25 ori r9,r9,293
  280:  91 28 00 00 stw r9,0(r8)
  284:  55 49 00 26 rlwinm  r9,r10,0,0,19
  288:  50 6a 05 3e rlwimi  r10,r3,0,20,31
  28c:  90 8a 00 00 stw r4,0(r10)
  290:  7c 00 50 6c dcbst   0,r10
  294:  7c 00 04 ac hwsync
  298:  7c 00 1f ac icbi0,r3
  29c:  7c 00 04 ac hwsync
  2a0:  4c 00 01 2c isync
  2a4:  38 60 00 00 li  r3,0
  2a8:  39 40 00 00 li  r10,0
  2ac:  91 48 00 00 stw r10,0(r8)
  2b0:  7c 00 4a 64 tlbie   r9,r0
  2b4:  7c 00 04 ac hwsync
  2b8:  7c e0 01 24 mtmsr   r7
  2bc:  4e 80 00 20 blr

  2c0:  90 83 00 00 stw r4,0(r3)
  2c4:  7c 00 18 6c dcbst   0,r3
  2c8:  7c 00 04 ac hwsync
  2cc:  7c 00 1f ac icbi0,r3
  2d0:  7c 00 04 ac hwsync
  2d4:  4c 00 01 2c isync
  2d8:  38 60 00 00 li  r3,0
  2dc:  4e 80 00 20 blr
  2e0:  38 60 ff ff li  r3,-1
  2e4:  4b ff ff c4 b   2a8 
  2e8:  38 60 ff ff li  r3,-1
  2ec:  4e 80 00 20 blr


Once your patch is there, patch_instruction() becomes a function that 
has to set up a stack frame in order to call __do_patch_instructions().
And __do_patch_instructions() is quite a big function.

022c <__do_patch_instructions>:
  22c:  3d 20 00 00 lis r9,0
22e: R_PPC_ADDR16_HA.data
  230:  39 29 00 00 addir9,r9,0
232: R_PPC_ADDR16_LO.data
  234:  81 69 00 04 lwz r11,4(r9)
  238:  2c 05 00 00 cmpwi   r5,0
  23c:  81 89 00 08 lwz r12,8(r9)
  240:  7c c3 32 14 add r6,r3,r6
  244:  55 6b 00 26 rlwinm  r11,r11,0,0,19
  248:  38 00 00 00 li  r0,0
  24c:  54 6a 05 3e clrlwi  r10,r3,20
  250:  21 0a 10 00 subfic  r8,r10,4096
  254:  7d 03 42 14 add r8,r3,r8
  258:  7c 69 1b 78 mr  r9,r3
  25c:  7f 88 30 40 cmplw   cr7,r8,r6
  260:  3d 29 40 00 addis   r9,r9,16384
  264:  55 29 00 26 rlwinm  r9,r9,0,0,19
  268:  61 29 01 25 ori r9,r9,293
  26c:  91 2c 00 00 stw r9,0(r12)
  270:  7d 4a 5b 78 or  r10,r10,r11
  274:  40 9d 00 08 ble cr7,27c <__do_patch_instructions+0x50>
  278:  7c c8 33 78 mr  r8,r6
  27c:  7f 83 40 40 cmplw   cr7,r3,r8
  280:  40 9c 01 2c bge cr7,3ac <__do_patch_instructions+0x180>
  284:  7c 69 18 f8 not r9,r3
  288:  7d 28 4a 14 add r9,r8,r9
  28c:  55 29 f7 fe rlwinm  r9,r9,30,31,31
  290:  7c e3 50 50 subfr7,r3,r10
  294:  80 a4 00 00 lwz r5,0(r4)
  298:  90 aa 00 00 stw r5,0(r10)
  29c:  7c 00 50 6c dcbst   0,r10
  2a0:  7c 00 04 ac hwsync
  2a4:  7c 00 1f ac icbi0,r3
  2a8:  7c 00 04 ac hwsync
  2ac:  4c 00 01 2c isync
  2b0:  38 63 00 04 addir3,r3,4
  2b4:  40 82 00 08 bne 2bc <__do_patch_instructions+0x90>
  2b8:  38 84 00 04 addir4,r4,4
  2bc:  7f 83 40 40 cmplw   cr7,r3,r8
  2c0:  40 9c 00 a4 bge cr7,364 <__do_patch_instructions+0x138>
  2c4:  2f 89 00 00 cmpwi   cr7,r9,0
  2c8:  41 9e 00 38 beq cr7,300 <__do_patch_instructions+0xd4>
  2cc:  7d 23 3a 14 add r9,r3,r7
  2d0:  81 44 00 00 lwz r10,0(r4)
  2d4:  91 49 00 00 stw r10,0(r9)
  2d8:  7c 00 48 6c dcbst   0,r9
  2dc:  7c 00 04 ac hwsync
  2e0:  7c 00 1f ac icbi0,r3
  2e4:  7c 00 04 ac  

Re: [PATCH v2 1/4] powerpc/code-patching: introduce patch_instructions()

2023-03-10 Thread Christophe Leroy


Le 09/03/2023 à 19:02, Hari Bathini a écrit :
> patch_instruction() entails setting up pte, patching the instruction,
> clearing the pte and flushing the tlb. If multiple instructions need
> to be patched, every instruction would have to go through the above
> drill unnecessarily. Instead, introduce function patch_instructions()
> that patches multiple instructions at one go while setting up the pte,
> clearing the pte and flushing the tlb only once per page range of
> instructions. Observed ~5X improvement in speed of execution using
> patch_instructions() over patch_instruction(), when more instructions
> are to be patched.

I get a 13% degradation on the time needed to activate ftrace on a 
powerpc 8xx.

Before your patch, activating ftrace takes 550k timebase ticks. After 
your patch it takes 620k timebase ticks.

Christophe

> 
> Signed-off-by: Hari Bathini 
> ---
>   arch/powerpc/include/asm/code-patching.h |   1 +
>   arch/powerpc/lib/code-patching.c | 151 ---
>   2 files changed, 106 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/code-patching.h 
> b/arch/powerpc/include/asm/code-patching.h
> index 3f881548fb61..059fc4fe700e 100644
> --- a/arch/powerpc/include/asm/code-patching.h
> +++ b/arch/powerpc/include/asm/code-patching.h
> @@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
>   int patch_branch(u32 *addr, unsigned long target, int flags);
>   int patch_instruction(u32 *addr, ppc_inst_t instr);
>   int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
> +int patch_instructions(u32 *addr, u32 *code, bool fill_inst, size_t len);
>   
>   static inline unsigned long patch_site_addr(s32 *site)
>   {
> diff --git a/arch/powerpc/lib/code-patching.c 
> b/arch/powerpc/lib/code-patching.c
> index b00112d7ad46..33857b9b53de 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -278,77 +278,117 @@ static void unmap_patch_area(unsigned long addr)
>   flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>   }
>   
> -static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr)
> +static int __do_patch_instructions_mm(u32 *addr, u32 *code, bool fill_inst, 
> size_t len)
>   {
> - int err;
> - u32 *patch_addr;
> - unsigned long text_poke_addr;
> - pte_t *pte;
> - unsigned long pfn = get_patch_pfn(addr);
> - struct mm_struct *patching_mm;
> - struct mm_struct *orig_mm;
> + struct mm_struct *patching_mm, *orig_mm;
> + unsigned long text_poke_addr, pfn;
> + u32 *patch_addr, *end, *pend;
> + ppc_inst_t instr;
>   spinlock_t *ptl;
> + int ilen, err;
> + pte_t *pte;
>   
>   patching_mm = __this_cpu_read(cpu_patching_context.mm);
>   text_poke_addr = __this_cpu_read(cpu_patching_context.addr);
> - patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
>   
>   pte = get_locked_pte(patching_mm, text_poke_addr, &ptl);
>   if (!pte)
>   return -ENOMEM;
>   
> - __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, 
> PAGE_KERNEL), 0);
> + end = (void *)addr + len;
> + do {
> + pfn = get_patch_pfn(addr);
> + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, 
> PAGE_KERNEL), 0);
>   
> - /* order PTE update before use, also serves as the hwsync */
> - asm volatile("ptesync": : :"memory");
> -
> - /* order context switch after arbitrary prior code */
> - isync();
> -
> - orig_mm = start_using_temp_mm(patching_mm);
> -
> - err = __patch_instruction(addr, instr, patch_addr);
> + /* order PTE update before use, also serves as the hwsync */
> + asm volatile("ptesync": : :"memory");
>   
> - /* hwsync performed by __patch_instruction (sync) if successful */
> - if (err)
> - mb();  /* sync */
> + /* order context switch after arbitrary prior code */
> + isync();
> +
> + orig_mm = start_using_temp_mm(patching_mm);
> +
> + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
> + pend = (void *)addr + PAGE_SIZE - offset_in_page(addr);
> + if (end < pend)
> + pend = end;
> +
> + while (addr < pend) {
> + instr = ppc_inst_read(code);
> + ilen = ppc_inst_len(instr);
> + err = __patch_instruction(addr, instr, patch_addr);
> + /* hwsync performed by __patch_instruction (sync) if 
> successful */
> + if (err) {
> + mb();  /* sync */
> + break;
> + }
> +
> + patch_addr = (void *)patch_addr + ilen;
> + addr = (void *)addr + ilen;
> + if (!fill_inst)
> + code = (void *)code + ilen;
> + }
>   
> - /* context