Re: [PATCH v2 1/4] powerpc/code-patching: introduce patch_instructions()
Le 09/03/2023 à 19:02, Hari Bathini a écrit : > patch_instruction() entails setting up pte, patching the instruction, > clearing the pte and flushing the tlb. If multiple instructions need > to be patched, every instruction would have to go through the above > drill unnecessarily. Instead, introduce function patch_instructions() > that patches multiple instructions at one go while setting up the pte, > clearing the pte and flushing the tlb only once per page range of > instructions. Observed ~5X improvement in speed of execution using > patch_instructions() over patch_instructions(), when more instructions > are to be patched. > > Signed-off-by: Hari Bathini > --- > arch/powerpc/include/asm/code-patching.h | 1 + > arch/powerpc/lib/code-patching.c | 151 --- > 2 files changed, 106 insertions(+), 46 deletions(-) > > diff --git a/arch/powerpc/include/asm/code-patching.h > b/arch/powerpc/include/asm/code-patching.h > index 3f881548fb61..059fc4fe700e 100644 > --- a/arch/powerpc/include/asm/code-patching.h > +++ b/arch/powerpc/include/asm/code-patching.h > @@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr, > int patch_branch(u32 *addr, unsigned long target, int flags); > int patch_instruction(u32 *addr, ppc_inst_t instr); > int raw_patch_instruction(u32 *addr, ppc_inst_t instr); > +int patch_instructions(u32 *addr, u32 *code, bool fill_inst, size_t len); > > static inline unsigned long patch_site_addr(s32 *site) > { > diff --git a/arch/powerpc/lib/code-patching.c > b/arch/powerpc/lib/code-patching.c > index b00112d7ad46..33857b9b53de 100644 > --- a/arch/powerpc/lib/code-patching.c > +++ b/arch/powerpc/lib/code-patching.c > @@ -278,77 +278,117 @@ static void unmap_patch_area(unsigned long addr) > flush_tlb_kernel_range(addr, addr + PAGE_SIZE); > } > > -static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr) > +static int __do_patch_instructions_mm(u32 *addr, u32 *code, bool fill_inst, > size_t len) Some time ago we did a huge work to 
clean up use of u32 as code versus ppc_inst_t. Please carefully audit all places where you use u32 instead of ppc_inst_t. If you do so, no longer use the word 'instruction' in the function name. > { > - int err; > - u32 *patch_addr; > - unsigned long text_poke_addr; > - pte_t *pte; > - unsigned long pfn = get_patch_pfn(addr); > - struct mm_struct *patching_mm; > - struct mm_struct *orig_mm; > + struct mm_struct *patching_mm, *orig_mm; This change is cosmetic and not functionally linked to the patch. > + unsigned long text_poke_addr, pfn; > + u32 *patch_addr, *end, *pend; > + ppc_inst_t instr; > spinlock_t *ptl; > + int ilen, err; > + pte_t *pte; Why move this declaration ? > > patching_mm = __this_cpu_read(cpu_patching_context.mm); > text_poke_addr = __this_cpu_read(cpu_patching_context.addr); > - patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); > > pte = get_locked_pte(patching_mm, text_poke_addr, ); > if (!pte) > return -ENOMEM; > > - __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, > PAGE_KERNEL), 0); > + end = (void *)addr + len; > + do { > + pfn = get_patch_pfn(addr); > + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, > PAGE_KERNEL), 0); > > - /* order PTE update before use, also serves as the hwsync */ > - asm volatile("ptesync": : :"memory"); > - > - /* order context switch after arbitrary prior code */ > - isync(); > - > - orig_mm = start_using_temp_mm(patching_mm); > - > - err = __patch_instruction(addr, instr, patch_addr); > + /* order PTE update before use, also serves as the hwsync */ > + asm volatile("ptesync": : :"memory"); > > - /* hwsync performed by __patch_instruction (sync) if successful */ > - if (err) > - mb(); /* sync */ > + /* order context switch after arbitrary prior code */ > + isync(); > + > + orig_mm = start_using_temp_mm(patching_mm); > + > + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); > + pend = (void *)addr + PAGE_SIZE - offset_in_page(addr); > + if (end < pend) > + pend = end; > 
+ > + while (addr < pend) { > + instr = ppc_inst_read(code); > + ilen = ppc_inst_len(instr); > + err = __patch_instruction(addr, instr, patch_addr); > + /* hwsync performed by __patch_instruction (sync) if > successful */ As you aim at optimising the loops, you should move the cache flushes outside this loop, with flush_dcache_range() and invalidate_dcache_range(). > + if (err) { > + mb(); /* sync */ > + break; > +
Re: [PATCH v2 1/4] powerpc/code-patching: introduce patch_instructions()
Le 10/03/2023 à 19:26, Christophe Leroy a écrit : > > > Le 09/03/2023 à 19:02, Hari Bathini a écrit : >> patch_instruction() entails setting up pte, patching the instruction, >> clearing the pte and flushing the tlb. If multiple instructions need >> to be patched, every instruction would have to go through the above >> drill unnecessarily. Instead, introduce function patch_instructions() >> that patches multiple instructions at one go while setting up the pte, >> clearing the pte and flushing the tlb only once per page range of >> instructions. Observed ~5X improvement in speed of execution using >> patch_instructions() over patch_instructions(), when more instructions >> are to be patched. > > I get a 13% degradation on the time needed to activate ftrace on a > powerpc 8xx. > > Before your patch, activation ftrace takes 550k timebase ticks. After > your patch it takes 620k timebase ticks. > More details about the problem: Before your patch, patch_instruction() is a simple, stackless function (Note that the first branch is noped out after startup). 
0254 : 254: 48 00 00 6c b 2c0 258: 7c e0 00 a6 mfmsr r7 25c: 7c 51 13 a6 mtspr 81,r2 260: 3d 40 00 00 lis r10,0 262: R_PPC_ADDR16_HA.data 264: 39 4a 00 00 addir10,r10,0 266: R_PPC_ADDR16_LO.data 268: 7c 69 1b 78 mr r9,r3 26c: 3d 29 40 00 addis r9,r9,16384 270: 81 0a 00 08 lwz r8,8(r10) 274: 55 29 00 26 rlwinm r9,r9,0,0,19 278: 81 4a 00 04 lwz r10,4(r10) 27c: 61 29 01 25 ori r9,r9,293 280: 91 28 00 00 stw r9,0(r8) 284: 55 49 00 26 rlwinm r9,r10,0,0,19 288: 50 6a 05 3e rlwimi r10,r3,0,20,31 28c: 90 8a 00 00 stw r4,0(r10) 290: 7c 00 50 6c dcbst 0,r10 294: 7c 00 04 ac hwsync 298: 7c 00 1f ac icbi0,r3 29c: 7c 00 04 ac hwsync 2a0: 4c 00 01 2c isync 2a4: 38 60 00 00 li r3,0 2a8: 39 40 00 00 li r10,0 2ac: 91 48 00 00 stw r10,0(r8) 2b0: 7c 00 4a 64 tlbie r9,r0 2b4: 7c 00 04 ac hwsync 2b8: 7c e0 01 24 mtmsr r7 2bc: 4e 80 00 20 blr 2c0: 90 83 00 00 stw r4,0(r3) 2c4: 7c 00 18 6c dcbst 0,r3 2c8: 7c 00 04 ac hwsync 2cc: 7c 00 1f ac icbi0,r3 2d0: 7c 00 04 ac hwsync 2d4: 4c 00 01 2c isync 2d8: 38 60 00 00 li r3,0 2dc: 4e 80 00 20 blr 2e0: 38 60 ff ff li r3,-1 2e4: 4b ff ff c4 b 2a8 2e8: 38 60 ff ff li r3,-1 2ec: 4e 80 00 20 blr Once your patch is there, patch_instruction() becomes a function that has to step up a stack in order to call __do_patch_instructions(). And __do_patch_instructions() is quite a big function. 
022c <__do_patch_instructions>: 22c: 3d 20 00 00 lis r9,0 22e: R_PPC_ADDR16_HA.data 230: 39 29 00 00 addir9,r9,0 232: R_PPC_ADDR16_LO.data 234: 81 69 00 04 lwz r11,4(r9) 238: 2c 05 00 00 cmpwi r5,0 23c: 81 89 00 08 lwz r12,8(r9) 240: 7c c3 32 14 add r6,r3,r6 244: 55 6b 00 26 rlwinm r11,r11,0,0,19 248: 38 00 00 00 li r0,0 24c: 54 6a 05 3e clrlwi r10,r3,20 250: 21 0a 10 00 subfic r8,r10,4096 254: 7d 03 42 14 add r8,r3,r8 258: 7c 69 1b 78 mr r9,r3 25c: 7f 88 30 40 cmplw cr7,r8,r6 260: 3d 29 40 00 addis r9,r9,16384 264: 55 29 00 26 rlwinm r9,r9,0,0,19 268: 61 29 01 25 ori r9,r9,293 26c: 91 2c 00 00 stw r9,0(r12) 270: 7d 4a 5b 78 or r10,r10,r11 274: 40 9d 00 08 ble cr7,27c <__do_patch_instructions+0x50> 278: 7c c8 33 78 mr r8,r6 27c: 7f 83 40 40 cmplw cr7,r3,r8 280: 40 9c 01 2c bge cr7,3ac <__do_patch_instructions+0x180> 284: 7c 69 18 f8 not r9,r3 288: 7d 28 4a 14 add r9,r8,r9 28c: 55 29 f7 fe rlwinm r9,r9,30,31,31 290: 7c e3 50 50 subfr7,r3,r10 294: 80 a4 00 00 lwz r5,0(r4) 298: 90 aa 00 00 stw r5,0(r10) 29c: 7c 00 50 6c dcbst 0,r10 2a0: 7c 00 04 ac hwsync 2a4: 7c 00 1f ac icbi0,r3 2a8: 7c 00 04 ac hwsync 2ac: 4c 00 01 2c isync 2b0: 38 63 00 04 addir3,r3,4 2b4: 40 82 00 08 bne 2bc <__do_patch_instructions+0x90> 2b8: 38 84 00 04 addir4,r4,4 2bc: 7f 83 40 40 cmplw cr7,r3,r8 2c0: 40 9c 00 a4 bge cr7,364 <__do_patch_instructions+0x138> 2c4: 2f 89 00 00 cmpwi cr7,r9,0 2c8: 41 9e 00 38 beq cr7,300 <__do_patch_instructions+0xd4> 2cc: 7d 23 3a 14 add r9,r3,r7 2d0: 81 44 00 00 lwz r10,0(r4) 2d4: 91 49 00 00 stw r10,0(r9) 2d8: 7c 00 48 6c dcbst 0,r9 2dc: 7c 00 04 ac hwsync 2e0: 7c 00 1f ac icbi0,r3 2e4: 7c 00 04 ac
Re: [PATCH v2 1/4] powerpc/code-patching: introduce patch_instructions()
Le 09/03/2023 à 19:02, Hari Bathini a écrit : > patch_instruction() entails setting up pte, patching the instruction, > clearing the pte and flushing the tlb. If multiple instructions need > to be patched, every instruction would have to go through the above > drill unnecessarily. Instead, introduce function patch_instructions() > that patches multiple instructions at one go while setting up the pte, > clearing the pte and flushing the tlb only once per page range of > instructions. Observed ~5X improvement in speed of execution using > patch_instructions() over patch_instructions(), when more instructions > are to be patched. I get a 13% degradation on the time needed to activate ftrace on a powerpc 8xx. Before your patch, activation ftrace takes 550k timebase ticks. After your patch it takes 620k timebase ticks. Christophe > > Signed-off-by: Hari Bathini > --- > arch/powerpc/include/asm/code-patching.h | 1 + > arch/powerpc/lib/code-patching.c | 151 --- > 2 files changed, 106 insertions(+), 46 deletions(-) > > diff --git a/arch/powerpc/include/asm/code-patching.h > b/arch/powerpc/include/asm/code-patching.h > index 3f881548fb61..059fc4fe700e 100644 > --- a/arch/powerpc/include/asm/code-patching.h > +++ b/arch/powerpc/include/asm/code-patching.h > @@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr, > int patch_branch(u32 *addr, unsigned long target, int flags); > int patch_instruction(u32 *addr, ppc_inst_t instr); > int raw_patch_instruction(u32 *addr, ppc_inst_t instr); > +int patch_instructions(u32 *addr, u32 *code, bool fill_inst, size_t len); > > static inline unsigned long patch_site_addr(s32 *site) > { > diff --git a/arch/powerpc/lib/code-patching.c > b/arch/powerpc/lib/code-patching.c > index b00112d7ad46..33857b9b53de 100644 > --- a/arch/powerpc/lib/code-patching.c > +++ b/arch/powerpc/lib/code-patching.c > @@ -278,77 +278,117 @@ static void unmap_patch_area(unsigned long addr) > flush_tlb_kernel_range(addr, addr + PAGE_SIZE); > } > 
> -static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr) > +static int __do_patch_instructions_mm(u32 *addr, u32 *code, bool fill_inst, > size_t len) > { > - int err; > - u32 *patch_addr; > - unsigned long text_poke_addr; > - pte_t *pte; > - unsigned long pfn = get_patch_pfn(addr); > - struct mm_struct *patching_mm; > - struct mm_struct *orig_mm; > + struct mm_struct *patching_mm, *orig_mm; > + unsigned long text_poke_addr, pfn; > + u32 *patch_addr, *end, *pend; > + ppc_inst_t instr; > spinlock_t *ptl; > + int ilen, err; > + pte_t *pte; > > patching_mm = __this_cpu_read(cpu_patching_context.mm); > text_poke_addr = __this_cpu_read(cpu_patching_context.addr); > - patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); > > pte = get_locked_pte(patching_mm, text_poke_addr, ); > if (!pte) > return -ENOMEM; > > - __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, > PAGE_KERNEL), 0); > + end = (void *)addr + len; > + do { > + pfn = get_patch_pfn(addr); > + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, > PAGE_KERNEL), 0); > > - /* order PTE update before use, also serves as the hwsync */ > - asm volatile("ptesync": : :"memory"); > - > - /* order context switch after arbitrary prior code */ > - isync(); > - > - orig_mm = start_using_temp_mm(patching_mm); > - > - err = __patch_instruction(addr, instr, patch_addr); > + /* order PTE update before use, also serves as the hwsync */ > + asm volatile("ptesync": : :"memory"); > > - /* hwsync performed by __patch_instruction (sync) if successful */ > - if (err) > - mb(); /* sync */ > + /* order context switch after arbitrary prior code */ > + isync(); > + > + orig_mm = start_using_temp_mm(patching_mm); > + > + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); > + pend = (void *)addr + PAGE_SIZE - offset_in_page(addr); > + if (end < pend) > + pend = end; > + > + while (addr < pend) { > + instr = ppc_inst_read(code); > + ilen = ppc_inst_len(instr); > + err = __patch_instruction(addr, 
instr, patch_addr); > + /* hwsync performed by __patch_instruction (sync) if > successful */ > + if (err) { > + mb(); /* sync */ > + break; > + } > + > + patch_addr = (void *)patch_addr + ilen; > + addr = (void *)addr + ilen; > + if (!fill_inst) > + code = (void *)code + ilen; > + } > > - /* context