[PATCH powerpc/next 02/17] powerpc/bpf: Emit a single branch instruction for known short branch ranges

2022-02-14 Thread Naveen N. Rao
PPC_BCC() emits two instructions to accommodate scenarios where we need
to branch outside the range of a conditional branch. PPC_BCC_SHORT()
emits a single branch instruction and can be used when the branch is
known to be within a conditional branch range.

Convert some of the uses of PPC_BCC() in the powerpc BPF JIT over to
PPC_BCC_SHORT() where we know the branch range.

Signed-off-by: Naveen N. Rao 
---
 arch/powerpc/net/bpf_jit_comp32.c | 8 
 arch/powerpc/net/bpf_jit_comp64.c | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp32.c 
b/arch/powerpc/net/bpf_jit_comp32.c
index cf8dd8aea386c4..81e0c56661ddf2 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -221,7 +221,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct 
codegen_context *ctx, u32 o
EMIT(PPC_RAW_LWZ(_R0, b2p_bpf_array, offsetof(struct bpf_array, 
map.max_entries)));
EMIT(PPC_RAW_CMPLW(b2p_index, _R0));
EMIT(PPC_RAW_LWZ(_R0, _R1, bpf_jit_stack_offsetof(ctx, BPF_PPC_TC)));
-   PPC_BCC(COND_GE, out);
+   PPC_BCC_SHORT(COND_GE, out);
 
/*
 * if (tail_call_cnt >= MAX_TAIL_CALL_CNT)
@@ -230,7 +230,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct 
codegen_context *ctx, u32 o
EMIT(PPC_RAW_CMPLWI(_R0, MAX_TAIL_CALL_CNT));
/* tail_call_cnt++; */
EMIT(PPC_RAW_ADDIC(_R0, _R0, 1));
-   PPC_BCC(COND_GE, out);
+   PPC_BCC_SHORT(COND_GE, out);
 
/* prog = array->ptrs[index]; */
EMIT(PPC_RAW_RLWINM(_R3, b2p_index, 2, 0, 29));
@@ -243,7 +243,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct 
codegen_context *ctx, u32 o
 *   goto out;
 */
EMIT(PPC_RAW_CMPLWI(_R3, 0));
-   PPC_BCC(COND_EQ, out);
+   PPC_BCC_SHORT(COND_EQ, out);
 
/* goto *(prog->bpf_func + prologue_size); */
EMIT(PPC_RAW_LWZ(_R3, _R3, offsetof(struct bpf_prog, bpf_func)));
@@ -834,7 +834,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
struct codegen_context *
if (BPF_MODE(code) == BPF_PROBE_MEM) {
PPC_LI32(_R0, TASK_SIZE - off);
EMIT(PPC_RAW_CMPLW(src_reg, _R0));
-   PPC_BCC(COND_GT, (ctx->idx + 5) * 4);
+   PPC_BCC_SHORT(COND_GT, (ctx->idx + 4) * 4);
EMIT(PPC_RAW_LI(dst_reg, 0));
/*
 * For BPF_DW case, "li reg_h,0" would be 
needed when
diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index e1e8c934308adb..b1ed8611091d2b 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -225,7 +225,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct 
codegen_context *ctx, u32 o
EMIT(PPC_RAW_LWZ(b2p[TMP_REG_1], b2p_bpf_array, offsetof(struct 
bpf_array, map.max_entries)));
EMIT(PPC_RAW_RLWINM(b2p_index, b2p_index, 0, 0, 31));
EMIT(PPC_RAW_CMPLW(b2p_index, b2p[TMP_REG_1]));
-   PPC_BCC(COND_GE, out);
+   PPC_BCC_SHORT(COND_GE, out);
 
/*
 * if (tail_call_cnt >= MAX_TAIL_CALL_CNT)
@@ -233,7 +233,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct 
codegen_context *ctx, u32 o
 */
PPC_BPF_LL(b2p[TMP_REG_1], 1, bpf_jit_stack_tailcallcnt(ctx));
EMIT(PPC_RAW_CMPLWI(b2p[TMP_REG_1], MAX_TAIL_CALL_CNT));
-   PPC_BCC(COND_GE, out);
+   PPC_BCC_SHORT(COND_GE, out);
 
/*
 * tail_call_cnt++;
@@ -251,7 +251,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct 
codegen_context *ctx, u32 o
 *   goto out;
 */
EMIT(PPC_RAW_CMPLDI(b2p[TMP_REG_1], 0));
-   PPC_BCC(COND_EQ, out);
+   PPC_BCC_SHORT(COND_EQ, out);
 
/* goto *(prog->bpf_func + prologue_size); */
PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_1], offsetof(struct bpf_prog, 
bpf_func));
@@ -807,7 +807,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
struct codegen_context *
else /* BOOK3S_64 */
PPC_LI64(b2p[TMP_REG_2], PAGE_OFFSET);
EMIT(PPC_RAW_CMPLD(b2p[TMP_REG_1], 
b2p[TMP_REG_2]));
-   PPC_BCC(COND_GT, (ctx->idx + 4) * 4);
+   PPC_BCC_SHORT(COND_GT, (ctx->idx + 3) * 4);
EMIT(PPC_RAW_LI(dst_reg, 0));
/*
 * Check if 'off' is word aligned because 
PPC_BPF_LL()
-- 
2.35.1



[PATCH powerpc/next 01/17] powerpc/bpf: Skip branch range validation during first pass

2022-02-14 Thread Naveen N. Rao
During the first pass, addrs[] is still being populated. So, all
branches to following instructions will appear to be going to the start
of the JIT program. Ignore branch range validation for such instructions
and assume those to be in range. Branch range validation will happen
during the second pass after addrs[] is setup properly.

Signed-off-by: Naveen N. Rao 
---
 arch/powerpc/net/bpf_jit.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index b20a2a83a6e75b..9cdd33d6be4cc0 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -27,7 +27,7 @@
 #define PPC_JMP(dest)\
do {  \
long offset = (long)(dest) - (ctx->idx * 4);  \
-   if (!is_offset_in_branch_range(offset)) { \
+   if ((dest) != 0 && !is_offset_in_branch_range(offset)) {
  \
pr_err_ratelimited("Branch offset 0x%lx (@%u) out of 
range\n", offset, ctx->idx);   \
return -ERANGE;   \
} \
@@ -41,7 +41,7 @@
 #define PPC_BCC_SHORT(cond, dest)\
do {  \
long offset = (long)(dest) - (ctx->idx * 4);  \
-   if (!is_offset_in_cond_branch_range(offset)) {\
+   if ((dest) != 0 && !is_offset_in_cond_branch_range(offset)) {   
  \
pr_err_ratelimited("Conditional branch offset 0x%lx 
(@%u) out of range\n", offset, ctx->idx);   \
return -ERANGE;   \
} \
-- 
2.35.1



[PATCH powerpc/next 00/17] powerpc/bpf: Some updates and cleanups

2022-02-14 Thread Naveen N. Rao
This is a follow-up series with the pending patches from:
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=279602&state=*

Patches 1-5 and 8 are unchanged. Patch 6 is new and patch 7 has changes 
based on discussion from the last series. Patch 9 has a small change to 
not skip the toc load for elf v2.

Patches 10-17 are new to this series and are largely some cleanups to 
the bpf code on powerpc.

- Naveen


Jordan Niethe (1):
  powerpc64/bpf: Store temp registers' bpf to ppc mapping

Naveen N. Rao (16):
  powerpc/bpf: Skip branch range validation during first pass
  powerpc/bpf: Emit a single branch instruction for known short branch
ranges
  powerpc/bpf: Handle large branch ranges with BPF_EXIT
  powerpc64/bpf: Do not save/restore LR on each call to
bpf_stf_barrier()
  powerpc64/bpf: Use r12 for constant blinding
  powerpc64: Set PPC64_ELF_ABI_v[1|2] macros to 1
  powerpc64/bpf elfv2: Setup kernel TOC in r2 on entry
  powerpc64/bpf elfv1: Do not load TOC before calling functions
  powerpc64/bpf: Optimize instruction sequence used for function calls
  powerpc/bpf: Rename PPC_BL_ABS() to PPC_BL()
  powerpc64/bpf: Convert some of the uses of PPC_BPF_[LL|STL] to
PPC_BPF_[LD|STD]
  powerpc64/bpf: Get rid of PPC_BPF_[LL|STL|STLU] macros
  powerpc/bpf: Cleanup bpf_jit.h
  powerpc/bpf: Move bpf_jit64.h into bpf_jit_comp64.c
  powerpc/bpf: Use _Rn macros for GPRs
  powerpc/bpf: Simplify bpf_to_ppc() and adopt it for powerpc64

 arch/powerpc/include/asm/types.h  |   4 +-
 arch/powerpc/net/bpf_jit.h|  35 +--
 arch/powerpc/net/bpf_jit64.h  |  91 --
 arch/powerpc/net/bpf_jit_comp.c   |  34 ++-
 arch/powerpc/net/bpf_jit_comp32.c | 113 
 arch/powerpc/net/bpf_jit_comp64.c | 440 --
 6 files changed, 340 insertions(+), 377 deletions(-)
 delete mode 100644 arch/powerpc/net/bpf_jit64.h


base-commit: 1b43a74f255c5c00db25a5fedfd75ca0dc029022
-- 
2.35.1



Re: [PATCH v3 12/12] lkdtm: Add a test for function descriptors protection

2022-02-14 Thread Christophe Leroy


Le 11/02/2022 à 02:09, Kees Cook a écrit :
> On Sun, Oct 17, 2021 at 02:38:25PM +0200, Christophe Leroy wrote:
>> Add WRITE_OPD to check that you can't modify function
>> descriptors.
>>
>> Gives the following result when function descriptors are
>> not protected:
>>
>>  lkdtm: Performing direct entry WRITE_OPD
>>  lkdtm: attempting bad 16 bytes write at c269b358
>>  lkdtm: FAIL: survived bad write
>>  lkdtm: do_nothing was hijacked!
>>
>> Looks like a standard compiler barrier() is not enough to force
>> GCC to use the modified function descriptor. Had to add a fake empty
>> inline assembly to force GCC to reload the function descriptor.
>>
>> Signed-off-by: Christophe Leroy 
>> ---
>>   drivers/misc/lkdtm/core.c  |  1 +
>>   drivers/misc/lkdtm/lkdtm.h |  1 +
>>   drivers/misc/lkdtm/perms.c | 22 ++
>>   3 files changed, 24 insertions(+)
>>
>> diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
>> index fe6fd34b8caf..de092aa03b5d 100644
>> --- a/drivers/misc/lkdtm/core.c
>> +++ b/drivers/misc/lkdtm/core.c
>> @@ -148,6 +148,7 @@ static const struct crashtype crashtypes[] = {
>>  CRASHTYPE(WRITE_RO),
>>  CRASHTYPE(WRITE_RO_AFTER_INIT),
>>  CRASHTYPE(WRITE_KERN),
>> +CRASHTYPE(WRITE_OPD),
>>  CRASHTYPE(REFCOUNT_INC_OVERFLOW),
>>  CRASHTYPE(REFCOUNT_ADD_OVERFLOW),
>>  CRASHTYPE(REFCOUNT_INC_NOT_ZERO_OVERFLOW),
>> diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h
>> index c212a253edde..188bd0fd6575 100644
>> --- a/drivers/misc/lkdtm/lkdtm.h
>> +++ b/drivers/misc/lkdtm/lkdtm.h
>> @@ -105,6 +105,7 @@ void __init lkdtm_perms_init(void);
>>   void lkdtm_WRITE_RO(void);
>>   void lkdtm_WRITE_RO_AFTER_INIT(void);
>>   void lkdtm_WRITE_KERN(void);
>> +void lkdtm_WRITE_OPD(void);
>>   void lkdtm_EXEC_DATA(void);
>>   void lkdtm_EXEC_STACK(void);
>>   void lkdtm_EXEC_KMALLOC(void);
>> diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c
>> index 1cf24c4a79e9..2c6aba3ff32b 100644
>> --- a/drivers/misc/lkdtm/perms.c
>> +++ b/drivers/misc/lkdtm/perms.c
>> @@ -44,6 +44,11 @@ static noinline void do_overwritten(void)
>>  return;
>>   }
>>   
>> +static noinline void do_almost_nothing(void)
>> +{
>> +pr_info("do_nothing was hijacked!\n");
>> +}
>> +
>>   static void *setup_function_descriptor(func_desc_t *fdesc, void *dst)
>>   {
>>  if (!have_function_descriptors())
>> @@ -144,6 +149,23 @@ void lkdtm_WRITE_KERN(void)
>>  do_overwritten();
>>   }
>>   
>> +void lkdtm_WRITE_OPD(void)
>> +{
>> +size_t size = sizeof(func_desc_t);
>> +void (*func)(void) = do_nothing;
>> +
>> +if (!have_function_descriptors()) {
>> +pr_info("XFAIL: Platform doesn't use function descriptors.\n");
>> +return;
>> +}
>> +pr_info("attempting bad %zu bytes write at %px\n", size, do_nothing);
>> +memcpy(do_nothing, do_almost_nothing, size);
>> +pr_err("FAIL: survived bad write\n");
> 
> Non-function-descriptor architectures would successfully crash at the
> memcpy too, right? (i.e. for them this is just repeating WRITE_KERN)

Yes it should. But not for the good reason.

> 
> I'm pondering the utility of the XFAIL vs just letting is succeed, but I
> think it more accurate to say "hey, no OPD" as you have it.
> 
>> +
>> +asm("" : "=m"(func));
>> +func();
>> +}
>> +
>>   void lkdtm_EXEC_DATA(void)
>>   {
>>  execute_location(data_area, CODE_WRITE);
>> -- 
>> 2.31.1
>>
> 
> One tiny suggestion, since I think you need to respin for the
> EXPORT_SYMBOL_GPL() anyway. Please update the selftests too:
> 
> diff --git a/tools/testing/selftests/lkdtm/tests.txt 
> b/tools/testing/selftests/lkdtm/tests.txt
> index 6b36b7f5dcf9..243c781f0780 100644
> --- a/tools/testing/selftests/lkdtm/tests.txt
> +++ b/tools/testing/selftests/lkdtm/tests.txt
> @@ -44,6 +44,7 @@ ACCESS_NULL
>   WRITE_RO
>   WRITE_RO_AFTER_INIT
>   WRITE_KERN
> +WRITE_OPD
>   REFCOUNT_INC_OVERFLOW
>   REFCOUNT_ADD_OVERFLOW
>   REFCOUNT_INC_NOT_ZERO_OVERFLOW
> 
> (Though for the future I've been considering making the selftests an
> opt-out list so the "normal" stuff doesn't need to keep getting added
> there.)
> 
> Thanks!
> 
> Acked-by: Kees Cook 
> 

Done.

Thanks
Christophe

Re: [PATCH v3 08/12] asm-generic: Refactor dereference_[kernel]_function_descriptor()

2022-02-14 Thread Christophe Leroy


Le 11/02/2022 à 01:56, Kees Cook a écrit :
> On Thu, Feb 10, 2022 at 09:30:43PM +1100, Michael Ellerman wrote:
>> Christophe Leroy  writes:
>>> diff --git a/kernel/extable.c b/kernel/extable.c
>>> index b0ea5eb0c3b4..1ef13789bea9 100644
>>> --- a/kernel/extable.c
>>> +++ b/kernel/extable.c
>>> @@ -159,12 +160,32 @@ int kernel_text_address(unsigned long addr)
>>>   }
>>>   
>>>   /*
>>> - * On some architectures (PPC64, IA64) function pointers
>>> + * On some architectures (PPC64, IA64, PARISC) function pointers
>>>* are actually only tokens to some data that then holds the
>>>* real function address. As a result, to find if a function
>>>* pointer is part of the kernel text, we need to do some
>>>* special dereferencing first.
>>>*/
>>> +#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS
>>> +void *dereference_function_descriptor(void *ptr)
>>> +{
>>> +   func_desc_t *desc = ptr;
>>> +   void *p;
>>> +
>>> +   if (!get_kernel_nofault(p, (void *)&desc->addr))
>>> +   ptr = p;
>>> +   return ptr;
>>> +}
>>
>> This needs an EXPORT_SYMBOL_GPL(), otherwise the build breaks after
>> patch 10 with CONFIG_LKDTM=m.
> 
> Oh good catch!
> 
> (There have been a few cases of LKDTM=m being the only thing needed a
> symbol, so I've pondered giving it a namespace or constructing a little
> ifdef wrapper... but this seems ok to export...)
> 

powerpc and ia64 had it as a static inline, but parisc had it as a plain 
function and didn't export it. So I guess the export is not required at 
this point. I will export it in patch 10 when it becomes necessary.

Christophe

Re: [PATCH v3 04/12] powerpc: Prepare func_desc_t for refactorisation

2022-02-14 Thread Christophe Leroy


Le 11/02/2022 à 01:54, Kees Cook a écrit :
> On Sun, Oct 17, 2021 at 02:38:17PM +0200, Christophe Leroy wrote:
>> In preparation of making func_desc_t generic, change the ELFv2
>> version to a struct containing 'addr' element.
>>
>> This allows using single helpers common to ELFv1 and ELFv2.
>>
>> Signed-off-by: Christophe Leroy 
>> ---
>>   arch/powerpc/kernel/module_64.c | 32 ++--
>>   1 file changed, 14 insertions(+), 18 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/module_64.c 
>> b/arch/powerpc/kernel/module_64.c
>> index a89da0ee25e2..b687ef88c4c4 100644
>> --- a/arch/powerpc/kernel/module_64.c
>> +++ b/arch/powerpc/kernel/module_64.c
>> @@ -33,19 +33,13 @@
>>   #ifdef PPC64_ELF_ABI_v2
>>   
>>   /* An address is simply the address of the function. */
>> -typedef unsigned long func_desc_t;
>> +typedef struct {
>> +unsigned long addr;
>> +} func_desc_t;
>>   
>>   static func_desc_t func_desc(unsigned long addr)
>>   {
>> -return addr;
>> -}
>> -static unsigned long func_addr(unsigned long addr)
>> -{
>> -return addr;
>> -}
>> -static unsigned long stub_func_addr(func_desc_t func)
>> -{
>> -return func;
>> +return (func_desc_t){addr};
> 
> There's only 1 element in the struct, so okay, but it hurt my eyes a
> little. I would have been happier with:
> 
>   return (func_desc_t){ .addr = addr; };
> 
> But of course that also looks bonkers because it starts with "return".
> So no matter what I do my eyes bug out. ;)
> 
> So it's fine either way. :)
> 
> Reviewed-by: Kees Cook 

I am going for:

  static func_desc_t func_desc(unsigned long addr)
  {
+   func_desc_t desc = {
+   .addr = addr,
+   };
+
+   return desc;
  }


Thanks
Christophe

Re: Build regressions/improvements in v5.17-rc4

2022-02-14 Thread Geert Uytterhoeven

On Mon, 14 Feb 2022, Geert Uytterhoeven wrote:

JFYI, when comparing v5.17-rc4[1] to v5.17-rc3[3], the summaries are:
 - build errors: +1/-0


  + error: arch/powerpc/kvm/book3s_64_entry.o: relocation truncated to fit: 
R_PPC64_REL14 (stub) against symbol `system_reset_common' defined in .text section 
in arch/powerpc/kernel/head_64.o:  => (.text+0x3ec)

powerpc-gcc5/powerpc-allyesconfig


[1] 
http://kisskb.ellerman.id.au/kisskb/branch/linus/head/754e0b0e35608ed5206d6a67a791563c631cec07/
 (all 99 configs)
[3] 
http://kisskb.ellerman.id.au/kisskb/branch/linus/head/dfd42facf1e4ada021b939b4e19c935dcdd55566/
 (all 99 configs)


Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


Re: BUG: sleeping function called from invalid context at include/linux/sched/mm.h:256

2022-02-14 Thread Christophe Leroy


Le 14/02/2022 à 10:35, Peter Zijlstra a écrit :
> On Sun, Feb 13, 2022 at 12:05:50AM +0100, Paul Menzel wrote:
> 
>> [0.012154][T1] BUG: sleeping function called from invalid context at
> 
>> [0.022443][T1] [c84837d0] [c0961aac] > 
>> dump_stack_lvl+0xa0/0xec (unreliable)
>> [0.023356][T1] [c8483820] [c019b314] > 
>> __might_resched+0x2f4/0x310
>> [0.024174][T1] [c84838b0] [c04c0c70] > 
>> kmem_cache_alloc+0x220/0x4b0
>> [0.025000][T1] [c8483920] [c0448af4] > 
>> __pud_alloc+0x74/0x1d0
>> [0.025772][T1] [c8483970] [c008fe3c] > 
>> hash__map_kernel_page+0x2cc/0x390
>> [0.026643][T1] [c8483a20] [c00a9944] > 
>> do_patch_instruction+0x134/0x4a0
> 
> do_patch_instruction() rightfully disables IRQs, but then it goes and
> tries a memory alloc, which seems a bit daft.
> 
> I'm thinking Christophe might know more... he's recently been poking at
> Power text poking..


I don't know all details about PPC64, but here it seems like 
hash__map_kernel_page() allocates intermediate page directories when 
there is not one yet. So if that's the first time 
hash__map_kernel_page() is called for the text_poke_addr() it allocates 
PUD and PMD if necessary.

As it is kernel memory, once PUD and PMD are allocated they will remain 
forever I think. So maybe the only thing to do is to perform a dummy 
mapping/unmapping in text_area_cpu_up() to ensure the page directories are 
created in advance.

Christophe

Re: No Linux logs when doing `ppc64_cpu --smt=off/8`

2022-02-14 Thread Michal Suchánek
Hello,

On Mon, Feb 14, 2022 at 07:08:07AM +0100, Paul Menzel wrote:
> Dear PPC folks,
> 
> 
> On the POWER8 server IBM S822LC running `ppc64_cpu --smt=off` or `ppc64_cpu
> --smt=8`, Linux 5.17-rc4 does not log anything. I would have expected a
> message about the change in number of processing units.

IIRC it was considered too noisy for systems with many CPUs and the
message was dropped. You can always check the resulting state with
ppc64_cpu or examining sysfs.

Thanks

Michal


Re: BUG: sleeping function called from invalid context at include/linux/sched/mm.h:256

2022-02-14 Thread Peter Zijlstra
On Sun, Feb 13, 2022 at 12:05:50AM +0100, Paul Menzel wrote:

> [0.012154][T1] BUG: sleeping function called from invalid context at

> [0.022443][T1] [c84837d0] [c0961aac] > 
> dump_stack_lvl+0xa0/0xec (unreliable)
> [0.023356][T1] [c8483820] [c019b314] > 
> __might_resched+0x2f4/0x310
> [0.024174][T1] [c84838b0] [c04c0c70] > 
> kmem_cache_alloc+0x220/0x4b0
> [0.025000][T1] [c8483920] [c0448af4] > 
> __pud_alloc+0x74/0x1d0
> [0.025772][T1] [c8483970] [c008fe3c] > 
> hash__map_kernel_page+0x2cc/0x390
> [0.026643][T1] [c8483a20] [c00a9944] > 
> do_patch_instruction+0x134/0x4a0

do_patch_instruction() rightfully disables IRQs, but then it goes and
tries a memory alloc, which seems a bit daft.

I'm thinking Christophe might know more... he's recently been poking at
Power text poking..


Re: [PATCH] net: Remove branch in csum_shift()

2022-02-14 Thread Segher Boessenkool
On Sun, Feb 13, 2022 at 05:47:52PM +, David Laight wrote:
> From: Segher Boessenkool 
> > Sent: 13 February 2022 09:16
> > In an ideal world the compiler could choose the optimal code sequences
> > everywhere.  But that won't ever happen, the search space is way too
> > big.  So compilers just use heuristics, not exhaustive search like
> > superopt does.  There is a middle way of course, something with directed
> > searches, and maybe in a few decades systems will be fast enough.  Until
> > then we will very often see code that is 10% slower and 30% bigger than
> > necessary.  A single insn more than needed isn't so bad :-)
> 
> But it can be a lot more than that.

Obviously, but that isn't the case here (for powerpc anyway).  My point
here is that you won't ever get ideal generated code from your high-
level code (which is what C is), certainly not for all architectures.
But it *is* possible to get something reasonably good.

> > Making things branch-free is very much worth it here though!
> 
> I tried to find out where 'here' is.

I meant "with this patch".

Unpredictable branches are very expensive.  They already were something
to worry about on single-issue in-order processors, but they are much
more expensive now.

> I can't get godbolt to generate anything like that object code
> for a call to csum_shift().
> 
> I can't actually get it to issue a rotate (x86 or ppc).

All powerpc rotate insns start with "rl", and no other insns do.  There
also are extended mnemonics to ease programming, like "rotlw", which is
just a form of rlwinm (rotlw d,s,n is rlwnm d,s,n,0,31).

Depending on what tool you use to display binary code it will show you
extended mnemonics for some insns or just the basic insns.

> I think it is only a single instruction because the compiler
> has saved 'offset & 1' much earlier instead of doing testing
> 'offset & 1' just prior to the conditional.

rlwinm -- "nm" means "and mask".  rlwnm d,s,n,mb,me rotates register s
left by the contents of register n bits, and logical ands it with the
mask from bit mb until bit me.

> It certainly has a nasty habit of doing that pessimisation.

?  Not sure what you mean here.

> I also suspect that the addc/addze pair could be removed
> by passing the old checksum into csum_partial.

Maybe?  Does it not have to return a reduced result here?


Segher


Re: [PATCH v5 3/6] mm: make alloc_contig_range work at pageblock granularity

2022-02-14 Thread Christophe Leroy


Le 11/02/2022 à 17:41, Zi Yan a écrit :
> From: Zi Yan 
> 
> alloc_contig_range() worked at MAX_ORDER-1 granularity to avoid merging
> pageblocks with different migratetypes. It might unnecessarily convert
> extra pageblocks at the beginning and at the end of the range. Change
> alloc_contig_range() to work at pageblock granularity.
> 
> Special handling is needed for free pages and in-use pages across the
> boundaries of the range specified alloc_contig_range(). Because these
> partially isolated pages causes free page accounting issues. The free
> pages will be split and freed into separate migratetype lists; the
> in-use pages will be migrated then the freed pages will be handled.
> 
> Signed-off-by: Zi Yan 
> ---
>   include/linux/page-isolation.h |   2 +-
>   mm/internal.h  |   3 +
>   mm/memory_hotplug.c|   3 +-
>   mm/page_alloc.c| 235 +
>   mm/page_isolation.c|  33 -
>   5 files changed, 211 insertions(+), 65 deletions(-)
> 
> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
> index 4ef7be6def83..78ff940cc169 100644
> --- a/include/linux/page-isolation.h
> +++ b/include/linux/page-isolation.h
> @@ -54,7 +54,7 @@ int move_freepages_block(struct zone *zone, struct page 
> *page,
>*/
>   int
>   start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
> -  unsigned migratetype, int flags);
> +  unsigned migratetype, int flags, gfp_t gfp_flags);
>   
>   /*
>* Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE.
> diff --git a/mm/internal.h b/mm/internal.h
> index 0d240e876831..509cbdc25992 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -319,6 +319,9 @@ isolate_freepages_range(struct compact_control *cc,
>   int
>   isolate_migratepages_range(struct compact_control *cc,
>  unsigned long low_pfn, unsigned long end_pfn);
> +
> +int
> +isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, int 
> isolate_before_boundary);
>   #endif
>   int find_suitable_fallback(struct free_area *area, unsigned int order,
>   int migratetype, bool only_stealable, bool *can_steal);
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index ce68098832aa..82406d2f3e46 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1863,7 +1863,8 @@ int __ref offline_pages(unsigned long start_pfn, 
> unsigned long nr_pages,
>   /* set above range as isolated */
>   ret = start_isolate_page_range(start_pfn, end_pfn,
>  MIGRATE_MOVABLE,
> -MEMORY_OFFLINE | REPORT_FAILURE);
> +MEMORY_OFFLINE | REPORT_FAILURE,
> +GFP_USER | __GFP_MOVABLE | 
> __GFP_RETRY_MAYFAIL);
>   if (ret) {
>   reason = "failure to isolate range";
>   goto failed_removal_pcplists_disabled;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 62ef78f3d771..7a4fa21aea5c 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -8985,7 +8985,7 @@ static inline void alloc_contig_dump_pages(struct 
> list_head *page_list)
>   #endif
>   
>   /* [start, end) must belong to a single zone. */
> -static int __alloc_contig_migrate_range(struct compact_control *cc,
> +int __alloc_contig_migrate_range(struct compact_control *cc,
>   unsigned long start, unsigned long end)
>   {
>   /* This function is based on compact_zone() from compaction.c. */
> @@ -9043,6 +9043,167 @@ static int __alloc_contig_migrate_range(struct 
> compact_control *cc,
>   return 0;
>   }
>   
> +/**
> + * split_free_page() -- split a free page at split_pfn_offset
> + * @free_page:   the original free page
> + * @order:   the order of the page
> + * @split_pfn_offset:split offset within the page
> + *
> + * It is used when the free page crosses two pageblocks with different 
> migratetypes
> + * at split_pfn_offset within the page. The split free page will be put into
> + * separate migratetype lists afterwards. Otherwise, the function achieves
> + * nothing.
> + */
> +static inline void split_free_page(struct page *free_page,
> + int order, unsigned long split_pfn_offset)
> +{
> + struct zone *zone = page_zone(free_page);
> + unsigned long free_page_pfn = page_to_pfn(free_page);
> + unsigned long pfn;
> + unsigned long flags;
> + int free_page_order;
> +
> + spin_lock_irqsave(&zone->lock, flags);
> + del_page_from_free_list(free_page, zone, order);
> + for (pfn = free_page_pfn;
> +  pfn < free_page_pfn + (1UL << order);) {
> + int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
> +
> + free_page_order = order_base_2(split_pfn_offset);
> + __free_one_page(pfn_to_page(pfn), pfn, zone, 

<    1   2