[PATCH] powerpc/ftrace: Create a dummy stackframe to fix stack unwind

2023-06-20 Thread Naveen N Rao
With ppc64 -mprofile-kernel and ppc32 -pg, profiling instructions to
call into ftrace are emitted right at function entry. The instruction
sequence used is minimal to reduce overhead. Crucially, a stackframe is
not created for the function being traced. This breaks stack unwinding
since the function being traced does not have a stackframe for itself.
As such, it never shows up in the backtrace:

/sys/kernel/debug/tracing # echo 1 > /proc/sys/kernel/stack_tracer_enabled
/sys/kernel/debug/tracing # cat stack_trace
Depth    Size   Location    (17 entries)
-----    ----   --------
  0) 4144  32   ftrace_call+0x4/0x44
  1) 4112 432   get_page_from_freelist+0x26c/0x1ad0
  2) 3680 496   __alloc_pages+0x290/0x1280
  3) 3184 336   __folio_alloc+0x34/0x90
  4) 2848 176   vma_alloc_folio+0xd8/0x540
  5) 2672 272   __handle_mm_fault+0x700/0x1cc0
  6) 2400 208   handle_mm_fault+0xf0/0x3f0
  7) 2192  80   ___do_page_fault+0x3e4/0xbe0
  8) 2112 160   do_page_fault+0x30/0xc0
  9) 1952 256   data_access_common_virt+0x210/0x220
 10) 1696 400   0xcf16b100
 11) 1296 384   load_elf_binary+0x804/0x1b80
 12)  912 208   bprm_execve+0x2d8/0x7e0
 13)  704  64   do_execveat_common+0x1d0/0x2f0
 14)  640 160   sys_execve+0x54/0x70
 15)  480  64   system_call_exception+0x138/0x350
 16)  416 416   system_call_common+0x160/0x2c4

Fix this by having ftrace create a dummy stackframe for the function
being traced. With this, backtraces now capture the function being
traced:

/sys/kernel/debug/tracing # cat stack_trace
Depth    Size   Location    (17 entries)
-----    ----   --------
  0) 3888  32   _raw_spin_trylock+0x8/0x70
  1) 3856 576   get_page_from_freelist+0x26c/0x1ad0
  2) 3280  64   __alloc_pages+0x290/0x1280
  3) 3216 336   __folio_alloc+0x34/0x90
  4) 2880 176   vma_alloc_folio+0xd8/0x540
  5) 2704 416   __handle_mm_fault+0x700/0x1cc0
  6) 2288  96   handle_mm_fault+0xf0/0x3f0
  7) 2192  48   ___do_page_fault+0x3e4/0xbe0
  8) 2144 192   do_page_fault+0x30/0xc0
  9) 1952 608   data_access_common_virt+0x210/0x220
 10) 1344  16   0xc000334bbb50
 11) 1328 416   load_elf_binary+0x804/0x1b80
 12)  912  64   bprm_execve+0x2d8/0x7e0
 13)  848 176   do_execveat_common+0x1d0/0x2f0
 14)  672 192   sys_execve+0x54/0x70
 15)  480  64   system_call_exception+0x138/0x350
 16)  416 416   system_call_common+0x160/0x2c4

This results in two additional stores in the ftrace entry code, but
produces reliable backtraces.
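
For reference, a simplified sketch of the back-chain walk the unwinder
performs (modeled loosely on how powerpc's show_stack() behaves; the
64-bit LR-save offset and the lack of frame validation are assumptions
made for brevity) shows why a function without its own frame never
contributes an entry:

/* Hedged sketch: each stack frame supplies exactly one saved LR, so a
 * traced function that never creates a frame never shows up. Offsets
 * assume the 64-bit ABI: back chain at frame[0], LR save at frame[2]. */
static void dump_backchain(unsigned long sp)
{
	while (sp) {
		unsigned long *frame = (unsigned long *)sp;

		pr_info("return address: %pS\n", (void *)frame[2]);
		sp = frame[0];		/* follow the back chain */
	}
}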

Fixes: 153086644fd1 ("powerpc/ftrace: Add support for -mprofile-kernel ftrace ABI")
Cc: sta...@vger.kernel.org
Signed-off-by: Naveen N Rao 
---
Per Nick's suggestion, I'm posting a minimal fix separately to make this 
easier to backport.

- Naveen


 arch/powerpc/kernel/trace/ftrace_mprofile.S | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/trace/ftrace_mprofile.S b/arch/powerpc/kernel/trace/ftrace_mprofile.S
index ffb1db38684998..1f7d86de1538e2 100644
--- a/arch/powerpc/kernel/trace/ftrace_mprofile.S
+++ b/arch/powerpc/kernel/trace/ftrace_mprofile.S
@@ -33,6 +33,9 @@
  * and then arrange for the ftrace function to be called.
  */
 .macro ftrace_regs_entry allregs
+   /* Create a minimal stack frame for representing B */
+	PPC_STLU	r1, -STACK_FRAME_MIN_SIZE(r1)
+
/* Create our stack frame + pt_regs */
	PPC_STLU	r1, -SWITCH_FRAME_SIZE(r1)
 
@@ -42,7 +45,7 @@
 
 #ifdef CONFIG_PPC64
/* Save the original return address in A's stack frame */
-   std r0, LRSAVE+SWITCH_FRAME_SIZE(r1)
+   std r0, LRSAVE+SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE(r1)
/* Ok to continue? */
lbz r3, PACA_FTRACE_ENABLED(r13)
cmpdi   r3, 0
@@ -77,6 +80,8 @@
	mflr	r7
/* Save it as pt_regs->nip */
PPC_STL r7, _NIP(r1)
+   /* Also save it in B's stackframe header for proper unwind */
+   PPC_STL r7, LRSAVE+SWITCH_FRAME_SIZE(r1)
/* Save the read LR in pt_regs->link */
PPC_STL r0, _LINK(r1)
 
@@ -142,7 +147,7 @@
 #endif
 
/* Pop our stack frame */
-   addi r1, r1, SWITCH_FRAME_SIZE
+   addi r1, r1, SWITCH_FRAME_SIZE+STACK_FRAME_MIN_SIZE
 
 #ifdef CONFIG_LIVEPATCH_64
 /* Based on the cmpd above, if the NIP was altered handle livepatch */

base-commit: 12ffddc6444780aec83fa5086673ec005c0bace4
-- 
2.41.0



Re: [PATCH 02/16] powerpc/book3s64/mm: mmu_vmemmap_psize is used by radix

2023-06-20 Thread Michael Ellerman
"Aneesh Kumar K.V"  writes:
> This should not be within CONFIG_PPC_64S_HASH_MMU. We use mmu_vmemmap_psize
> on radix while mapping the vmemmap area.
>
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/mm/book3s64/radix_pgtable.c | 2 --
>  1 file changed, 2 deletions(-)

This breaks microwatt_defconfig, which does not enable CONFIG_PPC_64S_HASH_MMU:

  ../arch/powerpc/mm/book3s64/radix_pgtable.c: In function ‘radix__early_init_mmu’:
  ../arch/powerpc/mm/book3s64/radix_pgtable.c:601:27: error: lvalue required as left operand of assignment
    601 |         mmu_virtual_psize = MMU_PAGE_4K;
        |                           ^
  make[5]: *** [../scripts/Makefile.build:252: arch/powerpc/mm/book3s64/radix_pgtable.o] Error 1
  make[4]: *** [../scripts/Makefile.build:494: arch/powerpc/mm/book3s64] Error 2
  make[3]: *** [../scripts/Makefile.build:494: arch/powerpc/mm] Error 2
  make[2]: *** [../scripts/Makefile.build:494: arch/powerpc] Error 2
  make[2]: *** Waiting for unfinished jobs
  make[1]: *** [/home/michael/linux/Makefile:2026: .] Error 2
  make: *** [Makefile:226: __sub-make] Error 2

Because mmu_virtual_psize is defined in hash_utils.c, which isn't built.
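
A minimal sketch of why the assignment stops compiling, assuming that
with CONFIG_PPC_64S_HASH_MMU=n the headers fall back to defining
mmu_virtual_psize as a constant (the exact header layout here is an
assumption, not a quote of the powerpc code):

/* Hedged sketch, not the real powerpc headers: without the hash MMU
 * code, mmu_virtual_psize is presumably a constant macro rather than
 * the int defined in hash_utils.c, so it is not an lvalue. */
#ifdef CONFIG_PPC_64S_HASH_MMU
extern int mmu_virtual_psize;			/* defined in hash_utils.c */
#else
#define mmu_virtual_psize	MMU_PAGE_4K	/* constant: cannot be assigned */
#endif

static void radix_init_example(void)
{
	mmu_virtual_psize = MMU_PAGE_4K;	/* "lvalue required" when the #define is used */
}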

cheers


Re: [6.4.0-rc7-next-20230620] Boot failure on IBM Power LPAR

2023-06-20 Thread Michael Ellerman
Sachin Sant  writes:
> 6.4.0-rc7-next-20230620 fails to boot on IBM Power LPAR with following
>
> [ 5.548368] BUG: Unable to handle kernel data access at 0x95bdcf954bc34e73
> [ 5.548380] Faulting instruction address: 0xc0548090
> [ 5.548384] Oops: Kernel access of bad area, sig: 11 [#1]
> [ 5.548387] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> [ 5.548391] Modules linked in: nf_tables(E) nfnetlink(E) sunrpc(E) 
> binfmt_misc(E) pseries_rng(E) aes_gcm_p10_crypto(E) drm(E) 
> drm_panel_orientation_quirks(E) xfs(E) libcrc32c(E) sd_mod(E) sr_mod(E) 
> t10_pi(E) crc64_rocksoft_generic(E) cdrom(E) crc64_rocksoft(E) crc64(E) sg(E) 
> ibmvscsi(E) scsi_transport_srp(E) ibmveth(E) vmx_crypto(E) fuse(E)
> [ 5.548413] CPU: 1 PID: 789 Comm: systemd-udevd Tainted: G E 
> 6.4.0-rc7-next-20230620 #1
> [ 5.548417] Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf06 
> of:IBM,FW1030.20 (NH1030_058) hv:phyp pSeries
> [ 5.548421] NIP: c0548090 LR: c0547fbc CTR: c04206f0
> [ 5.548424] REGS: c000afb536f0 TRAP: 0380 Tainted: G E 
> (6.4.0-rc7-next-20230620)
> [ 5.548427] MSR: 8280b033  CR: 
> 88028202 XER: 2004
> [ 5.548436] CFAR: c0547fc4 IRQMASK: 0 
> [ 5.548436] GPR00: c0547fbc c000afb53990 c14b1600 
>  
> [ 5.548436] GPR04: 0cc0 34d8 0e6f 
> ed5e02cab43c21e0 
> [ 5.548436] GPR08: 0e6e 0058 001356ea 
> 2000 
> [ 5.548436] GPR12: c04206f0 c013f300  
>  
> [ 5.548436] GPR16:    
> c00092f43708 
> [ 5.548436] GPR20: c00092f436b0  fff7dfff 
> c000afa8 
> [ 5.548436] GPR24: c2b87aa0 00b8 c0159914 
> 0cc0 
> [ 5.548436] GPR28: 95bdcf954bc34e1b ca1fafc0  
> c3019800 
> [ 5.548473] NIP [c0548090] kmem_cache_alloc+0x1a0/0x420
> [ 5.548480] LR [c0547fbc] kmem_cache_alloc+0xcc/0x420
> [ 5.548485] Call Trace:
> [ 5.548487] [c000afb53990] [c0547fbc] kmem_cache_alloc+0xcc/0x420 
> (unreliable)
> [ 5.548493] [c000afb53a00] [c0159914] vm_area_dup+0x44/0xf0
> [ 5.548499] [c000afb53a40] [c015a638] dup_mmap+0x298/0x8b0
> [ 5.548504] [c000afb53bb0] [c015acd0] 
> dup_mm.constprop.0+0x80/0x180
> [ 5.548509] [c000afb53bf0] [c015bdc0] copy_process+0xc00/0x1510
> [ 5.548514] [c000afb53cb0] [c015c848] kernel_clone+0xb8/0x5a0
> [ 5.548519] [c000afb53d30] [c015ceb8] __do_sys_clone+0x88/0xd0
> [ 5.548524] [c000afb53e10] [c0033bcc] 
> system_call_exception+0x13c/0x340
> [ 5.548529] [c000afb53e50] [c000d05c] 
> system_call_vectored_common+0x15c/0x2ec
> [ 5.548534] --- interrupt: 3000 at 0x7fff87f0c178
> [ 5.548538] NIP: 7fff87f0c178 LR:  CTR: 
> [ 5.548540] REGS: c000afb53e80 TRAP: 3000 Tainted: G E 
> (6.4.0-rc7-next-20230620)
> [ 5.548544] MSR: 8000f033  CR: 44004204 
> XER: 
> [ 5.548552] IRQMASK: 0 
> [ 5.548552] GPR00: 0078 7de8cb80 7fff88637500 
> 01200011 
> [ 5.548552] GPR04:    
> 7fff888bd490 
> [ 5.548552] GPR08: 0001   
>  
> [ 5.548552] GPR12:  7fff888c4c00 0002 
> 7de95698 
> [ 5.548552] GPR16: 7de95690 7de95688 7de956a0 
> 0028 
> [ 5.548552] GPR20: 000132bca308 0001 0001 
> 0315 
> [ 5.548552] GPR24: 0003 0040  
> 0003 
> [ 5.548552] GPR28:   7de8cf24 
> 0045 
> [ 5.548586] NIP [7fff87f0c178] 0x7fff87f0c178
> [ 5.548589] LR [] 0x0
> [ 5.548591] --- interrupt: 3000
> [ 5.548593] Code: e93f 7ce95214 e9070008 7f89502a e9270010 2e3c 
> 41920258 2c29 41820250 813f0028 e8ff00b8 38c80001 <7fdc482a> 7d3c4a14 
> 79250022 552ac03e 
> [ 5.548605] ---[ end trace  ]---
> [ 5.550849] pstore: backend (nvram) writing error (-1)
> [ 5.550852] 
> Starting Network Manager...
> [ 5.566384] BUG: Bad rss-counter state mm:dc60f1c1 type:MM_ANONPAGES 
> val:36
> [ 5.568784] BUG: Bad rss-counter state mm:8eb9341b type:MM_ANONPAGES 
> val:36
> [ 5.689774] BUG: Bad rss-counter state mm:edbda345 type:MM_ANONPAGES 
> val:36
> [ 5.692187] BUG: Bad rss-counter state mm:3f7ec21f 

Re: [PATCH mm-unstable v2 07/10] kvm/powerpc: add kvm_arch_test_clear_young()

2023-06-20 Thread Nicholas Piggin
On Wed Jun 21, 2023 at 10:38 AM AEST, Yu Zhao wrote:
> On Tue, Jun 20, 2023 at 1:48 AM Nicholas Piggin  wrote:
> >
> > On Sat May 27, 2023 at 9:44 AM AEST, Yu Zhao wrote:
> > > Implement kvm_arch_test_clear_young() to support the fast path in
> > > mmu_notifier_ops->test_clear_young().
> > >
> > > It focuses on a simple case, i.e., radix MMU sets the accessed bit in
> > > KVM PTEs and VMs are not nested, where it can rely on RCU and
> > > pte_xchg() to safely clear the accessed bit without taking
> > > kvm->mmu_lock. Complex cases fall back to the existing slow path
> > > where kvm->mmu_lock is then taken.
> > >
> > > Signed-off-by: Yu Zhao 
> > > ---
> > >  arch/powerpc/include/asm/kvm_host.h|  8 
> > >  arch/powerpc/include/asm/kvm_ppc.h |  1 +
> > >  arch/powerpc/kvm/book3s.c  |  6 +++
> > >  arch/powerpc/kvm/book3s.h  |  1 +
> > >  arch/powerpc/kvm/book3s_64_mmu_radix.c | 59 ++
> > >  arch/powerpc/kvm/book3s_hv.c   |  5 +++
> > >  6 files changed, 80 insertions(+)
> > >
> > > diff --git a/arch/powerpc/include/asm/kvm_host.h 
> > > b/arch/powerpc/include/asm/kvm_host.h
> > > index 14ee0dece853..75c260ea8a9e 100644
> > > --- a/arch/powerpc/include/asm/kvm_host.h
> > > +++ b/arch/powerpc/include/asm/kvm_host.h
> > > @@ -883,4 +883,12 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu 
> > > *vcpu, int cpu) {}
> > >  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
> > >  static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
> > >
> > > +#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
> > > +static inline bool kvm_arch_has_test_clear_young(void)
> > > +{
> > > + return IS_ENABLED(CONFIG_KVM_BOOK3S_HV_POSSIBLE) &&
> > > +cpu_has_feature(CPU_FTR_HVMODE) && 
> > > cpu_has_feature(CPU_FTR_ARCH_300) &&
> > > +radix_enabled();
> >
> > This could probably be radix_enabled() && !kvmhv_on_pseries().
>
> Will do. (I used !kvmhv_on_pseries() in v1 but had second thoughts on
> moving kvmhv_on_pseries() into this file.)

That should be okay. kvmhv_on_pseries is a property of the host so it
seems reasonable to move it here if needed.

> > Although unclear why not nested hypervisor... I'd have to think about it a 
> > bit
> > more. Do you have any idea what might go wrong, or just didn't have the
> > time to consider it? (Not just powerpc nested but any others).
>
> Yes, this series excludes nested KVM support on all architectures. The
> common reason for such a decision on powerpc and x86 (aarch64 doesn't
> support nested yet) is that it's quite challenging to make the rmap, a
> complex data structure that maps one PFN to multiple GFNs, lockless.
> (See kvmhv_insert_nest_rmap().) It might be possible; however, the
> potential ROI would be in question.

Okay, just wondering. The rmap (at least the powerpc one) is just a list
I think, with a few details. If that is all it is, it might not be
so hard to make it lock-free, or to use a fine-grained lock on the rmap
chains. But it's fine to ignore that to start with.

> > > +}
> > > +
> > >  #endif /* __POWERPC_KVM_HOST_H__ */
> > > diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
> > > b/arch/powerpc/include/asm/kvm_ppc.h
> > > index 79a9c0bb8bba..ff1af6a7b44f 100644
> > > --- a/arch/powerpc/include/asm/kvm_ppc.h
> > > +++ b/arch/powerpc/include/asm/kvm_ppc.h
> > > @@ -287,6 +287,7 @@ struct kvmppc_ops {
> > >   bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range 
> > > *range);
> > >   bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
> > >   bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
> > > + bool (*test_clear_young)(struct kvm *kvm, struct kvm_gfn_range 
> > > *range);
> > >   bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
> > >   void (*free_memslot)(struct kvm_memory_slot *slot);
> > >   int (*init_vm)(struct kvm *kvm);
> > > diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> > > index 686d8d9eda3e..37bf40b0c4ff 100644
> > > --- a/arch/powerpc/kvm/book3s.c
> > > +++ b/arch/powerpc/kvm/book3s.c
> > > @@ -899,6 +899,12 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct 
> > > kvm_gfn_range *range)
> > >   return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
> > >  }
> > >
> > > +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range 
> > > *range)
> > > +{
> > > + return !kvm->arch.kvm_ops->test_clear_young ||
> > > +kvm->arch.kvm_ops->test_clear_young(kvm, range);
> > > +}
> > > +
> > >  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
> > >  {
> > >   return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
> > > diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
> > > index 58391b4b32ed..fa2659e21ccc 100644
> > > --- a/arch/powerpc/kvm/book3s.h
> > > +++ b/arch/powerpc/kvm/book3s.h
> > > @@ -12,6 +12,7 @@ extern void kvmppc_core_flush_memslot_hv(struct 

Re: [PATCH 15/17] perf tests task_analyzer: fix bad substitution ${$1}

2023-06-20 Thread Namhyung Kim
Hello,

On Tue, Jun 13, 2023 at 1:06 PM Arnaldo Carvalho de Melo
 wrote:
>
> Em Tue, Jun 13, 2023 at 10:11:43PM +0530, Athira Rajeev escreveu:
> > From: Aditya Gupta 
> >
> > ${$1} gives bad substitution error on sh, bash, and zsh. This seems like
> > a typo, and this patch modifies it to $1, since that is what its usage
> > looks like from wherever `check_exec_0` is called.
>
> Nicely spotted!
>
> Please add the people that last touched the problem to the cc list,
> specially when it fixes a bug.
>
> Thanks for adding a Fixes tag, that helps the sta...@kernel.org guys to
> get this propagated to supported kernel releases.
>
> I've added the test author to the CC list in this message.
>
> thanks!
>
> - Arnaldo
>
> > This issue due to ${$1} caused all function calls to fail at the
> > `find_str_or_fail` line, and so no test ran to completion. But
> > 'perf test "perf script task-analyzer tests"' wrongly reports
> > that the tests passed with status OK, even though the tests
> > didn't actually run completely.
> >
> > Fixes: e8478b84d6ba ("perf test: add new task-analyzer tests")
> > Signed-off-by: Athira Rajeev 
> > Signed-off-by: Kajol Jain 
> > Signed-off-by: Aditya Gupta 
> > ---

I'm seeing a different error even after this fix.
Can you please take a look?

Thanks,
Namhyung


$ sudo ./perf test -v task
114: perf script task-analyzer tests :
--- start ---
test child forked, pid 1771042
Please specify a valid report script(see 'perf script -l' for listing)
FAIL: "invocation of perf command failed" Error message: ""
FAIL: "test_basic" Error message: "Failed to find required string:'Comm'."
Please specify a valid report script(see 'perf script -l' for listing)
FAIL: "invocation of perf command failed" Error message: ""
FAIL: "test_ns_rename" Error message: "Failed to find required string:'Comm'."
...
test child finished with -1
 end 
perf script task-analyzer tests: FAILED!


Re: [PATCH mm-unstable v2 07/10] kvm/powerpc: add kvm_arch_test_clear_young()

2023-06-20 Thread Yu Zhao
On Tue, Jun 20, 2023 at 1:48 AM Nicholas Piggin  wrote:
>
> On Sat May 27, 2023 at 9:44 AM AEST, Yu Zhao wrote:
> > Implement kvm_arch_test_clear_young() to support the fast path in
> > mmu_notifier_ops->test_clear_young().
> >
> > It focuses on a simple case, i.e., radix MMU sets the accessed bit in
> > KVM PTEs and VMs are not nested, where it can rely on RCU and
> > pte_xchg() to safely clear the accessed bit without taking
> > kvm->mmu_lock. Complex cases fall back to the existing slow path
> > where kvm->mmu_lock is then taken.
> >
> > Signed-off-by: Yu Zhao 
> > ---
> >  arch/powerpc/include/asm/kvm_host.h|  8 
> >  arch/powerpc/include/asm/kvm_ppc.h |  1 +
> >  arch/powerpc/kvm/book3s.c  |  6 +++
> >  arch/powerpc/kvm/book3s.h  |  1 +
> >  arch/powerpc/kvm/book3s_64_mmu_radix.c | 59 ++
> >  arch/powerpc/kvm/book3s_hv.c   |  5 +++
> >  6 files changed, 80 insertions(+)
> >
> > diff --git a/arch/powerpc/include/asm/kvm_host.h 
> > b/arch/powerpc/include/asm/kvm_host.h
> > index 14ee0dece853..75c260ea8a9e 100644
> > --- a/arch/powerpc/include/asm/kvm_host.h
> > +++ b/arch/powerpc/include/asm/kvm_host.h
> > @@ -883,4 +883,12 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu 
> > *vcpu, int cpu) {}
> >  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
> >  static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
> >
> > +#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
> > +static inline bool kvm_arch_has_test_clear_young(void)
> > +{
> > + return IS_ENABLED(CONFIG_KVM_BOOK3S_HV_POSSIBLE) &&
> > +cpu_has_feature(CPU_FTR_HVMODE) && 
> > cpu_has_feature(CPU_FTR_ARCH_300) &&
> > +radix_enabled();
>
> This could probably be radix_enabled() && !kvmhv_on_pseries().

Will do. (I used !kvmhv_on_pseries() in v1 but had second thoughts on
moving kvmhv_on_pseries() into this file.)

> Although unclear why not nested hypervisor... I'd have to think about it a bit
> more. Do you have any idea what might go wrong, or just didn't have the
> time to consider it? (Not just powerpc nested but any others).

Yes, this series excludes nested KVM support on all architectures. The
common reason for such a decision on powerpc and x86 (aarch64 doesn't
support nested yet) is that it's quite challenging to make the rmap, a
complex data structure that maps one PFN to multiple GFNs, lockless.
(See kvmhv_insert_nest_rmap().) It might be possible; however, the
potential ROI would be in question.
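
For illustration, the "simple case" described in the quoted commit
message -- clearing the accessed bit with pte_xchg() under RCU, without
taking kvm->mmu_lock -- might look roughly like the sketch below (not
the posted patch; the helper name is made up):

/* Hedged sketch: lockless test-and-clear of the accessed bit on one
 * radix KVM PTE. RCU keeps the page table alive, and pte_xchg() (a
 * cmpxchg on the raw PTE) avoids taking kvm->mmu_lock. */
static bool example_test_clear_young_one(pte_t *ptep)
{
	pte_t old = READ_ONCE(*ptep);

	if (!pte_young(old))
		return false;

	/* If the cmpxchg loses a race, the PTE was being touched anyway. */
	pte_xchg(ptep, old, pte_mkold(old));
	return true;
}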

> > +}
> > +
> >  #endif /* __POWERPC_KVM_HOST_H__ */
> > diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
> > b/arch/powerpc/include/asm/kvm_ppc.h
> > index 79a9c0bb8bba..ff1af6a7b44f 100644
> > --- a/arch/powerpc/include/asm/kvm_ppc.h
> > +++ b/arch/powerpc/include/asm/kvm_ppc.h
> > @@ -287,6 +287,7 @@ struct kvmppc_ops {
> >   bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
> >   bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
> >   bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
> > + bool (*test_clear_young)(struct kvm *kvm, struct kvm_gfn_range 
> > *range);
> >   bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
> >   void (*free_memslot)(struct kvm_memory_slot *slot);
> >   int (*init_vm)(struct kvm *kvm);
> > diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> > index 686d8d9eda3e..37bf40b0c4ff 100644
> > --- a/arch/powerpc/kvm/book3s.c
> > +++ b/arch/powerpc/kvm/book3s.c
> > @@ -899,6 +899,12 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct 
> > kvm_gfn_range *range)
> >   return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
> >  }
> >
> > +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range 
> > *range)
> > +{
> > + return !kvm->arch.kvm_ops->test_clear_young ||
> > +kvm->arch.kvm_ops->test_clear_young(kvm, range);
> > +}
> > +
> >  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
> >  {
> >   return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
> > diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
> > index 58391b4b32ed..fa2659e21ccc 100644
> > --- a/arch/powerpc/kvm/book3s.h
> > +++ b/arch/powerpc/kvm/book3s.h
> > @@ -12,6 +12,7 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
> >  extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range 
> > *range);
> >  extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
> >  extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range 
> > *range);
> > +extern bool kvm_test_clear_young_hv(struct kvm *kvm, struct kvm_gfn_range 
> > *range);
> >  extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range 
> > *range);
> >
> >  extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu);
> > diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
> > b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> > 

Re: [PATCH v2 05/12] powerpc: add pte_free_defer() for pgtables sharing page

2023-06-20 Thread Jason Gunthorpe
On Tue, Jun 20, 2023 at 12:54:25PM -0700, Hugh Dickins wrote:
> On Tue, 20 Jun 2023, Jason Gunthorpe wrote:
> > On Tue, Jun 20, 2023 at 12:47:54AM -0700, Hugh Dickins wrote:
> > > Add powerpc-specific pte_free_defer(), to call pte_free() via call_rcu().
> > > pte_free_defer() will be called inside khugepaged's retract_page_tables()
> > > loop, where allocating extra memory cannot be relied upon.  This precedes
> > > the generic version to avoid build breakage from incompatible pgtable_t.
> > > 
> > > This is awkward because the struct page contains only one rcu_head, but
> > > that page may be shared between PTE_FRAG_NR pagetables, each wanting to
> > > use the rcu_head at the same time: account concurrent deferrals with a
> > > heightened refcount, only the first making use of the rcu_head, but
> > > re-deferring if more deferrals arrived during its grace period.
> > 
> > You didn't answer my question why we can't just move the rcu to the
> > actual free page?
> 
> I thought that I had answered it, perhaps not to your satisfaction:
> 
> https://lore.kernel.org/linux-mm/9130acb-193-6fdd-f8df-75766e663...@google.com/
> 
> My conclusion then was:
> Not very good reasons: good enough, or can you supply a better patch?

Oh, I guess I didn't read that email as answering the question..

I was saying to make pte_fragment_free() unconditionally do the
RCU. It is the only thing that uses the page->rcu_head, and it means
PPC would double RCU the final free on the TLB path, but that is
probably OK for now. This means pte_free_defer() won't do anything
special on PPC as PPC will always RCU free these things, which addresses
the defer concern too, I think. Overall it is easier to reason about.

I looked at fixing the TLB stuff to avoid the double rcu but quickly
got scared that ppc was using a kmem_cache to allocate other page
table sizes so there is not a reliable struct page to get a rcu_head
from. This looks like the main challenge for ppc... We'd have to teach
the tlb code to not do its own RCU stuff for table levels that the
arch is already RCU freeing - and that won't get us to full RCU
freeing on PPC.

Anyhow, this is a full version of what I was thinking:

diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index 20652daa1d7e3a..b5dcd0f27fc115 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -106,6 +106,21 @@ pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel)
return __alloc_for_ptecache(mm, kernel);
 }
 
+static void pgtable_free_cb(struct rcu_head *head)
+{
+   struct page *page = container_of(head, struct page, rcu_head);
+
+   pgtable_pte_page_dtor(page);
+   __free_page(page);
+}
+
+static void pgtable_free_cb_kernel(struct rcu_head *head)
+{
+   struct page *page = container_of(head, struct page, rcu_head);
+
+   __free_page(page);
+}
+
 void pte_fragment_free(unsigned long *table, int kernel)
 {
struct page *page = virt_to_page(table);
@@ -115,8 +130,13 @@ void pte_fragment_free(unsigned long *table, int kernel)
 
	BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
+   /*
+* Always RCU free pagetable memory. rcu_head overlaps with lru
+* which is no longer in use by the time the table is freed.
+*/
if (!kernel)
-   pgtable_pte_page_dtor(page);
-   __free_page(page);
+			call_rcu(&page->rcu_head, pgtable_free_cb);
+   else
+			call_rcu(&page->rcu_head, pgtable_free_cb_kernel);
}
 }


Re: [PATCH v4 04/34] pgtable: Create struct ptdesc

2023-06-20 Thread Vishal Moola
On Tue, Jun 20, 2023 at 4:05 PM Jason Gunthorpe  wrote:
>
> On Tue, Jun 20, 2023 at 01:01:39PM -0700, Vishal Moola wrote:
> > On Fri, Jun 16, 2023 at 5:38 AM Jason Gunthorpe  wrote:
> > >
> > > On Mon, Jun 12, 2023 at 02:03:53PM -0700, Vishal Moola (Oracle) wrote:
> > > > Currently, page table information is stored within struct page. As part
> > > > of simplifying struct page, create struct ptdesc for page table
> > > > information.
> > > >
> > > > Signed-off-by: Vishal Moola (Oracle) 
> > > > ---
> > > >  include/linux/pgtable.h | 51 +
> > > >  1 file changed, 51 insertions(+)
> > > >
> > > > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> > > > index c5a51481bbb9..330de96ebfd6 100644
> > > > --- a/include/linux/pgtable.h
> > > > +++ b/include/linux/pgtable.h
> > > > @@ -975,6 +975,57 @@ static inline void ptep_modify_prot_commit(struct 
> > > > vm_area_struct *vma,
> > > >  #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
> > > >  #endif /* CONFIG_MMU */
> > > >
> > > > +
> > > > +/**
> > > > + * struct ptdesc - Memory descriptor for page tables.
> > > > + * @__page_flags: Same as page flags. Unused for page tables.
> > > > + * @pt_list: List of used page tables. Used for s390 and x86.
> > > > + * @_pt_pad_1: Padding that aliases with page's compound head.
> > > > + * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs.
> > > > + * @_pt_s390_gaddr: Aliases with page's mapping. Used for s390 gmap 
> > > > only.
> > > > + * @pt_mm: Used for x86 pgds.
> > > > + * @pt_frag_refcount: For fragmented page table tracking. Powerpc and 
> > > > s390 only.
> > > > + * @ptl: Lock for the page table.
> > > > + *
> > > > + * This struct overlays struct page for now. Do not modify without a 
> > > > good
> > > > + * understanding of the issues.
> > > > + */
> > > > +struct ptdesc {
> > > > + unsigned long __page_flags;
> > > > +
> > > > + union {
> > > > + struct list_head pt_list;
> > > > + struct {
> > > > + unsigned long _pt_pad_1;
> > > > + pgtable_t pmd_huge_pte;
> > > > + };
> > > > + };
> > > > + unsigned long _pt_s390_gaddr;
> > > > +
> > > > + union {
> > > > + struct mm_struct *pt_mm;
> > > > + atomic_t pt_frag_refcount;
> > > > + };
> > > > +
> > > > +#if ALLOC_SPLIT_PTLOCKS
> > > > + spinlock_t *ptl;
> > > > +#else
> > > > + spinlock_t ptl;
> > > > +#endif
> > > > +};
> > >
> > > I think you should include the memcg here too? It needs to be valid
> > > for a ptdesc, even if we don't currently deref it through the ptdesc
> > > type.
> >
> > Yes, thanks for catching that! I'll add it to v5.
> >
> > > Also, do you see a way to someday put a 'struct rcu_head' into here?
> >
> > Eventually, when they're being dynamically allocated independent of
> > struct page. Although at that point I'm not sure if we'll need one.
>
> Sooner than dynamic struct page?
>
> Probably it can overlap pt_list in alot of arches?

Ah yes, there will be one in v5, overlapping with pt_list
(it already does in struct page anyways).
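
A rough sketch of what that overlap could look like (hypothetical field
name, not the posted struct):

/* Hedged sketch only: give ptdesc an rcu_head by unioning it with
 * pt_list, mirroring how struct page already unions lru/rcu_head.
 * The pt_rcu_head name is made up here. */
struct ptdesc_example {
	unsigned long __page_flags;

	union {
		struct list_head pt_list;
		struct rcu_head pt_rcu_head;
		struct {
			unsigned long _pt_pad_1;
			pgtable_t pmd_huge_pte;
		};
	};
	/* ... remaining fields as in the patch above ... */
};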


Re: [PATCH v4 04/34] pgtable: Create struct ptdesc

2023-06-20 Thread Jason Gunthorpe
On Tue, Jun 20, 2023 at 01:01:39PM -0700, Vishal Moola wrote:
> On Fri, Jun 16, 2023 at 5:38 AM Jason Gunthorpe  wrote:
> >
> > On Mon, Jun 12, 2023 at 02:03:53PM -0700, Vishal Moola (Oracle) wrote:
> > > Currently, page table information is stored within struct page. As part
> > > of simplifying struct page, create struct ptdesc for page table
> > > information.
> > >
> > > Signed-off-by: Vishal Moola (Oracle) 
> > > ---
> > >  include/linux/pgtable.h | 51 +
> > >  1 file changed, 51 insertions(+)
> > >
> > > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> > > index c5a51481bbb9..330de96ebfd6 100644
> > > --- a/include/linux/pgtable.h
> > > +++ b/include/linux/pgtable.h
> > > @@ -975,6 +975,57 @@ static inline void ptep_modify_prot_commit(struct 
> > > vm_area_struct *vma,
> > >  #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
> > >  #endif /* CONFIG_MMU */
> > >
> > > +
> > > +/**
> > > + * struct ptdesc - Memory descriptor for page tables.
> > > + * @__page_flags: Same as page flags. Unused for page tables.
> > > + * @pt_list: List of used page tables. Used for s390 and x86.
> > > + * @_pt_pad_1: Padding that aliases with page's compound head.
> > > + * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs.
> > > + * @_pt_s390_gaddr: Aliases with page's mapping. Used for s390 gmap only.
> > > + * @pt_mm: Used for x86 pgds.
> > > + * @pt_frag_refcount: For fragmented page table tracking. Powerpc and 
> > > s390 only.
> > > + * @ptl: Lock for the page table.
> > > + *
> > > + * This struct overlays struct page for now. Do not modify without a good
> > > + * understanding of the issues.
> > > + */
> > > +struct ptdesc {
> > > + unsigned long __page_flags;
> > > +
> > > + union {
> > > + struct list_head pt_list;
> > > + struct {
> > > + unsigned long _pt_pad_1;
> > > + pgtable_t pmd_huge_pte;
> > > + };
> > > + };
> > > + unsigned long _pt_s390_gaddr;
> > > +
> > > + union {
> > > + struct mm_struct *pt_mm;
> > > + atomic_t pt_frag_refcount;
> > > + };
> > > +
> > > +#if ALLOC_SPLIT_PTLOCKS
> > > + spinlock_t *ptl;
> > > +#else
> > > + spinlock_t ptl;
> > > +#endif
> > > +};
> >
> > I think you should include the memcg here too? It needs to be valid
> > for a ptdesc, even if we don't currently deref it through the ptdesc
> > type.
> 
> Yes, thanks for catching that! I'll add it to v5.
> 
> > Also, do you see a way to someday put a 'struct rcu_head' into here?
> 
> Eventually, when they're being dynamically allocated independent of
> struct page. Although at that point I'm not sure if we'll need one.

Sooner than dynamic struct page?

Probably it can overlap pt_list in alot of arches?

Jason


Re: [PATCH v4 04/34] pgtable: Create struct ptdesc

2023-06-20 Thread Vishal Moola
On Fri, Jun 16, 2023 at 5:38 AM Jason Gunthorpe  wrote:
>
> On Mon, Jun 12, 2023 at 02:03:53PM -0700, Vishal Moola (Oracle) wrote:
> > Currently, page table information is stored within struct page. As part
> > of simplifying struct page, create struct ptdesc for page table
> > information.
> >
> > Signed-off-by: Vishal Moola (Oracle) 
> > ---
> >  include/linux/pgtable.h | 51 +
> >  1 file changed, 51 insertions(+)
> >
> > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> > index c5a51481bbb9..330de96ebfd6 100644
> > --- a/include/linux/pgtable.h
> > +++ b/include/linux/pgtable.h
> > @@ -975,6 +975,57 @@ static inline void ptep_modify_prot_commit(struct 
> > vm_area_struct *vma,
> >  #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
> >  #endif /* CONFIG_MMU */
> >
> > +
> > +/**
> > + * struct ptdesc - Memory descriptor for page tables.
> > + * @__page_flags: Same as page flags. Unused for page tables.
> > + * @pt_list: List of used page tables. Used for s390 and x86.
> > + * @_pt_pad_1: Padding that aliases with page's compound head.
> > + * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs.
> > + * @_pt_s390_gaddr: Aliases with page's mapping. Used for s390 gmap only.
> > + * @pt_mm: Used for x86 pgds.
> > + * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 
> > only.
> > + * @ptl: Lock for the page table.
> > + *
> > + * This struct overlays struct page for now. Do not modify without a good
> > + * understanding of the issues.
> > + */
> > +struct ptdesc {
> > + unsigned long __page_flags;
> > +
> > + union {
> > + struct list_head pt_list;
> > + struct {
> > + unsigned long _pt_pad_1;
> > + pgtable_t pmd_huge_pte;
> > + };
> > + };
> > + unsigned long _pt_s390_gaddr;
> > +
> > + union {
> > + struct mm_struct *pt_mm;
> > + atomic_t pt_frag_refcount;
> > + };
> > +
> > +#if ALLOC_SPLIT_PTLOCKS
> > + spinlock_t *ptl;
> > +#else
> > + spinlock_t ptl;
> > +#endif
> > +};
>
> I think you should include the memcg here too? It needs to be valid
> for a ptdesc, even if we don't currently deref it through the ptdesc
> type.

Yes, thanks for catching that! I'll add it to v5.

> Also, do you see a way to someday put a 'struct rcu_head' into here?

Eventually, when they're being dynamically allocated independent of
struct page. Although at that point I'm not sure if we'll need one.

> Thanks,
> Jason


Re: [PATCH v2 05/12] powerpc: add pte_free_defer() for pgtables sharing page

2023-06-20 Thread Hugh Dickins
On Tue, 20 Jun 2023, Jason Gunthorpe wrote:
> On Tue, Jun 20, 2023 at 12:47:54AM -0700, Hugh Dickins wrote:
> > Add powerpc-specific pte_free_defer(), to call pte_free() via call_rcu().
> > pte_free_defer() will be called inside khugepaged's retract_page_tables()
> > loop, where allocating extra memory cannot be relied upon.  This precedes
> > the generic version to avoid build breakage from incompatible pgtable_t.
> > 
> > This is awkward because the struct page contains only one rcu_head, but
> > that page may be shared between PTE_FRAG_NR pagetables, each wanting to
> > use the rcu_head at the same time: account concurrent deferrals with a
> > heightened refcount, only the first making use of the rcu_head, but
> > re-deferring if more deferrals arrived during its grace period.
> 
> You didn't answer my question why we can't just move the rcu to the
> actual free page?

I thought that I had answered it, perhaps not to your satisfaction:

https://lore.kernel.org/linux-mm/9130acb-193-6fdd-f8df-75766e663...@google.com/

My conclusion then was:
Not very good reasons: good enough, or can you supply a better patch?

Hugh

> 
> Since PPC doesn't recycle the frags, we don't need to carefully RCU
> free each frag, we just need to RCU free the entire page when it
> becomes eventually free?
> 
> Jason


Re: [6.4.0-rc7-next-20230620] Boot failure on IBM Power LPAR

2023-06-20 Thread Yu Zhao
On Tue, Jun 20, 2023 at 05:41:57PM +0530, Sachin Sant wrote:
> 6.4.0-rc7-next-20230620 fails to boot on IBM Power LPAR with following

Sorry for hijacking this thread -- I've been seeing another crash on
NV since -rc1 but I haven't had the time to bisect. Just FYI.

[0.814500] BUG: Unable to handle kernel data access on read at 
0xc00a03f9
[0.814814] Faulting instruction address: 0xc0c77324
[0.814988] Oops: Kernel access of bad area, sig: 11 [#1]
[0.815185] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV
[0.815487] Modules linked in:
[0.815653] CPU: 4 PID: 1 Comm: swapper/0 Not tainted 6.4.0-rc7 #1
[0.815980] Hardware name: ZAIUS_FX_10  POWER9 (raw) 0x4e1202 
opal:custom PowerNV
[0.816293] NIP:  c0c77324 LR: c0c7c2c8 CTR: c0c77270
[0.816525] REGS: c0002a416de0 TRAP: 0300   Not tainted  (6.4.0-rc7)
[0.816778] MSR:  90009033   CR: 24008280  
XER: 2004
[0.817113] CFAR: c0c772c8 DAR: c00a03f9 DSISR: 4000 
IRQMASK: 1
[0.817113] GPR00: c0c7c2c8 c0002a417080 c1deea00 
03f9
[0.817113] GPR04: 0001  c27f3b28 
c16e0610
[0.817113] GPR08:  c2b9db10 c2b903e0 
84008282
[0.817113] GPR12:  c03dba00 c00128c8 

[0.817113] GPR16:    
0007
[0.817113] GPR20: c27903f0 c20034a4 c29db000 

[0.817113] GPR24: c29dad70 c29dad50  
0001
[0.817113] GPR28: c2d57378  c2d47378 
c00a03f9
[0.820185] NIP [c0c77324] io_serial_in+0xb4/0x130
[0.820383] LR [c0c7c2c8] serial8250_config_port+0x4b8/0x1680
[0.820610] Call Trace:
[0.820733] [c0002a417080] [c0c768c8] 
serial8250_request_std_resource+0x88/0x200 (unreliable)
[0.821221] [c0002a4170c0] [c0c7be50] 
serial8250_config_port+0x40/0x1680
[0.821623] [c0002a417190] [c0c733ec] 
univ8250_config_port+0x11c/0x1e0
[0.821956] [c0002a4171f0] [c0c71824] 
uart_add_one_port+0x244/0x750
[0.822244] [c0002a417310] [c0c73958] 
serial8250_register_8250_port+0x3b8/0x780
[0.822504] [c0002a4173c0] [c0c7411c] 
serial8250_probe+0x14c/0x1e0
[0.822833] [c0002a417760] [c0cd87e8] platform_probe+0x98/0x1b0
[0.823157] [c0002a4177e0] [c0cd2a50] really_probe+0x130/0x5b0
[0.823517] [c0002a417870] [c0cd2f94] 
__driver_probe_device+0xc4/0x240
[0.823827] [c0002a4178f0] [c0cd3164] 
driver_probe_device+0x54/0x180
[0.824096] [c0002a417930] [c0cd3618] __driver_attach+0x168/0x300
[0.824330] [c0002a4179b0] [c0ccf468] bus_for_each_dev+0xa8/0x130
[0.824650] [c0002a417a10] [c0cd1ef4] driver_attach+0x34/0x50
[0.825094] [c0002a417a30] [c0cd112c] bus_add_driver+0x16c/0x310
[0.825445] [c0002a417ac0] [c0cd55d4] driver_register+0xa4/0x1b0
[0.825787] [c0002a417b30] [c0cd7548] 
__platform_driver_register+0x38/0x50
[0.826037] [c0002a417b50] [c206be38] serial8250_init+0x1f8/0x270
[0.826267] [c0002a417c00] [c0012260] do_one_initcall+0x60/0x300
[0.826529] [c0002a417ce0] [c20052c4] 
kernel_init_freeable+0x3c0/0x484
[0.826883] [c0002a417de0] [c00128f4] kernel_init+0x34/0x1e0
[0.827187] [c0002a417e50] [c000d014] 
ret_from_kernel_user_thread+0x14/0x1c
[0.827595] --- interrupt: 0 at 0x0
[0.827722] NIP:   LR:  CTR: 
[0.827951] REGS: c0002a417e80 TRAP:    Not tainted  (6.4.0-rc7)
[0.828162] MSR:   <>  CR:   XER: 
[0.828394] CFAR:  IRQMASK: 0
[0.828394] GPR00:    

[0.828394] GPR04:    

[0.828394] GPR08:    

[0.828394] GPR12:    

[0.828394] GPR16:    

[0.828394] GPR20:    

[0.828394] GPR24:    

[0.828394] GPR28:    

[0.831329] NIP [] 0x0
[0.831445] LR [] 0x0
[0.831554] --- interrupt: 0
[0.831664] Code: 7bc30020 eba1ffe8 ebc1fff0 ebe1fff8 4e800020 6000 
6042 3d2200db

Re: [PATCH v2 02/12] mm: introduce execmem_text_alloc() and jit_text_alloc()

2023-06-20 Thread Andy Lutomirski



On Mon, Jun 19, 2023, at 1:18 PM, Nadav Amit wrote:
>> On Jun 19, 2023, at 10:09 AM, Andy Lutomirski  wrote:
>> 
>> But jit_text_alloc() can't do this, because the order of operations doesn't 
>> match.  With jit_text_alloc(), the executable mapping shows up before the 
>> text is populated, so there is no atomic change from not-there to 
>> populated-and-executable.  Which means that there is an opportunity for 
>> CPUs, speculatively or otherwise, to start filling various caches with 
>> intermediate states of the text, which means that various architectures 
>> (even x86!) may need serialization.
>> 
>> For eBPF- and module- like use cases, where JITting/code gen is quite 
>> coarse-grained, perhaps something vaguely like:
>> 
>> jit_text_alloc() -> returns a handle and an executable virtual address, but 
>> does *not* map it there
>> jit_text_write() -> write to that handle
>> jit_text_map() -> map it and synchronize if needed (no sync needed on x86, I 
>> think)
>
> Andy, would you mind explaining why you think a sync is not needed? I 
> mean I have a “feeling” that perhaps TSO can guarantee something based 
> on the order of write and page-table update. Is that the argument?

Sorry, when I say "no sync" I mean no cross-CPU synchronization.  I'm assuming 
the underlying sequence of events is:

allocate physical pages (jit_text_alloc)

write to them (with MOV, memcpy, whatever), via the direct map or via a 
temporary mm

do an appropriate *local* barrier (which, on x86, is probably implied by TSO, 
as the subsequent pagetable change is at least a release; also, any 
previous temporary mm stuff would have done MOV CR3 afterwards, which is a full 
"serializing" barrier)

optionally zap the direct map via IPI, assuming the pages are direct mapped 
(but this could be avoided with a smart enough allocator and temporary_mm above)

install the final RX PTE (jit_text_map), which does a MOV or maybe a LOCK 
CMPXCHG16B.  Note that the virtual address in question was not readable or 
executable before this, and all CPUs have serialized since the last time it was 
executable.

either jump to the new text locally, or:

1. Do a store-release to tell other CPUs that the text is mapped
2. Other CPU does a load-acquire to detect that the text is mapped and jumps to 
the text

This is all approximately the same thing that plain old mmap(..., PROT_EXEC, 
...) does.

>
> On this regard, one thing that I clearly do not understand is why 
> *today* it is ok for users of bpf_arch_text_copy() not to call 
> text_poke_sync(). Am I missing something?

I cannot explain this, because I suspect the current code is wrong.  But it's 
only wrong across CPUs, because bpf_arch_text_copy goes through text_poke_copy, 
which calls unuse_temporary_mm(), which is serializing.  And it's plausible 
that most eBPF use cases don't actually cause the loaded program to get used on 
a different CPU without first serializing on the CPU that ends up using it.  
(Context switches and interrupts are serializing.)

FRED could make interrupts non-serializing. I sincerely hope that FRED doesn't 
cause this all to fall apart.

--Andy


Re: [PATCH v2 06/12] mm/execmem: introduce execmem_data_alloc()

2023-06-20 Thread Alexei Starovoitov
On Tue, Jun 20, 2023 at 7:51 AM Steven Rostedt  wrote:
>
> On Mon, 19 Jun 2023 02:43:58 +0200
> Thomas Gleixner  wrote:
>
> > Now you might argue that it _is_ a "hotpath" due to the BPF usage, but
> > then even more so as any intermediate wrapper which converts from one
> > data representation to another data representation is not going to
> > increase performance, right?
>
> Just as a side note. BPF can not attach its return calling code to
> functions that have more than 6 parameters (3 on 32 bit x86), because of
> the way BPF return path trampoline works. It is a requirement that all
> parameters live in registers, and none on the stack.

It's actually 7 and that restriction is being lifted.
The patch set to attach to <= 12 is being discussed.


[PATCH v2 2/2] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to MM CPUs

2023-06-20 Thread Yair Podemsky
Currently the tlb_remove_table_smp_sync IPI is sent to all CPUs
indiscriminately; this causes unnecessary work and delays that are
notable in real-time use-cases and on isolated cpus.
This patch limits the IPI on systems with ARCH_HAS_CPUMASK_BITS,
where the IPI will only be sent to cpus referencing the affected mm.

Signed-off-by: Yair Podemsky 
Suggested-by: David Hildenbrand 
---
 include/asm-generic/tlb.h |  4 ++--
 mm/khugepaged.c   |  4 ++--
 mm/mmu_gather.c   | 17 -
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b46617207c93..0b6ba17cc8d3 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -222,7 +222,7 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void 
*table);
 #define tlb_needs_table_invalidate() (true)
 #endif
 
-void tlb_remove_table_sync_one(void);
+void tlb_remove_table_sync_one(struct mm_struct *mm);
 
 #else
 
@@ -230,7 +230,7 @@ void tlb_remove_table_sync_one(void);
 #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
 #endif
 
-static inline void tlb_remove_table_sync_one(void) { }
+static inline void tlb_remove_table_sync_one(struct mm_struct *mm) { }
 
 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6b9d39d65b73..3e5cb079d268 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1166,7 +1166,7 @@ static int collapse_huge_page(struct mm_struct *mm, 
unsigned long address,
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end();
-   tlb_remove_table_sync_one();
+   tlb_remove_table_sync_one(mm);
 
spin_lock(pte_ptl);
result =  __collapse_huge_page_isolate(vma, address, pte, cc,
@@ -1525,7 +1525,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, 
struct vm_area_struct *v
addr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start();
pmd = pmdp_collapse_flush(vma, addr, pmdp);
-   tlb_remove_table_sync_one();
+   tlb_remove_table_sync_one(mm);
mmu_notifier_invalidate_range_end();
mm_dec_nr_ptes(mm);
page_table_check_pte_clear_range(mm, addr, pmd);
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index ea9683e12936..692d8175a88e 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -191,7 +191,13 @@ static void tlb_remove_table_smp_sync(void *arg)
/* Simply deliver the interrupt */
 }
 
-void tlb_remove_table_sync_one(void)
+#ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
+#define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
+#else
+#define REMOVE_TABLE_IPI_MASK cpu_online_mask
+#endif /* CONFIG_ARCH_HAS_CPUMASK_BITS */
+
+void tlb_remove_table_sync_one(struct mm_struct *mm)
 {
/*
 * This isn't an RCU grace period and hence the page-tables cannot be
@@ -200,7 +206,8 @@ void tlb_remove_table_sync_one(void)
 * It is however sufficient for software page-table walkers that rely on
 * IRQ disabling.
 */
-   smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+   on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
+   NULL, true);
 }
 
 static void tlb_remove_table_rcu(struct rcu_head *head)
@@ -237,9 +244,9 @@ static inline void tlb_table_invalidate(struct mmu_gather 
*tlb)
}
 }
 
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_one(struct mm_struct *mm, void *table)
 {
-   tlb_remove_table_sync_one();
+   tlb_remove_table_sync_one(mm);
__tlb_remove_table(table);
 }
 
@@ -262,7 +269,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | 
__GFP_NOWARN);
if (*batch == NULL) {
tlb_table_invalidate(tlb);
-   tlb_remove_table_one(table);
+   tlb_remove_table_one(tlb->mm, table);
return;
}
(*batch)->nr = 0;
---
v2: when the arch does not maintain mm_cpumask, REMOVE_TABLE_IPI_MASK now falls back to cpu_online_mask
-- 
2.39.3



[PATCH v2 1/2] arch: Introduce ARCH_HAS_CPUMASK_BITS

2023-06-20 Thread Yair Podemsky
Some architectures set and maintain the mm_cpumask bits when loading
a process onto a cpu or removing it from one.
This Kconfig option marks those architectures, to allow different behavior
between kernels that maintain the mm_cpumask and those that do not.

Signed-off-by: Yair Podemsky 
---
 arch/Kconfig | 8 
 arch/arm/Kconfig | 1 +
 arch/powerpc/Kconfig | 1 +
 arch/s390/Kconfig| 1 +
 arch/sparc/Kconfig   | 1 +
 arch/x86/Kconfig | 1 +
 6 files changed, 13 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index 205fd23e0cad..953fbfa5a2ad 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1466,6 +1466,14 @@ config ARCH_HAS_NONLEAF_PMD_YOUNG
  address translations. Page table walkers that clear the accessed bit
  may use this capability to reduce their search space.
 
+config ARCH_HAS_CPUMASK_BITS
+   bool
+   help
+ Architectures that select this option set bits on the mm_cpumask
+ to mark which cpus loaded the mm. The mask can then be used to
+ control mm-specific actions such as tlb_flush.
+
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0fb4b218f665..cd20e96bc1dc 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -70,6 +70,7 @@ config ARM
select GENERIC_SMP_IDLE_THREAD
select HARDIRQS_SW_RESEND
select HAS_IOPORT
+   select ARCH_HAS_CPUMASK_BITS
select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT
select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index bff5820b7cda..c9218722aa2f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -156,6 +156,7 @@ config PPC
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE
select ARCH_HAS_UBSAN_SANITIZE_ALL
+   select ARCH_HAS_CPUMASK_BITS
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_KEEP_MEMBLOCK
select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 6dab9c1be508..60bf29bc3f87 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -84,6 +84,7 @@ config S390
select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_VDSO_DATA
+   select ARCH_HAS_CPUMASK_BITS
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_INLINE_READ_LOCK
select ARCH_INLINE_READ_LOCK_BH
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 8535e19062f6..e8bf4d769306 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -99,6 +99,7 @@ config SPARC64
select ARCH_HAS_PTE_SPECIAL
select PCI_DOMAINS if PCI
select ARCH_HAS_GIGANTIC_PAGE
+   select ARCH_HAS_CPUMASK_BITS
select HAVE_SOFTIRQ_ON_OWN_STACK
select HAVE_SETUP_PER_CPU_AREA
select NEED_PER_CPU_EMBED_FIRST_CHUNK
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 53bab123a8ee..b351421695f3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -185,6 +185,7 @@ config X86
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_STACKLEAK
select HAVE_ARCH_TRACEHOOK
+   select ARCH_HAS_CPUMASK_BITS
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
select HAVE_ARCH_USERFAULTFD_WP if X86_64 && USERFAULTFD
-- 
2.39.3



[PATCH v2 0/2] send tlb_remove_table_smp_sync IPI only to necessary CPUs

2023-06-20 Thread Yair Podemsky
Currently the tlb_remove_table_smp_sync IPI is sent to all CPUs
indiscriminately; this causes unnecessary work and delays that are
notable in real-time use-cases and on isolated cpus.
This series limits the IPI to cpus referencing the affected mm.
A config option differentiating architectures that maintain mm_cpumask
from those that don't allows safe usage of this feature.

changes from -v1:
- The previous version included a patch to only send the IPI to CPUs with
context_tracking in the kernel space; this was removed due to race
condition concerns.
- for archs that do not maintain mm_cpumask the mask used should be
 cpu_online_mask (Peter Zijlstra).
 
 v1: https://lore.kernel.org/all/20230404134224.137038-1-ypode...@redhat.com/

Yair Podemsky (2):
  arch: Introduce ARCH_HAS_CPUMASK_BITS
  mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to MM CPUs

 arch/Kconfig  |  8 
 arch/arm/Kconfig  |  1 +
 arch/powerpc/Kconfig  |  1 +
 arch/s390/Kconfig |  1 +
 arch/sparc/Kconfig|  1 +
 arch/x86/Kconfig  |  1 +
 include/asm-generic/tlb.h |  4 ++--
 mm/khugepaged.c   |  4 ++--
 mm/mmu_gather.c   | 17 -
 9 files changed, 29 insertions(+), 9 deletions(-)

-- 
2.39.3



Re: [PATCH v2 06/12] mm/execmem: introduce execmem_data_alloc()

2023-06-20 Thread Steven Rostedt
On Mon, 19 Jun 2023 02:43:58 +0200
Thomas Gleixner  wrote:

> Now you might argue that it _is_ a "hotpath" due to the BPF usage, but
> then even more so as any intermediate wrapper which converts from one
> data representation to another data representation is not going to
> increase performance, right?

Just as a side note. BPF can not attach its return calling code to
functions that have more than 6 parameters (3 on 32 bit x86), because of
the way BPF return path trampoline works. It is a requirement that all
parameters live in registers, and none on the stack.

-- Steve


Re: [PATCH v2 08/16] mm/vmemmap: Improve vmemmap_can_optimize and allow architectures to override

2023-06-20 Thread Aneesh Kumar K.V
Joao Martins  writes:

> On 16/06/2023 12:08, Aneesh Kumar K.V wrote:
>> dax vmemmap optimization requires a minimum of 2 PAGE_SIZE area within
>> vmemmap such that tail page mapping can point to the second PAGE_SIZE area.
>> Enforce that in vmemmap_can_optimize() function.
>> 
>> Architectures like powerpc also want to enable vmemmap optimization
>> conditionally (only with radix MMU translation). Hence allow architecture
>> override.
>> 
> This makes sense. The enforcing here is not just for correctness but because 
> you
> want to use VMEMMAP_RESERVE_NR supposedly?
>
> I would suggest having two patches one for the refactor and another one for 
> the
> override, but I don't feel particularly strongly about it.
>

I will wait for feedback from others. If others also suggest
splitting the patch, I will do that.

>> Signed-off-by: Aneesh Kumar K.V 
>> ---
>>  include/linux/mm.h | 30 ++
>>  mm/mm_init.c   |  2 +-
>>  2 files changed, 27 insertions(+), 5 deletions(-)
>> 
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 27ce77080c79..9a45e61cd83f 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -31,6 +31,8 @@
>>  #include 
>>  #include 
>>  
>> +#include 
>> +
>
> Why is this include needed?

That was for PAGE_SHIFT. But then we do pull that through other include
dependencies. I will see if I can drop that.

>
>>  struct mempolicy;
>>  struct anon_vma;
>>  struct anon_vma_chain;
>> @@ -3550,13 +3552,33 @@ void vmemmap_free(unsigned long start, unsigned long 
>> end,
>>  struct vmem_altmap *altmap);
>>  #endif
>>  
>> +#define VMEMMAP_RESERVE_NR  2
>
> see below
>
>>  #ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
>> -static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
>> -   struct dev_pagemap *pgmap)
>> +static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
>> +  struct dev_pagemap *pgmap)
>>  {
>> -return is_power_of_2(sizeof(struct page)) &&
>> -pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
>> +if (pgmap) {
>> +unsigned long nr_pages;
>> +unsigned long nr_vmemmap_pages;
>> +
>> +nr_pages = pgmap_vmemmap_nr(pgmap);
>> +nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> 
>> PAGE_SHIFT);
>> +/*
>> + * For vmemmap optimization with DAX we need minimum 2 vmemmap
>
>
>
>> + * pages. See layout diagram in 
>> Documentation/mm/vmemmap_dedup.rst
>> + */
>> +return is_power_of_2(sizeof(struct page)) &&
>> +(nr_vmemmap_pages > VMEMMAP_RESERVE_NR) && !altmap;
>> +}
>
> It would be more readable (i.e. less identation) if you just reverse this:
>
>   unsigned long nr_vmemmap_pages;
>
>   if (!pgmap || !is_power_of_2(sizeof(struct page))
>   return false;
>
>   nr_vmemmap_pages = ((pgmap_vmemmap_nr(pgmap) *
>sizeof(struct page)) >> PAGE_SHIFT);
>
>   /*
>* For vmemmap optimization with DAX we need minimum 2 vmemmap
>* pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
>*/
>   return (nr_vmemmap_pages > VMEMMAP_RESERVE_NR) && !altmap;
>
>

Will update



>> +return false;
>>  }
>> +/*
>> + * If we don't have an architecture override, use the generic rule
>> + */
>> +#ifndef vmemmap_can_optimize
>> +#define vmemmap_can_optimize __vmemmap_can_optimize
>> +#endif
>> +
>
> sparse-vmemmap code is trivial to change to use dedup a single vmemmap page
> (e.g. to align with hugetlb), hopefully the architecture override to do. this 
> is
> to say whether VMEMMAP_RESERVE_NR should have similar to above?

VMEMMAP_RESERVE_NR was added to avoid the usage of `2` in the below code.
The reason we need the arch override was to add an additional check on
ppc64 as shown by patch

https://lore.kernel.org/linux-mm/20230616110826.344417-16-aneesh.ku...@linux.ibm.com/

bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
if (radix_enabled())
return __vmemmap_can_optimize(altmap, pgmap);

return false;
}


>
>>  #else
>>  static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
>> struct dev_pagemap *pgmap)
>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>> index 7f7f9c677854..d1676afc94f1 100644
>> --- a/mm/mm_init.c
>> +++ b/mm/mm_init.c
>> @@ -1020,7 +1020,7 @@ static inline unsigned long compound_nr_pages(struct 
>> vmem_altmap *altmap,
>>  if (!vmemmap_can_optimize(altmap, pgmap))
>>  return pgmap_vmemmap_nr(pgmap);
>>  
>> -return 2 * (PAGE_SIZE / sizeof(struct page));
>> +return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
>>  }
>>  
>>  static void __ref memmap_init_compound(struct page *head,


[PATCH v7 17/19] powerpc: mm: Convert to GENERIC_IOREMAP

2023-06-20 Thread Baoquan He
From: Christophe Leroy 

By taking the GENERIC_IOREMAP method, the generic generic_ioremap_prot(),
generic_iounmap(), and their generic wrappers ioremap_prot(), ioremap()
and iounmap() are all visible and available to the arch. The arch needs to
provide wrapper functions to override the generic versions if there's
arch-specific handling in its ioremap_prot(), ioremap() or iounmap().
This change simplifies the implementation by removing code duplicated
with generic_ioremap_prot() and generic_iounmap(), and has the equivalent
functionality as before.

Here, add wrapper functions ioremap_prot() and iounmap() for powerpc's
special handling in ioremap() and iounmap().
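
As a rough illustration of that override pattern (not the powerpc code
in this patch; the generic_ioremap_prot() signature is assumed from the
series), an arch wrapper would look something like:

/* Hedged sketch: the arch overrides the generic ioremap_prot() by
 * defining a same-named macro and supplying its own function, doing
 * any arch fixups before falling back to the generic helper. */
#define ioremap_prot ioremap_prot
void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long flags)
{
	/* arch-specific adjustments would go here */
	return generic_ioremap_prot(addr, size, __pgprot(flags));
}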

Signed-off-by: Christophe Leroy 
Signed-off-by: Baoquan He 
Reviewed-by: Christoph Hellwig 
Reviewed-by: Mike Rapoport (IBM) 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/Kconfig  |  1 +
 arch/powerpc/include/asm/io.h |  8 +++-
 arch/powerpc/mm/ioremap.c | 26 +-
 arch/powerpc/mm/ioremap_32.c  | 19 +--
 arch/powerpc/mm/ioremap_64.c  | 12 ++--
 5 files changed, 16 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index bff5820b7cda..aadb280a539e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -194,6 +194,7 @@ config PPC
select GENERIC_CPU_VULNERABILITIES  if PPC_BARRIER_NOSPEC
select GENERIC_EARLY_IOREMAP
select GENERIC_GETTIMEOFDAY
+   select GENERIC_IOREMAP
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL
select GENERIC_PCI_IOMAPif PCI
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 67a3fb6de498..0732b743e099 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -889,8 +889,8 @@ static inline void iosync(void)
  *
  */
 extern void __iomem *ioremap(phys_addr_t address, unsigned long size);
-extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size,
- unsigned long flags);
+#define ioremap ioremap
+#define ioremap_prot ioremap_prot
 extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size);
 #define ioremap_wc ioremap_wc
 
@@ -904,14 +904,12 @@ void __iomem *ioremap_coherent(phys_addr_t address, 
unsigned long size);
 #define ioremap_cache(addr, size) \
ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL))
 
-extern void iounmap(volatile void __iomem *addr);
+#define iounmap iounmap
 
 void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size);
 
 int early_ioremap_range(unsigned long ea, phys_addr_t pa,
unsigned long size, pgprot_t prot);
-void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long 
size,
-pgprot_t prot, void *caller);
 
 extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size,
  pgprot_t prot, void *caller);
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
index 4f12504fb405..705e8e8ffde4 100644
--- a/arch/powerpc/mm/ioremap.c
+++ b/arch/powerpc/mm/ioremap.c
@@ -41,7 +41,7 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned 
long size)
return __ioremap_caller(addr, size, prot, caller);
 }
 
-void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long 
flags)
+void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long flags)
 {
pte_t pte = __pte(flags);
void *caller = __builtin_return_address(0);
@@ -74,27 +74,3 @@ int early_ioremap_range(unsigned long ea, phys_addr_t pa,
 
return 0;
 }
-
-void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long 
size,
-pgprot_t prot, void *caller)
-{
-   struct vm_struct *area;
-   int ret;
-   unsigned long va;
-
-   area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, 
IOREMAP_END, caller);
-   if (area == NULL)
-   return NULL;
-
-   area->phys_addr = pa;
-   va = (unsigned long)area->addr;
-
-   ret = ioremap_page_range(va, va + size, pa, prot);
-   if (!ret)
-   return (void __iomem *)area->addr + offset;
-
-   vunmap_range(va, va + size);
-   free_vm_area(area);
-
-   return NULL;
-}
diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c
index 9d13143b8be4..ca5bc6be3e6f 100644
--- a/arch/powerpc/mm/ioremap_32.c
+++ b/arch/powerpc/mm/ioremap_32.c
@@ -21,6 +21,13 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, 
pgprot_t prot, void *call
phys_addr_t p, offset;
int err;
 
+   /*
+* If the address lies within the first 16 MB, assume it's in ISA
+* memory space
+*/
+   if (addr < SZ_16M)
+   addr += _ISA_MEM_BASE;
+
/*
 * Choose an address to map it to.
 * Once the vmalloc system is running, we use 

Re: [RFC PATCH 2/3] powerpc/pci: Remove MVE code

2023-06-20 Thread Michael Ellerman
Joel Stanley  writes:
> With IODA1 support gone the OPAL calls to set MVE are dead code. Remove
> them.
>
> TODO: Do we have rules for removing unused OPAL APIs? Should we leave it
> in opal.h? opal-call.c?

I don't think we have any rules for removal.

When skiboot was being actively developed Stewart and I did try to keep
the skiboot and kernel opal-api.h in sync, but I haven't checked on the
state of that for a few years. ... And I just checked and boy they are
*not* in sync anymore.

But anyway opal.h and opal-call.c are just kernel internal, so I think
we just remove unused code there as normal.

cheers


Re: [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter

2023-06-20 Thread David Hildenbrand

On 20.06.23 14:35, Michael Ellerman wrote:

David Hildenbrand  writes:

On 09.06.23 08:08, Aneesh Kumar K.V wrote:

Certain devices can possess non-standard memory capacities, not constrained
to multiples of 1GB. Provide a kernel parameter so that we can map the
device memory completely on memory hotplug.


So, the unfortunate thing is that these devices would have worked out of
the box before the memory block size was increased from 256 MiB to 1 GiB
in these setups. Now, one has to fine-tune the memory block size. The
only other arch that I know, which supports setting the memory block
size, is x86 for special (large) UV systems -- and at least in the past
128 MiB vs. 2 GiB memory blocks made a performance difference during
boot (maybe no longer today, who knows).


Obviously, less tunable and getting stuff simply working out of the box
is preferable.

Two questions:

1) Isn't there a way to improve auto-detection to fallback to 256 MiB in
these setups, to avoid specifying these parameters?

2) Is the 256 MiB -> 1 GiB memory block size switch really worth it? On
x86-64, experiments (with direct map fragmentation) showed that the
effective performance boost is pretty insignificant, so I wonder how big
the 1 GiB direct map performance improvement is.


The other issue is simply the number of sysfs entries.

With 64TB of memory and a 256MB block size you end up with ~250,000
directories in /sys/devices/system/memory.


Yes, and so far on other archs we only optimize for that on UV x86 
systems (with a default of 2 GiB). And that was added before we started 
to speed up memory device lookups significantly using a radix tree IIRC.


It's worth noting that there was a discussion on:

(a) not creating these device sysfs entries (when configured on the 
cmdline); often, nobody really ends up using them to online/offline 
memory blocks. Then, the only primary user is lsmem.


(b) exposing logical devices (e.g., a DIMM) that can only be 
offlined/removed as a whole, instead of their individual memblocks (when 
configured on the cmdline). But for PPC64 that won't help.



But (a) gets more tricky if device drivers (and things like dax/kmem) 
rely on user-space memory onlining/offlining.


--
Cheers,

David / dhildenb



Re: [PATCH v2 2/2] powerpc/mm: Add memory_block_size as a kernel parameter

2023-06-20 Thread Michael Ellerman
David Hildenbrand  writes:
> On 09.06.23 08:08, Aneesh Kumar K.V wrote:
>> Certain devices can possess non-standard memory capacities, not constrained
>> to multiples of 1GB. Provide a kernel parameter so that we can map the
>> device memory completely on memory hotplug.
>
> So, the unfortunate thing is that these devices would have worked out of 
> the box before the memory block size was increased from 256 MiB to 1 GiB 
> in these setups. Now, one has to fine-tune the memory block size. The 
> only other arch that I know, which supports setting the memory block 
> size, is x86 for special (large) UV systems -- and at least in the past 
> 128 MiB vs. 2 GiB memory blocks made a performance difference during 
> boot (maybe no longer today, who knows).
>
>
> Obviously, less tunable and getting stuff simply working out of the box 
> is preferable.
>
> Two questions:
>
> 1) Isn't there a way to improve auto-detection to fallback to 256 MiB in 
> these setups, to avoid specifying these parameters?
>
> 2) Is the 256 MiB -> 1 GiB memory block size switch really worth it? On 
> x86-64, experiments (with direct map fragmentation) showed that the 
> effective performance boost is pretty insignificant, so I wonder how big 
> the 1 GiB direct map performance improvement is.

The other issue is simply the number of sysfs entries.

With 64TB of memory and a 256MB block size you end up with ~250,000
directories in /sys/devices/system/memory.
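
For reference, the arithmetic behind that figure (plain back-of-envelope,
nothing kernel specific):

#include <stdio.h>

int main(void)
{
	unsigned long long mem = 64ULL << 40;	/* 64 TiB of memory */
	unsigned long long blk = 256ULL << 20;	/* 256 MiB memory block size */

	/* 262144 memory blocks, i.e. ~250,000 sysfs directories */
	printf("%llu\n", mem / blk);
	return 0;
}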

cheers


Re: [PATCH v2] security/integrity: fix pointer to ESL data and its size on pseries

2023-06-20 Thread R Nageswara Sastry




On 08/06/23 5:34 pm, Nayna Jain wrote:

On a PowerVM guest, variable data is prefixed with an 8-byte timestamp.
Extract the ESL by stripping off the timestamp before passing it to the ESL parser.
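
A minimal sketch of that stripping (illustrative only; the helper name is
mine, and the patch below does this with an extract_esl() macro for the
PLPKS case):

/*
 * PLPKS authenticated variable as returned by firmware on PowerVM:
 *
 *   +------------------+------------------------------------+
 *   | 8-byte timestamp | EFI signature list (ESL) data ...  |
 *   +------------------+------------------------------------+
 *
 * Only the second part may be handed to the ESL parser.
 */
static const u8 *skip_plpks_timestamp(const u8 *data, u64 *size)
{
	*size -= 8;
	return data + 8;
}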

Fixes: 4b3e71e9a34c ("integrity/powerpc: Support loading keys from PLPKS")
Cc: sta...@vger.kenrnel.org # v6.3
Signed-off-by: Nayna Jain 
---


Tested-by: Nageswara R Sastry 

Seeing the keyring difference with and without the patch:
Without the patch:
[root@ltcrain80-lp2 ~]# grep keyring /proc/keys | grep -v _ses
0131ebb4 I--Q--- 1 perm 0c03 0 65534 keyring   .user_reg: 2
039dfd93 I-- 1 perm 1f0f 0 0 keyring   .ima: 1
03a160b5 I-- 1 perm 1f0b 0 0 keyring 
.builtin_trusted_keys: 2

05377b73 I-- 1 perm 1f0b 0 0 keyring   .platform: empty
0d7ea730 I-- 1 perm 0f0b 0 0 keyring   .blacklist: empty
16235f2d I--Q--- 6 perm 1f3f 0 65534 keyring   _uid.0: empty
1721f130 I-- 1 perm 1f0f 0 0 keyring   .evm: empty

With the patch:
[root@ltcrain80-lp2 ~]# grep keyring /proc/keys | grep -v _ses
04820159 I-- 1 perm 0f0b 0 0 keyring   .blacklist: 1
16d05827 I--Q--- 1 perm 0c03 0 65534 keyring   .user_reg: 2
17648d6a I-- 1 perm 1f0b 0 0 keyring 
.builtin_trusted_keys: 2

2158b34f I--Q--- 6 perm 1f3f 0 65534 keyring   _uid.0: empty
2237eff6 I-- 1 perm 1f0f 0 0 keyring   .evm: empty
26d0330c I-- 1 perm 1f0b 0 0 keyring   .platform: 1
2daa48ab I-- 1 perm 1f0f 0 0 keyring   .ima: 1

Thank you.

Changelog:
v2: Fixed feedback from Jarkko
   * added CC to stable
   * moved *data declaration to same line as *db,*dbx
 Renamed extract_data() macro to extract_esl() for clarity
  .../integrity/platform_certs/load_powerpc.c   | 40 ---
  1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/security/integrity/platform_certs/load_powerpc.c 
b/security/integrity/platform_certs/load_powerpc.c
index b9de70b90826..170789dc63d2 100644
--- a/security/integrity/platform_certs/load_powerpc.c
+++ b/security/integrity/platform_certs/load_powerpc.c
@@ -15,6 +15,9 @@
  #include "keyring_handler.h"
  #include "../integrity.h"
  
+#define extract_esl(db, data, size, offset)	\

+   do { db = data + offset; size = size - offset; } while (0)
+
  /*
   * Get a certificate list blob from the named secure variable.
   *
@@ -55,8 +58,9 @@ static __init void *get_cert_list(u8 *key, unsigned long 
keylen, u64 *size)
   */
  static int __init load_powerpc_certs(void)
  {
-   void *db = NULL, *dbx = NULL;
-   u64 dbsize = 0, dbxsize = 0;
+   void *db = NULL, *dbx = NULL, *data = NULL;
+   u64 dsize = 0;
+   u64 offset = 0;
int rc = 0;
ssize_t len;
char buf[32];
@@ -74,38 +78,46 @@ static int __init load_powerpc_certs(void)
return -ENODEV;
}
  
+	if (strcmp("ibm,plpks-sb-v1", buf) == 0)

+   /* PLPKS authenticated variables ESL data is prefixed with 8 
bytes of timestamp */
+   offset = 8;
+
/*
 * Get db, and dbx. They might not exist, so it isn't an error if we
 * can't get them.
 */
-   db = get_cert_list("db", 3, &dbsize);
-   if (!db) {
+   data = get_cert_list("db", 3, &dsize);
+   if (!data) {
pr_info("Couldn't get db list from firmware\n");
-   } else if (IS_ERR(db)) {
-   rc = PTR_ERR(db);
+   } else if (IS_ERR(data)) {
+   rc = PTR_ERR(data);
pr_err("Error reading db from firmware: %d\n", rc);
return rc;
} else {
-   rc = parse_efi_signature_list("powerpc:db", db, dbsize,
+   extract_esl(db, data, dsize, offset);
+
+   rc = parse_efi_signature_list("powerpc:db", db, dsize,
  get_handler_for_db);
if (rc)
pr_err("Couldn't parse db signatures: %d\n", rc);
-   kfree(db);
+   kfree(data);
}
  
-	dbx = get_cert_list("dbx", 4, &dbxsize);

-   if (!dbx) {
+   data = get_cert_list("dbx", 4, &dsize);
+   if (!data) {
pr_info("Couldn't get dbx list from firmware\n");
-   } else if (IS_ERR(dbx)) {
-   rc = PTR_ERR(dbx);
+   } else if (IS_ERR(data)) {
+   rc = PTR_ERR(data);
pr_err("Error reading dbx from firmware: %d\n", rc);
return rc;
} else {
-   rc = parse_efi_signature_list("powerpc:dbx", dbx, dbxsize,
+   extract_esl(dbx, data, dsize, offset);
+
+   rc = parse_efi_signature_list("powerpc:dbx", dbx, dsize,
  get_handler_for_dbx);
if (rc)
pr_err("Couldn't parse dbx signatures: %d\n", rc);
-   kfree(dbx);
+   

Re: [PATCH] cxl/ocxl: Possible repeated word

2023-06-20 Thread Michael Ellerman
zhumao...@208suo.com writes:
> Delete repeated word in comment.
>
> Signed-off-by: Zhu Mao 
> ---
>   drivers/misc/cxl/native.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
> index 50b0c44bb8d7..6957946a6463 100644
> --- a/drivers/misc/cxl/native.c
> +++ b/drivers/misc/cxl/native.c
> @@ -920,7 +920,7 @@ int cxl_attach_dedicated_process_psl9(struct 
> cxl_context *ctx, u64 wed, u64 amr)
>* Ideally we should do a wmb() here to make sure the changes to 
> the
>* PE are visible to the card before we call afu_enable.
>* On ppc64 though all mmios are preceded by a 'sync' instruction 
> hence
> - * we dont dont need one here.
> + * we dont need one here.

I know the rule is to only do one change per patch, but if you're fixing
the wording of a comment you can absolutely fix the spelling at the same
time.

So please send a v2 with "dont" spelt correctly.

cheers


[6.4.0-rc7-next-20230620] Boot failure on IBM Power LPAR

2023-06-20 Thread Sachin Sant
6.4.0-rc7-next-20230620 fails to boot on IBM Power LPAR with following

[ 5.548368] BUG: Unable to handle kernel data access at 0x95bdcf954bc34e73
[ 5.548380] Faulting instruction address: 0xc0548090
[ 5.548384] Oops: Kernel access of bad area, sig: 11 [#1]
[ 5.548387] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
[ 5.548391] Modules linked in: nf_tables(E) nfnetlink(E) sunrpc(E) 
binfmt_misc(E) pseries_rng(E) aes_gcm_p10_crypto(E) drm(E) 
drm_panel_orientation_quirks(E) xfs(E) libcrc32c(E) sd_mod(E) sr_mod(E) 
t10_pi(E) crc64_rocksoft_generic(E) cdrom(E) crc64_rocksoft(E) crc64(E) sg(E) 
ibmvscsi(E) scsi_transport_srp(E) ibmveth(E) vmx_crypto(E) fuse(E)
[ 5.548413] CPU: 1 PID: 789 Comm: systemd-udevd Tainted: G E 
6.4.0-rc7-next-20230620 #1
[ 5.548417] Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf06 
of:IBM,FW1030.20 (NH1030_058) hv:phyp pSeries
[ 5.548421] NIP: c0548090 LR: c0547fbc CTR: c04206f0
[ 5.548424] REGS: c000afb536f0 TRAP: 0380 Tainted: G E 
(6.4.0-rc7-next-20230620)
[ 5.548427] MSR: 8280b033  CR: 
88028202 XER: 2004
[ 5.548436] CFAR: c0547fc4 IRQMASK: 0 
[ 5.548436] GPR00: c0547fbc c000afb53990 c14b1600 
 
[ 5.548436] GPR04: 0cc0 34d8 0e6f 
ed5e02cab43c21e0 
[ 5.548436] GPR08: 0e6e 0058 001356ea 
2000 
[ 5.548436] GPR12: c04206f0 c013f300  
 
[ 5.548436] GPR16:    
c00092f43708 
[ 5.548436] GPR20: c00092f436b0  fff7dfff 
c000afa8 
[ 5.548436] GPR24: c2b87aa0 00b8 c0159914 
0cc0 
[ 5.548436] GPR28: 95bdcf954bc34e1b ca1fafc0  
c3019800 
[ 5.548473] NIP [c0548090] kmem_cache_alloc+0x1a0/0x420
[ 5.548480] LR [c0547fbc] kmem_cache_alloc+0xcc/0x420
[ 5.548485] Call Trace:
[ 5.548487] [c000afb53990] [c0547fbc] kmem_cache_alloc+0xcc/0x420 
(unreliable)
[ 5.548493] [c000afb53a00] [c0159914] vm_area_dup+0x44/0xf0
[ 5.548499] [c000afb53a40] [c015a638] dup_mmap+0x298/0x8b0
[ 5.548504] [c000afb53bb0] [c015acd0] dup_mm.constprop.0+0x80/0x180
[ 5.548509] [c000afb53bf0] [c015bdc0] copy_process+0xc00/0x1510
[ 5.548514] [c000afb53cb0] [c015c848] kernel_clone+0xb8/0x5a0
[ 5.548519] [c000afb53d30] [c015ceb8] __do_sys_clone+0x88/0xd0
[ 5.548524] [c000afb53e10] [c0033bcc] 
system_call_exception+0x13c/0x340
[ 5.548529] [c000afb53e50] [c000d05c] 
system_call_vectored_common+0x15c/0x2ec
[ 5.548534] --- interrupt: 3000 at 0x7fff87f0c178
[ 5.548538] NIP: 7fff87f0c178 LR:  CTR: 
[ 5.548540] REGS: c000afb53e80 TRAP: 3000 Tainted: G E 
(6.4.0-rc7-next-20230620)
[ 5.548544] MSR: 8000f033  CR: 44004204 
XER: 
[ 5.548552] IRQMASK: 0 
[ 5.548552] GPR00: 0078 7de8cb80 7fff88637500 
01200011 
[ 5.548552] GPR04:    
7fff888bd490 
[ 5.548552] GPR08: 0001   
 
[ 5.548552] GPR12:  7fff888c4c00 0002 
7de95698 
[ 5.548552] GPR16: 7de95690 7de95688 7de956a0 
0028 
[ 5.548552] GPR20: 000132bca308 0001 0001 
0315 
[ 5.548552] GPR24: 0003 0040  
0003 
[ 5.548552] GPR28:   7de8cf24 
0045 
[ 5.548586] NIP [7fff87f0c178] 0x7fff87f0c178
[ 5.548589] LR [] 0x0
[ 5.548591] --- interrupt: 3000
[ 5.548593] Code: e93f 7ce95214 e9070008 7f89502a e9270010 2e3c 
41920258 2c29 41820250 813f0028 e8ff00b8 38c80001 <7fdc482a> 7d3c4a14 
79250022 552ac03e 
[ 5.548605] ---[ end trace  ]---
[ 5.550849] pstore: backend (nvram) writing error (-1)
[ 5.550852] 
Starting Network Manager...
[ 5.566384] BUG: Bad rss-counter state mm:dc60f1c1 type:MM_ANONPAGES 
val:36
[ 5.568784] BUG: Bad rss-counter state mm:8eb9341b type:MM_ANONPAGES 
val:36
[ 5.689774] BUG: Bad rss-counter state mm:edbda345 type:MM_ANONPAGES 
val:36
[ 5.692187] BUG: Bad rss-counter state mm:3f7ec21f type:MM_ANONPAGES 
val:36
[ 5.705947] BUG: Bad rss-counter state mm:cdbb7cfd type:MM_ANONPAGES 
val:36
[ 6.550855] Kernel panic - not syncing: Fatal exception
[ 6.568226] Rebooting in 10 seconds..

The problem was introduced in 6.4.0-rc7-next-20230619. I tried git bisect, but
I am unsure of the result it reported. Bisect points to the following patch:

# git bisect bad
70c94cc2eefd4f98d222834cbe7512804977c2d4 is the first bad commit
commit 70c94cc2eefd4f98d222834cbe7512804977c2d4

Re: [PATCH v2 05/12] powerpc: add pte_free_defer() for pgtables sharing page

2023-06-20 Thread Jason Gunthorpe
On Tue, Jun 20, 2023 at 12:47:54AM -0700, Hugh Dickins wrote:
> Add powerpc-specific pte_free_defer(), to call pte_free() via call_rcu().
> pte_free_defer() will be called inside khugepaged's retract_page_tables()
> loop, where allocating extra memory cannot be relied upon.  This precedes
> the generic version to avoid build breakage from incompatible pgtable_t.
> 
> This is awkward because the struct page contains only one rcu_head, but
> that page may be shared between PTE_FRAG_NR pagetables, each wanting to
> use the rcu_head at the same time: account concurrent deferrals with a
> heightened refcount, only the first making use of the rcu_head, but
> re-deferring if more deferrals arrived during its grace period.

You didn't answer my question why we can't just move the rcu to the
actual free page?

Since PPC doesn't recycle the frags, we don't need to carefully RCU
free each frag, we just need to RCU free the entire page when it
becomes eventually free?

Jason


Re: [PATCH mm-unstable v2 06/10] kvm/powerpc: make radix page tables RCU safe

2023-06-20 Thread Nicholas Piggin
On Tue Jun 20, 2023 at 6:00 PM AEST, Yu Zhao wrote:
> On Tue, Jun 20, 2023 at 12:33 AM Nicholas Piggin  wrote:
> >
> > On Sat May 27, 2023 at 9:44 AM AEST, Yu Zhao wrote:
> > > KVM page tables are currently not RCU safe against remapping, i.e.,
> > > kvmppc_unmap_free_pmd_entry_table() et al. The previous
> >
> > Minor nit but the "page table" is not RCU-safe against something. It
> > is RCU-freed, and therefore some algorithm that accesses it can have
> > the existence guarantee provided by RCU (usually there still needs
> > to be more to it).
> >
> > > mmu_notifier_ops members rely on kvm->mmu_lock to synchronize with
> > > that operation.
> > >
> > > However, the new mmu_notifier_ops member test_clear_young() provides
> > > a fast path that does not take kvm->mmu_lock. To implement
> > > kvm_arch_test_clear_young() for that path, orphan page tables need to
> > > be freed by RCU.
> >
> > Short version: clear the referenced bit using RCU instead of MMU lock
> > to protect against page table freeing, and there is no problem with
> > clearing the bit in a table that has been freed.
> >
> > Seems reasonable.
>
> Thanks. All above points taken.
>
> > > Unmapping, specifically kvm_unmap_radix(), does not free page tables,
> > > hence not a concern.
> >
> > Not sure if you really need to make the distinction about why the page
> > table is freed, we might free them via unmapping. The point is just
> > anything that frees them while there can be concurrent access, right?
>
> Correct.
>
> > > Signed-off-by: Yu Zhao 
> > > ---
> > >  arch/powerpc/kvm/book3s_64_mmu_radix.c | 6 --
> > >  1 file changed, 4 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
> > > b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> > > index 461307b89c3a..3b65b3b11041 100644
> > > --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> > > +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> > > @@ -1469,13 +1469,15 @@ int kvmppc_radix_init(void)
> > >  {
> > >   unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
> > >
> > > - kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, 
> > > pte_ctor);
> > > + kvm_pte_cache = kmem_cache_create("kvm-pte", size, size,
> > > +   SLAB_TYPESAFE_BY_RCU, pte_ctor);
> > >   if (!kvm_pte_cache)
> > >   return -ENOMEM;
> > >
> > >   size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
> > >
> > > - kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, 
> > > pmd_ctor);
> > > + kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size,
> > > +   SLAB_TYPESAFE_BY_RCU, pmd_ctor);
> > >   if (!kvm_pmd_cache) {
> > >   kmem_cache_destroy(kvm_pte_cache);
> > >   return -ENOMEM;
> >
> > KVM PPC HV radix PUD level page tables use the arch/powerpc allocators
> > (for some reason), which are not RCU freed. I think you need them too?
>
> We don't. The use of the arch/powerpc allocator for PUD tables seems
> appropriate to me because, unlike PMD/PTE tables, we never free PUD
> tables during the lifetime of a VM:

Ah you're right, the pud_free only comes from the double alloc case
so it's never visible to concurrent threads.

> * We don't free PUD/PMD/PTE tables when they become empty, i.e., not
> mapping any pages but still attached. (We could in theory, as
> x86/aarch64 do.)

We may try to do that at some point, but that's not related to your
patch for now so no worries.

Thanks,
Nick


Re: [PATCH v9 00/14] pci: Work around ASMedia ASM2824 PCIe link training failures

2023-06-20 Thread Maciej W. Rozycki
On Fri, 16 Jun 2023, Bjorn Helgaas wrote:

> I agree that as I rearranged it, the workaround doesn't apply in all
> cases simultaneously.  Maybe not ideal, but maybe not terrible either.
> Looking at it again, maybe it would have made more sense to move the
> pcie_wait_for_link_delay() change to the last patch along with the
> pci_dev_wait() change.  I dunno.

 I think the order of the changes is not important enough to justify 
spending a lot of time and mental effort on it.  You decided, so be it.  
Thank you for the effort you put into this review.

 With this series out of the way I have now posted a small clean-up for 
SBR code duplication between PCI core and an InfiniBand driver I came 
across in the course of working on this series.  See 
.

 Please have a look at your convenience.

  Maciej


[PATCH mm 10/12] mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()

2023-06-20 Thread Hugh Dickins
Bring collapse_and_free_pmd() back into collapse_pte_mapped_thp().
It does need mmap_read_lock(), but it does not need mmap_write_lock(),
nor vma_start_write() nor i_mmap lock nor anon_vma lock.  All racing
paths are relying on pte_offset_map_lock() and pmd_lock(), so use those.

Follow the pattern in retract_page_tables(); and using pte_free_defer()
removes most of the need for tlb_remove_table_sync_one() here; but call
pmdp_get_lockless_sync() to use it in the PAE case.

First check the VMA, in case page tables are being torn down: from JannH.
Confirm the preliminary find_pmd_or_thp_or_none() once page lock has been
acquired and the page looks suitable: from then on its state is stable.

However, collapse_pte_mapped_thp() was doing something others don't:
freeing a page table still containing "valid" entries.  i_mmap lock did
stop a racing truncate from double-freeing those pages, but we prefer
collapse_pte_mapped_thp() to clear the entries as usual.  Their TLB
flush can wait until the pmdp_collapse_flush() which follows, but the
mmu_notifier_invalidate_range_start() has to be done earlier.

Do the "step 1" checking loop without mmu_notifier: it wouldn't be good
for khugepaged to keep on repeatedly invalidating a range which is then
found unsuitable e.g. contains COWs.  "step 2", which does the clearing,
must then be more careful (after dropping ptl to do mmu_notifier), with
abort prepared to correct the accounting like "step 3".  But with those
entries now cleared, "step 4" (after dropping ptl to do pmd_lock) is kept
safe by the huge page lock, which stops new PTEs from being faulted in.

Signed-off-by: Hugh Dickins 
---
This is the version which applies to mm-everything or linux-next.

 mm/khugepaged.c | 174 ++--
 1 file changed, 78 insertions(+), 96 deletions(-)

--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1483,7 +1483,7 @@ static bool khugepaged_add_pte_mapped_th
return ret;
 }
 
-/* hpage must be locked, and mmap_lock must be held in write */
+/* hpage must be locked, and mmap_lock must be held */
 static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmdp, struct page *hpage)
 {
@@ -1495,7 +1495,7 @@ static int set_huge_pmd(struct vm_area_s
};
 
VM_BUG_ON(!PageTransHuge(hpage));
-   mmap_assert_write_locked(vma->vm_mm);
+   mmap_assert_locked(vma->vm_mm);
 
if (do_set_pmd(&vmf, hpage))
return SCAN_FAIL;
@@ -1504,48 +1504,6 @@ static int set_huge_pmd(struct vm_area_s
return SCAN_SUCCEED;
 }
 
-/*
- * A note about locking:
- * Trying to take the page table spinlocks would be useless here because those
- * are only used to synchronize:
- *
- *  - modifying terminal entries (ones that point to a data page, not to 
another
- *page table)
- *  - installing *new* non-terminal entries
- *
- * Instead, we need roughly the same kind of protection as free_pgtables() or
- * mm_take_all_locks() (but only for a single VMA):
- * The mmap lock together with this VMA's rmap locks covers all paths towards
- * the page table entries we're messing with here, except for hardware page
- * table walks and lockless_pages_from_mm().
- */
-static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct 
*vma,
- unsigned long addr, pmd_t *pmdp)
-{
-   pmd_t pmd;
-   struct mmu_notifier_range range;
-
-   mmap_assert_write_locked(mm);
-   if (vma->vm_file)
-   lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
-   /*
-* All anon_vmas attached to the VMA have the same root and are
-* therefore locked by the same lock.
-*/
-   if (vma->anon_vma)
-   lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
-
-   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
-   addr + HPAGE_PMD_SIZE);
-   mmu_notifier_invalidate_range_start(&range);
-   pmd = pmdp_collapse_flush(vma, addr, pmdp);
-   tlb_remove_table_sync_one();
-   mmu_notifier_invalidate_range_end(&range);
-   mm_dec_nr_ptes(mm);
-   page_table_check_pte_clear_range(mm, addr, pmd);
-   pte_free(mm, pmd_pgtable(pmd));
-}
-
 /**
  * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
  * address haddr.
@@ -1561,26 +1519,29 @@ static void collapse_and_free_pmd(struct
 int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd)
 {
+   struct mmu_notifier_range range;
+   bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
-   pmd_t *pmd;
-   spinlock_t *ptl;
-   int count = 0, result = SCAN_FAIL;
+   pmd_t *pmd, pgt_pmd;
+   spinlock_t *pml, *ptl;
+   int nr_ptes = 0, result = SCAN_FAIL;

Re: [PATCH mm-unstable v2 06/10] kvm/powerpc: make radix page tables RCU safe

2023-06-20 Thread Yu Zhao
On Tue, Jun 20, 2023 at 12:33 AM Nicholas Piggin  wrote:
>
> On Sat May 27, 2023 at 9:44 AM AEST, Yu Zhao wrote:
> > KVM page tables are currently not RCU safe against remapping, i.e.,
> > kvmppc_unmap_free_pmd_entry_table() et al. The previous
>
> Minor nit but the "page table" is not RCU-safe against something. It
> is RCU-freed, and therefore some algorithm that accesses it can have
> the existence guarantee provided by RCU (usually there still needs
> to be more to it).
>
> > mmu_notifier_ops members rely on kvm->mmu_lock to synchronize with
> > that operation.
> >
> > However, the new mmu_notifier_ops member test_clear_young() provides
> > a fast path that does not take kvm->mmu_lock. To implement
> > kvm_arch_test_clear_young() for that path, orphan page tables need to
> > be freed by RCU.
>
> Short version: clear the referenced bit using RCU instead of MMU lock
> to protect against page table freeing, and there is no problem with
> clearing the bit in a table that has been freed.
>
> Seems reasonable.

Thanks. All above points taken.

> > Unmapping, specifically kvm_unmap_radix(), does not free page tables,
> > hence not a concern.
>
> Not sure if you really need to make the distinction about why the page
> table is freed, we might free them via unmapping. The point is just
> anything that frees them while there can be concurrent access, right?

Correct.

> > Signed-off-by: Yu Zhao 
> > ---
> >  arch/powerpc/kvm/book3s_64_mmu_radix.c | 6 --
> >  1 file changed, 4 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
> > b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> > index 461307b89c3a..3b65b3b11041 100644
> > --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> > +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> > @@ -1469,13 +1469,15 @@ int kvmppc_radix_init(void)
> >  {
> >   unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
> >
> > - kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
> > + kvm_pte_cache = kmem_cache_create("kvm-pte", size, size,
> > +   SLAB_TYPESAFE_BY_RCU, pte_ctor);
> >   if (!kvm_pte_cache)
> >   return -ENOMEM;
> >
> >   size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
> >
> > - kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
> > + kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size,
> > +   SLAB_TYPESAFE_BY_RCU, pmd_ctor);
> >   if (!kvm_pmd_cache) {
> >   kmem_cache_destroy(kvm_pte_cache);
> >   return -ENOMEM;
>
> KVM PPC HV radix PUD level page tables use the arch/powerpc allocators
> (for some reason), which are not RCU freed. I think you need them too?

We don't. The use of the arch/powerpc allocator for PUD tables seems
appropriate to me because, unlike PMD/PTE tables, we never free PUD
tables during the lifetime of a VM:
* We don't free PUD/PMD/PTE tables when they become empty, i.e., not
mapping any pages but still attached. (We could in theory, as
x86/aarch64 do.)
* We have to free PMD/PTE tables when we replace them with 1GB/2MB
pages. (Otherwise we'd lose track of detached tables.) And we
currently don't support huge pages at P4D level, so we never detach
and free PUD tables.


[PATCH v2 12/12] mm: delete mmap_write_trylock() and vma_try_start_write()

2023-06-20 Thread Hugh Dickins
mmap_write_trylock() and vma_try_start_write() were added just for
khugepaged, but now it has no use for them: delete.

Signed-off-by: Hugh Dickins 
---
 include/linux/mm.h| 17 -
 include/linux/mmap_lock.h | 10 --
 2 files changed, 27 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3c2e56980853..9b24f8fbf899 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -690,21 +690,6 @@ static inline void vma_start_write(struct vm_area_struct 
*vma)
up_write(&vma->vm_lock->lock);
 }
 
-static inline bool vma_try_start_write(struct vm_area_struct *vma)
-{
-   int mm_lock_seq;
-
-   if (__is_vma_write_locked(vma, &mm_lock_seq))
-   return true;
-
-   if (!down_write_trylock(&vma->vm_lock->lock))
-   return false;
-
-   vma->vm_lock_seq = mm_lock_seq;
-   up_write(&vma->vm_lock->lock);
-   return true;
-}
-
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 {
int mm_lock_seq;
@@ -730,8 +715,6 @@ static inline bool vma_start_read(struct vm_area_struct 
*vma)
{ return false; }
 static inline void vma_end_read(struct vm_area_struct *vma) {}
 static inline void vma_start_write(struct vm_area_struct *vma) {}
-static inline bool vma_try_start_write(struct vm_area_struct *vma)
-   { return true; }
 static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
 static inline void vma_mark_detached(struct vm_area_struct *vma,
 bool detached) {}
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index aab8f1b28d26..d1191f02c7fa 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -112,16 +112,6 @@ static inline int mmap_write_lock_killable(struct 
mm_struct *mm)
return ret;
 }
 
-static inline bool mmap_write_trylock(struct mm_struct *mm)
-{
-   bool ret;
-
-   __mmap_lock_trace_start_locking(mm, true);
-   ret = down_write_trylock(&mm->mmap_lock) != 0;
-   __mmap_lock_trace_acquire_returned(mm, true, ret);
-   return ret;
-}
-
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
__mmap_lock_trace_released(mm, true);
-- 
2.35.3



[PATCH v2 11/12] mm/khugepaged: delete khugepaged_collapse_pte_mapped_thps()

2023-06-20 Thread Hugh Dickins
Now that retract_page_tables() can retract page tables reliably, without
depending on trylocks, delete all the apparatus for khugepaged to try
again later: khugepaged_collapse_pte_mapped_thps() etc; and free up the
per-mm memory which was set aside for that in the khugepaged_mm_slot.

But one part of that is worth keeping: when hpage_collapse_scan_file()
found SCAN_PTE_MAPPED_HUGEPAGE, that address was noted in the mm_slot
to be tried for retraction later - catching, for example, page tables
where a reversible mprotect() of a portion had required splitting the
pmd, but now it can be recollapsed.  Call collapse_pte_mapped_thp()
directly in this case (why was it deferred before?  I assume an issue
with needing mmap_lock for write, but now it's only needed for read).

Signed-off-by: Hugh Dickins 
---
 mm/khugepaged.c | 125 +++-
 1 file changed, 16 insertions(+), 109 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 060ac8789a1e..06c659e6a89e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -92,8 +92,6 @@ static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, 
MM_SLOTS_HASH_BITS);
 
 static struct kmem_cache *mm_slot_cache __read_mostly;
 
-#define MAX_PTE_MAPPED_THP 8
-
 struct collapse_control {
bool is_khugepaged;
 
@@ -107,15 +105,9 @@ struct collapse_control {
 /**
  * struct khugepaged_mm_slot - khugepaged information per mm that is being 
scanned
  * @slot: hash lookup from mm to mm_slot
- * @nr_pte_mapped_thp: number of pte mapped THP
- * @pte_mapped_thp: address array corresponding pte mapped THP
  */
 struct khugepaged_mm_slot {
struct mm_slot slot;
-
-   /* pte-mapped THP in this mm */
-   int nr_pte_mapped_thp;
-   unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
 };
 
 /**
@@ -1441,50 +1433,6 @@ static void collect_mm_slot(struct khugepaged_mm_slot 
*mm_slot)
 }
 
 #ifdef CONFIG_SHMEM
-/*
- * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
- * khugepaged should try to collapse the page table.
- *
- * Note that following race exists:
- * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A,
- * emptying the A's ->pte_mapped_thp[] array.
- * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and
- * retract_page_tables() finds a VMA in mm_struct A mapping the same extent
- * (at virtual address X) and adds an entry (for X) into mm_struct A's
- * ->pte-mapped_thp[] array.
- * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X,
- * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry
- * (for X) into mm_struct A's ->pte-mapped_thp[] array.
- * Thus, it's possible the same address is added multiple times for the same
- * mm_struct.  Should this happen, we'll simply attempt
- * collapse_pte_mapped_thp() multiple times for the same address, under the 
same
- * exclusive mmap_lock, and assuming the first call is successful, subsequent
- * attempts will return quickly (without grabbing any additional locks) when
- * a huge pmd is found in find_pmd_or_thp_or_none().  Since this is a cheap
- * check, and since this is a rare occurrence, the cost of preventing this
- * "multiple-add" is thought to be more expensive than just handling it, should
- * it occur.
- */
-static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr)
-{
-   struct khugepaged_mm_slot *mm_slot;
-   struct mm_slot *slot;
-   bool ret = false;
-
-   VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
-
-   spin_lock(&khugepaged_mm_lock);
-   slot = mm_slot_lookup(mm_slots_hash, mm);
-   mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
-   if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) 
{
-   mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
-   ret = true;
-   }
-   spin_unlock(&khugepaged_mm_lock);
-   return ret;
-}
-
 /* hpage must be locked, and mmap_lock must be held */
 static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmdp, struct page *hpage)
@@ -1706,29 +1654,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr,
return result;
 }
 
-static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot 
*mm_slot)
-{
-   struct mm_slot *slot = &mm_slot->slot;
-   struct mm_struct *mm = slot->mm;
-   int i;
-
-   if (likely(mm_slot->nr_pte_mapped_thp == 0))
-   return;
-
-   if (!mmap_write_trylock(mm))
-   return;
-
-   if (unlikely(hpage_collapse_test_exit(mm)))
-   goto out;
-
-   for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
-   collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false);
-
-out:
-   mm_slot->nr_pte_mapped_thp = 0;
-   mmap_write_unlock(mm);
-}
-
 static void retract_page_tables(struct 

[PATCH v2 10/12] mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()

2023-06-20 Thread Hugh Dickins
Bring collapse_and_free_pmd() back into collapse_pte_mapped_thp().
It does need mmap_read_lock(), but it does not need mmap_write_lock(),
nor vma_start_write() nor i_mmap lock nor anon_vma lock.  All racing
paths are relying on pte_offset_map_lock() and pmd_lock(), so use those.

Follow the pattern in retract_page_tables(); and using pte_free_defer()
removes most of the need for tlb_remove_table_sync_one() here; but call
pmdp_get_lockless_sync() to use it in the PAE case.

First check the VMA, in case page tables are being torn down: from JannH.
Confirm the preliminary find_pmd_or_thp_or_none() once page lock has been
acquired and the page looks suitable: from then on its state is stable.

However, collapse_pte_mapped_thp() was doing something others don't:
freeing a page table still containing "valid" entries.  i_mmap lock did
stop a racing truncate from double-freeing those pages, but we prefer
collapse_pte_mapped_thp() to clear the entries as usual.  Their TLB
flush can wait until the pmdp_collapse_flush() which follows, but the
mmu_notifier_invalidate_range_start() has to be done earlier.

Do the "step 1" checking loop without mmu_notifier: it wouldn't be good
for khugepaged to keep on repeatedly invalidating a range which is then
found unsuitable e.g. contains COWs.  "step 2", which does the clearing,
must then be more careful (after dropping ptl to do mmu_notifier), with
abort prepared to correct the accounting like "step 3".  But with those
entries now cleared, "step 4" (after dropping ptl to do pmd_lock) is kept
safe by the huge page lock, which stops new PTEs from being faulted in.

Signed-off-by: Hugh Dickins 
---
 mm/khugepaged.c | 172 ++--
 1 file changed, 77 insertions(+), 95 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f7a0f7673127..060ac8789a1e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1485,7 +1485,7 @@ static bool khugepaged_add_pte_mapped_thp(struct 
mm_struct *mm,
return ret;
 }
 
-/* hpage must be locked, and mmap_lock must be held in write */
+/* hpage must be locked, and mmap_lock must be held */
 static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmdp, struct page *hpage)
 {
@@ -1497,7 +1497,7 @@ static int set_huge_pmd(struct vm_area_struct *vma, 
unsigned long addr,
};
 
VM_BUG_ON(!PageTransHuge(hpage));
-   mmap_assert_write_locked(vma->vm_mm);
+   mmap_assert_locked(vma->vm_mm);
 
if (do_set_pmd(&vmf, hpage))
return SCAN_FAIL;
@@ -1506,48 +1506,6 @@ static int set_huge_pmd(struct vm_area_struct *vma, 
unsigned long addr,
return SCAN_SUCCEED;
 }
 
-/*
- * A note about locking:
- * Trying to take the page table spinlocks would be useless here because those
- * are only used to synchronize:
- *
- *  - modifying terminal entries (ones that point to a data page, not to 
another
- *page table)
- *  - installing *new* non-terminal entries
- *
- * Instead, we need roughly the same kind of protection as free_pgtables() or
- * mm_take_all_locks() (but only for a single VMA):
- * The mmap lock together with this VMA's rmap locks covers all paths towards
- * the page table entries we're messing with here, except for hardware page
- * table walks and lockless_pages_from_mm().
- */
-static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct 
*vma,
- unsigned long addr, pmd_t *pmdp)
-{
-   pmd_t pmd;
-   struct mmu_notifier_range range;
-
-   mmap_assert_write_locked(mm);
-   if (vma->vm_file)
-   lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
-   /*
-* All anon_vmas attached to the VMA have the same root and are
-* therefore locked by the same lock.
-*/
-   if (vma->anon_vma)
-   lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
-
-   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
-   addr + HPAGE_PMD_SIZE);
-   mmu_notifier_invalidate_range_start(&range);
-   pmd = pmdp_collapse_flush(vma, addr, pmdp);
-   tlb_remove_table_sync_one();
-   mmu_notifier_invalidate_range_end(&range);
-   mm_dec_nr_ptes(mm);
-   page_table_check_pte_clear_range(mm, addr, pmd);
-   pte_free(mm, pmd_pgtable(pmd));
-}
-
 /**
  * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
  * address haddr.
@@ -1563,26 +1521,29 @@ static void collapse_and_free_pmd(struct mm_struct *mm, 
struct vm_area_struct *v
 int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd)
 {
+   struct mmu_notifier_range range;
+   bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
-   pmd_t *pmd;
-   spinlock_t *ptl;
-   

[PATCH v2 09/12] mm/khugepaged: retract_page_tables() without mmap or vma lock

2023-06-20 Thread Hugh Dickins
Simplify shmem and file THP collapse's retract_page_tables(), and relax
its locking: to improve its success rate and to lessen impact on others.

Instead of its MADV_COLLAPSE case doing set_huge_pmd() at target_addr of
target_mm, leave that part of the work to madvise_collapse() calling
collapse_pte_mapped_thp() afterwards: just adjust collapse_file()'s
result code to arrange for that.  That spares retract_page_tables() four
arguments; and since it will be successful in retracting all of the page
tables expected of it, no need to track and return a result code itself.

It needs i_mmap_lock_read(mapping) for traversing the vma interval tree,
but it does not need i_mmap_lock_write() for that: page_vma_mapped_walk()
allows for pte_offset_map_lock() etc to fail, and uses pmd_lock() for
THPs.  retract_page_tables() just needs to use those same spinlocks to
exclude it briefly, while transitioning pmd from page table to none: so
restore its use of pmd_lock() inside of which pte lock is nested.

Users of pte_offset_map_lock() etc all now allow for them to fail:
so retract_page_tables() now has no use for mmap_write_trylock() or
vma_try_start_write().  In common with rmap and page_vma_mapped_walk(),
it does not even need the mmap_read_lock().

But those users do expect the page table to remain a good page table,
until they unlock and rcu_read_unlock(): so the page table cannot be
freed immediately, but rather by the recently added pte_free_defer().

Use the (usually a no-op) pmdp_get_lockless_sync() to send an interrupt
when PAE, and pmdp_collapse_flush() did not already do so: to make sure
that the start,pmdp_get_lockless(),end sequence in __pte_offset_map()
cannot pick up a pmd entry with mismatched pmd_low and pmd_high.

retract_page_tables() can be enhanced to replace_page_tables(), which
inserts the final huge pmd without mmap lock: going through an invalid
state instead of pmd_none() followed by fault.  But that enhancement
does raise some more questions: leave it until a later release.

Signed-off-by: Hugh Dickins 
---
 mm/khugepaged.c | 184 
 1 file changed, 75 insertions(+), 109 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 1083f0e38a07..f7a0f7673127 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1617,9 +1617,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr,
break;
case SCAN_PMD_NONE:
/*
-* In MADV_COLLAPSE path, possible race with khugepaged where
-* all pte entries have been removed and pmd cleared.  If so,
-* skip all the pte checks and just update the pmd mapping.
+* All pte entries have been removed and pmd cleared.
+* Skip all the pte checks and just update the pmd mapping.
 */
goto maybe_install_pmd;
default:
@@ -1748,123 +1747,88 @@ static void khugepaged_collapse_pte_mapped_thps(struct 
khugepaged_mm_slot *mm_sl
mmap_write_unlock(mm);
 }
 
-static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
-  struct mm_struct *target_mm,
-  unsigned long target_addr, struct page *hpage,
-  struct collapse_control *cc)
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 {
struct vm_area_struct *vma;
-   int target_result = SCAN_FAIL;
 
-   i_mmap_lock_write(mapping);
+   i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-   int result = SCAN_FAIL;
-   struct mm_struct *mm = NULL;
-   unsigned long addr = 0;
-   pmd_t *pmd;
-   bool is_target = false;
+   struct mmu_notifier_range range;
+   struct mm_struct *mm;
+   unsigned long addr;
+   pmd_t *pmd, pgt_pmd;
+   spinlock_t *pml;
+   spinlock_t *ptl;
+   bool skipped_uffd = false;
 
/*
 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
-* got written to. These VMAs are likely not worth investing
-* mmap_write_lock(mm) as PMD-mapping is likely to be split
-* later.
-*
-* Note that vma->anon_vma check is racy: it can be set up after
-* the check but before we took mmap_lock by the fault path.
-* But page lock would prevent establishing any new ptes of the
-* page, so we are safe.
-*
-* An alternative would be drop the check, but check that page
-* table is clear before calling pmdp_collapse_flush() under
-* ptl. It has higher chance to recover THP for the VMA, but
-* has higher cost too. It would also probably require 

[PATCH v2 08/12] mm/pgtable: add pte_free_defer() for pgtable as page

2023-06-20 Thread Hugh Dickins
Add the generic pte_free_defer(), to call pte_free() via call_rcu().
pte_free_defer() will be called inside khugepaged's retract_page_tables()
loop, where allocating extra memory cannot be relied upon.  This version
suits all those architectures which use an unfragmented page for one page
table (none of whose pte_free()s use the mm arg which was passed to it).

Signed-off-by: Hugh Dickins 
---
 include/linux/mm_types.h |  4 
 include/linux/pgtable.h  |  2 ++
 mm/pgtable-generic.c | 20 
 3 files changed, 26 insertions(+)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1667a1bdb8a8..09335fa28c41 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -144,6 +144,10 @@ struct page {
struct {/* Page table pages */
unsigned long _pt_pad_1;/* compound_head */
pgtable_t pmd_huge_pte; /* protected by page->ptl */
+   /*
+* A PTE page table page might be freed by use of
+* rcu_head: which overlays those two fields above.
+*/
unsigned long _pt_pad_2;/* mapping */
union {
struct mm_struct *pt_mm; /* x86 pgd, s390 */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 525f1782b466..d18d3e963967 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -112,6 +112,8 @@ static inline void pte_unmap(pte_t *pte)
 }
 #endif
 
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);
+
 /* Find an entry in the second-level page table.. */
 #ifndef pmd_offset
 static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 5e85a625ab30..ab3741064bb8 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /*
@@ -230,6 +231,25 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, 
unsigned long address,
return pmd;
 }
 #endif
+
+/* arch define pte_free_defer in asm/pgalloc.h for its own implementation */
+#ifndef pte_free_defer
+static void pte_free_now(struct rcu_head *head)
+{
+   struct page *page;
+
+   page = container_of(head, struct page, rcu_head);
+   pte_free(NULL /* mm not passed and not used */, (pgtable_t)page);
+}
+
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+   struct page *page;
+
+   page = pgtable;
+   call_rcu(&page->rcu_head, pte_free_now);
+}
+#endif /* pte_free_defer */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \
-- 
2.35.3



[PATCH v2 07/12] s390: add pte_free_defer() for pgtables sharing page

2023-06-20 Thread Hugh Dickins
Add s390-specific pte_free_defer(), to call pte_free() via call_rcu().
pte_free_defer() will be called inside khugepaged's retract_page_tables()
loop, where allocating extra memory cannot be relied upon.  This precedes
the generic version to avoid build breakage from incompatible pgtable_t.

This version is more complicated than others: because s390 fits two 2K
page tables into one 4K page (so page->rcu_head must be shared between
both halves), and already uses page->lru (which page->rcu_head overlays)
to list any free halves; with clever management by page->_refcount bits.

Build upon the existing management, adjusted to follow a new rule: that
a page is not linked to mm_context_t::pgtable_list while either half is
pending free, by either tlb_remove_table() or pte_free_defer(); but is
afterwards either relinked to the list (if other half is allocated), or
freed (if other half is free): by __tlb_remove_table() in both cases.

This rule ensures that page->lru is no longer in use while page->rcu_head
may be needed for use by pte_free_defer().  And a fortuitous byproduct of
following this rule is that page_table_free() no longer needs its curious
two-step manipulation of _refcount - read commit c2c224932fd0 ("s390/mm:
fix 2KB pgtable release race") for what to think of there.  But it does
not solve the problem that two halves may need rcu_head at the same time.

For that, add HHead bits between s390's AAllocated and PPending bits in
the upper byte of page->_refcount: then the second pte_free_defer() can
see that rcu_head is already in use, and the RCU callee pte_free_half()
can see that it needs to make a further call_rcu() for that other half.

page_table_alloc() set the page->pt_mm field, so __tlb_remove_table()
knows where to link the freed half while its other half is allocated.
But linking to the list needs mm->context.lock: and although AA bit set
guarantees that pt_mm must still be valid, it does not guarantee that mm
is still valid an instant later: so acquiring mm->context.lock would not
be safe.  For now, use a static global mm_pgtable_list_lock instead:
then a soon-to-follow commit will split it per-mm as before (probably by
using a SLAB_TYPESAFE_BY_RCU structure for the list head and its lock);
and update the commentary on the pgtable_list.

Signed-off-by: Hugh Dickins 
---
 arch/s390/include/asm/pgalloc.h |   4 +
 arch/s390/mm/pgalloc.c  | 205 +++-
 include/linux/mm_types.h|   2 +-
 3 files changed, 154 insertions(+), 57 deletions(-)

diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 17eb618f1348..89a9d5ef94f8 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -143,6 +143,10 @@ static inline void pmd_populate(struct mm_struct *mm,
 #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte)
 #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte)
 
+/* arch use pte_free_defer() implementation in arch/s390/mm/pgalloc.c */
+#define pte_free_defer pte_free_defer
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);
+
 void vmem_map_init(void);
 void *vmem_crst_alloc(unsigned long val);
 pte_t *vmem_pte_alloc(void);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 66ab68db9842..11983a3ff95a 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -159,6 +159,11 @@ void page_table_free_pgste(struct page *page)
 
 #endif /* CONFIG_PGSTE */
 
+/*
+ * Temporarily use a global spinlock instead of mm->context.lock.
+ * This will be replaced by a per-mm spinlock in a followup commit.
+ */
+static DEFINE_SPINLOCK(mm_pgtable_list_lock);
 /*
  * A 2KB-pgtable is either upper or lower half of a normal page.
  * The second half of the page may be unused or used as another
@@ -172,7 +177,7 @@ void page_table_free_pgste(struct page *page)
  * When a parent page gets fully allocated it contains 2KB-pgtables in both
  * upper and lower halves and is removed from mm_context_t::pgtable_list.
  *
- * When 2KB-pgtable is freed from to fully allocated parent page that
+ * When 2KB-pgtable is freed from the fully allocated parent page that
  * page turns partially allocated and added to mm_context_t::pgtable_list.
  *
  * If 2KB-pgtable is freed from the partially allocated parent page that
@@ -182,16 +187,24 @@ void page_table_free_pgste(struct page *page)
  * As follows from the above, no unallocated or fully allocated parent
  * pages are contained in mm_context_t::pgtable_list.
  *
+ * NOTE NOTE NOTE: The commentary above and below has not yet been updated:
+ * the new rule is that a page is not linked to mm_context_t::pgtable_list
+ * while either half is pending free by any method; but afterwards is
+ * either relinked to it, or freed, by __tlb_remove_table().  This allows
+ * pte_free_defer() to use the page->rcu_head (which overlays page->lru).
+ *
  * The upper byte (bits 24-31) of the parent page _refcount is used
  * for tracking 

[PATCH v2 06/12] sparc: add pte_free_defer() for pte_t *pgtable_t

2023-06-20 Thread Hugh Dickins
Add sparc-specific pte_free_defer(), to call pte_free() via call_rcu().
pte_free_defer() will be called inside khugepaged's retract_page_tables()
loop, where allocating extra memory cannot be relied upon.  This precedes
the generic version to avoid build breakage from incompatible pgtable_t.

sparc32 supports pagetables sharing a page, but does not support THP;
sparc64 supports THP, but does not support pagetables sharing a page.
So the sparc-specific pte_free_defer() is as simple as the generic one,
except for converting between pte_t *pgtable_t and struct page *.

Signed-off-by: Hugh Dickins 
---
 arch/sparc/include/asm/pgalloc_64.h |  4 
 arch/sparc/mm/init_64.c | 16 
 2 files changed, 20 insertions(+)

diff --git a/arch/sparc/include/asm/pgalloc_64.h 
b/arch/sparc/include/asm/pgalloc_64.h
index 7b5561d17ab1..caa7632be4c2 100644
--- a/arch/sparc/include/asm/pgalloc_64.h
+++ b/arch/sparc/include/asm/pgalloc_64.h
@@ -65,6 +65,10 @@ pgtable_t pte_alloc_one(struct mm_struct *mm);
 void pte_free_kernel(struct mm_struct *mm, pte_t *pte);
 void pte_free(struct mm_struct *mm, pgtable_t ptepage);
 
+/* arch use pte_free_defer() implementation in arch/sparc/mm/init_64.c */
+#define pte_free_defer pte_free_defer
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);
+
 #define pmd_populate_kernel(MM, PMD, PTE)  pmd_set(MM, PMD, PTE)
 #define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE)
 
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 04f9db0c3111..0d7fd793924c 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2930,6 +2930,22 @@ void pgtable_free(void *table, bool is_page)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pte_free_now(struct rcu_head *head)
+{
+   struct page *page;
+
+   page = container_of(head, struct page, rcu_head);
+   __pte_free((pgtable_t)page_address(page));
+}
+
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+   struct page *page;
+
+   page = virt_to_page(pgtable);
+   call_rcu(&page->rcu_head, pte_free_now);
+}
+
 void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
  pmd_t *pmd)
 {
-- 
2.35.3



Re: [PATCH mm-unstable v2 07/10] kvm/powerpc: add kvm_arch_test_clear_young()

2023-06-20 Thread Nicholas Piggin
On Sat May 27, 2023 at 9:44 AM AEST, Yu Zhao wrote:
> Implement kvm_arch_test_clear_young() to support the fast path in
> mmu_notifier_ops->test_clear_young().
>
> It focuses on a simple case, i.e., radix MMU sets the accessed bit in
> KVM PTEs and VMs are not nested, where it can rely on RCU and
> pte_xchg() to safely clear the accessed bit without taking
> kvm->mmu_lock. Complex cases fall back to the existing slow path
> where kvm->mmu_lock is then taken.
>
> Signed-off-by: Yu Zhao 
> ---
>  arch/powerpc/include/asm/kvm_host.h|  8 
>  arch/powerpc/include/asm/kvm_ppc.h |  1 +
>  arch/powerpc/kvm/book3s.c  |  6 +++
>  arch/powerpc/kvm/book3s.h  |  1 +
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 59 ++
>  arch/powerpc/kvm/book3s_hv.c   |  5 +++
>  6 files changed, 80 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/kvm_host.h 
> b/arch/powerpc/include/asm/kvm_host.h
> index 14ee0dece853..75c260ea8a9e 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -883,4 +883,12 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu 
> *vcpu, int cpu) {}
>  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
>  static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
>  
> +#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
> +static inline bool kvm_arch_has_test_clear_young(void)
> +{
> + return IS_ENABLED(CONFIG_KVM_BOOK3S_HV_POSSIBLE) &&
> +cpu_has_feature(CPU_FTR_HVMODE) && 
> cpu_has_feature(CPU_FTR_ARCH_300) &&
> +radix_enabled();

This could probably be radix_enabled() && !kvmhv_on_pseries(). Although
unclear why not nested hypervisor... I'd have to think about it a bit
more. Do you have any idea what might go wrong, or just didn't have the
time to consider it? (Not just powerpc nested but any others).
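
i.e., roughly (an editorial sketch of the condition Nick suggests, keeping the
IS_ENABLED() guard from the quoted hunk; this is not code from the series):

	return IS_ENABLED(CONFIG_KVM_BOOK3S_HV_POSSIBLE) &&
	       radix_enabled() && !kvmhv_on_pseries();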

> +}
> +
>  #endif /* __POWERPC_KVM_HOST_H__ */
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
> b/arch/powerpc/include/asm/kvm_ppc.h
> index 79a9c0bb8bba..ff1af6a7b44f 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -287,6 +287,7 @@ struct kvmppc_ops {
>   bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
>   bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
>   bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
> + bool (*test_clear_young)(struct kvm *kvm, struct kvm_gfn_range *range);
>   bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
>   void (*free_memslot)(struct kvm_memory_slot *slot);
>   int (*init_vm)(struct kvm *kvm);
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index 686d8d9eda3e..37bf40b0c4ff 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -899,6 +899,12 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct 
> kvm_gfn_range *range)
>   return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
>  }
>  
> +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range)
> +{
> + return !kvm->arch.kvm_ops->test_clear_young ||
> +kvm->arch.kvm_ops->test_clear_young(kvm, range);
> +}
> +
>  bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>  {
>   return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
> diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
> index 58391b4b32ed..fa2659e21ccc 100644
> --- a/arch/powerpc/kvm/book3s.h
> +++ b/arch/powerpc/kvm/book3s.h
> @@ -12,6 +12,7 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
>  extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range 
> *range);
>  extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
>  extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range 
> *range);
> +extern bool kvm_test_clear_young_hv(struct kvm *kvm, struct kvm_gfn_range 
> *range);
>  extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range 
> *range);
>  
>  extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu);
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
> b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 3b65b3b11041..0a392e9a100a 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -1088,6 +1088,65 @@ bool kvm_test_age_radix(struct kvm *kvm, struct 
> kvm_memory_slot *memslot,
>   return ref;
>  }
>  
> +bool kvm_test_clear_young_hv(struct kvm *kvm, struct kvm_gfn_range *range)
> +{
> + bool err;
> + gfn_t gfn = range->start;
> +
> + rcu_read_lock();
> +
> + err = !kvm_is_radix(kvm);
> + if (err)
> + goto unlock;
> +
> + /*
> +  * Case 1:  This function  kvmppc_switch_mmu_to_hpt()
> +  *
> +  *  rcu_read_lock()
> +  *  Test kvm_is_radix()            kvm->arch.radix = 0
> +  *  Use kvm->arch.pgtable 

[PATCH v2 05/12] powerpc: add pte_free_defer() for pgtables sharing page

2023-06-20 Thread Hugh Dickins
Add powerpc-specific pte_free_defer(), to call pte_free() via call_rcu().
pte_free_defer() will be called inside khugepaged's retract_page_tables()
loop, where allocating extra memory cannot be relied upon.  This precedes
the generic version to avoid build breakage from incompatible pgtable_t.

This is awkward because the struct page contains only one rcu_head, but
that page may be shared between PTE_FRAG_NR pagetables, each wanting to
use the rcu_head at the same time: account concurrent deferrals with a
heightened refcount, only the first making use of the rcu_head, but
re-deferring if more deferrals arrived during its grace period.
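
As a minimal editorial sketch of the accounting described above (not part of
the patch; it assumes the PTE_FREE_DEFERRED value defined in the hunk below
and a page shared by just two fragments), the refcount walks through these
states:

#include <assert.h>

#define PTE_FREE_DEFERRED 0x10000	/* assumed, matching the hunk below */

int main(void)
{
	int refcount = 2;			/* two fragments of one page in use */

	/* pte_free_defer() on fragment A: its count of 1 becomes a deferral */
	refcount += PTE_FREE_DEFERRED - 1;		/* 0x10001 */
	assert(refcount < 2 * PTE_FREE_DEFERRED);	/* first deferral: queue rcu_head */

	/* pte_free_defer() on fragment B, still within A's grace period */
	refcount += PTE_FREE_DEFERRED - 1;		/* 0x20000: rcu_head already in use */

	/* pte_free_now() after the first grace period */
	refcount -= PTE_FREE_DEFERRED - 1;		/* 0x10001 */
	assert(refcount >= PTE_FREE_DEFERRED);		/* B deferred meanwhile: defer again */
	refcount -= 1;					/* drop A's count, requeue rcu_head */

	/* pte_free_now() after the second grace period */
	refcount -= PTE_FREE_DEFERRED - 1;		/* 1 */
	assert(refcount < PTE_FREE_DEFERRED);		/* pte_fragment_free(): 1 -> 0, page freed */
	return 0;
}

However many deferrals pile up, only one rcu_head is ever queued at a time,
and the page itself is freed once the plain count finally reaches zero.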

Signed-off-by: Hugh Dickins 
---
 arch/powerpc/include/asm/pgalloc.h |  4 +++
 arch/powerpc/mm/pgtable-frag.c | 51 ++
 2 files changed, 55 insertions(+)

diff --git a/arch/powerpc/include/asm/pgalloc.h 
b/arch/powerpc/include/asm/pgalloc.h
index 3360cad78ace..3a971e2a8c73 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -45,6 +45,10 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t 
ptepage)
pte_fragment_free((unsigned long *)ptepage, 0);
 }
 
+/* arch use pte_free_defer() implementation in arch/powerpc/mm/pgtable-frag.c 
*/
+#define pte_free_defer pte_free_defer
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);
+
 /*
  * Functions that deal with pagetables that could be at any level of
  * the table need to be passed an "index_size" so they know how to
diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index 20652daa1d7e..e4f58c5fc2ac 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -120,3 +120,54 @@ void pte_fragment_free(unsigned long *table, int kernel)
__free_page(page);
}
 }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define PTE_FREE_DEFERRED 0x10000 /* beyond any PTE_FRAG_NR */
+
+static void pte_free_now(struct rcu_head *head)
+{
+   struct page *page;
+   int refcount;
+
+   page = container_of(head, struct page, rcu_head);
+   refcount = atomic_sub_return(PTE_FREE_DEFERRED - 1,
+                                    &page->pt_frag_refcount);
+   if (refcount < PTE_FREE_DEFERRED) {
+   pte_fragment_free((unsigned long *)page_address(page), 0);
+   return;
+   }
+   /*
+* One page may be shared between PTE_FRAG_NR pagetables.
+* At least one more call to pte_free_defer() came in while we
+* were already deferring, so the free must be deferred again;
+* but just for one grace period, however many calls came in.
+*/
+   while (refcount >= PTE_FREE_DEFERRED + PTE_FREE_DEFERRED) {
+   refcount = atomic_sub_return(PTE_FREE_DEFERRED,
+                                    &page->pt_frag_refcount);
+   }
+   /* Remove that refcount of 1 left for fragment freeing above */
+   atomic_dec(&page->pt_frag_refcount);
+   call_rcu(&page->rcu_head, pte_free_now);
+}
+
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+   struct page *page;
+
+   page = virt_to_page(pgtable);
+   /*
+* One page may be shared between PTE_FRAG_NR pagetables: only queue
+* it once for freeing, but note whenever the free must be deferred.
+*
+* (This would be much simpler if the struct page had an rcu_head for
+* each fragment, or if we could allocate a separate array for that.)
+*
+* Convert our refcount of 1 to a refcount of PTE_FREE_DEFERRED, and
+* proceed to call_rcu() only when the rcu_head is not already in use.
+*/
+   if (atomic_add_return(PTE_FREE_DEFERRED - 1, &page->pt_frag_refcount) <
+ PTE_FREE_DEFERRED + PTE_FREE_DEFERRED)
+   call_rcu(&page->rcu_head, pte_free_now);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
2.35.3



[PATCH v2 04/12] powerpc: assert_pte_locked() use pte_offset_map_nolock()

2023-06-20 Thread Hugh Dickins
Instead of pte_lockptr(), use the recently added pte_offset_map_nolock()
in assert_pte_locked().  BUG if pte_offset_map_nolock() fails: this is
stricter than the previous implementation, which skipped when pmd_none()
(with a comment on khugepaged collapse transitions): but wouldn't we want
to know, if an assert_pte_locked() caller can be racing such transitions?

This mod might cause new crashes: which either expose my ignorance, or
indicate issues to be fixed, or limit the usage of assert_pte_locked().

Signed-off-by: Hugh Dickins 
---
 arch/powerpc/mm/pgtable.c | 16 ++--
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index cb2dcdb18f8e..16b061af86d7 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -311,6 +311,8 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long 
addr)
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
+   pte_t *pte;
+   spinlock_t *ptl;
 
if (mm == _mm)
return;
@@ -321,16 +323,10 @@ void assert_pte_locked(struct mm_struct *mm, unsigned 
long addr)
pud = pud_offset(p4d, addr);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, addr);
-   /*
-* khugepaged to collapse normal pages to hugepage, first set
-* pmd to none to force page fault/gup to take mmap_lock. After
-* pmd is set to none, we do a pte_clear which does this assertion
-* so if we find pmd none, return.
-*/
-   if (pmd_none(*pmd))
-   return;
-   BUG_ON(!pmd_present(*pmd));
-   assert_spin_locked(pte_lockptr(mm, pmd));
+   pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
+   BUG_ON(!pte);
+   assert_spin_locked(ptl);
+   pte_unmap(pte);
 }
 #endif /* CONFIG_DEBUG_VM */
 
-- 
2.35.3



[PATCH v2 03/12] arm: adjust_pte() use pte_offset_map_nolock()

2023-06-20 Thread Hugh Dickins
Instead of pte_lockptr(), use the recently added pte_offset_map_nolock()
in adjust_pte(): because it gives the not-locked ptl for precisely that
pte, which the caller can then safely lock; whereas pte_lockptr() is not
so tightly coupled, because it dereferences the pmd pointer again.

Signed-off-by: Hugh Dickins 
---
 arch/arm/mm/fault-armv.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index ca5302b0b7ee..7cb125497976 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -117,11 +117,10 @@ static int adjust_pte(struct vm_area_struct *vma, 
unsigned long address,
 * must use the nested version.  This also means we need to
 * open-code the spin-locking.
 */
-   pte = pte_offset_map(pmd, address);
+   pte = pte_offset_map_nolock(vma->vm_mm, pmd, address, &ptl);
if (!pte)
return 0;
 
-   ptl = pte_lockptr(vma->vm_mm, pmd);
do_pte_lock(ptl);
 
ret = do_adjust_pte(vma, address, pfn, pte);
-- 
2.35.3



Re: [PATCH] cxl/ocxl: Possible repeated word

2023-06-20 Thread Frederic Barrat



Hello,

While the correction in the comment is of course ok, the patch was sent 
as html. You may want to check/fix how it was submitted.


  Fred


On 18/06/2023 17:08, zhumao...@208suo.com wrote:
Delete repeated word in comment.

Signed-off-by: Zhu Mao 
---
  drivers/misc/cxl/native.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 50b0c44bb8d7..6957946a6463 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -920,7 +920,7 @@ int cxl_attach_dedicated_process_psl9(struct 
cxl_context *ctx, u64 wed, u64 amr)

   * Ideally we should do a wmb() here to make sure the changes to the
   * PE are visible to the card before we call afu_enable.
   * On ppc64 though all mmios are preceded by a 'sync' instruction 
hence

- * we dont dont need one here.
+ * we dont need one here.
   */

  result = cxl_ops->afu_reset(afu);


[PATCH v2 02/12] mm/pgtable: add PAE safety to __pte_offset_map()

2023-06-20 Thread Hugh Dickins
There is a faint risk that __pte_offset_map(), on a 32-bit architecture
with a 64-bit pmd_t e.g. x86-32 with CONFIG_X86_PAE=y, would succeed on
a pmdval assembled from a pmd_low and a pmd_high which never belonged
together: their combination not pointing to a page table at all, perhaps
not even a valid pfn.  pmdp_get_lockless() is not enough to prevent that.
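
To illustrate the risk (an editorial sketch in plain C, not kernel code; it
assumes the little-endian x86-32 PAE layout, with the loads written as plain
volatile accesses): the 64-bit pmd is read as two 32-bit loads, and nothing in
the loads themselves stops the table from being torn down in between.

#include <stdint.h>

typedef struct { uint64_t val; } pmd_sketch_t;

static pmd_sketch_t pmd_read_unguarded(volatile uint32_t *halves)
{
	pmd_sketch_t ret;
	uint32_t low, high;

	low = halves[0];	/* low half of the old pmd ... */
	/* ... a concurrent collapse or free, plus TLB flush, can complete here ... */
	high = halves[1];	/* ... high half of the new pmd */
	ret.val = ((uint64_t)high << 32) | low;	/* halves that never belonged together */
	return ret;
}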

Guard against that (on such configs) by local_irq_save() blocking TLB
flush between present updates, as linux/pgtable.h suggests.  It's only
needed around the pmdp_get_lockless() in __pte_offset_map(): a race when
__pte_offset_map_lock() repeats the pmdp_get_lockless() after getting the
lock, would just send it back to __pte_offset_map() again.

Complement this pmdp_get_lockless_start() and pmdp_get_lockless_end(),
used only locally in __pte_offset_map(), with a pmdp_get_lockless_sync()
synonym for tlb_remove_table_sync_one(): to send the necessary interrupt
at the right moment on those configs which do not already send it.

CONFIG_GUP_GET_PXX_LOW_HIGH is enabled when required by mips, sh and x86.
It is not enabled by arm-32 CONFIG_ARM_LPAE: my understanding is that
Will Deacon's 2020 enhancements to READ_ONCE() are sufficient for arm.
It is not enabled by arc, but its pmd_t is 32-bit even when pte_t 64-bit.

Limit the IRQ disablement to CONFIG_HIGHPTE?  Perhaps, but would need a
little more work, to retry if pmd_low good for page table, but pmd_high
non-zero from THP (and that might be making x86-specific assumptions).

Signed-off-by: Hugh Dickins 
---
 include/linux/pgtable.h |  4 
 mm/pgtable-generic.c| 29 +
 2 files changed, 33 insertions(+)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 8b0fc7fdc46f..525f1782b466 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -390,6 +390,7 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
return pmd;
 }
 #define pmdp_get_lockless pmdp_get_lockless
+#define pmdp_get_lockless_sync() tlb_remove_table_sync_one()
 #endif /* CONFIG_PGTABLE_LEVELS > 2 */
 #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */
 
@@ -408,6 +409,9 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
 {
return pmdp_get(pmdp);
 }
+static inline void pmdp_get_lockless_sync(void)
+{
+}
 #endif
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 674671835631..5e85a625ab30 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -232,12 +232,41 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, 
unsigned long address,
 #endif
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \
+   (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU))
+/*
+ * See the comment above ptep_get_lockless() in include/linux/pgtable.h:
+ * the barriers in pmdp_get_lockless() cannot guarantee that the value in
+ * pmd_high actually belongs with the value in pmd_low; but holding interrupts
+ * off blocks the TLB flush between present updates, which guarantees that a
+ * successful __pte_offset_map() points to a page from matched halves.
+ */
+static unsigned long pmdp_get_lockless_start(void)
+{
+   unsigned long irqflags;
+
+   local_irq_save(irqflags);
+   return irqflags;
+}
+static void pmdp_get_lockless_end(unsigned long irqflags)
+{
+   local_irq_restore(irqflags);
+}
+#else
+static unsigned long pmdp_get_lockless_start(void) { return 0; }
+static void pmdp_get_lockless_end(unsigned long irqflags) { }
+#endif
+
 pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
 {
+   unsigned long irqflags;
pmd_t pmdval;
 
rcu_read_lock();
+   irqflags = pmdp_get_lockless_start();
pmdval = pmdp_get_lockless(pmd);
+   pmdp_get_lockless_end(irqflags);
+
if (pmdvalp)
*pmdvalp = pmdval;
if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
-- 
2.35.3



[PATCH v2 01/12] mm/pgtable: add rcu_read_lock() and rcu_read_unlock()s

2023-06-20 Thread Hugh Dickins
Before putting them to use (several commits later), add rcu_read_lock()
to pte_offset_map(), and rcu_read_unlock() to pte_unmap().  Make this a
separate commit, since it risks exposing imbalances: prior commits have
fixed all the known imbalances, but we may find some have been missed.

Signed-off-by: Hugh Dickins 
---
 include/linux/pgtable.h | 4 ++--
 mm/pgtable-generic.c| 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index a1326e61d7ee..8b0fc7fdc46f 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -99,7 +99,7 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned 
long address)
((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address)))
 #define pte_unmap(pte) do {\
kunmap_local((pte));\
-   /* rcu_read_unlock() to be added later */   \
+   rcu_read_unlock();  \
 } while (0)
 #else
 static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address)
@@ -108,7 +108,7 @@ static inline pte_t *__pte_map(pmd_t *pmd, unsigned long 
address)
 }
 static inline void pte_unmap(pte_t *pte)
 {
-   /* rcu_read_unlock() to be added later */
+   rcu_read_unlock();
 }
 #endif
 
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index c7ab18a5fb77..674671835631 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -236,7 +236,7 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, 
pmd_t *pmdvalp)
 {
pmd_t pmdval;
 
-   /* rcu_read_lock() to be added later */
+   rcu_read_lock();
pmdval = pmdp_get_lockless(pmd);
if (pmdvalp)
*pmdvalp = pmdval;
@@ -250,7 +250,7 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, 
pmd_t *pmdvalp)
}
return __pte_map(&pmdval, addr);
 nomap:
-   /* rcu_read_unlock() to be added later */
+   rcu_read_unlock();
return NULL;
 }
 
-- 
2.35.3



[PATCH v2 00/12] mm: free retracted page table by RCU

2023-06-20 Thread Hugh Dickins
Here is v2 third series of patches to mm (and a few architectures), based
on v6.4-rc5 with the preceding two series applied: in which khugepaged
takes advantage of pte_offset_map[_lock]() allowing for pmd transitions.
Differences from v1 are noted patch by patch below

This follows on from the v2 "arch: allow pte_offset_map[_lock]() to fail"
https://lore.kernel.org/linux-mm/a4963be9-7aa6-350-66d0-2ba843e1a...@google.com/
series of 23 posted on 2023-06-08 (and now in mm-stable - thank you),
and the v2 "mm: allow pte_offset_map[_lock]() to fail"
https://lore.kernel.org/linux-mm/c1c9a74a-bc5b-15ea-e5d2-8ec34bc9...@google.com/
series of 32 posted on 2023-06-08 (and now in mm-stable - thank you),
and replaces the v1 "mm: free retracted page table by RCU"
https://lore.kernel.org/linux-mm/35e983f5-7ed3-b310-d949-9ae8b130c...@google.com/
series of 12 posted on 2023-05-28 (which was bad on powerpc and s390).

The first two series were "independent": neither depending for build or
correctness on the other, but both series had to be in before this third
series is added to make the effective changes; and it would probably be
best to hold this series back until the following release, since it might
now reveal missed imbalances which the first series hoped to fix.

What is it all about?  Some mmap_lock avoidance i.e. latency reduction.
Initially just for the case of collapsing shmem or file pages to THPs:
the usefulness of MADV_COLLAPSE on shmem is being limited by that
mmap_write_lock it currently requires.

Likely to be relied upon later in other contexts e.g. freeing of
empty page tables (but that's not work I'm doing).  mmap_write_lock
avoidance when collapsing to anon THPs?  Perhaps, but again that's not
work I've done: a quick attempt was not as easy as the shmem/file case.

These changes (though of course not these exact patches) have been in
Google's data centre kernel for three years now: we do rely upon them.

Based on the preceding two series over v6.4-rc5, or any v6.4-rc; and
almost good on current mm-everything or current linux-next - just one
patch conflicts, the 10/12: I'll reply to that one with its
mm-everything or linux-next equivalent (ptent replacing *pte).

01/12 mm/pgtable: add rcu_read_lock() and rcu_read_unlock()s
  v2: same as v1
02/12 mm/pgtable: add PAE safety to __pte_offset_map()
  v2: rename to pmdp_get_lockless_start/end() per Matthew;
  so use inlines without _irq_save(flags) macro oddity;
  add pmdp_get_lockless_sync() for use later in 09/12.
03/12 arm: adjust_pte() use pte_offset_map_nolock()
  v2: same as v1
04/12 powerpc: assert_pte_locked() use pte_offset_map_nolock()
  v2: same as v1
05/12 powerpc: add pte_free_defer() for pgtables sharing page
  v2: fix rcu_head usage to cope with concurrent deferrals;
  add para to commit message explaining rcu_head issue.
06/12 sparc: add pte_free_defer() for pte_t *pgtable_t
  v2: use page_address() instead of less common page_to_virt();
  add para to commit message explaining simple conversion;
  changed title since sparc64 pgtables do not share page.
07/12 s390: add pte_free_defer() for pgtables sharing page
  v2: complete rewrite, integrated with s390's existing pgtable
  management; temporarily using a global mm_pgtable_list_lock,
  to be restored to per-mm spinlock in a later followup patch.
08/12 mm/pgtable: add pte_free_defer() for pgtable as page
  v2: add comment on rcu_head to "Page table pages", per JannH
09/12 mm/khugepaged: retract_page_tables() without mmap or vma lock
  v2: repeat checks under ptl because UFFD, per PeterX and JannH;
  bring back mmu_notifier calls for PMD, per JannH and Jason;
  pmdp_get_lockless_sync() to issue missing interrupt if PAE.
10/12 mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()
  v2: first check VMA, in case page tables torn down, per JannH;
  pmdp_get_lockless_sync() to issue missing interrupt if PAE;
  moved mmu_notifier after step 1, reworked final goto labels.
11/12 mm/khugepaged: delete khugepaged_collapse_pte_mapped_thps()
  v2: same as v1
12/12 mm: delete mmap_write_trylock() and vma_try_start_write()
  v2: same as v1

 arch/arm/mm/fault-armv.c|   3 +-
 arch/powerpc/include/asm/pgalloc.h  |   4 +
 arch/powerpc/mm/pgtable-frag.c  |  51 
 arch/powerpc/mm/pgtable.c   |  16 +-
 arch/s390/include/asm/pgalloc.h |   4 +
 arch/s390/mm/pgalloc.c  | 205 +
 arch/sparc/include/asm/pgalloc_64.h |   4 +
 arch/sparc/mm/init_64.c |  16 +
 include/linux/mm.h  |  17 --
 include/linux/mm_types.h|   6 +-
 include/linux/mmap_lock.h   |  10 -
 include/linux/pgtable.h |  10 +-
 mm/khugepaged.c | 481 +++---
 mm/pgtable-generic.c|  53 +++-
 14 files changed, 467 insertions(+), 413 deletions(-)


Re: [PATCH mm-unstable v2 01/10] mm/kvm: add mmu_notifier_ops->test_clear_young()

2023-06-20 Thread Nicholas Piggin
On Sat May 27, 2023 at 9:44 AM AEST, Yu Zhao wrote:
> Add mmu_notifier_ops->test_clear_young() to supersede test_young()
> and clear_young().
>
> test_clear_young() has a fast path, which if supported, allows its
> callers to safely clear the accessed bit without taking
> kvm->mmu_lock.
>
> The fast path requires arch-specific code that generally relies on
> RCU and CAS: the former protects KVM page tables from being freed
> while the latter clears the accessed bit atomically against both the
> hardware and other software page table walkers. If the fast path is
> unsupported, test_clear_young() falls back to the existing slow path
> where kvm->mmu_lock is then taken.
>
> test_clear_young() can also operate on a range of KVM PTEs
> individually according to a bitmap, if the caller provides it.

It would be better if you could do patch 1 that only touches the
mmu_notifier code and implements mmu_notifier_test_clear_young() in
terms of the existing callbacks, and the next patch swaps KVM to the new
callbacks and removes the old ones.
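
A rough sketch of what that first step could look like, expressing the
combined operation purely in terms of the existing clear_young/test_young
callbacks (the helper name and the exact fallback policy here are
illustrative, not taken from the series):

static int mn_test_clear_young_fallback(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start, unsigned long end,
					bool clear)
{
	unsigned long addr;
	int young = 0;

	if (clear)
		return mn->ops->clear_young ?
		       mn->ops->clear_young(mn, mm, start, end) : 0;

	for (addr = start; addr < end; addr += PAGE_SIZE)
		if (mn->ops->test_young && mn->ops->test_young(mn, mm, addr))
			young = 1;

	return young;
}

The later patch could then let secondary MMUs provide the faster combined
callback without the notifier core having to grow per-platform cases.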

> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> index 64a3e051c3c4..dfdbb370682d 100644
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -60,6 +60,8 @@ enum mmu_notifier_event {
>  };
>  
>  #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
> +#define MMU_NOTIFIER_RANGE_LOCKLESS  (1 << 1)
> +#define MMU_NOTIFIER_RANGE_YOUNG (1 << 2)
>  
>  struct mmu_notifier_ops {
>   /*
> @@ -122,6 +124,10 @@ struct mmu_notifier_ops {
> struct mm_struct *mm,
> unsigned long address);
>  
> + int (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm,
> + unsigned long start, unsigned long end,
> + bool clear, unsigned long *bitmap);

This should have a comment like the others. Callback wants to know how
to implement it.

Could add a _range on it as well while you're here, to correct that
inconsistency.

> +
>   /*
>* change_pte is called in cases that pte mapping to page is changed:
>* for example, when ksm remaps pte to point to a new shared page.
> @@ -392,6 +398,9 @@ extern int __mmu_notifier_clear_young(struct mm_struct 
> *mm,
> unsigned long end);
>  extern int __mmu_notifier_test_young(struct mm_struct *mm,
>unsigned long address);
> +extern int __mmu_notifier_test_clear_young(struct mm_struct *mm,
> +unsigned long start, unsigned long 
> end,
> +bool clear, unsigned long *bitmap);
>  extern void __mmu_notifier_change_pte(struct mm_struct *mm,
> unsigned long address, pte_t pte);
>  extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range 
> *r);
> @@ -440,6 +449,35 @@ static inline int mmu_notifier_test_young(struct 
> mm_struct *mm,
>   return 0;
>  }
>  
> +/*
> + * mmu_notifier_test_clear_young() returns nonzero if any of the KVM PTEs 
> within
> + * a given range was young. Specifically, it returns 
> MMU_NOTIFIER_RANGE_LOCKLESS
> + * if the fast path was successful, MMU_NOTIFIER_RANGE_YOUNG otherwise.
> + *
> + * The last parameter to the function is a bitmap and only the fast path
> + * supports it: if it is NULL, the function falls back to the slow path if 
> the
> + * fast path was unsuccessful; otherwise, the function bails out.

Then if it was NULL, you would just not populate it. Minimize differences
and cases for the caller/implementations.

> + *
> + * The bitmap has the following specifications:
> + * 1. The number of bits should be at least (end-start)/PAGE_SIZE.
> + * 2. The offset of each bit should be relative to the end, i.e., the offset
> + *corresponding to addr should be (end-addr)/PAGE_SIZE-1. This is 
> convenient
> + *for batching while forward looping.
> + *
> + * When testing, this function sets the corresponding bit in the bitmap for 
> each
> + * young KVM PTE. When clearing, this function clears the accessed bit for 
> each
> + * young KVM PTE whose corresponding bit in the bitmap is set.

I think this is over-designed as a first pass. The secondary MMU should
just implement the call always. If it can't do it locklessly, then just
do individual lookups. If the benefit is in the batching as you say then
the locked version will get similar benefit. Possibly more because locks
like some amount of batching when contended.

I think that would reduce some concerns about cases of secondary MMUs
that do not support the lockless version yet, and avoid
proliferation of code paths by platform.

_If_ that was insufficient then I would like to see numbers and profiles
and incremental patch to expose more complexity like this.

Also mmu notifier code should say nothing about KVM PTEs or use kvm
names in any code or comments either. "if the page was accessed via the

Re: [RFC PATCH v1 1/3] Revert "powerpc/bug: Provide better flexibility to WARN_ON/__WARN_FLAGS() with asm goto"

2023-06-20 Thread Peter Zijlstra
On Tue, Jun 20, 2023 at 10:51:25AM +0530, Naveen N Rao wrote:
> Christophe Leroy wrote:
> > This reverts commit 1e688dd2a3d6759d416616ff07afc4bb836c4213.
> > 
> > That commit aimed at optimising the code around generation of
> > WARN_ON/BUG_ON but this leads to a lot of dead code erroneously
> > generated by GCC.
> > 
> >  text  data bss dec hex filename
> >   9551585   3627834  224376 13403795 cc8693 vmlinux.before
> >   9535281   3628358  224376 13388015 cc48ef vmlinux.after
> > 
> > Once this change is reverted, in a standard configuration (pmac32 +
> > function tracer) the text is reduced by 16k which is around 1.7%
> 
> Aneesh recently reported a build failure due to the use of 'asm goto' in
> WARN_ON(). We were able to root-cause it to the use of 'asm goto' with two
> config options: CONFIG_CC_OPTIMIZE_FOR_SIZE and
> CONFIG_DEBUG_SECTION_MISMATCH.

FWIW;

I recently had clang-powerpc report a very dodgy build error that was
due to a combination of these asm-goto and the usage of __cleanup__.
For some reason the label of the asm-goto crossed over the __cleanup__
variable declaration -- which is not valid, but also was completely
insane for that's not what the code called for.

  
https://lkml.kernel.org/r/20230610082005.gb1370...@hirez.programming.kicks-ass.net

But in my book that's a compiler issue, not a kernel issue and I'd be
hesitant to pull the asm-goto use just for that.




Re: [RFC PATCH v1 2/3] powerpc: Mark all .S files invalid for objtool

2023-06-20 Thread Naveen N Rao

Christophe Leroy wrote:



On 20/06/2023 at 08:04, Naveen N Rao wrote:

Christophe Leroy wrote:

A lot of work is required in .S files in order to get them ready
for objtool checks.

For the time being, exclude them from the checks.

This is done with the script below:

#!/bin/sh
DIRS=`find arch/powerpc -name "*.S" -exec dirname {} \; | sort | 
uniq`

for d in $DIRS
do
    pushd $d
    echo >> Makefile
    for f in *.S
    do
    echo "OBJECT_FILES_NON_STANDARD_$f := y" | sed s/"\.S"/".o"/g
    done >> Makefile
    popd
done

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/boot/Makefile | 17 +
 arch/powerpc/crypto/Makefile   | 13 +++
 arch/powerpc/kernel/Makefile   | 44 ++
 arch/powerpc/kernel/trace/Makefile |  4 ++
 arch/powerpc/kernel/vdso/Makefile  | 11 ++
 arch/powerpc/kexec/Makefile    |  2 +
 arch/powerpc/kvm/Makefile  | 13 +++
 arch/powerpc/lib/Makefile  | 25 
 arch/powerpc/mm/book3s32/Makefile  |  3 ++
 arch/powerpc/mm/nohash/Makefile    |  3 ++
 arch/powerpc/perf/Makefile |  2 +
 arch/powerpc/platforms/44x/Makefile    |  2 +
 arch/powerpc/platforms/52xx/Makefile   |  3 ++
 arch/powerpc/platforms/83xx/Makefile   |  2 +
 arch/powerpc/platforms/cell/spufs/Makefile |  3 ++
 arch/powerpc/platforms/pasemi/Makefile |  2 +
 arch/powerpc/platforms/powermac/Makefile   |  3 ++
 arch/powerpc/platforms/powernv/Makefile    |  3 ++
 arch/powerpc/platforms/ps3/Makefile    |  2 +
 arch/powerpc/platforms/pseries/Makefile    |  2 +
 arch/powerpc/purgatory/Makefile    |  3 ++
 arch/powerpc/sysdev/Makefile   |  3 ++
 arch/powerpc/xmon/Makefile |  3 ++
 23 files changed, 168 insertions(+)



I think it might be better to have a config option so that architectures 
can opt-in to skip objtool on asm files. We can then do:


Well, the idea here was to initially opt out every file in order to 
quickly add support for objtool uaccess checking, and then opt files 
back in one by one once they are ready for it.


That was my initial thought too.



In most files, all we have to do is to replace all _GLOBAL() by 
SYM_FUNC_START(), add a SYM_FUNC_END() at the end of the function, then 
do the same with SYM_FUNC_START_LOCAL() with all non global functions.


That's easy to do and worth it but it takes time, hence the idea of an 
incremental approach.


Right. But until that's done, I am not sure it is worth the churn to the 
Makefiles.


Besides, it isn't clear to me that the existing features we are 
targeting for objtool on powerpc need objtool to run on asm files (so, 
an incremental approach may not really get us much). Objtool --mcount 
doesn't care about .S files.  I suppose uaccess validation also doesn't 
need it. It's likely just stack validation (should we choose to enable 
it) that needs objtool to be run on asm files.


The first patch series converting much of our .S files can drop the 
config option and exclude the remaining files.



- Naveen



Re: [PATCH mm-unstable v2 06/10] kvm/powerpc: make radix page tables RCU safe

2023-06-20 Thread Nicholas Piggin
On Sat May 27, 2023 at 9:44 AM AEST, Yu Zhao wrote:
> KVM page tables are currently not RCU safe against remapping, i.e.,
> kvmppc_unmap_free_pmd_entry_table() et al. The previous

Minor nit but the "page table" is not RCU-safe against something. It
is RCU-freed, and therefore some algorithm that accesses it can have
the existence guarantee provided by RCU (usually there still needs
to be more to it).

> mmu_notifier_ops members rely on kvm->mmu_lock to synchronize with
> that operation.
>
> However, the new mmu_notifier_ops member test_clear_young() provides
> a fast path that does not take kvm->mmu_lock. To implement
> kvm_arch_test_clear_young() for that path, orphan page tables need to
> be freed by RCU.

Short version: clear the referenced bit using RCU instead of MMU lock
to protect against page table freeing, and there is no problem with
clearing the bit in a table that has been freed.

Seems reasonable.
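
Spelled out as an editorial sketch (the generic helpers and the _PAGE_ACCESSED
name are stand-ins here, not the patch's actual radix code), the access
pattern this enables under rcu_read_lock() is:

static bool test_clear_young_lockless(unsigned long *ptep)
{
	unsigned long old = READ_ONCE(*ptep);

	if (!(old & _PAGE_ACCESSED))
		return false;
	/*
	 * cmpxchg: never resurrect a pte that changed under us; and if the
	 * table was freed and recycled meanwhile, SLAB_TYPESAFE_BY_RCU keeps
	 * it a table of the same type, so clearing only the A bit is harmless.
	 */
	cmpxchg(ptep, old, old & ~_PAGE_ACCESSED);
	return true;
}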

>
> Unmapping, specifically kvm_unmap_radix(), does not free page tables,
> hence not a concern.

Not sure if you really need to make the distinction about why the page
table is freed, we might free them via unmapping. The point is just
anything that frees them while there can be concurrent access, right?

>
> Signed-off-by: Yu Zhao 
> ---
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
> b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 461307b89c3a..3b65b3b11041 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -1469,13 +1469,15 @@ int kvmppc_radix_init(void)
>  {
>   unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
>  
> - kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
> + kvm_pte_cache = kmem_cache_create("kvm-pte", size, size,
> +   SLAB_TYPESAFE_BY_RCU, pte_ctor);
>   if (!kvm_pte_cache)
>   return -ENOMEM;
>  
>   size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
>  
> - kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
> + kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size,
> +   SLAB_TYPESAFE_BY_RCU, pmd_ctor);
>   if (!kvm_pmd_cache) {
>   kmem_cache_destroy(kvm_pte_cache);
>   return -ENOMEM;

KVM PPC HV radix PUD level page tables use the arch/powerpc allocators
(for some reason), which are not RCU freed. I think you need them too?
I wouldn't mind if the kvm pud table slab was moved in here instead of
shared.

Thanks,
Nick


Re: [RFC PATCH v1 2/3] powerpc: Mark all .S files invalid for objtool

2023-06-20 Thread Christophe Leroy


On 20/06/2023 at 08:04, Naveen N Rao wrote:
> Christophe Leroy wrote:
>> A lot of work is required in .S files in order to get them ready
>> for objtool checks.
>>
>> For the time being, exclude them from the checks.
>>
>> This is done with the script below:
>>
>> #!/bin/sh
>> DIRS=`find arch/powerpc -name "*.S" -exec dirname {} \; | sort | 
>> uniq`
>> for d in $DIRS
>> do
>>     pushd $d
>>     echo >> Makefile
>>     for f in *.S
>>     do
>>     echo "OBJECT_FILES_NON_STANDARD_$f := y" | sed s/"\.S"/".o"/g
>>     done >> Makefile
>>     popd
>> done
>>
>> Signed-off-by: Christophe Leroy 
>> ---
>>  arch/powerpc/boot/Makefile | 17 +
>>  arch/powerpc/crypto/Makefile   | 13 +++
>>  arch/powerpc/kernel/Makefile   | 44 ++
>>  arch/powerpc/kernel/trace/Makefile |  4 ++
>>  arch/powerpc/kernel/vdso/Makefile  | 11 ++
>>  arch/powerpc/kexec/Makefile    |  2 +
>>  arch/powerpc/kvm/Makefile  | 13 +++
>>  arch/powerpc/lib/Makefile  | 25 
>>  arch/powerpc/mm/book3s32/Makefile  |  3 ++
>>  arch/powerpc/mm/nohash/Makefile    |  3 ++
>>  arch/powerpc/perf/Makefile |  2 +
>>  arch/powerpc/platforms/44x/Makefile    |  2 +
>>  arch/powerpc/platforms/52xx/Makefile   |  3 ++
>>  arch/powerpc/platforms/83xx/Makefile   |  2 +
>>  arch/powerpc/platforms/cell/spufs/Makefile |  3 ++
>>  arch/powerpc/platforms/pasemi/Makefile |  2 +
>>  arch/powerpc/platforms/powermac/Makefile   |  3 ++
>>  arch/powerpc/platforms/powernv/Makefile    |  3 ++
>>  arch/powerpc/platforms/ps3/Makefile    |  2 +
>>  arch/powerpc/platforms/pseries/Makefile    |  2 +
>>  arch/powerpc/purgatory/Makefile    |  3 ++
>>  arch/powerpc/sysdev/Makefile   |  3 ++
>>  arch/powerpc/xmon/Makefile |  3 ++
>>  23 files changed, 168 insertions(+)
>>
> 
> I think it might be better to have a config option so that architectures 
> can opt-in to skip objtool on asm files. We can then do:

Well, the idea here was to initially opt out every file in order to 
quickly add support for objtool uaccess checking, and then opt files 
back in one by one once they are ready for it.

In most files, all we have to do is to replace all _GLOBAL() by 
SYM_FUNC_START(), add a SYM_FUNC_END() at the end of the function, then 
do the same with SYM_FUNC_START_LOCAL() with all non global functions.

That's easy to do and worth it but it takes time, hence the idea of an 
incremental approach.

Christophe


Re: [RFC PATCH v1 2/3] powerpc: Mark all .S files invalid for objtool

2023-06-20 Thread Naveen N Rao

Christophe Leroy wrote:

A lot of work is required in .S files in order to get them ready
for objtool checks.

For the time being, exclude them from the checks.

This is done with the script below:

#!/bin/sh
DIRS=`find arch/powerpc -name "*.S" -exec dirname {} \; | sort | uniq`
for d in $DIRS
do
pushd $d
echo >> Makefile
for f in *.S
do
echo "OBJECT_FILES_NON_STANDARD_$f := y" | sed 
s/"\.S"/".o"/g
done >> Makefile
popd
done

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/boot/Makefile | 17 +
 arch/powerpc/crypto/Makefile   | 13 +++
 arch/powerpc/kernel/Makefile   | 44 ++
 arch/powerpc/kernel/trace/Makefile |  4 ++
 arch/powerpc/kernel/vdso/Makefile  | 11 ++
 arch/powerpc/kexec/Makefile|  2 +
 arch/powerpc/kvm/Makefile  | 13 +++
 arch/powerpc/lib/Makefile  | 25 
 arch/powerpc/mm/book3s32/Makefile  |  3 ++
 arch/powerpc/mm/nohash/Makefile|  3 ++
 arch/powerpc/perf/Makefile |  2 +
 arch/powerpc/platforms/44x/Makefile|  2 +
 arch/powerpc/platforms/52xx/Makefile   |  3 ++
 arch/powerpc/platforms/83xx/Makefile   |  2 +
 arch/powerpc/platforms/cell/spufs/Makefile |  3 ++
 arch/powerpc/platforms/pasemi/Makefile |  2 +
 arch/powerpc/platforms/powermac/Makefile   |  3 ++
 arch/powerpc/platforms/powernv/Makefile|  3 ++
 arch/powerpc/platforms/ps3/Makefile|  2 +
 arch/powerpc/platforms/pseries/Makefile|  2 +
 arch/powerpc/purgatory/Makefile|  3 ++
 arch/powerpc/sysdev/Makefile   |  3 ++
 arch/powerpc/xmon/Makefile |  3 ++
 23 files changed, 168 insertions(+)



I think it might be better to have a config option so that architectures 
can opt-in to skip objtool on asm files. We can then do:


diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 9f94fc83f08652..878027cf4faf37 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -359,7 +359,11 @@ $(obj)/%.s: $(src)/%.S FORCE
   $(call if_changed_dep,cpp_s_S)

quiet_cmd_as_o_S = AS $(quiet_modtag)  $@
+ifndef CONFIG_ARCH_OBJTOOL_SKIP_ASM
  cmd_as_o_S = $(CC) $(a_flags) -c -o $@ $< $(cmd_objtool)
+else
+  cmd_as_o_S = $(CC) $(a_flags) -c -o $@ $<
+endif

ifdef CONFIG_ASM_MODVERSIONS



- Naveen