[PATCH v2] selftests/filesystems: Fix build of anon_inode_test

2025-05-30 Thread Mark Brown
The newly added anon_inode_test test fails to build due to attempting to
include a nonexistent overlayfs/wrappers.h:

anon_inode_test.c:10:10: fatal error: overlayfs/wrappers.h: No such file or 
directory
   10 | #include "overlayfs/wrappers.h"
  |  ^~

This is due to 0bd92b9fe538 ("selftests/filesystems: move wrapper.h out
of overlayfs subdir") which was added in the vfs-6.16.selftests branch
which was based on -rc5 and did not contain the newly added test so once
things were merged into mainline the build started failing - both
parent commits are fine.

Fixes: 3e406741b1989 ("Merge tag 'vfs-6.16-rc1.selftests' of 
git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs")
Signed-off-by: Mark Brown 
---
Changes in v2:
- Rebase onto mainline and adjust fixes commit now the two branches got
  merged there.
- Link to v1: 
https://lore.kernel.org/r/20250518-selftests-anon-inode-build-v1-1-71eff8183...@kernel.org
---
 tools/testing/selftests/filesystems/anon_inode_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c 
b/tools/testing/selftests/filesystems/anon_inode_test.c
index e8e0ef1460d2..73e0a4d4fb2f 100644
--- a/tools/testing/selftests/filesystems/anon_inode_test.c
+++ b/tools/testing/selftests/filesystems/anon_inode_test.c
@@ -7,7 +7,7 @@
 #include 
 
 #include "../kselftest_harness.h"
-#include "overlayfs/wrappers.h"
+#include "wrappers.h"
 
 TEST(anon_inode_no_chown)
 {

---
base-commit: f66bc387efbee59978e076ce9bf123ac353b389c
change-id: 20250516-selftests-anon-inode-build-007e206e8422

Best regards,
-- 
Mark Brown 




Re: [PATCH v3 1/4] media: qcom: camss: vfe: Add VBIF setting support

2025-05-30 Thread Bryan O'Donoghue
On 30/05/2025 10:00, Vincent Knecht via B4 Relay wrote:
> From: Vincent Knecht 
> 
> Some devices need writing values to VFE VBIF registers.
> Add helper functions to do this.
> 
> Signed-off-by: Vincent Knecht 
> ---
>   drivers/media/platform/qcom/camss/Makefile |  1 +
>   drivers/media/platform/qcom/camss/camss-vfe-4-1.c  | 12 +++
>   drivers/media/platform/qcom/camss/camss-vfe-vbif.c | 25 
> ++
>   drivers/media/platform/qcom/camss/camss-vfe-vbif.h | 19 
>   drivers/media/platform/qcom/camss/camss-vfe.c  |  9 
>   drivers/media/platform/qcom/camss/camss-vfe.h  |  3 +++
>   6 files changed, 69 insertions(+)
> 
> diff --git a/drivers/media/platform/qcom/camss/Makefile 
> b/drivers/media/platform/qcom/camss/Makefile
> index 
> d26a9c24a430a831e0d865db4d96142da5276653..4c66d29ae505ae5adc717ae98f77fb736a6e15b9
>  100644
> --- a/drivers/media/platform/qcom/camss/Makefile
> +++ b/drivers/media/platform/qcom/camss/Makefile
> @@ -21,6 +21,7 @@ qcom-camss-objs += \
>   camss-vfe-680.o \
>   camss-vfe-780.o \
>   camss-vfe-gen1.o \
> + camss-vfe-vbif.o \
>   camss-vfe.o \
>   camss-video.o \
>   camss-format.o \
> diff --git a/drivers/media/platform/qcom/camss/camss-vfe-4-1.c 
> b/drivers/media/platform/qcom/camss/camss-vfe-4-1.c
> index 
> 901677293d971cf761944a660ef719af38203f22..9cf1ccdb2fe7ca9bf89b746af836e1035b457a8f
>  100644
> --- a/drivers/media/platform/qcom/camss/camss-vfe-4-1.c
> +++ b/drivers/media/platform/qcom/camss/camss-vfe-4-1.c
> @@ -15,6 +15,7 @@
>   #include "camss.h"
>   #include "camss-vfe.h"
>   #include "camss-vfe-gen1.h"
> +#include "camss-vfe-vbif.h"
> 
>   #define VFE_0_HW_VERSION0x000
> 
> @@ -733,6 +734,7 @@ static void vfe_set_qos(struct vfe_device *vfe)
>   {
>   u32 val = VFE_0_BUS_BDG_QOS_CFG_0_CFG;
>   u32 val7 = VFE_0_BUS_BDG_QOS_CFG_7_CFG;
> + int ret;
> 
>   writel_relaxed(val, vfe->base + VFE_0_BUS_BDG_QOS_CFG_0);
>   writel_relaxed(val, vfe->base + VFE_0_BUS_BDG_QOS_CFG_1);
> @@ -742,6 +744,16 @@ static void vfe_set_qos(struct vfe_device *vfe)
>   writel_relaxed(val, vfe->base + VFE_0_BUS_BDG_QOS_CFG_5);
>   writel_relaxed(val, vfe->base + VFE_0_BUS_BDG_QOS_CFG_6);
>   writel_relaxed(val7, vfe->base + VFE_0_BUS_BDG_QOS_CFG_7);
> +
> + /* SoC-specific VBIF settings */
> + if (vfe->res->has_vbif) {
> + ret = vfe_vbif_apply_settings(vfe);
> + if (ret < 0) {
> + dev_err_ratelimited(vfe->camss->dev,
> + "VFE: VBIF error %d\n",
> + ret);
> + }
> + }
>   }
> 
>   static void vfe_set_ds(struct vfe_device *vfe)
> diff --git a/drivers/media/platform/qcom/camss/camss-vfe-vbif.c 
> b/drivers/media/platform/qcom/camss/camss-vfe-vbif.c
> new file mode 100644
> index 
> ..691335f231a6001e6c535431a18b2e21ddc832c9
> --- /dev/null
> +++ b/drivers/media/platform/qcom/camss/camss-vfe-vbif.c
> @@ -0,0 +1,25 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * camss-vfe-vbif.c
> + *
> + * Qualcomm MSM Camera Subsystem - VFE VBIF Module
> + *
> + * Copyright (c) 2025, The Linux Foundation. All rights reserved.
> + *
> + */
> +
> +#include 
> +
> +#include "camss.h"
> +#include "camss-vfe.h"
> +#include "camss-vfe-vbif.h"
> +
> +void vfe_vbif_write_reg(struct vfe_device *vfe, u32 reg, u32 val)
> +{
> + writel_relaxed(val, vfe->vbif_base + reg);
> +}
> +
> +int vfe_vbif_apply_settings(struct vfe_device *vfe)
> +{
> + return 0;
> +}
> diff --git a/drivers/media/platform/qcom/camss/camss-vfe-vbif.h 
> b/drivers/media/platform/qcom/camss/camss-vfe-vbif.h
> new file mode 100644
> index 
> ..502db629e961f67723b14a7c8c9ca973fe4c267c
> --- /dev/null
> +++ b/drivers/media/platform/qcom/camss/camss-vfe-vbif.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * camss-vfe-vbif.h
> + *
> + * Qualcomm MSM Camera Subsystem - VFE VBIF Module
> + *
> + * Copyright (c) 2025, The Linux Foundation. All rights reserved.
> + *
> + */
> +#ifndef QC_MSM_CAMSS_VFE_VBIF_H
> +#define QC_MSM_CAMSS_VFE_VBIF_H
> +
> +#include "camss-vfe.h"
> +
> +void vfe_vbif_write_reg(struct vfe_device *vfe, u32 reg, u32 val);
> +
> +int vfe_vbif_apply_settings(struct vfe_device *vfe);
> +
> +#endif /* QC_MSM_CAMSS_VFE_VBIF_H */
> diff --git a/drivers/media/platform/qcom/camss/camss-vfe.c 
> b/drivers/media/platform/qcom/camss/camss-vfe.c
> index 
> 4bca6c3abaff9b898ea879674a3ff8f3592d3139..3138562d399444c5cf2ae96bf16b75b85ff5c5ca
>  100644
> --- a/drivers/media/platform/qcom/camss/camss-vfe.c
> +++ b/drivers/media/platform/qcom/camss/camss-vfe.c
> @@ -1807,6 +1807,15 @@ int msm_vfe_subdev_init(struct camss *camss, struct 
> vfe_device *vfe,
>   return PTR_ERR(vfe->base);
>  

Re: [PATCH v3 2/4] media: qcom: camss: Add support for MSM8939

2025-05-30 Thread Bryan O'Donoghue
On 30/05/2025 10:00, Vincent Knecht via B4 Relay wrote:
> + camss->res->version == CAMSS_8x39 ||

This is not correct - it should be 893x since 8939 and 8936 are ~ the 
same SoC - probably 36 is just a binned version of 39.

Anyway the x is the least significant digit.

Please fix
---
bod




Re: [PATCH v3 3/3] rpmsg: ctrl: Introduce RPMSG_CREATE_EPT_FD_IOCTL uAPI

2025-05-30 Thread Dawei Li
HI Beleswar,

Thanks for reviewing.

On Fri, May 30, 2025 at 03:15:28PM +0530, Beleswar Prasad Padhi wrote:
> Hi Dawei,
> 
> On 19/05/25 20:38, Dawei Li wrote:
> > Implement RPMSG_CREATE_EPT_FD_IOCTL, new uAPI for rpmsg ctrl, which
> > shares most of operations of RPMSG_CREATE_EPT_IOCTL except that it
> > returns fd representing eptdev to userspace directly.
> >
> > Possible calling procedures for userspace are:
> > - fd = open("/dev/rpmsg_ctrlX")
> > - ioctl(fd, RPMSG_CREATE_EPT_FD_IOCTL, &info);
> > - fd_ep = info.fd
> 
> 
> We are returning a new fd to userspace from inside an IOCTL itself. Is this a
> standard way of doing things in Kernel space? (see below related comment)

Yes, anon_get_{fd,file} are used extensively in kernel for returning a new
fd to userspace which is associated with an unique data structure in kernel
space, in different ways:

- via ioctl(), some examples are:

 - KVM ioctl(s)
   - KVM_CREATE_VCPU -> kvm_vm_ioctl_create_vcpu
   - KVM_GET_STATS_FD -> kvm_vcpu_ioctl_get_stats_fd
   - KVM_CREATE_DEVICE -> kvm_ioctl_create_device
   - KVM_CREATE_VM -> kvm_dev_ioctl_create_vm 

 - DMA buf/fence/sync ioctls
   - DMA_BUF_IOCTL_EXPORT_SYNC_FILE -> dma_buf_export_sync_file
   - SW_SYNC_IOC_CREATE_FENCE -> sw_sync_ioctl_create_fence
   - Couples of driver implement DMA buf by using anon file _implicitly_:
 - UDMABUF_CREATE -> udmabuf_ioctl_create
 - DMA_HEAP_IOCTL_ALLOC -> dma_heap_ioctl_allocate

 - gpiolib ioctls:
   - GPIO_GET_LINEHANDLE_IOCTL -> linehandle_create
   - GPIO_V2_GET_LINE_IOCTL

 -  IOMMUFD ioctls:

 -  VFIO Ioctls:

 - 


- via other specific syscalls:
 - epoll_create1
 - bpf 
 - perf_event_open
 - inotify_init
 - ...

> 
> > - operations on fd_ep(write, read, poll ioctl)
> > - ioctl(fd_ep, RPMSG_DESTROY_EPT_IOCTL)
> > - close(fd_ep)
> 
> 
> Can we rely on the userspace to close() the fd_ep? (if not done, could be a
> memory leak..).. Opposed to fd, which we can rely on the userspace to
> close() since they initiated the open() call. I am just trying to understand 
> if
> this is a standard way of doing things...

Good question.

When userland gets a fd from kernel, it's userland's duty to manage and release
the resource when it's done with it, because kernel never knows when the fd and
its resources are not needed by userland, except when the process is exiting. The fact
remains true no matter how fd is generated from kernel:
- open()
- ioctl()
- Other syscalls(epoll_create1, e.g, as listed above)

As a result, kernel & driver provide fops->release() to achieve resource
release when fd is not needed for userland, some callers of it maybe:
- Userland call close() explicitly
- Kernel does the dirty job when user process exits(if some fds are
  still opened):
  - Userland call exit() explicitly.
  - User process was killed by some signals.

Maybe some comments/docs are needed in uAPI?

> 
> > - close(fd)
> >

[snip]

> > +
> > +   if (cmd == RPMSG_CREATE_EPT_IOCTL || cmd == RPMSG_CREATE_DEV_IOCTL ||
> > +   cmd == RPMSG_RELEASE_DEV_IOCTL) {
> > +   if (copy_from_user(&eptinfo, argp, sizeof(eptinfo)))
> > +   return -EFAULT;
> > +
> > +   memcpy(chinfo.name, eptinfo.name, RPMSG_NAME_SIZE);
> > +   chinfo.name[RPMSG_NAME_SIZE - 1] = '\0';
> > +   chinfo.src = eptinfo.src;
> > +   chinfo.dst = eptinfo.dst;
> > +   } else if (cmd == RPMSG_CREATE_EPT_FD_IOCTL) {
> 
> 
> Maybe we can put this 'else if condition' in the first 'if' and treat other
> conditions under 'else', as 'RPMSG_CREATE_EPT_FD_IOCTL' is the only
> ioctl with a different struct type.

Good point! I will try to address it in next respin.

> 
> Thanks,
> Beleswar
> 
> > +   if (copy_from_user(&ept_fd_info, argp, sizeof(ept_fd_info)))
> > +   return -EFAULT;
> > +
> > +   memcpy(chinfo.name, ept_fd_info.name, RPMSG_NAME_SIZE);
> > +   chinfo.name[RPMSG_NAME_SIZE - 1] = '\0';
> > +   chinfo.src = ept_fd_info.src;
> > +   chinfo.dst = ept_fd_info.dst;
> > +   }
> >  

[snip]

Thanks,

Dawei



[RFC PATCH v1 4/6] mm: Introduce arch_in_lazy_mmu_mode()

2025-05-30 Thread Ryan Roberts
Introduce new arch_in_lazy_mmu_mode() API, which returns true if the
calling context is currently in lazy mmu mode or false otherwise. Each
arch that supports lazy mmu mode must provide an implementation of this
API.

The API will shortly be used to prevent accidental lazy mmu mode nesting
when performing an allocation, and will additionally be used to ensure
pte modification vs tlb flushing order does not get inadvertently
swapped.

Signed-off-by: Ryan Roberts 
---
 arch/arm64/include/asm/pgtable.h  |  8 
 .../powerpc/include/asm/book3s/64/tlbflush-hash.h | 15 +++
 arch/sparc/include/asm/tlbflush_64.h  |  1 +
 arch/sparc/mm/tlb.c   | 12 
 arch/x86/include/asm/paravirt.h   |  5 +
 arch/x86/include/asm/paravirt_types.h |  1 +
 arch/x86/kernel/paravirt.c|  6 ++
 arch/x86/xen/mmu_pv.c |  6 ++
 include/linux/pgtable.h   |  1 +
 9 files changed, 55 insertions(+)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 5285757ee0c1..add75dee49f5 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -119,6 +119,14 @@ static inline void arch_leave_lazy_mmu_mode(void)
clear_thread_flag(TIF_LAZY_MMU);
 }
 
+static inline bool arch_in_lazy_mmu_mode(void)
+{
+   if (in_interrupt())
+   return false;
+
+   return test_thread_flag(TIF_LAZY_MMU);
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
 
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h 
b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
index 146287d9580f..4123a9da32cc 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
@@ -57,6 +57,21 @@ static inline void arch_leave_lazy_mmu_mode(void)
 
 #define arch_flush_lazy_mmu_mode()  do {} while (0)
 
+static inline bool arch_in_lazy_mmu_mode(void)
+{
+   struct ppc64_tlb_batch *batch;
+   bool active;
+
+   if (radix_enabled())
+   return false;
+
+   batch = get_cpu_ptr(&ppc64_tlb_batch);
+   active = batch->active;
+   put_cpu_ptr(&ppc64_tlb_batch);
+
+   return active;
+}
+
 extern void hash__tlbiel_all(unsigned int action);
 
 extern void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize,
diff --git a/arch/sparc/include/asm/tlbflush_64.h 
b/arch/sparc/include/asm/tlbflush_64.h
index 8b8cdaa69272..204bc957df9e 100644
--- a/arch/sparc/include/asm/tlbflush_64.h
+++ b/arch/sparc/include/asm/tlbflush_64.h
@@ -45,6 +45,7 @@ void flush_tlb_pending(void);
 void arch_enter_lazy_mmu_mode(void);
 void arch_leave_lazy_mmu_mode(void);
 #define arch_flush_lazy_mmu_mode()  do {} while (0)
+bool arch_in_lazy_mmu_mode(void);
 
 /* Local cpu only.  */
 void __flush_tlb_all(void);
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index a35ddcca5e76..83ab4ba4f4fb 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -69,6 +69,18 @@ void arch_leave_lazy_mmu_mode(void)
preempt_enable();
 }
 
+bool arch_in_lazy_mmu_mode(void)
+{
+   struct tlb_batch *tb;
+   bool active;
+
+   tb = get_cpu_ptr(&tlb_batch);
+   active = tb->active;
+   put_cpu_ptr(&tlb_batch);
+
+   return active;
+}
+
 static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr,
  bool exec, unsigned int hugepage_shift)
 {
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index b5e59a7ba0d0..c7ea3ccb8a41 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -542,6 +542,11 @@ static inline void arch_flush_lazy_mmu_mode(void)
PVOP_VCALL0(mmu.lazy_mode.flush);
 }
 
+static inline bool arch_in_lazy_mmu_mode(void)
+{
+   return PVOP_CALL0(bool, mmu.lazy_mode.in);
+}
+
 static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
phys_addr_t phys, pgprot_t flags)
 {
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 37a8627d8277..41001ca9d010 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -46,6 +46,7 @@ struct pv_lazy_ops {
void (*enter)(void);
void (*leave)(void);
void (*flush)(void);
+   bool (*in)(void);
 } __no_randomize_layout;
 #endif
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ab3e172dcc69..9af1a04a47fd 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -106,6 +106,11 @@ static noinstr void pv_native_set_debugreg(int regno, 
unsigned long val)
 {
native_set_debugreg(regno, val);
 }
+
+static noinstr bool paravirt_retfalse(void)
+{
+   return false;
+}
 #endif
 
 struct pv_info pv_info = {
@@ -228,6 +233,7 @@ struc

[RFC PATCH v1 3/6] mm: Avoid calling page allocator from apply_to_page_range()

2025-05-30 Thread Ryan Roberts
Lazy mmu mode applies to the current task and permits pte modifications
to be deferred and updated at a later time in a batch to improve
performance. apply_to_page_range() calls its callback in lazy mmu mode
and some of those callbacks call into the page allocator to either
allocate or free pages.

This is problematic with CONFIG_DEBUG_PAGEALLOC because
debug_pagealloc_[un]map_pages() calls the arch implementation of
__kernel_map_pages() which must modify the ptes for the linear map.

There are two possibilities at this point:

 - If the arch implementation modifies the ptes directly without first
   entering lazy mmu mode, the pte modifications may get deferred until
   the existing lazy mmu mode is exited. This could result in taking
   spurious faults for example.

 - If the arch implementation enters a nested lazy mmu mode before
   modification of the ptes (many arches use apply_to_page_range()),
   then the linear map updates will definitely be applied upon leaving
   the inner lazy mmu mode. But because lazy mmu mode does not support
   nesting, the remainder of the outer user is no longer in lazy mmu
   mode and the optimization opportunity is lost.

So let's just ensure that the page allocator is never called from within
lazy mmu mode. New "_nolazy" variants of apply_to_page_range() and
apply_to_existing_page_range() are introduced which don't enter lazy mmu
mode. Then users which need to call into the page allocator within their
callback are updated to use the _nolazy variants.

Signed-off-by: Ryan Roberts 
---
 include/linux/mm.h |  6 ++
 kernel/bpf/arena.c |  6 +++---
 mm/kasan/shadow.c  |  2 +-
 mm/memory.c| 54 +++---
 4 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e51dba8398f7..11cae6ce04ff 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3743,9 +3743,15 @@ static inline bool gup_can_follow_protnone(struct 
vm_area_struct *vma,
 typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
   unsigned long size, pte_fn_t fn, void *data);
+extern int apply_to_page_range_nolazy(struct mm_struct *mm,
+ unsigned long address, unsigned long size,
+ pte_fn_t fn, void *data);
 extern int apply_to_existing_page_range(struct mm_struct *mm,
   unsigned long address, unsigned long size,
   pte_fn_t fn, void *data);
+extern int apply_to_existing_page_range_nolazy(struct mm_struct *mm,
+  unsigned long address, unsigned long size,
+  pte_fn_t fn, void *data);
 
 #ifdef CONFIG_PAGE_POISONING
 extern void __kernel_poison_pages(struct page *page, int numpages);
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 0d56cea71602..ca833cfeefb7 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -187,10 +187,10 @@ static void arena_map_free(struct bpf_map *map)
/*
 * free_vm_area() calls remove_vm_area() that calls 
free_unmap_vmap_area().
 * It unmaps everything from vmalloc area and clears pgtables.
-* Call apply_to_existing_page_range() first to find populated ptes and
-* free those pages.
+* Call apply_to_existing_page_range_nolazy() first to find populated
+* ptes and free those pages.
 */
-   apply_to_existing_page_range(&init_mm, 
bpf_arena_get_kern_vm_start(arena),
+   apply_to_existing_page_range_nolazy(&init_mm, 
bpf_arena_get_kern_vm_start(arena),
 KERN_VM_SZ - GUARD_SZ, existing_page_cb, 
NULL);
free_vm_area(arena->kern_vm);
range_tree_destroy(&arena->rt);
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d2c70cd2afb1..2325c5166c3a 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -590,7 +590,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned 
long end,
 
 
if (flags & KASAN_VMALLOC_PAGE_RANGE)
-   apply_to_existing_page_range(&init_mm,
+   apply_to_existing_page_range_nolazy(&init_mm,
 (unsigned long)shadow_start,
 size, kasan_depopulate_vmalloc_pte,
 NULL);
diff --git a/mm/memory.c b/mm/memory.c
index 49199410805c..24436074ce48 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2913,7 +2913,7 @@ EXPORT_SYMBOL(vm_iomap_memory);
 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 unsigned long addr, unsigned long end,
 pte_fn_t fn, void *data, bool create,
-pgtbl_mod_mask *mask)
+ 

[RFC PATCH v1 1/6] fs/proc/task_mmu: Fix pte update and tlb maintenance ordering in pagemap_scan_pmd_entry()

2025-05-30 Thread Ryan Roberts
pagemap_scan_pmd_entry() was previously modifying ptes while in lazy mmu
mode, then performing tlb maintenance for the modified ptes, then
leaving lazy mmu mode. But any pte modifications during lazy mmu mode
may be deferred until arch_leave_lazy_mmu_mode(), inverting the required
ordering between pte modification and tlb maintenance.

Let's fix that by leaving mmu mode, forcing all the pte updates to be
actioned, before doing the tlb maintenance.

This is a theoretical bug discovered during code review.

Fixes: 52526ca7fdb9 ("fs/proc/task_mmu: implement IOCTL to get and optionally 
clear info about PTEs")
Signed-off-by: Ryan Roberts 
---
 fs/proc/task_mmu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 994cde10e3f4..361f3ffd9a0c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2557,10 +2557,9 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned 
long start,
}
 
 flush_and_return:
+   arch_leave_lazy_mmu_mode();
if (flush_end)
flush_tlb_range(vma, start, addr);
-
-   arch_leave_lazy_mmu_mode();
pte_unmap_unlock(start_pte, ptl);
 
cond_resched();
-- 
2.43.0




[RFC PATCH v1 2/6] mm: Fix pte update and tlb maintenance ordering in migrate_vma_collect_pmd()

2025-05-30 Thread Ryan Roberts
migrate_vma_collect_pmd() was previously modifying ptes while in lazy
mmu mode, then performing tlb maintenance for the modified ptes, then
leaving lazy mmu mode. But any pte modifications during lazy mmu mode
may be deferred until arch_leave_lazy_mmu_mode(), inverting the required
ordering between pte modification and tlb maintenance.

Let's fix that by leaving mmu mode (forcing all the pte updates to be
actioned) before doing the tlb maintenance.

This is a theoretical bug discovered during code review.

Fixes: 60bae7370896 ("mm/migrate_device.c: flush TLB while holding PTL")
Signed-off-by: Ryan Roberts 
---
 mm/migrate_device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 3158afe7eb23..fc73a940c112 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -283,11 +283,12 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
migrate->src[migrate->npages++] = mpfn;
}
 
+   arch_leave_lazy_mmu_mode();
+
/* Only flush the TLB if we actually modified any entries */
if (unmapped)
flush_tlb_range(walk->vma, start, end);
 
-   arch_leave_lazy_mmu_mode();
pte_unmap_unlock(ptep - 1, ptl);
 
return 0;
-- 
2.43.0




[RFC PATCH v1 0/6] Lazy mmu mode fixes and improvements

2025-05-30 Thread Ryan Roberts
Hi All,

I recently added support for lazy mmu mode on arm64. The series is now in
Linus's tree so should be in v6.16-rc1. But during testing in linux-next we
found some ugly corners (unexpected nesting). I was able to fix those issues by
making the arm64 implementation more permissive (like the other arches). But
this is quite fragile IMHO. So I'd rather fix the root cause and ensure that
lazy mmu mode never nests, and more importantly, that code never makes pgtable
modifications expecting them to be immediate, not knowing that it's actually in
lazy mmu mode so the changes get deferred.

The first 2 patches are unrelated, very obvious bug fixes. They don't affect
arm64 because arm64 only uses lazy mmu for kernel mappings. But I noticed them
during code review and think they should be fixed.

The next 3 patches are aimed at solving the nesting issue.

And the final patch is reverting the "permissive" fix I did for arm64, which is
no longer needed after the previous 3 patches.

I've labelled this RFC for now because it depends on the arm64 lazy mmu patches
in Linus's master, so it won't apply to mm-unstable. But I'm keen to get review
and since I'm touching various arches and modifying some core mm stuff, I
thought that might take a while so thought I'd beat the rush and get a first
version out early.

I've build-tested all the affected arches. And I've run mm selftests for the
arm64 build, with no issues (with DEBUG_PAGEALLOC and KFENCE enabled).

Applies against Linus's master branch (f66bc387efbe).

Thanks,
Ryan


Ryan Roberts (6):
  fs/proc/task_mmu: Fix pte update and tlb maintenance ordering in
pagemap_scan_pmd_entry()
  mm: Fix pte update and tlb maintenance ordering in
migrate_vma_collect_pmd()
  mm: Avoid calling page allocator from apply_to_page_range()
  mm: Introduce arch_in_lazy_mmu_mode()
  mm: Avoid calling page allocator while in lazy mmu mode
  Revert "arm64/mm: Permit lazy_mmu_mode to be nested"

 arch/arm64/include/asm/pgtable.h  | 22 
 .../include/asm/book3s/64/tlbflush-hash.h | 15 ++
 arch/sparc/include/asm/tlbflush_64.h  |  1 +
 arch/sparc/mm/tlb.c   | 12 +
 arch/x86/include/asm/paravirt.h   |  5 ++
 arch/x86/include/asm/paravirt_types.h |  1 +
 arch/x86/kernel/paravirt.c|  6 +++
 arch/x86/xen/mmu_pv.c |  6 +++
 fs/proc/task_mmu.c|  3 +-
 include/asm-generic/tlb.h |  2 +
 include/linux/mm.h|  6 +++
 include/linux/pgtable.h   |  1 +
 kernel/bpf/arena.c|  6 +--
 mm/kasan/shadow.c |  2 +-
 mm/memory.c   | 54 ++-
 mm/migrate_device.c   |  3 +-
 mm/mmu_gather.c   | 15 ++
 17 files changed, 128 insertions(+), 32 deletions(-)

--
2.43.0




[RFC PATCH v1 6/6] Revert "arm64/mm: Permit lazy_mmu_mode to be nested"

2025-05-30 Thread Ryan Roberts
Commit 491344301b25 ("arm64/mm: Permit lazy_mmu_mode to be nested") made
the arm64 implementation of lazy_mmu_mode tolerant to nesting. But
subsequent commits have fixed the core code to ensure that lazy_mmu_mode
never gets nested (as originally intended). Therefore we can revert this
commit and reinstate the VM_WARN() if nesting is detected in future.

Signed-off-by: Ryan Roberts 
---
 arch/arm64/include/asm/pgtable.h | 14 ++
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index add75dee49f5..dcf0adbeb803 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -83,21 +83,11 @@ static inline void queue_pte_barriers(void)
 #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
 static inline void arch_enter_lazy_mmu_mode(void)
 {
-   /*
-* lazy_mmu_mode is not supposed to permit nesting. But in practice this
-* does happen with CONFIG_DEBUG_PAGEALLOC, where a page allocation
-* inside a lazy_mmu_mode section (such as zap_pte_range()) will change
-* permissions on the linear map with apply_to_page_range(), which
-* re-enters lazy_mmu_mode. So we tolerate nesting in our
-* implementation. The first call to arch_leave_lazy_mmu_mode() will
-* flush and clear the flag such that the remainder of the work in the
-* outer nest behaves as if outside of lazy mmu mode. This is safe and
-* keeps tracking simple.
-*/
-
if (in_interrupt())
return;
 
+   VM_WARN_ON(test_thread_flag(TIF_LAZY_MMU));
+
set_thread_flag(TIF_LAZY_MMU);
 }
 
-- 
2.43.0




[RFC PATCH v1 5/6] mm: Avoid calling page allocator while in lazy mmu mode

2025-05-30 Thread Ryan Roberts
Lazy mmu mode applies to the current task and permits pte modifications
to be deferred and updated at a later time in a batch to improve
performance. tlb_next_batch() is called in lazy mmu mode as follows:

zap_pte_range
  arch_enter_lazy_mmu_mode
  do_zap_pte_range
zap_present_ptes
  zap_present_folio_ptes
__tlb_remove_folio_pages
  __tlb_remove_folio_pages_size
tlb_next_batch
  arch_leave_lazy_mmu_mode

tlb_next_batch() may call into the page allocator which is problematic
with CONFIG_DEBUG_PAGEALLOC because debug_pagealloc_[un]map_pages()
calls the arch implementation of __kernel_map_pages() which must modify
the ptes for the linear map.

There are two possibilities at this point:

- If the arch implementation modifies the ptes directly without first
  entering lazy mmu mode, the pte modifications may get deferred until
  the existing lazy mmu mode is exited. This could result in taking
  spurious faults for example.

- If the arch implementation enters a nested lazy mmu mode before
  modification of the ptes (many arches use apply_to_page_range()),
  then the linear map updates will definitely be applied upon leaving
  the inner lazy mmu mode. But because lazy mmu mode does not support
  nesting, the remainder of the outer user is no longer in lazy mmu
  mode and the optimization opportunity is lost.

So let's just ensure that the page allocator is never called from within
lazy mmu mode. Use the new arch_in_lazy_mmu_mode() API to check if we
are in lazy mmu mode, and if so, when calling into the page allocator,
temporarily leave lazy mmu mode.

Given this new API we can also add VM_WARNings to check that we exit
lazy mmu mode when required to ensure the PTEs are actually updated
prior to tlb flushing.

Signed-off-by: Ryan Roberts 
---
 include/asm-generic/tlb.h |  2 ++
 mm/mmu_gather.c   | 15 +++
 2 files changed, 17 insertions(+)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 88a42973fa47..84fb269b78a5 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -469,6 +469,8 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct 
vm_area_struct *vma)
 
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
+   VM_WARN_ON(arch_in_lazy_mmu_mode());
+
/*
 * Anything calling __tlb_adjust_range() also sets at least one of
 * these bits.
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index db7ba4a725d6..0bd1e69b048b 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -18,6 +18,7 @@
 static bool tlb_next_batch(struct mmu_gather *tlb)
 {
struct mmu_gather_batch *batch;
+   bool lazy_mmu;
 
/* Limit batching if we have delayed rmaps pending */
if (tlb->delayed_rmap && tlb->active != &tlb->local)
@@ -32,7 +33,15 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
return false;
 
+   lazy_mmu = arch_in_lazy_mmu_mode();
+   if (lazy_mmu)
+   arch_leave_lazy_mmu_mode();
+
batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+
+   if (lazy_mmu)
+   arch_enter_lazy_mmu_mode();
+
if (!batch)
return false;
 
@@ -145,6 +154,8 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
 {
struct mmu_gather_batch *batch;
 
+   VM_WARN_ON(arch_in_lazy_mmu_mode());
+
for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
__tlb_batch_free_encoded_pages(batch);
tlb->active = &tlb->local;
@@ -154,6 +165,8 @@ static void tlb_batch_list_free(struct mmu_gather *tlb)
 {
struct mmu_gather_batch *batch, *next;
 
+   VM_WARN_ON(arch_in_lazy_mmu_mode());
+
for (batch = tlb->local.next; batch; batch = next) {
next = batch->next;
free_pages((unsigned long)batch, 0);
@@ -363,6 +376,8 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 {
struct mmu_table_batch **batch = &tlb->batch;
 
+   VM_WARN_ON(arch_in_lazy_mmu_mode());
+
if (*batch == NULL) {
*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | 
__GFP_NOWARN);
if (*batch == NULL) {
-- 
2.43.0




Re: [PATCH v3 3/3] rpmsg: ctrl: Introduce RPMSG_CREATE_EPT_FD_IOCTL uAPI

2025-05-30 Thread Beleswar Prasad Padhi
Hi Dawei,

On 19/05/25 20:38, Dawei Li wrote:
> Implement RPMSG_CREATE_EPT_FD_IOCTL, new uAPI for rpmsg ctrl, which
> shares most of operations of RPMSG_CREATE_EPT_IOCTL except that it
> returns fd representing eptdev to userspace directly.
>
> Possible calling procedures for userspace are:
> - fd = open("/dev/rpmsg_ctrlX")
> - ioctl(fd, RPMSG_CREATE_EPT_FD_IOCTL, &info);
> - fd_ep = info.fd


We are returning a new fd to userspace from inside an IOCTL itself. Is this a
standard way of doing things in Kernel space? (see below related comment)

> - operations on fd_ep(write, read, poll ioctl)
> - ioctl(fd_ep, RPMSG_DESTROY_EPT_IOCTL)
> - close(fd_ep)


Can we rely on the userspace to close() the fd_ep? (if not done, it could be a
memory leak..). As opposed to fd, which we can rely on the userspace to
close() since they initiated the open() call. I am just trying to understand if
this is a standard way of doing things...

> - close(fd)
>
> Signed-off-by: Dawei Li 
> ---
>  drivers/rpmsg/rpmsg_ctrl.c | 38 ++
>  include/uapi/linux/rpmsg.h | 24 
>  2 files changed, 54 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/rpmsg/rpmsg_ctrl.c b/drivers/rpmsg/rpmsg_ctrl.c
> index 28f57945ccd9..9f2f118ceb7b 100644
> --- a/drivers/rpmsg/rpmsg_ctrl.c
> +++ b/drivers/rpmsg/rpmsg_ctrl.c
> @@ -75,19 +75,32 @@ static long rpmsg_ctrldev_ioctl(struct file *fp, unsigned 
> int cmd,
>   unsigned long arg)
>  {
>   struct rpmsg_ctrldev *ctrldev = fp->private_data;
> + struct rpmsg_endpoint_fd_info ept_fd_info;
>   void __user *argp = (void __user *)arg;
>   struct rpmsg_endpoint_info eptinfo;
>   struct rpmsg_channel_info chinfo;
>   struct rpmsg_device *rpdev;
>   int ret = 0;
> -
> - if (copy_from_user(&eptinfo, argp, sizeof(eptinfo)))
> - return -EFAULT;
> -
> - memcpy(chinfo.name, eptinfo.name, RPMSG_NAME_SIZE);
> - chinfo.name[RPMSG_NAME_SIZE - 1] = '\0';
> - chinfo.src = eptinfo.src;
> - chinfo.dst = eptinfo.dst;
> + int fd = -1;
> +
> + if (cmd == RPMSG_CREATE_EPT_IOCTL || cmd == RPMSG_CREATE_DEV_IOCTL ||
> + cmd == RPMSG_RELEASE_DEV_IOCTL) {
> + if (copy_from_user(&eptinfo, argp, sizeof(eptinfo)))
> + return -EFAULT;
> +
> + memcpy(chinfo.name, eptinfo.name, RPMSG_NAME_SIZE);
> + chinfo.name[RPMSG_NAME_SIZE - 1] = '\0';
> + chinfo.src = eptinfo.src;
> + chinfo.dst = eptinfo.dst;
> + } else if (cmd == RPMSG_CREATE_EPT_FD_IOCTL) {


Maybe we can put this 'else if condition' in the first 'if' and treat other
conditions under 'else', as 'RPMSG_CREATE_EPT_FD_IOCTL' is the only
ioctl with a different struct type.

Thanks,
Beleswar

> + if (copy_from_user(&ept_fd_info, argp, sizeof(ept_fd_info)))
> + return -EFAULT;
> +
> + memcpy(chinfo.name, ept_fd_info.name, RPMSG_NAME_SIZE);
> + chinfo.name[RPMSG_NAME_SIZE - 1] = '\0';
> + chinfo.src = ept_fd_info.src;
> + chinfo.dst = ept_fd_info.dst;
> + }
>  
>   mutex_lock(&ctrldev->ctrl_lock);
>   switch (cmd) {
> @@ -110,6 +123,15 @@ static long rpmsg_ctrldev_ioctl(struct file *fp, 
> unsigned int cmd,
>   chinfo.name, ret);
>   break;
>  
> + case RPMSG_CREATE_EPT_FD_IOCTL:
> + ret = rpmsg_anonymous_eptdev_create(ctrldev->rpdev, 
> &ctrldev->dev, chinfo,
> + ept_fd_info.flags, &fd);
> + if (!ret) {
> + ept_fd_info.fd = fd;
> + ret = copy_to_user(argp, &ept_fd_info, 
> sizeof(ept_fd_info));
> + }
> + break;
> +
>   default:
>   ret = -EINVAL;
>   }
> diff --git a/include/uapi/linux/rpmsg.h b/include/uapi/linux/rpmsg.h
> index f0c8da2b185b..e7057bd23577 100644
> --- a/include/uapi/linux/rpmsg.h
> +++ b/include/uapi/linux/rpmsg.h
> @@ -53,4 +53,28 @@ struct rpmsg_endpoint_info {
>   */
>  #define RPMSG_SET_INCOMING_FLOWCONTROL _IOR(0xb5, 0x6, int)
>  
> +/**
> + * struct rpmsg_endpoint_fd_info - endpoint & fd info representation
> + * @name: name of service
> + * @src: local address. To set to RPMSG_ADDR_ANY if not used.
> + * @dst: destination address. To set to RPMSG_ADDR_ANY if not used.
> + * @flags: file flags of endpoint device, valid flags:
> + * O_RDONLY/O_WRONLY/O_RDWR
> + * O_NONBLOCK
> + * O_CLOEXEC
> + * @fd: fd returned from driver
> + */
> +struct rpmsg_endpoint_fd_info {
> + char name[32];
> + __u32 src;
> + __u32 dst;
> + __u32 flags;
> + __s32 fd;
> +};
> +
> +/**
> + * Instantiate a new rmpsg endpoint which is represented by fd
> + */
> +#define RPMSG_CREATE_EPT_FD_IOCTL _IOWR(0xb5, 0x7, struct 
> rpmsg_endpoint_fd_info)
> +
>  #endif



Re: [PATCH v3 1/1] selftests/mm/uffd: Refactor non-composite global vars into struct

2025-05-30 Thread Ujwal Kundur
> Sorry I don't have an opinion on which of these is the best (I can try
> to find some time to form an opionion on this later!), but:
>
> Fixing the flakiness sounds great, but I would suggest decoupling that
> from the refactoring. If it's practical, focus on removing the globals
> first, while leaving the fundamental logic the same, even if it's bad.
> Then as a separate series, fix the logic.

Thanks, much appreciated. I'll send a patch with the refactoring + a comment
regarding this variable left-in and follow-up with a fix.



Re: [RFC PATCH v1 0/6] Lazy mmu mode fixes and improvements

2025-05-30 Thread Lorenzo Stoakes
+cc Jann who is a specialist in all things page table-y and especially scary
edge cases :)

On Fri, May 30, 2025 at 03:04:38PM +0100, Ryan Roberts wrote:
> Hi All,
>
> I recently added support for lazy mmu mode on arm64. The series is now in
> Linus's tree so should be in v6.16-rc1. But during testing in linux-next we
> found some ugly corners (unexpected nesting). I was able to fix those issues 
> by
> making the arm64 implementation more permissive (like the other arches). But
> this is quite fragile IMHO. So I'd rather fix the root cause and ensure that
> lazy mmu mode never nests, and more importantly, that code never makes pgtable
> modifications expecting them to be immediate, not knowing that it's actually 
> in
> lazy mmu mode so the changes get deferred.

When you say fragile, are you confident it _works_ but perhaps not quite as well
as you want? Or are you concerned this might be broken upstream in any way?

I am thinking specifically about the proposed use in Dev's new series [0] and
obviously hoping (and assuming in fact) that it's the former :)

[0]: https://lore.kernel.org/linux-mm/20250530090407.19237-1-dev.j...@arm.com/

>
> The first 2 patches are unrelated, very obvious bug fixes. They don't affect
> arm64 because arm64 only uses lazy mmu for kernel mappings. But I noticed them
> during code review and think they should be fixed.
>
> The next 3 patches are aimed at solving the nesting issue.
>
> And the final patch is reverting the "permissive" fix I did for arm64, which 
> is
> no longer needed after the previous 3 patches.
>
> I've labelled this RFC for now because it depends on the arm64 lazy mmu 
> patches
> in Linus's master, so it won't apply to mm-unstable. But I'm keen to get 
> review
> and siince I'm touching various arches and modifying some core mm stuff, I
> thought that might take a while so thought I'd beat the rush and get a first
> version out early.
>
> I've build-tested all the affected arches. And I've run mm selftests for the
> arm64 build, with no issues (with DEBUG_PAGEALLOC and KFENCE enabled).
>
> Applies against Linus's master branch (f66bc387efbe).
>
> Thanks,
> Ryan
>
>
> Ryan Roberts (6):
>   fs/proc/task_mmu: Fix pte update and tlb maintenance ordering in
> pagemap_scan_pmd_entry()
>   mm: Fix pte update and tlb maintenance ordering in
> migrate_vma_collect_pmd()
>   mm: Avoid calling page allocator from apply_to_page_range()
>   mm: Introduce arch_in_lazy_mmu_mode()
>   mm: Avoid calling page allocator while in lazy mmu mode
>   Revert "arm64/mm: Permit lazy_mmu_mode to be nested"
>
>  arch/arm64/include/asm/pgtable.h  | 22 
>  .../include/asm/book3s/64/tlbflush-hash.h | 15 ++
>  arch/sparc/include/asm/tlbflush_64.h  |  1 +
>  arch/sparc/mm/tlb.c   | 12 +
>  arch/x86/include/asm/paravirt.h   |  5 ++
>  arch/x86/include/asm/paravirt_types.h |  1 +
>  arch/x86/kernel/paravirt.c|  6 +++
>  arch/x86/xen/mmu_pv.c |  6 +++
>  fs/proc/task_mmu.c|  3 +-
>  include/asm-generic/tlb.h |  2 +
>  include/linux/mm.h|  6 +++
>  include/linux/pgtable.h   |  1 +
>  kernel/bpf/arena.c|  6 +--
>  mm/kasan/shadow.c |  2 +-
>  mm/memory.c   | 54 ++-
>  mm/migrate_device.c   |  3 +-
>  mm/mmu_gather.c   | 15 ++
>  17 files changed, 128 insertions(+), 32 deletions(-)
>
> --
> 2.43.0
>



[PATCH v3 2/4] media: qcom: camss: Add support for MSM8939

2025-05-30 Thread Vincent Knecht via B4 Relay
From: Vincent Knecht 

The camera subsystem for the MSM8939 is the same as MSM8916 except with
3 CSIDs instead of 2, and some higher clock rates.

As a quirk, this SoC needs writing values to 2 VFE VBIF registers
(see downstream msm8939-camera.dtsi vbif-{regs,settings} properties).
This fixes black stripes across sensor and garbage in CSID TPG outputs.

Add support for the MSM8939 camera subsystem.

Signed-off-by: Vincent Knecht 
---
 drivers/media/platform/qcom/camss/camss-csiphy.c   |   1 +
 drivers/media/platform/qcom/camss/camss-ispif.c|   8 +-
 drivers/media/platform/qcom/camss/camss-vfe-vbif.c |   6 +
 drivers/media/platform/qcom/camss/camss-vfe.c  |   1 +
 drivers/media/platform/qcom/camss/camss.c  | 157 +
 drivers/media/platform/qcom/camss/camss.h  |   1 +
 6 files changed, 172 insertions(+), 2 deletions(-)

diff --git a/drivers/media/platform/qcom/camss/camss-csiphy.c 
b/drivers/media/platform/qcom/camss/camss-csiphy.c
index 
c622efcc92ff3781d7fc3ace0253c2d64c91e847..6311fc2975aa1345e430a477c8a6476f1d7e5663
 100644
--- a/drivers/media/platform/qcom/camss/camss-csiphy.c
+++ b/drivers/media/platform/qcom/camss/camss-csiphy.c
@@ -605,6 +605,7 @@ int msm_csiphy_subdev_init(struct camss *camss,
return PTR_ERR(csiphy->base);
 
if (camss->res->version == CAMSS_8x16 ||
+   camss->res->version == CAMSS_8x39 ||
camss->res->version == CAMSS_8x53 ||
camss->res->version == CAMSS_8x96) {
csiphy->base_clk_mux =
diff --git a/drivers/media/platform/qcom/camss/camss-ispif.c 
b/drivers/media/platform/qcom/camss/camss-ispif.c
index 
2dc585c6123dd248a5bacd9c7a88cb5375644311..aaf3caa42d33dcb641651e7f5bc0c2a564d85bfa
 100644
--- a/drivers/media/platform/qcom/camss/camss-ispif.c
+++ b/drivers/media/platform/qcom/camss/camss-ispif.c
@@ -1112,6 +1112,8 @@ int msm_ispif_subdev_init(struct camss *camss,
/* Number of ISPIF lines - same as number of CSID hardware modules */
if (camss->res->version == CAMSS_8x16)
ispif->line_num = 2;
+   else if (camss->res->version == CAMSS_8x39)
+   ispif->line_num = 3;
else if (camss->res->version == CAMSS_8x96 ||
 camss->res->version == CAMSS_8x53 ||
 camss->res->version == CAMSS_660)
@@ -1128,7 +1130,8 @@ int msm_ispif_subdev_init(struct camss *camss,
ispif->line[i].ispif = ispif;
ispif->line[i].id = i;
 
-   if (camss->res->version == CAMSS_8x16) {
+   if (camss->res->version == CAMSS_8x16 ||
+   camss->res->version == CAMSS_8x39) {
ispif->line[i].formats = ispif_formats_8x16;
ispif->line[i].nformats =
ARRAY_SIZE(ispif_formats_8x16);
@@ -1162,7 +1165,8 @@ int msm_ispif_subdev_init(struct camss *camss,
ispif->irq = ret;
snprintf(ispif->irq_name, sizeof(ispif->irq_name), "%s_%s",
 dev_name(dev), MSM_ISPIF_NAME);
-   if (camss->res->version == CAMSS_8x16)
+   if (camss->res->version == CAMSS_8x16 ||
+   camss->res->version == CAMSS_8x39)
ret = devm_request_irq(dev, ispif->irq, ispif_isr_8x16,
   IRQF_TRIGGER_RISING, ispif->irq_name, ispif);
else if (camss->res->version == CAMSS_8x96 ||
diff --git a/drivers/media/platform/qcom/camss/camss-vfe-vbif.c 
b/drivers/media/platform/qcom/camss/camss-vfe-vbif.c
index 
691335f231a6001e6c535431a18b2e21ddc832c9..911f8da02f1fbb500ab9564978e2b0dddf93e84e
 100644
--- a/drivers/media/platform/qcom/camss/camss-vfe-vbif.c
+++ b/drivers/media/platform/qcom/camss/camss-vfe-vbif.c
@@ -14,6 +14,9 @@
 #include "camss-vfe.h"
 #include "camss-vfe-vbif.h"
 
+#define VBIF_FIXED_SORT_EN 0x30
+#define VBIF_FIXED_SORT_SEL0   0x34
+
 void vfe_vbif_write_reg(struct vfe_device *vfe, u32 reg, u32 val)
 {
writel_relaxed(val, vfe->vbif_base + reg);
@@ -21,5 +24,8 @@ void vfe_vbif_write_reg(struct vfe_device *vfe, u32 reg, u32 
val)
 
 int vfe_vbif_apply_settings(struct vfe_device *vfe)
 {
+   vfe_vbif_write_reg(vfe, VBIF_FIXED_SORT_EN, 0xfff);
+   vfe_vbif_write_reg(vfe, VBIF_FIXED_SORT_SEL0, 0x555000);
+
return 0;
 }
diff --git a/drivers/media/platform/qcom/camss/camss-vfe.c 
b/drivers/media/platform/qcom/camss/camss-vfe.c
index 
3138562d399444c5cf2ae96bf16b75b85ff5c5ca..ac3a9579e3e6910eee8c1ec11c4fff6e1bc94443
 100644
--- a/drivers/media/platform/qcom/camss/camss-vfe.c
+++ b/drivers/media/platform/qcom/camss/camss-vfe.c
@@ -290,6 +290,7 @@ static u32 vfe_src_pad_code(struct vfe_line *line, u32 
sink_code,
 
switch (vfe->camss->res->version) {
case CAMSS_8x16:
+   case CAMSS_8x39:
case CAMSS_8x53:
switch (sink_code) {
case MEDIA_BUS_FMT_YUYV8_1X16:
diff --git a/drivers/media/platform/qcom/camss/camss.c 
b/drivers/media/platform/qcom/cam

[PATCH v3 3/4] media: dt-bindings: Add qcom,msm8939-camss

2025-05-30 Thread Vincent Knecht via B4 Relay
From: Vincent Knecht 

Add bindings for qcom,msm8939-camss in order to support the camera
subsystem for MSM8939.

Signed-off-by: Vincent Knecht 
---
 .../bindings/media/qcom,msm8939-camss.yaml | 253 +
 1 file changed, 253 insertions(+)

diff --git a/Documentation/devicetree/bindings/media/qcom,msm8939-camss.yaml 
b/Documentation/devicetree/bindings/media/qcom,msm8939-camss.yaml
new file mode 100644
index 
..592b847433d7a788d8c1635129dd408cb0112073
--- /dev/null
+++ b/Documentation/devicetree/bindings/media/qcom,msm8939-camss.yaml
@@ -0,0 +1,253 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/media/qcom,msm8939-camss.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm MSM8939 Camera Subsystem (CAMSS)
+
+maintainers:
+  - Vincent Knecht 
+
+description:
+  The CAMSS IP is a CSI decoder and ISP present on Qualcomm platforms
+
+properties:
+  compatible:
+const: qcom,msm8939-camss
+
+  reg:
+maxItems: 11
+
+  reg-names:
+items:
+  - const: csid0
+  - const: csid1
+  - const: csid2
+  - const: csiphy0
+  - const: csiphy0_clk_mux
+  - const: csiphy1
+  - const: csiphy1_clk_mux
+  - const: csi_clk_mux
+  - const: ispif
+  - const: vfe0
+  - const: vfe0_vbif
+
+  clocks:
+maxItems: 24
+
+  clock-names:
+items:
+  - const: ahb
+  - const: csi0
+  - const: csi0_ahb
+  - const: csi0_phy
+  - const: csi0_pix
+  - const: csi0_rdi
+  - const: csi1
+  - const: csi1_ahb
+  - const: csi1_phy
+  - const: csi1_pix
+  - const: csi1_rdi
+  - const: csi2
+  - const: csi2_ahb
+  - const: csi2_phy
+  - const: csi2_pix
+  - const: csi2_rdi
+  - const: csiphy0_timer
+  - const: csiphy1_timer
+  - const: csi_vfe0
+  - const: ispif_ahb
+  - const: top_ahb
+  - const: vfe0
+  - const: vfe_ahb
+  - const: vfe_axi
+
+  interrupts:
+maxItems: 7
+
+  interrupt-names:
+items:
+  - const: csid0
+  - const: csid1
+  - const: csid2
+  - const: csiphy0
+  - const: csiphy1
+  - const: ispif
+  - const: vfe0
+
+  iommus:
+maxItems: 1
+
+  power-domains:
+items:
+  - description: VFE GDSC - Video Front End, Global Distributed Switch 
Controller.
+
+  vdda-supply:
+description:
+  Definition of the regulator used as analog power supply.
+
+  ports:
+$ref: /schemas/graph.yaml#/properties/ports
+
+description:
+  CSI input ports.
+
+patternProperties:
+  "^port@[0-1]$":
+$ref: /schemas/graph.yaml#/$defs/port-base
+unevaluatedProperties: false
+
+description:
+  Input port for receiving CSI data.
+
+properties:
+  endpoint:
+$ref: video-interfaces.yaml#
+unevaluatedProperties: false
+
+properties:
+  data-lanes:
+minItems: 1
+maxItems: 4
+
+  bus-type:
+enum:
+  - 4 # MEDIA_BUS_TYPE_CSI2_DPHY
+
+required:
+  - data-lanes
+
+required:
+  - compatible
+  - reg
+  - reg-names
+  - clocks
+  - clock-names
+  - interrupts
+  - interrupt-names
+  - iommus
+  - power-domains
+  - vdda-supply
+  - ports
+
+additionalProperties: false
+
+examples:
+  - |
+#include 
+#include 
+
+isp@1b08000 {
+compatible = "qcom,msm8939-camss";
+
+reg = <0x01b08000 0x100>,
+  <0x01b08400 0x100>,
+  <0x01b08800 0x100>,
+  <0x01b0ac00 0x200>,
+  <0x01b00030 0x4>,
+  <0x01b0b000 0x200>,
+  <0x01b00038 0x4>,
+  <0x01b00020 0x10>,
+  <0x01b0a000 0x500>,
+  <0x01b1 0x1000>,
+  <0x01b4 0x200>;
+
+reg-names = "csid0",
+"csid1",
+"csid2",
+"csiphy0",
+"csiphy0_clk_mux",
+"csiphy1",
+"csiphy1_clk_mux",
+"csi_clk_mux",
+"ispif",
+"vfe0",
+"vfe0_vbif";
+
+clocks = <&gcc GCC_CAMSS_AHB_CLK>,
+ <&gcc GCC_CAMSS_CSI0_CLK>,
+ <&gcc GCC_CAMSS_CSI0_AHB_CLK>,
+ <&gcc GCC_CAMSS_CSI0PHY_CLK>,
+ <&gcc GCC_CAMSS_CSI0PIX_CLK>,
+ <&gcc GCC_CAMSS_CSI0RDI_CLK>,
+ <&gcc GCC_CAMSS_CSI1_CLK>,
+ <&gcc GCC_CAMSS_CSI1_AHB_CLK>,
+ <&gcc GCC_CAMSS_CSI1PHY_CLK>,
+ <&gcc GCC_CAMSS_CSI1PIX_CLK>,
+ <&gcc GCC_CAMSS_CSI1RDI_CLK>,
+ <&gcc GCC_CAMSS_CSI2_CLK>,
+ <&gcc GCC_CAMSS_CSI2_AHB_CLK>,
+ <&gcc GCC_CAMSS_CSI2PHY_CLK>,
+ <&gcc GCC_CAMSS_CSI2PIX_CLK>,
+

Re: [PATCH v2 2/3] virtio-mem: fix multiple typos in struct comments and function docs

2025-05-30 Thread David Hildenbrand

On 29.05.25 10:42, Alok Tiwari wrote:

Corrected several spelling mistakes in code comments, including:
- "bock" -> "block"
- "valued" -> "value"
- "actipn" -> "action"
- "accidentially" -> "accidentally"
- Improved grammar in a few places for clarity.

These changes are purely cosmetic and do not affect functionality.

Signed-off-by: Alok Tiwari 


Reviewed-by: David Hildenbrand 

--
Cheers,

David / dhildenb




[PATCH v3 1/4] media: qcom: camss: vfe: Add VBIF setting support

2025-05-30 Thread Vincent Knecht via B4 Relay
From: Vincent Knecht 

Some devices need values written to VFE VBIF registers.
Add helper functions to do this.

Signed-off-by: Vincent Knecht 
---
 drivers/media/platform/qcom/camss/Makefile |  1 +
 drivers/media/platform/qcom/camss/camss-vfe-4-1.c  | 12 +++
 drivers/media/platform/qcom/camss/camss-vfe-vbif.c | 25 ++
 drivers/media/platform/qcom/camss/camss-vfe-vbif.h | 19 
 drivers/media/platform/qcom/camss/camss-vfe.c  |  9 
 drivers/media/platform/qcom/camss/camss-vfe.h  |  3 +++
 6 files changed, 69 insertions(+)

diff --git a/drivers/media/platform/qcom/camss/Makefile 
b/drivers/media/platform/qcom/camss/Makefile
index 
d26a9c24a430a831e0d865db4d96142da5276653..4c66d29ae505ae5adc717ae98f77fb736a6e15b9
 100644
--- a/drivers/media/platform/qcom/camss/Makefile
+++ b/drivers/media/platform/qcom/camss/Makefile
@@ -21,6 +21,7 @@ qcom-camss-objs += \
camss-vfe-680.o \
camss-vfe-780.o \
camss-vfe-gen1.o \
+   camss-vfe-vbif.o \
camss-vfe.o \
camss-video.o \
camss-format.o \
diff --git a/drivers/media/platform/qcom/camss/camss-vfe-4-1.c 
b/drivers/media/platform/qcom/camss/camss-vfe-4-1.c
index 
901677293d971cf761944a660ef719af38203f22..9cf1ccdb2fe7ca9bf89b746af836e1035b457a8f
 100644
--- a/drivers/media/platform/qcom/camss/camss-vfe-4-1.c
+++ b/drivers/media/platform/qcom/camss/camss-vfe-4-1.c
@@ -15,6 +15,7 @@
 #include "camss.h"
 #include "camss-vfe.h"
 #include "camss-vfe-gen1.h"
+#include "camss-vfe-vbif.h"
 
 #define VFE_0_HW_VERSION   0x000
 
@@ -733,6 +734,7 @@ static void vfe_set_qos(struct vfe_device *vfe)
 {
u32 val = VFE_0_BUS_BDG_QOS_CFG_0_CFG;
u32 val7 = VFE_0_BUS_BDG_QOS_CFG_7_CFG;
+   int ret;
 
writel_relaxed(val, vfe->base + VFE_0_BUS_BDG_QOS_CFG_0);
writel_relaxed(val, vfe->base + VFE_0_BUS_BDG_QOS_CFG_1);
@@ -742,6 +744,16 @@ static void vfe_set_qos(struct vfe_device *vfe)
writel_relaxed(val, vfe->base + VFE_0_BUS_BDG_QOS_CFG_5);
writel_relaxed(val, vfe->base + VFE_0_BUS_BDG_QOS_CFG_6);
writel_relaxed(val7, vfe->base + VFE_0_BUS_BDG_QOS_CFG_7);
+
+   /* SoC-specific VBIF settings */
+   if (vfe->res->has_vbif) {
+   ret = vfe_vbif_apply_settings(vfe);
+   if (ret < 0) {
+   dev_err_ratelimited(vfe->camss->dev,
+   "VFE: VBIF error %d\n",
+   ret);
+   }
+   }
 }
 
 static void vfe_set_ds(struct vfe_device *vfe)
diff --git a/drivers/media/platform/qcom/camss/camss-vfe-vbif.c 
b/drivers/media/platform/qcom/camss/camss-vfe-vbif.c
new file mode 100644
index 
..691335f231a6001e6c535431a18b2e21ddc832c9
--- /dev/null
+++ b/drivers/media/platform/qcom/camss/camss-vfe-vbif.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * camss-vfe-vbif.c
+ *
+ * Qualcomm MSM Camera Subsystem - VFE VBIF Module
+ *
+ * Copyright (c) 2025, The Linux Foundation. All rights reserved.
+ *
+ */
+
+#include 
+
+#include "camss.h"
+#include "camss-vfe.h"
+#include "camss-vfe-vbif.h"
+
+void vfe_vbif_write_reg(struct vfe_device *vfe, u32 reg, u32 val)
+{
+   writel_relaxed(val, vfe->vbif_base + reg);
+}
+
+int vfe_vbif_apply_settings(struct vfe_device *vfe)
+{
+   return 0;
+}
diff --git a/drivers/media/platform/qcom/camss/camss-vfe-vbif.h 
b/drivers/media/platform/qcom/camss/camss-vfe-vbif.h
new file mode 100644
index 
..502db629e961f67723b14a7c8c9ca973fe4c267c
--- /dev/null
+++ b/drivers/media/platform/qcom/camss/camss-vfe-vbif.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * camss-vfe-vbif.h
+ *
+ * Qualcomm MSM Camera Subsystem - VFE VBIF Module
+ *
+ * Copyright (c) 2025, The Linux Foundation. All rights reserved.
+ *
+ */
+#ifndef QC_MSM_CAMSS_VFE_VBIF_H
+#define QC_MSM_CAMSS_VFE_VBIF_H
+
+#include "camss-vfe.h"
+
+void vfe_vbif_write_reg(struct vfe_device *vfe, u32 reg, u32 val);
+
+int vfe_vbif_apply_settings(struct vfe_device *vfe);
+
+#endif /* QC_MSM_CAMSS_VFE_VBIF_H */
diff --git a/drivers/media/platform/qcom/camss/camss-vfe.c 
b/drivers/media/platform/qcom/camss/camss-vfe.c
index 
4bca6c3abaff9b898ea879674a3ff8f3592d3139..3138562d399444c5cf2ae96bf16b75b85ff5c5ca
 100644
--- a/drivers/media/platform/qcom/camss/camss-vfe.c
+++ b/drivers/media/platform/qcom/camss/camss-vfe.c
@@ -1807,6 +1807,15 @@ int msm_vfe_subdev_init(struct camss *camss, struct 
vfe_device *vfe,
return PTR_ERR(vfe->base);
}
 
+   if (vfe->res->has_vbif) {
+   vfe->vbif_base = devm_platform_ioremap_resource_byname(pdev,
+   vfe->res->vbif_name);
+   if (IS_ERR(vfe->vbif_base)) {
+   dev_err(dev, "could not map vbif memory\

[PATCH v3 0/4] CAMSS support for MSM8939

2025-05-30 Thread Vincent Knecht via B4 Relay
This series adds CAMSS support for MSM8939.
It's mostly identical to MSM8916, except for some clocks
and an additional CSI.

To fix black stripes across sensor output, and garbage in
CSID TPG output, 2 VFE VBIF register settings are needed.
So the 1st patch adds helper functions to do just that.

Patch 1: adds helper for VFE VBIF settings
Patch 2: adds CAMSS_8x39 version in CAMSS driver
Patch 3: documents qcom,msm8939-camss DT bindings
Patch 4: adds camss and cci in msm8939.dtsi

Signed-off-by: Vincent Knecht 
---
Changes in v3:
- Patch 1:
  - Use braces around multiline (Bryan)
  - Rename vfe_vbif_reg_write to vfe_vbif_write_reg (Bryan)
  - Get rid of switch block on CAMSS version (Bryan)
- Patch 2:
  - Get rid of switch block on CAMSS version (Bryan)
- Patch 3: no change
- Patch 4: no change
  - Tried to get rid of CCI camss_ahb but this resulted in device
freeze+reboot (Konrad)
- Link to v2: 
https://lore.kernel.org/r/20250525-camss-8x39-vbif-v2-0-6d3d5c5af...@mailoo.org

Changes in v2:
- Patch 1:
  - Fix devm_platform_ioremap_resource_byname line to not end with
opening parenthesis (media-ci/1-checkpatch)
  - Move camss-vfe-4-1.c handling of VBIF previously in patch 2 here
(Dmitry)
- Patch 2:
  - Declare regulators in PHY entries, not CSID ones (Bryan)
- Patch 3: (bindings)
  - Fix bindings checks for new errors (Rob)
  - Fix properties ordering, code-style and example (Krzysztof)
  - Sort reg-names, clock-names and interrupt-names alphanumerically (Bryan)
- Patch 4: (dtsi)
  - Move #address/#size cells before status (Konrad)
  - Aligned CCI with msm8916, thus removing ispif_ahb mention (Konrad)
If "camss_ahb should be unnecessary", it's still required by 
qcom,i2c-cci.yaml
- Link to v1: 
https://lore.kernel.org/r/20250520-camss-8x39-vbif-v1-0-a12cd6006...@mailoo.org

---
Vincent Knecht (4):
  media: qcom: camss: vfe: Add VBIF setting support
  media: qcom: camss: Add support for MSM8939
  media: dt-bindings: Add qcom,msm8939-camss
  arm64: dts: qcom: msm8939: Add camss and cci

 .../bindings/media/qcom,msm8939-camss.yaml | 253 +
 arch/arm64/boot/dts/qcom/msm8939-pm8916.dtsi   |   4 +
 arch/arm64/boot/dts/qcom/msm8939.dtsi  | 146 
 drivers/media/platform/qcom/camss/Makefile |   1 +
 drivers/media/platform/qcom/camss/camss-csiphy.c   |   1 +
 drivers/media/platform/qcom/camss/camss-ispif.c|   8 +-
 drivers/media/platform/qcom/camss/camss-vfe-4-1.c  |  12 +
 drivers/media/platform/qcom/camss/camss-vfe-vbif.c |  31 +++
 drivers/media/platform/qcom/camss/camss-vfe-vbif.h |  19 ++
 drivers/media/platform/qcom/camss/camss-vfe.c  |  10 +
 drivers/media/platform/qcom/camss/camss-vfe.h  |   3 +
 drivers/media/platform/qcom/camss/camss.c  | 157 +
 drivers/media/platform/qcom/camss/camss.h  |   1 +
 13 files changed, 644 insertions(+), 2 deletions(-)
---
base-commit: 8566fc3b96539e3235909d6bdda198e1282beaed
change-id: 20250517-camss-8x39-vbif-975ff5819198

Best regards,
-- 
Vincent Knecht 





Re: [PATCH] virtio-vdpa: Remove virtqueue list

2025-05-30 Thread Eugenio Perez Martin
On Thu, May 29, 2025 at 9:30 AM Viresh Kumar  wrote:
>
> The virtio vdpa implementation creates a list of virtqueues, while the
> same is already available in the struct virtio_device.
>
> This list is never traversed though, and only the pointer to the struct
> virtio_vdpa_vq_info is used in the callback, where the virtqueue pointer
> could be directly used.
>
> Remove the unwanted code to simplify the driver.
>

Acked-by: Eugenio Pérez 

Thanks!

> Signed-off-by: Viresh Kumar 
> ---
> Only build tested.
> ---
>  drivers/virtio/virtio_vdpa.c | 44 +++-
>  1 file changed, 3 insertions(+), 41 deletions(-)
>
> diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
> index 1f60c9d5cb18..e25610e3393a 100644
> --- a/drivers/virtio/virtio_vdpa.c
> +++ b/drivers/virtio/virtio_vdpa.c
> @@ -28,19 +28,6 @@ struct virtio_vdpa_device {
> struct virtio_device vdev;
> struct vdpa_device *vdpa;
> u64 features;
> -
> -   /* The lock to protect virtqueue list */
> -   spinlock_t lock;
> -   /* List of virtio_vdpa_vq_info */
> -   struct list_head virtqueues;
> -};
> -
> -struct virtio_vdpa_vq_info {
> -   /* the actual virtqueue */
> -   struct virtqueue *vq;
> -
> -   /* the list node for the virtqueues list */
> -   struct list_head node;
>  };
>
>  static inline struct virtio_vdpa_device *
> @@ -135,9 +122,9 @@ static irqreturn_t virtio_vdpa_config_cb(void *private)
>
>  static irqreturn_t virtio_vdpa_virtqueue_cb(void *private)
>  {
> -   struct virtio_vdpa_vq_info *info = private;
> +   struct virtqueue *vq = private;
>
> -   return vring_interrupt(0, info->vq);
> +   return vring_interrupt(0, vq);
>  }
>
>  static struct virtqueue *
> @@ -145,18 +132,15 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, 
> unsigned int index,
>  void (*callback)(struct virtqueue *vq),
>  const char *name, bool ctx)
>  {
> -   struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
> struct vdpa_device *vdpa = vd_get_vdpa(vdev);
> struct device *dma_dev;
> const struct vdpa_config_ops *ops = vdpa->config;
> -   struct virtio_vdpa_vq_info *info;
> bool (*notify)(struct virtqueue *vq) = virtio_vdpa_notify;
> struct vdpa_callback cb;
> struct virtqueue *vq;
> u64 desc_addr, driver_addr, device_addr;
> /* Assume split virtqueue, switch to packed if necessary */
> struct vdpa_vq_state state = {0};
> -   unsigned long flags;
> u32 align, max_num, min_num = 1;
> bool may_reduce_num = true;
> int err;
> @@ -179,10 +163,6 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, 
> unsigned int index,
> if (ops->get_vq_ready(vdpa, index))
> return ERR_PTR(-ENOENT);
>
> -   /* Allocate and fill out our active queue description */
> -   info = kmalloc(sizeof(*info), GFP_KERNEL);
> -   if (!info)
> -   return ERR_PTR(-ENOMEM);
> if (ops->get_vq_size)
> max_num = ops->get_vq_size(vdpa, index);
> else
> @@ -217,7 +197,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned 
> int index,
>
> /* Setup virtqueue callback */
> cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL;
> -   cb.private = info;
> +   cb.private = vq;
> cb.trigger = NULL;
> ops->set_vq_cb(vdpa, index, &cb);
> ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq));
> @@ -248,13 +228,6 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, 
> unsigned int index,
>
> ops->set_vq_ready(vdpa, index, 1);
>
> -   vq->priv = info;
> -   info->vq = vq;
> -
> -   spin_lock_irqsave(&vd_dev->lock, flags);
> -   list_add(&info->node, &vd_dev->virtqueues);
> -   spin_unlock_irqrestore(&vd_dev->lock, flags);
> -
> return vq;
>
>  err_vq:
> @@ -263,7 +236,6 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned 
> int index,
> ops->set_vq_ready(vdpa, index, 0);
> /* VDPA driver should make sure vq is stopeed here */
> WARN_ON(ops->get_vq_ready(vdpa, index));
> -   kfree(info);
> return ERR_PTR(err);
>  }
>
> @@ -272,20 +244,12 @@ static void virtio_vdpa_del_vq(struct virtqueue *vq)
> struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vq->vdev);
> struct vdpa_device *vdpa = vd_dev->vdpa;
> const struct vdpa_config_ops *ops = vdpa->config;
> -   struct virtio_vdpa_vq_info *info = vq->priv;
> unsigned int index = vq->index;
> -   unsigned long flags;
> -
> -   spin_lock_irqsave(&vd_dev->lock, flags);
> -   list_del(&info->node);
> -   spin_unlock_irqrestore(&vd_dev->lock, flags);
>
> /* Select and deactivate the queue (best effort) */
> ops->set_vq_ready(vdpa, index, 0);
>
> vring_del_virtqueue(vq);
> -
> -   kfree(info);
>  }
>

Re: [PATCH] selftests/filesystems: Fix build of anon_inode_test

2025-05-30 Thread Mark Brown
On Sun, May 18, 2025 at 03:01:34PM +0100, Mark Brown wrote:
> The anon_inode_test test fails to build due to attempting to include
> a nonexisting overlayfs/wrapper.h:
> 
> anon_inode_test.c:10:10: fatal error: overlayfs/wrappers.h: No such file or 
> directory
>10 | #include "overlayfs/wrappers.h"
>   |  ^~

This build failure, which was first reported against -next and should be
fixed by this patch, is now present in mainline.

> This is due to 0bd92b9fe538 ("selftests/filesystems: move wrapper.h out
> of overlayfs subdir") which was added in the vfs-6.16.selftests branch
> which was based on -rc5 and does not contain the newly added test so
> once things were merged into vfs.all in the build started failing - both
> parent commits are fine.
> 
> Fixes: feaa00dbff45a ("Merge branch 'vfs-6.16.selftests' into vfs.all")

I see that the two branches get sent separately to Linus so the merge
that triggers things is now:

   3e406741b19890 ("Merge tag 'vfs-6.16-rc1.selftests' of 
git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs")

I'll resend with that updated.

> Signed-off-by: Mark Brown 
> ---
>  tools/testing/selftests/filesystems/anon_inode_test.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c 
> b/tools/testing/selftests/filesystems/anon_inode_test.c
> index e8e0ef1460d2..73e0a4d4fb2f 100644
> --- a/tools/testing/selftests/filesystems/anon_inode_test.c
> +++ b/tools/testing/selftests/filesystems/anon_inode_test.c
> @@ -7,7 +7,7 @@
>  #include 
>  
>  #include "../kselftest_harness.h"
> -#include "overlayfs/wrappers.h"
> +#include "wrappers.h"
>  
>  TEST(anon_inode_no_chown)
>  {
> 
> ---
> base-commit: feaa00dbff45ad9a0dcd04a92f88c745bf880f55
> change-id: 20250516-selftests-anon-inode-build-007e206e8422
> 
> Best regards,
> -- 
> Mark Brown 
> 


signature.asc
Description: PGP signature


Re: [RFC PATCH v1 3/6] mm: Avoid calling page allocator from apply_to_page_range()

2025-05-30 Thread Liam R. Howlett
* Ryan Roberts  [250530 10:05]:
> Lazy mmu mode applies to the current task and permits pte modifications
> to be deferred and updated at a later time in a batch to improve
> performance. apply_to_page_range() calls its callback in lazy mmu mode
> and some of those callbacks call into the page allocator to either
> allocate or free pages.
> 
> This is problematic with CONFIG_DEBUG_PAGEALLOC because
> debug_pagealloc_[un]map_pages() calls the arch implementation of
> __kernel_map_pages() which must modify the ptes for the linear map.
> 
> There are two possibilities at this point:
> 
>  - If the arch implementation modifies the ptes directly without first
>entering lazy mmu mode, the pte modifications may get deferred until
>the existing lazy mmu mode is exited. This could result in taking
>spurious faults for example.
> 
>  - If the arch implementation enters a nested lazy mmu mode before
>modification of the ptes (many arches use apply_to_page_range()),
>then the linear map updates will definitely be applied upon leaving
>the inner lazy mmu mode. But because lazy mmu mode does not support
>nesting, the remainder of the outer user is no longer in lazy mmu
>mode and the optimization opportunity is lost.
> 
> So let's just ensure that the page allocator is never called from within
> lazy mmu mode. New "_nolazy" variants of apply_to_page_range() and
> apply_to_existing_page_range() are introduced which don't enter lazy mmu
> mode. Then users which need to call into the page allocator within their
> callback are updated to use the _nolazy variants.
> 
> Signed-off-by: Ryan Roberts 
> ---
>  include/linux/mm.h |  6 ++
>  kernel/bpf/arena.c |  6 +++---
>  mm/kasan/shadow.c  |  2 +-
>  mm/memory.c| 54 +++---
>  4 files changed, 51 insertions(+), 17 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index e51dba8398f7..11cae6ce04ff 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3743,9 +3743,15 @@ static inline bool gup_can_follow_protnone(struct 
> vm_area_struct *vma,
>  typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
>  extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
>  unsigned long size, pte_fn_t fn, void *data);
> +extern int apply_to_page_range_nolazy(struct mm_struct *mm,
> +   unsigned long address, unsigned long size,
> +   pte_fn_t fn, void *data);

We are removing externs as things are edited, so probably drop them
here.

>  extern int apply_to_existing_page_range(struct mm_struct *mm,
>  unsigned long address, unsigned long size,
>  pte_fn_t fn, void *data);
> +extern int apply_to_existing_page_range_nolazy(struct mm_struct *mm,
> +unsigned long address, unsigned long size,
> +pte_fn_t fn, void *data);
>  
>  #ifdef CONFIG_PAGE_POISONING
>  extern void __kernel_poison_pages(struct page *page, int numpages);
> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> index 0d56cea71602..ca833cfeefb7 100644
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c
> @@ -187,10 +187,10 @@ static void arena_map_free(struct bpf_map *map)
>   /*
>* free_vm_area() calls remove_vm_area() that calls 
> free_unmap_vmap_area().
>* It unmaps everything from vmalloc area and clears pgtables.
> -  * Call apply_to_existing_page_range() first to find populated ptes and
> -  * free those pages.
> +  * Call apply_to_existing_page_range_nolazy() first to find populated
> +  * ptes and free those pages.
>*/
> - apply_to_existing_page_range(&init_mm, 
> bpf_arena_get_kern_vm_start(arena),
> + apply_to_existing_page_range_nolazy(&init_mm, 
> bpf_arena_get_kern_vm_start(arena),
>KERN_VM_SZ - GUARD_SZ, existing_page_cb, 
> NULL);
>   free_vm_area(arena->kern_vm);
>   range_tree_destroy(&arena->rt);
> diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
> index d2c70cd2afb1..2325c5166c3a 100644
> --- a/mm/kasan/shadow.c
> +++ b/mm/kasan/shadow.c
> @@ -590,7 +590,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned 
> long end,
>  
>  
>   if (flags & KASAN_VMALLOC_PAGE_RANGE)
> - apply_to_existing_page_range(&init_mm,
> + apply_to_existing_page_range_nolazy(&init_mm,
>(unsigned long)shadow_start,
>size, kasan_depopulate_vmalloc_pte,
>NULL);
> diff --git a/mm/memory.c b/mm/memory.c
> index 49199410805c..24436074ce48 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2913,7 +2913,7 @@ EXPORT_SYMBOL(vm_iomap_memory);
>  static int apply_to_pte_range(struct mm_struct

Re: [RFC PATCH v1 1/6] fs/proc/task_mmu: Fix pte update and tlb maintenance ordering in pagemap_scan_pmd_entry()

2025-05-30 Thread Ryan Roberts
On 30/05/2025 17:26, Jann Horn wrote:
> On Fri, May 30, 2025 at 4:04 PM Ryan Roberts  wrote:
>> pagemap_scan_pmd_entry() was previously modifying ptes while in lazy mmu
>> mode, then performing tlb maintenance for the modified ptes, then
>> leaving lazy mmu mode. But any pte modifications during lazy mmu mode
>> may be deferred until arch_leave_lazy_mmu_mode(), inverting the required
>> ordering between pte modification and tlb maintenance.
>>
>> Let's fix that by leaving mmu mode, forcing all the pte updates to be
>> actioned, before doing the tlb maintenance.
>>
>> This is a theoretical bug discovered during code review.
>>
>> Fixes: 52526ca7fdb9 ("fs/proc/task_mmu: implement IOCTL to get and 
>> optionally clear info about PTEs")
> 
> Hmm... isn't lazy mmu mode supposed to also delay TLB flushes, and
> preserve the ordering of PTE modifications and TLB flushes?
> 
> Looking at the existing implementations of lazy MMU:
> 
>  - In Xen PV implementation of lazy MMU, I see that TLB flush
> hypercalls are delayed as well (xen_flush_tlb(),
> xen_flush_tlb_one_user() and xen_flush_tlb_multi() all use
> xen_mc_issue(XEN_LAZY_MMU) which delays issuing if lazymmu is active).
>  - The sparc version also seems to delay TLB flushes, and sparc's
> arch_leave_lazy_mmu_mode() seems to do TLB flushes via
> flush_tlb_pending() if necessary.
>  - powerpc's arch_leave_lazy_mmu_mode() also seems to do TLB flushes.
> 
> Am I missing something?

I doubt it. I suspect this was just my misunderstanding then. I hadn't
appreciated that lazy mmu is also guaranteed to maintain flush ordering; it's
chronically under-documented. Sorry for the noise here. On that basis, I expect
the first 2 patches can definitely be dropped.

> 
> If arm64 requires different semantics compared to all existing
> implementations and doesn't delay TLB flushes for lazy mmu mode, I
> think the "Fixes" tag should point to your addition of lazy mmu
> support for arm64.

arm64 doesn't require different semantics. arm64 is using lazy mmu in a very
limited manner and it can already tolerate the current code.

I just spotted this during code review and was trying to be a good citizen.
Thanks for setting me straight!

Thanks,
Ryan

> 
>> Signed-off-by: Ryan Roberts 
>> ---
>>  fs/proc/task_mmu.c | 3 +--
>>  1 file changed, 1 insertion(+), 2 deletions(-)
>>
>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>> index 994cde10e3f4..361f3ffd9a0c 100644
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -2557,10 +2557,9 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, 
>> unsigned long start,
>> }
>>
>>  flush_and_return:
>> +   arch_leave_lazy_mmu_mode();
>> if (flush_end)
>> flush_tlb_range(vma, start, addr);
>> -
>> -   arch_leave_lazy_mmu_mode();
> 
> I think this ordering was probably intentional, because doing it this
> way around allows Xen PV to avoid one more hypercall, because the TLB
> flush can be batched together with the page table changes?
> 
> 
>> pte_unmap_unlock(start_pte, ptl);
>>
>> cond_resched();
>> --
>> 2.43.0
>>




Re: [RFC PATCH v1 1/6] fs/proc/task_mmu: Fix pte update and tlb maintenance ordering in pagemap_scan_pmd_entry()

2025-05-30 Thread Jann Horn
On Fri, May 30, 2025 at 6:45 PM Ryan Roberts  wrote:
> On 30/05/2025 17:26, Jann Horn wrote:
> > On Fri, May 30, 2025 at 4:04 PM Ryan Roberts  wrote:
> >> pagemap_scan_pmd_entry() was previously modifying ptes while in lazy mmu
> >> mode, then performing tlb maintenance for the modified ptes, then
> >> leaving lazy mmu mode. But any pte modifications during lazy mmu mode
> >> may be deferred until arch_leave_lazy_mmu_mode(), inverting the required
> >> ordering between pte modificaiton and tlb maintenance.
> >>
> >> Let's fix that by leaving mmu mode, forcing all the pte updates to be
> >> actioned, before doing the tlb maintenance.
> >>
> >> This is a theoretical bug discovered during code review.
> >>
> >> Fixes: 52526ca7fdb9 ("fs/proc/task_mmu: implement IOCTL to get and 
> >> optionally clear info about PTEs")
> >
> > Hmm... isn't lazy mmu mode supposed to also delay TLB flushes, and
> > preserve the ordering of PTE modifications and TLB flushes?
> >
> > Looking at the existing implementations of lazy MMU:
> >
> >  - In Xen PV implementation of lazy MMU, I see that TLB flush
> > hypercalls are delayed as well (xen_flush_tlb(),
> > xen_flush_tlb_one_user() and xen_flush_tlb_multi() all use
> > xen_mc_issue(XEN_LAZY_MMU) which delays issuing if lazymmu is active).
> >  - The sparc version also seems to delay TLB flushes, and sparc's
> > arch_leave_lazy_mmu_mode() seems to do TLB flushes via
> > flush_tlb_pending() if necessary.
> >  - powerpc's arch_leave_lazy_mmu_mode() also seems to do TLB flushes.
> >
> > Am I missing something?
>
> I doubt it. I suspect this was just my misunderstanding then. I hadn't
> appreciated that lazy mmu is also guaranteed to maintain flush ordering; it's
> chronically under-documented. Sorry for the noise here. On that basis, I 
> expect
> the first 2 patches can definitely be dropped.

Yeah looking at this code I agree that it could use significantly more
verbose comments on the API contract.



[PATCH bpf-next v2 2/2] selftests/bpf: Add test for bpftool access to read-only protected maps

2025-05-30 Thread Slava Imameev
Add selftest cases that validate bpftool's expected behavior when
accessing maps protected from modification via security_bpf_map.

The test includes a BPF program attached to security_bpf_map with two maps:
- A protected map that only allows read-only access
- An unprotected map that allows full access

The test script attaches the BPF program to security_bpf_map and
verifies that for the bpftool map command:
- Read access works on both maps
- Write access fails on the protected map
- Write access succeeds on the unprotected map
- These behaviors remain consistent when the maps are pinned

Signed-off-by: Slava Imameev 
---
Changes in v2:
- fix for a test compilation error: "conflicting types for 'bpf_fentry_test1'"
---
---
 tools/testing/selftests/bpf/Makefile  |   1 +
 .../selftests/bpf/progs/security_bpf_map.c|  56 +
 .../testing/selftests/bpf/test_bpftool_map.sh | 208 ++
 3 files changed, 265 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/security_bpf_map.c
 create mode 100755 tools/testing/selftests/bpf/test_bpftool_map.sh

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index cf5ed3bee573..731a86407799 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -109,6 +109,7 @@ TEST_PROGS := test_kmod.sh \
test_xdping.sh \
test_bpftool_build.sh \
test_bpftool.sh \
+   test_bpftool_map.sh \
test_bpftool_metadata.sh \
test_doc_build.sh \
test_xsk.sh \
diff --git a/tools/testing/selftests/bpf/progs/security_bpf_map.c 
b/tools/testing/selftests/bpf/progs/security_bpf_map.c
new file mode 100644
index ..09048c096ee4
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/security_bpf_map.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "vmlinux.h"
+#include 
+#include 
+
+char _license[] SEC("license") = "GPL";
+
+#define EPERM 1 /* Operation not permitted */
+
+/* From include/linux/mm.h. */
+#define FMODE_WRITE0x2
+
+struct map;
+
+struct {
+   __uint(type, BPF_MAP_TYPE_ARRAY);
+   __type(key, __u32);
+   __type(value, __u32);
+   __uint(max_entries, 1);
+} prot_map SEC(".maps");
+
+struct {
+   __uint(type, BPF_MAP_TYPE_ARRAY);
+   __type(key, __u32);
+   __type(value, __u32);
+   __uint(max_entries, 1);
+} not_prot_map SEC(".maps");
+
+SEC("fmod_ret/security_bpf_map")
+int BPF_PROG(fmod_bpf_map, struct bpf_map *map, int fmode)
+{
+   if (map == &prot_map) {
+   /* Allow read-only access */
+   if (fmode & FMODE_WRITE)
+   return -EPERM;
+   }
+
+   return 0;
+}
+
+/*
+ * This program keeps references to maps. This is needed to prevent
+ * optimizing them out.
+ */
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(bpf_map_test0, int a)
+{
+   __u32 key = 0;
+   __u32 val1 = a;
+   __u32 val2 = a + 1;
+
+   bpf_map_update_elem(&prot_map, &key, &val1, BPF_ANY);
+   bpf_map_update_elem(¬_prot_map, &key, &val2, BPF_ANY);
+   return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_bpftool_map.sh 
b/tools/testing/selftests/bpf/test_bpftool_map.sh
new file mode 100755
index ..c7c7f3d2071e
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bpftool_map.sh
@@ -0,0 +1,208 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+PROTECTED_MAP_NAME="prot_map"
+NOT_PROTECTED_MAP_NAME="not_prot_map"
+BPF_FILE="security_bpf_map.bpf.o"
+TESTNAME="security_bpf_map"
+BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts)
+BPF_DIR="$BPF_FS/test_$TESTNAME"
+SCRIPT_DIR=$(dirname $(realpath "$0"))
+BPF_FILE_PATH="$SCRIPT_DIR/$BPF_FILE"
+# Assume the script is located under tools/testing/selftests/bpf/
+KDIR_ROOT_DIR=$(realpath "$SCRIPT_DIR"/../../../../)
+
+_cleanup()
+{
+   set +eu
+   [ -d "$TMPDIR" ] && rm -rf "$TMPDIR" 2> /dev/null
+   [ -d "$BPF_DIR" ] && rm -rf "$BPF_DIR" 2> /dev/null
+}
+
+cleanup_skip()
+{
+   echo "selftests: $TESTNAME [SKIP]"
+   _cleanup
+
+   exit $ksft_skip
+}
+
+cleanup()
+{
+   if [ "$?" = 0 ]; then
+   echo "selftests: $TESTNAME [PASS]"
+   else
+   echo "selftests: $TESTNAME [FAILED]"
+   fi
+   _cleanup
+}
+
+# Parameters:
+#   $1: The top of kernel repository
+#   $2: Output directory
+build_bpftool()
+{
+   local kdir_root_dir="$1"
+   local output_dir="$2"
+   local pwd="$(pwd)"
+   local ncpus=1
+
+   echo Building bpftool ...
+
+   #We want to start build from the top of kernel repository.
+   cd "$kdir_root_dir"
+   if [ ! -e tools/bpf/bpftool/Makefile ]; then
+   echo bpftool files not found
+   exit $ksft_skip
+   fi
+
+   # Determine the number of CPUs for parallel compilation
+   if command -v nproc >/dev/null 2>&1; then
+   ncpus=$(np

[PATCH bpf-next v2 1/2] bpftool: Use appropriate permissions for map access

2025-05-30 Thread Slava Imameev
Modify several functions in tools/bpf/bpftool/common.c to allow
specification of requested access for file descriptors, such as
read-only access.

Update bpftool to request only read access for maps when write
access is not required. This fixes errors when reading from maps
that are protected from modification via security_bpf_map.

Signed-off-by: Slava Imameev 
---
Changes in v2:
- fix for a test compilation error: "conflicting types for 'bpf_fentry_test1'"
---
---
 tools/bpf/bpftool/btf.c   |  3 +-
 tools/bpf/bpftool/common.c| 57 ++-
 tools/bpf/bpftool/iter.c  |  2 +-
 tools/bpf/bpftool/link.c  |  2 +-
 tools/bpf/bpftool/main.h  | 13 ---
 tools/bpf/bpftool/map.c   | 56 +-
 tools/bpf/bpftool/map_perf_ring.c |  3 +-
 tools/bpf/bpftool/prog.c  |  4 +--
 8 files changed, 90 insertions(+), 50 deletions(-)

diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index 6b14cbfa58aa..1ba27cb03348 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -905,7 +905,8 @@ static int do_dump(int argc, char **argv)
return -1;
}
 
-   fd = map_parse_fd_and_info(&argc, &argv, &info, &len);
+   fd = map_parse_fd_and_info(&argc, &argv, &info, &len,
+  BPF_F_RDONLY);
if (fd < 0)
return -1;
 
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index ecfa790adc13..ff1c99281beb 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -193,7 +193,8 @@ int mount_tracefs(const char *target)
return err;
 }
 
-int open_obj_pinned(const char *path, bool quiet)
+int open_obj_pinned(const char *path, bool quiet,
+   const struct bpf_obj_get_opts *opts)
 {
char *pname;
int fd = -1;
@@ -205,7 +206,7 @@ int open_obj_pinned(const char *path, bool quiet)
goto out_ret;
}
 
-   fd = bpf_obj_get(pname);
+   fd = bpf_obj_get_opts(pname, opts);
if (fd < 0) {
if (!quiet)
p_err("bpf obj get (%s): %s", pname,
@@ -221,12 +222,13 @@ int open_obj_pinned(const char *path, bool quiet)
return fd;
 }
 
-int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type)
+int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type,
+   const struct bpf_obj_get_opts *opts)
 {
enum bpf_obj_type type;
int fd;
 
-   fd = open_obj_pinned(path, false);
+   fd = open_obj_pinned(path, false, opts);
if (fd < 0)
return -1;
 
@@ -555,7 +557,7 @@ static int do_build_table_cb(const char *fpath, const 
struct stat *sb,
if (typeflag != FTW_F)
goto out_ret;
 
-   fd = open_obj_pinned(fpath, true);
+   fd = open_obj_pinned(fpath, true, NULL);
if (fd < 0)
goto out_ret;
 
@@ -928,7 +930,7 @@ int prog_parse_fds(int *argc, char ***argv, int **fds)
path = **argv;
NEXT_ARGP();
 
-   (*fds)[0] = open_obj_pinned_any(path, BPF_OBJ_PROG);
+   (*fds)[0] = open_obj_pinned_any(path, BPF_OBJ_PROG, NULL);
if ((*fds)[0] < 0)
return -1;
return 1;
@@ -965,7 +967,8 @@ int prog_parse_fd(int *argc, char ***argv)
return fd;
 }
 
-static int map_fd_by_name(char *name, int **fds)
+static int map_fd_by_name(char *name, int **fds,
+ const struct bpf_get_fd_by_id_opts *opts)
 {
unsigned int id = 0;
int fd, nb_fds = 0;
@@ -973,6 +976,7 @@ static int map_fd_by_name(char *name, int **fds)
int err;
 
while (true) {
+   LIBBPF_OPTS(bpf_get_fd_by_id_opts, opts_ro);
struct bpf_map_info info = {};
__u32 len = sizeof(info);
 
@@ -985,7 +989,9 @@ static int map_fd_by_name(char *name, int **fds)
return nb_fds;
}
 
-   fd = bpf_map_get_fd_by_id(id);
+   /* Request a read-only fd to query the map info */
+   opts_ro.open_flags = BPF_F_RDONLY;
+   fd = bpf_map_get_fd_by_id_opts(id, &opts_ro);
if (fd < 0) {
p_err("can't get map by id (%u): %s",
  id, strerror(errno));
@@ -1004,6 +1010,15 @@ static int map_fd_by_name(char *name, int **fds)
continue;
}
 
+   /* Get an fd with the requested options. */
+   close(fd);
+   fd = bpf_map_get_fd_by_id_opts(id, opts);
+   if (fd < 0) {
+   p_err("can't get map by id (%u): %s", id,
+ strerror(errno));
+   goto err_close_fds;
+   }
+
if (nb_fds > 0) {
   

Re: [PATCH v2 1/1] mm/memory_hotplug: PG_offline_skippable for offlining memory blocks with PageOffline pages

2025-05-30 Thread Vlastimil Babka
On 5/20/25 18:42, David Hildenbrand wrote:
> A long-term goal is supporting frozen PageOffline pages, and later
> PageOffline pages that don't have a refcount at all. Some more work for

Looking forward to that :)

> that is needed -- in particular around non-folio page migration and
> memory ballooning drivers -- but let's start by handling PageOffline pages
> that can be skipped during memory offlining differently.
> 
> Note that PageOffline is used to mark pages that are logically offline
> in an otherwise online memory block (e.g., 128 MiB). If a memory
> block is offline, the memmap is considered completely uninitialized
> and stale (see pfn_to_online_page()).
> 
> Let's introduce a PageOffline specific page flag (PG_offline_skippable)
> that for now reuses PG_owner_2. In the memdesc future, it will be one of
> a small number of per-memdesc flags stored alongside the type.
> 
> By setting PG_offline_skippable, a driver indicates that it can
> restore the PageOffline state of these specific pages when re-onlining a
> memory block: it knows that these pages are supposed to be PageOffline()
> without the information in the vmemmap, so it can filter them out and
> not expose them to the buddy -> they stay PageOffline().
> 
> While PG_offline_offlineable might be clearer, it is also super
> confusing. Alternatives (PG_offline_sticky?) also don't quite feel right.
> So let's use "skippable" for now.
> 
> The flag is not supposed to be used for movable PageOffline pages as
> used for balloon compaction; movable PageOffline() pages can simply be
> migrated during the memory offlining stage, turning the migration
> destination page PageOffline() and turning the migration source page
> into a free buddy page.
> 
> Let's convert the single user from our MEM_GOING_OFFLINE approach
> to the new PG_offline_skippable approach: virtio-mem. Fortunately,
> this simplifies the code quite a lot. The only corner case we have to
> take care of is when force-unloading the virtio-mem driver: we have to
> prevent partially-plugged memory blocks from getting offlined by
> clearing PG_offline_skippable again.
> 
> What if someone decides to grab a reference on these pages although they
> really shouldn't? After all, we'll now keep the refcount at 1 (until we
> can properly stop using the refcount completely).
> 
> Well, less worse things will happen than would currently: currently,
> if someone would grab a reference to these pages, in MEM_GOING_OFFLINE
> we would run into the
>   if (WARN_ON(!page_ref_dec_and_test(page)))
>   dump_page(page, "fake-offline page referenced");
> 
> And once that unexpected reference would get dropped, we would end up
> freeing that page to the buddy: ouch.
> 
> Now, we'll allow for offlining that memory, and when that unexpected
> reference would get dropped, we would not end up freeing that page to
> the buddy. Once we have frozen PageOffline() pages, it will all get a
> lot cleaner.

Hmm, a question on that later in the code (assuming I identified the right
place).

> Note that we didn't see the existing WARN_ON so far, because nobody
> should ever be referencing such pages.

It's mostly a speculative refcount increase from a pfn walker, such as
compaction scanner, that can happen due to its inherent raciness.

> An alternative might be to have another callback chain from memory hotplug
> code, where a driver that owns that page could agree to skip the
> PageOffline() page. However, we would have to repeatedly issue these
> callbacks for individual PageOffline() pages, which does not sound
> compelling. As we have spare bits, let's use this simpler approach for
> now.
> 
> Acked-by: Zi Yan 
> Signed-off-by: David Hildenbrand 

Acked-by: Vlastimil Babka  # page allocator

I'll leave hotplug to the experts :)



> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index f6482223e28a2..7e4c41e46a911 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -7023,12 +7023,12 @@ unsigned long __offline_isolated_pages(unsigned long 
> start_pfn,
>   continue;
>   }
>   /*
> -  * At this point all remaining PageOffline() pages have a
> -  * reference count of 0 and can simply be skipped.
> +  * At this point all remaining PageOffline() pages must be
> +  * "skippable" and have exactly one reference.
>*/
>   if (PageOffline(page)) {
> - BUG_ON(page_count(page));
> - BUG_ON(PageBuddy(page));
> + WARN_ON_ONCE(!PageOfflineSkippable(page));
> + WARN_ON_ONCE(page_count(page) != 1);

So is this the part where an unexpected speculative refcount might be
detected? Should be harmless then as it will then decrease the refcount from
e.g. 2 to 1 and nothing will happen right.
That's assuming that once we pass __offline_isolated_pages(), the following
actions wont modify the refcount or 

[PATCH v3 4/4] arm64: dts: qcom: msm8939: Add camss and cci

2025-05-30 Thread Vincent Knecht via B4 Relay
From: Vincent Knecht 

Add the camera subsystem and CCI used to interface with cameras on the
Snapdragon 615.

Signed-off-by: Vincent Knecht 
---
 arch/arm64/boot/dts/qcom/msm8939-pm8916.dtsi |   4 +
 arch/arm64/boot/dts/qcom/msm8939.dtsi| 146 +++
 2 files changed, 150 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/msm8939-pm8916.dtsi 
b/arch/arm64/boot/dts/qcom/msm8939-pm8916.dtsi
index 
adb96cd8d643e5fde1ac95c0fc3c9c3c3efb07e8..659d127b1bc3570d137ca986e4eacf600c183e5e
 100644
--- a/arch/arm64/boot/dts/qcom/msm8939-pm8916.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8939-pm8916.dtsi
@@ -11,6 +11,10 @@
 #include "msm8939.dtsi"
 #include "pm8916.dtsi"
 
+&camss {
+   vdda-supply = <&pm8916_l2>;
+};
+
 &mdss_dsi0 {
vdda-supply = <&pm8916_l2>;
vddio-supply = <&pm8916_l6>;
diff --git a/arch/arm64/boot/dts/qcom/msm8939.dtsi 
b/arch/arm64/boot/dts/qcom/msm8939.dtsi
index 
68b92fdb996c26e7a1aadedf0f52e1afca85c4ab..082542b54d96adaed3e6b49bc3682005ea018a72
 100644
--- a/arch/arm64/boot/dts/qcom/msm8939.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8939.dtsi
@@ -1434,6 +1434,145 @@ mdss_dsi1_phy: phy@1aa0300 {
};
};
 
+   camss: isp@1b08000 {
+   compatible = "qcom,msm8939-camss";
+   reg = <0x01b08000 0x100>,
+ <0x01b08400 0x100>,
+ <0x01b08800 0x100>,
+ <0x01b0ac00 0x200>,
+ <0x01b00030 0x4>,
+ <0x01b0b000 0x200>,
+ <0x01b00038 0x4>,
+ <0x01b00020 0x10>,
+ <0x01b0a000 0x500>,
+ <0x01b1 0x1000>,
+ <0x01b4 0x200>;
+   reg-names = "csid0",
+   "csid1",
+   "csid2",
+   "csiphy0",
+   "csiphy0_clk_mux",
+   "csiphy1",
+   "csiphy1_clk_mux",
+   "csi_clk_mux",
+   "ispif",
+   "vfe0",
+   "vfe0_vbif";
+
+   clocks = <&gcc GCC_CAMSS_AHB_CLK>,
+<&gcc GCC_CAMSS_CSI0_CLK>,
+<&gcc GCC_CAMSS_CSI0_AHB_CLK>,
+<&gcc GCC_CAMSS_CSI0PHY_CLK>,
+<&gcc GCC_CAMSS_CSI0PIX_CLK>,
+<&gcc GCC_CAMSS_CSI0RDI_CLK>,
+<&gcc GCC_CAMSS_CSI1_CLK>,
+<&gcc GCC_CAMSS_CSI1_AHB_CLK>,
+<&gcc GCC_CAMSS_CSI1PHY_CLK>,
+<&gcc GCC_CAMSS_CSI1PIX_CLK>,
+<&gcc GCC_CAMSS_CSI1RDI_CLK>,
+<&gcc GCC_CAMSS_CSI2_CLK>,
+<&gcc GCC_CAMSS_CSI2_AHB_CLK>,
+<&gcc GCC_CAMSS_CSI2PHY_CLK>,
+<&gcc GCC_CAMSS_CSI2PIX_CLK>,
+<&gcc GCC_CAMSS_CSI2RDI_CLK>,
+<&gcc GCC_CAMSS_CSI0PHYTIMER_CLK>,
+<&gcc GCC_CAMSS_CSI1PHYTIMER_CLK>,
+<&gcc GCC_CAMSS_CSI_VFE0_CLK>,
+<&gcc GCC_CAMSS_ISPIF_AHB_CLK>,
+<&gcc GCC_CAMSS_TOP_AHB_CLK>,
+<&gcc GCC_CAMSS_VFE0_CLK>,
+<&gcc GCC_CAMSS_VFE_AHB_CLK>,
+<&gcc GCC_CAMSS_VFE_AXI_CLK>;
+   clock-names = "ahb",
+ "csi0",
+ "csi0_ahb",
+ "csi0_phy",
+ "csi0_pix",
+ "csi0_rdi",
+ "csi1",
+ "csi1_ahb",
+ "csi1_phy",
+ "csi1_pix",
+ "csi1_rdi",
+ "csi2",
+ "csi2_ahb",
+ "csi2_phy",
+ "csi2_pix",
+ "csi2_rdi",
+ "csiphy0_timer",
+ "csiphy1_timer",
+ "csi_vfe0",
+ "ispif_ahb",
+ "top_ahb",
+ "vfe0",
+  

Re: [RFC PATCH v1 1/6] fs/proc/task_mmu: Fix pte update and tlb maintenance ordering in pagemap_scan_pmd_entry()

2025-05-30 Thread Jann Horn
On Fri, May 30, 2025 at 4:04 PM Ryan Roberts  wrote:
> pagemap_scan_pmd_entry() was previously modifying ptes while in lazy mmu
> mode, then performing tlb maintenance for the modified ptes, then
> leaving lazy mmu mode. But any pte modifications during lazy mmu mode
> may be deferred until arch_leave_lazy_mmu_mode(), inverting the required
> ordering between pte modification and tlb maintenance.
>
> Let's fix that by leaving mmu mode, forcing all the pte updates to be
> actioned, before doing the tlb maintenance.
>
> This is a theorectical bug discovered during code review.
>
> Fixes: 52526ca7fdb9 ("fs/proc/task_mmu: implement IOCTL to get and optionally 
> clear info about PTEs")

Hmm... isn't lazy mmu mode supposed to also delay TLB flushes, and
preserve the ordering of PTE modifications and TLB flushes?

Looking at the existing implementations of lazy MMU:

 - In Xen PV implementation of lazy MMU, I see that TLB flush
hypercalls are delayed as well (xen_flush_tlb(),
xen_flush_tlb_one_user() and xen_flush_tlb_multi() all use
xen_mc_issue(XEN_LAZY_MMU) which delays issuing if lazymmu is active).
 - The sparc version also seems to delay TLB flushes, and sparc's
arch_leave_lazy_mmu_mode() seems to do TLB flushes via
flush_tlb_pending() if necessary.
 - powerpc's arch_leave_lazy_mmu_mode() also seems to do TLB flushes.

Am I missing something?

If arm64 requires different semantics compared to all existing
implementations and doesn't delay TLB flushes for lazy mmu mode, I
think the "Fixes" tag should point to your addition of lazy mmu
support for arm64.

> Signed-off-by: Ryan Roberts 
> ---
>  fs/proc/task_mmu.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 994cde10e3f4..361f3ffd9a0c 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -2557,10 +2557,9 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned 
> long start,
> }
>
>  flush_and_return:
> +   arch_leave_lazy_mmu_mode();
> if (flush_end)
> flush_tlb_range(vma, start, addr);
> -
> -   arch_leave_lazy_mmu_mode();

I think this ordering was probably intentional, because doing it this
way around allows Xen PV to avoid one more hypercall, because the TLB
flush can be batched together with the page table changes?


> pte_unmap_unlock(start_pte, ptl);
>
> cond_resched();
> --
> 2.43.0
>



Re: [RFC PATCH v1 0/6] Lazy mmu mode fixes and improvements

2025-05-30 Thread Ryan Roberts
On 30/05/2025 15:47, Lorenzo Stoakes wrote:
> +cc Jann who is a specialist in all things page table-y and especially scary
> edge cases :)
> 
> On Fri, May 30, 2025 at 03:04:38PM +0100, Ryan Roberts wrote:
>> Hi All,
>>
>> I recently added support for lazy mmu mode on arm64. The series is now in
>> Linus's tree so should be in v6.16-rc1. But during testing in linux-next we
>> found some ugly corners (unexpected nesting). I was able to fix those issues 
>> by
>> making the arm64 implementation more permissive (like the other arches). But
>> this is quite fragile IMHO. So I'd rather fix the root cause and ensure that
>> lazy mmu mode never nests, and more importantly, that code never makes 
>> pgtable
>> modifications expecting them to be immediate, not knowing that it's actually 
>> in
>> lazy mmu mode so the changes get deferred.
> 
> When you say fragile, are you confident it _works_ but perhaps not quite as 
> well
> as you want? Or are you concerned this might be broken upstream in any way?

I'm confident that it _works_ for arm64 as it is, upstream. But if Dev's series
were to go in _without_ the lazy_mmu bracketing in some manner, then it would
be broken if the config includes CONFIG_DEBUG_PAGEALLOC.

There's a lot more explanation in the later patches as to how it can be broken,
but for arm64, the situation is currently like this, because our implementation
of __change_memory_common() uses apply_to_page_range() which implicitly starts
an inner lazy_mmu_mode. We enter multiple times, but we exit on the first call
to exit. Everything works correctly but it's not optimal because C is no longer
deferred:

arch_enter_lazy_mmu_mode()<< outer lazy mmu region
  
  alloc_pages()
debug_pagealloc_map_pages()
  __kernel_map_pages()
__change_memory_common()
  arch_enter_lazy_mmu_mode()  << inner lazy mmu region

  arch_leave_lazy_mmu_mode()  << exit; complete A + B
clear_page()
<< no longer in lazy mode
arch_leave_lazy_mmu_mode()<< nop

An alternative implementation would not add the nested lazy mmu mode, so we end
up with this:

arch_enter_lazy_mmu_mode()<< outer lazy mmu region
  
  alloc_pages()
debug_pagealloc_map_pages()
  __kernel_map_pages()
__change_memory_common()
 << deferred due to lazy mmu
clear_page()  << BANG! B has not be actioned
  
arch_leave_lazy_mmu_mode()

This is clearly a much worse outcome. It's not happening today but it could in
future. That's why I'm claiming it's fragile. It's much better (IMHO) to
disallow calling the page allocator when in lazy mmu mode.

I won't speak for other arches; there may be more or less potential impact for 
them.

> 
> I am thinking specifically about the proposed use in Dev's new series [0] and
> obviously hoping (and assuming in fact) that it's the former :)

Dev's changes aren't directly related to this, but if a version was accepted
that didn't include the lazy mmu mode, that would cause non-obvious issues.

Hope that helps?

Thanks,
Ryan

> 
> [0]: https://lore.kernel.org/linux-mm/20250530090407.19237-1-dev.j...@arm.com/
> 
>>
>> The first 2 patches are unrelated, very obvious bug fixes. They don't affect
>> arm64 because arm64 only uses lazy mmu for kernel mappings. But I noticed 
>> them
>> during code review and think they should be fixed.
>>
>> The next 3 patches are aimed at solving the nesting issue.
>>
>> And the final patch is reverting the "permissive" fix I did for arm64, which 
>> is
>> no longer needed after the previous 3 patches.
>>
>> I've labelled this RFC for now because it depends on the arm64 lazy mmu 
>> patches
>> in Linus's master, so it won't apply to mm-unstable. But I'm keen to get 
>> review
>> and siince I'm touching various arches and modifying some core mm stuff, I
>> thought that might take a while so thought I'd beat the rush and get a first
>> version out early.
>>
>> I've build-tested all the affected arches. And I've run mm selftests for the
>> arm64 build, with no issues (with DEBUG_PAGEALLOC and KFENCE enabled).
>>
>> Applies against Linus's master branch (f66bc387efbe).
>>
>> Thanks,
>> Ryan
>>
>>
>> Ryan Roberts (6):
>>   fs/proc/task_mmu: Fix pte update and tlb maintenance ordering in
>> pagemap_scan_pmd_entry()
>>   mm: Fix pte update and tlb maintenance ordering in
>> migrate_vma_collect_pmd()
>>   mm: Avoid calling page allocator from apply_to_page_range()
>>   mm: Introduce arch_in_lazy_mmu_mode()
>>   mm: Avoid calling page allocator while in lazy mmu mode
>>   Revert "arm64/mm: Permit lazy_mmu_mode to be nested"
>>
>>  arch/arm64/include/asm/pgtable.h  | 22 
>>  .../include/asm/book3s/64/tlbflush-hash.h | 15 ++
>>  arch/sparc/include/asm/tlbflush_64.h  |  1 +
>>  arch/sparc/mm/tlb.c   | 12 +
>>  arch/x86/include/

Re: [RFC PATCH v1 3/6] mm: Avoid calling page allocator from apply_to_page_range()

2025-05-30 Thread Ryan Roberts
On 30/05/2025 17:23, Liam R. Howlett wrote:
> * Ryan Roberts  [250530 10:05]:
>> Lazy mmu mode applies to the current task and permits pte modifications
>> to be deferred and updated at a later time in a batch to improve
>> performance. apply_to_page_range() calls its callback in lazy mmu mode
>> and some of those callbacks call into the page allocator to either
>> allocate or free pages.
>>
>> This is problematic with CONFIG_DEBUG_PAGEALLOC because
>> debug_pagealloc_[un]map_pages() calls the arch implementation of
>> __kernel_map_pages() which must modify the ptes for the linear map.
>>
>> There are two possibilities at this point:
>>
>>  - If the arch implementation modifies the ptes directly without first
>>entering lazy mmu mode, the pte modifications may get deferred until
>>the existing lazy mmu mode is exited. This could result in taking
>>spurious faults for example.
>>
>>  - If the arch implementation enters a nested lazy mmu mode before
>>modification of the ptes (many arches use apply_to_page_range()),
>>then the linear map updates will definitely be applied upon leaving
>>the inner lazy mmu mode. But because lazy mmu mode does not support
>>nesting, the remainder of the outer user is no longer in lazy mmu
>>mode and the optimization opportunity is lost.
>>
>> So let's just ensure that the page allocator is never called from within
>> lazy mmu mode. New "_nolazy" variants of apply_to_page_range() and
>> apply_to_existing_page_range() are introduced which don't enter lazy mmu
>> mode. Then users which need to call into the page allocator within their
>> callback are updated to use the _nolazy variants.
>>
>> Signed-off-by: Ryan Roberts 
>> ---
>>  include/linux/mm.h |  6 ++
>>  kernel/bpf/arena.c |  6 +++---
>>  mm/kasan/shadow.c  |  2 +-
>>  mm/memory.c| 54 +++---
>>  4 files changed, 51 insertions(+), 17 deletions(-)
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index e51dba8398f7..11cae6ce04ff 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -3743,9 +3743,15 @@ static inline bool gup_can_follow_protnone(struct 
>> vm_area_struct *vma,
>>  typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
>>  extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
>> unsigned long size, pte_fn_t fn, void *data);
>> +extern int apply_to_page_range_nolazy(struct mm_struct *mm,
>> +  unsigned long address, unsigned long size,
>> +  pte_fn_t fn, void *data);
> 
> We are removing externs as things are edited, so probably drop them
> here.

ACK

> 
>>  extern int apply_to_existing_page_range(struct mm_struct *mm,
>> unsigned long address, unsigned long size,
>> pte_fn_t fn, void *data);
>> +extern int apply_to_existing_page_range_nolazy(struct mm_struct *mm,
>> +   unsigned long address, unsigned long size,
>> +   pte_fn_t fn, void *data);
>>  
>>  #ifdef CONFIG_PAGE_POISONING
>>  extern void __kernel_poison_pages(struct page *page, int numpages);
>> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
>> index 0d56cea71602..ca833cfeefb7 100644
>> --- a/kernel/bpf/arena.c
>> +++ b/kernel/bpf/arena.c
>> @@ -187,10 +187,10 @@ static void arena_map_free(struct bpf_map *map)
>>  /*
>>   * free_vm_area() calls remove_vm_area() that calls 
>> free_unmap_vmap_area().
>>   * It unmaps everything from vmalloc area and clears pgtables.
>> - * Call apply_to_existing_page_range() first to find populated ptes and
>> - * free those pages.
>> + * Call apply_to_existing_page_range_nolazy() first to find populated
>> + * ptes and free those pages.
>>   */
>> -apply_to_existing_page_range(&init_mm, 
>> bpf_arena_get_kern_vm_start(arena),
>> +apply_to_existing_page_range_nolazy(&init_mm, 
>> bpf_arena_get_kern_vm_start(arena),
>>   KERN_VM_SZ - GUARD_SZ, existing_page_cb, 
>> NULL);
>>  free_vm_area(arena->kern_vm);
>>  range_tree_destroy(&arena->rt);
>> diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
>> index d2c70cd2afb1..2325c5166c3a 100644
>> --- a/mm/kasan/shadow.c
>> +++ b/mm/kasan/shadow.c
>> @@ -590,7 +590,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned 
>> long end,
>>  
>>  
>>  if (flags & KASAN_VMALLOC_PAGE_RANGE)
>> -apply_to_existing_page_range(&init_mm,
>> +apply_to_existing_page_range_nolazy(&init_mm,
>>   (unsigned long)shadow_start,
>>   size, kasan_depopulate_vmalloc_pte,
>>   NULL);
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 49199410805c..24436074ce48 100644
>> --- a/mm/memory.c

Re: [PATCH v2 3/3] remoteproc: imx_rproc: add power mode check for remote core attachment

2025-05-30 Thread Ulf Hansson
On Thu, 29 May 2025 at 22:15, Hiago De Franco  wrote:
>
> On Thu, May 29, 2025 at 03:54:47AM +, Peng Fan wrote:
>
> [...]
>
> > > We are making progress ;-)
> > >
> > > With the patches you shared Ulf (I added them on top of the current
> > > master branch), it works as expected, dev_pm_genpd_is_on() returns 0
> > > when I boot the kernel without M4 running and it returns 1 when I
> > > boot the kernel with M4 running with a hello-world demo.
> > >
> > > However now I tried to, if dev_pm_genpd_is_on() returns 1, put the
> > > DETACHED state, something as
> > >
> > > if (dev_pm_genpd_is_on(priv->pd_list->pd_devs[0]))
> > > priv->rproc->state = RPROC_DETACHED;
> > >
> > > In this case I used 0 because I understand this is the
> > > IMX_SC_R_M4_0_PID0 defined in my device tree overlay:
> > >
> > > power-domains = <&pd IMX_SC_R_M4_0_PID0>,
> > > <&pd IMX_SC_R_M4_0_MU_1A>;
> > >
> > > But in this case, the kernel does not boot anymore, I see the "Starting
> > > kernel..." and nothing else.
> >
> > Please add "earlycon" in bootargs to see where it hangs.
>
> Thanks Peng! I was able to catch the kernel panic yesterday, however I
> must say that today I was doing the tests again and the issue is gone.
> Sorry, I might have done something wrong yesterday with the tests.
> Anyway, here is the log:
>
> [1.271163] remoteproc remoteproc0: imx-rproc is available
> [1.280296] remoteproc remoteproc0: attaching to imx-rproc
> [1.285756] Unable to handle kernel paging request at virtual address 
> 80005ae3dd79
> [1.293624] Mem abort info:
> [1.294655] mmc0: SDHCI controller on 5b01.mmc [5b01.mmc] using 
> ADMA
> [1.296386]   ESR = 0x9605
> [1.307194]   EC = 0x25: DABT (current EL), IL = 32 bits
> [1.312473]   SET = 0, FnV = 0
> [1.315566]   EA = 0, S1PTW = 0
> [1.318649]   FSC = 0x05: level 1 translation fault
> [1.323510] Data abort info:
> [1.326370]   ISV = 0, ISS = 0x0005, ISS2 = 0x
> [1.331846]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
> [1.336882]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
> [1.342182] swapper pgtable: 4k pages, 48-bit VAs, pgdp=96bc1000
> [1.348870] [80005ae3dd79] pgd=, p4d=100097054003, 
> pud=
> [1.357565] Internal error: Oops: 9605 [#1]  SMP
> [1.363198] Modules linked in:
> [1.366236] CPU: 2 UID: 0 PID: 47 Comm: kworker/u16:3 Not tainted 
> 6.15.0-03667-g3f5f09105c40-dirty #826 PREEMPT
> [1.376405] Hardware name: Toradex Colibri iMX8QXP on Colibri Evaluation 
> Board V3 (DT)
> [1.384313] Workqueue: events_unbound deferred_probe_work_func
> [1.390128] pstate: 0005 (nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> [1.397076] pc : rproc_handle_resources.constprop.0+0x78/0x1d0
> [1.402896] lr : rproc_boot+0x368/0x56c
> [1.406717] sp : 8000819c3990
> [1.410017] x29: 8000819c3990 x28: 80005ae3dd7d x27: 
> 
> [1.417145] x26:  x25: 015ec038 x24: 
> 800080f0c0a8
> [1.424268] x23: 8000813a6110 x22: d999ad79 x21: 
> 015ec000
> [1.431392] x20: 26665683 x19: 80005ae3dd79 x18: 
> 0006
> [1.438516] x17: 01799400 x16: 01798e00 x15: 
> 4addd15cca11c529
> [1.445639] x14: 53ebce6d5564d787 x13: 4addd15cca11c529 x12: 
> 53ebce6d5564d787
> [1.452763] x11: 95a1e33b6b190674 x10: 9e3c9abdb41ca345 x9 : 
> ab17b4eaffd6fd1c
> [1.459887] x8 : d5da055de4cfbb87 x7 : dfd7fa31596acbbc x6 : 
> 9946d97107d0dcca
> [1.467011] x5 : 010c7800 x4 : 03fc x3 : 
> 010c7780
> [1.474134] x2 : fff0 x1 : 8000814a3000 x0 : 
> 8000814a3000
> [1.481261] Call trace:
> [1.483690]  rproc_handle_resources.constprop.0+0x78/0x1d0 (P)
> [1.487705] mmc0: new HS400 MMC card at address 0001
> [1.489502]  rproc_boot+0x368/0x56c
> [1.495349] mmcblk0: mmc0:0001 Q2J55L 7.09 GiB
> [1.497929]  rproc_add+0x184/0x190
> [1.504356]  mmcblk0: p1 p2
> [1.505747]  imx_rproc_probe+0x458/0x528
> [1.509238] mmcblk0boot0: mmc0:0001 Q2J55L 16.0 MiB
> [1.512437]  platform_probe+0x68/0xc0
> [1.512452]  really_probe+0xc0/0x38c
> [1.520584] mmcblk0boot1: mmc0:0001 Q2J55L 16.0 MiB
> [1.520951]  __driver_probe_device+0x7c/0x15c
> [1.527522] mmcblk0rpmb: mmc0:0001 Q2J55L 4.00 MiB, chardev (242:0)
> [1.529377]  driver_probe_device+0x3c/0x10c
> [1.544263]  __device_attach_driver+0xbc/0x158
> [1.548586]  bus_for_each_drv+0x84/0xe0
> [1.552407]  __device_attach+0x9c/0x1ac
> [1.556231]  device_initial_probe+0x14/0x20
> [1.560401]  bus_probe_device+0xac/0xb0
> [1.564221]  deferred_probe_work_func+0x9c/0xec
> [1.568741]  process_one_work+0x14c/0x28c
> [1.572735]  worker_thread+0x2cc/0x3d4
> [1.576473]  kthread+0x12c/0x208
> [1.579687]  ret_from_

Re: [RFC PATCH v2 0/9] KVM: Enable Nested Virt selftests

2025-05-30 Thread Miguel Luis
Hi Ganapatrao,

> On 12 May 2025, at 10:52, Ganapatrao Kulkarni 
>  wrote:
> 
> This patch series makes the selftest work with NV enabled. The guest code
> is run in vEL2 instead of EL1. We add a command line option to enable
> testing of NV. The NV tests are disabled by default.
> 

The following two tests arch_timer and vgic_lpi_stress pass for a guest in VHE
mode but in a nVHE mode guest they are failing for me. I’ve tested them on
Marc’s repo tag 'kvmarm-fixes-6.16-1' on an AmpereOne.

Do you have plans to add nvhe mode testing to this series?

Thanks
Miguel

> Modified around 12 selftests in this series.
> 
> Changes since v1:
> - Updated NV helper functions as per comments [1].
> - Modified existing testscases to run guest code in vEL2.
> 
> [1] https://lkml.iu.edu/hypermail/linux/kernel/2502.0/07001.html 
> 
> Ganapatrao Kulkarni (9):
>  KVM: arm64: nv: selftests: Add support to run guest code in vEL2.
>  KVM: arm64: nv: selftests: Add simple test to run guest code in vEL2
>  KVM: arm64: nv: selftests: Enable hypervisor timer tests to run in
>vEL2
>  KVM: arm64: nv: selftests: enable aarch32_id_regs test to run in vEL2
>  KVM: arm64: nv: selftests: Enable vgic tests to run in vEL2
>  KVM: arm64: nv: selftests: Enable set_id_regs test to run in vEL2
>  KVM: arm64: nv: selftests: Enable test to run in vEL2
>  KVM: selftests: arm64: Extend kvm_page_table_test to run guest code in
>vEL2
>  KVM: arm64: nv: selftests: Enable page_fault_test test to run in vEL2
> 
> tools/testing/selftests/kvm/Makefile.kvm  |   2 +
> tools/testing/selftests/kvm/arch_timer.c  |   8 +-
> .../selftests/kvm/arm64/aarch32_id_regs.c |  34 -
> .../testing/selftests/kvm/arm64/arch_timer.c  | 118 +++---
> .../selftests/kvm/arm64/nv_guest_hypervisor.c |  68 ++
> .../selftests/kvm/arm64/page_fault_test.c |  35 +-
> .../testing/selftests/kvm/arm64/set_id_regs.c |  57 -
> tools/testing/selftests/kvm/arm64/vgic_init.c |  54 +++-
> tools/testing/selftests/kvm/arm64/vgic_irq.c  |  27 ++--
> .../selftests/kvm/arm64/vgic_lpi_stress.c |  19 ++-
> .../testing/selftests/kvm/guest_print_test.c  |  32 +
> .../selftests/kvm/include/arm64/arch_timer.h  |  16 +++
> .../kvm/include/arm64/kvm_util_arch.h |   3 +
> .../selftests/kvm/include/arm64/nv_util.h |  45 +++
> .../selftests/kvm/include/arm64/vgic.h|   1 +
> .../testing/selftests/kvm/include/kvm_util.h  |   3 +
> .../selftests/kvm/include/timer_test.h|   1 +
> .../selftests/kvm/kvm_page_table_test.c   |  30 -
> tools/testing/selftests/kvm/lib/arm64/nv.c|  46 +++
> .../selftests/kvm/lib/arm64/processor.c   |  61 ++---
> tools/testing/selftests/kvm/lib/arm64/vgic.c  |   8 ++
> 21 files changed, 604 insertions(+), 64 deletions(-)
> create mode 100644 tools/testing/selftests/kvm/arm64/nv_guest_hypervisor.c
> create mode 100644 tools/testing/selftests/kvm/include/arm64/nv_util.h
> create mode 100644 tools/testing/selftests/kvm/lib/arm64/nv.c
> 
> -- 
> 2.48.1
> 
> 



Re: [PATCH] fs/dax: Fix "don't skip locked entries when scanning entries"

2025-05-30 Thread Christian Brauner
On Fri, 23 May 2025 14:37:49 +1000, Alistair Popple wrote:
> Commit 6be3e21d25ca ("fs/dax: don't skip locked entries when scanning
> entries") introduced a new function, wait_entry_unlocked_exclusive(),
> which waits for the current entry to become unlocked without advancing
> the XArray iterator state.
> 
> Waiting for the entry to become unlocked requires dropping the XArray
> lock. This requires calling xas_pause() prior to dropping the lock
> which leaves the xas in a suitable state for the next iteration. However
> this has the side-effect of advancing the xas state to the next index.
> Normally this isn't an issue because xas_for_each() contains code to
> detect this state and thus avoid advancing the index a second time on
> the next loop iteration.
> 
> [...]

Applied to the vfs.fixes branch of the vfs/vfs.git tree.
Patches in the vfs.fixes branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs.fixes

[1/1] fs/dax: Fix "don't skip locked entries when scanning entries"
  https://git.kernel.org/vfs/vfs/c/dd59137bfe70



Re: [PATCH v3 2/4] media: qcom: camss: Add support for MSM8939

2025-05-30 Thread Bryan O'Donoghue

On 30/05/2025 12:49, Bryan O'Donoghue wrote:

On 30/05/2025 10:00, Vincent Knecht via B4 Relay wrote:

+   camss->res->version == CAMSS_8x39 ||

This is not correct - it should be 893x since 8939 and 8936 are ~ the
same SoC - probably 36 is just a binned version of 39.

Anyway the x is the least significant digit.

Please fix
---
bod



No wait I'm wrong 8939 or 8039 - when adding 36 we will probably just 
use the compat string to differentiate.


This is fine.

Reviewed-by: Bryan O'Donoghue 



Re: [RFC PATCH net-next v2 2/2] selftests: net: add XDP socket tests for virtio-net

2025-05-30 Thread Maciej Fijalkowski
On Thu, May 29, 2025 at 09:29:14PM +0700, Bui Quang Minh wrote:
> On 5/29/25 18:18, Maciej Fijalkowski wrote:
> > On Tue, May 27, 2025 at 11:19:04PM +0700, Bui Quang Minh wrote:
> > > This adds a test to test the virtio-net rx when there is a XDP socket
> > > bound to it. There are tests for both copy mode and zerocopy mode, both
> > > cases when XDP program returns XDP_PASS and XDP_REDIRECT to a XDP socket.
> > > 
> > > Signed-off-by: Bui Quang Minh 
> > Hi Bui,
> > 
> > have you considered adjusting xskxceiver for your needs? If yes and you
> > decided to go with another test app then what were the issues around it?
> > 
> > This is yet another approach for xsk testing where we already have a
> > test framework.
> 
> Hi,
> 
> I haven't tried much hard to adapt xskxceiver. I did have a look at
> xskxceiver but I felt the supported topology is not suitable for my need. To
> test the receiving side in virtio-net, I use Qemu to set up virtio-net in
> the guest and vhost-net in the host side. The sending side is in the host
> and the receiving is in the guest so I can't figure out how to do that with
> xskxceiver.

I see - couldn't the python side be executing xdpsock then instead of your
own app?

I wouldn't like to end up with several xsk tools for testing data path on
different environments.

> 
> Thanks,
> Quang Minh.
> 
> > 



Re: [RFC PATCH v1 3/6] mm: Avoid calling page allocator from apply_to_page_range()

2025-05-30 Thread Liam R. Howlett
* Ryan Roberts  [250530 12:50]:
...

> > 
> > 
> > These wrappers are terrible for readability and annoying for argument
> > lists too.
> 
> Agreed.
> 
> > 
> > Could we do something like the pgtbl_mod_mask or zap_details and pass
> > through a struct or one unsigned int for create and lazy_mmu?
> 
> Or just create some enum flags?
> 
> > 
> > At least we'd have better self-documenting code in the wrappers.. and if
> > we ever need a third boolean, we could avoid multiplying the wrappers
> > again.
> > 
> > WDYT?
> 
> I'm happy with either approach. I was expecting more consternation about the 
> idea
> of being able to disable lazy mode though, so perhaps I'll wait and see if any
> arrives. If it doesn't... flags?

Yes, that works as well.  Please use pmd_flags or anything more
descriptive than just 'flags' :)

I wonder which approach is best in asm instructions and self-documenting
code.

Regards,
Liam




[PATCH bpf-next 1/2] bpftool: Use appropriate permissions for map access

2025-05-30 Thread Slava Imameev
Modify several functions in tools/bpf/bpftool/common.c to allow
specification of requested access for file descriptors, such as
read-only access.

Update bpftool to request only read access for maps when write
access is not required. This fixes errors when reading from maps
that are protected from modification via security_bpf_map.

Signed-off-by: Slava Imameev 
---
 tools/bpf/bpftool/btf.c   |  3 +-
 tools/bpf/bpftool/common.c| 57 ++-
 tools/bpf/bpftool/iter.c  |  2 +-
 tools/bpf/bpftool/link.c  |  2 +-
 tools/bpf/bpftool/main.h  | 13 ---
 tools/bpf/bpftool/map.c   | 56 +-
 tools/bpf/bpftool/map_perf_ring.c |  3 +-
 tools/bpf/bpftool/prog.c  |  4 +--
 8 files changed, 90 insertions(+), 50 deletions(-)

diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index 6b14cbfa58aa..1ba27cb03348 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -905,7 +905,8 @@ static int do_dump(int argc, char **argv)
return -1;
}
 
-   fd = map_parse_fd_and_info(&argc, &argv, &info, &len);
+   fd = map_parse_fd_and_info(&argc, &argv, &info, &len,
+  BPF_F_RDONLY);
if (fd < 0)
return -1;
 
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index ecfa790adc13..ff1c99281beb 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -193,7 +193,8 @@ int mount_tracefs(const char *target)
return err;
 }
 
-int open_obj_pinned(const char *path, bool quiet)
+int open_obj_pinned(const char *path, bool quiet,
+   const struct bpf_obj_get_opts *opts)
 {
char *pname;
int fd = -1;
@@ -205,7 +206,7 @@ int open_obj_pinned(const char *path, bool quiet)
goto out_ret;
}
 
-   fd = bpf_obj_get(pname);
+   fd = bpf_obj_get_opts(pname, opts);
if (fd < 0) {
if (!quiet)
p_err("bpf obj get (%s): %s", pname,
@@ -221,12 +222,13 @@ int open_obj_pinned(const char *path, bool quiet)
return fd;
 }
 
-int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type)
+int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type,
+   const struct bpf_obj_get_opts *opts)
 {
enum bpf_obj_type type;
int fd;
 
-   fd = open_obj_pinned(path, false);
+   fd = open_obj_pinned(path, false, opts);
if (fd < 0)
return -1;
 
@@ -555,7 +557,7 @@ static int do_build_table_cb(const char *fpath, const 
struct stat *sb,
if (typeflag != FTW_F)
goto out_ret;
 
-   fd = open_obj_pinned(fpath, true);
+   fd = open_obj_pinned(fpath, true, NULL);
if (fd < 0)
goto out_ret;
 
@@ -928,7 +930,7 @@ int prog_parse_fds(int *argc, char ***argv, int **fds)
path = **argv;
NEXT_ARGP();
 
-   (*fds)[0] = open_obj_pinned_any(path, BPF_OBJ_PROG);
+   (*fds)[0] = open_obj_pinned_any(path, BPF_OBJ_PROG, NULL);
if ((*fds)[0] < 0)
return -1;
return 1;
@@ -965,7 +967,8 @@ int prog_parse_fd(int *argc, char ***argv)
return fd;
 }
 
-static int map_fd_by_name(char *name, int **fds)
+static int map_fd_by_name(char *name, int **fds,
+ const struct bpf_get_fd_by_id_opts *opts)
 {
unsigned int id = 0;
int fd, nb_fds = 0;
@@ -973,6 +976,7 @@ static int map_fd_by_name(char *name, int **fds)
int err;
 
while (true) {
+   LIBBPF_OPTS(bpf_get_fd_by_id_opts, opts_ro);
struct bpf_map_info info = {};
__u32 len = sizeof(info);
 
@@ -985,7 +989,9 @@ static int map_fd_by_name(char *name, int **fds)
return nb_fds;
}
 
-   fd = bpf_map_get_fd_by_id(id);
+   /* Request a read-only fd to query the map info */
+   opts_ro.open_flags = BPF_F_RDONLY;
+   fd = bpf_map_get_fd_by_id_opts(id, &opts_ro);
if (fd < 0) {
p_err("can't get map by id (%u): %s",
  id, strerror(errno));
@@ -1004,6 +1010,15 @@ static int map_fd_by_name(char *name, int **fds)
continue;
}
 
+   /* Get an fd with the requested options. */
+   close(fd);
+   fd = bpf_map_get_fd_by_id_opts(id, opts);
+   if (fd < 0) {
+   p_err("can't get map by id (%u): %s", id,
+ strerror(errno));
+   goto err_close_fds;
+   }
+
if (nb_fds > 0) {
tmp = realloc(*fds, (nb_fds + 1) * sizeof(int));
if (!tmp

Re: [PATCH] kunit: configs: Enable CONFIG_INIT_STACK_ALL_PATTERN in all_tests

2025-05-30 Thread Richard Fitzgerald

On 29/5/25 16:38, Jakub Kicinski wrote:

On Fri, 11 Apr 2025 10:59:04 +0100 Richard Fitzgerald wrote:

Enable CONFIG_INIT_STACK_ALL_PATTERN in all_tests.config. This helps
to detect use of uninitialized local variables.

This option found an uninitialized data bug in the cs_dsp test.

Signed-off-by: Richard Fitzgerald 
---
  tools/testing/kunit/configs/all_tests.config | 1 +
  1 file changed, 1 insertion(+)

diff --git a/tools/testing/kunit/configs/all_tests.config 
b/tools/testing/kunit/configs/all_tests.config
index cdd9782f9646..4a60bb71fe72 100644
--- a/tools/testing/kunit/configs/all_tests.config
+++ b/tools/testing/kunit/configs/all_tests.config
@@ -10,6 +10,7 @@ CONFIG_KUNIT_EXAMPLE_TEST=y
  CONFIG_KUNIT_ALL_TESTS=y
  
  CONFIG_FORTIFY_SOURCE=y

+CONFIG_INIT_STACK_ALL_PATTERN=y


This breaks kunit for older compilers:


Drop it then.
It's not essential. Just something that showed a bug in a test so I
thought it would be useful to test always. But if there are compatibility
problems it would be better not to have it in all_tests.




[PATCH bpf-next 2/2] selftests/bpf: Add test for bpftool access to read-only protected maps

2025-05-30 Thread Slava Imameev
Add selftest cases that validate bpftool's expected behavior when
accessing maps protected from modification via security_bpf_map.

The test includes a BPF program attached to security_bpf_map with two maps:
- A protected map that only allows read-only access
- An unprotected map that allows full access

The test script attaches the BPF program to security_bpf_map and
verifies that for the bpftool map command:
- Read access works on both maps
- Write access fails on the protected map
- Write access succeeds on the unprotected map
- These behaviors remain consistent when the maps are pinned

Signed-off-by: Slava Imameev 
---
 tools/testing/selftests/bpf/Makefile  |   1 +
 .../selftests/bpf/progs/security_bpf_map.c|  56 +
 .../testing/selftests/bpf/test_bpftool_map.sh | 208 ++
 3 files changed, 265 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/security_bpf_map.c
 create mode 100755 tools/testing/selftests/bpf/test_bpftool_map.sh

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index cf5ed3bee573..731a86407799 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -109,6 +109,7 @@ TEST_PROGS := test_kmod.sh \
test_xdping.sh \
test_bpftool_build.sh \
test_bpftool.sh \
+   test_bpftool_map.sh \
test_bpftool_metadata.sh \
test_doc_build.sh \
test_xsk.sh \
diff --git a/tools/testing/selftests/bpf/progs/security_bpf_map.c 
b/tools/testing/selftests/bpf/progs/security_bpf_map.c
new file mode 100644
index ..57226f2ceb5f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/security_bpf_map.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "vmlinux.h"
+#include 
+#include 
+
+char _license[] SEC("license") = "GPL";
+
+#define EPERM 1 /* Operation not permitted */
+
+/* From include/linux/mm.h. */
+#define FMODE_WRITE0x2
+
+struct map;
+
+struct {
+   __uint(type, BPF_MAP_TYPE_ARRAY);
+   __type(key, __u32);
+   __type(value, __u32);
+   __uint(max_entries, 1);
+} prot_map SEC(".maps");
+
+struct {
+   __uint(type, BPF_MAP_TYPE_ARRAY);
+   __type(key, __u32);
+   __type(value, __u32);
+   __uint(max_entries, 1);
+} not_prot_map SEC(".maps");
+
+SEC("fmod_ret/security_bpf_map")
+int BPF_PROG(fmod_bpf_map, struct bpf_map *map, int fmode)
+{
+   if (map == &prot_map) {
+   /* Allow read-only access */
+   if (fmode & FMODE_WRITE)
+   return -EPERM;
+   }
+
+   return 0;
+}
+
+/*
+ * This program keeps references to maps. This is needed to prevent
+ * optimizing them out.
+ */
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(bpf_fentry_test1, int a)
+{
+   __u32 key = 0;
+   __u32 val1 = a;
+   __u32 val2 = a + 1;
+
+   bpf_map_update_elem(&prot_map, &key, &val1, BPF_ANY);
   bpf_map_update_elem(&not_prot_map, &key, &val2, BPF_ANY);
+   return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_bpftool_map.sh 
b/tools/testing/selftests/bpf/test_bpftool_map.sh
new file mode 100755
index ..c7c7f3d2071e
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bpftool_map.sh
@@ -0,0 +1,208 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+PROTECTED_MAP_NAME="prot_map"
+NOT_PROTECTED_MAP_NAME="not_prot_map"
+BPF_FILE="security_bpf_map.bpf.o"
+TESTNAME="security_bpf_map"
+BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts)
+BPF_DIR="$BPF_FS/test_$TESTNAME"
+SCRIPT_DIR=$(dirname $(realpath "$0"))
+BPF_FILE_PATH="$SCRIPT_DIR/$BPF_FILE"
+# Assume the script is located under tools/testing/selftests/bpf/
+KDIR_ROOT_DIR=$(realpath "$SCRIPT_DIR"/../../../../)
+
+_cleanup()
+{
+   set +eu
+   [ -d "$TMPDIR" ] && rm -rf "$TMPDIR" 2> /dev/null
+   [ -d "$BPF_DIR" ] && rm -rf "$BPF_DIR" 2> /dev/null
+}
+
+cleanup_skip()
+{
+   echo "selftests: $TESTNAME [SKIP]"
+   _cleanup
+
+   exit $ksft_skip
+}
+
+cleanup()
+{
+   if [ "$?" = 0 ]; then
+   echo "selftests: $TESTNAME [PASS]"
+   else
+   echo "selftests: $TESTNAME [FAILED]"
+   fi
+   _cleanup
+}
+
+# Parameters:
+#   $1: The top of kernel repository
+#   $2: Output directory
+build_bpftool()
+{
+   local kdir_root_dir="$1"
+   local output_dir="$2"
+   local pwd="$(pwd)"
+   local ncpus=1
+
+   echo Building bpftool ...
+
+   #We want to start build from the top of kernel repository.
+   cd "$kdir_root_dir"
+   if [ ! -e tools/bpf/bpftool/Makefile ]; then
+   echo bpftool files not found
+   exit $ksft_skip
+   fi
+
+   # Determine the number of CPUs for parallel compilation
+   if command -v nproc >/dev/null 2>&1; then
+   ncpus=$(nproc)
+   fi
+
+   make -C tools/bpf/bpftool -s -j"$ncpus" OUTPUT="$output_dir"/ >/dev/null

Re: [RFC PATCH v2 0/9] KVM: Enable Nested Virt selftests

2025-05-30 Thread Oliver Upton
Hi Ganapat,

On Mon, May 12, 2025 at 03:52:42AM -0700, Ganapatrao Kulkarni wrote:
> This patch series makes the selftest work with NV enabled. The guest code
> is run in vEL2 instead of EL1. We add a command line option to enable
> testing of NV. The NV tests are disabled by default.
> 
> Modified around 12 selftests in this series.

Thanks for sharing this, we sorely need testing for NV.

I haven't looked at these patches thoroughly but I have some overall
feedback. What I'd like to see is that we force *all* KVM selftests
to run in VHE EL2 without an opt-in/opt-out. The user had to boot their
kernel with kvm-arm.mode=nested, so imposing NV testing on them feels
reasonable.

Thanks,
Oliver



Re: [PATCH v1 1/1] selftests/x86: Add a test to detect infinite sigtrap handler loop

2025-05-30 Thread Sohil Mehta
On 5/30/2025 4:07 PM, Xin Li (Intel) wrote:
> When FRED is enabled, if the Trap Flag (TF) is set without an external
> debugger attached, it can lead to an infinite loop in the SIGTRAP
> handler.  To avoid this, the software event flag in the augmented SS
> must be cleared, ensuring that no single-step trap remains pending when
> ERETU completes.
> 
> This test checks for that specific scenario—verifying whether the kernel
> correctly prevents an infinite SIGTRAP loop in this edge case.
> 

It isn't clear from the commit message whether the test is specific to
FRED or a generic one.

> Signed-off-by: Xin Li (Intel) 
> ---
>  tools/testing/selftests/x86/Makefile  |  2 +-
>  .../selftests/x86/test_sigtrap_handler.c  | 80 +++
>  2 files changed, 81 insertions(+), 1 deletion(-)
>  create mode 100644 tools/testing/selftests/x86/test_sigtrap_handler.c
> 
> diff --git a/tools/testing/selftests/x86/Makefile 
> b/tools/testing/selftests/x86/Makefile
> index f703fcfe9f7c..c486fd88ebb1 100644
> --- a/tools/testing/selftests/x86/Makefile
> +++ b/tools/testing/selftests/x86/Makefile
> @@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh "$(CC)" 
> trivial_program.c -no-pie)
>  
>  TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt 
> test_mremap_vdso \
>   check_initial_reg_state sigreturn iopl ioperm \
> - test_vsyscall mov_ss_trap \
> + test_vsyscall mov_ss_trap test_sigtrap_handler \
>   syscall_arg_fault fsgsbase_restore sigaltstack
>  TARGETS_C_BOTHBITS += nx_stack
>  TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
> diff --git a/tools/testing/selftests/x86/test_sigtrap_handler.c 
> b/tools/testing/selftests/x86/test_sigtrap_handler.c
> new file mode 100644
> index ..9c5c2cf0cf88
> --- /dev/null
> +++ b/tools/testing/selftests/x86/test_sigtrap_handler.c
> @@ -0,0 +1,80 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later

Curious about your use of GPL-2.0-or-later?

All the files in this directory use GPL-2.0-only or GPL-2.0.

> +/*
> + *  Copyright (C) 2025 Intel Corporation
> + */
> +#define _GNU_SOURCE
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#ifdef __x86_64__
> +# define REG_IP REG_RIP
> +#else
> +# define REG_IP REG_EIP
> +#endif
> +
> +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), 
> int flags)
> +{
> + struct sigaction sa;
> +
> + memset(&sa, 0, sizeof(sa));
> + sa.sa_sigaction = handler;
> + sa.sa_flags = SA_SIGINFO | flags;
> + sigemptyset(&sa.sa_mask);
> +
> + if (sigaction(sig, &sa, 0))
> + err(1, "sigaction");
> +
> + return;
> +}
> +
> +static unsigned int loop_count_on_same_ip;
> +
> +static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
> +{
> + ucontext_t *ctx = (ucontext_t *)ctx_void;
> + static unsigned long last_trap_ip;
> +
> + if (last_trap_ip == ctx->uc_mcontext.gregs[REG_IP]) {
> + printf("trapped on %016lx\n", last_trap_ip);
> +
> + if (++loop_count_on_same_ip > 10) {
> + printf("trap loop detected, test failed\n");
> + exit(2);
> + }

Most of the x86 selftests use the ksft_exit_fail_msg(), ksft_print_msg()
or [RUN, FAIL, OK] style for error messages and other informational prints.


> +
> + return;
> + }
> +
> + loop_count_on_same_ip = 0;
> + last_trap_ip = ctx->uc_mcontext.gregs[REG_IP];
> + printf("trapped on %016lx\n", last_trap_ip);
> +}
> +
> +int main(int argc, char *argv[])
> +{
> + sethandler(SIGTRAP, sigtrap, 0);
> +

I would suggest a comment here to explain what the following assembly
code is supposed to do. It isn't obvious from a cursory look.

> + asm volatile(
> +#ifdef __x86_64__
> + /* Avoid clobbering the redzone */
> + "sub $128, %rsp\n\t"
> +#endif
> + "push $0x302\n\t"
> + "popf\n\t"
> + "nop\n\t"
> + "nop\n\t"
> + "push $0x202\n\t"
> + "popf\n\t"
> +#ifdef __x86_64__
> + "add $128, %rsp\n\t"
> +#endif
> + );
> +
> + printf("test passed\n");
> + return 0;
> +}
> 
> base-commit: 485d11d84a2452ac16466cc7ae041c93d38929bc




[PATCH net v2] selftests: net: build net/lib dependency in all target

2025-05-30 Thread Bui Quang Minh
We have the logic to include net/lib automatically for net related
selftests. However, currently, this logic is only in install target
which means only `make install` will have net/lib included. This commit
moves the logic to all target so that all `make`, `make run_tests` and
`make install` will have net/lib included in net related selftests.

Reviewed-by: Jakub Kicinski 
Signed-off-by: Bui Quang Minh 
---
Changes in v2:
- Make the commit message clearer.

 tools/testing/selftests/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 6aa11cd3db42..5b04d83ad9a1 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -205,7 +205,7 @@ export KHDR_INCLUDES
 
 all:
@ret=1; \
-   for TARGET in $(TARGETS); do\
+   for TARGET in $(TARGETS) $(INSTALL_DEP_TARGETS); do \
BUILD_TARGET=$$BUILD/$$TARGET;  \
mkdir $$BUILD_TARGET  -p;   \
$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET   \
@@ -270,7 +270,7 @@ ifdef INSTALL_PATH
install -m 744 run_kselftest.sh $(INSTALL_PATH)/
rm -f $(TEST_LIST)
@ret=1; \
-   for TARGET in $(TARGETS) $(INSTALL_DEP_TARGETS); do \
+   for TARGET in $(TARGETS); do \
BUILD_TARGET=$$BUILD/$$TARGET;  \
$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install \
INSTALL_PATH=$(INSTALL_PATH)/$$TARGET \
-- 
2.43.0




Re: [PATCH net v2] selftests: net: build net/lib dependency in all target

2025-05-30 Thread Bui Quang Minh

On 5/31/25 12:10, Bui Quang Minh wrote:

We have the logic to include net/lib automatically for net related
selftests. However, currently, this logic is only in install target
which means only `make install` will have net/lib included. This commit
moves the logic to all target so that all `make`, `make run_tests` and
`make install` will have net/lib included in net related selftests.

Reviewed-by: Jakub Kicinski 
Signed-off-by: Bui Quang Minh 
---
Changes in v2:
- Make the commit message clearer.

  tools/testing/selftests/Makefile | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 6aa11cd3db42..5b04d83ad9a1 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -205,7 +205,7 @@ export KHDR_INCLUDES
  
  all:

@ret=1; \
-   for TARGET in $(TARGETS); do\
+   for TARGET in $(TARGETS) $(INSTALL_DEP_TARGETS); do \
BUILD_TARGET=$$BUILD/$$TARGET;  \
mkdir $$BUILD_TARGET  -p;   \
$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET   \
@@ -270,7 +270,7 @@ ifdef INSTALL_PATH
install -m 744 run_kselftest.sh $(INSTALL_PATH)/
rm -f $(TEST_LIST)
@ret=1; \
-   for TARGET in $(TARGETS) $(INSTALL_DEP_TARGETS); do \
+   for TARGET in $(TARGETS); do \
BUILD_TARGET=$$BUILD/$$TARGET;  \
$(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install \
INSTALL_PATH=$(INSTALL_PATH)/$$TARGET \


I'm sorry but please don't merge this. I realize that we cannot remove 
INSTALL_DEP_TARGETS in install target, if we do so, net/lib will not be 
copied to the INSTALL_PATH. I'll post a version 3 shortly after.


Thanks,
Quang Minh.



Re: [PATCH v3 2/4] media: qcom: camss: Add support for MSM8939

2025-05-30 Thread Konrad Dybcio
On 5/30/25 1:49 PM, Bryan O'Donoghue wrote:
> On 30/05/2025 10:00, Vincent Knecht via B4 Relay wrote:
>> +camss->res->version == CAMSS_8x39 ||
> 
> This is not correct - it should be 893x since 8939 and 8936 are ~ the 
> same SoC - probably 36 is just a binned version of 39.
> 
> Anyway the x is the least significant digit.

x here is the modem config, but we generally agree to just use the
most obvious representative, so 8939 is ok

Konrad



[PATCH v1 1/1] selftests/x86: Add a test to detect infinite sigtrap handler loop

2025-05-30 Thread Xin Li (Intel)
When FRED is enabled, if the Trap Flag (TF) is set without an external
debugger attached, it can lead to an infinite loop in the SIGTRAP
handler.  To avoid this, the software event flag in the augmented SS
must be cleared, ensuring that no single-step trap remains pending when
ERETU completes.

This test checks for that specific scenario—verifying whether the kernel
correctly prevents an infinite SIGTRAP loop in this edge case.

Signed-off-by: Xin Li (Intel) 
---
 tools/testing/selftests/x86/Makefile  |  2 +-
 .../selftests/x86/test_sigtrap_handler.c  | 80 +++
 2 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/test_sigtrap_handler.c

diff --git a/tools/testing/selftests/x86/Makefile 
b/tools/testing/selftests/x86/Makefile
index f703fcfe9f7c..c486fd88ebb1 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh "$(CC)" 
trivial_program.c -no-pie)
 
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt 
test_mremap_vdso \
check_initial_reg_state sigreturn iopl ioperm \
-   test_vsyscall mov_ss_trap \
+   test_vsyscall mov_ss_trap test_sigtrap_handler \
syscall_arg_fault fsgsbase_restore sigaltstack
 TARGETS_C_BOTHBITS += nx_stack
 TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
diff --git a/tools/testing/selftests/x86/test_sigtrap_handler.c 
b/tools/testing/selftests/x86/test_sigtrap_handler.c
new file mode 100644
index ..9c5c2cf0cf88
--- /dev/null
+++ b/tools/testing/selftests/x86/test_sigtrap_handler.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *  Copyright (C) 2025 Intel Corporation
+ */
+#define _GNU_SOURCE
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifdef __x86_64__
+# define REG_IP REG_RIP
+#else
+# define REG_IP REG_EIP
+#endif
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int 
flags)
+{
+   struct sigaction sa;
+
+   memset(&sa, 0, sizeof(sa));
+   sa.sa_sigaction = handler;
+   sa.sa_flags = SA_SIGINFO | flags;
+   sigemptyset(&sa.sa_mask);
+
+   if (sigaction(sig, &sa, 0))
+   err(1, "sigaction");
+
+   return;
+}
+
+static unsigned int loop_count_on_same_ip;
+
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+   ucontext_t *ctx = (ucontext_t *)ctx_void;
+   static unsigned long last_trap_ip;
+
+   if (last_trap_ip == ctx->uc_mcontext.gregs[REG_IP]) {
+   printf("trapped on %016lx\n", last_trap_ip);
+
+   if (++loop_count_on_same_ip > 10) {
+   printf("trap loop detected, test failed\n");
+   exit(2);
+   }
+
+   return;
+   }
+
+   loop_count_on_same_ip = 0;
+   last_trap_ip = ctx->uc_mcontext.gregs[REG_IP];
+   printf("trapped on %016lx\n", last_trap_ip);
+}
+
+int main(int argc, char *argv[])
+{
+   sethandler(SIGTRAP, sigtrap, 0);
+
+   asm volatile(
+#ifdef __x86_64__
+   /* Avoid clobbering the redzone */
+   "sub $128, %rsp\n\t"
+#endif
+   "push $0x302\n\t"
+   "popf\n\t"
+   "nop\n\t"
+   "nop\n\t"
+   "push $0x202\n\t"
+   "popf\n\t"
+#ifdef __x86_64__
+   "add $128, %rsp\n\t"
+#endif
+   );
+
+   printf("test passed\n");
+   return 0;
+}

base-commit: 485d11d84a2452ac16466cc7ae041c93d38929bc
-- 
2.49.0