[PATCH v2 32/32] HACK: selftests/nolibc: demonstrate usage of the kselftest harness

2025-04-07 Thread Thomas Weißschuh
Show how to use the kselftest harness together with nolibc.
This just runs the existing harness selftest by crudely replacing the
regular nolibc-test.c with the harness-selftest.c to get that wired up easily.
To use it:
$ cd tools/testing/selftests/nolibc/
$ ./run-tests -m user

In the future nolibc-test can use the harness for itself.
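
For reference, a harness-based test under nolibc would look like any other
kselftest harness test; a minimal sketch (illustrative only, not part of this
patch) could be:

  #include "../kselftest_harness.h"

  TEST(getpid_is_positive)
  {
  	ASSERT_GT(getpid(), 0);
  }

  TEST_HARNESS_MAIN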

Not-Signed-off-by: Thomas Weißschuh 
---
 .../testing/selftests/kselftest/harness-selftest.c |2 +-
 tools/testing/selftests/nolibc/Makefile|   13 +-
 tools/testing/selftests/nolibc/harness-selftest.c  |1 +
 tools/testing/selftests/nolibc/nolibc-test.c   | 1715 +---
 tools/testing/selftests/nolibc/run-tests.sh|2 +-
 5 files changed, 11 insertions(+), 1722 deletions(-)

diff --git a/tools/testing/selftests/kselftest/harness-selftest.c b/tools/testing/selftests/kselftest/harness-selftest.c
index 8d39e7a0b99c41a5d33edfe2dbf875cac04c098d..bbb2fda7042ca8bac608625e6f4302466b23f7b3 100644
--- a/tools/testing/selftests/kselftest/harness-selftest.c
+++ b/tools/testing/selftests/kselftest/harness-selftest.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
 
 #include 
 
diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index 58bcbbd029bc3ad9ccac968191b703ccf5df0717..7c037fb3f3c8d8e510cf8a6e80fcd11f0ec3538c 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -165,8 +165,8 @@ Q=@
 endif
 
 CFLAGS_i386 = $(call cc-option,-m32)
-CFLAGS_arm = -marm
-CFLAGS_armthumb = -mthumb -march=armv6t2
+CFLAGS_arm = -marm -march=armv7-a
+CFLAGS_armthumb = -mthumb -march=armv7
 CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple)
 CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple)
 CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2)
@@ -175,8 +175,10 @@ CFLAGS_s390 = -m31
 CFLAGS_mips32le = -EL -mabi=32 -fPIC
 CFLAGS_mips32be = -EB -mabi=32
 CFLAGS_STACKPROTECTOR ?= $(call cc-option,-mstack-protector-guard=global $(call cc-option,-fstack-protector-all))
-CFLAGS  ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wextra \
+CFLAGS  ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wextra -ggdb -ffreestanding \
+   -ffile-prefix-map=./= \
	$(call cc-option,-fno-stack-protector) $(call cc-option,-Wmissing-prototypes) \
+   $(call cc-option,-mno-outline-atomics) \
$(CFLAGS_$(XARCH)) $(CFLAGS_STACKPROTECTOR) $(CFLAGS_EXTRA)
 LDFLAGS :=
 
@@ -193,10 +195,7 @@ include $(srctree)/tools/scripts/Makefile.include
 # GCC uses "s390", clang "systemz"
 CLANG_CROSS_FLAGS := $(subst --target=s390-linux,--target=systemz-linux,$(CLANG_CROSS_FLAGS))
 
-REPORT  ?= awk '/\[OK\][\r]*$$/{p++} /\[FAIL\][\r]*$$/{if (!f) printf("\n"); f++; print;} /\[SKIPPED\][\r]*$$/{s++} \
-	END{ printf("\n%3d test(s): %3d passed, %3d skipped, %3d failed => status: ", p+s+f, p, s, f); \
-	if (f || !p) printf("failure\n"); else if (s) printf("warning\n"); else printf("success\n");; \
-	printf("\nSee all results in %s\n", ARGV[1]); }'
+REPORT  = sed -i -e '/^\[/d' -e 's/\x0d//' run.out; cmp ../kselftest/harness-selftest.expected run.out && echo ok; true
 
 help:
@echo "Supported targets under selftests/nolibc:"
diff --git a/tools/testing/selftests/nolibc/harness-selftest.c b/tools/testing/selftests/nolibc/harness-selftest.c
new file mode 120000
index 0000000000000000000000000000000000000000..847b121e60482513cd0911422cfdb19bdf681bd6
--- /dev/null
+++ b/tools/testing/selftests/nolibc/harness-selftest.c
@@ -0,0 +1 @@
+../kselftest/harness-selftest.c
\ No newline at end of file
diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 16ec4f658bbec43440679c5d5c35014827c377bc..9a074d2b24c99d86bf27f8399f2e7dc719dbcd24 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -1,1716 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
-#define _GNU_SOURCE
-#define _LARGEFILE64_SOURCE
+#define inline __inline__
 
-/* libc-specific include files
- * The program may be built in 3 ways:
- *   $(CC) -nostdlib -include /path/to/nolibc.h => NOLIBC already defined
- *   $(CC) -nostdlib -I/path/to/nolibc/sysroot  => _NOLIBC_* guards are present
- *   $(CC) with default libc=> NOLIBC* never defined
- */
-#ifndef NOLIBC
-#include 
-#include 
-#include 
-#ifndef _NOLIBC_STDIO_H
-/* standard libcs need more includes */
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#endif
-#endif
-
-#pr

[PATCH v2 30/32] selftests/nolibc: add test for snprintf() truncation

2025-04-07 Thread Thomas Weißschuh
Now that we have a proper snprintf() implementation,
make sure truncation is handled properly.
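
As a reminder of the standard semantics the test relies on: when the output is
truncated, snprintf()/vsnprintf() still return the length the full output would
have had, e.g. (illustrative fragment):

  char buf[21];
  int w;

  w = snprintf(buf, sizeof(buf), "%s", "0123456789012345678901234");
  /* w == 25 (untruncated length), buf holds "01234567890123456789" */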

Signed-off-by: Thomas Weißschuh 
Acked-by: Willy Tarreau 
---
 tools/testing/selftests/nolibc/nolibc-test.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 6dfa94df37547dae46ab19195a763fe22b065bab..9bd0a9c68b903cbd660ff81d4b0386b0b7c13977 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -1307,7 +1307,8 @@ static int expect_vfprintf(int llen, int c, const char *expected, const char *fmt, ...)
 
 
va_start(args, fmt);
-   w = vsnprintf(buf, sizeof(buf), fmt, args);
+   /* Only allow writing 21 bytes, to test truncation */
+   w = vsnprintf(buf, 21, fmt, args);
va_end(args);
 
if (w != c) {
@@ -1413,6 +1414,7 @@ static int run_printf(int min, int max)
	CASE_TEST(pointer);  EXPECT_VFPRINTF(3, "0x1", "%p", (void *) 0x1); break;
	CASE_TEST(uintmax_t);EXPECT_VFPRINTF(20, "18446744073709551615", "%ju", 0xffffffffffffffffULL); break;
	CASE_TEST(intmax_t); EXPECT_VFPRINTF(20, "-9223372036854775807", "%jd", 0x8000000000000001LL); break;
+	CASE_TEST(truncation);   EXPECT_VFPRINTF(25, "01234567890123456789", "%s", "0123456789012345678901234"); break;
CASE_TEST(scanf);EXPECT_ZR(1, test_scanf()); break;
case __LINE__:
return ret; /* must be last */

-- 
2.49.0




[PATCH v17 08/15] PCI: endpoint: pci-ep-msi: Add MSI address/data pair mutable check

2025-04-07 Thread Frank Li
Some MSI controllers change the address/data pair when irq_set_affinity()
is called. The current PCI endpoint framework can't support this type of
MSI controller, so add the flag MSI_FLAG_MUTABLE in include/linux/msi.h
and check it when allocating the doorbell.

Signed-off-by: Frank Li 
---
change from v14 to v16
- none

change from  v13 to v14
- bring v10 back

Change from v9 to v10
- new patch
---
 drivers/pci/endpoint/pci-ep-msi.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/pci/endpoint/pci-ep-msi.c b/drivers/pci/endpoint/pci-ep-msi.c
index 549b55b864d0e..c0e2d806ee658 100644
--- a/drivers/pci/endpoint/pci-ep-msi.c
+++ b/drivers/pci/endpoint/pci-ep-msi.c
@@ -44,6 +44,14 @@ int pci_epf_alloc_doorbell(struct pci_epf *epf, u16 num_db)
 
dev_set_msi_domain(dev, dom);
 
+   if (!irq_domain_is_msi_parent(dom))
+   return -EINVAL;
+
+   if (!irq_domain_is_msi_immutable(dom)) {
+   dev_err(dev, "Can't support mutable address/data pair MSI controller\n");
+   return -EINVAL;
+   }
+
msg = kcalloc(num_db, sizeof(struct pci_epf_doorbell_msg), GFP_KERNEL);
if (!msg)
return -ENOMEM;

-- 
2.34.1




Re: [PATCH] kbuild: Require pahole >v1.29 with GENDWARFKSYMS and BTF on X86

2025-04-07 Thread Sami Tolvanen
Hi Masahiro,

On Sun, Apr 6, 2025 at 1:21 PM Masahiro Yamada  wrote:
>
> On Fri, Mar 21, 2025 at 8:28 AM Sami Tolvanen  wrote:
> >
> > With CONFIG_GENDWARFKSYMS, __gendwarfksyms_ptr variables are added
> > to the kernel in EXPORT_SYMBOL() to ensure DWARF type information
> > is available for exported symbols in the TUs where they're actually
> > exported. These symbols are dropped when linking vmlinux, but
> > dangling references to them remain in DWARF.
> >
> > With CONFIG_DEBUG_INFO_BTF enabled on X86, pahole versions
> > before commit 9810758003ce ("btf_encoder: Verify 0 address
> > DWARF variables are in ELF section") place these symbols in the
> > .data..percpu section, which results in an "Invalid offset" error in
> > btf_datasec_check_meta() during boot, as all the variables are at
> > zero offset and have non-zero size. If CONFIG_DEBUG_INFO_BTF_MODULES
> > is enabled, this also results in a failure to load modules with:
> >
> >   failed to validate module [$module] BTF: -22
> >
> > The pahole commit that adds 0 address DWARF variable verification
> > was merged after v1.29 was released, so later versions of pahole
> > shouldn't have this issue. Require pahole >v1.29 when GENDWARFKSYMS
> > is enabled with DEBUG_INFO_BTF on X86.
> >
> > Reported-by: Paolo Pisati 
> > Signed-off-by: Sami Tolvanen 
>
> The issue occurs with
> 47dcb534e253 ("btf_encoder: Stop indexing symbols for VARs"),
> then fixed by  9810758003ce ("btf_encoder: Verify 0 address
> DWARF variables are in ELF section")
>
>
> Perhaps, does it make sense to do this?
>
>  depends on !X86 || !DEBUG_INFO_BTF || (PAHOLE_VERSION > 129 ||
> PAHOLE_VERSION < 128)

That's a good point. I confirmed that v1.27 works fine too. I'll send v2.

Sami



Re: [PATCH] vhost/net: remove zerocopy support

2025-04-07 Thread Jon Kohler


> On Apr 6, 2025, at 7:14 PM, Jason Wang  wrote:
> 
> 
> On Fri, Apr 4, 2025 at 10:24 PM Jon Kohler  wrote:
>> 
>> Commit 098eadce3c62 ("vhost_net: disable zerocopy by default") disabled
>> the module parameter for the handle_tx_zerocopy path back in 2019,
>> noting that many downstream distributions (e.g., RHEL7 and later) had
>> already done the same.
>> 
>> Both upstream and downstream disablement suggest this path is rarely
>> used.
>> 
>> Testing the module parameter shows that while the path allows packet
>> forwarding, the zerocopy functionality itself is broken. On outbound
>> traffic (guest TX -> external), zerocopy SKBs are orphaned by either
>> skb_orphan_frags_rx() (used with the tun driver via tun_net_xmit())
> 
> This is by design to avoid DOS.

I understand that, but it makes ZC non-functional in general, as ZC fails
and immediately increments the error counters.

> 
>> or
>> skb_orphan_frags() elsewhere in the stack,
> 
> Basically zerocopy is expected to work for guest -> remote case, so
> could we still hit skb_orphan_frags() in this case?

Yes, you’d hit that in tun_net_xmit(). If you punch a hole in that *and* in the
zc error counter (such that failed ZC doesn’t disable ZC in vhost), you do get
ZC from vhost; however, the skb then goes through the network interrupt handler
under net_tx_action and eventually incurs the memcpy under dev_queue_xmit_nit().

This is no more performant, and in fact is actually worse since the time spent
waiting on that memcpy to resolve is longer.

> 
>> as vhost_net does not set
>> SKBFL_DONT_ORPHAN.
>> 
>> Orphaning enforces a memcpy and triggers the completion callback, which
>> increments the failed TX counter, effectively disabling zerocopy again.
>> 
>> Even after addressing these issues to prevent SKB orphaning and error
>> counter increments, performance remains poor. By default, only 64
>> messages can be zerocopied, which is immediately exhausted by workloads
>> like iperf, resulting in most messages being memcpy'd anyhow.
>> 
>> Additionally, memcpy'd messages do not benefit from the XDP batching
>> optimizations present in the handle_tx_copy path.
>> 
>> Given these limitations and the lack of any tangible benefits, remove
>> zerocopy entirely to simplify the code base.
>> 
>> Signed-off-by: Jon Kohler 
> 
> Any chance we can fix those issues? Actually, we had a plan to make
> use of vhost-net and its tx zerocopy (or even implement the rx
> zerocopy) in pasta.

Happy to take direction and ideas here, but I don’t see a clear way to fix these
issues, without dealing with the assertions that skb_orphan_frags_rx calls out.

Said another way, I’d be interested in hearing if there is a config where ZC in
the current vhost-net implementation works, as I was driving myself crazy trying
to reverse engineer it.

Happy to collaborate if there is something we could do here.

> 
> Eugenio may explain more here.
> 
> Thanks
> 



Re: [PATCH] selftest/mm: Make hugetlb_reparenting_test tolerant to async reparenting

2025-04-07 Thread Donet Tom



On 4/7/25 2:12 PM, Li Wang wrote:

In cgroup v2, memory and hugetlb usage reparenting is asynchronous.
This can cause test flakiness when immediately asserting usage after
deleting a child cgroup. To address this, add a helper function
`assert_with_retry()` that checks usage values with a timeout-based retry.
This improves test stability without relying on fixed sleep delays.

Also bump up the tolerance size to 7MB.

To avoid False Positives:
   ...
   # Assert memory charged correctly for child only use.
   # actual a = 11 MB
   # expected a = 0 MB
   # fail
   # cleanup
   # [FAIL]
   not ok 11 hugetlb_reparenting_test.sh -cgroup-v2 # exit=1
   # 0
   # SUMMARY: PASS=10 SKIP=0 FAIL=1



I was also seeing this failure. I have tested this patch on my PowerPC
setup and it is passing now.

./hugetlb_reparenting_test.sh -cgroup-v2
cleanup

Test charge, rmdir, uncharge
mkdir
write
Writing to this path: /mnt/huge/test
Writing this size: 52428800
Populating.
Not writing to memory.
Using method=0
Shared mapping.
RESERVE mapping.
Allocating using HUGETLBFS.

rmdir
uncharge
cleanup
done


Test child only hugetlb usage
setup
write
Writing to this path: /mnt/huge/test2
Writing this size: 52428800
Populating.
Not writing to memory.
Using method=0
Shared mapping.
RESERVE mapping.
Allocating using HUGETLBFS.

Assert memory charged correctly for child only use.
actual = 10 MB
expected = 0 MB
cleanup


Feel free to add
Tested-by: Donet Tom 




Signed-off-by: Li Wang 
Cc: Waiman Long 
Cc: Anshuman Khandual 
Cc: Dev Jain 
Cc: Kirill A. Shuemov 
Cc: Shuah Khan 
---
  .../selftests/mm/hugetlb_reparenting_test.sh  | 96 ---
  1 file changed, 41 insertions(+), 55 deletions(-)

diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
index 11f9bbe7dc22..1c172c6999f4 100755
--- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
+++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
@@ -36,7 +36,7 @@ else
  do_umount=1
fi
  fi
-MNT='/mnt/huge/'
+MNT='/mnt/huge'
  
  function get_machine_hugepage_size() {

hpz=$(grep -i hugepagesize /proc/meminfo)
@@ -60,6 +60,41 @@ function cleanup() {
set -e
  }
  
+function assert_with_retry() {

+  local actual_path="$1"
+  local expected="$2"
+  local tolerance=$((7 * 1024 * 1024))
+  local timeout=20
+  local interval=1
+  local start_time
+  local now
+  local elapsed
+  local actual
+
+  start_time=$(date +%s)
+
+  while true; do
+actual="$(cat "$actual_path")"
+
+if [[ $actual -ge $(($expected - $tolerance)) ]] &&
+[[ $actual -le $(($expected + $tolerance)) ]]; then
+  return 0
+fi
+
+now=$(date +%s)
+elapsed=$((now - start_time))
+
+if [[ $elapsed -ge $timeout ]]; then
+  echo "actual = $((${actual%% *} / 1024 / 1024)) MB"
+  echo "expected = $((${expected%% *} / 1024 / 1024)) MB"
+  cleanup
+  exit 1
+fi
+
+sleep $interval
+  done
+}
+
  function assert_state() {
local expected_a="$1"
local expected_a_hugetlb="$2"
@@ -70,58 +105,13 @@ function assert_state() {
  expected_b="$3"
  expected_b_hugetlb="$4"
fi
-  local tolerance=$((5 * 1024 * 1024))
-
-  local actual_a
-  actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)"
-  if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] ||
-[[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then
-echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB
-echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB
-echo fail
-
-cleanup
-exit 1
-  fi
-
-  local actual_a_hugetlb
-  actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)"
-  if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] ||
-[[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then
-echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB
-echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB
-echo fail
-
-cleanup
-exit 1
-  fi
-
-  if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then
-return
-  fi
-
-  local actual_b
-  actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)"
-  if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] ||
-[[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then
-echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB
-echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB
-echo fail
-
-cleanup
-exit 1
-  fi
  
-  local actual_b_hugetlb

-  actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)"
-  if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] ||
-[[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then
-echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB
-echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB
-echo fail
+  assert_with_retry "$CGROUP_ROOT/a/memory.$usage_file" "$expected_a"
+  assert_wi

Re: [PATCH] selftests: mptcp: add comment for getaddrinfo

2025-04-07 Thread Geliang Tang
Hi zhenwei,

On Mon, 2025-04-07 at 16:51 +0800, zhenwei pi wrote:
> mptcp_connect.c is a startup tutorial of MPTCP programming, however
> there is a lack of ai_protocol(IPPROTO_MPTCP) usage. Add comment for
> getaddrinfo MPTCP support.
> 
> Signed-off-by: zhenwei pi 
> Signed-off-by: zhenwei pi 
> ---
>  tools/testing/selftests/net/mptcp/mptcp_connect.c | 12 
>  1 file changed, 12 insertions(+)
> 
> diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
> index c83a8b47bbdf..6b9031273964 100644
> --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
> +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
> @@ -179,6 +179,18 @@ static void xgetnameinfo(const struct sockaddr *addr, socklen_t addrlen,
>   }
>  }
>  
> +/* There is a lack of MPTCP support from glibc, these code leads
> error:
> + *   struct addrinfo hints = {
> + *   .ai_protocol = IPPROTO_MPTCP,
> + *   ...
> + *   };
> + *   err = getaddrinfo(node, service, &hints, res);
> + *   ...
> + * So using IPPROTO_TCP to resolve, and use TCP/MPTCP to create
> socket.
> + *
> + * glibc starts to support MPTCP since v2.42.
> + * Link:
> https://sourceware.org/git/?p=glibc.git;a=commit;h=a8e9022e0f82

Thanks for adding getaddrinfo mptcp support to glibc. I think we should
not only add a comment for getaddrinfo mptcp here, but also add an
example of using it in mptcp_connect.c. I will work with you to
implement this example in v2.
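
As a rough sketch of what such an example could look like in mptcp_connect.c
(illustrative only; the fallback structure and error handling are assumptions,
not the final v2 code):

	struct addrinfo hints = {
		.ai_family   = AF_UNSPEC,
		.ai_socktype = SOCK_STREAM,
		.ai_protocol = IPPROTO_MPTCP,	/* resolved directly with glibc >= 2.42 */
	};
	struct addrinfo *res = NULL;

	if (getaddrinfo(node, service, &hints, &res) != 0) {
		/* older libcs reject IPPROTO_MPTCP: resolve with TCP instead */
		hints.ai_protocol = IPPROTO_TCP;
		if (getaddrinfo(node, service, &hints, &res) != 0)
			exit(1);
	}
	/* ... then create the socket with IPPROTO_MPTCP as today ... */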

Thanks,
-Geliang

> + */
>  static void xgetaddrinfo(const char *node, const char *service,
>    const struct addrinfo *hints,
>    struct addrinfo **res)




Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread Michael S. Tsirkin
On Mon, Apr 07, 2025 at 08:47:05PM +0200, David Hildenbrand wrote:
> > In my opinion, it makes the most sense to keep the spec as it is and
> > change QEMU and the kernel to match, but obviously that's not trivial
> > to do in a way that doesn't break existing devices and drivers.
> 
> If only it would be limited to QEMU and Linux ... :)
> 
> Out of curiosity, assuming we'd make the spec match the current QEMU/Linux
> implementation at least for the 3 involved features only, would there be a
> way to adjust crossvm without any disruption?
> 
> I still have the feeling that it will be rather hard to get all
> implementations to match the spec ... For new features+queues it will be easy
> to force the usage of fixed virtqueue numbers, but for free-page-hinting and
> reporting, it's a mess :(


Still thinking about a way to fix drivers... We can discuss this
theoretically, maybe?


-- 
MST




Re: [PATCH v2 2/2] x86/sgx: Implement EUPDATESVN and opportunistically call it during first EPC page alloc

2025-04-07 Thread Huang, Kai
On Mon, 2025-04-07 at 08:23 +, Reshetova, Elena wrote:
> > On Fri, Apr 04, 2025 at 06:53:17AM +, Reshetova, Elena wrote:
> > > > On Wed, Apr 02, 2025 at 01:11:25PM +, Reshetova, Elena wrote:
> > > > > > > current SGX kernel code does not handle such errors in any other
> > way
> > > > > > > than notifying that operation failed for other ENCLS leaves. So, 
> > > > > > > I don't
> > > > > > > see why ENCLS[EUPDATESVN] should be different from existing
> > > > behaviour?
> > > > > > 
> > > > > > While not disagreeing fully (it depends on call site), in some
> > > > > > situations it is more difficult to take more preventive actions.
> > > > > > 
> > > > > > This is a situation where we know that there are *zero* EPC pages in
> > > > > > traffic so it is relatively easy to stop the madness, isn't it?
> > > > > > 
> > > > > > I guess the best action would be make sgx_alloc_epc_page() return
> > > > > > consistently -ENOMEM, if the unexpected happens.
> > > > > 
> > > > > But this would be very misleading imo. We do have memory, even page
> > > > > allocation might function as normal in EPC, the only thing that is 
> > > > > broken
> > > > > can be EUPDATESVN functionality. Returning -ENOMEM in this case
> > seems
> > > > > wrong.
> > > > 
> > > > This makes it not misleading at all:
> > > > 
> > > > pr_err("EUPDATESVN: unknown error %d\n", ret);
> > > > 
> > > > Since hardware should never return this, it indicates a kernel bug.
> > > 
> > > OK, so you propose in this case to print the above message, sgx_updatesvn
> > > returning an error, and then NULL from __sgx_alloc_epc_page_from_node
> > and
> > > the __sgx_alloc_epc_page returning -ENOMEM after an iteration over
> > > a whole set of numa nodes given that we will keep getting the unknown
> > error
> > > on each node upon trying to do an allocation from each one?
> > 
> > I'd disable ioctl's in this case and return -ENOMEM. It's a cheap sanity
> > check. Should not ever happen, but if e.g., a new kernel patch breaks
> > anything, it could help catching issues.
> > 
> > We are talking here about situation that is never expected to happen so I
> > don't think it is too heavy hammer here. Here it makes sense because not
> > much effort is required to implement the counter-measures.
> 
> OK, but does it really make sense to explicitly disable ioctls? 
> Note that everything *in practice* will be disabled simply because not a 
> single page
> anymore can be allocated from EPC since we are getting -ENOMEM on EPC
> page allocation. Also, note that any approach we chose should be symmetrical
> to SGX virtualization side also, which doesn’t use ioctls at all. Simply 
> returning
> -ENOMEM for page allocation in EPC seems like a correct symmetrical solution
> that would work for both native enclaves and EPC pages allocated for VMs.
> And nothing would  be able to proceed creating/managing enclaves at this 
> point. 
> 

Right, failing ioctls() doesn't cover SGX virtualization.  If we ever want to
fail, we should fail the EPC allocation.

Btw, for the unknown error, and any other errors which should not happen,
couldn't we use the ENCLS_WARN()?  AFAICT there are already cases that we are
using ENCLS_WARN() for those "impossible-to-happen-errors".

E.g., in __sgx_encl_extend():

ret = __eextend(sgx_get_epc_virt_addr(encl->secs.epc_page),
sgx_get_epc_virt_addr(epc_page) + offset);
if (ret) {
if (encls_failed(ret))
ENCLS_WARN(ret, "EEXTEND");
   
return -EIO;
}


[PATCH AUTOSEL 6.14 11/31] sound/virtio: Fix cancel_sync warnings on uninitialized work_structs

2025-04-07 Thread Sasha Levin
From: John Stultz 

[ Upstream commit 3c7df2e27346eb40a0e86230db1ccab195c97cfe ]

Betty reported hitting the following warning:

[8.709131][  T221] WARNING: CPU: 2 PID: 221 at kernel/workqueue.c:4182
...
[8.713282][  T221] Call trace:
[8.713365][  T221]  __flush_work+0x8d0/0x914
[8.713468][  T221]  __cancel_work_sync+0xac/0xfc
[8.713570][  T221]  cancel_work_sync+0x24/0x34
[8.713667][  T221]  virtsnd_remove+0xa8/0xf8 [virtio_snd 
ab15f34d0dd772f6d11327e08a81d46dc9c36276]
[8.713868][  T221]  virtsnd_probe+0x48c/0x664 [virtio_snd 
ab15f34d0dd772f6d11327e08a81d46dc9c36276]
[8.714035][  T221]  virtio_dev_probe+0x28c/0x390
[8.714139][  T221]  really_probe+0x1bc/0x4c8
...

It seems we're hitting the error path in virtsnd_probe(), which
triggers a virtsnd_remove() which iterates over the substreams
calling cancel_work_sync() on the elapsed_period work_struct.

Looking at the code, from earlier in:
virtsnd_probe()->virtsnd_build_devs()->virtsnd_pcm_parse_cfg()

We set snd->nsubstreams, allocate the snd->substreams, and if
we then hit an error on the info allocation or something in
virtsnd_ctl_query_info() fails, we will exit without having
initialized the elapsed_period work_struct.

When that error path unwinds we then call virtsnd_remove()
which as long as the substreams array is allocated, will iterate
through calling cancel_work_sync() on the uninitialized work
struct hitting this warning.

Takashi Iwai suggested this fix, which initializes the substreams
structure right after allocation, so that if we hit the error
paths we avoid trying to cleanup uninitialized data.

Note: I have not yet managed to reproduce the issue myself, so
this patch has had limited testing.

Feedback or thoughts would be appreciated!

Cc: Anton Yakovlev 
Cc: "Michael S. Tsirkin" 
Cc: Jaroslav Kysela 
Cc: Takashi Iwai 
Cc: virtualizat...@lists.linux.dev
Cc: linux-so...@vger.kernel.org
Cc: kernel-t...@android.com
Reported-by: Betty Zhou 
Suggested-by: Takashi Iwai 
Signed-off-by: John Stultz 
Message-Id: <20250116194114.3375616-1-jstu...@google.com>
Signed-off-by: Michael S. Tsirkin 
Signed-off-by: Sasha Levin 
---
 sound/virtio/virtio_pcm.c | 21 +++--
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/sound/virtio/virtio_pcm.c b/sound/virtio/virtio_pcm.c
index 967e4c45be9bb..2f7c5e709f075 100644
--- a/sound/virtio/virtio_pcm.c
+++ b/sound/virtio/virtio_pcm.c
@@ -339,6 +339,21 @@ int virtsnd_pcm_parse_cfg(struct virtio_snd *snd)
if (!snd->substreams)
return -ENOMEM;
 
+   /*
+* Initialize critical substream fields early in case we hit an
+* error path and end up trying to clean up uninitialized structures
+* elsewhere.
+*/
+   for (i = 0; i < snd->nsubstreams; ++i) {
+   struct virtio_pcm_substream *vss = &snd->substreams[i];
+
+   vss->snd = snd;
+   vss->sid = i;
+   INIT_WORK(&vss->elapsed_period, virtsnd_pcm_period_elapsed);
+   init_waitqueue_head(&vss->msg_empty);
+   spin_lock_init(&vss->lock);
+   }
+
info = kcalloc(snd->nsubstreams, sizeof(*info), GFP_KERNEL);
if (!info)
return -ENOMEM;
@@ -352,12 +367,6 @@ int virtsnd_pcm_parse_cfg(struct virtio_snd *snd)
struct virtio_pcm_substream *vss = &snd->substreams[i];
struct virtio_pcm *vpcm;
 
-   vss->snd = snd;
-   vss->sid = i;
-   INIT_WORK(&vss->elapsed_period, virtsnd_pcm_period_elapsed);
-   init_waitqueue_head(&vss->msg_empty);
-   spin_lock_init(&vss->lock);
-
rc = virtsnd_pcm_build_hw(vss, &info[i]);
if (rc)
goto on_exit;
-- 
2.39.5




Re: [PATCH] kbuild: Require pahole >v1.29 with GENDWARFKSYMS and BTF on X86

2025-04-07 Thread Sam James
Alan Maguire  writes:

> On 07/04/2025 09:25, Sam James wrote:
>> [with regard to
>> https://lore.kernel.org/linux-kbuild/20250320232757.2283956-2-samitolva...@google.com/]
>> 
>> Would it be possible to have a new release with that fix, to avoid
>> distros all having to cherrypick the fix commit?
>> 
>> Thanks in advance,
>> sam
>> 
>
> We're planning to release 1.30 shortly to follow the recent 6.14 kernel
> release - hopefully this week, or perhaps early next week if any bugs
> are discovered during final testing.
>
> If folks can help by testing the next branch of
>
> https://git.kernel.org/pub/scm/devel/pahole/pahole.git
>
> ...prior to that, that would be great. Thanks!

Will do, thanks!

>
> Alan



Re: Re: [PATCH] selftests: mptcp: add comment for getaddrinfo

2025-04-07 Thread zhenwei pi




On 4/8/25 09:43, Geliang Tang wrote:

Hi zhenwei,

On Mon, 2025-04-07 at 16:51 +0800, zhenwei pi wrote:

mptcp_connect.c is a startup tutorial of MPTCP programming, however
there is a lack of ai_protocol(IPPROTO_MPTCP) usage. Add comment for
getaddrinfo MPTCP support.

Signed-off-by: zhenwei pi 
Signed-off-by: zhenwei pi 
---
  tools/testing/selftests/net/mptcp/mptcp_connect.c | 12 
  1 file changed, 12 insertions(+)

diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index c83a8b47bbdf..6b9031273964 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -179,6 +179,18 @@ static void xgetnameinfo(const struct sockaddr *addr, socklen_t addrlen,
    }
  }
  
+/* There is a lack of MPTCP support from glibc, these code leads

error:
+ * struct addrinfo hints = {
+ * .ai_protocol = IPPROTO_MPTCP,
+ * ...
+ * };
+ * err = getaddrinfo(node, service, &hints, res);
+ * ...
+ * So using IPPROTO_TCP to resolve, and use TCP/MPTCP to create
socket.
+ *
+ * glibc starts to support MPTCP since v2.42.
+ * Link:
https://sourceware.org/git/?p=glibc.git;a=commit;h=a8e9022e0f82


Thanks for adding getaddrinfo mptcp support to glibc. I think we should
not only add a comment for getaddrinfo mptcp here, but also add an
example of using it in mptcp_connect.c. I will work with you to
implement this example in v2.

Thanks,
-Geliang



Good idea, thank you Geliang!


+ */
  static void xgetaddrinfo(const char *node, const char *service,
     const struct addrinfo *hints,
     struct addrinfo **res)







Re: [PATCH v2 2/2] x86/sgx: Implement EUPDATESVN and opportunistically call it during first EPC page alloc

2025-04-07 Thread Jarkko Sakkinen
On Tue, Apr 08, 2025 at 12:06:32AM +, Huang, Kai wrote:
> On Mon, 2025-04-07 at 08:23 +, Reshetova, Elena wrote:
> > > On Fri, Apr 04, 2025 at 06:53:17AM +, Reshetova, Elena wrote:
> > > > > On Wed, Apr 02, 2025 at 01:11:25PM +, Reshetova, Elena wrote:
> > > > > > > > current SGX kernel code does not handle such errors in any other
> > > way
> > > > > > > > than notifying that operation failed for other ENCLS leaves. 
> > > > > > > > So, I don't
> > > > > > > > see why ENCLS[EUPDATESVN] should be different from existing
> > > > > behaviour?
> > > > > > > 
> > > > > > > While not disagreeing fully (it depends on call site), in some
> > > > > > > situations it is more difficult to take more preventive actions.
> > > > > > > 
> > > > > > > This is a situation where we know that there are *zero* EPC pages 
> > > > > > > in
> > > > > > > traffic so it is relatively easy to stop the madness, isn't it?
> > > > > > > 
> > > > > > > I guess the best action would be make sgx_alloc_epc_page() return
> > > > > > > consistently -ENOMEM, if the unexpected happens.
> > > > > > 
> > > > > > But this would be very misleading imo. We do have memory, even page
> > > > > > allocation might function as normal in EPC, the only thing that is 
> > > > > > broken
> > > > > > can be EUPDATESVN functionality. Returning -ENOMEM in this case
> > > seems
> > > > > > wrong.
> > > > > 
> > > > > This makes it not misleading at all:
> > > > > 
> > > > >   pr_err("EUPDATESVN: unknown error %d\n", ret);
> > > > > 
> > > > > Since hardware should never return this, it indicates a kernel bug.
> > > > 
> > > > OK, so you propose in this case to print the above message, 
> > > > sgx_updatesvn
> > > > returning an error, and then NULL from __sgx_alloc_epc_page_from_node
> > > and
> > > > the __sgx_alloc_epc_page returning -ENOMEM after an iteration over
> > > > a whole set of numa nodes given that we will keep getting the unknown
> > > error
> > > > on each node upon trying to do an allocation from each one?
> > > 
> > > I'd disable ioctl's in this case and return -ENOMEM. It's a cheap sanity
> > > check. Should not ever happen, but if e.g., a new kernel patch breaks
> > > anything, it could help catching issues.
> > > 
> > > We are talking here about situation that is never expected to happen so I
> > > don't think it is too heavy hammer here. Here it makes sense because not
> > > much effort is required to implement the counter-measures.
> > 
> > OK, but does it really make sense to explicitly disable ioctls? 
> > Note that everything *in practice* will be disabled simply because not a 
> > single page
> > anymore can be allocated from EPC since we are getting -ENOMEM on EPC
> > page allocation. Also, note that any approach we chose should be symmetrical
> > to SGX virtualization side also, which doesn't use ioctls at all. Simply 
> > returning
> > -ENOMEM for page allocation in EPC seems like a correct symmetrical solution
> > that would work for both native enclaves and EPC pages allocated for VMs.
> > And nothing would  be able to proceed creating/managing enclaves at this 
> > point. 
> > 
> 
> Right, failing ioctls() doesn't cover SGX virtualization.  If we ever want to
> fail, we should fail the EPC allocation.

"I guess the best action would be make sgx_alloc_epc_page() return
 consistently -ENOMEM, if the unexpected happens." -me

> 
> Btw, for the unknown error, and any other errors which should not happen,
> couldn't we use the ENCLS_WARN()?  AFAICT there are already cases that we are
> using ENCLS_WARN() for those "impossible-to-happen-errors".
> 
> E.g., in __sgx_encl_extend():
> 
>   ret = __eextend(sgx_get_epc_virt_addr(encl->secs.epc_page),
> sgx_get_epc_virt_addr(epc_page) + offset);
> if (ret) {
> if (encls_failed(ret))
> ENCLS_WARN(ret, "EEXTEND");
>
> return -EIO;
> }

BR, Jarkko



Re: [PATCH v2 2/2] x86/sgx: Implement EUPDATESVN and opportunistically call it during first EPC page alloc

2025-04-07 Thread Jarkko Sakkinen
On Tue, Apr 08, 2025 at 09:40:14AM +0300, Jarkko Sakkinen wrote:
> On Tue, Apr 08, 2025 at 12:06:32AM +, Huang, Kai wrote:
> > On Mon, 2025-04-07 at 08:23 +, Reshetova, Elena wrote:
> > > > On Fri, Apr 04, 2025 at 06:53:17AM +, Reshetova, Elena wrote:
> > > > > > On Wed, Apr 02, 2025 at 01:11:25PM +, Reshetova, Elena wrote:
> > > > > > > > > current SGX kernel code does not handle such errors in any 
> > > > > > > > > other
> > > > way
> > > > > > > > > than notifying that operation failed for other ENCLS leaves. 
> > > > > > > > > So, I don't
> > > > > > > > > see why ENCLS[EUPDATESVN] should be different from existing
> > > > > > behaviour?
> > > > > > > > 
> > > > > > > > While not disagreeing fully (it depends on call site), in some
> > > > > > > > situations it is more difficult to take more preventive actions.
> > > > > > > > 
> > > > > > > > This is a situation where we know that there are *zero* EPC 
> > > > > > > > pages in
> > > > > > > > traffic so it is relatively easy to stop the madness, isn't it?
> > > > > > > > 
> > > > > > > > I guess the best action would be make sgx_alloc_epc_page() 
> > > > > > > > return
> > > > > > > > consistently -ENOMEM, if the unexpected happens.
> > > > > > > 
> > > > > > > But this would be very misleading imo. We do have memory, even 
> > > > > > > page
> > > > > > > allocation might function as normal in EPC, the only thing that 
> > > > > > > is broken
> > > > > > > can be EUPDATESVN functionality. Returning -ENOMEM in this case
> > > > seems
> > > > > > > wrong.
> > > > > > 
> > > > > > This makes it not misleading at all:
> > > > > > 
> > > > > > pr_err("EUPDATESVN: unknown error %d\n", ret);
> > > > > > 
> > > > > > Since hardware should never return this, it indicates a kernel bug.
> > > > > 
> > > > > OK, so you propose in this case to print the above message, 
> > > > > sgx_updatesvn
> > > > > returning an error, and then NULL from __sgx_alloc_epc_page_from_node
> > > > and
> > > > > the __sgx_alloc_epc_page returning -ENOMEM after an iteration over
> > > > > a whole set of numa nodes given that we will keep getting the unknown
> > > > error
> > > > > on each node upon trying to do an allocation from each one?
> > > > 
> > > > I'd disable ioctl's in this case and return -ENOMEM. It's a cheap sanity
> > > > check. Should not ever happen, but if e.g., a new kernel patch breaks
> > > > anything, it could help catching issues.
> > > > 
> > > > We are talking here about situation that is never expected to happen so 
> > > > I
> > > > don't think it is too heavy hammer here. Here it makes sense because not
> > > > much effort is required to implement the counter-measures.
> > > 
> > > OK, but does it really make sense to explicitly disable ioctls? 
> > > Note that everything *in practice* will be disabled simply because not a 
> > > single page
> > > anymore can be allocated from EPC since we are getting -ENOMEM on EPC
> > > page allocation. Also, note that any approach we chose should be 
> > > symmetrical
> > > to SGX virtualization side also, which doesn't use ioctls at all. Simply 
> > > returning
> > > -ENOMEM for page allocation in EPC seems like a correct symmetrical 
> > > solution
> > > that would work for both native enclaves and EPC pages allocated for VMs.
> > > And nothing would  be able to proceed creating/managing enclaves at this 
> > > point. 
> > > 
> > 
> > Right, failing ioctls() doesn't cover SGX virtualization.  If we ever want 
> > to
> > fail, we should fail the EPC allocation.
> 
> "I guess the best action would be make sgx_alloc_epc_page() return
>  consistently -ENOMEM, if the unexpected happens." -me
> 
> > 
> > Btw, for the unknown error, and any other errors which should not happen,
> > couldn't we use the ENCLS_WARN()?  AFAICT there are already cases that we 
> > are
> > using ENCLS_WARN() for those "impossible-to-happen-errors".

Sorry, I forgot to respond to this. I don't have anything against this but at
minimum disabling allocation should be combined with it (in case kernel
command-line does not have oops_on_warn or whatever the option was
called).

> > 
> > E.g., in __sgx_encl_extend():
> > 
> > ret = __eextend(sgx_get_epc_virt_addr(encl->secs.epc_page),
> > sgx_get_epc_virt_addr(epc_page) + offset);
> > if (ret) {
> > if (encls_failed(ret))
> > ENCLS_WARN(ret, "EEXTEND");
> >
> > return -EIO;
> > }
> 
> BR, Jarkko
> 

BR, Jarkko



RE: [PATCH v2 2/2] x86/sgx: Implement EUPDATESVN and opportunistically call it during first EPC page alloc

2025-04-07 Thread Reshetova, Elena
> 
> On Tue, Apr 08, 2025 at 09:40:14AM +0300, Jarkko Sakkinen wrote:
> > On Tue, Apr 08, 2025 at 12:06:32AM +, Huang, Kai wrote:
> > > On Mon, 2025-04-07 at 08:23 +, Reshetova, Elena wrote:
> > > > > On Fri, Apr 04, 2025 at 06:53:17AM +, Reshetova, Elena wrote:
> > > > > > > On Wed, Apr 02, 2025 at 01:11:25PM +, Reshetova, Elena
> wrote:
> > > > > > > > > > current SGX kernel code does not handle such errors in any
> other
> > > > > way
> > > > > > > > > > than notifying that operation failed for other ENCLS 
> > > > > > > > > > leaves. So,
> I don't
> > > > > > > > > > see why ENCLS[EUPDATESVN] should be different from
> existing
> > > > > > > behaviour?
> > > > > > > > >
> > > > > > > > > While not disagreeing fully (it depends on call site), in some
> > > > > > > > > situations it is more difficult to take more preventive 
> > > > > > > > > actions.
> > > > > > > > >
> > > > > > > > > This is a situation where we know that there are *zero* EPC
> pages in
> > > > > > > > > traffic so it is relatively easy to stop the madness, isn't 
> > > > > > > > > it?
> > > > > > > > >
> > > > > > > > > I guess the best action would be make sgx_alloc_epc_page()
> return
> > > > > > > > > consistently -ENOMEM, if the unexpected happens.
> > > > > > > >
> > > > > > > > But this would be very misleading imo. We do have memory,
> even page
> > > > > > > > allocation might function as normal in EPC, the only thing that 
> > > > > > > > is
> broken
> > > > > > > > can be EUPDATESVN functionality. Returning -ENOMEM in this
> case
> > > > > seems
> > > > > > > > wrong.
> > > > > > >
> > > > > > > This makes it not misleading at all:
> > > > > > >
> > > > > > >   pr_err("EUPDATESVN: unknown error %d\n", ret);
> > > > > > >
> > > > > > > Since hardware should never return this, it indicates a kernel 
> > > > > > > bug.
> > > > > >
> > > > > > OK, so you propose in this case to print the above message,
> sgx_updatesvn
> > > > > > returning an error, and then NULL from
> __sgx_alloc_epc_page_from_node
> > > > > and
> > > > > > the __sgx_alloc_epc_page returning -ENOMEM after an iteration
> over
> > > > > > a whole set of numa nodes given that we will keep getting the
> unknown
> > > > > error
> > > > > > on each node upon trying to do an allocation from each one?
> > > > >
> > > > > I'd disable ioctl's in this case and return -ENOMEM. It's a cheap 
> > > > > sanity
> > > > > check. Should not ever happen, but if e.g., a new kernel patch breaks
> > > > > anything, it could help catching issues.
> > > > >
> > > > > We are talking here about situation that is never expected to happen
> so I
> > > > > don't think it is too heavy hammer here. Here it makes sense because
> not
> > > > > much effort is required to implement the counter-measures.
> > > >
> > > > OK, but does it really make sense to explicitly disable ioctls?
> > > > Note that everything *in practice* will be disabled simply because not a
> single page
> > > > anymore can be allocated from EPC since we are getting -ENOMEM on
> EPC
> > > > page allocation. Also, note that any approach we chose should be
> symmetrical
> > > > to SGX virtualization side also, which doesn't use ioctls at all. Simply
> returning
> > > > -ENOMEM for page allocation in EPC seems like a correct symmetrical
> solution
> > > > that would work for both native enclaves and EPC pages allocated for
> VMs.
> > > > And nothing would  be able to proceed creating/managing enclaves at
> this point.
> > > >
> > >
> > > Right, failing ioctls() doesn't cover SGX virtualization.  If we ever 
> > > want to
> > > fail, we should fail the EPC allocation.
> >
> > "I guess the best action would be make sgx_alloc_epc_page() return
> >  consistently -ENOMEM, if the unexpected happens." -me
> >
> > >
> > > Btw, for the unknown error, and any other errors which should not
> happen,
> > > couldn't we use the ENCLS_WARN()?  AFAICT there are already cases that
> we are
> > > using ENCLS_WARN() for those "impossible-to-happen-errors".

Ok, so to summarise the approach I will be sending in the next version:

In case an unknown error is returned, issue ENCLS_WARN (which uses WARN_ON
underneath) and return -ENOMEM from the EPC page allocation. No other explicit
ioctl disabling is needed, since nothing can proceed anyhow if we cannot
allocate a page from EPC.

Does this sound right? 

Best Regards,
Elena.
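
A minimal sketch of that error path, assuming the helper names already used in
this thread (sgx_updatesvn(), encls_failed(), ENCLS_WARN()) and an ERR_PTR-style
allocation return; the exact integration point is still open:

	ret = sgx_updatesvn();
	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "EUPDATESVN");
		/* fail the EPC page allocation, callers observe -ENOMEM */
		return ERR_PTR(-ENOMEM);
	}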


[PATCH] rpmsg: Use strscpy() instead of strscpy_pad()

2025-04-07 Thread Thorsten Blum
kzalloc() already zero-initializes the destination buffer, making
strscpy() sufficient for safely copying the name. The additional NUL-
padding performed by strscpy_pad() is unnecessary.

The size parameter is optional, and strscpy() automatically determines
the size of the destination buffer using sizeof() when the argument is
omitted. RPMSG_NAME_SIZE is equal to sizeof(rpdev->id.name), so the explicit
size argument can simply be dropped.

No functional changes intended.
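
For illustration (not part of the diff), both forms are equivalent when the
destination is a fixed-size array; src stands for any NUL-terminated source:

	char name[RPMSG_NAME_SIZE];

	strscpy(name, src, sizeof(name));	/* explicit size */
	strscpy(name, src);			/* size derived from the array type */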

Signed-off-by: Thorsten Blum 
---
 drivers/rpmsg/qcom_glink_native.c | 2 +-
 drivers/rpmsg/qcom_smd.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/rpmsg/qcom_glink_native.c b/drivers/rpmsg/qcom_glink_native.c
index a2f9d85c7156..820a6ca5b1d7 100644
--- a/drivers/rpmsg/qcom_glink_native.c
+++ b/drivers/rpmsg/qcom_glink_native.c
@@ -1663,7 +1663,7 @@ static int qcom_glink_rx_open(struct qcom_glink *glink, unsigned int rcid,
}
 
rpdev->ept = &channel->ept;
-   strscpy_pad(rpdev->id.name, name, RPMSG_NAME_SIZE);
+   strscpy(rpdev->id.name, name);
rpdev->src = RPMSG_ADDR_ANY;
rpdev->dst = RPMSG_ADDR_ANY;
rpdev->ops = &glink_device_ops;
diff --git a/drivers/rpmsg/qcom_smd.c b/drivers/rpmsg/qcom_smd.c
index 40d386809d6b..3c86c5553de6 100644
--- a/drivers/rpmsg/qcom_smd.c
+++ b/drivers/rpmsg/qcom_smd.c
@@ -1089,7 +1089,7 @@ static int qcom_smd_create_device(struct qcom_smd_channel *channel)
 
/* Assign public information to the rpmsg_device */
rpdev = &qsdev->rpdev;
-   strscpy_pad(rpdev->id.name, channel->name, RPMSG_NAME_SIZE);
+   strscpy(rpdev->id.name, channel->name);
rpdev->src = RPMSG_ADDR_ANY;
rpdev->dst = RPMSG_ADDR_ANY;
 
-- 
2.49.0




Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread David Hildenbrand

On 07.04.25 10:44, Michael S. Tsirkin wrote:

Wow great job digging through all these hypervisors!


There is more ... :(

alioth: https://github.com/google/alioth/blob/main/alioth/src/virtio/dev/balloon.rs


It uses the incremental vq index assignment like QEMU.

impl VirtioMio for Balloon {
fn activate<'a, 'm, Q: VirtQueue<'m>, S: IrqSender>(
&mut self,
feature: u64,
_active_mio: &mut ActiveMio<'a, 'm, Q, S>,
) -> Result<()> {
let feature = BalloonFeature::from_bits_retain(feature);
self.queues[0] = BalloonQueue::Inflate;
self.queues[1] = BalloonQueue::Deflate;
let mut index = 2;
if feature.contains(BalloonFeature::STATS_VQ) {
self.queues[index] = BalloonQueue::Stats;
index += 1;
}
if feature.contains(BalloonFeature::FREE_PAGE_HINT) {
self.queues[index] = BalloonQueue::FreePage;
index += 1;
}
if feature.contains(BalloonFeature::PAGE_REPORTING) {
self.queues[index] = BalloonQueue::Reporting;
}
Ok(())
}


I'll dig some more, but this is getting out of hand :D
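
For reference, the incremental assignment shown above boils down to roughly the
following (C sketch; "features" stands for the negotiated feature bits, macros
as in the virtio-balloon UAPI header, inflate/deflate always at 0 and 1):

	unsigned int idx = 2;
	int stats_vq = -1, free_page_vq = -1, reporting_vq = -1;

	if (features & (1ULL << VIRTIO_BALLOON_F_STATS_VQ))
		stats_vq = idx++;
	if (features & (1ULL << VIRTIO_BALLOON_F_FREE_PAGE_HINT))
		free_page_vq = idx++;
	if (features & (1ULL << VIRTIO_BALLOON_F_REPORTING))
		reporting_vq = idx++;

so any disagreement about which features are offered shifts every later index.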

--
Cheers,

David / dhildenb




Re: [PATCH 3/3] arm64: dts: qcom: sdm632-fairphone-fp3: Add AW8898 amplifier

2025-04-07 Thread Konrad Dybcio
On 4/6/25 3:03 PM, Luca Weiss wrote:
> Add a node for the amplifier found on Fairphone 3, receiving sound via
> I2S from the SoC and being connected to the speaker.
> 
> Signed-off-by: Luca Weiss 
> ---

Reviewed-by: Konrad Dybcio 

Konrad



Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread David Hildenbrand

On 07.04.25 10:17, David Hildenbrand wrote:

On 07.04.25 09:52, Michael S. Tsirkin wrote:

On Fri, Apr 04, 2025 at 05:39:10PM +0200, Halil Pasic wrote:


Not perfect, but AFAIKS, not horrible.


It is like it is. QEMU does queue exist if the corresponding feature
is offered by the device, and that is what we have to live with.


I don't think we can live with this properly though.
It means a guest that does not know about some features
does not know where to find things.


Please describe a real scenario, I'm missing the point.

Whoever adds new feat_X *must be aware* about all previous features,
otherwise we'd be reusing feature bits and everything falls to pieces.



So now, I am inclined to add linux code to work with current qemu and
with spec compliant one, and add qemu code to work with current linux
and spec compliant one.

Document the bug in the spec, maybe, in a non conformance section.


I'm afraid this results in a lot of churn without really making things
better.

IMHO, documenting how things actually behave, and maybe moving
towards fixed queue indexes for new features, is the low-hanging fruit.

As raised, it's not just qemu+linux, it's *at least* also cloud-hypervisor.


I'm digging for other virtio-balloon implementations.


virtio-win: 
https://github.com/virtio-win/kvm-guest-drivers-windows/blob/master/Balloon/sys/balloon.c


-> Does not support hinting/reporting -> no problem


libkrun: 
https://github.com/containers/libkrun/blob/main/src/devices/src/virtio/balloon/device.rs


-> Hard-codes queue indexes but always seems to offer all features
 -> Offers VIRTIO_BALLOON_F_FREE_PAGE_HINT and VIRTIO_BALLOON_F_STATS_VQ
even though it doesn't seem to implement them (device-triggered, so
nothing to do probably?)
 -> Actually seems to implements VIRTIO_BALLOON_F_REPORTING


crossvm: 
https://github.com/google/crosvm/blob/main/devices/src/virtio/balloon.rs


-> Hard-codes queue numbers; does *not* offer/implement
   VIRTIO_BALLOON_F_STATS_VQ but does offer VIRTIO_BALLOON_F_STATS_VQ
   and VIRTIO_BALLOON_F_DEFLATE_ON_OOM.

-> Implements something that is not in the virtio-spec

const VIRTIO_BALLOON_F_WS_REPORTING: u32 = 8; // Working Set Reporting 
virtqueues


and

const WS_DATA_VQ: usize = 5;
const WS_OP_VQ: usize = 6;


IIUC, Linux inside cross-vm might actually be problematic? They would 
disagree on the virtqueue for free-page-reporting



Maybe I am missing something, it's a mess ...

--
Cheers,

David / dhildenb




Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread Michael S. Tsirkin
On Mon, Apr 07, 2025 at 09:18:21AM +0200, David Hildenbrand wrote:
> > Now I am beginning to think we should leave the spec alone
> > and fix the drivers ... Ugh 
> 
> We could always say that starting with feature X, queue indexes are fixed
> again. E.g., VIRTIO_BALLOON_F_X would have it's virtqueue fixed at index 5,
> independent of the other (older) features where the virtqueue indexes are
> determined like today.
> 
> Won't make the implementation easier, though, I'm afraid.
> 
> (I also thought about a way to query the virtqueue index for a feature, but
> that's probably overengineering)

The best contract we have is the spec. Sometimes it is hopelessly broken
and we have to fix it, but not in this case.

Let's do a theoretical exercise, assuming we want to fix the drivers,
but we also want to have workarounds in place in qemu and in
drivers to support existing ones. How would we go about it?



Maybe we want a feature bit BALLOON_FIXED and ask everyone
to negotiate it?  But if we go this way, we really need to fix
the 48 bit limitation too.




-- 
MST




Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread David Hildenbrand

On 07.04.25 10:58, Michael S. Tsirkin wrote:

On Mon, Apr 07, 2025 at 10:54:00AM +0200, David Hildenbrand wrote:

On 07.04.25 10:49, Michael S. Tsirkin wrote:

On Mon, Apr 07, 2025 at 10:44:21AM +0200, David Hildenbrand wrote:





Whoever adds new feat_X *must be aware* about all previous features,
otherwise we'd be reusing feature bits and everything falls to pieces.



The knowledge is supposed be limited to which feature bit to use.


I think we also have to know which virtqueue bits can be used, right?



what are virtqueue bits? vq number?


Yes, sorry.


I got confused myself, it's vq index actually now, we made the spec
consistent with that terminology. It used to be number/index
interchangeably.


Assume cross-vm as an example. It would make use of virtqueue indexes 5+6
with their VIRTIO_BALLOON_F_WS_REPORTING.



crossvm guys really should have reserved the feature bit even if they
did not bother specifying it. Let's reserve it now at least?


Along with the virtqueue indices, right?

Note that there was

https://lists.gnu.org/archive/html/qemu-devel/2023-05/msg02503.html

and

https://groups.oasis-open.org/communities/community-home/digestviewer/viewthread?GroupId=3973&MessageKey=afb07613-f56c-4d40-8981-2fad1c723998&CommunityKey=2f26be99-3aa1-48f6-93a5-018dce262226&hlmlt=VT

But it only was RFC, and as the QEMU implementation didn't materialize, 
nobody seemed to care ...






So whatever feature another device implements couldn't use this feature bit
or these virtqueue indexes.

(as long the other device never intends to implement
VIRTIO_BALLOON_F_WS_REPORTING, the virtqueue indexes could be reused. But
the spec will also be a mess, because virtqueue indexes could also have
duplicate meanings ... ugh)


what do they do with vq indices btw?


See above links, they use the two for "s_vq and notification_vq".

--
Cheers,

David / dhildenb




Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread David Hildenbrand

On 07.04.25 10:34, Michael S. Tsirkin wrote:

On Mon, Apr 07, 2025 at 10:17:10AM +0200, David Hildenbrand wrote:

On 07.04.25 09:52, Michael S. Tsirkin wrote:

On Fri, Apr 04, 2025 at 05:39:10PM +0200, Halil Pasic wrote:


Not perfect, but AFAIKS, not horrible.


It is like it is. QEMU does queue exist if the corresponding feature
is offered by the device, and that is what we have to live with.


I don't think we can live with this properly though.
It means a guest that does not know about some features
does not know where to find things.


Please describe a real scenario, I'm missing the point.



OK so.

Device has VIRTIO_BALLOON_F_FREE_PAGE_HINT and VIRTIO_BALLOON_F_REPORTING
Driver only knows about VIRTIO_BALLOON_F_REPORTING so
it does not know what does VIRTIO_BALLOON_F_FREE_PAGE_HINT do.
How does it know which vq to use for reporting?
It will try to use the free page hint one.


"only knows" -- VIRTIO_BALLOON_F_FREE_PAGE_HINT was proposed + specified 
in the spec.


So I think this is not a very good example.






Whoever adds new feat_X *must be aware* about all previous features,
otherwise we'd be reusing feature bits and everything falls to pieces.



The knowledge is supposed be limited to which feature bit to use.


I think we also have to know which virtqueue bits can be used, right?

I mean, I agree that it's all nasty ...



So now, I am inclined to add linux code to work with current qemu and
with spec compliant one, and add qemu code to work with current linux
and spec compliant one.

Document the bug in the spec, maybe, in a non conformance section.


I'm afraid this results in a lot of churn without really making things
better.



IMHO, documenting how things actually behave, and maybe moving towards
fixed queue indexes for new features, is the low-hanging fruit.


I worry about how to we ensure that?
If old code is messed up people will just keep propagating that.
I would like to fix old code so that new code is correct.



As raised, it's not just qemu+linux, it's *at least* also cloud-hypervisor.

--
Cheers,

David / dhildenb


There's a slippery slope here in that people will come to us
with buggy devices and ask to change the spec.


I yet have to fully digest cross-vm: 
https://github.com/google/crosvm/blob/main/devices/src/virtio/balloon.rs


and how free-page-reporting would work with upstream Linux ... :(

--
Cheers,

David / dhildenb




Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread Michael S. Tsirkin
On Mon, Apr 07, 2025 at 11:11:34AM +0200, David Hildenbrand wrote:
> On 07.04.25 10:58, Michael S. Tsirkin wrote:
> > On Mon, Apr 07, 2025 at 10:54:00AM +0200, David Hildenbrand wrote:
> > > On 07.04.25 10:49, Michael S. Tsirkin wrote:
> > > > On Mon, Apr 07, 2025 at 10:44:21AM +0200, David Hildenbrand wrote:
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > > Whoever adds new feat_X *must be aware* about all previous 
> > > > > > > features,
> > > > > > > otherwise we'd be reusing feature bits and everything falls to 
> > > > > > > pieces.
> > > > > > 
> > > > > > 
> > > > > > The knowledge is supposed be limited to which feature bit to use.
> > > > > 
> > > > > I think we also have to know which virtqueue bits can be used, right?
> > > > > 
> > > > 
> > > > what are virtqueue bits? vq number?
> > > 
> > > Yes, sorry.
> > 
> > I got confused myself, it's vq index actually now, we made the spec
> > consistent with that terminology. used to be number/index
> > interchangeably.
> > 
> > > Assume cross-vm as an example. It would make use of virtqueue indexes 5+6
> > > with their VIRTIO_BALLOON_F_WS_REPORTING.
> > 
> > 
> > crossvm guys really should have reserved the feature bit even if they
> > did not bother specifying it. Let's reserve it now at least?
> 
> Along with the virtqueue indices, right?

Well ... as long as the implementation is careful to check that feature
is negotiated, reusing vq index at least causes no trouble for others.


> Note that there was
> 
> https://lists.gnu.org/archive/html/qemu-devel/2023-05/msg02503.html
> 
> and
> 
> https://groups.oasis-open.org/communities/community-home/digestviewer/viewthread?GroupId=3973&MessageKey=afb07613-f56c-4d40-8981-2fad1c723998&CommunityKey=2f26be99-3aa1-48f6-93a5-018dce262226&hlmlt=VT
> 
> But it only was RFC, and as the QEMU implementation didn't materialize,
> nobody seemed to care ...

Thanks! I will try poke the author again.


> > 
> > 
> > > So whatever feature another device implements couldn't use this feature 
> > > bit
> > > or these virtqueue indexes.
> > > 
> > > (as long the other device never intends to implement
> > > VIRTIO_BALLOON_F_WS_REPORTING, the virtqueue indexes could be reused. But
> > > the spec will also be a mess, because virtqueue indexes could also have
> > > duplicate meanings ... ugh)
> > 
> > what do they do with vq indices btw?
> 
> See above links, they use the two for "s_vq and notification_vq".
> 
> -- 
> Cheers,
> 
> David / dhildenb
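
For readers following the vq-index point above, here is a minimal sketch of how a
balloon-style driver ends up with feature-dependent virtqueue indexes. It loosely
mirrors the approach of Linux's virtio_balloon driver, but the queue names and
ordering below are illustrative only, not the authoritative layout:

#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <linux/virtio_balloon.h>

/*
 * Illustrative sketch: the index of each optional queue is derived by
 * counting the optional queues in front of it whose feature bits the
 * driver knows and negotiated.  A driver that does not know about
 * VIRTIO_BALLOON_F_FREE_PAGE_HINT therefore computes a different index
 * for the reporting queue than a device that created queues for every
 * feature it offered.
 */
static unsigned int balloon_vq_names(struct virtio_device *vdev,
				     const char *names[4])
{
	unsigned int nvqs = 0;

	names[nvqs++] = "inflate";		/* index 0, always present */
	names[nvqs++] = "deflate";		/* index 1, always present */

	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT))
		names[nvqs++] = "free_page_hint";
	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_REPORTING))
		names[nvqs++] = "reporting";	/* index 2 or 3, depending on the feature above */

	return nvqs;	/* pass names[0..nvqs-1] to virtio_find_vqs() */
}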




[PATCH v2 31/32] tools/nolibc: implement width padding in printf()

2025-04-07 Thread Thomas Weißschuh
printf can pad each argument to a certain width.
Implement this for compatibility with the kselftest harness.
Currently only padding with spaces is supported.

Signed-off-by: Thomas Weißschuh 
Acked-by: Willy Tarreau 
---
 tools/include/nolibc/stdio.h | 17 -
 tools/testing/selftests/nolibc/nolibc-test.c |  3 +++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h
index 46bd90f96d654fadda20292baddc98358a3afc62..fb0417477759ee6c9663e84807c1d1067e735dec 100644
--- a/tools/include/nolibc/stdio.h
+++ b/tools/include/nolibc/stdio.h
@@ -220,7 +220,7 @@ int __nolibc_printf(__nolibc_printf_cb cb, intptr_t state, size_t n, const char
 {
char escape, lpref, c;
unsigned long long v;
-   unsigned int written;
+   unsigned int written, width;
size_t len, ofs, w;
char tmpbuf[21];
const char *outstr;
@@ -228,10 +228,20 @@ int __nolibc_printf(__nolibc_printf_cb cb, intptr_t state, size_t n, const char
written = ofs = escape = lpref = 0;
while (1) {
c = fmt[ofs++];
+   width = 0;
 
if (escape) {
/* we're in an escape sequence, ofs == 1 */
escape = 0;
+
+   /* width */
+   while (c >= '0' && c <= '9') {
+   width *= 10;
+   width += c - '0';
+
+   c = fmt[ofs++];
+   }
+
			if (c == 'c' || c == 'd' || c == 'u' || c == 'x' || c == 'p') {
char *out = tmpbuf;
 
@@ -309,6 +319,11 @@ int __nolibc_printf(__nolibc_printf_cb cb, intptr_t state, size_t n, const char
if (n) {
w = len < n ? len : n;
n -= w;
+   while (width-- > w) {
+   if (cb(state, " ", 1) != 0)
+   break;
+   written += 1;
+   }
if (cb(state, outstr, w) != 0)
break;
}
diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 9bd0a9c68b903cbd660ff81d4b0386b0b7c13977..16ec4f658bbec43440679c5d5c35014827c377bc 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -1415,6 +1415,9 @@ static int run_printf(int min, int max)
		CASE_TEST(uintmax_t);        EXPECT_VFPRINTF(20, "18446744073709551615", "%ju", 0xffffffffffffffffULL); break;
		CASE_TEST(intmax_t);         EXPECT_VFPRINTF(20, "-9223372036854775807", "%jd", 0x8000000000000001LL); break;
		CASE_TEST(truncation);       EXPECT_VFPRINTF(25, "01234567890123456789", "%s", "0123456789012345678901234"); break;
+		CASE_TEST(string_width);     EXPECT_VFPRINTF(10, "         1", "%10s", "1"); break;
+		CASE_TEST(number_width);     EXPECT_VFPRINTF(10, "         1", "%10d", 1); break;
+		CASE_TEST(width_trunc);      EXPECT_VFPRINTF(25, "                    ", "%25d", 1); break;
		CASE_TEST(scanf);            EXPECT_ZR(1, test_scanf()); break;
case __LINE__:
return ret; /* must be last */

-- 
2.49.0
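
A quick usage sketch (plain C, not part of the patch) of the behaviour the new
test cases above expect from nolibc's printf: the argument is padded on the left
with spaces up to the requested field width, and a width smaller than the
formatted value has no effect:

#include <stdio.h>

int main(void)
{
	printf("[%10s]\n", "1");	/* prints [         1]: nine spaces, then "1" */
	printf("[%10d]\n", 1);		/* prints [         1] */
	printf("[%3d]\n", 12345);	/* width smaller than the value, prints [12345] */
	return 0;
}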




Re: [PATCH v3 0/6] KVM: guest_memfd: support for uffd minor

2025-04-07 Thread Liam R. Howlett
* Nikita Kalyazin  [250407 07:04]:
> 
> 
> On 04/04/2025 18:12, Liam R. Howlett wrote:
> > +To authors of v7 series referenced in [1]
> > 
> > * Nikita Kalyazin  [250404 11:44]:
> > > This series is built on top of the Fuad's v7 "mapping guest_memfd backed
> > > memory at the host" [1].
> > 
> > I didn't see their addresses in the to/cc, so I added them to my
> > response as I reference the v7 patch set below.
> 
> Hi Liam,
> 
> Thanks for the feedback and for extending the list.
> 
> > 
> > > 
> > > With James's KVM userfault [2], it is possible to handle stage-2 faults
> > > in guest_memfd in userspace.  However, KVM itself also triggers faults
> > > in guest_memfd in some cases, for example: PV interfaces like kvmclock,
> > > PV EOI and page table walking code when fetching the MMIO instruction on
> > > x86.  It was agreed in the guest_memfd upstream call on 23 Jan 2025 [3]
> > > that KVM would be accessing those pages via userspace page tables.
> > 
> > Thanks for being open about the technical call, but it would be better
> > to capture the reasons and not the call date.  I explain why in the
> > linking section as well.
> 
> Thanks for bringing that up.  The document mostly contains the decision
> itself.  The main alternative considered previously was a temporary
> reintroduction of the pages to the direct map whenever a KVM-internal access
> is required.  It was coming with a significant complexity of guaranteeing
> correctness in all cases [1].  Since the memslot structure already contains
> a guest memory pointer supplied by the userspace, KVM can use it directly
> when in the VMM or vCPU context.  I will add this in the cover for the next
> version.

Thank you.

> 
> [1] 
> https://lore.kernel.org/kvm/20240709132041.3625501-1-roy...@amazon.co.uk/T/#m4f367c52bbad0f0ba7fb07ca347c7b37258a73e5
> 
> > 
> > > In
> > > order for such faults to be handled in userspace, guest_memfd needs to
> > > support userfaultfd.
> > > 
> > > Changes since v2 [4]:
> > >   - James: Fix sgp type when calling shmem_get_folio_gfp
> > >   - James: Improved vm_ops->fault() error handling
> > >   - James: Add and make use of the can_userfault() VMA operation
> > >   - James: Add UFFD_FEATURE_MINOR_GUEST_MEMFD feature flag
> > >   - James: Fix typos and add more checks in the test
> > > 
> > > Nikita
> > 
> > Please slow down...
> > 
> > This patch is at v3, the v7 patch that you are building off has lockdep
> > issues [1] reported by one of the authors, and (sorry for sounding harsh
> > about the v7 of that patch) the cover letter reads a bit more like an
> > RFC than a set ready to go into linux-mm.
> 
> AFAIK the lockdep issue was reported on a v7 of a different change.
> I'm basing my series on [2] ("KVM: Mapping guest_memfd backed memory at the
> host for software protected VMs"), while the issue was reported on [2]
> ("KVM: Restricted mapping of guest_memfd at the host and arm64 support"),
> which is also built on top of [2].  Please correct me if I'm missing
> something.

I think you messed up the numbering in your statement above.

I believe you are making the point that I messed up which patches depend
on what and your code does not depend on faulty locking, which appears
to be the case.

There are a few issues with the required patch set?

> 
> The key feature that is required by my series is the ability to mmap
> guest_memfd when the VM type allows.  My understanding is no-one is opposed
> to that as of now, that's why I assumed it's safe to build on top of that.
> 
> [2] https://lore.kernel.org/kvm/20250318161823.4005529-1-ta...@google.com/T/
> [3] 
> https://lore.kernel.org/all/diqz1puanquh@ackerleytng-ctop.c.googlers.com/T/

All of this is extremely confusing because the onus of figuring out what
the final code will look like is put on the reviewer.  As it is, we have
issues with people not doing enough review of the code (due to limited
time).  One way to get reviews is to make the barrier of entry as low as
possible.

I spent Friday going down a rabbit hole of patches referring to each
other as dependencies and I gave up.  It looks like I mistook one set of
patches as required vs them requiring the same in-flight ones as your
patches.

I am struggling to see how we can adequately support all of you given
the way the patches are sent out in batches with dependencies - it is
just too time consuming to sort out.

Thank you,
Liam
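
For context on the mechanism being discussed above (handling KVM-internal
guest_memfd accesses via userspace page tables and userfaultfd), a rough sketch
of the existing userfaultfd minor-fault flow that the series wants to extend.
UFFD_FEATURE_MINOR_GUEST_MEMFD is only proposed by this series; the snippet below
uses the current uffd API with UFFD_FEATURE_MINOR_SHMEM, is illustrative, and is
not taken from the patches:

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Register [addr, addr + len) for minor faults; later, resolve one fault
 * at fault_addr with UFFDIO_CONTINUE once the page has been populated
 * through another mapping of the same backing file (assumes 4 KiB pages). */
static int uffd_minor_sketch(void *addr, unsigned long len, unsigned long fault_addr)
{
	struct uffdio_api api = { .api = UFFD_API, .features = UFFD_FEATURE_MINOR_SHMEM };
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_MINOR,
	};
	struct uffdio_continue cont = {
		.range = { .start = fault_addr & ~0xfffUL, .len = 0x1000 },
	};
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
	    ioctl(uffd, UFFDIO_REGISTER, &reg))
		return -1;

	/* ... read the fault event from uffd, populate the page elsewhere ... */

	return ioctl(uffd, UFFDIO_CONTINUE, &cont);
}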




Re: [PATCH v3 0/6] KVM: guest_memfd: support for uffd minor

2025-04-07 Thread Nikita Kalyazin




On 07/04/2025 14:40, Liam R. Howlett wrote:

* Nikita Kalyazin  [250407 07:04]:



On 04/04/2025 18:12, Liam R. Howlett wrote:

+To authors of v7 series referenced in [1]

* Nikita Kalyazin  [250404 11:44]:

This series is built on top of the Fuad's v7 "mapping guest_memfd backed
memory at the host" [1].


I didn't see their addresses in the to/cc, so I added them to my
response as I reference the v7 patch set below.


Hi Liam,

Thanks for the feedback and for extending the list.





With James's KVM userfault [2], it is possible to handle stage-2 faults
in guest_memfd in userspace.  However, KVM itself also triggers faults
in guest_memfd in some cases, for example: PV interfaces like kvmclock,
PV EOI and page table walking code when fetching the MMIO instruction on
x86.  It was agreed in the guest_memfd upstream call on 23 Jan 2025 [3]
that KVM would be accessing those pages via userspace page tables.


Thanks for being open about the technical call, but it would be better
to capture the reasons and not the call date.  I explain why in the
linking section as well.


Thanks for bringing that up.  The document mostly contains the decision
itself.  The main alternative considered previously was a temporary
reintroduction of the pages to the direct map whenever a KVM-internal access
is required.  It was coming with a significant complexity of guaranteeing
correctness in all cases [1].  Since the memslot structure already contains
a guest memory pointer supplied by the userspace, KVM can use it directly
when in the VMM or vCPU context.  I will add this in the cover for the next
version.


Thank you.



[1] 
https://lore.kernel.org/kvm/20240709132041.3625501-1-roy...@amazon.co.uk/T/#m4f367c52bbad0f0ba7fb07ca347c7b37258a73e5




In
order for such faults to be handled in userspace, guest_memfd needs to
support userfaultfd.

Changes since v2 [4]:
   - James: Fix sgp type when calling shmem_get_folio_gfp
   - James: Improved vm_ops->fault() error handling
   - James: Add and make use of the can_userfault() VMA operation
   - James: Add UFFD_FEATURE_MINOR_GUEST_MEMFD feature flag
   - James: Fix typos and add more checks in the test

Nikita


Please slow down...

This patch is at v3, the v7 patch that you are building off has lockdep
issues [1] reported by one of the authors, and (sorry for sounding harsh
about the v7 of that patch) the cover letter reads a bit more like an
RFC than a set ready to go into linux-mm.


AFAIK the lockdep issue was reported on a v7 of a different change.
I'm basing my series on [2] ("KVM: Mapping guest_memfd backed memory at the
host for software protected VMs"), while the issue was reported on [2]
("KVM: Restricted mapping of guest_memfd at the host and arm64 support"),
which is also built on top of [2].  Please correct me if I'm missing
something.


I think you messed up the numbering in your statement above.


I did, in an attempt to make it "even more clear" :) Sorry about that, 
glad you got the intention.




I believe you are making the point that I messed up which patches depend
on what and your code does not depend on faulty locking, which appears
to be the case.

There are a few issues with the required patch set?


There are indeed, but not in the part this series depends on, as far as 
I can see.






The key feature that is required by my series is the ability to mmap
guest_memfd when the VM type allows.  My understanding is no-one is opposed
to that as of now, that's why I assumed it's safe to build on top of that.

[2] https://lore.kernel.org/kvm/20250318161823.4005529-1-ta...@google.com/T/
[3] 
https://lore.kernel.org/all/diqz1puanquh@ackerleytng-ctop.c.googlers.com/T/


All of this is extremely confusing because the onus of figuring out what
the final code will look like is put on the reviewer.  As it is, we have
issues with people not doing enough review of the code (due to limited
time).  One way to get reviews is to make the barrier of entry as low as
possible.

I spent Friday going down a rabbit hole of patches referring to each
other as dependencies and I gave up.  It looks like I mistook one set of
patches as required vs them requiring the same in-flight ones as your
patches.

I am struggling to see how we can adequately support all of you given
the way the patches are sent out in batches with dependencies - it is
just too time consuming to sort out.


I'm happy to do whatever I can to make the review easier.  I suppose the 
extreme case is to wait for the dependencies to get accepted, 
effectively serialising submissions, but that slows the process down 
significantly.  For example, I received very good feedback on v1 and v2 
of this series and was able to address it instead of waiting for the 
dependency.  Would including the required patches directly in the series 
help?  My only concern is that, in that case, the same patch will be submitted 
multiple times (as part of every dependent series), but if that is better, 
I'll do that instead.




Thank you

Re: [PATCH v3 0/6] KVM: guest_memfd: support for uffd minor

2025-04-07 Thread Nikita Kalyazin




On 04/04/2025 18:12, Liam R. Howlett wrote:

+To authors of v7 series referenced in [1]

* Nikita Kalyazin  [250404 11:44]:

This series is built on top of the Fuad's v7 "mapping guest_memfd backed
memory at the host" [1].


I didn't see their addresses in the to/cc, so I added them to my
response as I reference the v7 patch set below.


Hi Liam,

Thanks for the feedback and for extending the list.





With James's KVM userfault [2], it is possible to handle stage-2 faults
in guest_memfd in userspace.  However, KVM itself also triggers faults
in guest_memfd in some cases, for example: PV interfaces like kvmclock,
PV EOI and page table walking code when fetching the MMIO instruction on
x86.  It was agreed in the guest_memfd upstream call on 23 Jan 2025 [3]
that KVM would be accessing those pages via userspace page tables.


Thanks for being open about the technical call, but it would be better
to capture the reasons and not the call date.  I explain why in the
linking section as well.


Thanks for bringing that up.  The document mostly contains the decision 
itself.  The main alternative considered previously was a temporary 
reintroduction of the pages to the direct map whenever a KVM-internal 
access is required.  It was coming with a significant complexity of 
guaranteeing correctness in all cases [1].  Since the memslot structure 
already contains a guest memory pointer supplied by the userspace, KVM 
can use it directly when in the VMM or vCPU context.  I will add this in 
the cover for the next version.


[1] 
https://lore.kernel.org/kvm/20240709132041.3625501-1-roy...@amazon.co.uk/T/#m4f367c52bbad0f0ba7fb07ca347c7b37258a73e5





In
order for such faults to be handled in userspace, guest_memfd needs to
support userfaultfd.

Changes since v2 [4]:
  - James: Fix sgp type when calling shmem_get_folio_gfp
  - James: Improved vm_ops->fault() error handling
  - James: Add and make use of the can_userfault() VMA operation
  - James: Add UFFD_FEATURE_MINOR_GUEST_MEMFD feature flag
  - James: Fix typos and add more checks in the test

Nikita


Please slow down...

This patch is at v3, the v7 patch that you are building off has lockdep
issues [1] reported by one of the authors, and (sorry for sounding harsh
about the v7 of that patch) the cover letter reads a bit more like an
RFC than a set ready to go into linux-mm.


AFAIK the lockdep issue was reported on a v7 of a different change.
I'm basing my series on [2] ("KVM: Mapping guest_memfd backed memory at 
the host for software protected VMs"), while the issue was reported on 
[2] ("KVM: Restricted mapping of guest_memfd at the host and arm64 
support"), which is also built on top of [2].  Please correct me if I'm 
missing something.


The key feature that is required by my series is the ability to mmap 
guest_memfd when the VM type allows.  My understanding is no-one is 
opposed to that as of now, that's why I assumed it's safe to build on 
top of that.


[2] https://lore.kernel.org/kvm/20250318161823.4005529-1-ta...@google.com/T/
[3] 
https://lore.kernel.org/all/diqz1puanquh@ackerleytng-ctop.c.googlers.com/T/




Maybe the lockdep issue is just a patch ordering thing or removed in a
later patch set, but that's not mentioned in the discovery email?

What exactly is the goal here and the path forward for the rest of us
trying to build on this once it's in mm-new/mm-unstable?

Note that mm-unstable is shared with a lot of other people through
linux-next, and we are really trying to stop breaking stuff on them.

Obviously v7 cannot go in until it works with lockdep - otherwise none
of us can use lockdep which is not okay.

Also, I am concerned about the amount of testing in the v7 and v3 patch
sets that did not bring up a lockdep issue..



[1] https://lore.kernel.org/kvm/20250318161823.4005529-1-ta...@google.com/T/
[2] 
https://lore.kernel.org/kvm/20250109204929.1106563-1-jthough...@google.com/T/
[3] 
https://docs.google.com/document/d/1M6766BzdY1Lhk7LiR5IqVR8B8mG3cr-cxTxOrAosPOk/edit?tab=t.0#heading=h.w1126rgli5e3


If there is anything we need to know about the decisions in the call and
that document, can you please pull it into this change log?

I don't think anyone can ensure google will not rename docs to some
other office theme tomorrow - as they famously ditch basically every
name and application.

Also, most of the community does not want to go to a 17 page (and
growing) spreadsheet to hunt down the facts when there is an acceptable
and ideal place to document them in git.  It's another barrier of entry
on reviewing your code as well.

But please, don't take this suggestion as carte blanche for copying a
conversation from the doc, just give us the technical reasons for your
decisions as briefly as possible.



[4] https://lore.kernel.org/kvm/20250402160721.97596-1-kalya...@amazon.com/T/


[1]. 
https://lore.kernel.org/all/diqz1puanquh@ackerleytng-ctop.c.googlers.com/

Thanks,
Liam





Re: [PATCH v3] remoteproc: imx_dsp_rproc: Add support for DSP-specific features

2025-04-07 Thread Iuliana Prodan

Hi Frank,

On 4/3/2025 10:12 PM, Frank Li wrote:

On Thu, Apr 03, 2025 at 01:01:24PM +0300, Iuliana Prodan (OSS) wrote:

From: Iuliana Prodan 

subject: remoteproc: imx_dsp_rproc: add handle_rsc callback to handle 
DSP-specific features


Some DSP firmware requires a FW_READY signal before proceeding, while
others do not.
Therefore, add support to handle i.MX DSP-specific features.

Add support to handle i.MX DSP-specific features, because some DSP firmware
requires a FW_READY signal before proceeding.


Implement handle_rsc callback to handle resource table parsing and to
process DSP-specific resource, to determine if waiting is needed.


Implement the handle_rsc callback to parse the resource table and process
DSP-specific resources to determine if waiting is needed.


Update imx_dsp_rproc_start() to handle this condition accordingly.

Signed-off-by: Iuliana Prodan 
---
Changes in v3:
- Reviews from Mathieu Poirier:
   - Added version and magic number to vendor-specific resource table entry.
   - Updated defines to maintain backward compatibility with a resource table 
that doesn't have a vendor-specific resource.
 - By default, wait for `fw_ready`, unless specified otherwise.
- Link to v2: 
https://lore.kernel.org/all/20250318215007.2109726-1-iuliana.pro...@oss.nxp.com

Changes in v2:
- Reviews from Mathieu Poirier:
   - Use vendor-specific resource table entry.
   - Implement resource handler specific to the i.MX DSP.
- Revise commit message to include recent updates.
- Link to v1: 
https://lore.kernel.org/all/20250305123923.514386-1-iuliana.pro...@oss.nxp.com/

  drivers/remoteproc/imx_dsp_rproc.c | 102 -
  1 file changed, 100 insertions(+), 2 deletions(-)

diff --git a/drivers/remoteproc/imx_dsp_rproc.c 
b/drivers/remoteproc/imx_dsp_rproc.c
index b9bb15970966..80d4470cc731 100644
--- a/drivers/remoteproc/imx_dsp_rproc.c
+++ b/drivers/remoteproc/imx_dsp_rproc.c
@@ -35,9 +35,17 @@ module_param_named(no_mailboxes, no_mailboxes, int, 0644);
  MODULE_PARM_DESC(no_mailboxes,
 "There is no mailbox between cores, so ignore remote proc reply 
after start, default is 0 (off).");

+/* Flag indicating that the remote is up and running */
  #define REMOTE_IS_READY   BIT(0)
+/* Flag indicating that the host should wait for a firmware-ready response */
+#define WAIT_FW_READY  BIT(1)
  #define REMOTE_READY_WAIT_MAX_RETRIES 500

+/* This flag is set in the DSP resource table's features field to indicate
+ * that the firmware requires the host NOT to wait for a FW_READY response.
+ */

multi line comments should be
/*
  * This ..
  */

+#define FEATURE_DONT_WAIT_FW_READY BIT(0)
+
  /* att flags */
  /* DSP own area */
  #define ATT_OWN   BIT(31)
@@ -72,6 +80,10 @@ MODULE_PARM_DESC(no_mailboxes,

  #define IMX8ULP_SIP_HIFI_XRDC 0xc20e

+#define FW_RSC_NXP_S_MAGIC ((uint32_t)'n' << 24 |\
+(uint32_t)'x' << 16 |\
+(uint32_t)'p' << 8 | \
+(uint32_t)'s')
  /*
   * enum - Predefined Mailbox Messages
   *
@@ -136,6 +148,24 @@ struct imx_dsp_rproc_dcfg {
int (*reset)(struct imx_dsp_rproc *priv);
  };

+/**
+ * struct fw_rsc_imx_dsp - i.MX DSP specific info
+ *
+ * @len: length of the resource entry
+ * @magic_num: 32-bit magic number
+ * @version: version of data structure
+ * @features: feature flags supported by the i.MX DSP firmware
+ *
+ * This represents a DSP-specific resource in the firmware's
+ * resource table, providing information on supported features.
+ */
+struct fw_rsc_imx_dsp {
+   uint32_t len;
+   uint32_t magic_num;
+   uint32_t version;
+   uint32_t features;
+} __packed;
+
  static const struct imx_rproc_att imx_dsp_rproc_att_imx8qm[] = {
/* dev addr , sys addr  , size  , flags */
{ 0x596e8000, 0x556e8000, 0x8000, ATT_OWN },
@@ -300,6 +330,73 @@ static int imx_dsp_rproc_ready(struct rproc *rproc)
return -ETIMEDOUT;
  }

+/**
+ * imx_dsp_rproc_handle_rsc() - Handle DSP-specific resource table entries
+ * @rproc: remote processor instance
+ * @rsc_type: resource type identifier
+ * @rsc: pointer to the resource entry
+ * @offset: offset of the resource entry
+ * @avail: available space in the resource table
+ *
+ * Parse the DSP-specific resource entry and update flags accordingly.
+ * If the WAIT_FW_READY feature is set, the host must wait for the firmware
+ * to signal readiness before proceeding with execution.
+ *
+ * Return: RSC_HANDLED if processed successfully, RSC_IGNORED otherwise.
+ */
+static int imx_dsp_rproc_handle_rsc(struct rproc *rproc, u32 rsc_type,
+   void *rsc, int offset, int avail)
+{
+   struct imx_dsp_rproc *priv = rproc->priv;
+   struct fw_rsc_
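
Purely as an illustration of the kind of validation such a handle_rsc callback
performs against the vendor-specific entry defined above, and not the author's
actual implementation, a hedged sketch that reuses the struct and macros from the
patch (the priv->flags field is assumed here):

/* Illustrative only: validate the vendor resource before trusting its
 * feature flags; default to waiting for FW_READY unless the firmware
 * explicitly opts out via FEATURE_DONT_WAIT_FW_READY. */
static int example_handle_rsc(struct rproc *rproc, u32 rsc_type,
			      void *rsc, int offset, int avail)
{
	struct imx_dsp_rproc *priv = rproc->priv;
	struct fw_rsc_imx_dsp *imx_rsc = rsc;

	if (rsc_type != RSC_VENDOR_START)
		return RSC_IGNORED;

	if (avail < (int)sizeof(*imx_rsc) ||
	    imx_rsc->len < sizeof(*imx_rsc) ||
	    imx_rsc->magic_num != FW_RSC_NXP_S_MAGIC)
		return RSC_IGNORED;

	if (!(imx_rsc->features & FEATURE_DONT_WAIT_FW_READY))
		priv->flags |= WAIT_FW_READY;	/* assumed driver flags field */

	return RSC_HANDLED;
}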

Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread David Hildenbrand

On 07.04.25 09:52, Michael S. Tsirkin wrote:

On Fri, Apr 04, 2025 at 05:39:10PM +0200, Halil Pasic wrote:


Not perfect, but AFAIKS, not horrible.


It is like it is. QEMU makes the queue exist if the corresponding feature
is offered by the device, and that is what we have to live with.


I don't think we can live with this properly though.
It means a guest that does not know about some features
does not know where to find things.


Please describe a real scenario, I'm missing the point.

Whoever adds new feat_X *must be aware* about all previous features, 
otherwise we'd be reusing feature bits and everything falls to pieces.




So now, I am inclined to add linux code to work with current qemu and
with spec compliant one, and add qemu code to work with current linux
and spec compliant one.

Document the bug in the spec, maybe, in a non conformance section.


I'm afraid this results in a lot of churn without really making things 
better.


IMHO, documenting how things actually behave, and maybe moving 
towards fixed queue indexes for new features, is the low-hanging fruit.


As raised, it's not just qemu+linux, it's *at least* also cloud-hypervisor.

--
Cheers,

David / dhildenb




[PATCH] selftests: mptcp: add comment for getaddrinfo

2025-04-07 Thread zhenwei pi
mptcp_connect.c is a starting tutorial for MPTCP programming; however,
it lacks any ai_protocol (IPPROTO_MPTCP) usage. Add a comment about
getaddrinfo MPTCP support.

Signed-off-by: zhenwei pi 
Signed-off-by: zhenwei pi 
---
 tools/testing/selftests/net/mptcp/mptcp_connect.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c 
b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index c83a8b47bbdf..6b9031273964 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -179,6 +179,18 @@ static void xgetnameinfo(const struct sockaddr *addr, 
socklen_t addrlen,
}
 }
 
+/* glibc lacks MPTCP support, so this kind of code leads to an error:
+ * struct addrinfo hints = {
+ * .ai_protocol = IPPROTO_MPTCP,
+ * ...
+ * };
+ * err = getaddrinfo(node, service, &hints, res);
+ * ...
+ * So resolve with IPPROTO_TCP, and use TCP/MPTCP when creating the socket.
+ *
+ * glibc supports MPTCP starting with v2.42.
+ * Link: https://sourceware.org/git/?p=glibc.git;a=commit;h=a8e9022e0f82
+ */
 static void xgetaddrinfo(const char *node, const char *service,
 const struct addrinfo *hints,
 struct addrinfo **res)
-- 
2.34.1
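
A minimal userspace sketch of the workaround the comment describes: resolve with
IPPROTO_TCP in the hints (so older glibc does not reject the request) and then
create the socket with IPPROTO_MPTCP explicitly. This is illustrative, not code
copied from mptcp_connect.c; IPPROTO_MPTCP is 262 on Linux:

#include <netdb.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif

static int connect_mptcp(const char *node, const char *service)
{
	struct addrinfo hints = {
		.ai_family   = AF_UNSPEC,
		.ai_socktype = SOCK_STREAM,
		.ai_protocol = IPPROTO_TCP,	/* resolve with TCP, see comment in the patch */
	};
	struct addrinfo *res, *ai;
	int fd = -1;

	if (getaddrinfo(node, service, &hints, &res))
		return -1;

	for (ai = res; ai; ai = ai->ai_next) {
		fd = socket(ai->ai_family, ai->ai_socktype, IPPROTO_MPTCP);
		if (fd < 0)
			continue;
		if (connect(fd, ai->ai_addr, ai->ai_addrlen) == 0)
			break;
		close(fd);
		fd = -1;
	}

	freeaddrinfo(res);
	return fd;
}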




Re: [PATCH v9 08/26] remoteproc: k3-r5: Refactor sequential core power up/down operations

2025-04-07 Thread Andrew Davis

On 3/17/25 7:06 AM, Beleswar Padhi wrote:

The existing implementation of the waiting mechanism in
"k3_r5_cluster_rproc_init()" waits for the "released_from_reset" flag to
be set as part of the firmware boot process in "k3_r5_rproc_start()".
The "k3_r5_cluster_rproc_init()" function is invoked in the probe
routine which causes unexpected failures in cases where the firmware is
unavailable at boot time, resulting in probe failure and removal of the
remoteproc handles in the sysfs paths.

To address this, the waiting mechanism is refactored out of the probe
routine into the appropriate "k3_r5_rproc_{prepare/unprepare}()"
functions. This allows the probe routine to complete without depending
on firmware booting, while still maintaining the required
power-synchronization between cores.

Further, this wait mechanism is dropped from
"k3_r5_rproc_{start/stop}()" functions as they deal with Core Run/Halt
operations, and as such, there is no constraint in Running or Halting
the cores of a cluster in order.

Fixes: 61f6f68447ab ("remoteproc: k3-r5: Wait for core0 power-up before powering up 
core1")
Signed-off-by: Beleswar Padhi 
---


Same as the above two patches in this series, these are all valid fixes, but 
should be
done first before the refactoring begins, so move them to the start of the 
series.

Andrew


  drivers/remoteproc/ti_k3_r5_remoteproc.c | 114 +--
  1 file changed, 65 insertions(+), 49 deletions(-)

diff --git a/drivers/remoteproc/ti_k3_r5_remoteproc.c 
b/drivers/remoteproc/ti_k3_r5_remoteproc.c
index c0e4da82775d..30081eafbd36 100644
--- a/drivers/remoteproc/ti_k3_r5_remoteproc.c
+++ b/drivers/remoteproc/ti_k3_r5_remoteproc.c
@@ -475,7 +475,7 @@ static int k3_r5_rproc_request_mbox(struct rproc *rproc)
  static int k3_r5_rproc_prepare(struct rproc *rproc)
  {
struct k3_r5_rproc *kproc = rproc->priv;
-   struct k3_r5_core *core = kproc->priv;
+   struct k3_r5_core *core = kproc->priv, *core0, *core1;
struct k3_r5_cluster *cluster = core->cluster;
struct device *dev = kproc->dev;
u32 ctrl = 0, cfg = 0, stat = 0;
@@ -483,6 +483,29 @@ static int k3_r5_rproc_prepare(struct rproc *rproc)
bool mem_init_dis;
int ret;
  
+	/*

+* R5 cores require to be powered on sequentially, core0 should be in
+* higher power state than core1 in a cluster. So, wait for core0 to
+* power up before proceeding to core1 and put timeout of 2sec. This
+* waiting mechanism is necessary because rproc_auto_boot_callback() for
+* core1 can be called before core0 due to thread execution order.
+*
+* By placing the wait mechanism here in .prepare() ops, this condition
+* is enforced for rproc boot requests from sysfs as well.
+*/
+   core0 = list_first_entry(&cluster->cores, struct k3_r5_core, elem);
+   core1 = list_last_entry(&cluster->cores, struct k3_r5_core, elem);
+   if (cluster->mode == CLUSTER_MODE_SPLIT && core == core1 &&
+   !core0->released_from_reset) {
+   ret = wait_event_interruptible_timeout(cluster->core_transition,
+  
core0->released_from_reset,
+  msecs_to_jiffies(2000));
+   if (ret <= 0) {
+   dev_err(dev, "can not power up core1 before core0");
+   return -EPERM;
+   }
+   }
+
ret = ti_sci_proc_get_status(kproc->tsp, &boot_vec, &cfg, &ctrl, &stat);
if (ret < 0)
return ret;
@@ -498,6 +521,14 @@ static int k3_r5_rproc_prepare(struct rproc *rproc)
return ret;
}
  
+	/*

+* Notify all threads in the wait queue when core0 state has changed so
+* that threads waiting for this condition can be executed.
+*/
+   core->released_from_reset = true;
+   if (core == core0)
+   wake_up_interruptible(&cluster->core_transition);
+
/*
 * Newer IP revisions like on J7200 SoCs support h/w auto-initialization
 * of TCMs, so there is no need to perform the s/w memzero. This bit is
@@ -542,11 +573,31 @@ static int k3_r5_rproc_prepare(struct rproc *rproc)
  static int k3_r5_rproc_unprepare(struct rproc *rproc)
  {
struct k3_r5_rproc *kproc = rproc->priv;
-   struct k3_r5_core *core = kproc->priv;
+   struct k3_r5_core *core = kproc->priv, *core0, *core1;
struct k3_r5_cluster *cluster = core->cluster;
struct device *dev = kproc->dev;
int ret;
  
+	/*

+* Ensure power-down of cores is sequential in split mode. Core1 must
+* power down before Core0 to maintain the expected state. By placing
+* the wait mechanism here in .unprepare() ops, this condition is
+* enforced for rproc stop or shutdown requests from sysfs and device
+* removal as well.
+*/
+   core0 = list_first_entry

Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread David Hildenbrand

On 07.04.25 10:49, Michael S. Tsirkin wrote:

On Mon, Apr 07, 2025 at 10:44:21AM +0200, David Hildenbrand wrote:





Whoever adds new feat_X *must be aware* about all previous features,
otherwise we'd be reusing feature bits and everything falls to pieces.



The knowledge is supposed to be limited to which feature bit to use.


I think we also have to know which virtqueue bits can be used, right?



what are virtqueue bits? vq number?


Yes, sorry.

Assume crosvm as an example. It would make use of virtqueue indexes 
5+6 with their VIRTIO_BALLOON_F_WS_REPORTING.


So whatever feature another device implements couldn't use this feature 
bit or these virtqueue indexes.


(as long as the other device never intends to implement 
VIRTIO_BALLOON_F_WS_REPORTING, the virtqueue indexes could be reused. 
But the spec will also be a mess, because virtqueue indexes could also 
have duplicate meanings ... ugh)


--
Cheers,

David / dhildenb




Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread Halil Pasic
On Mon, 7 Apr 2025 04:34:29 -0400
"Michael S. Tsirkin"  wrote:

> On Mon, Apr 07, 2025 at 10:17:10AM +0200, David Hildenbrand wrote:
> > On 07.04.25 09:52, Michael S. Tsirkin wrote:  
> > > On Fri, Apr 04, 2025 at 05:39:10PM +0200, Halil Pasic wrote:  
> > > > > 
> > > > > Not perfect, but AFAIKS, not horrible.  
> > > > 
> > > > It is like it is. QEMU does queue exist if the corresponding feature
> > > > is offered by the device, and that is what we have to live with.  
> > > 
> > > I don't think we can live with this properly though.
> > > It means a guest that does not know about some features
> > > does not know where to find things.  
> > 
> > Please describe a real scenario, I'm missing the point.  
> 
> 
> OK so.
> 
> Device has VIRTIO_BALLOON_F_FREE_PAGE_HINT and VIRTIO_BALLOON_F_REPORTING
> Driver only knows about VIRTIO_BALLOON_F_REPORTING so
> it does not know what does VIRTIO_BALLOON_F_FREE_PAGE_HINT do.
> How does it know which vq to use for reporting?
> It will try to use the free page hint one.

First, sorry for not catching up again with the discussion earlier.

I think David's point is based on the assumption that, by the time the
feature with feature bit N+1 is specified and allocates a queue Q, all
queues with indexes smaller than Q are already allocated and possibly
associated with features that were previously specified (and probably
have feature bits smaller than N+1).

I.e. we can mandate that, even if you don't want to care about other
optional features, you have to for the purpose of virtqueue existence,
because we say so. And you don't have to care about anything in the
future, because the queue index associated with future features is
larger than Q, so it does not affect our position.

I think that argument can fall apart if:
* future features reference optional queues defined in the past
* somebody managed to introduce a limbo where a feature is reserved, and
  they can not decide if they want a queue or not, or make the existence
  of the queue depend on something else than a feature bit.

Frankly I don't think the risks are huge, but it would undoubtedly make
the spec more ugly.

> 
> 
> 
> > Whoever adds new feat_X *must be aware* about all previous features,
> > otherwise we'd be reusing feature bits and everything falls to pieces.  
> 
> 
> The knowledge is supposed be limited to which feature bit to use.
> 

I do agree! This is why I brought this question up. Creating exceptions
from that rule would be very ugly IMHO. But I would not say it is
impossible.

> 
> 
> > > 
> > > So now, I am inclined to add linux code to work with current qemu and
> > > with spec compliant one, and add qemu code to work with current linux
> > > and spec compliant one.
> > > 
> > > Document the bug in the spec, maybe, in a non conformance section.  
> > 
> > I'm afraid this results in a lot of churn without really making things
> > better.  
> 
> > IMHO, documenting things how they actually behave, and maybe moving towards
> > fixed queue indexes for new features is the low hanging fruit.  
> 
> I worry about how to we ensure that?
> If old code is messed up people will just keep propagating that.
> I would like to fix old code so that new code is correct.
> 
> > 
> > As raised, it's not just qemu+linux, it's *at least* also cloud-hypervisor.
> > 
> > -- 
> > Cheers,
> > 
> > David / dhildenb  
> 
> There's a slippery slope here in that people will come to us
> with buggy devices and ask to change the spec.
> 

I agree! IMHO all we have are bad options. To decide for myself which is
less ugly I would love to see both.

I agree making the spec bend to the fact that implementations are buggy
does not seem right from the spec perspective. But if making things work
is not practical without sacrificing the sanctity of the spec, I am
willing to swallow the bitter pill and bend the spec.

If you don't mind, please try to keep me in the loop, even if I'm not
able to be as responsive as I would like to be. I'm happy that fixing
the code is going to get another round of consideration.

Regards,
Halil




Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread Cornelia Huck
On Mon, Apr 07 2025, David Hildenbrand  wrote:

> On 07.04.25 15:12, Halil Pasic wrote:
>> On Mon, 7 Apr 2025 04:34:29 -0400
>> "Michael S. Tsirkin"  wrote:
>> 
>>> On Mon, Apr 07, 2025 at 10:17:10AM +0200, David Hildenbrand wrote:
 On 07.04.25 09:52, Michael S. Tsirkin wrote:
> On Fri, Apr 04, 2025 at 05:39:10PM +0200, Halil Pasic wrote:
>>>
>>> Not perfect, but AFAIKS, not horrible.
>>
>> It is like it is. QEMU does queue exist if the corresponding feature
>> is offered by the device, and that is what we have to live with.
>
> I don't think we can live with this properly though.
> It means a guest that does not know about some features
> does not know where to find things.

 Please describe a real scenario, I'm missing the point.
>>>
>>>
>>> OK so.
>>>
>>> Device has VIRTIO_BALLOON_F_FREE_PAGE_HINT and VIRTIO_BALLOON_F_REPORTING
>>> Driver only knows about VIRTIO_BALLOON_F_REPORTING so
>>> it does not know what does VIRTIO_BALLOON_F_FREE_PAGE_HINT do.
>>> How does it know which vq to use for reporting?
>>> It will try to use the free page hint one.
>> 
>> First, sorry for not catching up again with the discussion earlier.
>> 
>> I think David's point is based on the assumption that by the time feature
>> with the feature bit N+1 is specified and allocates a queue Q, all
>> queues with indexes smaller than Q are allocated and possibly associated
>> with features that were previously specified (and probably have feature
>> bits smaller than N+1).
>> 
>> I.e. that we can mandate, even if you don't want to care about other
>> optional features, you have to, because we say so, for the matter of
>> virtqueue existence. And anything in the future, you don't have to care
>> about because the queue index associated with future features is larger
>> than Q, so it does not affect our position.
>> 
>> I think that argument can fall a part if:
>> * future features reference optional queues defined in the past
>> * somebody managed to introduce a limbo where a feature is reserved, and
>>they can not decide if they want a queue or not, or make the existence
>>of the queue depend on something else than a feature bit.
>
> Staring at the cross-vmm, including the adding+removing of features and 
> queues that are not in the spec, I am wondering if (in a world with 
> fixed virtqueues)
>
> 1) Feature bits must be reserved before used.
>
> 2) Queue indices must be reserved before used.
>
> It all smells like a problem similar to device IDs ...

Indeed, we need a rule "reserve a feature bit/queue index before using
it, even if you do not plan to spec it properly".




Re: [PATCH v9 01/26] remoteproc: k3-r5: Re-order internal memory initialization function

2025-04-07 Thread Andrew Davis

On 3/17/25 7:05 AM, Beleswar Padhi wrote:

The core's internal memory data structure will be refactored to be part
of the k3_r5_rproc structure in a future commit. As a result, internal
memory initialization will need to be performed inside
k3_r5_cluster_rproc_init() after rproc_alloc().

Therefore, move the internal memory initialization function,
k3_r5_core_of_get_internal_memories() above k3_r5_rproc_init() so that
it can be invoked from there.

Signed-off-by: Beleswar Padhi 
---


Just to keep things organized, does it make sense to also move
the other k3_r5_core_of_get_*_memories() up with this?

Also, you move k3_r5_release_tsp() up too but don't mention
that in the commit message.

Andrew


  drivers/remoteproc/ti_k3_r5_remoteproc.c | 158 +++
  1 file changed, 79 insertions(+), 79 deletions(-)

diff --git a/drivers/remoteproc/ti_k3_r5_remoteproc.c 
b/drivers/remoteproc/ti_k3_r5_remoteproc.c
index dbc513c5569c..b2738b9a1b2d 100644
--- a/drivers/remoteproc/ti_k3_r5_remoteproc.c
+++ b/drivers/remoteproc/ti_k3_r5_remoteproc.c
@@ -1199,6 +1199,85 @@ static int k3_r5_rproc_configure_mode(struct k3_r5_rproc 
*kproc)
return ret;
  }
  
+static int k3_r5_core_of_get_internal_memories(struct platform_device *pdev,

+  struct k3_r5_core *core)
+{
+   static const char * const mem_names[] = {"atcm", "btcm"};
+   struct device *dev = &pdev->dev;
+   struct resource *res;
+   int num_mems;
+   int i;
+
+   num_mems = ARRAY_SIZE(mem_names);
+   core->mem = devm_kcalloc(dev, num_mems, sizeof(*core->mem), GFP_KERNEL);
+   if (!core->mem)
+   return -ENOMEM;
+
+   for (i = 0; i < num_mems; i++) {
+   res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+  mem_names[i]);
+   if (!res) {
+   dev_err(dev, "found no memory resource for %s\n",
+   mem_names[i]);
+   return -EINVAL;
+   }
+   if (!devm_request_mem_region(dev, res->start,
+resource_size(res),
+dev_name(dev))) {
+   dev_err(dev, "could not request %s region for 
resource\n",
+   mem_names[i]);
+   return -EBUSY;
+   }
+
+   /*
+* TCMs are designed in general to support RAM-like backing
+* memories. So, map these as Normal Non-Cached memories. This
+* also avoids/fixes any potential alignment faults due to
+* unaligned data accesses when using memcpy() or memset()
+* functions (normally seen with device type memory).
+*/
+   core->mem[i].cpu_addr = devm_ioremap_wc(dev, res->start,
+   resource_size(res));
+   if (!core->mem[i].cpu_addr) {
+   dev_err(dev, "failed to map %s memory\n", mem_names[i]);
+   return -ENOMEM;
+   }
+   core->mem[i].bus_addr = res->start;
+
+   /*
+* TODO:
+* The R5F cores can place ATCM & BTCM anywhere in its address
+* based on the corresponding Region Registers in the System
+* Control coprocessor. For now, place ATCM and BTCM at
+* addresses 0 and 0x4101 (same as the bus address on AM65x
+* SoCs) based on loczrama setting
+*/
+   if (!strcmp(mem_names[i], "atcm")) {
+   core->mem[i].dev_addr = core->loczrama ?
+   0 : K3_R5_TCM_DEV_ADDR;
+   } else {
+   core->mem[i].dev_addr = core->loczrama ?
+   K3_R5_TCM_DEV_ADDR : 0;
+   }
+   core->mem[i].size = resource_size(res);
+
+   dev_dbg(dev, "memory %5s: bus addr %pa size 0x%zx va %pK da 
0x%x\n",
+   mem_names[i], &core->mem[i].bus_addr,
+   core->mem[i].size, core->mem[i].cpu_addr,
+   core->mem[i].dev_addr);
+   }
+   core->num_mems = num_mems;
+
+   return 0;
+}
+
+static void k3_r5_release_tsp(void *data)
+{
+   struct ti_sci_proc *tsp = data;
+
+   ti_sci_proc_release(tsp);
+}
+
  static int k3_r5_cluster_rproc_init(struct platform_device *pdev)
  {
struct k3_r5_cluster *cluster = platform_get_drvdata(pdev);
@@ -1358,78 +1437,6 @@ static void k3_r5_cluster_rproc_exit(void *data)
}
  }
  
-static int k3_r5_core_of_get_internal_memories(struct platform_device *pdev,

-  struct k3_r5_core *core)
-{
-   static const char *

Re: [PATCH v9 06/26] remoteproc: k3-r5: Drop check performed in k3_r5_rproc_{mbox_callback/kick}

2025-04-07 Thread Andrew Davis

On 3/17/25 7:06 AM, Beleswar Padhi wrote:

From: Siddharth Vadapalli 

Commit f3f11cfe8907 ("remoteproc: k3-r5: Acquire mailbox handle during
probe routine") introduced a check in the "k3_r5_rproc_mbox_callback()"
and "k3_r5_rproc_kick()" callbacks, causing them to exit if the remote
core's state is "RPROC_DETACHED". However, the "__rproc_attach()"
function that is responsible for attaching to a remote core, updates
the state of the remote core to "RPROC_ATTACHED" only after invoking
"rproc_start_subdevices()".

The "rproc_start_subdevices()" function triggers the probe of the Virtio
RPMsg devices associated with the remote core, which require that the
"k3_r5_rproc_kick()" and "k3_r5_rproc_mbox_callback()" callbacks are
functional. Hence, drop the check in the callbacks.

Fixes: f3f11cfe8907 ("remoteproc: k3-r5: Acquire mailbox handle during probe 
routine")
Signed-off-by: Siddharth Vadapalli 
Signed-off-by: Beleswar Padhi 
---


This patch seems out of place here. While you do need to do this before
the next couple of patches, it stands alone and should probably go at
the start of the series, before the refactoring begins.

Andrew


  drivers/remoteproc/ti_k3_r5_remoteproc.c | 8 
  1 file changed, 8 deletions(-)

diff --git a/drivers/remoteproc/ti_k3_r5_remoteproc.c 
b/drivers/remoteproc/ti_k3_r5_remoteproc.c
index 29205d9e21af..c0e4da82775d 100644
--- a/drivers/remoteproc/ti_k3_r5_remoteproc.c
+++ b/drivers/remoteproc/ti_k3_r5_remoteproc.c
@@ -224,10 +224,6 @@ static void k3_r5_rproc_mbox_callback(struct mbox_client 
*client, void *data)
const char *name = kproc->rproc->name;
u32 msg = omap_mbox_message(data);
  
-	/* Do not forward message from a detached core */

-   if (kproc->rproc->state == RPROC_DETACHED)
-   return;
-
dev_dbg(dev, "mbox msg: 0x%x\n", msg);
  
  	switch (msg) {

@@ -263,10 +259,6 @@ static void k3_r5_rproc_kick(struct rproc *rproc, int vqid)
mbox_msg_t msg = (mbox_msg_t)vqid;
int ret;
  
-	/* Do not forward message to a detached core */

-   if (kproc->rproc->state == RPROC_DETACHED)
-   return;
-
/* send the index of the triggered virtqueue in the mailbox payload */
ret = mbox_send_message(kproc->mbox, (void *)msg);
if (ret < 0)




Re: [RESEND] virtiofs: add filesystem context source name check

2025-04-07 Thread Christian Brauner
On Mon, 07 Apr 2025 19:50:49 +0800, Xiangsheng Hou wrote:
> In certain scenarios, for example, during fuzz testing, the source
> name may be NULL, which could lead to a kernel panic. Therefore, an
> extra check for the source name should be added.
> 
> 

Applied to the vfs.fixes branch of the vfs/vfs.git tree.
Patches in the vfs.fixes branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs.fixes

[1/1] virtiofs: add filesystem context source name check
  https://git.kernel.org/vfs/vfs/c/a94fd938df2b
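
For readers who have not seen the patch itself, a sketch of the kind of guard it
describes, placed in the filesystem's get_tree path; the exact function, message,
and error path in the applied commit may differ:

/* Illustrative only: reject a mount whose fs_context carries no source
 * string instead of dereferencing a NULL name later on. */
static int virtio_fs_get_tree_sketch(struct fs_context *fsc)
{
	if (!fsc->source)
		return invalf(fsc, "virtiofs: no source specified");

	/* ... continue with the normal mount setup ... */
	return 0;
}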



Re: [PATCH v9 02/26] remoteproc: k3-r5: Refactor Data Structures to Align with DSP and M4

2025-04-07 Thread Andrew Davis

On 3/17/25 7:05 AM, Beleswar Padhi wrote:

Currently, struct members such as mem, num_mems, reset, tsp, ti_sci and
ti_sci_id are part of the k3_r5_core structure. To align the rproc->priv
data structure of the R5 remote processor with that of the DSP and M4,
move the above members from k3_r5_core to k3_r5_rproc.

Additionally, introduce a void *priv pointer in k3_r5_rproc that can be
typecasted to point to the k3_r5_core structure. This abstraction is
done to ensure common functionalities across R5, DSP and M4 drivers can
be refactored at a later stage.

Signed-off-by: Beleswar Padhi 
---
  drivers/remoteproc/ti_k3_r5_remoteproc.c | 381 ---
  1 file changed, 198 insertions(+), 183 deletions(-)

diff --git a/drivers/remoteproc/ti_k3_r5_remoteproc.c 
b/drivers/remoteproc/ti_k3_r5_remoteproc.c
index b2738b9a1b2d..525f26996b56 100644
--- a/drivers/remoteproc/ti_k3_r5_remoteproc.c
+++ b/drivers/remoteproc/ti_k3_r5_remoteproc.c
@@ -114,19 +114,16 @@ struct k3_r5_cluster {
const struct k3_r5_soc_data *soc_data;
  };
  
+struct k3_r5_rproc;

+
  /**
   * struct k3_r5_core - K3 R5 core structure
   * @elem: linked list item
   * @dev: cached device pointer
- * @rproc: rproc handle representing this core
- * @mem: internal memory regions data
+ * @kproc: K3 rproc handle representing this core
+ * @cluster: cached pointer to parent cluster structure
   * @sram: on-chip SRAM memory regions data
- * @num_mems: number of internal memory regions
   * @num_sram: number of on-chip SRAM memory regions
- * @reset: reset control handle
- * @tsp: TI-SCI processor control handle
- * @ti_sci: TI-SCI handle
- * @ti_sci_id: TI-SCI device identifier
   * @atcm_enable: flag to control ATCM enablement
   * @btcm_enable: flag to control BTCM enablement
   * @loczrama: flag to dictate which TCM is at device address 0x0
@@ -135,15 +132,10 @@ struct k3_r5_cluster {
  struct k3_r5_core {
struct list_head elem;
struct device *dev;
-   struct rproc *rproc;
-   struct k3_r5_mem *mem;
+   struct k3_r5_rproc *kproc;
+   struct k3_r5_cluster *cluster;
struct k3_r5_mem *sram;
-   int num_mems;
int num_sram;
-   struct reset_control *reset;
-   struct ti_sci_proc *tsp;
-   const struct ti_sci_handle *ti_sci;
-   u32 ti_sci_id;
u32 atcm_enable;
u32 btcm_enable;
u32 loczrama;
@@ -153,23 +145,33 @@ struct k3_r5_core {
  /**
   * struct k3_r5_rproc - K3 remote processor state
   * @dev: cached device pointer
- * @cluster: cached pointer to parent cluster structure
- * @mbox: mailbox channel handle
- * @client: mailbox client to request the mailbox channel
   * @rproc: rproc handle
- * @core: cached pointer to r5 core structure being used
+ * @mem: internal memory regions data
+ * @num_mems: number of internal memory regions
   * @rmem: reserved memory regions data
   * @num_rmems: number of reserved memory regions
+ * @reset: reset control handle
+ * @tsp: TI-SCI processor control handle
+ * @ti_sci: TI-SCI handle
+ * @ti_sci_id: TI-SCI device identifier
+ * @mbox: mailbox channel handle
+ * @client: mailbox client to request the mailbox channel
+ * @priv: Remote processor private data
   */
  struct k3_r5_rproc {
struct device *dev;
-   struct k3_r5_cluster *cluster;
-   struct mbox_chan *mbox;
-   struct mbox_client client;
struct rproc *rproc;
-   struct k3_r5_core *core;
+   struct k3_r5_mem *mem;
+   int num_mems;
struct k3_r5_mem *rmem;
int num_rmems;
+   struct reset_control *reset;
+   struct ti_sci_proc *tsp;
+   const struct ti_sci_handle *ti_sci;
+   u32 ti_sci_id;
+   struct mbox_chan *mbox;
+   struct mbox_client client;
+   void *priv;
  };
  
  /**

@@ -244,48 +246,48 @@ static void k3_r5_rproc_kick(struct rproc *rproc, int 
vqid)
ret);
  }
  
-static int k3_r5_split_reset(struct k3_r5_core *core)

+static int k3_r5_split_reset(struct k3_r5_rproc *kproc)
  {
int ret;
  
-	ret = reset_control_assert(core->reset);

+   ret = reset_control_assert(kproc->reset);
if (ret) {
-   dev_err(core->dev, "local-reset assert failed, ret = %d\n",
+   dev_err(kproc->dev, "local-reset assert failed, ret = %d\n",
ret);
return ret;
}
  
-	ret = core->ti_sci->ops.dev_ops.put_device(core->ti_sci,

-  core->ti_sci_id);
+   ret = kproc->ti_sci->ops.dev_ops.put_device(kproc->ti_sci,
+   kproc->ti_sci_id);
if (ret) {
-   dev_err(core->dev, "module-reset assert failed, ret = %d\n",
+   dev_err(kproc->dev, "module-reset assert failed, ret = %d\n",
ret);
-   if (reset_control_deassert(core->reset))
-   dev_warn(core->dev, "local-reset deassert back 
failed\n");
+ 

Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread Michael S. Tsirkin
On Mon, Apr 07, 2025 at 03:28:13PM +0200, Cornelia Huck wrote:
> On Mon, Apr 07 2025, David Hildenbrand  wrote:
> 
> > On 07.04.25 15:12, Halil Pasic wrote:
> >> On Mon, 7 Apr 2025 04:34:29 -0400
> >> "Michael S. Tsirkin"  wrote:
> >> 
> >>> On Mon, Apr 07, 2025 at 10:17:10AM +0200, David Hildenbrand wrote:
>  On 07.04.25 09:52, Michael S. Tsirkin wrote:
> > On Fri, Apr 04, 2025 at 05:39:10PM +0200, Halil Pasic wrote:
> >>>
> >>> Not perfect, but AFAIKS, not horrible.
> >>
> >> It is like it is. QEMU does queue exist if the corresponding feature
> >> is offered by the device, and that is what we have to live with.
> >
> > I don't think we can live with this properly though.
> > It means a guest that does not know about some features
> > does not know where to find things.
> 
>  Please describe a real scenario, I'm missing the point.
> >>>
> >>>
> >>> OK so.
> >>>
> >>> Device has VIRTIO_BALLOON_F_FREE_PAGE_HINT and VIRTIO_BALLOON_F_REPORTING
> >>> Driver only knows about VIRTIO_BALLOON_F_REPORTING so
> >>> it does not know what does VIRTIO_BALLOON_F_FREE_PAGE_HINT do.
> >>> How does it know which vq to use for reporting?
> >>> It will try to use the free page hint one.
> >> 
> >> First, sorry for not catching up again with the discussion earlier.
> >> 
> >> I think David's point is based on the assumption that by the time feature
> >> with the feature bit N+1 is specified and allocates a queue Q, all
> >> queues with indexes smaller than Q are allocated and possibly associated
> >> with features that were previously specified (and probably have feature
> >> bits smaller than N+1).
> >> 
> >> I.e. that we can mandate, even if you don't want to care about other
> >> optional features, you have to, because we say so, for the matter of
> >> virtqueue existence. And anything in the future, you don't have to care
> >> about because the queue index associated with future features is larger
> >> than Q, so it does not affect our position.
> >> 
> >> I think that argument can fall a part if:
> >> * future features reference optional queues defined in the past
> >> * somebody managed to introduce a limbo where a feature is reserved, and
> >>they can not decide if they want a queue or not, or make the existence
> >>of the queue depend on something else than a feature bit.
> >
> > Staring at the cross-vmm, including the adding+removing of features and 
> > queues that are not in the spec, I am wondering if (in a world with 
> > fixed virtqueues)
> >
> > 1) Feature bits must be reserved before used.
> >
> > 2) Queue indices must be reserved before used.
> >
> > It all smells like a problem similar to device IDs ...
> 
> Indeed, we need a rule "reserve a feature bit/queue index before using
> it, even if you do not plan to spec it properly".


Reserving feature bits is something I do my best to advocate for
in all presentations I do.


-- 
MST




Re: [PATCH v9 03/26] remoteproc: k3-r5: Use k3_r5_rproc_mem_data structure for memory info

2025-04-07 Thread Andrew Davis

On 3/17/25 7:05 AM, Beleswar Padhi wrote:

The ti_k3_r5_remoteproc.c driver previously hardcoded device memory
region addresses and names. Change this to use the k3_r5_rproc_mem_data
structure to store memory information. This aligns with K3 DSP and M4
drivers, and can be refactored out later.

Signed-off-by: Beleswar Padhi 
---


Reviewed-by: Andrew Davis 


  drivers/remoteproc/ti_k3_r5_remoteproc.c | 65 
  1 file changed, 56 insertions(+), 9 deletions(-)

diff --git a/drivers/remoteproc/ti_k3_r5_remoteproc.c 
b/drivers/remoteproc/ti_k3_r5_remoteproc.c
index 525f26996b56..29205d9e21af 100644
--- a/drivers/remoteproc/ti_k3_r5_remoteproc.c
+++ b/drivers/remoteproc/ti_k3_r5_remoteproc.c
@@ -84,18 +84,44 @@ enum cluster_mode {
CLUSTER_MODE_SINGLECORE
  };
  
+/**

+ * struct k3_r5_mem_data - memory definitions for a R5
+ * @name: name for this memory entry
+ * @dev_addr: device address for the memory entry
+ */
+struct k3_r5_mem_data {
+   const char *name;
+   const u32 dev_addr;
+};
+
+/**
+ * struct k3_r5_dev_data - device data structure for a R5
+ * @mems: pointer to memory definitions for a R5
+ * @num_mems: number of memory regions in @mems
+ * @boot_align_addr: boot vector address alignment granularity
+ * @uses_lreset: flag to denote the need for local reset management
+ */
+struct k3_r5_dev_data {
+   const struct k3_r5_mem_data *mems;
+   u32 num_mems;
+   u32 boot_align_addr;
+   bool uses_lreset;
+};
+
  /**
   * struct k3_r5_soc_data - match data to handle SoC variations
   * @tcm_is_double: flag to denote the larger unified TCMs in certain modes
   * @tcm_ecc_autoinit: flag to denote the auto-initialization of TCMs for ECC
   * @single_cpu_mode: flag to denote if SoC/IP supports Single-CPU mode
   * @is_single_core: flag to denote if SoC/IP has only single core R5
+ * @core_data: pointer to R5-core-specific device data
   */
  struct k3_r5_soc_data {
bool tcm_is_double;
bool tcm_ecc_autoinit;
bool single_cpu_mode;
bool is_single_core;
+   const struct k3_r5_dev_data *core_data;
  };
  
  /**

@@ -151,6 +177,7 @@ struct k3_r5_core {
   * @rmem: reserved memory regions data
   * @num_rmems: number of reserved memory regions
   * @reset: reset control handle
+ * @data: pointer to R5-core-specific device data
   * @tsp: TI-SCI processor control handle
   * @ti_sci: TI-SCI handle
   * @ti_sci_id: TI-SCI device identifier
@@ -166,6 +193,7 @@ struct k3_r5_rproc {
struct k3_r5_mem *rmem;
int num_rmems;
struct reset_control *reset;
+   const struct k3_r5_dev_data *data;
struct ti_sci_proc *tsp;
const struct ti_sci_handle *ti_sci;
u32 ti_sci_id;
@@ -1207,31 +1235,32 @@ static int k3_r5_rproc_configure_mode(struct 
k3_r5_rproc *kproc)
  static int k3_r5_core_of_get_internal_memories(struct platform_device *pdev,
   struct k3_r5_rproc *kproc)
  {
-   static const char * const mem_names[] = {"atcm", "btcm"};
+   const struct k3_r5_dev_data *data = kproc->data;
struct device *dev = &pdev->dev;
struct k3_r5_core *core = kproc->priv;
struct resource *res;
int num_mems;
int i;
  
-	num_mems = ARRAY_SIZE(mem_names);

-   kproc->mem = devm_kcalloc(dev, num_mems, sizeof(*kproc->mem), 
GFP_KERNEL);
+   num_mems = kproc->data->num_mems;
+   kproc->mem = devm_kcalloc(kproc->dev, num_mems, sizeof(*kproc->mem),
+ GFP_KERNEL);
if (!kproc->mem)
return -ENOMEM;
  
  	for (i = 0; i < num_mems; i++) {

res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
-  mem_names[i]);
+  data->mems[i].name);
if (!res) {
dev_err(dev, "found no memory resource for %s\n",
-   mem_names[i]);
+   data->mems[i].name);
return -EINVAL;
}
if (!devm_request_mem_region(dev, res->start,
 resource_size(res),
 dev_name(dev))) {
dev_err(dev, "could not request %s region for 
resource\n",
-   mem_names[i]);
+   data->mems[i].name);
return -EBUSY;
}
  
@@ -1245,7 +1274,8 @@ static int k3_r5_core_of_get_internal_memories(struct platform_device *pdev,

kproc->mem[i].cpu_addr = devm_ioremap_wc(dev, res->start,
 resource_size(res));
if (!kproc->mem[i].cpu_addr) {
-   dev_err(dev, "failed to map %s memory\n", mem_names[i]);
+   dev_err(dev, "failed to map %s memory\n

Re: [PATCH v9 04/26] remoteproc: k3-{m4/dsp}: Align internal rproc data structure with R5

2025-04-07 Thread Andrew Davis

On 3/17/25 7:06 AM, Beleswar Padhi wrote:

Introduce a void pointer in the k3_{m4/dsp}_rproc internal data
structure which can be used to point to any private data needed by the
driver. Currently, the M4/DSP drivers do not have any private data, so
the pointer can be left pointing to NULL. Additionally, add a pointer to
the rproc struct within k3_m4_rproc internal struct. This is done to
align the data structures with R5 driver which can be factored out at a
later stage.



This does two things, even if trivial things; just make this into two patches.

Andrew


Signed-off-by: Beleswar Padhi 
---
  drivers/remoteproc/ti_k3_dsp_remoteproc.c | 2 ++
  drivers/remoteproc/ti_k3_m4_remoteproc.c  | 5 +
  2 files changed, 7 insertions(+)

diff --git a/drivers/remoteproc/ti_k3_dsp_remoteproc.c 
b/drivers/remoteproc/ti_k3_dsp_remoteproc.c
index a695890254ff..31e43e49f1e4 100644
--- a/drivers/remoteproc/ti_k3_dsp_remoteproc.c
+++ b/drivers/remoteproc/ti_k3_dsp_remoteproc.c
@@ -76,6 +76,7 @@ struct k3_dsp_dev_data {
   * @ti_sci_id: TI-SCI device identifier
   * @mbox: mailbox channel handle
   * @client: mailbox client to request the mailbox channel
+ * @priv: Remote processor private data
   */
  struct k3_dsp_rproc {
struct device *dev;
@@ -91,6 +92,7 @@ struct k3_dsp_rproc {
u32 ti_sci_id;
struct mbox_chan *mbox;
struct mbox_client client;
+   void *priv;
  };
  
  /**

diff --git a/drivers/remoteproc/ti_k3_m4_remoteproc.c 
b/drivers/remoteproc/ti_k3_m4_remoteproc.c
index a16fb165fced..d0ee7a8d460d 100644
--- a/drivers/remoteproc/ti_k3_m4_remoteproc.c
+++ b/drivers/remoteproc/ti_k3_m4_remoteproc.c
@@ -50,6 +50,7 @@ struct k3_m4_rproc_mem_data {
  /**
   * struct k3_m4_rproc - k3 remote processor driver structure
   * @dev: cached device pointer
+ * @rproc: remoteproc device handle
   * @mem: internal memory regions data
   * @num_mems: number of internal memory regions
   * @rmem: reserved memory regions data
@@ -60,9 +61,11 @@ struct k3_m4_rproc_mem_data {
   * @ti_sci_id: TI-SCI device identifier
   * @mbox: mailbox channel handle
   * @client: mailbox client to request the mailbox channel
+ * @priv: Remote processor private data
   */
  struct k3_m4_rproc {
struct device *dev;
+   struct rproc *rproc;
struct k3_m4_rproc_mem *mem;
int num_mems;
struct k3_m4_rproc_mem *rmem;
@@ -73,6 +76,7 @@ struct k3_m4_rproc {
u32 ti_sci_id;
struct mbox_chan *mbox;
struct mbox_client client;
+   void *priv;
  };
  
  /**

@@ -578,6 +582,7 @@ static int k3_m4_rproc_probe(struct platform_device *pdev)
rproc->recovery_disabled = true;
kproc = rproc->priv;
kproc->dev = dev;
+   kproc->rproc = rproc;
platform_set_drvdata(pdev, rproc);
  
  	kproc->ti_sci = devm_ti_sci_get_by_phandle(dev, "ti,sci");




Re: [PATCH v9 05/26] remoteproc: k3-m4: Use k3_rproc_mem_data structure for memory info

2025-04-07 Thread Andrew Davis

On 3/17/25 7:06 AM, Beleswar Padhi wrote:

The ti_k3_m4_remoteproc.c driver previously hardcoded device memory
region addresses and names. Change this to use the k3_rproc_mem_data
structure to store memory information. This aligns with DSP and R5
drivers, and can be refactored out later.

Signed-off-by: Beleswar Padhi 
---
  drivers/remoteproc/ti_k3_m4_remoteproc.c | 60 ++--
  1 file changed, 45 insertions(+), 15 deletions(-)

diff --git a/drivers/remoteproc/ti_k3_m4_remoteproc.c 
b/drivers/remoteproc/ti_k3_m4_remoteproc.c
index d0ee7a8d460d..e83bef7cfddf 100644
--- a/drivers/remoteproc/ti_k3_m4_remoteproc.c
+++ b/drivers/remoteproc/ti_k3_m4_remoteproc.c
@@ -20,9 +20,6 @@
  #include "remoteproc_internal.h"
  #include "ti_sci_proc.h"
  
-#define K3_M4_IRAM_DEV_ADDR 0x0

-#define K3_M4_DRAM_DEV_ADDR 0x30000
-


So two patches ago when you did this same thing for R5, you kept the
K3_R5_TCM_DEV_ADDR define. But here you remove the address #defines.
I don't care if you remove them or keep them, but just do the same
either way for both M4 and R5.

Andrew


  /**
   * struct k3_m4_rproc_mem - internal memory structure
   * @cpu_addr: MPU virtual address of the memory region
@@ -38,15 +35,29 @@ struct k3_m4_rproc_mem {
  };
  
  /**

- * struct k3_m4_rproc_mem_data - memory definitions for a remote processor
+ * struct k3_m4_mem_data - memory definitions for a remote processor
   * @name: name for this memory entry
   * @dev_addr: device address for the memory entry
   */
-struct k3_m4_rproc_mem_data {
+struct k3_m4_mem_data {
const char *name;
const u32 dev_addr;
  };
  
+/**

+ * struct k3_m4_dev_data - device data structure for a M4 core
+ * @mems: pointer to memory definitions for a M4 core
+ * @num_mems: number of memory regions in @mems
+ * @boot_align_addr: boot vector address alignment granularity
+ * @uses_lreset: flag to denote the need for local reset management
+ */
+struct k3_m4_dev_data {
+   const struct k3_m4_mem_data *mems;
+   u32 num_mems;
+   u32 boot_align_addr;
+   bool uses_lreset;
+};
+
  /**
   * struct k3_m4_rproc - k3 remote processor driver structure
   * @dev: cached device pointer
@@ -56,6 +67,7 @@ struct k3_m4_rproc_mem_data {
   * @rmem: reserved memory regions data
   * @num_rmems: number of reserved memory regions
   * @reset: reset control handle
+ * @data: pointer to M4-specific device data
   * @tsp: TI-SCI processor control handle
   * @ti_sci: TI-SCI handle
   * @ti_sci_id: TI-SCI device identifier
@@ -71,6 +83,7 @@ struct k3_m4_rproc {
struct k3_m4_rproc_mem *rmem;
int num_rmems;
struct reset_control *reset;
+   const struct k3_m4_dev_data *data;
struct ti_sci_proc *tsp;
const struct ti_sci_handle *ti_sci;
u32 ti_sci_id;
@@ -336,14 +349,13 @@ static void *k3_m4_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool
  static int k3_m4_rproc_of_get_memories(struct platform_device *pdev,
   struct k3_m4_rproc *kproc)
  {
-   static const char * const mem_names[] = { "iram", "dram" };
-   static const u32 mem_addrs[] = { K3_M4_IRAM_DEV_ADDR, K3_M4_DRAM_DEV_ADDR };
+   const struct k3_m4_dev_data *data = kproc->data;
struct device *dev = &pdev->dev;
struct resource *res;
int num_mems;
int i;
  
-	num_mems = ARRAY_SIZE(mem_names);

+   num_mems = kproc->data->num_mems;
kproc->mem = devm_kcalloc(kproc->dev, num_mems,
  sizeof(*kproc->mem), GFP_KERNEL);
if (!kproc->mem)
@@ -351,17 +363,17 @@ static int k3_m4_rproc_of_get_memories(struct platform_device *pdev,
  
  	for (i = 0; i < num_mems; i++) {

res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
-  mem_names[i]);
+  data->mems[i].name);
if (!res) {
dev_err(dev, "found no memory resource for %s\n",
-   mem_names[i]);
+   data->mems[i].name);
return -EINVAL;
}
if (!devm_request_mem_region(dev, res->start,
 resource_size(res),
 dev_name(dev))) {
dev_err(dev, "could not request %s region for 
resource\n",
-   mem_names[i]);
+   data->mems[i].name);
return -EBUSY;
}
  
@@ -369,15 +381,15 @@ static int k3_m4_rproc_of_get_memories(struct platform_device *pdev,

 resource_size(res));
if (!kproc->mem[i].cpu_addr) {
dev_err(dev, "failed to map %s memory\n",
-   mem_names[i]);
+   data->mems[i].name);

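For context, a minimal sketch of how such per-core match data is typically
instantiated and wired up. The table names, the boot_align_addr/uses_lreset
values and the of_device_get_match_data() hookup below are illustrative
assumptions, not taken from the posted series:

/* Illustrative sketch only; names and values are assumptions. */
static const struct k3_m4_mem_data am64_m4_mems[] = {
	{ .name = "iram", .dev_addr = 0x0 },
	{ .name = "dram", .dev_addr = 0x30000 },
};

static const struct k3_m4_dev_data am64_m4_data = {
	.mems = am64_m4_mems,
	.num_mems = ARRAY_SIZE(am64_m4_mems),
	.boot_align_addr = SZ_1K,	/* placeholder */
	.uses_lreset = true,		/* placeholder */
};

/* At probe time the driver would then look this up, e.g.: */
kproc->data = of_device_get_match_data(dev);
if (!kproc->data)
	return -ENODEV;
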
Re: [PATCH v3 0/6] KVM: guest_memfd: support for uffd minor

2025-04-07 Thread David Hildenbrand

On 07.04.25 17:14, Lorenzo Stoakes wrote:

On Mon, Apr 07, 2025 at 04:46:48PM +0200, David Hildenbrand wrote:

On 07.04.25 16:24, Liam R. Howlett wrote:

* Nikita Kalyazin  [250407 10:05]:




...



All of this is extremely confusing because the onus of figuring out what
the final code will look like is put on the reviewer.  As it is, we have
issues with people not doing enough review of the code (due to limited
time).  One way to get reviews is to make the barrier of entry as low as
possible.

I spent Friday going down a rabbit hole of patches referring to each
other as dependencies and I gave up.  It looks like I mistook one set of
patches as required vs them requiring the same in-flight ones as your
patches.

I am struggling to see how we can adequately support all of you given
the way the patches are sent out in batches with dependencies - it is
just too time consuming to sort out.


I'm happy to do whatever I can to make the review easier.  I suppose the
extreme case is to wait for the dependencies to get accepted, effectively
serialising submissions, but that slows the process down significantly.  For
example, I received very good feedback on v1 and v2 of this series and was
able to address it instead of waiting for the dependency.  Would including
the required patches directly in the series help?  My only concern is in
that case the same patch will be submitted multiple times (as a part of
every depending series), but if it's better, I'll be doing that instead.


Don't resend patches that someone else is upstreaming, that'll cause
other problems.

Three methods come to mind:

1. As you stated, wait for the dependencies to land.  This will mean
what you are working against is well tested and won't change (and you
won't have to re-spin due to an unstable base).

2. Combine them into a bigger patch set.  I can then pull one patch set
and look at the parts of interest to the mm side.

3. Provide a git repo with the necessary changes together.

I think 2 and 3 together should be used for the guest_memfd patches.
Someone needs to be managing these to send upstream.  See the discussion
in another patch set on guest_memfd here [1].


The issue is that most extensions are fairly independent from each other,
except that they build on top of Fuad's mmap support,

Sending all together as one thing might not be the best option.

Once basic mmap support is upstream, some of the extensions (e.g., directmap
removal) can go in next.

So until that is upstream, I agree that tagging the stuff that builds up on
that is the right thing to do, and providing git trees is another very good
idea.

I'll prioritize getting Fuad's mmap stuff reviewed. (I keep saying that, I
know)


Which series is this? Sorry maybe lost track of this one.


Heh, not your fault :)

The most important one for basic mmap support is "KVM: Mapping 
guest_memfd backed memory at the host for software protected VMs" [1]. 
Some stuff (e.g., direct map removal) should be able to make progress 
once that landed.


I do expect the MM-specific patch in there ("mm: Consolidate freeing of 
typed folios on final folio_put()") to not be included as part of that work.


[I shared the feedback from the LSF/MM session in the upstream 
guest_memfd call, and we decided to minimize the usage of the 
folio_put() callback to where absolutely required; that will simplify 
things and avoid issues as pointed out by Willy, which is great]


The next important one will be "[PATCH v7 0/7] KVM: Restricted mapping 
of guest_memfd at the host and arm64 support" [2], but I similarly 
expect a simplification as we try moving away from folio_put() for the 
"shared <-> private" page conversion case.



So I expect a v8 of [1] (and that also [2] needs to be updated).

@Fuad, please let me know if I am wrong.

[1] 
https://lore.kernel.org/all/20250318161823.4005529-1-ta...@google.com/T/#u

[2] https://lore.kernel.org/all/20250328153133.3504118-1-ta...@google.com/

--
Cheers,

David / dhildenb




Re: [PATCH v8 5/8] vhost: Reintroduce kthread mode support in vhost

2025-04-07 Thread Mike Christie
On 3/28/25 5:02 AM, Cindy Lu wrote:
> +static int vhost_kthread_worker_create(struct vhost_worker *worker,
> +struct vhost_dev *dev, const char *name)
> +{
> + struct task_struct *task;
> + u32 id;
> + int ret;
> +
> + task = kthread_create(vhost_run_work_kthread_list, worker, "%s", name);
> + if (IS_ERR(task))
> + return PTR_ERR(task);
> +
> + worker->kthread_task = task;
> + wake_up_process(task);
> + ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
> + if (ret < 0)
> + goto stop_worker;
> +
> + ret = vhost_attach_task_to_cgroups(worker);
> + if (ret)

If you go to stop_worker here, it will leave the worker in the xa above. I
think you need another goto to unwind that.

> + goto stop_worker;
> +
> + worker->id = id;
> + return 0;
> +
> +stop_worker:
> + vhost_kthread_do_stop(worker);
> + return ret;
> +}
> +

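For illustration, one possible shape of the unwind being asked for above,
assuming xa_erase() is the right undo for the earlier xa_alloc(); this is a
sketch against the quoted patch, not the actual respin:

/* Sketch: unwind the xa_alloc() if the cgroup attach fails. */
static int vhost_kthread_worker_create(struct vhost_worker *worker,
				       struct vhost_dev *dev, const char *name)
{
	struct task_struct *task;
	u32 id;
	int ret;

	task = kthread_create(vhost_run_work_kthread_list, worker, "%s", name);
	if (IS_ERR(task))
		return PTR_ERR(task);

	worker->kthread_task = task;
	wake_up_process(task);
	ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
	if (ret < 0)
		goto stop_worker;

	ret = vhost_attach_task_to_cgroups(worker);
	if (ret)
		goto erase_xa;		/* undo the xa_alloc() before stopping */

	worker->id = id;
	return 0;

erase_xa:
	xa_erase(&dev->worker_xa, id);
stop_worker:
	vhost_kthread_do_stop(worker);
	return ret;
}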


[PATCH v2 27/32] tools/nolibc: add snprintf() and friends

2025-04-07 Thread Thomas Weißschuh
Add more of the printf() functions.

Signed-off-by: Thomas Weißschuh 
Acked-by: Willy Tarreau 
---
 tools/include/nolibc/stdio.h | 55 
 1 file changed, 55 insertions(+)

diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h
index 
b17b473bd8751a6283309178b4848e61e1683305..46bd90f96d654fadda20292baddc98358a3afc62
 100644
--- a/tools/include/nolibc/stdio.h
+++ b/tools/include/nolibc/stdio.h
@@ -389,6 +389,61 @@ int dprintf(int fd, const char *fmt, ...)
va_start(args, fmt);
ret = vdprintf(fd, fmt, args);
va_end(args);
+
+   return ret;
+}
+
+static int __nolibc_sprintf_cb(intptr_t _state, const char *buf, size_t size)
+{
+   char **state = (char **)_state;
+
+   memcpy(*state, buf, size);
+   *state += size;
+   return 0;
+}
+
+static __attribute__((unused, format(printf, 3, 0)))
+int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
+{
+   char *state = buf;
+   int ret;
+
+   ret = __nolibc_printf(__nolibc_sprintf_cb, (intptr_t)&state, size, fmt, args);
+   if (ret < 0)
+   return ret;
+   buf[(size_t)ret < size ? (size_t)ret : size - 1] = '\0';
+   return ret;
+}
+
+static __attribute__((unused, format(printf, 3, 4)))
+int snprintf(char *buf, size_t size, const char *fmt, ...)
+{
+   va_list args;
+   int ret;
+
+   va_start(args, fmt);
+   ret = vsnprintf(buf, size, fmt, args);
+   va_end(args);
+
+   return ret;
+}
+
+static __attribute__((unused, format(printf, 2, 0)))
+int vsprintf(char *buf, const char *fmt, va_list args)
+{
+   return vsnprintf(buf, SIZE_MAX, fmt, args);
+}
+
+static __attribute__((unused, format(printf, 2, 3)))
+int sprintf(char *buf, const char *fmt, ...)
+{
+   va_list args;
+   int ret;
+
+   va_start(args, fmt);
+   ret = vsprintf(buf, fmt, args);
+   va_end(args);
+
return ret;
 }
 

-- 
2.49.0
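
As a usage note, the new functions follow the usual C snprintf() contract:
the return value is the full formatted length even when the output was
truncated. A small hypothetical caller, not part of the patch:

#include <stdio.h>	/* nolibc or libc; the contract is the same */

int main(void)
{
	char buf[8];
	int n;

	n = snprintf(buf, sizeof(buf), "pid=%d", 123456);
	/* buf now holds "pid=123" plus the terminator, and n == 10 */
	if (n < 0 || (size_t)n >= sizeof(buf))
		printf("output truncated, needed %d bytes\n", n + 1);
	return 0;
}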




Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread David Hildenbrand

On 06.04.25 20:42, Michael S. Tsirkin wrote:

On Fri, Apr 04, 2025 at 03:48:49PM +0200, David Hildenbrand wrote:

On 04.04.25 15:36, Halil Pasic wrote:

On Fri, 4 Apr 2025 12:55:09 +0200
David Hildenbrand  wrote:


For virito-balloon, we should probably do the following:

   From 38e340c2bb53c2a7cc7c675f5dfdd44ecf7701d9 Mon Sep 17 00:00:00 2001
From: David Hildenbrand 
Date: Fri, 4 Apr 2025 12:53:16 +0200
Subject: [PATCH] virtio-balloon: Fix queue index assignment for
non-existing queues

Signed-off-by: David Hildenbrand 
---
device-types/balloon/description.tex | 22 --
1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/device-types/balloon/description.tex 
b/device-types/balloon/description.tex
index a1d9603..a7396ff 100644
--- a/device-types/balloon/description.tex
+++ b/device-types/balloon/description.tex
@@ -16,6 +16,21 @@ \subsection{Device ID}\label{sec:Device Types / Memory 
Balloon Device / Device I
  5
\subsection{Virtqueues}\label{sec:Device Types / Memory Balloon Device / 
Virtqueues}
+
+\begin{description}
+\item[inflateq] Exists unconditionally.
+\item[deflateq] Exists unconditionally.
+\item[statsq] Only exists if VIRTIO_BALLOON_F_STATS_VQ is set.
+\item[free_page_vq] Only exists if VIRTIO_BALLOON_F_FREE_PAGE_HINT is set.
+\item[reporting_vq] Only exists if VIRTIO_BALLOON_F_PAGE_REPORTING is set.


s/is set/is negotiated/?

I think we should stick to "feature is offered" and "feature is
negotiated".


+\end{description}
+
+\begin{note}
+Virtqueue indexes are assigned sequentially for existing queues, starting
+with index 0; consequently, if a virtqueue does not exist, it does not get
+an index assigned. Assuming all virtqueues exist for a device, the indexes
+are:
+
\begin{description}
\item[0] inflateq
\item[1] deflateq
@@ -23,12 +38,7 @@ \subsection{Virtqueues}\label{sec:Device Types / Memory 
Balloon Device / Virtque
\item[3] free_page_vq
\item[4] reporting_vq
\end{description}
-
-  statsq only exists if VIRTIO_BALLOON_F_STATS_VQ is set.
-
-  free_page_vq only exists if VIRTIO_BALLOON_F_FREE_PAGE_HINT is set.
-
-  reporting_vq only exists if VIRTIO_BALLOON_F_PAGE_REPORTING is set.
+\end{note}
\subsection{Feature bits}\label{sec:Device Types / Memory Balloon Device / 
Feature bits}
\begin{description}


Sounds good to me! But I'm still a little confused by the "holes". What
confuses me is that I can think of at least 2 distinct types of "holes":
1) Holes that can be filled later. The queue conceptually exists, but
 there is no need to back it with any resources for now because it is
 dormant (it can be seen as a hole in comparison to queues that need to
materialize -- vring, notifiers, ...)
2) Holes that can not be filled without resetting the device: i.e. if
 certain features are not negotiated, then a queue X does not exist,
 but subsequent queues retain their index.


I think it is not about "negotiated", that might be the wrong terminology.

E.g., in QEMU virtio_balloon_device_realize() we define the virtqueues
(virtio_add_queue()) if virtio_has_feature(s->host_features).

That is, it's independent of a feature negotiation (IIUC), it's static for
the device --  "host_features"



No no that is a bad idea. Breaks forward compatibility.

Oh my. I did not realize. It is really broken hopelessly.

Because, note, the guest looks at the guest features :)


Can you elaborate why?

statsq = 2

free_page_vq = statsq + host_offered_feat(VIRTIO_BALLOON_F_STATS_VQ)

reporting_vq = free_page_vq + 
host_offered_feat(VIRTIO_BALLOON_F_FREE_PAGE_HINT)



Independent of any upcoming features. And if a new feature defines a new 
virtqueue


new_vq = reporting_vq +  host_offered_feat(VIRTIO_BALLOON_F_PAGE_REPORTING)

We only have to make sure in the spec that these calculations always hold.

Querying of the host offered features already happens as part of 
determining the actual guest usable feature (driver_offered & host_offered).

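A compact sketch of that calculation, using the device-offered feature bits
rather than the negotiated ones; the helper name is made up for illustration:

/* Sketch only: derive balloon virtqueue indexes from host-offered features. */
static void balloon_vq_indexes(u64 host_features,
			       unsigned int *free_page_vq,
			       unsigned int *reporting_vq)
{
	const unsigned int statsq = 2;	/* inflateq = 0, deflateq = 1 */

	*free_page_vq = statsq +
		!!(host_features & BIT_ULL(VIRTIO_BALLOON_F_STATS_VQ));
	*reporting_vq = *free_page_vq +
		!!(host_features & BIT_ULL(VIRTIO_BALLOON_F_FREE_PAGE_HINT));
}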



Now I am beginning to think we should leave the spec alone
and fix the drivers ... Ugh 


We could always say that starting with feature X, queue indexes are 
fixed again. E.g., VIRTIO_BALLOON_F_X would have it's virtqueue fixed at 
index 5, independent of the other (older) features where the virtqueue 
indexes are determined like today.


Won't make the implementation easier, though, I'm afraid.

(I also thought about a way to query the virtqueue index for a feature, 
but that's probably overengineering)


--
Cheers,

David / dhildenb




[PATCH v2 12/32] selftests: harness: Stop using setjmp()/longjmp()

2025-04-07 Thread Thomas Weißschuh
Usage of longjmp() was added to ensure that teardown is always run in
commit 63e6b2a42342 ("selftests/harness: Run TEARDOWN for ASSERT failures")
However instead of calling longjmp() to the teardown handler it is easier to
just call the teardown handler directly from __bail().
Any potential duplicate teardown invocations are harmless as the actual
handler will only ever be executed once since
commit fff37bd32c76 ("selftests/harness: Fix fixture teardown").

Additionally this removes an incompatibility with nolibc,
which does not support setjmp()/longjmp().

Signed-off-by: Thomas Weißschuh 
---
 tools/testing/selftests/kselftest_harness.h | 49 ++---
 1 file changed, 17 insertions(+), 32 deletions(-)

diff --git a/tools/testing/selftests/kselftest_harness.h 
b/tools/testing/selftests/kselftest_harness.h
index 
5373b8da8886aef5df3368aeff95080636ae2343..1e584f39a42023c400988dea96f0274d4dc3645b
 100644
--- a/tools/testing/selftests/kselftest_harness.h
+++ b/tools/testing/selftests/kselftest_harness.h
@@ -67,7 +67,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include "kselftest.h"
 
@@ -178,9 +177,7 @@
struct __test_metadata *_metadata, \
struct __fixture_variant_metadata __attribute__((unused)) 
*variant) \
{ \
-   if (setjmp(_metadata->env) == 0) \
-   test_name(_metadata, NULL, NULL); \
-   __test_check_assert(_metadata); \
+   test_name(_metadata, NULL, NULL); \
} \
static struct __test_metadata _##test_name##_object = \
{ .name = #test_name, \
@@ -425,24 +422,20 @@
self = &self_private; \
} \
} \
-   if (setjmp(_metadata->env) == 0) { \
-   /* _metadata and potentially self are shared with all 
forks. */ \
-   child = fork(); \
-   if (child == 0) { \
-   fixture_name##_setup(_metadata, self, 
variant->data); \
-   /* Let setup failure terminate early. */ \
-   if (_metadata->exit_code) \
-   _exit(0); \
-   *_metadata->no_teardown = false; \
-   fixture_name##_##test_name(_metadata, self, 
variant->data); \
-   } else if (child < 0 || child != waitpid(child, 
&status, 0)) { \
-   ksft_print_msg("ERROR SPAWNING TEST 
GRANDCHILD\n"); \
-   _metadata->exit_code = KSFT_FAIL; \
-   } \
-   } \
+   /* _metadata and potentially self are shared with all forks. */ 
\
+   child = fork(); \
if (child == 0) { \
+   fixture_name##_setup(_metadata, self, variant->data); \
+   /* Let setup failure terminate early. */ \
+   if (_metadata->exit_code) \
+   _exit(0); \
+   *_metadata->no_teardown = false; \
+   fixture_name##_##test_name(_metadata, self, 
variant->data); \
_metadata->teardown_fn(false, _metadata, self, 
variant->data); \
_exit(0); \
+   } else if (child < 0 || child != waitpid(child, &status, 0)) { \
+   ksft_print_msg("ERROR SPAWNING TEST GRANDCHILD\n"); \
+   _metadata->exit_code = KSFT_FAIL; \
} \
_metadata->teardown_fn(true, _metadata, self, variant->data); \
munmap(_metadata->no_teardown, 
sizeof(*_metadata->no_teardown)); \
@@ -456,7 +449,6 @@
/* Forward signal to __wait_for_test(). */ \
kill(getpid(), WTERMSIG(status)); \
} \
-   __test_check_assert(_metadata); \
} \
static void wrapper_##fixture_name##_##test_name##_teardown( \
bool in_parent, struct __test_metadata *_metadata, \
@@ -757,7 +749,7 @@
  */
 #define OPTIONAL_HANDLER(_assert) \
for (; _metadata->trigger; _metadata->trigger = \
-   __bail(_assert, _metadata))
+   __bail(_assert, _metadata, self, variant))
 
 #define is_signed_type(var)   (!!(((__typeof__(var))(-1)) < 
(__typeof__(var))1))
 
@@ -927,7 +919,6 @@ struct __test_metadata {
int timeout;/* seconds to wait for test timeout */
bool aborted;   /* stopped test due to failed ASSERT */
bool *no_teardown; /* fixture needs teardown */
-   jmp_buf env;/* for exiting out of test early */
struct __test_results *results;
struct __test_metadata *prev, *next;
 };
@@ -957,23 +948,18 @@ static inline void __register_xfail(struct __test_xfail 
*xf)
__LIST_APPEND(xf->variant->xfails, xf);
 }
 
-

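The "duplicate teardown is harmless" argument relies on the run-once guard
added by commit fff37bd32c76; a stripped-down userspace illustration of that
pattern (the names below are not the harness's own):

#include <stdbool.h>
#include <stdio.h>

static bool teardown_done;

static void fixture_teardown(void)
{
	if (teardown_done)		/* a second call becomes a no-op */
		return;
	teardown_done = true;
	printf("teardown ran\n");
}

int main(void)
{
	fixture_teardown();	/* e.g. invoked from __bail() on an ASSERT failure */
	fixture_teardown();	/* e.g. invoked again on the normal exit path */
	return 0;
}
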
Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread Michael S. Tsirkin
On Fri, Apr 04, 2025 at 05:39:10PM +0200, Halil Pasic wrote:
> > 
> > Not perfect, but AFAIKS, not horrible.
> 
> It is like it is. QEMU does queue exist if the corresponding feature
> is offered by the device, and that is what we have to live with.

I don't think we can live with this properly though.
It means a guest that does not know about some features
does not know where to find things.

So now, I am inclined to add linux code to work with current qemu and
with spec compliant one, and add qemu code to work with current linux
and spec compliant one.

Document the bug in the spec, maybe, in a non conformance section.

-- 
MST




Re: [PATCH v8 4/8] vhost: Introduce vhost_worker_ops in vhost_worker

2025-04-07 Thread Stefano Garzarella
On Mon, 7 Apr 2025 at 05:14, Cindy Lu  wrote:
>
> On Tue, Apr 1, 2025 at 9:48 PM Stefano Garzarella  wrote:
> >
> > On Fri, Mar 28, 2025 at 06:02:48PM +0800, Cindy Lu wrote:
> > >Abstract vhost worker operations (create/stop/wakeup) into an ops
> > >structure to prepare for kthread mode support.
> > >
> > >Signed-off-by: Cindy Lu 
> > >---
> > > drivers/vhost/vhost.c | 63 ++-
> > > drivers/vhost/vhost.h | 11 
> > > 2 files changed, 56 insertions(+), 18 deletions(-)
> > >
> > >diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > >index 20571bd6f7bd..c162ad772f8f 100644
> > >--- a/drivers/vhost/vhost.c
> > >+++ b/drivers/vhost/vhost.c
> > >@@ -243,7 +243,7 @@ static void vhost_worker_queue(struct vhost_worker 
> > >*worker,
> > >* test_and_set_bit() implies a memory barrier.
> > >*/
> > >   llist_add(&work->node, &worker->work_list);
> > >-  vhost_task_wake(worker->vtsk);
> > >+  worker->ops->wakeup(worker);
> > >   }
> > > }
> > >
> > >@@ -706,7 +706,7 @@ static void vhost_worker_destroy(struct vhost_dev *dev,
> > >
> > >   WARN_ON(!llist_empty(&worker->work_list));
> > >   xa_erase(&dev->worker_xa, worker->id);
> > >-  vhost_task_stop(worker->vtsk);
> > >+  worker->ops->stop(worker);
> > >   kfree(worker);
> > > }
> > >
> > >@@ -729,42 +729,69 @@ static void vhost_workers_free(struct vhost_dev *dev)
> > >   xa_destroy(&dev->worker_xa);
> > > }
> > >
> > >+static void vhost_task_wakeup(struct vhost_worker *worker)
> > >+{
> > >+  return vhost_task_wake(worker->vtsk);
> > >+}
> > >+
> > >+static void vhost_task_do_stop(struct vhost_worker *worker)
> > >+{
> > >+  return vhost_task_stop(worker->vtsk);
> > >+}
> > >+
> > >+static int vhost_task_worker_create(struct vhost_worker *worker,
> > >+  struct vhost_dev *dev, const char *name)
> > >+{
> > >+  struct vhost_task *vtsk;
> > >+  u32 id;
> > >+  int ret;
> > >+
> > >+  vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
> > >+   worker, name);
> > >+  if (IS_ERR(vtsk))
> > >+  return PTR_ERR(vtsk);
> > >+
> > >+  worker->vtsk = vtsk;
> > >+  vhost_task_start(vtsk);
> > >+  ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, 
> > >GFP_KERNEL);
> > >+  if (ret < 0) {
> > >+  vhost_task_do_stop(worker);
> > >+  return ret;
> > >+  }
> >
> > In the final code, xa_alloc() is duplicated among the functions that
> > create ktrhead or task, might it make sense to leave it out and do it in
> > vhost_worker_create() ?
> >
> > Thanks,
> > Stefano
> >
> Thanks a lot Stefano. I previously tried moving xa_alloc() out, but
> that made the code strange.
> I think keeping xa_alloc() in the create_ops function completes the
> job in  a single function, and maybe it could be used in some other
> functions in the future

Sure, if you tried, and it doesn't add benefits, that's perfectly fine
to ignore this suggestion! ;-)

Thanks,
Stefano

> thanks
> cindy
>
> > >+  worker->id = id;
> > >+  return 0;
> > >+}
> > >+
> > >+static const struct vhost_worker_ops vhost_task_ops = {
> > >+  .create = vhost_task_worker_create,
> > >+  .stop = vhost_task_do_stop,
> > >+  .wakeup = vhost_task_wakeup,
> > >+};
> > >+
> > > static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
> > > {
> > >   struct vhost_worker *worker;
> > >-  struct vhost_task *vtsk;
> > >   char name[TASK_COMM_LEN];
> > >   int ret;
> > >-  u32 id;
> > >+  const struct vhost_worker_ops *ops = &vhost_task_ops;
> > >
> > >   worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
> > >   if (!worker)
> > >   return NULL;
> > >
> > >   worker->dev = dev;
> > >+  worker->ops = ops;
> > >   snprintf(name, sizeof(name), "vhost-%d", current->pid);
> > >
> > >-  vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
> > >-   worker, name);
> > >-  if (IS_ERR(vtsk))
> > >-  goto free_worker;
> > >-
> > >   mutex_init(&worker->mutex);
> > >   init_llist_head(&worker->work_list);
> > >   worker->kcov_handle = kcov_common_handle();
> > >-  worker->vtsk = vtsk;
> > >-
> > >-  vhost_task_start(vtsk);
> > >-
> > >-  ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, 
> > >GFP_KERNEL);
> > >+  ret = ops->create(worker, dev, name);
> > >   if (ret < 0)
> > >-  goto stop_worker;
> > >-  worker->id = id;
> > >+  goto free_worker;
> > >
> > >   return worker;
> > >
> > >-stop_worker:
> > >-  vhost_task_stop(vtsk);
> > > free_worker:
> > >   kfree(worker);
> > >   return NULL;
> > >diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > >index 19bb94922a0e..98895e299efa 100644
> > >--- a/drivers/vhost/vhost.h

[PATCH] kunit: qemu_configs: Add riscv32 config

2025-04-07 Thread Thomas Weißschuh
Add a basic config to run kunit tests on riscv32.

Signed-off-by: Thomas Weißschuh 
---
 tools/testing/kunit/qemu_configs/riscv32.py | 17 +
 1 file changed, 17 insertions(+)

diff --git a/tools/testing/kunit/qemu_configs/riscv32.py 
b/tools/testing/kunit/qemu_configs/riscv32.py
new file mode 100644
index 
..b79ba0ae30f8573035b3401be337b379eba97e26
--- /dev/null
+++ b/tools/testing/kunit/qemu_configs/riscv32.py
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+
+from ..qemu_config import QemuArchParams
+
+QEMU_ARCH = QemuArchParams(linux_arch='riscv',
+  kconfig='''
+CONFIG_NONPORTABLE=y
+CONFIG_ARCH_RV32I=y
+CONFIG_ARCH_VIRT=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_OF_PLATFORM=y
+''',
+  qemu_arch='riscv32',
+  kernel_path='arch/riscv/boot/Image',
+  kernel_command_line='console=ttyS0',
+  extra_qemu_params=['-machine', 'virt'])

---
base-commit: 0af2f6be1b4281385b618cb86ad946eded089ac8
change-id: 20250214-kunit-qemu-riscv32-fb38d659c373

Best regards,
-- 
Thomas Weißschuh 




Re: [PATCH v8 4/8] vhost: Introduce vhost_worker_ops in vhost_worker

2025-04-07 Thread Michael S. Tsirkin
On Fri, Mar 28, 2025 at 06:02:48PM +0800, Cindy Lu wrote:
> Abstract vhost worker operations (create/stop/wakeup) into an ops
> structure to prepare for kthread mode support.
> 
> Signed-off-by: Cindy Lu 

I worry about the overhead of indirect calls here.

We have the wrappers, and only two options,
why did you decide to add it like this,
with ops?



> ---
>  drivers/vhost/vhost.c | 63 ++-
>  drivers/vhost/vhost.h | 11 
>  2 files changed, 56 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 20571bd6f7bd..c162ad772f8f 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -243,7 +243,7 @@ static void vhost_worker_queue(struct vhost_worker 
> *worker,
>* test_and_set_bit() implies a memory barrier.
>*/
>   llist_add(&work->node, &worker->work_list);
> - vhost_task_wake(worker->vtsk);
> + worker->ops->wakeup(worker);
>   }
>  }
>  
> @@ -706,7 +706,7 @@ static void vhost_worker_destroy(struct vhost_dev *dev,
>  
>   WARN_ON(!llist_empty(&worker->work_list));
>   xa_erase(&dev->worker_xa, worker->id);
> - vhost_task_stop(worker->vtsk);
> + worker->ops->stop(worker);
>   kfree(worker);
>  }
>  
> @@ -729,42 +729,69 @@ static void vhost_workers_free(struct vhost_dev *dev)
>   xa_destroy(&dev->worker_xa);
>  }
>  
> +static void vhost_task_wakeup(struct vhost_worker *worker)
> +{
> + return vhost_task_wake(worker->vtsk);
> +}
> +
> +static void vhost_task_do_stop(struct vhost_worker *worker)
> +{
> + return vhost_task_stop(worker->vtsk);
> +}
> +
> +static int vhost_task_worker_create(struct vhost_worker *worker,
> + struct vhost_dev *dev, const char *name)
> +{
> + struct vhost_task *vtsk;
> + u32 id;
> + int ret;
> +
> + vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
> +  worker, name);
> + if (IS_ERR(vtsk))
> + return PTR_ERR(vtsk);
> +
> + worker->vtsk = vtsk;
> + vhost_task_start(vtsk);
> + ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
> + if (ret < 0) {
> + vhost_task_do_stop(worker);
> + return ret;
> + }
> + worker->id = id;
> + return 0;
> +}
> +
> +static const struct vhost_worker_ops vhost_task_ops = {
> + .create = vhost_task_worker_create,
> + .stop = vhost_task_do_stop,
> + .wakeup = vhost_task_wakeup,
> +};
> +
>  static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
>  {
>   struct vhost_worker *worker;
> - struct vhost_task *vtsk;
>   char name[TASK_COMM_LEN];
>   int ret;
> - u32 id;
> + const struct vhost_worker_ops *ops = &vhost_task_ops;
>  
>   worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
>   if (!worker)
>   return NULL;
>  
>   worker->dev = dev;
> + worker->ops = ops;
>   snprintf(name, sizeof(name), "vhost-%d", current->pid);
>  
> - vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
> -  worker, name);
> - if (IS_ERR(vtsk))
> - goto free_worker;
> -
>   mutex_init(&worker->mutex);
>   init_llist_head(&worker->work_list);
>   worker->kcov_handle = kcov_common_handle();
> - worker->vtsk = vtsk;
> -
> - vhost_task_start(vtsk);
> -
> - ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
> + ret = ops->create(worker, dev, name);
>   if (ret < 0)
> - goto stop_worker;
> - worker->id = id;
> + goto free_worker;
>  
>   return worker;
>  
> -stop_worker:
> - vhost_task_stop(vtsk);
>  free_worker:
>   kfree(worker);
>   return NULL;
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 19bb94922a0e..98895e299efa 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -26,6 +26,16 @@ struct vhost_work {
>   unsigned long   flags;
>  };
>  
> +struct vhost_worker;
> +struct vhost_dev;
> +
> +struct vhost_worker_ops {
> + int (*create)(struct vhost_worker *worker, struct vhost_dev *dev,
> +   const char *name);
> + void (*stop)(struct vhost_worker *worker);
> + void (*wakeup)(struct vhost_worker *worker);
> +};
> +
>  struct vhost_worker {
>   struct vhost_task   *vtsk;
>   struct vhost_dev*dev;
> @@ -36,6 +46,7 @@ struct vhost_worker {
>   u32 id;
>   int attachment_cnt;
>   boolkilled;
> + const struct vhost_worker_ops *ops;
>  };
>  
>  /* Poll a file (eventfd or socket) */
> -- 
> 2.45.0




Re: [PATCH 13/19] virtio_ring: introduce virtqueue ops

2025-04-07 Thread Michael S. Tsirkin
On Mon, Mar 24, 2025 at 02:01:21PM +0800, Jason Wang wrote:
> This patch introduces virtqueue ops which is a set of the callbacks
> that will be called for different queue layout or features. This would
> help to avoid branches for split/packed and will ease the future
> implementation like in order.
> 
> Signed-off-by: Jason Wang 




> ---
>  drivers/virtio/virtio_ring.c | 96 +---
>  1 file changed, 67 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index a2884eae14d9..ce1dc90ee89d 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -159,9 +159,30 @@ struct vring_virtqueue_packed {
>   size_t event_size_in_bytes;
>  };
>  
> +struct vring_virtqueue;
> +
> +struct virtqueue_ops {
> + int (*add)(struct vring_virtqueue *_vq, struct scatterlist *sgs[],
> +unsigned int total_sg, unsigned int out_sgs,
> +unsigned int in_sgs, void *data,
> +void *ctx, bool premapped, gfp_t gfp);
> + void *(*get)(struct vring_virtqueue *vq, unsigned int *len, void **ctx);
> + bool (*kick_prepare)(struct vring_virtqueue *vq);
> + void (*disable_cb)(struct vring_virtqueue *vq);
> + bool (*enable_cb_delayed)(struct vring_virtqueue *vq);
> + unsigned int (*enable_cb_prepare)(struct vring_virtqueue *vq);
> + bool (*poll)(const struct vring_virtqueue *vq, u16 last_used_idx);
> + void *(*detach_unused_buf)(struct vring_virtqueue *vq);
> + bool (*more_used)(const struct vring_virtqueue *vq);
> + int (*resize)(struct vring_virtqueue *vq, u32 num);
> + void (*reset)(struct vring_virtqueue *vq);
> +};

I like it that it's organized but
I worry about the overhead of indirect calls here.
How about a switch statement instead?

struct vring_virtqueue {
enum vring_virtqueue_ops ops;

}


@@ -2248,10 +2303,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 {
  struct vring_virtqueue *vq = to_vvq(_vq);

switch (vq->ops) {
 VQ_PACKED:
 VQ_SPLIT:
 VQ_IN_ORDER:
}


}


What do you think?
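
To make that concrete, a rough sketch of what the switch-based dispatch could
look like for one entry point; the enum, the "layout" field and the in-order
helper are hypothetical, only the split/packed helpers come from the patch:

enum vring_vq_layout {
	VQ_SPLIT,
	VQ_PACKED,
	VQ_IN_ORDER,	/* hypothetical future variant */
};

static inline int virtqueue_add(struct virtqueue *_vq, struct scatterlist *sgs[],
				unsigned int total_sg, unsigned int out_sgs,
				unsigned int in_sgs, void *data, void *ctx,
				bool premapped, gfp_t gfp)
{
	struct vring_virtqueue *vq = to_vvq(_vq);

	switch (vq->layout) {
	case VQ_PACKED:
		return virtqueue_add_packed(vq, sgs, total_sg, out_sgs, in_sgs,
					    data, ctx, premapped, gfp);
	case VQ_IN_ORDER:
		return virtqueue_add_in_order(vq, sgs, total_sg, out_sgs, in_sgs,
					      data, ctx, premapped, gfp);
	case VQ_SPLIT:
	default:
		return virtqueue_add_split(vq, sgs, total_sg, out_sgs, in_sgs,
					   data, ctx, premapped, gfp);
	}
}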



> +
>  struct vring_virtqueue {
>   struct virtqueue vq;
>  
> + struct virtqueue_ops *ops;
> +
>   /* Is this a packed ring? */
>   bool packed_ring;
>  
> @@ -1116,6 +1137,8 @@ static int vring_alloc_queue_split(struct 
> vring_virtqueue_split *vring_split,
>   return 0;
>  }
>  
> +struct virtqueue_ops split_ops;
> +
>  static struct virtqueue *__vring_new_virtqueue_split(unsigned int index,
>  struct vring_virtqueue_split 
> *vring_split,
>  struct virtio_device *vdev,
> @@ -1134,6 +1157,7 @@ static struct virtqueue 
> *__vring_new_virtqueue_split(unsigned int index,
>   return NULL;
>  
>   vq->packed_ring = false;
> + vq->ops = &split_ops;
>   vq->vq.callback = callback;
>   vq->vq.vdev = vdev;
>   vq->vq.name = name;
> @@ -2076,6 +2100,8 @@ static void virtqueue_reset_packed(struct 
> vring_virtqueue *vq)
>   virtqueue_vring_init_packed(&vq->packed, !!vq->vq.callback);
>  }
>  
> +struct virtqueue_ops packed_ops;
> +
>  static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index,
>  struct vring_virtqueue_packed 
> *vring_packed,
>  struct virtio_device *vdev,
> @@ -2107,6 +2133,7 @@ static struct virtqueue 
> *__vring_new_virtqueue_packed(unsigned int index,
>   vq->broken = false;
>  #endif
>   vq->packed_ring = true;
> + vq->ops = &packed_ops;
>   vq->dma_dev = dma_dev;
>   vq->use_dma_api = vring_use_dma_api(vdev);
>  
> @@ -2194,6 +2221,34 @@ static int virtqueue_resize_packed(struct 
> vring_virtqueue *vq, u32 num)
>   return -ENOMEM;
>  }
>  
> +struct virtqueue_ops split_ops = {
> + .add = virtqueue_add_split,
> + .get = virtqueue_get_buf_ctx_split,
> + .kick_prepare = virtqueue_kick_prepare_split,
> + .disable_cb = virtqueue_disable_cb_split,
> + .enable_cb_delayed = virtqueue_enable_cb_delayed_split,
> + .enable_cb_prepare = virtqueue_enable_cb_prepare_split,
> + .poll = virtqueue_poll_split,
> + .detach_unused_buf = virtqueue_detach_unused_buf_split,
> + .more_used = more_used_split,
> + .resize = virtqueue_resize_split,
> + .reset = virtqueue_reset_split,
> +};
> +
> +struct virtqueue_ops packed_ops = {
> + .add = virtqueue_add_packed,
> + .get = virtqueue_get_buf_ctx_packed,
> + .kick_prepare = virtqueue_kick_prepare_packed,
> + .disable_cb = virtqueue_disable_cb_packed,
> + .enable_cb_delayed = virtqueue_enable_cb_delayed_packed,
> + .enable_cb_prepare = virtqueue_enable_cb_prepare_packed,
> + .poll = virtqueue_poll_packed,
> + .detach_unused_buf = virtqueue_detach_unused_buf_packed,
> + .more_used = more_used_packed,

RE: [PATCH v2 2/2] x86/sgx: Implement EUPDATESVN and opportunistically call it during first EPC page alloc

2025-04-07 Thread Reshetova, Elena

> On Fri, Apr 04, 2025 at 06:53:17AM +, Reshetova, Elena wrote:
> > > On Wed, Apr 02, 2025 at 01:11:25PM +, Reshetova, Elena wrote:
> > > > > > current SGX kernel code does not handle such errors in any other
> way
> > > > > > than notifying that operation failed for other ENCLS leaves. So, I 
> > > > > > don't
> > > > > > see why ENCLS[EUPDATESVN] should be different from existing
> > > behaviour?
> > > > >
> > > > > While not disagreeing fully (it depends on call site), in some
> > > > > situations it is more difficult to take more preventive actions.
> > > > >
> > > > > This is a situation where we know that there are *zero* EPC pages in
> > > > > traffic so it is relatively easy to stop the madness, isn't it?
> > > > >
> > > > > I guess the best action would be make sgx_alloc_epc_page() return
> > > > > consistently -ENOMEM, if the unexpected happens.
> > > >
> > > > But this would be very misleading imo. We do have memory, even page
> > > > allocation might function as normal in EPC, the only thing that is 
> > > > broken
> > > > can be EUPDATESVN functionality. Returning -ENOMEM in this case
> seems
> > > > wrong.
> > >
> > > This makes it not misleading at all:
> > >
> > >   pr_err("EUPDATESVN: unknown error %d\n", ret);
> > >
> > > Since hardware should never return this, it indicates a kernel bug.
> >
> > OK, so you propose in this case to print the above message, sgx_updatesvn
> > returning an error, and then NULL from __sgx_alloc_epc_page_from_node
> and
> > the __sgx_alloc_epc_page returning -ENOMEM after an iteration over
> > a whole set of numa nodes given that we will keep getting the unknown
> error
> > on each node upon trying to do an allocation from each one?
> 
> I'd disable ioctl's in this case and return -ENOMEM. It's a cheap sanity
> check. Should not ever happen, but if e.g., a new kernel patch breaks
> anything, it could help catching issues.
> 
> We are talking here about situation that is never expected to happen so I
> don't think it is too heavy hammer here. Here it makes sense because not
> much effort is required to implement the counter-measures.

OK, but does it really make sense to explicitly disable ioctls?
Note that everything *in practice* will be disabled simply because not a single
page can be allocated from EPC anymore, since we are getting -ENOMEM on EPC
page allocation. Also, note that any approach we choose should be symmetrical
to the SGX virtualization side, which doesn't use ioctls at all. Simply returning
-ENOMEM for page allocation in EPC seems like a correct symmetrical solution
that would work for both native enclaves and EPC pages allocated for VMs.
And nothing would be able to proceed with creating/managing enclaves at this point.

Best Regards,
Elena.



Re: [PATCH] kbuild: Require pahole >v1.29 with GENDWARFKSYMS and BTF on X86

2025-04-07 Thread Sam James
[with regard to
https://lore.kernel.org/linux-kbuild/20250320232757.2283956-2-samitolva...@google.com/]

Would it be possible to have a new release with that fix, to avoid
distros all having to cherrypick the fix commit?

Thanks in advance,
sam



Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread David Hildenbrand

On 07.04.25 10:54, Michael S. Tsirkin wrote:

On Mon, Apr 07, 2025 at 09:18:21AM +0200, David Hildenbrand wrote:

Now I am beginning to think we should leave the spec alone
and fix the drivers ... Ugh 


We could always say that starting with feature X, queue indexes are fixed
again. E.g., VIRTIO_BALLOON_F_X would have it's virtqueue fixed at index 5,
independent of the other (older) features where the virtqueue indexes are
determined like today.

Won't make the implementation easier, though, I'm afraid.

(I also thought about a way to query the virtqueue index for a feature, but
that's probably overengineering)


The best contract we have is the spec. Sometimes it is hopelessly broken
and we have to fix it, but not in this case.

Let's do a theoretical exercise, assuming we want to fix the drivers,
but we also want to have workarounds in place in qemu and in
drivers to support existing ones. How would we go about it?


QEMU could likely be changed to always offer 
VIRTIO_BALLOON_F_FREE_PAGE_HINT, but not actually use it unless enabled 
for QEMU. That should work, because all action is initiated by the device.


That way, all virtqueue indexes would always be according to the spec.

We'll likely need compat machine handling 


Regarding Linux, I'll have to think about it further ...


Maybe we want a feature bit BALLOON_FIXED and ask everyone
to negotiate it?  But if we go this way, we really need to fix
the 48 bit limitation too.


I was thinking about the same, but it's all a mess ...

--
Cheers,

David / dhildenb




Re: [PATCH 0/2] Some small preparations around CAMSS D-PHY / C-PHY support

2025-04-07 Thread Bryan O'Donoghue

On 17/03/2025 07:45, Luca Weiss wrote:

On Wed Feb 26, 2025 at 3:47 PM CET, Bryan O'Donoghue wrote:

On 26/02/2025 14:13, Luca Weiss wrote:

Hi all,

On Mon Dec 9, 2024 at 1:01 PM CET, Luca Weiss wrote:

Since the hardware blocks on the SoCs generally support both D-PHY and
C-PHY standards for camera, but the camss driver currently is only
supporting D-PHY, do some preparations in order to add C-PHY support at
some point.

Make the dt bindings explicit that the hardware supports both (except
for MSM8916) but also add a check to the driver that errors out in case
a dt tries to use C-PHY since that's not supported yet.

Signed-off-by: Luca Weiss 
---
Luca Weiss (2):
media: dt-bindings: media: camss: Restrict bus-type property
media: qcom: camss: Restrict endpoint bus-type to D-PHY


This series is still pending, both patches got reviews and no pending
comments from what I can see.

Would be nice to get it in for 6.15.


Yes this should be merged.

Thanks for following up.


Hi Bryan, hi Hans,

6.15 merge window is approaching fast, I wonder if this series was
missed still.


As soon as possible.

---
bod



Re: [PATCH v3 0/6] KVM: guest_memfd: support for uffd minor

2025-04-07 Thread David Hildenbrand

On 07.04.25 16:24, Liam R. Howlett wrote:

* Nikita Kalyazin  [250407 10:05]:




...



All of this is extremely confusing because the onus of figuring out what
the final code will look like is put on the reviewer.  As it is, we have
issues with people not doing enough review of the code (due to limited
time).  One way to get reviews is to make the barrier of entry as low as
possible.

I spent Friday going down a rabbit hole of patches referring to each
other as dependencies and I gave up.  It looks like I mistook one set of
patches as required vs them requiring the same in-flight ones as your
patches.

I am struggling to see how we can adequately support all of you given
the way the patches are sent out in batches with dependencies - it is
just too time consuming to sort out.


I'm happy to do whatever I can to make the review easier.  I suppose the
extreme case is to wait for the dependencies to get accepted, effectively
serialising submissions, but that slows the process down significantly.  For
example, I received very good feedback on v1 and v2 of this series and was
able to address it instead of waiting for the dependency.  Would including
the required patches directly in the series help?  My only concern is in
that case the same patch will be submitted multiple times (as a part of
every depending series), but if it's better, I'll be doing that instead.


Don't resend patches that someone else is upstreaming, that'll cause
other problems.

Three methods come to mind:

1. As you stated, wait for the dependencies to land.  This will mean
what you are working against is well tested and won't change (and you
won't have to re-spin due to an unstable base).

2. Combine them into a bigger patch set.  I can then pull one patch set
and look at the parts of interest to the mm side.

3. Provide a git repo with the necessary changes together.

I think 2 and 3 together should be used for the guest_memfd patches.
Someone needs to be managing these to send upstream.  See the discussion
in another patch set on guest_memfd here [1].


The issue is that most extensions are fairly independent from each 
other, except that they build on top of Fuad's mmap support,


Sending all together as one thing might not be the best option.

Once basic mmap support is upstream, some of the extensions (e.g., 
directmap removal) can go in next.


So until that is upstream, I agree that tagging the stuff that builds up 
on that is the right thing to do, and providing git trees is another 
very good idea.


I'll prioritize getting Fuad's mmap stuff reviewed. (I keep saying that, 
I know)


--
Cheers,

David / dhildenb




Re: [PATCH] scripts/spdxcheck: Limit the scope of git.Repo

2025-04-07 Thread Duje Mihanović
On Tuesday, 25 February 2025 14:10:41 Central European Summer Time Ricardo 
Ribalda wrote:
> If the git.Repo object's scope extends to the Python interpreter's
> shutdown phase, its destructor may fail due to the interpreter's state.
> 
> Exception ignored in: 
> Traceback (most recent call last):
>   File "/usr/lib/python3/dist-packages/git/cmd.py", line 565, in __del__
>   File "/usr/lib/python3/dist-packages/git/cmd.py", line 546, in _terminate
>   File "/usr/lib/python3.13/subprocess.py", line 2227, in terminate
> ImportError: sys.meta_path is None, Python is likely shutting down
> 
> Use the `with` statement to limit the scope of git.Repo and ensure
> proper resource management.
> 
> Signed-off-by: Ricardo Ribalda 
> ---

checkpatch suddenly broke for me with the same error as shown here and the 
patch fixed it.

Tested-by: Duje Mihanović 

Regards,
-- 
Duje






Re: [PATCH v3 0/6] KVM: guest_memfd: support for uffd minor

2025-04-07 Thread Liam R. Howlett
* Nikita Kalyazin  [250407 10:05]:
> 

...

> > 
> > All of this is extremely confusing because the onus of figuring out what
> > the final code will look like is put on the reviewer.  As it is, we have
> > issues with people not doing enough review of the code (due to limited
> > time).  One way to get reviews is to make the barrier of entry as low as
> > possible.
> > 
> > I spent Friday going down a rabbit hole of patches referring to each
> > other as dependencies and I gave up.  It looks like I mistook one set of
> > patches as required vs them requiring the same in-flight ones as your
> > patches.
> > 
> > I am struggling to see how we can adequately support all of you given
> > the way the patches are sent out in batches with dependencies - it is
> > just too time consuming to sort out.
> 
> I'm happy to do whatever I can to make the review easier.  I suppose the
> extreme case is to wait for the dependencies to get accepted, effectively
> serialising submissions, but that slows the process down significantly.  For
> example, I received very good feedback on v1 and v2 of this series and was
> able to address it instead of waiting for the dependency.  Would including
> the required patches directly in the series help?  My only concern is in
> that case the same patch will be submitted multiple times (as a part of
> every depending series), but if it's better, I'll be doing that instead.

Don't resend patches that someone else is upstreaming, that'll cause
other problems.

Three methods come to mind:

1. As you stated, wait for the dependencies to land.  This will mean
what you are working against is well tested and won't change (and you
won't have to re-spin due to an unstable base).

2. Combine them into a bigger patch set.  I can then pull one patch set
and look at the parts of interest to the mm side.

3. Provide a git repo with the necessary changes together.

I think 2 and 3 together should be used for the guest_memfd patches.
Someone needs to be managing these to send upstream.  See the discussion
in another patch set on guest_memfd here [1].

As this is not based on fully upstream patches, this should be marked as
RFC, imo.

Thanks,
Liam

[1]. 
https://lore.kernel.org/all/aizia2elwspxcmfrjote5h7k5wdw2stp42slytkl5visrjvzwi@jj3lwuudiyjk/



Re: [RESEND] virtiofs: add filesystem context source name check

2025-04-07 Thread Miklos Szeredi
On Mon, 7 Apr 2025 at 13:51, Xiangsheng Hou  wrote:
>
> In certain scenarios, for example, during fuzz testing, the source
> name may be NULL, which could lead to a kernel panic. Therefore, an
> extra check for the source name should be added.
>
> Signed-off-by: Xiangsheng Hou 

Acked-by: Miklos Szeredi 

Thanks,
Miklos



[PATCH v5 0/2] memcg: Fix test_memcg_min/low test failures

2025-04-07 Thread Waiman Long
v5:
 - Use mem_cgroup_usage() as originally suggested by Johannes.

v4:
 - Add "#ifdef CONFIG_MEMCG" directives around shrink_node_memcgs() to
   avoid compilation problem with !CONFIG_MEMCG configs.

The test_memcontrol selftest consistently fails its test_memcg_low
sub-test and sporadically fails its test_memcg_min sub-test. This
patchset fixes the test_memcg_min and test_memcg_low failures by
skipping the !usage case in shrink_node_memcgs() and adjust the
test_memcontrol selftest to fix other causes of the test failures.

Waiman Long (2):
  mm/vmscan: Skip memcg with !usage in shrink_node_memcgs()
  selftests: memcg: Increase error tolerance of child memory.current
check in test_memcg_protection()

 mm/internal.h|  9 +
 mm/memcontrol-v1.h   |  2 --
 mm/vmscan.c  |  4 
 tools/testing/selftests/cgroup/test_memcontrol.c | 11 ---
 4 files changed, 21 insertions(+), 5 deletions(-)

-- 
2.48.1




[PATCH v5 2/2] selftests: memcg: Increase error tolerance of child memory.current check in test_memcg_protection()

2025-04-07 Thread Waiman Long
The test_memcg_protection() function is used for the test_memcg_min and
test_memcg_low sub-tests. This function generates a set of parent/child
cgroups like:

  parent:  memory.min/low = 50M
  child 0: memory.min/low = 75M,  memory.current = 50M
  child 1: memory.min/low = 25M,  memory.current = 50M
  child 2: memory.min/low = 0,memory.current = 50M

After applying memory pressure, the function expects the following
actual memory usages.

  parent:  memory.current ~= 50M
  child 0: memory.current ~= 29M
  child 1: memory.current ~= 21M
  child 2: memory.current ~= 0

In reality, the actual memory usages can differ quite a bit from the
expected values. It uses an error tolerance of 10% with the values_close()
helper.

Both the test_memcg_min and test_memcg_low sub-tests can fail
sporadically because the actual memory usage exceeds the 10% error
tolerance. Below is a sample of the usage data from the test runs
that fail.

  Child   Actual usage    Expected usage     %err
  -----   ------------    --------------    ------
    1       16990208         22020096       -12.9%
    1       17252352         22020096       -12.1%
    0       37699584         30408704       +10.7%
    1       14368768         22020096       -21.0%
    1       16871424         22020096       -13.2%

The current 10% error tolerance might have been right at the time
test_memcontrol.c was first introduced in the v4.18 kernel, but memory
reclaim has certainly evolved quite a bit since then, which may result
in a bit more run-to-run variation than previously expected.

Increase the error tolerance to 15% for child 0 and 20% for child 1 to
minimize the chance of this type of failure. The tolerance is bigger
for child 1 because an upswing in child 0 corresponds to a smaller
%err than a similar downswing in child 1 due to the way %err is used
in values_close().

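For reference, assuming values_close() compares the absolute difference
against a percentage of the sum, as the cgroup selftest helpers do, the
asymmetry is easy to see:

/* Sketch of the helper; the real one lives in the cgroup selftest utilities. */
#include <stdlib.h>

static int values_close(long a, long b, int err)
{
	return labs(a - b) <= (a + b) / 100 * err;
}

/*
 * child 0: |34M - 29M| = 5M vs 10% of 63M = 6.3M  -> passes
 * child 1: |16M - 21M| = 5M vs 10% of 37M = 3.7M  -> fails
 * The same absolute swing is a larger fraction of (a + b) for the smaller
 * child, hence the wider tolerance for child 1.
 */
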
Before this patch, 100 test runs of test_memcontrol produced the
following results:

 17 not ok 1 test_memcg_min
 22 not ok 2 test_memcg_low

After applying this patch, there were no test failure for test_memcg_min
and test_memcg_low in 100 test runs.

Signed-off-by: Waiman Long 
---
 tools/testing/selftests/cgroup/test_memcontrol.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c 
b/tools/testing/selftests/cgroup/test_memcontrol.c
index bab826b6b7b0..8f4f2479650e 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -495,10 +495,10 @@ static int test_memcg_protection(const char *root, bool 
min)
for (i = 0; i < ARRAY_SIZE(children); i++)
c[i] = cg_read_long(children[i], "memory.current");
 
-   if (!values_close(c[0], MB(29), 10))
+   if (!values_close(c[0], MB(29), 15))
goto cleanup;
 
-   if (!values_close(c[1], MB(21), 10))
+   if (!values_close(c[1], MB(21), 20))
goto cleanup;
 
if (c[3] != 0)
-- 
2.48.1




[PATCH] dt-bindings: virtio: pci-iommu: Add ref to pci-device.yaml

2025-04-07 Thread Rob Herring (Arm)
The virtio pci-iommu is a PCI device, so it should have a reference to
the pci-device.yaml schema. The pci-device.yaml schema defines the 'reg'
format as a schema, so the text description for 'reg' can be dropped.

Signed-off-by: Rob Herring (Arm) 
---
 .../devicetree/bindings/virtio/pci-iommu.yaml  | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/Documentation/devicetree/bindings/virtio/pci-iommu.yaml 
b/Documentation/devicetree/bindings/virtio/pci-iommu.yaml
index 972a785a42de..8bd6ad72ac7a 100644
--- a/Documentation/devicetree/bindings/virtio/pci-iommu.yaml
+++ b/Documentation/devicetree/bindings/virtio/pci-iommu.yaml
@@ -20,6 +20,9 @@ description: |
   virtio-iommu node doesn't have an "iommus" property, and is omitted from
   the iommu-map property of the root complex.
 
+allOf:
+  - $ref: /schemas/pci/pci-device.yaml#
+
 properties:
   # If compatible is present, it should contain the vendor and device ID
   # according to the PCI Bus Binding specification. Since PCI provides
@@ -33,12 +36,7 @@ properties:
   - const: pci1af4,1057
 
   reg:
-description: |
-  PCI address of the IOMMU. As defined in the PCI Bus Binding
-  reference, the reg property is a five-cell address encoded as (phys.hi
-  phys.mid phys.lo size.hi size.lo). phys.hi should contain the device's
-  BDF as 0b00000000 bbbbbbbb dddddfff 00000000. The other cells should be
-  zero. See Documentation/devicetree/bindings/pci/pci.txt
+maxItems: 1
 
   '#iommu-cells':
 const: 1
-- 
2.47.2




[RESEND] virtiofs: add filesystem context source name check

2025-04-07 Thread Xiangsheng Hou
In certain scenarios, for example, during fuzz testing, the source
name may be NULL, which could lead to a kernel panic. Therefore, an
extra check for the source name should be added.

Signed-off-by: Xiangsheng Hou 
---
 fs/fuse/virtio_fs.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 2c7b24cb67ad..53c2626e90e7 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1669,6 +1669,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
unsigned int virtqueue_size;
int err = -EIO;
 
+   if (!fsc->source)
+   return invalf(fsc, "No source specified");
+
/* This gets a reference on virtio_fs object. This ptr gets installed
 * in fc->iq->priv. Once fuse_conn is going away, it calls ->put()
 * to drop the reference to this object.
-- 
2.46.0




[PATCH RESEND] kunit: qemu_configs: SH: Respect kunit cmdline

2025-04-07 Thread Thomas Weißschuh
The default SH kunit configuration sets CONFIG_CMDLINE_OVERWRITE which
completely disregards the cmdline passed from the bootloader/QEMU in favor
of the builtin CONFIG_CMDLINE.
However, the kunit tool needs to pass arguments to the in-kernel kunit core
for filters and other runtime parameters.

Enable CONFIG_CMDLINE_EXTEND instead, so kunit arguments are respected.

Fixes: 8110a3cab05e ("kunit: tool: Add support for SH under QEMU")
Signed-off-by: Thomas Weißschuh 
---
 tools/testing/kunit/qemu_configs/sh.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/testing/kunit/qemu_configs/sh.py 
b/tools/testing/kunit/qemu_configs/sh.py
index 
78a474a5b95f3a7d6064a2d3b728810ced095606..f00cb89fdef6aa1c0abd83ca18e7004a4fdd96e1
 100644
--- a/tools/testing/kunit/qemu_configs/sh.py
+++ b/tools/testing/kunit/qemu_configs/sh.py
@@ -7,7 +7,9 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 CONFIG_MEMORY_START=0x0c00
 CONFIG_SH_RTS7751R2D=y
 CONFIG_RTS7751R2D_PLUS=y
-CONFIG_SERIAL_SH_SCI=y''',
+CONFIG_SERIAL_SH_SCI=y
+CONFIG_CMDLINE_EXTEND=y
+''',
   qemu_arch='sh4',
   kernel_path='arch/sh/boot/zImage',
   kernel_command_line='console=ttySC1',

---
base-commit: 2014c95afecee3e76ca4a56956a936e23283f05b
change-id: 20250220-kunit-sh-f42a3a8cce35

Best regards,
-- 
Thomas Weißschuh 




Re: [PATCH v3 0/6] KVM: guest_memfd: support for uffd minor

2025-04-07 Thread Lorenzo Stoakes
On Mon, Apr 07, 2025 at 04:46:48PM +0200, David Hildenbrand wrote:
> On 07.04.25 16:24, Liam R. Howlett wrote:
> > * Nikita Kalyazin  [250407 10:05]:
> > >
> >
> > ...
> >
> > > >
> > > > All of this is extremely confusing because the onus of figuring out what
> > > > the final code will look like is put on the reviewer.  As it is, we have
> > > > issues with people not doing enough review of the code (due to limited
> > > > time).  One way to get reviews is to make the barrier of entry as low as
> > > > possible.
> > > >
> > > > I spent Friday going down a rabbit hole of patches referring to each
> > > > other as dependencies and I gave up.  It looks like I mistook one set of
> > > > patches as required vs them requiring the same in-flight ones as your
> > > > patches.
> > > >
> > > > I am struggling to see how we can adequately support all of you given
> > > > the way the patches are sent out in batches with dependencies - it is
> > > > just too time consuming to sort out.
> > >
> > > I'm happy to do whatever I can to make the review easier.  I suppose the
> > > extreme case is to wait for the dependencies to get accepted, effectively
> > > serialising submissions, but that slows the process down significantly.  
> > > For
> > > example, I received very good feedback on v1 and v2 of this series and was
> > > able to address it instead of waiting for the dependency.  Would including
> > > the required patches directly in the series help?  My only concern is in
> > > that case the same patch will be submitted multiple times (as a part of
> > > every depending series), but if it's better, I'll be doing that instead.
> >
> > Don't resend patches that someone else is upstreaming, that'll cause
> > other problems.
> >
> > Three methods come to mind:
> >
> > 1. As you stated, wait for the dependencies to land.  This will mean
> > what you are working against is well tested and won't change (and you
> > won't have to re-spin due to an unstable base).
> >
> > 2. Combine them into a bigger patch set.  I can then pull one patch set
> > and look at the parts of interest to the mm side.
> >
> > 3. Provide a git repo with the necessary changes together.
> >
> > I think 2 and 3 together should be used for the guest_memfd patches.
> > Someone needs to be managing these to send upstream.  See the discussion
> > in another patch set on guest_memfd here [1].
>
> The issue is that most extensions are fairly independent of each other,
> except that they build on top of Fuad's mmap support.
>
> Sending all together as one thing might not be the best option.
>
> Once basic mmap support is upstream, some of the extensions (e.g., directmap
> removal) can go in next.
>
> So until that is upstream, I agree that tagging the stuff that builds on top
> of that is the right thing to do, and providing git trees is another very good
> idea.
>
> I'll prioritize getting Fuad's mmap stuff reviewed. (I keep saying that, I
> know)

Which series is this? Sorry maybe lost track of this one.

>
> --
> Cheers,
>
> David / dhildenb
>



Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread David Hildenbrand

On 07.04.25 15:12, Halil Pasic wrote:

On Mon, 7 Apr 2025 04:34:29 -0400
"Michael S. Tsirkin"  wrote:


On Mon, Apr 07, 2025 at 10:17:10AM +0200, David Hildenbrand wrote:

On 07.04.25 09:52, Michael S. Tsirkin wrote:

On Fri, Apr 04, 2025 at 05:39:10PM +0200, Halil Pasic wrote:


Not perfect, but AFAIKS, not horrible.


It is like it is. QEMU does create the queue if the corresponding feature
is offered by the device, and that is what we have to live with.


I don't think we can live with this properly though.
It means a guest that does not know about some features
does not know where to find things.


Please describe a real scenario, I'm missing the point.



OK so.

Device has VIRTIO_BALLOON_F_FREE_PAGE_HINT and VIRTIO_BALLOON_F_REPORTING
Driver only knows about VIRTIO_BALLOON_F_REPORTING so
it does not know what does VIRTIO_BALLOON_F_FREE_PAGE_HINT do.
How does it know which vq to use for reporting?
It will try to use the free page hint one.
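
To put that mismatch in code (an illustration, not the actual virtio_balloon
driver; the feature bit macros are the ones from linux/virtio_balloon.h):
with conditional, non-fixed numbering the index a driver computes depends
only on the optional features it knows about:

/* counts only the vqs the driver knows: 0 inflate, 1 deflate, then optionals */
static unsigned int reporting_vq_index(unsigned long long negotiated)
{
        unsigned int idx = 2;

        if (negotiated & (1ULL << VIRTIO_BALLOON_F_STATS_VQ))
                idx++;
        if (negotiated & (1ULL << VIRTIO_BALLOON_F_FREE_PAGE_HINT))
                idx++;
        return idx;
}

A driver that never heard of VIRTIO_BALLOON_F_FREE_PAGE_HINT computes 3 (with
stats negotiated) and lands on the device's free-page-hint queue, while the
device placed the reporting queue at index 4.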


First, sorry for not catching up again with the discussion earlier.

I think David's point is based on the assumption that by the time the feature
with feature bit N+1 is specified and allocates a queue Q, all
queues with indexes smaller than Q are allocated and possibly associated
with features that were previously specified (and probably have feature
bits smaller than N+1).

I.e. that we can mandate, even if you don't want to care about other
optional features, you have to, because we say so, for the matter of
virtqueue existence. And anything in the future, you don't have to care
about because the queue index associated with future features is larger
than Q, so it does not affect our position.

I think that argument can fall apart if:
* future features reference optional queues defined in the past
* somebody manages to introduce a limbo where a feature is reserved, and
   they cannot decide if they want a queue or not, or make the existence
   of the queue depend on something other than a feature bit.


Staring at the cross-vmm, including the adding+removing of features and 
queues that are not in the spec, I am wondering if (in a world with 
fixed virtqueues)


1) Feature bits must be reserved before used.

2) Queue indices must be reserved before used.

It all smells like a problem similar to device IDs ...

--
Cheers,

David / dhildenb




Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread David Hildenbrand

On 07.04.25 11:13, David Hildenbrand wrote:

On 07.04.25 11:11, David Hildenbrand wrote:

On 07.04.25 10:58, Michael S. Tsirkin wrote:

On Mon, Apr 07, 2025 at 10:54:00AM +0200, David Hildenbrand wrote:

On 07.04.25 10:49, Michael S. Tsirkin wrote:

On Mon, Apr 07, 2025 at 10:44:21AM +0200, David Hildenbrand wrote:





Whoever adds new feat_X *must be aware* of all previous features,
otherwise we'd be reusing feature bits and everything falls to pieces.



The knowledge is supposed be limited to which feature bit to use.


I think we also have to know which virtqueue bits can be used, right?



what are virtqueue bits? vq number?


Yes, sorry.


I got confused myself, it's vq index actually now, we made the spec
consistent with that terminology. used to be number/index
interchangeably.


Assume cross-vm as an example. It would make use of virtqueue indexes 5+6
with their VIRTIO_BALLOON_F_WS_REPORTING.



crossvm guys really should have reserved the feature bit even if they
did not bother specifying it. Let's reserve it now at least?


Along with the virtqueue indices, right?

Note that there was

https://lists.gnu.org/archive/html/qemu-devel/2023-05/msg02503.html

and

https://groups.oasis-open.org/communities/community-home/digestviewer/viewthread?GroupId=3973&MessageKey=afb07613-f56c-4d40-8981-2fad1c723998&CommunityKey=2f26be99-3aa1-48f6-93a5-018dce262226&hlmlt=VT

But it only was RFC, and as the QEMU implementation didn't materialize,
nobody seemed to care ...


Heh, but that one said:

+\item[ VIRTIO_BALLOON_F_WS_REPORTING(6) ] The device has support for
Working Set

Which does not seem to reflect reality ...



I dug a bit more into cross-vm, because that one seems to be the only
one out there that does not behave like everybody else I found (maybe good,
maybe bad :) ).


1) There was temporarily even another feature (VIRTIO_BALLOON_F_EVENTS_VQ)
and another queue.

It got removed from cross-vm in:

commit 9ba634b82b55ba762dc8724676b2cf9419460145
Author: Daniel Verkamp 
Date:   Thu Jul 11 11:29:52 2024 -0700

devices: virtio-balloon: remove event queue support

VIRTIO_BALLOON_F_EVENTS_VQ was part of a proposed virtio spec change.

It is not currently supported by upstream Linux, so removing this should

have no effect except for guest kernels that had CHROMIUM patches
applied.

The virtqueue indexes for the ws-related queues are decremented to fill

the hole left by the removal of the event VQ; these are non-standard as
well, so they do not have virtqueue indexes assigned in the virtio spec,
but the proposed spec extension did actually use vq indexes 5 and 6.

BUG=b:214864326



2) cross-vm is aware of the upstream Linux driver

They thought your fix would go upstream; it didn't.

commit a2fa119e759d0238a42ff15a9aff0dfd122afebd
Author: Daniel Verkamp 
Date:   Wed Jul 10 16:16:28 2024 -0700

devices: virtio-balloon: warn about queue index mismatches

The Linux kernel virtio-balloon driver spec non-compliance related to

queue numbering is being fixed; add some diagnostics to our device that
help to check if everything is working as expected.



Additionally, replace the num_expected_queues() function with per-queue

checking to avoid the need for the duplicate feature checks and queue
count calculation; each pop_queue() call will be checked using the `?`
operator and return a more useful error message if a particular queue is
missing.

BUG=None

TEST=crosvm run --balloon-page-reporting ...


IIRC, in that commit they switched to the "spec" behavior.

That's when they started hard-coding the queue indexes.

CCing Daniel. All Linux versions should be incompatible with cross-vmm 
regarding free page reporting.
How is that handled?

--
Cheers,

David / dhildenb




Re: [PATCH v4 1/2] mm/vmscan: Skip memcg with !usage in shrink_node_memcgs()

2025-04-07 Thread Michal Koutný
Hi Waiman.

On Sun, Apr 06, 2025 at 09:41:58PM -0400, Waiman Long  
wrote:
 ...
> diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c 
> b/tools/testing/selftests/cgroup/test_memcontrol.c
> index 16f5d74ae762..bab826b6b7b0 100644
> --- a/tools/testing/selftests/cgroup/test_memcontrol.c
> +++ b/tools/testing/selftests/cgroup/test_memcontrol.c

I'd suggest also updating the header of the test for clarity and then
conditionally exempting Child 2 ('E') from the comparisons, something like:

@@ -380,10 +380,10 @@ static bool reclaim_until(const char *memcg, long goal);
  *
  * Then it checks actual memory usages and expects that:
  * A/Bmemory.current ~= 50M
- * A/B/C  memory.current ~= 29M
- * A/B/D  memory.current ~= 21M
- * A/B/E  memory.current ~= 0
- * A/B/F  memory.current  = 0
+ * A/B/C  memory.current ~= 29M, memory.events:low > 0
+ * A/B/D  memory.current ~= 21M, memory.events:low > 0
+ * A/B/E  memory.current ~= 0,   memory.events:low not specified (==0 w/out 
memory_recursiveprot)
+ * A/B/F  memory.current  = 0,   memory.events:low == 0
  * (for origin of the numbers, see model in memcg_protection.m.)
  *
  * After that it tries to allocate more than there is
@@ -527,6 +527,7 @@ static int test_memcg_protection(const char *root, bool min)

for (i = 0; i < ARRAY_SIZE(children); i++) {
int no_low_events_index = 1;
+   int ignore_low_events_index = has_recursiveprot ? 2 : -1;
long low, oom;

oom = cg_read_key_long(children[i], "memory.events", "oom ");
@@ -534,6 +535,8 @@ static int test_memcg_protection(const char *root, bool min)

if (oom)
goto cleanup;
+   if (i == ignore_low_events_index)
+   continue;
if (i <= no_low_events_index && low <= 0)
goto cleanup;
if (i > no_low_events_index && low)




Re: [PATCH v3] remoteproc: imx_dsp_rproc: Add support for DSP-specific features

2025-04-07 Thread Mathieu Poirier
Good morning,

On Thu, Apr 03, 2025 at 01:01:24PM +0300, Iuliana Prodan (OSS) wrote:
> From: Iuliana Prodan 
> 
> Some DSP firmware requires a FW_READY signal before proceeding, while
> others do not.
> Therefore, add support to handle i.MX DSP-specific features.
> 
> Implement handle_rsc callback to handle resource table parsing and to
> process DSP-specific resource, to determine if waiting is needed.
> 
> Update imx_dsp_rproc_start() to handle this condition accordingly.
> 
> Signed-off-by: Iuliana Prodan 
> ---
> Changes in v3:
> - Reviews from Mathieu Poirier:
>   - Added version and magic number to vendor-specific resource table entry.
>   - Updated defines to maintain backward compatibility with a resource table 
> that doesn't have a vendor-specific resource.
> - By default, wait for `fw_ready`, unless specified otherwise.
> - Link to v2: 
> https://lore.kernel.org/all/20250318215007.2109726-1-iuliana.pro...@oss.nxp.com
> 
> Changes in v2:
> - Reviews from Mathieu Poirier:
>   - Use vendor-specific resource table entry.
>   - Implement resource handler specific to the i.MX DSP.
> - Revise commit message to include recent updates.
> - Link to v1: 
> https://lore.kernel.org/all/20250305123923.514386-1-iuliana.pro...@oss.nxp.com/
> 
>  drivers/remoteproc/imx_dsp_rproc.c | 102 -
>  1 file changed, 100 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/remoteproc/imx_dsp_rproc.c 
> b/drivers/remoteproc/imx_dsp_rproc.c
> index b9bb15970966..80d4470cc731 100644
> --- a/drivers/remoteproc/imx_dsp_rproc.c
> +++ b/drivers/remoteproc/imx_dsp_rproc.c
> @@ -35,9 +35,17 @@ module_param_named(no_mailboxes, no_mailboxes, int, 0644);
>  MODULE_PARM_DESC(no_mailboxes,
>"There is no mailbox between cores, so ignore remote proc 
> reply after start, default is 0 (off).");
>  
> +/* Flag indicating that the remote is up and running */
>  #define REMOTE_IS_READY  BIT(0)
> +/* Flag indicating that the host should wait for a firmware-ready response */
> +#define WAIT_FW_READYBIT(1)
>  #define REMOTE_READY_WAIT_MAX_RETRIES500
>  
> +/* This flag is set in the DSP resource table's features field to indicate
> + * that the firmware requires the host NOT to wait for a FW_READY response.
> + */
> +#define FEATURE_DONT_WAIT_FW_READY   BIT(0)
> +
>  /* att flags */
>  /* DSP own area */
>  #define ATT_OWN  BIT(31)
> @@ -72,6 +80,10 @@ MODULE_PARM_DESC(no_mailboxes,
>  
>  #define IMX8ULP_SIP_HIFI_XRDC0xc20e
>  
> +#define FW_RSC_NXP_S_MAGIC   ((uint32_t)'n' << 24 |  \
> +  (uint32_t)'x' << 16 |  \
> +  (uint32_t)'p' << 8 |   \
> +  (uint32_t)'s')
>  /*
>   * enum - Predefined Mailbox Messages
>   *
> @@ -136,6 +148,24 @@ struct imx_dsp_rproc_dcfg {
>   int (*reset)(struct imx_dsp_rproc *priv);
>  };
>  
> +/**
> + * struct fw_rsc_imx_dsp - i.MX DSP specific info
> + *
> + * @len: length of the resource entry
> + * @magic_num: 32-bit magic number
> + * @version: version of data structure
> + * @features: feature flags supported by the i.MX DSP firmware
> + *
> + * This represents a DSP-specific resource in the firmware's
> + * resource table, providing information on supported features.
> + */
> +struct fw_rsc_imx_dsp {
> + uint32_t len;
> + uint32_t magic_num;
> + uint32_t version;
> + uint32_t features;
> +} __packed;
> +
>  static const struct imx_rproc_att imx_dsp_rproc_att_imx8qm[] = {
>   /* dev addr , sys addr  , size  , flags */
>   { 0x596e8000, 0x556e8000, 0x8000, ATT_OWN },
> @@ -300,6 +330,73 @@ static int imx_dsp_rproc_ready(struct rproc *rproc)
>   return -ETIMEDOUT;
>  }
>  
> +/**
> + * imx_dsp_rproc_handle_rsc() - Handle DSP-specific resource table entries
> + * @rproc: remote processor instance
> + * @rsc_type: resource type identifier
> + * @rsc: pointer to the resource entry
> + * @offset: offset of the resource entry
> + * @avail: available space in the resource table
> + *
> + * Parse the DSP-specific resource entry and update flags accordingly.
> + * If the WAIT_FW_READY feature is set, the host must wait for the firmware
> + * to signal readiness before proceeding with execution.
> + *
> + * Return: RSC_HANDLED if processed successfully, RSC_IGNORED otherwise.
> + */
> +static int imx_dsp_rproc_handle_rsc(struct rproc *rproc, u32 rsc_type,
> + void *rsc, int offset, int avail)
> +{
> + struct imx_dsp_rproc *priv = rproc->priv;
> + struct fw_rsc_imx_dsp *imx_dsp_rsc = rsc;
> + struct device *dev = rproc->dev.parent;
> + size_t expected_size;
> +
> + if (!imx_dsp_rsc) {
> + dev_dbg(dev, "Invalid fw_rsc_imx_dsp.\n");
> + goto ignored;
> + }
> +
> + 
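
A hedged sketch of what a firmware-side resource-table entry using the layout
quoted above could look like (the version value is an assumption; the rest
follows the struct and defines from the patch):

/* illustration only: tell the host not to wait for FW_READY */
static const struct fw_rsc_imx_dsp example_dsp_rsc = {
        .len       = sizeof(struct fw_rsc_imx_dsp),
        .magic_num = FW_RSC_NXP_S_MAGIC,        /* spells "nxps" */
        .version   = 1,
        .features  = FEATURE_DONT_WAIT_FW_READY,
};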

Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread Halil Pasic
On Mon, 07 Apr 2025 15:28:13 +0200
Cornelia Huck  wrote:

> > Staring at the cross-vmm, including the adding+removing of features and 
> > queues that are not in the spec, I am wondering if (in a world with 
> > fixed virtqueues)
> >
> > 1) Feature bits must be reserved before used.
> >
> > 2) Queue indices must be reserved before used.
> >
> > It all smells like a problem similar to device IDs ...  
> 
> Indeed, we need a rule "reserve a feature bit/queue index before using
> it, even if you do not plan to spec it properly".

What definition of usage do you guys have in mind?  Would an RFC patch
constitute usage?

I think reserving/allocating an identifier of this type before relying on
it for anything remotely serious is very basic common sense.

Frankly I would go even further and advocate for the following rule: we
don't accept anything virtio into Linux unless it is reasonably/properly
spec-ed. My train of thought is the following: if a virtio thing gains
traction with Linux it has a fair chance of becoming a de-facto standard.

Consider our thinking on this one. Despite the fact that what is spec-ed
is obviously nicer, we almost decided to change the spec to fit what is
implemented and fielded out there. And IMHO for good reason. For any
rule we come up with, I think one of the most crucial questions is who
is going to enforce it, if anybody. The Linux (and probably also QEMU)
virtio maintainers are in my opinion the most reasonable point of
enforcement. Another thing to consider: after the code is in and things
work, I speculate that the motivation for writing a proper spec may
wane. I hope we do strive for consistency between the spec and the
implementations we are talking about. Having an eye on the spec while
looking at and trying to understand the code suits my workflow better,
at least, than the other way around. And licensing-wise, getting the spec
merged first is probably the better option.

Regards,
Halil





Re: [PATCH RESEND 0/4] media: i2c: imx214: Problem with CCS PLL calculator

2025-04-07 Thread André Apitzsch
Am Montag, dem 10.03.2025 um 23:35 +0100 schrieb André Apitzsch:
> Hi Sakari,
> 
> Am Montag, dem 10.03.2025 um 11:11 + schrieb Sakari Ailus:
> > Hi André,
> > 
> > On Sat, Mar 08, 2025 at 10:47:54PM +0100, André Apitzsch via B4
> > Relay
> > wrote:
> > > The imx214 driver currently supports only a 24MHz external clock.
> > > But
> > > there are devices, like Qualcomm-MSM8916-based phones, which
> > > cannot
> > > provide this frequency. To make the sensor usable by those
> > > devices,
> > > add
> > > support for 23.88MHz clock.
> > > 
> > > Signed-off-by: André Apitzsch 
> > > ---
> > > André Apitzsch (4):
> > >   media: i2c: imx214: Calculate link bit rate from clock
> > > frequency
> > >   media: i2c: imx214: Prepare for variable clock frequency
> > >   media: i2c: imx214: Read clock frequency from device tree
> > >   media: i2c: imx214: Add support for 23.88MHz clock
> > > 
> > >  drivers/media/i2c/imx214.c | 188
> > > +++--
> > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > 
> > Thanks for the patches.
> > 
> > Do you think the driver could use the CCS PLL calculator? The PLL
> > appears to be compliant. The AR0234 driver will do the same. (The
> > sensor might just work with the CCS driver, too, but that's another
> > discussion.)
> > 
> Using the CCS PLL calculator seems quite complicated compared to
> switching to the CCS driver. That's why I looked at the latter first.
> But for it to work, quirks already need to be applied in
> ccs_power_on(), to disable writing to COMPRESSION_MODE, and in
> ccs_identify_module(), to change the MODULE_MANUFACTURER_ID register.
> 
> I'll check if CCS PLL calculator could be used.
> 
> Best regards,
> André

Hi Sakari,

the CCS PLL calculator seems to work (apart from one problem) and to be a
more elegant way forward.
The problem is that the pixel rate is too small by a factor of 10 and
I cannot figure out why. Any help would be appreciated.

My device uses a clock-frequency of 2400 and a link-frequency of
6. There are four data lanes.
The calculator returns a pixel rate of 480.000.000. The expected value
is 4800.000.000.

You can find the PLL input parameters in [1] and the generated debug
output below.

Best regards,
André

[1] 
https://github.com/a-andre/linux/blob/58e10a814985f700579847ac7c99468a65cb55bb/drivers/media/i2c/imx214.c#L1116-L1196

$ dmesg | grep imx
[   17.851215] imx214 4-0010: vt_lanes: 4
[   17.851245] imx214 4-0010: op_lanes: 4
[   17.851254] imx214 4-0010: binning: 1x1
[   17.851262] imx214 4-0010: min / max op_pre_pll_clk_div: 1 / 15
[   17.851272] imx214 4-0010: pre-pll check: min / max op_pre_pll_clk_div: 1 / 
15
[   17.851281] imx214 4-0010: mul 50 / div 1
[   17.851290] imx214 4-0010: pll_op check: min / max op_pre_pll_clk_div: 1 / 15
[   17.851300] imx214 4-0010: op_pre_pll_clk_div 1
[   17.851308] imx214 4-0010: more_mul_max: max_op_pll_multiplier check: 24
[   17.851317] imx214 4-0010: more_mul_max: max_pll_op_clk_freq_hz check: 1
[   17.851325] imx214 4-0010: more_mul_max: max_op_sys_clk_div check: 1
[   17.851333] imx214 4-0010: more_mul_max: min_pll_multiplier check: 1
[   17.851341] imx214 4-0010: more_mul_min: min_op_pll_op_clk_freq_hz check: 1
[   17.851349] imx214 4-0010: more_mul_min: min_op_pll_multiplier check: 1
[   17.851357] imx214 4-0010: more_mul_factor: 1
[   17.851365] imx214 4-0010: more_mul_factor: min_op_sys_clk_div: 1
[   17.851373] imx214 4-0010: final more_mul: 1
[   17.851381] imx214 4-0010: op_sys_clk_div: 1
[   17.851389] imx214 4-0010: op_pix_clk_div: 10
[   17.851398] imx214 4-0010: min_vt_div: 10
[   17.851406] imx214 4-0010: min_vt_div: max_vt_pix_clk_freq_hz: 10
[   17.851414] imx214 4-0010: min_vt_div: min_vt_clk_div: 10
[   17.851422] imx214 4-0010: max_vt_div: 40
[   17.851486] imx214 4-0010: max_vt_div: min_vt_pix_clk_freq_hz: 40
[   17.851502] imx214 4-0010: min_sys_div: 2
[   17.851510] imx214 4-0010: min_sys_div: max_vt_pix_clk_div: 2
[   17.851518] imx214 4-0010: min_sys_div: max_pll_op_clk_freq_hz: 2
[   17.851526] imx214 4-0010: min_sys_div: one or even: 2
[   17.851534] imx214 4-0010: max_sys_div: 4
[   17.851541] imx214 4-0010: max_sys_div: min_vt_pix_clk_div: 4
[   17.851549] imx214 4-0010: max_sys_div: min_vt_pix_clk_freq_hz: 4
[   17.851557] imx214 4-0010: pix_div 3 too small or too big (5--10)
[   17.851568] imx214 4-0010: ext_clk_freq_hz   2400
[   17.851578] imx214 4-0010: vt_pre_pll_clk_div1
[   17.851587] imx214 4-0010: vt_pll_multiplier 50
[   17.851595] imx214 4-0010: vt_pll_ip_clk_freq_hz 2400
[   17.851603] imx214 4-0010: vt_pll_op_clk_freq_hz 12
[   17.851612] imx214 4-0010: vt_sys_clk_div2
[   17.851620] imx214 4-0010: vt_pix_clk_div5
[   17.851629] imx214 4-0010: vt_sys_clk_freq_hz6
[   17.851637] imx214 4-0010: vt_pix_clk_freq_hz12000
[   17.851645] imx214 4-0010: op_sys_clk_div1
[

[PATCH v3 4/12] rcutorture: Make torture.sh --do-rt use CONFIG_PREEMPT_RT

2025-04-07 Thread Paul E. McKenney
The torture.sh --do-rt command-line parameter is intended to mimic -rt
kernels.  Now that CONFIG_PREEMPT_RT is upstream, this commit makes this
mimicking more precise.

Note that testing of RCU priority boosting is disabled in favor
of forward-progress testing of RCU callbacks.  If it turns out to be
possible to make kernels built with CONFIG_PREEMPT_RT=y tolerate
testing of both, both will be enabled.

[ paulmck: Apply Sebastian Siewior feedback. ]

Signed-off-by: Paul E. McKenney 
Cc: Sebastian Andrzej Siewior 
---
 tools/testing/selftests/rcutorture/bin/torture.sh | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh 
b/tools/testing/selftests/rcutorture/bin/torture.sh
index 0447c4a00cc4d..d53ee1e0ffc79 100755
--- a/tools/testing/selftests/rcutorture/bin/torture.sh
+++ b/tools/testing/selftests/rcutorture/bin/torture.sh
@@ -448,13 +448,17 @@ fi
 
 if test "$do_rt" = "yes"
 then
-   # With all post-boot grace periods forced to normal.
-   torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 
torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=3 
rcupdate.rcu_normal=1"
-   torture_set "rcurttorture" 
tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 
"$duration_rcutorture" --configs "TREE03" --trust-make
+   # In both runs, disable testing of RCU priority boosting because
+   # -rt doesn't like its interaction with testing of callback
+   # flooding.
+
+   # With all post-boot grace periods forced to normal (default for 
PREEMPT_RT).
+   torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 
torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=3 
rcutorture.test_boost=0 rcutorture.preempt_duration=0"
+   torture_set "rcurttorture" 
tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 
"$duration_rcutorture" --configs "TREE03" --kconfig "CONFIG_PREEMPT_RT=y 
CONFIG_EXPERT=y CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y" --trust-make
 
# With all post-boot grace periods forced to expedited.
-   torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 
torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=3 
rcupdate.rcu_expedited=1"
-   torture_set "rcurttorture-exp" 
tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 
"$duration_rcutorture" --configs "TREE03" --trust-make
+   torture_bootargs="rcupdate.rcu_cpu_stall_suppress_at_boot=1 
torture.disable_onoff_at_boot rcupdate.rcu_task_stall_timeout=3 
rcutorture.test_boost=0 rcupdate.rcu_normal_after_boot=0 
rcupdate.rcu_expedited=1 rcutorture.preempt_duration=0"
+   torture_set "rcurttorture-exp" 
tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 
"$duration_rcutorture" --configs "TREE03" --kconfig "CONFIG_PREEMPT_RT=y 
CONFIG_EXPERT=y CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_FULL=y" --trust-make
 fi
 
 if test "$do_srcu_lockdep" = "yes"
-- 
2.40.1




[PATCH net-next v25 00/23] Introducing OpenVPN Data Channel Offload

2025-04-07 Thread Antonio Quartulli
Notable changes since v24:
* disable TCP disconnections of attached sockets (tcp_disconnect()
  returns -EBUSY) - similarly to kTLS.
* used rcu_replace_pointer instead of 
rcu_dereference_protected+rcu_assign_pointer
* dropped useless skb->ignore_df = 1
* dropped unneeded EXPORT_SYMBOL_GPL(udpv6_prot)
* dropped obsolete comment for ovpn_crypto_key_slots_swap()
* dropped calls to kfree() in ovpn_aead_encrypt/decrypt() (release is
  performed in ovpn_encrypt/decrypt_post())
* dropped NULL check before calling kfree() in
  ovpn_encrypt/decrypt_done()
* converted seq_num from atomic64_t to atomic_t (IV exhaustion is now
  detected in case of wrap around)
* call consume_skb() on skb when dropping keepalive message (it is not a
  failure)
* made REMOTE_PORT mandatory when REMOTE_IPV4/6 is specified in
  peer_new/set call
* ensured ovpn_nl_key_swap_notify() is called only once, even when
  parsing a batch of received packets concurrently

Please note that some patches were already reviewed/tested by a few
people. These patches have retained the tags as they have hardly been
touched.

The latest code can also be found at:

https://github.com/OpenVPN/ovpn-net-next

Thanks a lot!
Best Regards,

Antonio Quartulli
OpenVPN Inc.

---
Antonio Quartulli (23):
  net: introduce OpenVPN Data Channel Offload (ovpn)
  ovpn: add basic netlink support
  ovpn: add basic interface creation/destruction/management routines
  ovpn: keep carrier always on for MP interfaces
  ovpn: introduce the ovpn_peer object
  ovpn: introduce the ovpn_socket object
  ovpn: implement basic TX path (UDP)
  ovpn: implement basic RX path (UDP)
  ovpn: implement packet processing
  ovpn: store tunnel and transport statistics
  ovpn: implement TCP transport
  skb: implement skb_send_sock_locked_with_flags()
  ovpn: add support for MSG_NOSIGNAL in tcp_sendmsg
  ovpn: implement multi-peer support
  ovpn: implement peer lookup logic
  ovpn: implement keepalive mechanism
  ovpn: add support for updating local or remote UDP endpoint
  ovpn: implement peer add/get/dump/delete via netlink
  ovpn: implement key add/get/del/swap via netlink
  ovpn: kill key and notify userspace in case of IV exhaustion
  ovpn: notify userspace when a peer is deleted
  ovpn: add basic ethtool support
  testing/selftests: add test tool and scripts for ovpn module

 Documentation/netlink/specs/ovpn.yaml  |  367 +++
 Documentation/netlink/specs/rt_link.yaml   |   16 +
 MAINTAINERS|   11 +
 drivers/net/Kconfig|   15 +
 drivers/net/Makefile   |1 +
 drivers/net/ovpn/Makefile  |   22 +
 drivers/net/ovpn/bind.c|   55 +
 drivers/net/ovpn/bind.h|  101 +
 drivers/net/ovpn/crypto.c  |  210 ++
 drivers/net/ovpn/crypto.h  |  145 ++
 drivers/net/ovpn/crypto_aead.c |  383 
 drivers/net/ovpn/crypto_aead.h |   29 +
 drivers/net/ovpn/io.c  |  446 
 drivers/net/ovpn/io.h  |   34 +
 drivers/net/ovpn/main.c|  330 +++
 drivers/net/ovpn/main.h|   14 +
 drivers/net/ovpn/netlink-gen.c |  213 ++
 drivers/net/ovpn/netlink-gen.h |   41 +
 drivers/net/ovpn/netlink.c | 1258 ++
 drivers/net/ovpn/netlink.h |   18 +
 drivers/net/ovpn/ovpnpriv.h|   57 +
 drivers/net/ovpn/peer.c| 1364 +++
 drivers/net/ovpn/peer.h|  163 ++
 drivers/net/ovpn/pktid.c   |  129 ++
 drivers/net/ovpn/pktid.h   |   86 +
 drivers/net/ovpn/proto.h   |  118 +
 drivers/net/ovpn/skb.h |   61 +
 drivers/net/ovpn/socket.c  |  239 ++
 drivers/net/ovpn/socket.h  |   49 +
 drivers/net/ovpn/stats.c   |   21 +
 drivers/net/ovpn/stats.h   |   47 +
 drivers/net/ovpn/tcp.c |  598 +
 drivers/net/ovpn/tcp.h |   36 +
 drivers/net/ovpn/udp.c |  439 
 drivers/net/ovpn/udp.h |   25 +
 include/linux/skbuff.h |2 +
 include/uapi/linux/if_link.h   |   15 +
 include/uapi/linux/ovpn.h  |  109 +
 include/uapi/linux/udp.h   |1 +
 net/core/skbuff.c  |   18 +-
 net/ipv6/af_inet6.c|1 +
 tools/testing/selftests/Makefile 

[PATCH net-next v25 07/23] ovpn: implement basic TX path (UDP)

2025-04-07 Thread Antonio Quartulli
Packets sent over the ovpn interface are processed and transmitted to the
connected peer, if any.

Implementation is UDP only. TCP will be added by a later patch.

Note: no crypto/encapsulation exists yet. Packets are just captured and
sent.

Signed-off-by: Antonio Quartulli 
---
 drivers/net/Kconfig   |   1 +
 drivers/net/ovpn/io.c | 137 ++-
 drivers/net/ovpn/peer.c   |  32 +++
 drivers/net/ovpn/peer.h   |   2 +
 drivers/net/ovpn/skb.h|  55 +++
 drivers/net/ovpn/socket.c |   3 +-
 drivers/net/ovpn/udp.c| 233 +-
 drivers/net/ovpn/udp.h|   6 ++
 8 files changed, 465 insertions(+), 4 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 
2806fcc22a2dbd9b2985b09dd6ef65dd1dc4ebc1..305f04dd97234c4aa43da78217448b914cc7ede0
 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -120,6 +120,7 @@ config OVPN
depends on NET && INET
depends on IPV6 || !IPV6
select DST_CACHE
+   select NET_UDP_TUNNEL
help
  This module enhances the performance of the OpenVPN userspace software
  by offloading the data channel processing to kernelspace.
diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
index 
4b71c38165d7adbb1a2d1a64d27a13b7f76cfbfe..94b466bf2ef70d60d3e60d9820b64877c44f2e51
 100644
--- a/drivers/net/ovpn/io.c
+++ b/drivers/net/ovpn/io.c
@@ -9,14 +9,149 @@
 
 #include 
 #include 
+#include 
 
 #include "io.h"
+#include "ovpnpriv.h"
+#include "peer.h"
+#include "udp.h"
+#include "skb.h"
+#include "socket.h"
+
+static void ovpn_encrypt_post(struct sk_buff *skb, int ret)
+{
+   struct ovpn_peer *peer = ovpn_skb_cb(skb)->peer;
+   struct ovpn_socket *sock;
+
+   if (unlikely(ret < 0))
+   goto err;
+
+   skb_mark_not_on_list(skb);
+
+   rcu_read_lock();
+   sock = rcu_dereference(peer->sock);
+   if (unlikely(!sock))
+   goto err_unlock;
+
+   switch (sock->sock->sk->sk_protocol) {
+   case IPPROTO_UDP:
+   ovpn_udp_send_skb(peer, sock->sock, skb);
+   break;
+   default:
+   /* no transport configured yet */
+   goto err_unlock;
+   }
+   /* skb passed down the stack - don't free it */
+   skb = NULL;
+err_unlock:
+   rcu_read_unlock();
+err:
+   if (unlikely(skb))
+   dev_core_stats_tx_dropped_inc(peer->ovpn->dev);
+   ovpn_peer_put(peer);
+   kfree_skb(skb);
+}
+
+static bool ovpn_encrypt_one(struct ovpn_peer *peer, struct sk_buff *skb)
+{
+   ovpn_skb_cb(skb)->peer = peer;
+
+   /* take a reference to the peer because the crypto code may run async.
+* ovpn_encrypt_post() will release it upon completion
+*/
+   if (unlikely(!ovpn_peer_hold(peer))) {
+   DEBUG_NET_WARN_ON_ONCE(1);
+   return false;
+   }
+
+   ovpn_encrypt_post(skb, 0);
+   return true;
+}
+
+/* send skb to connected peer, if any */
+static void ovpn_send(struct ovpn_priv *ovpn, struct sk_buff *skb,
+ struct ovpn_peer *peer)
+{
+   struct sk_buff *curr, *next;
+
+   /* this might be a GSO-segmented skb list: process each skb
+* independently
+*/
+   skb_list_walk_safe(skb, curr, next) {
+   if (unlikely(!ovpn_encrypt_one(peer, curr))) {
+   dev_core_stats_tx_dropped_inc(ovpn->dev);
+   kfree_skb(curr);
+   }
+   }
+
+   ovpn_peer_put(peer);
+}
 
 /* Send user data to the network
  */
 netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+   struct ovpn_priv *ovpn = netdev_priv(dev);
+   struct sk_buff *segments, *curr, *next;
+   struct sk_buff_head skb_list;
+   struct ovpn_peer *peer;
+   __be16 proto;
+   int ret;
+
+   /* reset netfilter state */
+   nf_reset_ct(skb);
+
+   /* verify IP header size in network packet */
+   proto = ovpn_ip_check_protocol(skb);
+   if (unlikely(!proto || skb->protocol != proto))
+   goto drop;
+
+   if (skb_is_gso(skb)) {
+   segments = skb_gso_segment(skb, 0);
+   if (IS_ERR(segments)) {
+   ret = PTR_ERR(segments);
+   net_err_ratelimited("%s: cannot segment payload packet: 
%d\n",
+   netdev_name(dev), ret);
+   goto drop;
+   }
+
+   consume_skb(skb);
+   skb = segments;
+   }
+
+   /* from this moment on, "skb" might be a list */
+
+   __skb_queue_head_init(&skb_list);
+   skb_list_walk_safe(skb, curr, next) {
+   skb_mark_not_on_list(curr);
+
+   curr = skb_share_check(curr, GFP_ATOMIC);
+   if (unlikely(!curr)) {
+   net_err_ratelimited("%s: skb_share_check failed for 
payload packet\n",
+ 

[PATCH net-next v25 15/23] ovpn: implement peer lookup logic

2025-04-07 Thread Antonio Quartulli
In a multi-peer scenario there are a number of situations when a
specific peer needs to be looked up.

We may want to lookup a peer by:
1. its ID
2. its VPN destination IP
3. its transport IP/port couple

For each of the above, there is a specific routing table referencing all
peers for fast look up.

Case 2 is a bit special in the sense that an outgoing packet may not be
addressed to the peer's VPN IP directly, but rather to a network behind it.
For this reason we first perform a nexthop lookup in the system routing
table and then use the retrieved nexthop as the peer search key.
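
A concrete (purely illustrative) instance of case 2: with a kernel route like
"10.0.1.0/24 via 10.8.0.2 dev ovpn0", a packet addressed to 10.0.1.5 resolves
to the nexthop 10.8.0.2, and it is that address, not 10.0.1.5, that is used
as the key into the by-VPN-IP table to pick the peer.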

Signed-off-by: Antonio Quartulli 
---
 drivers/net/ovpn/peer.c | 301 ++--
 1 file changed, 291 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c
index 
bed2e591c000c4efecdcd92db484e590f97f9f7f..2d6cecc28c5d1f7d5516f7f89bc0ba274c72d5e1
 100644
--- a/drivers/net/ovpn/peer.c
+++ b/drivers/net/ovpn/peer.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ovpnpriv.h"
 #include "bind.h"
@@ -150,6 +151,121 @@ static int ovpn_peer_skb_to_sockaddr(struct sk_buff *skb,
return -1;
 }
 
+/**
+ * ovpn_nexthop_from_skb4 - retrieve IPv4 nexthop for outgoing skb
+ * @skb: the outgoing packet
+ *
+ * Return: the IPv4 of the nexthop
+ */
+static __be32 ovpn_nexthop_from_skb4(struct sk_buff *skb)
+{
+   const struct rtable *rt = skb_rtable(skb);
+
+   if (rt && rt->rt_uses_gateway)
+   return rt->rt_gw4;
+
+   return ip_hdr(skb)->daddr;
+}
+
+/**
+ * ovpn_nexthop_from_skb6 - retrieve IPv6 nexthop for outgoing skb
+ * @skb: the outgoing packet
+ *
+ * Return: the IPv6 of the nexthop
+ */
+static struct in6_addr ovpn_nexthop_from_skb6(struct sk_buff *skb)
+{
+   const struct rt6_info *rt = skb_rt6_info(skb);
+
+   if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
+   return ipv6_hdr(skb)->daddr;
+
+   return rt->rt6i_gateway;
+}
+
+/* variable name __tbl2 needs to be different from __tbl1
+ * in the macro below to avoid confusing clang
+ */
+#define ovpn_get_hash_slot(_tbl, _key, _key_len) ({\
+   typeof(_tbl) *__tbl2 = &(_tbl); \
+   jhash(_key, _key_len, 0) % HASH_SIZE(*__tbl2);  \
+})
+
+#define ovpn_get_hash_head(_tbl, _key, _key_len) ({\
+   typeof(_tbl) *__tbl1 = &(_tbl); \
+   &(*__tbl1)[ovpn_get_hash_slot(*__tbl1, _key, _key_len)];\
+})
+
+/**
+ * ovpn_peer_get_by_vpn_addr4 - retrieve peer by its VPN IPv4 address
+ * @ovpn: the openvpn instance to search
+ * @addr: VPN IPv4 to use as search key
+ *
+ * Refcounter is not increased for the returned peer.
+ *
+ * Return: the peer if found or NULL otherwise
+ */
+static struct ovpn_peer *ovpn_peer_get_by_vpn_addr4(struct ovpn_priv *ovpn,
+   __be32 addr)
+{
+   struct hlist_nulls_head *nhead;
+   struct hlist_nulls_node *ntmp;
+   struct ovpn_peer *tmp;
+   unsigned int slot;
+
+begin:
+   slot = ovpn_get_hash_slot(ovpn->peers->by_vpn_addr4, &addr,
+ sizeof(addr));
+   nhead = &ovpn->peers->by_vpn_addr4[slot];
+
+   hlist_nulls_for_each_entry_rcu(tmp, ntmp, nhead, hash_entry_addr4)
+   if (addr == tmp->vpn_addrs.ipv4.s_addr)
+   return tmp;
+
+   /* item may have moved during lookup - check nulls and restart
+* if that's the case
+*/
+   if (get_nulls_value(ntmp) != slot)
+   goto begin;
+
+   return NULL;
+}
+
+/**
+ * ovpn_peer_get_by_vpn_addr6 - retrieve peer by its VPN IPv6 address
+ * @ovpn: the openvpn instance to search
+ * @addr: VPN IPv6 to use as search key
+ *
+ * Refcounter is not increased for the returned peer.
+ *
+ * Return: the peer if found or NULL otherwise
+ */
+static struct ovpn_peer *ovpn_peer_get_by_vpn_addr6(struct ovpn_priv *ovpn,
+   struct in6_addr *addr)
+{
+   struct hlist_nulls_head *nhead;
+   struct hlist_nulls_node *ntmp;
+   struct ovpn_peer *tmp;
+   unsigned int slot;
+
+begin:
+   slot = ovpn_get_hash_slot(ovpn->peers->by_vpn_addr6, addr,
+ sizeof(*addr));
+   nhead = &ovpn->peers->by_vpn_addr6[slot];
+
+   hlist_nulls_for_each_entry_rcu(tmp, ntmp, nhead, hash_entry_addr6)
+   if (ipv6_addr_equal(addr, &tmp->vpn_addrs.ipv6))
+   return tmp;
+
+   /* item may have moved during lookup - check nulls and restart
+* if that's the case
+*/
+   if (get_nulls_value(ntmp) != slot)
+   goto begin;
+
+   return NULL;
+}
+
 /**
  * ovpn_peer_transp_match - check if sockaddr and peer binding match
  * @peer: the peer to get the binding from
@@ -227,14 +343,43 @@ ovpn_peer_get_by_transp_addr_p2p(struct ovpn_priv *ovpn,
 struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_priv *ovpn,

[PATCH net-next v25 13/23] ovpn: add support for MSG_NOSIGNAL in tcp_sendmsg

2025-04-07 Thread Antonio Quartulli
Userspace may want to pass the MSG_NOSIGNAL flag to
tcp_sendmsg() in order to avoid generating a SIGPIPE.

To pass this flag down the TCP stack a new skb sending API
accepting a flags argument is introduced.

Cc: Eric Dumazet 
Cc: Paolo Abeni 
Signed-off-by: Antonio Quartulli 
---
 drivers/net/ovpn/skb.h |  1 +
 drivers/net/ovpn/tcp.c | 12 
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ovpn/skb.h b/drivers/net/ovpn/skb.h
index 
bd3cbcfc770d2c28d234fcdd081b4d02e6496ea0..64430880f1dae33a41f698d713cf151be5b38577
 100644
--- a/drivers/net/ovpn/skb.h
+++ b/drivers/net/ovpn/skb.h
@@ -25,6 +25,7 @@ struct ovpn_cb {
struct scatterlist *sg;
u8 *iv;
unsigned int payload_offset;
+   bool nosignal;
 };
 
 static inline struct ovpn_cb *ovpn_skb_cb(struct sk_buff *skb)
diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c
index 
e643cd8a66350eb92c6785317440fcda6c5ab6eb..dde9707d74442a9a6a9e38631196d2c4a09a74f9
 100644
--- a/drivers/net/ovpn/tcp.c
+++ b/drivers/net/ovpn/tcp.c
@@ -220,6 +220,7 @@ void ovpn_tcp_socket_wait_finish(struct ovpn_socket *sock)
 static void ovpn_tcp_send_sock(struct ovpn_peer *peer, struct sock *sk)
 {
struct sk_buff *skb = peer->tcp.out_msg.skb;
+   int ret, flags;
 
if (!skb)
return;
@@ -230,9 +231,11 @@ static void ovpn_tcp_send_sock(struct ovpn_peer *peer, 
struct sock *sk)
peer->tcp.tx_in_progress = true;
 
do {
-   int ret = skb_send_sock_locked(sk, skb,
-  peer->tcp.out_msg.offset,
-  peer->tcp.out_msg.len);
+   flags = ovpn_skb_cb(skb)->nosignal ? MSG_NOSIGNAL : 0;
+   ret = skb_send_sock_locked_with_flags(sk, skb,
+ peer->tcp.out_msg.offset,
+ peer->tcp.out_msg.len,
+ flags);
if (unlikely(ret < 0)) {
if (ret == -EAGAIN)
goto out;
@@ -380,7 +383,7 @@ static int ovpn_tcp_sendmsg(struct sock *sk, struct msghdr 
*msg, size_t size)
rcu_read_unlock();
peer = sock->peer;
 
-   if (msg->msg_flags & ~MSG_DONTWAIT) {
+   if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL)) {
ret = -EOPNOTSUPP;
goto peer_free;
}
@@ -413,6 +416,7 @@ static int ovpn_tcp_sendmsg(struct sock *sk, struct msghdr 
*msg, size_t size)
goto peer_free;
}
 
+   ovpn_skb_cb(skb)->nosignal = msg->msg_flags & MSG_NOSIGNAL;
ovpn_tcp_send_sock_skb(peer, sk, skb);
ret = size;
 peer_free:

-- 
2.49.0
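
For completeness, a hedged sketch of the userspace side this enables
(standard socket API only; fd, ctrl_pkt and ctrl_len are hypothetical names
for the ovpn-attached TCP socket and a control-channel packet): passing
MSG_NOSIGNAL turns a dead connection into an EPIPE error instead of a
SIGPIPE:

#include <sys/socket.h>
#include <errno.h>

static int send_ctrl(int fd, const void *ctrl_pkt, size_t ctrl_len)
{
        ssize_t n = send(fd, ctrl_pkt, ctrl_len, MSG_NOSIGNAL);

        if (n < 0 && errno == EPIPE)
                return -1;      /* peer gone: reconnect instead of dying on SIGPIPE */
        return n < 0 ? -1 : 0;
}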




[PATCH net-next v25 14/23] ovpn: implement multi-peer support

2025-04-07 Thread Antonio Quartulli
With this change an ovpn instance will be able to stay connected to
multiple remote endpoints.

This functionality is strictly required when running ovpn on an
OpenVPN server.

Signed-off-by: Antonio Quartulli 
---
 drivers/net/ovpn/main.c |  64 +--
 drivers/net/ovpn/ovpnpriv.h |  19 +
 drivers/net/ovpn/peer.c | 189 ++--
 drivers/net/ovpn/peer.h |  12 ++-
 drivers/net/ovpn/udp.c  |   4 +-
 5 files changed, 272 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
index 
948c36129914ca45fb7118b821c7c60359e600f6..1b3d5b6c83202743a15f38029251bf102f45
 100644
--- a/drivers/net/ovpn/main.c
+++ b/drivers/net/ovpn/main.c
@@ -25,11 +25,66 @@
 #include "tcp.h"
 #include "udp.h"
 
+static void ovpn_priv_free(struct net_device *net)
+{
+   struct ovpn_priv *ovpn = netdev_priv(net);
+
+   kfree(ovpn->peers);
+}
+
+static int ovpn_mp_alloc(struct ovpn_priv *ovpn)
+{
+   struct in_device *dev_v4;
+   int i;
+
+   if (ovpn->mode != OVPN_MODE_MP)
+   return 0;
+
+   dev_v4 = __in_dev_get_rtnl(ovpn->dev);
+   if (dev_v4) {
+   /* disable redirects as Linux gets confused by ovpn
+* handling same-LAN routing.
+* This happens because a multipeer interface is used as
+* relay point between hosts in the same subnet, while
+* in a classic LAN this would not be needed because the
+* two hosts would be able to talk directly.
+*/
+   IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false);
+   IPV4_DEVCONF_ALL(dev_net(ovpn->dev), SEND_REDIRECTS) = false;
+   }
+
+   /* the peer container is fairly large, therefore we allocate it only in
+* MP mode
+*/
+   ovpn->peers = kzalloc(sizeof(*ovpn->peers), GFP_KERNEL);
+   if (!ovpn->peers)
+   return -ENOMEM;
+
+   for (i = 0; i < ARRAY_SIZE(ovpn->peers->by_id); i++) {
+   INIT_HLIST_HEAD(&ovpn->peers->by_id[i]);
+   INIT_HLIST_NULLS_HEAD(&ovpn->peers->by_vpn_addr4[i], i);
+   INIT_HLIST_NULLS_HEAD(&ovpn->peers->by_vpn_addr6[i], i);
+   INIT_HLIST_NULLS_HEAD(&ovpn->peers->by_transp_addr[i], i);
+   }
+
+   return 0;
+}
+
 static int ovpn_net_init(struct net_device *dev)
 {
struct ovpn_priv *ovpn = netdev_priv(dev);
+   int err = gro_cells_init(&ovpn->gro_cells, dev);
 
-   return gro_cells_init(&ovpn->gro_cells, dev);
+   if (err < 0)
+   return err;
+
+   err = ovpn_mp_alloc(ovpn);
+   if (err < 0) {
+   gro_cells_destroy(&ovpn->gro_cells);
+   return err;
+   }
+
+   return 0;
 }
 
 static void ovpn_net_uninit(struct net_device *dev)
@@ -91,6 +146,8 @@ static void ovpn_setup(struct net_device *dev)
 
dev->netdev_ops = &ovpn_netdev_ops;
 
+   dev->priv_destructor = ovpn_priv_free;
+
dev->hard_header_len = 0;
dev->addr_len = 0;
dev->mtu = ETH_DATA_LEN - OVPN_HEAD_ROOM;
@@ -187,10 +244,7 @@ static int ovpn_netdev_notifier_call(struct notifier_block 
*nb,
 
netif_carrier_off(dev);
ovpn->registered = false;
-
-   if (ovpn->mode == OVPN_MODE_P2P)
-   ovpn_peer_release_p2p(ovpn, NULL,
- OVPN_DEL_PEER_REASON_TEARDOWN);
+   ovpn_peers_free(ovpn, NULL, OVPN_DEL_PEER_REASON_TEARDOWN);
break;
case NETDEV_POST_INIT:
case NETDEV_GOING_DOWN:
diff --git a/drivers/net/ovpn/ovpnpriv.h b/drivers/net/ovpn/ovpnpriv.h
index 
2e3f4baf305f0b37b474d7b7d94751aa4af8a2ea..b26ad97215a3d42242ba349b348c2749f570797c
 100644
--- a/drivers/net/ovpn/ovpnpriv.h
+++ b/drivers/net/ovpn/ovpnpriv.h
@@ -15,12 +15,30 @@
 #include 
 #include 
 
+/**
+ * struct ovpn_peer_collection - container of peers for MultiPeer mode
+ * @by_id: table of peers index by ID
+ * @by_vpn_addr4: table of peers indexed by VPN IPv4 address (items can be
+ *   rehashed on the fly due to peer IP change)
+ * @by_vpn_addr6: table of peers indexed by VPN IPv6 address (items can be
+ *   rehashed on the fly due to peer IP change)
+ * @by_transp_addr: table of peers indexed by transport address (items can be
+ * rehashed on the fly due to peer IP change)
+ */
+struct ovpn_peer_collection {
+   DECLARE_HASHTABLE(by_id, 12);
+   struct hlist_nulls_head by_vpn_addr4[1 << 12];
+   struct hlist_nulls_head by_vpn_addr6[1 << 12];
+   struct hlist_nulls_head by_transp_addr[1 << 12];
+};
+
 /**
  * struct ovpn_priv - per ovpn interface state
  * @dev: the actual netdev representing the tunnel
  * @registered: whether dev is still registered with netdev or not
  * @mode: device operation mode (i.e. p2p, mp, ..)
  * @lock: protect this object
+ * @peers: data structures holding mu

[PATCH net-next v25 03/23] ovpn: add basic interface creation/destruction/management routines

2025-04-07 Thread Antonio Quartulli
Add basic infrastructure for handling ovpn interfaces.

Tested-by: Donald Hunter 
Signed-off-by: Antonio Quartulli 
---
 Documentation/netlink/specs/rt_link.yaml | 16 ++
 drivers/net/ovpn/Makefile|  1 +
 drivers/net/ovpn/io.c| 22 +++
 drivers/net/ovpn/io.h| 24 
 drivers/net/ovpn/main.c  | 99 ++--
 drivers/net/ovpn/ovpnpriv.h  |  7 +++
 drivers/net/ovpn/proto.h | 38 
 include/uapi/linux/if_link.h | 15 +
 8 files changed, 217 insertions(+), 5 deletions(-)

diff --git a/Documentation/netlink/specs/rt_link.yaml 
b/Documentation/netlink/specs/rt_link.yaml
index 
31238455f8e9d29531884cad4951391fa47ccfaf..a50d9d7d882e7e4f9de29b2a4e7acc602972f6b3
 100644
--- a/Documentation/netlink/specs/rt_link.yaml
+++ b/Documentation/netlink/specs/rt_link.yaml
@@ -938,6 +938,12 @@ definitions:
 entries:
   - name: none
   - name: default
+  -
+name: ovpn-mode
+type: enum
+entries:
+  - p2p
+  - mp
 
 attribute-sets:
   -
@@ -2272,6 +2278,13 @@ attribute-sets:
   -
 name: tailroom
 type: u16
+  -
+name: linkinfo-ovpn-attrs
+attributes:
+  -
+name: mode
+type: u8
+enum: ovpn-mode
 
 sub-messages:
   -
@@ -2322,6 +2335,9 @@ sub-messages:
   -
 value: netkit
 attribute-set: linkinfo-netkit-attrs
+  -
+value: ovpn
+attribute-set: linkinfo-ovpn-attrs
   -
 name: linkinfo-member-data-msg
 formats:
diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile
index 
75ac62bba02937bc49cb2a0dec5ca3cc31a8ee00..0e5f686672fb5052cee5a2c28797b70859514a7f
 100644
--- a/drivers/net/ovpn/Makefile
+++ b/drivers/net/ovpn/Makefile
@@ -8,5 +8,6 @@
 
 obj-$(CONFIG_OVPN) := ovpn.o
 ovpn-y += main.o
+ovpn-y += io.o
 ovpn-y += netlink.o
 ovpn-y += netlink-gen.o
diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
new file mode 100644
index 
..4b71c38165d7adbb1a2d1a64d27a13b7f76cfbfe
--- /dev/null
+++ b/drivers/net/ovpn/io.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+/*  OpenVPN data channel offload
+ *
+ *  Copyright (C) 2019-2025 OpenVPN, Inc.
+ *
+ *  Author:James Yonan 
+ * Antonio Quartulli 
+ */
+
+#include 
+#include 
+
+#include "io.h"
+
+/* Send user data to the network
+ */
+netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+   skb_tx_error(skb);
+   kfree_skb(skb);
+   return NET_XMIT_DROP;
+}
diff --git a/drivers/net/ovpn/io.h b/drivers/net/ovpn/io.h
new file mode 100644
index 
..afea5f81f5628dcb9afda9a78974bbf6f2101c13
--- /dev/null
+++ b/drivers/net/ovpn/io.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* OpenVPN data channel offload
+ *
+ *  Copyright (C) 2019-2025 OpenVPN, Inc.
+ *
+ *  Author:James Yonan 
+ * Antonio Quartulli 
+ */
+
+#ifndef _NET_OVPN_OVPN_H_
+#define _NET_OVPN_OVPN_H_
+
+/* DATA_V2 header size with AEAD encryption */
+#define OVPN_HEAD_ROOM (OVPN_OPCODE_SIZE + OVPN_NONCE_WIRE_SIZE + \
+   16 /* AEAD TAG length */ + \
+   max(sizeof(struct udphdr), sizeof(struct tcphdr)) +\
+   max(sizeof(struct ipv6hdr), sizeof(struct iphdr)))
+
+/* max padding required by encryption */
+#define OVPN_MAX_PADDING 16
+
+netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev);
+
+#endif /* _NET_OVPN_OVPN_H_ */
diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
index 
28133e7e15e74b8a4a937ed03f70d9f83d7a14c8..b19f1406d87d5a1ed45b00133d642b1ad9f4f6f7
 100644
--- a/drivers/net/ovpn/main.c
+++ b/drivers/net/ovpn/main.c
@@ -10,14 +10,28 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
-#include 
+#include 
 
 #include "ovpnpriv.h"
 #include "main.h"
 #include "netlink.h"
+#include "io.h"
+#include "proto.h"
 
 static const struct net_device_ops ovpn_netdev_ops = {
+   .ndo_start_xmit = ovpn_net_xmit,
+};
+
+static const struct device_type ovpn_type = {
+   .name = OVPN_FAMILY_NAME,
+};
+
+static const struct nla_policy ovpn_policy[IFLA_OVPN_MAX + 1] = {
+   [IFLA_OVPN_MODE] = NLA_POLICY_RANGE(NLA_U8, OVPN_MODE_P2P,
+   OVPN_MODE_MP),
 };
 
 /**
@@ -31,44 +45,119 @@ bool ovpn_dev_is_valid(const struct net_device *dev)
return dev->netdev_ops == &ovpn_netdev_ops;
 }
 
+static void ovpn_setup(struct net_device *dev)
+{
+   netdev_features_t feat = NETIF_F_SG | NETIF_F_GSO |
+NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA;
+
+   dev->needs_free_netdev = true;
+
+   dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
+
+   dev->netdev_ops = &ovpn_netdev_ops;
+
+   dev->hard_header_len = 0;
+   dev->addr_len = 0;
+ 

[PATCH net-next v25 02/23] ovpn: add basic netlink support

2025-04-07 Thread Antonio Quartulli
This commit introduces basic netlink support with family
registration/unregistration functionalities and stub pre/post-doit.

More importantly it introduces the YAML uAPI description along
with its auto-generated files:
- include/uapi/linux/ovpn.h
- drivers/net/ovpn/netlink-gen.c
- drivers/net/ovpn/netlink-gen.h

Reviewed-by: Donald Hunter 
Signed-off-by: Antonio Quartulli 
---
 Documentation/netlink/specs/ovpn.yaml | 367 ++
 MAINTAINERS   |   2 +
 drivers/net/ovpn/Makefile |   2 +
 drivers/net/ovpn/main.c   |  17 +-
 drivers/net/ovpn/main.h   |  14 ++
 drivers/net/ovpn/netlink-gen.c| 213 
 drivers/net/ovpn/netlink-gen.h|  41 
 drivers/net/ovpn/netlink.c| 160 +++
 drivers/net/ovpn/netlink.h|  15 ++
 drivers/net/ovpn/ovpnpriv.h   |  21 ++
 include/uapi/linux/ovpn.h | 109 ++
 11 files changed, 960 insertions(+), 1 deletion(-)

diff --git a/Documentation/netlink/specs/ovpn.yaml 
b/Documentation/netlink/specs/ovpn.yaml
new file mode 100644
index 
..6f131bead88f83a9d5a957190001a815f76395be
--- /dev/null
+++ b/Documentation/netlink/specs/ovpn.yaml
@@ -0,0 +1,367 @@
+# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+#
+# Author: Antonio Quartulli 
+#
+# Copyright (c) 2024-2025, OpenVPN Inc.
+#
+
+name: ovpn
+
+protocol: genetlink
+
+doc: Netlink protocol to control OpenVPN network devices
+
+definitions:
+  -
+type: const
+name: nonce-tail-size
+value: 8
+  -
+type: enum
+name: cipher-alg
+entries: [ none, aes-gcm, chacha20-poly1305 ]
+  -
+type: enum
+name: del-peer-reason
+entries:
+  - teardown
+  - userspace
+  - expired
+  - transport-error
+  - transport-disconnect
+  -
+type: enum
+name: key-slot
+entries: [ primary, secondary ]
+
+attribute-sets:
+  -
+name: peer
+attributes:
+  -
+name: id
+type: u32
+doc: >-
+  The unique ID of the peer in the device context. To be used to 
identify
+  peers during operations for a specific device
+checks:
+  max: 0xFF
+  -
+name: remote-ipv4
+type: u32
+doc: The remote IPv4 address of the peer
+byte-order: big-endian
+display-hint: ipv4
+  -
+name: remote-ipv6
+type: binary
+doc: The remote IPv6 address of the peer
+display-hint: ipv6
+checks:
+  exact-len: 16
+  -
+name: remote-ipv6-scope-id
+type: u32
+doc: The scope id of the remote IPv6 address of the peer (RFC2553)
+  -
+name: remote-port
+type: u16
+doc: The remote port of the peer
+byte-order: big-endian
+checks:
+  min: 1
+  -
+name: socket
+type: u32
+doc: The socket to be used to communicate with the peer
+  -
+name: socket-netnsid
+type: s32
+doc: The ID of the netns the socket assigned to this peer lives in
+  -
+name: vpn-ipv4
+type: u32
+doc: The IPv4 address assigned to the peer by the server
+byte-order: big-endian
+display-hint: ipv4
+  -
+name: vpn-ipv6
+type: binary
+doc: The IPv6 address assigned to the peer by the server
+display-hint: ipv6
+checks:
+  exact-len: 16
+  -
+name: local-ipv4
+type: u32
+doc: The local IPv4 to be used to send packets to the peer (UDP only)
+byte-order: big-endian
+display-hint: ipv4
+  -
+name: local-ipv6
+type: binary
+doc: The local IPv6 to be used to send packets to the peer (UDP only)
+display-hint: ipv6
+checks:
+  exact-len: 16
+  -
+name: local-port
+type: u16
+doc: The local port to be used to send packets to the peer (UDP only)
+byte-order: big-endian
+checks:
+  min: 1
+  -
+name: keepalive-interval
+type: u32
+doc: >-
+  The number of seconds after which a keep alive message is sent to the
+  peer
+  -
+name: keepalive-timeout
+type: u32
+doc: >-
+  The number of seconds from the last activity after which the peer is
+  assumed dead
+  -
+name: del-reason
+type: u32
+doc: The reason why a peer was deleted
+enum: del-peer-reason
+  -
+name: vpn-rx-bytes
+type: uint
+doc: Number of bytes received over the tunnel
+  -
+name: vpn-tx-bytes
+type: uint
+doc: Number of bytes transmitted over the tunnel
+  -
+name: vpn-rx-packets
+type: uint
+doc: Number of packets received over the tunnel
+  

[PATCH net-next v25 01/23] net: introduce OpenVPN Data Channel Offload (ovpn)

2025-04-07 Thread Antonio Quartulli
OpenVPN is userspace software, available since around 2005, that allows
users to create secure tunnels.

So far OpenVPN has implemented all operations in userspace, which
implies several round trips between kernel and userspace in order to
process packets (encapsulate/decapsulate, encrypt/decrypt, rerouting..).

With `ovpn` we intend to move the fast path (data channel) entirely
in kernel space and thus improve user measured throughput over the
tunnel.

`ovpn` is implemented as a simple virtual network device driver, that
can be manipulated by means of the standard RTNL APIs. A device of kind
`ovpn` allows only IPv4/6 traffic and can be of type:
* P2P (peer-to-peer): any packet sent over the interface will be
  encapsulated and transmitted to the other side (typical OpenVPN
  client or peer-to-peer behaviour);
* P2MP (point-to-multipoint): packets sent over the interface are
  transmitted to peers based on existing routes (typical OpenVPN
  server behaviour).

After the interface has been created, OpenVPN in userspace can
configure it using a new Netlink API. Specifically it is possible
to manage peers and their keys.

The OpenVPN control channel is multiplexed over the same transport
socket by means of OP codes. Anything that is not DATA_V2 (OpenVPN
OP code for data traffic) is sent to userspace and handled there.
This way the `ovpn` codebase is kept as compact as possible while
focusing on handling data traffic only (fast path).
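
That demultiplexing hinges on the OP code carried in the upper 5 bits of the
first payload byte; P_DATA_V2 is opcode 9. A hedged sketch of the check, not
the actual ovpn helper:

/* illustration only: keep DATA_V2 in the kernel, punt the rest to userspace */
static inline bool payload_is_data_v2(const unsigned char *payload)
{
        return (payload[0] >> 3) == 9;  /* 9 == P_DATA_V2 */
}

Packets for which this returns false are left to the transport socket, so the
userspace OpenVPN process keeps running the control channel unchanged.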

Any OpenVPN control feature (like cipher negotiation, TLS handshake,
rekeying, etc.) is still fully handled by the userspace process.

When userspace establishes a new connection with a peer, it first
performs the handshake and then passes the socket to the `ovpn` kernel
module, which takes ownership. From this moment on `ovpn` will handle
data traffic for the new peer.
When control packets are received on the link, they are forwarded to
userspace through the same transport socket they were received on, as
userspace is still listening to them.

Some events (like peer deletion) are sent to a Netlink multicast group.

Although it wasn't easy to convince the community, `ovpn` implements
only a limited number of the data-channel features supported by the
userspace program.

Each feature that made it into `ovpn` was carefully vetted to
avoid carrying too much legacy along (and to make a clean cut with
old and probably-not-so-useful features).

Notably, only encryption using AEAD ciphers (specifically
ChaCha20Poly1305 and AES-GCM) was implemented. Supporting any other
cipher out there was not deemed useful.

Both UDP and TCP sockets are supported.

As explained above, in P2MP mode, OpenVPN will use the main system
routing table to decide which packet goes to which peer. This implies
that no routing table was re-implemented in the `ovpn` kernel module.

This kernel module can be enabled by selecting the CONFIG_OVPN entry
in the networking drivers section.

NOTE: this first patch introduces the very basic framework only.
Features are then added patch by patch; although each patch compiles and
should not break at runtime, the ovpn module is only expected to be fully
functional once the whole series has been applied.

Cc: steffen.klass...@secunet.com
Cc: antony.ant...@secunet.com
Signed-off-by: Antonio Quartulli 
---
 MAINTAINERS   |   8 
 drivers/net/Kconfig   |   8 
 drivers/net/Makefile  |   1 +
 drivers/net/ovpn/Makefile |  10 +
 drivers/net/ovpn/main.c   | 112 ++
 5 files changed, 139 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 
4c5c2e2c127877a8283793637b0e935ceec27aff..599e821b64131e0b63f5f14be1a62e9ff570063a
 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18130,6 +18130,14 @@ F: arch/openrisc/
 F: drivers/irqchip/irq-ompic.c
 F: drivers/irqchip/irq-or1k-*
 
+OPENVPN DATA CHANNEL OFFLOAD
+M: Antonio Quartulli 
+L: openvpn-de...@lists.sourceforge.net (subscribers-only)
+L: net...@vger.kernel.org
+S: Supported
+T: git https://github.com/OpenVPN/linux-kernel-ovpn.git
+F: drivers/net/ovpn/
+
 OPENVSWITCH
 M: Aaron Conole 
 M: Eelco Chaudron 
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 
271520510b5fc6866bbf4fc6a0d728d110e6b5e4..5fbe25ae1e11e558aa9aaa857cf110127e459854
 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -115,6 +115,14 @@ config WIREGUARD_DEBUG
 
  Say N here unless you know what you're doing.
 
+config OVPN
+   tristate "OpenVPN data channel offload"
+   depends on NET && INET
+   depends on IPV6 || !IPV6
+   help
+ This module enhances the performance of the OpenVPN userspace software
+ by offloading the data channel processing to kernelspace.
+
 config EQUALIZER
tristate "EQL (serial line load balancing) support"
help
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 
75333251a01a87e18d6c1bc9eec97b96b571b77b..73bc6

[PATCH net-next v25 17/23] ovpn: add support for updating local or remote UDP endpoint

2025-04-07 Thread Antonio Quartulli
In case of UDP links, the local or remote endpoint used to communicate
with a given peer may change without a connection restart.

Add support for learning the new address in case of change.
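
Conceptually, the check performed on every decrypted packet is a comparison
of the packet's outer source address/port against the peer's current
binding. A simplified IPv4-only sketch of that idea (the real code,
partially shown in the diff below, does quite a bit more):

#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/skbuff.h>

/* Simplified sketch of the IPv4 "float" check (assumes ovpn's bind.h and a
 * validated transport header): did the packet's outer source address/port
 * diverge from the peer's current binding? The real
 * ovpn_peer_endpoints_update() also updates the local endpoint, handles
 * IPv6 and rehashes the peer.
 */
static bool ovpn_sketch_remote_changed_v4(const struct ovpn_bind *bind,
                                          const struct sk_buff *skb)
{
        return bind->remote.in4.sin_addr.s_addr != ip_hdr(skb)->saddr ||
               bind->remote.in4.sin_port != udp_hdr(skb)->source;
}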

Signed-off-by: Antonio Quartulli 
---
 drivers/net/ovpn/io.c   |   8 ++
 drivers/net/ovpn/peer.c | 213 +---
 drivers/net/ovpn/peer.h |   2 +
 3 files changed, 210 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
index 
680c7692b09b61247da3817c18d80ddfdc2814a4..07be4edf0dda060de2ce4161e323a2c2ee40591d
 100644
--- a/drivers/net/ovpn/io.c
+++ b/drivers/net/ovpn/io.c
@@ -96,6 +96,7 @@ void ovpn_decrypt_post(void *data, int ret)
struct ovpn_crypto_key_slot *ks;
unsigned int payload_offset = 0;
struct sk_buff *skb = data;
+   struct ovpn_socket *sock;
struct ovpn_peer *peer;
__be16 proto;
__be32 *pid;
@@ -131,6 +132,13 @@ void ovpn_decrypt_post(void *data, int ret)
/* keep track of last received authenticated packet for keepalive */
WRITE_ONCE(peer->last_recv, ktime_get_real_seconds());
 
+   rcu_read_lock();
+   sock = rcu_dereference(peer->sock);
+   if (sock && sock->sock->sk->sk_protocol == IPPROTO_UDP)
+   /* check if this peer changed local or remote endpoint */
+   ovpn_peer_endpoints_update(peer, skb);
+   rcu_read_unlock();
+
/* point to encapsulated IP packet */
__skb_pull(skb, payload_offset);
 
diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c
index 
45e87ac155b554044388490a403f64c777d283a6..0d8b12fd5de4cd6fe15455b435c7d6807203a825
 100644
--- a/drivers/net/ovpn/peer.c
+++ b/drivers/net/ovpn/peer.c
@@ -127,6 +127,206 @@ struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, 
u32 id)
return peer;
 }
 
+/**
+ * ovpn_peer_reset_sockaddr - recreate binding for peer
+ * @peer: peer to recreate the binding for
+ * @ss: sockaddr to use as remote endpoint for the binding
+ * @local_ip: local IP for the binding
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+static int ovpn_peer_reset_sockaddr(struct ovpn_peer *peer,
+   const struct sockaddr_storage *ss,
+   const void *local_ip)
+{
+   struct ovpn_bind *bind;
+   size_t ip_len;
+
+   lockdep_assert_held(&peer->lock);
+
+   /* create new ovpn_bind object */
+   bind = ovpn_bind_from_sockaddr(ss);
+   if (IS_ERR(bind))
+   return PTR_ERR(bind);
+
+   if (ss->ss_family == AF_INET) {
+   ip_len = sizeof(struct in_addr);
+   } else if (ss->ss_family == AF_INET6) {
+   ip_len = sizeof(struct in6_addr);
+   } else {
+   net_dbg_ratelimited("%s: invalid family %u for remote endpoint 
for peer %u\n",
+   netdev_name(peer->ovpn->dev),
+   ss->ss_family, peer->id);
+   kfree(bind);
+   return -EINVAL;
+   }
+
+   memcpy(&bind->local, local_ip, ip_len);
+
+   /* set binding */
+   ovpn_bind_reset(peer, bind);
+
+   return 0;
+}
+
+/* variable name __tbl2 needs to be different from __tbl1
+ * in the macro below to avoid confusing clang
+ */
+#define ovpn_get_hash_slot(_tbl, _key, _key_len) ({\
+   typeof(_tbl) *__tbl2 = &(_tbl); \
+   jhash(_key, _key_len, 0) % HASH_SIZE(*__tbl2);  \
+})
+
+#define ovpn_get_hash_head(_tbl, _key, _key_len) ({\
+   typeof(_tbl) *__tbl1 = &(_tbl); \
+   &(*__tbl1)[ovpn_get_hash_slot(*__tbl1, _key, _key_len)];\
+})
+
+/**
+ * ovpn_peer_endpoints_update - update remote or local endpoint for peer
+ * @peer: peer to update the remote endpoint for
+ * @skb: incoming packet to retrieve the source/destination address from
+ */
+void ovpn_peer_endpoints_update(struct ovpn_peer *peer, struct sk_buff *skb)
+{
+   struct hlist_nulls_head *nhead;
+   struct sockaddr_storage ss;
+   struct sockaddr_in6 *sa6;
+   bool reset_cache = false;
+   struct sockaddr_in *sa;
+   struct ovpn_bind *bind;
+   const void *local_ip;
+   size_t salen = 0;
+
+   spin_lock_bh(&peer->lock);
+   bind = rcu_dereference_protected(peer->bind,
+lockdep_is_held(&peer->lock));
+   if (unlikely(!bind))
+   goto unlock;
+
+   switch (skb->protocol) {
+   case htons(ETH_P_IP):
+   /* float check */
+   if (unlikely(!ovpn_bind_skb_src_match(bind, skb))) {
+   /* unconditionally save local endpoint in case
+* of float, as it may have changed as well
+*/
+   local_ip = &ip_hdr(skb)->daddr;
+   sa = (struct sockaddr_in *)&ss;
+   sa->sin_family = AF_INET;
+  

[PATCH net-next v25 22/23] ovpn: add basic ethtool support

2025-04-07 Thread Antonio Quartulli
Implement support for basic ethtool functionality.

Note that ovpn is a virtual device driver, therefore
various ethtool APIs are just not meaningful and thus
not implemented.

Signed-off-by: Antonio Quartulli 
Reviewed-by: Andrew Lunn 
---
 drivers/net/ovpn/main.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
index 
0e016d39c95e52515437313064e892aa3038adad..46abbbed384a4a98e87fadf5a3d749adb78d1de2
 100644
--- a/drivers/net/ovpn/main.c
+++ b/drivers/net/ovpn/main.c
@@ -7,6 +7,7 @@
  * James Yonan 
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -135,6 +136,19 @@ bool ovpn_dev_is_valid(const struct net_device *dev)
return dev->netdev_ops == &ovpn_netdev_ops;
 }
 
+static void ovpn_get_drvinfo(struct net_device *dev,
+struct ethtool_drvinfo *info)
+{
+   strscpy(info->driver, "ovpn", sizeof(info->driver));
+   strscpy(info->bus_info, "ovpn", sizeof(info->bus_info));
+}
+
+static const struct ethtool_ops ovpn_ethtool_ops = {
+   .get_drvinfo= ovpn_get_drvinfo,
+   .get_link   = ethtool_op_get_link,
+   .get_ts_info= ethtool_op_get_ts_info,
+};
+
 static void ovpn_setup(struct net_device *dev)
 {
netdev_features_t feat = NETIF_F_SG | NETIF_F_GSO |
@@ -144,6 +158,7 @@ static void ovpn_setup(struct net_device *dev)
 
dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
 
+   dev->ethtool_ops = &ovpn_ethtool_ops;
dev->netdev_ops = &ovpn_netdev_ops;
 
dev->priv_destructor = ovpn_priv_free;

-- 
2.49.0




[PATCH net-next v25 08/23] ovpn: implement basic RX path (UDP)

2025-04-07 Thread Antonio Quartulli
Packets received over the socket are forwarded to the user device.

Implementation is UDP only. TCP will be added by a later patch.

Note: no decryption/decapsulation exists yet, packets are forwarded as
they arrive without much processing.

Signed-off-by: Antonio Quartulli 
---
 drivers/net/ovpn/io.c   |  64 -
 drivers/net/ovpn/io.h   |   2 +
 drivers/net/ovpn/main.c |  14 -
 drivers/net/ovpn/ovpnpriv.h |   3 +
 drivers/net/ovpn/proto.h|  50 +++-
 drivers/net/ovpn/socket.c   |  13 -
 drivers/net/ovpn/socket.h   |   9 ++-
 drivers/net/ovpn/udp.c  | 135 
 8 files changed, 285 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
index 
94b466bf2ef70d60d3e60d9820b64877c44f2e51..46ad27e8eb8425f810c7d2b6c63984ea008d90fa
 100644
--- a/drivers/net/ovpn/io.c
+++ b/drivers/net/ovpn/io.c
@@ -9,15 +9,77 @@
 
 #include 
 #include 
+#include 
 #include 
 
-#include "io.h"
 #include "ovpnpriv.h"
 #include "peer.h"
+#include "io.h"
+#include "netlink.h"
+#include "proto.h"
 #include "udp.h"
 #include "skb.h"
 #include "socket.h"
 
+/* Called after decrypt to write the IP packet to the device.
+ * This method is expected to manage/free the skb.
+ */
+static void ovpn_netdev_write(struct ovpn_peer *peer, struct sk_buff *skb)
+{
+   unsigned int pkt_len;
+   int ret;
+
+   /* we can't guarantee the packet wasn't corrupted before entering the
+* VPN, therefore we give other layers a chance to check that
+*/
+   skb->ip_summed = CHECKSUM_NONE;
+
+   /* skb hash for transport packet no longer valid after decapsulation */
+   skb_clear_hash(skb);
+
+   /* post-decrypt scrub -- prepare to inject encapsulated packet onto the
+* interface, based on __skb_tunnel_rx() in dst.h
+*/
+   skb->dev = peer->ovpn->dev;
+   skb_set_queue_mapping(skb, 0);
+   skb_scrub_packet(skb, true);
+
+   skb_reset_network_header(skb);
+   skb_reset_transport_header(skb);
+   skb_reset_inner_headers(skb);
+
+   /* cause packet to be "received" by the interface */
+   pkt_len = skb->len;
+   ret = gro_cells_receive(&peer->ovpn->gro_cells, skb);
+   if (likely(ret == NET_RX_SUCCESS))
+   /* update RX stats with the size of decrypted packet */
+   dev_sw_netstats_rx_add(peer->ovpn->dev, pkt_len);
+}
+
+static void ovpn_decrypt_post(struct sk_buff *skb, int ret)
+{
+   struct ovpn_peer *peer = ovpn_skb_cb(skb)->peer;
+
+   if (unlikely(ret < 0))
+   goto drop;
+
+   ovpn_netdev_write(peer, skb);
+   /* skb is passed to upper layer - don't free it */
+   skb = NULL;
+drop:
+   if (unlikely(skb))
+   dev_core_stats_rx_dropped_inc(peer->ovpn->dev);
+   ovpn_peer_put(peer);
+   kfree_skb(skb);
+}
+
+/* RX path entry point: decrypt packet and forward it to the device */
+void ovpn_recv(struct ovpn_peer *peer, struct sk_buff *skb)
+{
+   ovpn_skb_cb(skb)->peer = peer;
+   ovpn_decrypt_post(skb, 0);
+}
+
 static void ovpn_encrypt_post(struct sk_buff *skb, int ret)
 {
struct ovpn_peer *peer = ovpn_skb_cb(skb)->peer;
diff --git a/drivers/net/ovpn/io.h b/drivers/net/ovpn/io.h
index 
afea5f81f5628dcb9afda9a78974bbf6f2101c13..1cfa66873a2d4840ce57e337f8b4e8143e8b8e79
 100644
--- a/drivers/net/ovpn/io.h
+++ b/drivers/net/ovpn/io.h
@@ -21,4 +21,6 @@
 
 netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev);
 
+void ovpn_recv(struct ovpn_peer *peer, struct sk_buff *skb);
+
 #endif /* _NET_OVPN_OVPN_H_ */
diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
index 
6f3f649bdf2d8da352bdea94a368ea95f153af68..f6219b05485f03c3b2309db8661e38700f6b8b1c
 100644
--- a/drivers/net/ovpn/main.c
+++ b/drivers/net/ovpn/main.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -21,10 +22,20 @@
 #include "io.h"
 #include "peer.h"
 #include "proto.h"
+#include "udp.h"
 
 static int ovpn_net_init(struct net_device *dev)
 {
-   return 0;
+   struct ovpn_priv *ovpn = netdev_priv(dev);
+
+   return gro_cells_init(&ovpn->gro_cells, dev);
+}
+
+static void ovpn_net_uninit(struct net_device *dev)
+{
+   struct ovpn_priv *ovpn = netdev_priv(dev);
+
+   gro_cells_destroy(&ovpn->gro_cells);
 }
 
 static int ovpn_net_open(struct net_device *dev)
@@ -43,6 +54,7 @@ static int ovpn_net_open(struct net_device *dev)
 
 static const struct net_device_ops ovpn_netdev_ops = {
.ndo_init   = ovpn_net_init,
+   .ndo_uninit = ovpn_net_uninit,
.ndo_open   = ovpn_net_open,
.ndo_start_xmit = ovpn_net_xmit,
 };
diff --git a/drivers/net/ovpn/ovpnpriv.h b/drivers/net/ovpn/ovpnpriv.h
index 
fae2682b424b03222a5ce881a4a1b4518a7ff311..9d0640e9c71e7fd494e3d9df155732bd5d82463e
 100644
--- a/drivers/net/ovpn/ovpnpriv.h
+++ b/drivers/

[PATCH net-next v25 16/23] ovpn: implement keepalive mechanism

2025-04-07 Thread Antonio Quartulli
OpenVPN supports configuring a periodic keepalive message to allow the
remote endpoint to detect link failures.

This change implements the keepalive sending and timer expiring logic.
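
The timer half lives in peer.c (truncated below): a delayed work item
compares the last_sent/last_recv timestamps against the configured interval
and timeout. A rough sketch of the per-peer check, with locking and
reference handling omitted; the keepalive_interval/keepalive_timeout fields,
ovpn_peer_del() and OVPN_DEL_PEER_REASON_EXPIRED are assumptions modelled on
the series, not quotes from it:

#include <linux/ktime.h>

/* Simplified per-peer expiry check; the real worker also walks all peers,
 * takes the proper locks/references and rearms itself for the next deadline.
 */
static void ovpn_sketch_keepalive_check(struct ovpn_peer *peer, time64_t now)
{
        time64_t last_recv = READ_ONCE(peer->last_recv);
        time64_t last_sent = READ_ONCE(peer->last_sent);

        /* no authenticated traffic for too long: consider the peer dead */
        if (peer->keepalive_timeout &&
            now - last_recv >= peer->keepalive_timeout) {
                ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_EXPIRED);
                return;
        }

        /* link idle: push an out-of-band keepalive message */
        if (peer->keepalive_interval &&
            now - last_sent >= peer->keepalive_interval)
                ovpn_xmit_special(peer, ovpn_keepalive_message,
                                  sizeof(ovpn_keepalive_message));
}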

Signed-off-by: Antonio Quartulli 
---
 drivers/net/ovpn/io.c   |  78 -
 drivers/net/ovpn/io.h   |   5 ++
 drivers/net/ovpn/main.c |   3 +
 drivers/net/ovpn/ovpnpriv.h |   2 +
 drivers/net/ovpn/peer.c | 205 
 drivers/net/ovpn/peer.h |  21 -
 6 files changed, 311 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
index 
9833d711070a4b476a02a96a6ad56030485c2048..680c7692b09b61247da3817c18d80ddfdc2814a4
 100644
--- a/drivers/net/ovpn/io.c
+++ b/drivers/net/ovpn/io.c
@@ -27,6 +27,33 @@
 #include "skb.h"
 #include "socket.h"
 
+const unsigned char ovpn_keepalive_message[OVPN_KEEPALIVE_SIZE] = {
+   0x2a, 0x18, 0x7b, 0xf3, 0x64, 0x1e, 0xb4, 0xcb,
+   0x07, 0xed, 0x2d, 0x0a, 0x98, 0x1f, 0xc7, 0x48
+};
+
+/**
+ * ovpn_is_keepalive - check if skb contains a keepalive message
+ * @skb: packet to check
+ *
+ * Assumes that the first byte of skb->data is defined.
+ *
+ * Return: true if skb contains a keepalive or false otherwise
+ */
+static bool ovpn_is_keepalive(struct sk_buff *skb)
+{
+   if (*skb->data != ovpn_keepalive_message[0])
+   return false;
+
+   if (skb->len != OVPN_KEEPALIVE_SIZE)
+   return false;
+
+   if (!pskb_may_pull(skb, OVPN_KEEPALIVE_SIZE))
+   return false;
+
+   return !memcmp(skb->data, ovpn_keepalive_message, OVPN_KEEPALIVE_SIZE);
+}
+
 /* Called after decrypt to write the IP packet to the device.
  * This method is expected to manage/free the skb.
  */
@@ -101,6 +128,9 @@ void ovpn_decrypt_post(void *data, int ret)
goto drop;
}
 
+   /* keep track of last received authenticated packet for keepalive */
+   WRITE_ONCE(peer->last_recv, ktime_get_real_seconds());
+
/* point to encapsulated IP packet */
__skb_pull(skb, payload_offset);
 
@@ -118,6 +148,15 @@ void ovpn_decrypt_post(void *data, int ret)
goto drop;
}
 
+   if (ovpn_is_keepalive(skb)) {
+   net_dbg_ratelimited("%s: ping received from peer %u\n",
+   netdev_name(peer->ovpn->dev),
+   peer->id);
+   /* we drop the packet, but this is not a failure */
+   consume_skb(skb);
+   goto drop_nocount;
+   }
+
net_info_ratelimited("%s: unsupported protocol received from 
peer %u\n",
 netdev_name(peer->ovpn->dev), peer->id);
goto drop;
@@ -143,11 +182,12 @@ void ovpn_decrypt_post(void *data, int ret)
 drop:
if (unlikely(skb))
dev_core_stats_rx_dropped_inc(peer->ovpn->dev);
+   kfree_skb(skb);
+drop_nocount:
if (likely(peer))
ovpn_peer_put(peer);
if (likely(ks))
ovpn_crypto_key_slot_put(ks);
-   kfree_skb(skb);
 }
 
 /* RX path entry point: decrypt packet and forward it to the device */
@@ -221,6 +261,8 @@ void ovpn_encrypt_post(void *data, int ret)
}
 
ovpn_peer_stats_increment_tx(&peer->link_stats, orig_len);
+   /* keep track of last sent packet for keepalive */
+   WRITE_ONCE(peer->last_sent, ktime_get_real_seconds());
/* skb passed down the stack - don't free it */
skb = NULL;
 err_unlock:
@@ -346,3 +388,37 @@ netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct 
net_device *dev)
kfree_skb_list(skb);
return NET_XMIT_DROP;
 }
+
+/**
+ * ovpn_xmit_special - encrypt and transmit an out-of-band message to peer
+ * @peer: peer to send the message to
+ * @data: message content
+ * @len: message length
+ *
+ * Assumes that caller holds a reference to peer, which will be
+ * passed to ovpn_send()
+ */
+void ovpn_xmit_special(struct ovpn_peer *peer, const void *data,
+  const unsigned int len)
+{
+   struct ovpn_priv *ovpn;
+   struct sk_buff *skb;
+
+   ovpn = peer->ovpn;
+   if (unlikely(!ovpn)) {
+   ovpn_peer_put(peer);
+   return;
+   }
+
+   skb = alloc_skb(256 + len, GFP_ATOMIC);
+   if (unlikely(!skb)) {
+   ovpn_peer_put(peer);
+   return;
+   }
+
+   skb_reserve(skb, 128);
+   skb->priority = TC_PRIO_BESTEFFORT;
+   __skb_put_data(skb, data, len);
+
+   ovpn_send(ovpn, skb, peer);
+}
diff --git a/drivers/net/ovpn/io.h b/drivers/net/ovpn/io.h
index 
5143104b2c4b896a030ec4a8c8aea7015f40ef02..db9e10f9077c4738ee79e5723e2a4bf5ef72f633
 100644
--- a/drivers/net/ovpn/io.h
+++ b/drivers/net/ovpn/io.h
@@ -19,9 +19,14 @@
 /* max padding required by encryption */
 #define OVPN_MAX_PADDING 16
 
+#defin

[PATCH net-next v25 05/23] ovpn: introduce the ovpn_peer object

2025-04-07 Thread Antonio Quartulli
An ovpn_peer object holds the whole status of a remote peer
(regardless of whether it is a server or a client).

This includes status for crypto, tx/rx buffers, napi, etc.

Only support for one peer is introduced (P2P mode).
Multi-peer support is introduced in a later patch.

Along with ovpn_peer, the ovpn_bind object is also introduced,
as the two are strictly related.
An ovpn_bind object wraps a sockaddr representing the local
coordinates being used to talk to a specific peer.

Signed-off-by: Antonio Quartulli 
---
 drivers/net/Kconfig |   1 +
 drivers/net/ovpn/Makefile   |   2 +
 drivers/net/ovpn/bind.c |  58 +++
 drivers/net/ovpn/bind.h | 101 +++
 drivers/net/ovpn/main.c |  12 ++
 drivers/net/ovpn/ovpnpriv.h |   4 +
 drivers/net/ovpn/peer.c | 411 
 drivers/net/ovpn/peer.h |  80 +
 8 files changed, 669 insertions(+)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 
5fbe25ae1e11e558aa9aaa857cf110127e459854..2806fcc22a2dbd9b2985b09dd6ef65dd1dc4ebc1
 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -119,6 +119,7 @@ config OVPN
tristate "OpenVPN data channel offload"
depends on NET && INET
depends on IPV6 || !IPV6
+   select DST_CACHE
help
  This module enhances the performance of the OpenVPN userspace software
  by offloading the data channel processing to kernelspace.
diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile
index 
0e5f686672fb5052cee5a2c28797b70859514a7f..618328ae338861b9764b42485df71ebd0fc1fe90
 100644
--- a/drivers/net/ovpn/Makefile
+++ b/drivers/net/ovpn/Makefile
@@ -7,7 +7,9 @@
 # Author:  Antonio Quartulli 
 
 obj-$(CONFIG_OVPN) := ovpn.o
+ovpn-y += bind.o
 ovpn-y += main.o
 ovpn-y += io.o
 ovpn-y += netlink.o
 ovpn-y += netlink-gen.o
+ovpn-y += peer.o
diff --git a/drivers/net/ovpn/bind.c b/drivers/net/ovpn/bind.c
new file mode 100644
index 
..d4a1aeed12c99c71eaf5e8e9fc9c0fe61af6aaac
--- /dev/null
+++ b/drivers/net/ovpn/bind.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/*  OpenVPN data channel offload
+ *
+ *  Copyright (C) 2012-2025 OpenVPN, Inc.
+ *
+ *  Author:James Yonan 
+ * Antonio Quartulli 
+ */
+
+#include 
+#include 
+
+#include "ovpnpriv.h"
+#include "bind.h"
+#include "peer.h"
+
+/**
+ * ovpn_bind_from_sockaddr - retrieve binding matching sockaddr
+ * @ss: the sockaddr to match
+ *
+ * Return: the bind matching the passed sockaddr if found, NULL otherwise
+ */
+struct ovpn_bind *ovpn_bind_from_sockaddr(const struct sockaddr_storage *ss)
+{
+   struct ovpn_bind *bind;
+   size_t sa_len;
+
+   if (ss->ss_family == AF_INET)
+   sa_len = sizeof(struct sockaddr_in);
+   else if (ss->ss_family == AF_INET6)
+   sa_len = sizeof(struct sockaddr_in6);
+   else
+   return ERR_PTR(-EAFNOSUPPORT);
+
+   bind = kzalloc(sizeof(*bind), GFP_ATOMIC);
+   if (unlikely(!bind))
+   return ERR_PTR(-ENOMEM);
+
+   memcpy(&bind->remote, ss, sa_len);
+
+   return bind;
+}
+
+/**
+ * ovpn_bind_reset - assign new binding to peer
+ * @peer: the peer whose binding has to be replaced
+ * @new: the new bind to assign
+ */
+void ovpn_bind_reset(struct ovpn_peer *peer, struct ovpn_bind *new)
+{
+   struct ovpn_bind *old;
+
+   spin_lock_bh(&peer->lock);
+   old = rcu_replace_pointer(peer->bind, new, true);
+   spin_unlock_bh(&peer->lock);
+
+   kfree_rcu(old, rcu);
+}
diff --git a/drivers/net/ovpn/bind.h b/drivers/net/ovpn/bind.h
new file mode 100644
index 
..4e0b8398bfd9ed60ecb01c777c61a5e6841d7dec
--- /dev/null
+++ b/drivers/net/ovpn/bind.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*  OpenVPN data channel offload
+ *
+ *  Copyright (C) 2012-2025 OpenVPN, Inc.
+ *
+ *  Author:James Yonan 
+ * Antonio Quartulli 
+ */
+
+#ifndef _NET_OVPN_OVPNBIND_H_
+#define _NET_OVPN_OVPNBIND_H_
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct ovpn_peer;
+
+/**
+ * union ovpn_sockaddr - basic transport layer address
+ * @in4: IPv4 address
+ * @in6: IPv6 address
+ */
+union ovpn_sockaddr {
+   struct sockaddr_in in4;
+   struct sockaddr_in6 in6;
+};
+
+/**
+ * struct ovpn_bind - remote peer binding
+ * @remote: the remote peer sockaddress
+ * @local: local endpoint used to talk to the peer
+ * @local.ipv4: local IPv4 used to talk to the peer
+ * @local.ipv6: local IPv6 used to talk to the peer
+ * @rcu: used to schedule RCU cleanup job
+ */
+struct ovpn_bind {
+   union ovpn_sockaddr remote;  /* remote sockaddr */
+
+   union {
+   struct in_addr ipv4;
+   struct in6_addr ipv6;
+   } local;
+
+   struct rcu_head rcu;
+};
+
+/**
+ * ovpn_bind_skb_src_match - match packet source with binding
+ * @bind: the binding to ma

[PATCH net-next v25 09/23] ovpn: implement packet processing

2025-04-07 Thread Antonio Quartulli
This change implements encryption/decryption and
encapsulation/decapsulation of OpenVPN packets.

Support for generic crypto state is added along with
a wrapper for the AEAD crypto kernel API.
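
For context, the wrapper in crypto_aead.c (largely truncated below) builds
on the kernel AEAD API, which follows the usual allocate-tfm/set-key/
set-authsize pattern. A minimal, self-contained sketch of that pattern, not
the driver's actual code:

#include <crypto/aead.h>
#include <linux/err.h>

/* Minimal AEAD tfm setup (error paths trimmed); ovpn's real wrapper
 * additionally manages nonces, packet IDs and per-key-slot state.
 */
static struct crypto_aead *sketch_aead_init(const u8 *key, unsigned int keylen)
{
        struct crypto_aead *tfm;

        tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
        if (IS_ERR(tfm))
                return tfm;

        if (crypto_aead_setkey(tfm, key, keylen) ||
            crypto_aead_setauthsize(tfm, 16)) {
                crypto_free_aead(tfm);
                return ERR_PTR(-EINVAL);
        }

        return tfm;
}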

Signed-off-by: Antonio Quartulli 
---
 drivers/net/Kconfig|   4 +
 drivers/net/ovpn/Makefile  |   3 +
 drivers/net/ovpn/bind.c|   9 +-
 drivers/net/ovpn/crypto.c  | 148 +
 drivers/net/ovpn/crypto.h  | 139 
 drivers/net/ovpn/crypto_aead.c | 366 +
 drivers/net/ovpn/crypto_aead.h |  27 +++
 drivers/net/ovpn/io.c  | 137 +--
 drivers/net/ovpn/io.h  |   3 +
 drivers/net/ovpn/peer.c|  29 
 drivers/net/ovpn/peer.h|   5 +
 drivers/net/ovpn/pktid.c   | 129 +++
 drivers/net/ovpn/pktid.h   |  86 ++
 drivers/net/ovpn/proto.h   |  32 
 drivers/net/ovpn/skb.h |   5 +
 15 files changed, 1105 insertions(+), 17 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 
305f04dd97234c4aa43da78217448b914cc7ede0..a5cee847911b17a3584ab3d9c1cf7d166d4e1298
 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -121,6 +121,10 @@ config OVPN
depends on IPV6 || !IPV6
select DST_CACHE
select NET_UDP_TUNNEL
+   select CRYPTO
+   select CRYPTO_AES
+   select CRYPTO_GCM
+   select CRYPTO_CHACHA20POLY1305
help
  This module enhances the performance of the OpenVPN userspace software
  by offloading the data channel processing to kernelspace.
diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile
index 
164f2058ea8e6dc5b9287afb59758a268b2f8b56..38c9fdca0e2e8e4af3c369ceb3971b58ab52d77b
 100644
--- a/drivers/net/ovpn/Makefile
+++ b/drivers/net/ovpn/Makefile
@@ -8,10 +8,13 @@
 
 obj-$(CONFIG_OVPN) := ovpn.o
 ovpn-y += bind.o
+ovpn-y += crypto.o
+ovpn-y += crypto_aead.o
 ovpn-y += main.o
 ovpn-y += io.o
 ovpn-y += netlink.o
 ovpn-y += netlink-gen.o
 ovpn-y += peer.o
+ovpn-y += pktid.o
 ovpn-y += socket.o
 ovpn-y += udp.o
diff --git a/drivers/net/ovpn/bind.c b/drivers/net/ovpn/bind.c
index 
d4a1aeed12c99c71eaf5e8e9fc9c0fe61af6aaac..24d2788a277e674bde80b5aac9407c6528b108e5
 100644
--- a/drivers/net/ovpn/bind.c
+++ b/drivers/net/ovpn/bind.c
@@ -48,11 +48,8 @@ struct ovpn_bind *ovpn_bind_from_sockaddr(const struct 
sockaddr_storage *ss)
  */
 void ovpn_bind_reset(struct ovpn_peer *peer, struct ovpn_bind *new)
 {
-   struct ovpn_bind *old;
+   lockdep_assert_held(&peer->lock);
 
-   spin_lock_bh(&peer->lock);
-   old = rcu_replace_pointer(peer->bind, new, true);
-   spin_unlock_bh(&peer->lock);
-
-   kfree_rcu(old, rcu);
+   kfree_rcu(rcu_replace_pointer(peer->bind, new,
+ lockdep_is_held(&peer->lock)), rcu);
 }
diff --git a/drivers/net/ovpn/crypto.c b/drivers/net/ovpn/crypto.c
new file mode 100644
index 
..9544255c4588ad0bcccda5bdc83a4d8729458ff8
--- /dev/null
+++ b/drivers/net/ovpn/crypto.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+/*  OpenVPN data channel offload
+ *
+ *  Copyright (C) 2020-2025 OpenVPN, Inc.
+ *
+ *  Author:James Yonan 
+ * Antonio Quartulli 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "ovpnpriv.h"
+#include "main.h"
+#include "pktid.h"
+#include "crypto_aead.h"
+#include "crypto.h"
+
+static void ovpn_ks_destroy_rcu(struct rcu_head *head)
+{
+   struct ovpn_crypto_key_slot *ks;
+
+   ks = container_of(head, struct ovpn_crypto_key_slot, rcu);
+   ovpn_aead_crypto_key_slot_destroy(ks);
+}
+
+void ovpn_crypto_key_slot_release(struct kref *kref)
+{
+   struct ovpn_crypto_key_slot *ks;
+
+   ks = container_of(kref, struct ovpn_crypto_key_slot, refcount);
+   call_rcu(&ks->rcu, ovpn_ks_destroy_rcu);
+}
+
+/* can only be invoked when all peer references have been dropped (i.e. RCU
+ * release routine)
+ */
+void ovpn_crypto_state_release(struct ovpn_crypto_state *cs)
+{
+   struct ovpn_crypto_key_slot *ks;
+
+   ks = rcu_access_pointer(cs->slots[0]);
+   if (ks) {
+   RCU_INIT_POINTER(cs->slots[0], NULL);
+   ovpn_crypto_key_slot_put(ks);
+   }
+
+   ks = rcu_access_pointer(cs->slots[1]);
+   if (ks) {
+   RCU_INIT_POINTER(cs->slots[1], NULL);
+   ovpn_crypto_key_slot_put(ks);
+   }
+}
+
+/* Reset the ovpn_crypto_state object in a way that is atomic
+ * to RCU readers.
+ */
+int ovpn_crypto_state_reset(struct ovpn_crypto_state *cs,
+   const struct ovpn_peer_key_reset *pkr)
+{
+   struct ovpn_crypto_key_slot *old = NULL, *new;
+   u8 idx;
+
+   if (pkr->slot != OVPN_KEY_SLOT_PRIMARY &&
+   pkr->slot != OVPN_KEY_SLOT_SECONDARY)
+   return -EINVAL;
+
+   new = ovpn_aead_crypto_key_slot_new(&pkr->key);
+   if (IS_ERR(new))
+   retur

[PATCH v17 04/15] dt-bindings: PCI: pci-ep: Add support for iommu-map and msi-map

2025-04-07 Thread Frank Li
Document the use of (msi|iommu)-map for PCI Endpoint (EP) controllers,
which can use MSI as a doorbell mechanism. Each EP controller can support
up to 8 physical functions and 65,536 virtual functions.

Define how to construct device IDs using function bits [2:0] and virtual
function index bits [18:3], enabling (msi|iommu)-map to associate each
child device with a specific (msi|iommu)-specifier.
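
The construction is a one-liner; a hedged helper matching the formula quoted
in the binding below (the function name is illustrative):

#include <linux/types.h>

/* Device ID layout per this binding: function number in bits [2:0],
 * virtual function index in bits [18:3].
 */
static inline u32 sketch_pci_ep_device_id(u8 func, u16 vfunc)
{
        return (func & 0x7) | ((u32)vfunc << 3);
}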

The EP cannot rely on PCI Requester ID (RID) because the RID is determined
by the PCI topology of the host system. Since the EP may be connected to
different PCI hosts, the RID can vary between systems and is therefore not
a reliable identifier.

Signed-off-by: Frank Li 
---
Change from v16 to v17
- new patch
---
 Documentation/devicetree/bindings/pci/pci-ep.yaml | 67 +++
 1 file changed, 67 insertions(+)

diff --git a/Documentation/devicetree/bindings/pci/pci-ep.yaml 
b/Documentation/devicetree/bindings/pci/pci-ep.yaml
index f75000e3093db..a1a5b9b8ef859 100644
--- a/Documentation/devicetree/bindings/pci/pci-ep.yaml
+++ b/Documentation/devicetree/bindings/pci/pci-ep.yaml
@@ -53,6 +53,73 @@ properties:
   must be unique.
 $ref: /schemas/types.yaml#/definitions/uint32
 
+  msi-map:
+description: |
+  Maps a Device ID to an MSI and associated MSI specifier data.
+
+  A PCI Endpoint (EP) can use MSI as a doorbell function. This is achieved 
by
+  mapping the MSI controller's address into PCI BAR. The PCI Root 
Complex
+  can write to this BAR, triggering the EP to generate IRQ. This 
notifies
+  the EP-side driver of an event, eliminating the need for the driver to
+  continuously poll for status changes.
+
+  However, the EP cannot rely on Requester ID (RID) because the RID is
+  determined by the PCI topology of the host system. Since the EP may be
+  connected to different PCI hosts, the RID can vary between systems and is
+  therefore not a reliable identifier.
+
+  Each EP can support up to 8 physical functions and up to 65,536 virtual
+  functions. To uniquely identify each child device, a device ID is defined
+  as
+ - Bits [2:0] for the function number (func)
+ - Bits [18:3] for the virtual function index (vfunc)
+
+  The resulting device ID is computed as:
+
+(func & 0x7) | (vfunc << 3)
+
+  The property is an arbitrary number of tuples of
+  (device-id-base, msi, msi-base,length).
+
+  Any Device ID id in the interval [id-base, id-base + length) is
+  associated with the listed MSI, with the MSI specifier
+  (id - id-base + msi-base).
+$ref: /schemas/types.yaml#/definitions/uint32-matrix
+items:
+  items:
+- description: The Device ID base matched by the entry
+  maximum: 0x7
+- description: phandle to msi-controller node
+- description: (optional) The msi-specifier produced for the first
+Device ID matched by the entry. Currently, msi-specifier is 0 or
+1 cells.
+- description: The length of consecutive Device IDs following the
+Device ID base
+  maximum: 0x8
+
+  msi-map-mask:
+description: A mask to be applied to each Device ID prior to being
+  mapped to an msi-specifier per the msi-map property.
+$ref: /schemas/types.yaml#/definitions/uint32
+
+  iommu-map:
+$ref: /schemas/types.yaml#/definitions/uint32-matrix
+items:
+  items:
+- description: Device ID (see msi-map) base
+  maximum: 0x7
+- description: phandle to IOMMU
+- description: IOMMU specifier base (currently always 1 cell)
+- description: Number of Device IDs
+  maximum: 0x8
+
+  iommu-map-mask:
+description:
+  A mask to be applied to each Device ID prior to being mapped to an
+  IOMMU specifier per the iommu-map property.
+$ref: /schemas/types.yaml#/definitions/uint32
+maximum: 0x
+
 required:
   - compatible
 

-- 
2.34.1




[PATCH v17 02/15] irqdomain: Add IRQ_DOMAIN_FLAG_MSI_IMMUTABLE and irq_domain_is_msi_immutable()

2025-04-07 Thread Frank Li
Add the flag IRQ_DOMAIN_FLAG_MSI_IMMUTABLE and the API function
irq_domain_is_msi_immutable() to check if the MSI controller retains an
immutable address/data pair during irq_set_affinity().

Ensure compatibility with MSI users like PCIe Endpoint Doorbell, which
require the address/data pair to remain unchanged after setup. Use this
function to verify if the MSI controller is immutable.
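
A hedged example of how an MSI consumer could use the new helper before
caching an address/data pair for doorbell use; the helper name and calling
context are hypothetical, and which domain in the hierarchy to test is up to
the consumer:

#include <linux/irqdomain.h>

/* Only trust a captured MSI address/data pair (e.g. for a doorbell) if the
 * controller behind this domain keeps it stable across irq_set_affinity().
 */
static bool sketch_msi_addr_data_is_stable(struct irq_domain *domain)
{
        return domain && irq_domain_is_msi_immutable(domain);
}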

Signed-off-by: Frank Li 
---
change from v14 to v16
- none

change from v13 to v14
- Roll back to the v12 version because Marc Zyngier had concerns about adding
DOMAIN_BUS_DEVICE_PCI_EP_MSI.
https://lore.kernel.org/imx/861pxfq315.wl-...@kernel.org/

Change from v11 to v12
- change to IRQ_DOMAIN_FLAG_MSI_IMMUTABLE to minimized the code change.
---
 include/linux/irqdomain.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 33ff41eef8f73..86222602744e9 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -231,6 +231,9 @@ enum {
/* Irq domain must destroy generic chips when removed */
IRQ_DOMAIN_FLAG_DESTROY_GC  = (1 << 10),
 
+   /* Address and data pair is immutable across irq_set_affinity() */
+   IRQ_DOMAIN_FLAG_MSI_IMMUTABLE   = (1 << 11),
+
/*
 * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
 * for implementation specific purposes and ignored by the
@@ -691,6 +694,10 @@ static inline bool irq_domain_is_msi_device(struct 
irq_domain *domain)
return domain->flags & IRQ_DOMAIN_FLAG_MSI_DEVICE;
 }
 
+static inline bool irq_domain_is_msi_immutable(struct irq_domain *domain)
+{
+   return domain->flags & IRQ_DOMAIN_FLAG_MSI_IMMUTABLE;
+}
 #else  /* CONFIG_IRQ_DOMAIN_HIERARCHY */
 static inline int irq_domain_alloc_irqs(struct irq_domain *domain,
unsigned int nr_irqs, int node, void *arg)

-- 
2.34.1




[PATCH v17 03/15] irqchip/gic-v3-its: Set IRQ_DOMAIN_FLAG_MSI_IMMUTABLE for ITS

2025-04-07 Thread Frank Li
Set the IRQ_DOMAIN_FLAG_MSI_IMMUTABLE flag for ITS, as it does not change
the address/data pair after setup.

Ensure compatibility with MSI users, such as PCIe Endpoint Doorbell, which
require the address/data pair to remain unchanged. Enable PCIe endpoints to
use ITS for triggering doorbells from the PCIe Root Complex (RC) side.

Signed-off-by: Frank Li 
---
change from v14 to v16
- none

change from v13 to v14
- Roll back to the v12 version because Marc Zyngier had concerns about adding
DOMAIN_BUS_DEVICE_PCI_EP_MSI.
https://lore.kernel.org/imx/861pxfq315.wl-...@kernel.org/

change from v11 to v12
- new patch
---
 drivers/irqchip/irq-gic-v3-its.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 0115ad6c82593..fd6e7c170d37e 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -5140,7 +5140,7 @@ static int its_init_domain(struct its_node *its)
irq_domain_update_bus_token(inner_domain, DOMAIN_BUS_NEXUS);
 
inner_domain->msi_parent_ops = &gic_v3_its_msi_parent_ops;
-   inner_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
+   inner_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT | 
IRQ_DOMAIN_FLAG_MSI_IMMUTABLE;
 
return 0;
 }

-- 
2.34.1




[PATCH v17 14/15] pci: imx6: Add LUT setting for MSI/IOMMU in Endpoint mode

2025-04-07 Thread Frank Li
Only one physical function is supported, so call imx_pcie_add_lut_by_rid(0)
to add a single LUT entry when operating in EP mode.

Signed-off-by: Frank Li 
---
change from v14 to v16
- none

change from v13 to v14
- new patch
---
 drivers/pci/controller/dwc/pci-imx6.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/controller/dwc/pci-imx6.c 
b/drivers/pci/controller/dwc/pci-imx6.c
index d1b1365e7469e..a6e4630fcf530 100644
--- a/drivers/pci/controller/dwc/pci-imx6.c
+++ b/drivers/pci/controller/dwc/pci-imx6.c
@@ -992,7 +992,10 @@ static int imx_pcie_add_lut(struct imx_pcie *imx_pcie, u16 
rid, u8 sid)
data1 |= IMX95_PE0_LUT_VLD;
regmap_write(imx_pcie->iomuxc_gpr, IMX95_PE0_LUT_DATA1, data1);
 
-   data2 = IMX95_PE0_LUT_MASK; /* Match all bits of RID */
+   if (imx_pcie->drvdata->mode == DW_PCIE_EP_TYPE)
+   data2 = 0x7; /* EP side's RID from RC, only 'D' is meaningful */
+   else
+   data2 = IMX95_PE0_LUT_MASK; /* Match all bits of RID */
data2 |= FIELD_PREP(IMX95_PE0_LUT_REQID, rid);
regmap_write(imx_pcie->iomuxc_gpr, IMX95_PE0_LUT_DATA2, data2);
 
@@ -1622,6 +1625,9 @@ static int imx_pcie_probe(struct platform_device *pdev)
ret = imx_add_pcie_ep(imx_pcie, pdev);
if (ret < 0)
return ret;
+
+   /* Only support one physical function */
+   imx_pcie_add_lut_by_rid(imx_pcie, 0);
} else {
pci->pp.use_atu_msg = true;
ret = dw_pcie_host_init(&pci->pp);

-- 
2.34.1




[PATCH net-next v25 19/23] ovpn: implement key add/get/del/swap via netlink

2025-04-07 Thread Antonio Quartulli
This change introduces the netlink commands needed to add, get, delete
and swap keys for a specific peer.

Userspace is expected to use these commands to create, inspect
(non-sensitive data only), destroy and rotate session keys for a specific
peer.

Signed-off-by: Antonio Quartulli 
---
 drivers/net/ovpn/crypto.c  |  40 ++
 drivers/net/ovpn/crypto.h  |   4 +
 drivers/net/ovpn/crypto_aead.c |  17 +++
 drivers/net/ovpn/crypto_aead.h |   2 +
 drivers/net/ovpn/netlink.c | 301 -
 5 files changed, 360 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ovpn/crypto.c b/drivers/net/ovpn/crypto.c
index 
9544255c4588ad0bcccda5bdc83a4d8729458ff8..deeefffc098162b17ea53eb7a5de6b0f19a38022
 100644
--- a/drivers/net/ovpn/crypto.c
+++ b/drivers/net/ovpn/crypto.c
@@ -146,3 +146,43 @@ void ovpn_crypto_key_slots_swap(struct ovpn_crypto_state 
*cs)
 
spin_unlock_bh(&cs->lock);
 }
+
+/**
+ * ovpn_crypto_config_get - populate keyconf object with non-sensible key data
+ * @cs: the crypto state to extract the key data from
+ * @slot: the specific slot to inspect
+ * @keyconf: the output object to populate
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int ovpn_crypto_config_get(struct ovpn_crypto_state *cs,
+  enum ovpn_key_slot slot,
+  struct ovpn_key_config *keyconf)
+{
+   struct ovpn_crypto_key_slot *ks;
+   int idx;
+
+   switch (slot) {
+   case OVPN_KEY_SLOT_PRIMARY:
+   idx = cs->primary_idx;
+   break;
+   case OVPN_KEY_SLOT_SECONDARY:
+   idx = !cs->primary_idx;
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   rcu_read_lock();
+   ks = rcu_dereference(cs->slots[idx]);
+   if (!ks) {
+   rcu_read_unlock();
+   return -ENOENT;
+   }
+
+   keyconf->cipher_alg = ovpn_aead_crypto_alg(ks);
+   keyconf->key_id = ks->key_id;
+   rcu_read_unlock();
+
+   return 0;
+}
diff --git a/drivers/net/ovpn/crypto.h b/drivers/net/ovpn/crypto.h
index 
5155791b87df776a011fa751686180074982..487d24a7d26635c9ca0fd66c75717502f60e7a0c
 100644
--- a/drivers/net/ovpn/crypto.h
+++ b/drivers/net/ovpn/crypto.h
@@ -136,4 +136,8 @@ void ovpn_crypto_state_release(struct ovpn_crypto_state 
*cs);
 
 void ovpn_crypto_key_slots_swap(struct ovpn_crypto_state *cs);
 
+int ovpn_crypto_config_get(struct ovpn_crypto_state *cs,
+  enum ovpn_key_slot slot,
+  struct ovpn_key_config *keyconf);
+
 #endif /* _NET_OVPN_OVPNCRYPTO_H_ */
diff --git a/drivers/net/ovpn/crypto_aead.c b/drivers/net/ovpn/crypto_aead.c
index 
83ec18e4b9a4f7960bf789cee952ac11cb77083d..74ee639ac86880da9e22b88f182f5e0851cb2746
 100644
--- a/drivers/net/ovpn/crypto_aead.c
+++ b/drivers/net/ovpn/crypto_aead.c
@@ -364,3 +364,20 @@ ovpn_aead_crypto_key_slot_new(const struct ovpn_key_config 
*kc)
ovpn_aead_crypto_key_slot_destroy(ks);
return ERR_PTR(ret);
 }
+
+enum ovpn_cipher_alg ovpn_aead_crypto_alg(struct ovpn_crypto_key_slot *ks)
+{
+   const char *alg_name;
+
+   if (!ks->encrypt)
+   return OVPN_CIPHER_ALG_NONE;
+
+   alg_name = crypto_tfm_alg_name(crypto_aead_tfm(ks->encrypt));
+
+   if (!strcmp(alg_name, ALG_NAME_AES))
+   return OVPN_CIPHER_ALG_AES_GCM;
+   else if (!strcmp(alg_name, ALG_NAME_CHACHAPOLY))
+   return OVPN_CIPHER_ALG_CHACHA20_POLY1305;
+   else
+   return OVPN_CIPHER_ALG_NONE;
+}
diff --git a/drivers/net/ovpn/crypto_aead.h b/drivers/net/ovpn/crypto_aead.h
index 
40c056558add3b9d17fda5c43eb858cb44c95945..65a2ff30789862bf52bbda389a995f8edff48e7e
 100644
--- a/drivers/net/ovpn/crypto_aead.h
+++ b/drivers/net/ovpn/crypto_aead.h
@@ -24,4 +24,6 @@ struct ovpn_crypto_key_slot *
 ovpn_aead_crypto_key_slot_new(const struct ovpn_key_config *kc);
 void ovpn_aead_crypto_key_slot_destroy(struct ovpn_crypto_key_slot *ks);
 
+enum ovpn_cipher_alg ovpn_aead_crypto_alg(struct ovpn_crypto_key_slot *ks);
+
 #endif /* _NET_OVPN_OVPNAEAD_H_ */
diff --git a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c
index 
5653a588a47c6c94ff10e042f4f6d32bd8205e58..1f4220021df3a6e74d6e8946a58882bf5d66e444
 100644
--- a/drivers/net/ovpn/netlink.c
+++ b/drivers/net/ovpn/netlink.c
@@ -17,6 +17,7 @@
 #include "netlink.h"
 #include "netlink-gen.h"
 #include "bind.h"
+#include "crypto.h"
 #include "peer.h"
 #include "socket.h"
 
@@ -790,24 +791,316 @@ int ovpn_nl_peer_del_doit(struct sk_buff *skb, struct 
genl_info *info)
return ret;
 }
 
+static int ovpn_nl_get_key_dir(struct genl_info *info, struct nlattr *key,
+  enum ovpn_cipher_alg cipher,
+  struct ovpn_key_direction *dir)
+{
+   struct nlattr *attrs[OVPN_A_KEYDIR_MAX + 1];
+   int ret;
+
+   ret = nla_parse_nested(attrs, OVPN_A_KEYDIR_MAX, key,
+

[PATCH v17 10/15] PCI: endpoint: pci-epf-test: Add doorbell test support

2025-04-07 Thread Frank Li
Add three registers: doorbell_bar, doorbell_offset, and doorbell_data. Use
pci_epf_alloc_doorbell() to allocate a doorbell address space.

Enable the Root Complex (RC) side driver to trigger pci-epf-test's doorbell
callback handler by writing doorbell_data to the mapped doorbell_bar's
address space.
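
From the RC side the trigger itself is just a 32-bit MMIO write into the
mapped BAR. A hedged sketch of what the host-side test driver ends up doing
(helper name and variables are illustrative):

#include <linux/io.h>
#include <linux/types.h>

/* Illustrative only: bar_base is the host-side ioremapped doorbell BAR,
 * offset/data mirror the doorbell_offset/doorbell_data test registers
 * exposed by the endpoint function.
 */
static void sketch_trigger_ep_doorbell(void __iomem *bar_base, u32 offset,
                                       u32 data)
{
        writel(data, bar_base + offset);
}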

Set STATUS_DOORBELL_SUCCESS in the doorbell callback to indicate
completion.

To avoid breaking compatibility between host and endpoint, add the new
commands COMMAND_ENABLE_DOORBELL and COMMAND_DISABLE_DOORBELL. The host side
needs to send COMMAND_ENABLE_DOORBELL to map one BAR's inbound address to
the MSI space, and COMMAND_DISABLE_DOORBELL to restore the original inbound
address mapping.

                Host side new driver    Host side old driver
EP: new driver          S                       F
EP: old driver          F                       F

S: If the EP side supports MSI, 'pci_endpoint_test -f pcie_ep_doorbell'
   returns success.
   If the EP side doesn't support MSI, the result is the same as 'F'.

F: 'pci_endpoint_test -f pcie_ep_doorbell' returns failure; other cases
   behave as usual.

Tested-by: Niklas Cassel 
Signed-off-by: Frank Li 
---
change from v15 to v16
- use le32 for the doorbell_* registers and use cpu_to_le32()/le32_to_cpu()
when accessing them.

change from v14 to v15
- none

Change from v9 to v14
- update commit message to use 'pci_endpoint_test -f pcie_ep_doorbell'

Change from v8 to v9
- move pci_epf_alloc_doorbell() into pci_epf_{enable/disable}_doorbell().
- remove doorbell_done in commit message.
- rename pci_epf_{enable/disable}_doorbell() to
pci_epf_test_{enable/disable}_doorbell() to align with the current code style.

Change from v7 to v8
- rename to pci_epf_align_inbound_addr_lo_hi()

Change from v6 to v7
- use help function pci_epf_align_addr_lo_hi()

Change from v5 to v6
- rename doorbell_addr to doorbell_offset

Change from v4 to v5
- Add doorbell free to the unbind function.
- Move the MSI IRQ handler here to support more complex use cases, such as
different doorbells using different handler functions.
- Add Niklas's code to handle the fixed BAR case. If your Signed-off-by or
Co-developed-by tag should be added, please let me know.

change from v3 to v4
- remove revid requirement
- Add command COMMAND_ENABLE_DOORBELL and COMMAND_DISABLE_DOORBELL.
- call pci_epc_set_bar() to map inbound address to MSI space only at
COMMAND_ENABLE_DOORBELL.
---
 drivers/pci/endpoint/functions/pci-epf-test.c | 142 ++
 1 file changed, 142 insertions(+)

diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c 
b/drivers/pci/endpoint/functions/pci-epf-test.c
index 50eb4106369f4..b9cb1ab218f2b 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -11,12 +11,14 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 
 #include 
 #include 
+#include 
 #include 
 
 #define IRQ_TYPE_INTX  0
@@ -29,6 +31,8 @@
 #define COMMAND_READ   BIT(3)
 #define COMMAND_WRITE  BIT(4)
 #define COMMAND_COPY   BIT(5)
+#define COMMAND_ENABLE_DOORBELL    BIT(6)
+#define COMMAND_DISABLE_DOORBELL   BIT(7)
 
 #define STATUS_READ_SUCCESS    BIT(0)
 #define STATUS_READ_FAIL   BIT(1)
@@ -39,6 +43,11 @@
 #define STATUS_IRQ_RAISED  BIT(6)
 #define STATUS_SRC_ADDR_INVALID    BIT(7)
 #define STATUS_DST_ADDR_INVALID    BIT(8)
+#define STATUS_DOORBELL_SUCCESS    BIT(9)
+#define STATUS_DOORBELL_ENABLE_SUCCESS BIT(10)
+#define STATUS_DOORBELL_ENABLE_FAIL    BIT(11)
+#define STATUS_DOORBELL_DISABLE_SUCCESS BIT(12)
+#define STATUS_DOORBELL_DISABLE_FAIL   BIT(13)
 
 #define FLAG_USE_DMA   BIT(0)
 
@@ -66,6 +75,7 @@ struct pci_epf_test {
 bool    dma_supported;
 bool    dma_private;
const struct pci_epc_features *epc_features;
+   struct pci_epf_bar  db_bar;
 };
 
 struct pci_epf_test_reg {
@@ -80,6 +90,9 @@ struct pci_epf_test_reg {
__le32 irq_number;
__le32 flags;
__le32 caps;
+   __le32 doorbell_bar;
+   __le32 doorbell_offset;
+   __le32 doorbell_data;
 } __packed;
 
 static struct pci_epf_header test_header = {
@@ -667,6 +680,126 @@ static void pci_epf_test_raise_irq(struct pci_epf_test 
*epf_test,
}
 }
 
+static irqreturn_t pci_epf_test_doorbell_handler(int irq, void *data)
+{
+   struct pci_epf_test *epf_test = data;
+   enum pci_barno test_reg_bar = epf_test->test_reg_bar;
+   struct pci_epf_test_reg *reg = epf_test->reg[test_reg_bar];
+   u32 status = le32_to_cpu(reg->status);
+
+   status |= STATUS_DOORBELL_SUCCESS;
+   reg->status = cpu_to_le32(status);
+   pci_epf_test_raise_irq(epf_test, reg);
+
+   return IRQ_HANDLED;
+}
+
+static void pci_epf_test_doorbell_cleanup(struct pci_epf_test *epf_test)
+{
+   struct pci_epf_test_reg *reg = epf_test->reg[epf_test->test_reg_bar];
+   struct pci_epf *epf = epf_

[PATCH net-next v25 21/23] ovpn: notify userspace when a peer is deleted

2025-04-07 Thread Antonio Quartulli
Whenever a peer is deleted, send a notification to userspace so that it
can react accordingly.

This is most important when a peer is deleted due to ping timeout,
because it all happens in kernelspace and thus userspace has no direct
way to learn about it.
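
On the userspace side, catching these events means joining the ovpn
multicast group. A hedged libnl sketch; the "ovpn" family and "peers" group
names are assumptions taken from the netlink spec in this series:

#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>

/* Userspace sketch (libnl): join ovpn's "peers" multicast group so that
 * peer-deletion notifications arrive even when the kernel removed the peer
 * on its own (e.g. keepalive timeout).
 */
static int sketch_subscribe_ovpn_peer_events(struct nl_sock *sk)
{
        int grp;

        if (genl_connect(sk))
                return -1;

        grp = genl_ctrl_resolve_grp(sk, "ovpn", "peers");
        if (grp < 0)
                return grp;

        return nl_socket_add_membership(sk, grp);
}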

Signed-off-by: Antonio Quartulli 
---
 drivers/net/ovpn/netlink.c | 65 ++
 drivers/net/ovpn/netlink.h |  1 +
 drivers/net/ovpn/peer.c|  1 +
 3 files changed, 67 insertions(+)

diff --git a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c
index 
f0b5716059364a1deee1c7d4da1d5341b53dffca..bea03913bfb1e1948d57bd613d2bc6241c76fc06
 100644
--- a/drivers/net/ovpn/netlink.c
+++ b/drivers/net/ovpn/netlink.c
@@ -1103,6 +1103,71 @@ int ovpn_nl_key_del_doit(struct sk_buff *skb, struct 
genl_info *info)
return 0;
 }
 
+/**
+ * ovpn_nl_peer_del_notify - notify userspace about peer being deleted
+ * @peer: the peer being deleted
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int ovpn_nl_peer_del_notify(struct ovpn_peer *peer)
+{
+   struct ovpn_socket *sock;
+   struct sk_buff *msg;
+   struct nlattr *attr;
+   int ret = -EMSGSIZE;
+   void *hdr;
+
+   netdev_info(peer->ovpn->dev, "deleting peer with id %u, reason %d\n",
+   peer->id, peer->delete_reason);
+
+   msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+   if (!msg)
+   return -ENOMEM;
+
+   hdr = genlmsg_put(msg, 0, 0, &ovpn_nl_family, 0, OVPN_CMD_PEER_DEL_NTF);
+   if (!hdr) {
+   ret = -ENOBUFS;
+   goto err_free_msg;
+   }
+
+   if (nla_put_u32(msg, OVPN_A_IFINDEX, peer->ovpn->dev->ifindex))
+   goto err_cancel_msg;
+
+   attr = nla_nest_start(msg, OVPN_A_PEER);
+   if (!attr)
+   goto err_cancel_msg;
+
+   if (nla_put_u32(msg, OVPN_A_PEER_DEL_REASON, peer->delete_reason))
+   goto err_cancel_msg;
+
+   if (nla_put_u32(msg, OVPN_A_PEER_ID, peer->id))
+   goto err_cancel_msg;
+
+   nla_nest_end(msg, attr);
+
+   genlmsg_end(msg, hdr);
+
+   rcu_read_lock();
+   sock = rcu_dereference(peer->sock);
+   if (!sock) {
+   ret = -EINVAL;
+   goto err_unlock;
+   }
+   genlmsg_multicast_netns(&ovpn_nl_family, sock_net(sock->sock->sk),
+   msg, 0, OVPN_NLGRP_PEERS, GFP_ATOMIC);
+   rcu_read_unlock();
+
+   return 0;
+
+err_unlock:
+   rcu_read_unlock();
+err_cancel_msg:
+   genlmsg_cancel(msg, hdr);
+err_free_msg:
+   nlmsg_free(msg);
+   return ret;
+}
+
 /**
  * ovpn_nl_key_swap_notify - notify userspace peer's key must be renewed
  * @peer: the peer whose key needs to be renewed
diff --git a/drivers/net/ovpn/netlink.h b/drivers/net/ovpn/netlink.h
index 
5dc84c8e5e803014053faa0d892fc3a7259d40e5..8615dfc3c4720a2a550b5cd1a8454ccc58a3c6ba
 100644
--- a/drivers/net/ovpn/netlink.h
+++ b/drivers/net/ovpn/netlink.h
@@ -12,6 +12,7 @@
 int ovpn_nl_register(void);
 void ovpn_nl_unregister(void);
 
+int ovpn_nl_peer_del_notify(struct ovpn_peer *peer);
 int ovpn_nl_key_swap_notify(struct ovpn_peer *peer, u8 key_id);
 
 #endif /* _NET_OVPN_NETLINK_H_ */
diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c
index 
3af4531393f66a4c9e0fe64dc333f89b98efff6f..0b1d26388dba9b7129922287e43a226f9a2346c2
 100644
--- a/drivers/net/ovpn/peer.c
+++ b/drivers/net/ovpn/peer.c
@@ -706,6 +706,7 @@ static void ovpn_peer_remove(struct ovpn_peer *peer,
}
 
peer->delete_reason = reason;
+   ovpn_nl_peer_del_notify(peer);
 
/* append to provided list for later socket release and ref drop */
llist_add(&peer->release_entry, release_list);

-- 
2.49.0




[PATCH net-next v25 20/23] ovpn: kill key and notify userspace in case of IV exhaustion

2025-04-07 Thread Antonio Quartulli
IV wrap-around is cryptographically dangerous for a number of ciphers,
therefore kill the key and inform userspace (via netlink) should the
IV space be exhausted.

Userspace has two ways of deciding when the key has to be renewed before
exhausting the IV space:
1) time-based approach:
   after X seconds/minutes userspace generates a new key and sends it
   to the kernel. This is based on a guesstimate, and normally the
   default timer value works well.

2) packet count based approach:
   after X packets/bytes userspace generates a new key and sends it to
   the kernel. Userspace keeps track of the amount of traffic by
   periodically polling GET_PEER and fetching the VPN/LINK stats.

Signed-off-by: Antonio Quartulli 
---
 drivers/net/ovpn/crypto.c  | 22 
 drivers/net/ovpn/crypto.h  |  2 ++
 drivers/net/ovpn/io.c  | 14 ++
 drivers/net/ovpn/netlink.c | 64 ++
 drivers/net/ovpn/netlink.h |  2 ++
 5 files changed, 104 insertions(+)

diff --git a/drivers/net/ovpn/crypto.c b/drivers/net/ovpn/crypto.c
index 
deeefffc098162b17ea53eb7a5de6b0f19a38022..90580e32052fb56c646a6df7816872366133bc75
 100644
--- a/drivers/net/ovpn/crypto.c
+++ b/drivers/net/ovpn/crypto.c
@@ -54,6 +54,28 @@ void ovpn_crypto_state_release(struct ovpn_crypto_state *cs)
}
 }
 
+/* removes the key matching the specified id from the crypto context */
+bool ovpn_crypto_kill_key(struct ovpn_crypto_state *cs, u8 key_id)
+{
+   struct ovpn_crypto_key_slot *ks = NULL;
+
+   spin_lock_bh(&cs->lock);
+   if (rcu_access_pointer(cs->slots[0])->key_id == key_id) {
+   ks = rcu_replace_pointer(cs->slots[0], NULL,
+lockdep_is_held(&cs->lock));
+   } else if (rcu_access_pointer(cs->slots[1])->key_id == key_id) {
+   ks = rcu_replace_pointer(cs->slots[1], NULL,
+lockdep_is_held(&cs->lock));
+   }
+   spin_unlock_bh(&cs->lock);
+
+   if (ks)
+   ovpn_crypto_key_slot_put(ks);
+
+   /* let the caller know if a key was actually killed */
+   return ks;
+}
+
 /* Reset the ovpn_crypto_state object in a way that is atomic
  * to RCU readers.
  */
diff --git a/drivers/net/ovpn/crypto.h b/drivers/net/ovpn/crypto.h
index 
487d24a7d26635c9ca0fd66c75717502f60e7a0c..0e284fec3a75a0a5933978ea9d136f87a2e5c57a
 100644
--- a/drivers/net/ovpn/crypto.h
+++ b/drivers/net/ovpn/crypto.h
@@ -140,4 +140,6 @@ int ovpn_crypto_config_get(struct ovpn_crypto_state *cs,
   enum ovpn_key_slot slot,
   struct ovpn_key_config *keyconf);
 
+bool ovpn_crypto_kill_key(struct ovpn_crypto_state *cs, u8 key_id);
+
 #endif /* _NET_OVPN_OVPNCRYPTO_H_ */
diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c
index 
07be4edf0dda060de2ce4161e323a2c2ee40591d..d83ead5c2c87a46709fc5458b3ada4553a8f26fc
 100644
--- a/drivers/net/ovpn/io.c
+++ b/drivers/net/ovpn/io.c
@@ -245,6 +245,20 @@ void ovpn_encrypt_post(void *data, int ret)
kfree(ovpn_skb_cb(skb)->sg);
aead_request_free(ovpn_skb_cb(skb)->req);
 
+   if (unlikely(ret == -ERANGE)) {
+   /* we ran out of IVs and we must kill the key as it can't be
+* used anymore
+*/
+   netdev_warn(peer->ovpn->dev,
+   "killing key %u for peer %u\n", ks->key_id,
+   peer->id);
+   if (ovpn_crypto_kill_key(&peer->crypto, ks->key_id))
+   /* let userspace know so that a new key must be 
negotiated */
+   ovpn_nl_key_swap_notify(peer, ks->key_id);
+
+   goto err;
+   }
+
if (unlikely(ret < 0))
goto err;
 
diff --git a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c
index 
1f4220021df3a6e74d6e8946a58882bf5d66e444..f0b5716059364a1deee1c7d4da1d5341b53dffca
 100644
--- a/drivers/net/ovpn/netlink.c
+++ b/drivers/net/ovpn/netlink.c
@@ -1103,6 +1103,70 @@ int ovpn_nl_key_del_doit(struct sk_buff *skb, struct 
genl_info *info)
return 0;
 }
 
+/**
+ * ovpn_nl_key_swap_notify - notify userspace peer's key must be renewed
+ * @peer: the peer whose key needs to be renewed
+ * @key_id: the ID of the key that needs to be renewed
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int ovpn_nl_key_swap_notify(struct ovpn_peer *peer, u8 key_id)
+{
+   struct ovpn_socket *sock;
+   struct nlattr *k_attr;
+   struct sk_buff *msg;
+   int ret = -EMSGSIZE;
+   void *hdr;
+
+   netdev_info(peer->ovpn->dev, "peer with id %u must rekey - primary key 
unusable.\n",
+   peer->id);
+
+   msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+   if (!msg)
+   return -ENOMEM;
+
+   hdr = genlmsg_put(msg, 0, 0, &ovpn_nl_family, 0, OVPN_CMD_KEY_SWAP_NTF);
+   if (!hdr) {
+   ret = -ENOBUFS;
+   goto err_fr

Re: [PATCH v3 0/4] Add managed SOFT RESERVE resource handling

2025-04-07 Thread Zhijian Li (Fujitsu)
Hi Terry,

If I understand correctly, this patch set has only considered the situation 
where the
soft reserved area and the region are exactly the same, as in pattern 1.

However, I believe we also need to consider situations where these two are not
equal, which are outlined in patterns 2 and 3 below. Let me explain them:

===
Pattern 1:
- region0 will be created during OS boot due to the programmed HDM decoder
- After the OS has booted, region0 can be re-created after destroying it
┌────────────────┐
│      CFMW      │
└────────────────┘
┌────────────────┐
│   reserved0    │
└────────────────┘
┌────────────────┐
│      mem0      │
└────────────────┘
┌────────────────┐
│    region0     │
└────────────────┘


Pattern 2:
The HDM decoder is not in a committed state, so during the kernel boot process
region0 will not be created automatically. In this case, the soft reserved area
will not be removed from the iomem tree. After the OS starts, users cannot
create a region (cxl create-region) either, because the new region would
intersect the soft reserved area.
  
┌────────────────┐
│      CFMW      │
└────────────────┘
┌────────────────┐
│   reserved0    │
└────────────────┘
┌────────────────┐
│     mem0*      │
└────────────────┘
┌────────────────┐
│      N/A       │ region0
└────────────────┘
*HDM decoder in mem0 is not committed.
   
   
Pattern 3:
Region0 is a child of the soft reserved area. In this case, the soft reserved
area will not be removed from the iomem tree, so region0 cannot be recreated
later after it is destroyed.
┌─────────────────────────────────┐
│              CFMW               │
└─────────────────────────────────┘
┌─────────────────────────────────┐
│            reserved             │
└─────────────────────────────────┘
┌────────────────┬────────────────┐
│      mem0      │      mem1*     │
└────────────────┴────────────────┘
┌────────────────┬────────────────┐
│    region0     │      N/A       │ region1
└────────────────┴────────────────┘
*HDM decoder in mem1 is not committed.


Thanks
Zhijian



On 04/04/2025 02:33, Terry Bowman wrote:
> Add the ability to manage SOFT RESERVE iomem resources prior to them being
> added to the iomem resource tree. This allows drivers, such as CXL, to
> remove any pieces of the SOFT RESERVE resource that intersect with created
> CXL regions.
> 
> The current approach of leaving the SOFT RESERVE resources as is can cause
> failures during hotplug of devices, such as CXL, because the resource is
> not available for reuse after teardown of the device.
> 
> The approach is to add SOFT RESERVE resources to a separate tree during
> boot. This allows any drivers to update the SOFT RESERVE resources before
> they are merged into the iomem resource tree. In addition a notifier chain
> is added so that drivers can be notified when these SOFT RESERVE resources
> are added to the ioeme resource tree.
> 
> The CXL driver is modified to use a worker thread that waits for the CXL
> PCI and CXL mem drivers to be loaded and for their probe routine to
> complete. Then the driver walks through any created CXL regions to trim any
> intersections with SOFT RESERVE resources in the iomem tree.
> 
> The dax driver uses the new soft reserve notifier chain so it can consume
> any remaining SOFT RESERVES once they're added to the iomem tree.
> 
> V3 updates:
>   - Remove srmem resource tree from kernel/resource.c, this is no longer
> needed in the current implementation. All SOFT RESERVE resources now
> put on the iomem resource tree.
>   - Remove the no longer needed SOFT_RESERVED_MANAGED kernel config option.
>   - Add the 'nid' parameter back to hmem_register_resource();
>   - Remove the no longer used soft reserve notification chain (introduced
> in v2). The dax driver is now notified of SOFT RESERVED resources by
> the CXL driver.
> 
> v2 updates:
>   - Add config option SOFT_RESERVE_MANAGED to control use of the
> separate srmem resource tree at boot.
>   - Only add SOFT RESERVE resources to the soft reserve tree during
> boot, they go to the iomem resource tree after boot.
>   - Remove the resource trimming code in the previous patch to re-use
> the existing code in kernel/resource.c
>   - Add functionality for the cxl acpi driver to wait for the cxl PCI
> and mem drivers to load.
> 
> Nathan Fontenot (4):
>kernel/resource: Provide mem region release for SOFT RESERVES
>cxl: Update Soft Reserved resources upon region creation
>dax/hmem: Save the dax hmem platform device pointer
>cxl/dax: Delay consumption of SOFT RESERVE resources
> 
>   drivers/cxl/Kconfig|  4 ---
>   drivers/cxl/acpi.c | 28 +++
>   drivers/cxl/core/Makefile  |  2 +-
>   drivers/cxl/core/region.c  | 34 ++-
>   drivers/cxl/core/suspend.c | 41 
>   drivers/cxl/cxl.h  |  3 +++
>   drivers/cxl/cxlmem.h  

Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread Michael S. Tsirkin
On Mon, Apr 07, 2025 at 10:17:10AM +0200, David Hildenbrand wrote:
> On 07.04.25 09:52, Michael S. Tsirkin wrote:
> > On Fri, Apr 04, 2025 at 05:39:10PM +0200, Halil Pasic wrote:
> > > > 
> > > > Not perfect, but AFAIKS, not horrible.
> > > 
> > > It is like it is. QEMU makes the queue exist if the corresponding feature
> > > is offered by the device, and that is what we have to live with.
> > 
> > I don't think we can live with this properly though.
> > It means a guest that does not know about some features
> > does not know where to find things.
> 
> Please describe a real scenario, I'm missing the point.


OK so.

Device has VIRTIO_BALLOON_F_FREE_PAGE_HINT and VIRTIO_BALLOON_F_REPORTING
Driver only knows about VIRTIO_BALLOON_F_REPORTING, so
it does not know what VIRTIO_BALLOON_F_FREE_PAGE_HINT does.
How does it know which vq to use for reporting?
It will try to use the free page hint one.
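
To spell that out, here is a minimal sketch of the two numbering conventions
under discussion; the feature bits and helpers are hypothetical, not the
actual balloon feature values or driver code:

#include <stdio.h>

/* Hypothetical stand-ins for the balloon feature bits. */
#define F_FREE_PAGE_HINT	(1u << 0)
#define F_REPORTING		(1u << 1)

/*
 * "Dense" numbering: the index of the reporting vq depends on whether the
 * device also offers free page hinting, so a driver that does not know
 * about F_FREE_PAGE_HINT computes the wrong index.
 */
static int reporting_vq_dense(unsigned int features)
{
	int idx = 3;			/* inflate, deflate, stats come first */

	if (features & F_FREE_PAGE_HINT)
		idx++;			/* the free-page-hint vq sits before it */
	return idx;
}

/*
 * "Fixed" numbering: every feature gets a reserved slot, so the index is
 * stable no matter which features the driver understands.
 */
static int reporting_vq_fixed(void)
{
	return 4;
}

int main(void)
{
	unsigned int device = F_FREE_PAGE_HINT | F_REPORTING;
	unsigned int old_driver_view = F_REPORTING;	/* no hinting knowledge */

	printf("device lays out reporting at %d, old driver looks at %d\n",
	       reporting_vq_dense(device), reporting_vq_dense(old_driver_view));
	printf("fixed layout: always %d\n", reporting_vq_fixed());
	return 0;
}

With the dense layout the device places the reporting vq at index 4 while the
old driver looks at index 3; with fixed per-feature slots both sides agree
regardless of what the driver knows.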



> Whoever adds new feat_X *must be aware* about all previous features,
> otherwise we'd be reusing feature bits and everything falls to pieces.


The knowledge is supposed to be limited to which feature bit to use.



> > 
> > So now, I am inclined to add linux code to work with current qemu and
> > with spec compliant one, and add qemu code to work with current linux
> > and spec compliant one.
> > 
> > Document the bug in the spec, maybe, in a non conformance section.
> 
> I'm afraid this results in a lot of churn without really making things
> better.

> IMHO, documenting how things actually behave, and maybe moving towards
> fixed queue indexes for new features is the low hanging fruit.

I worry about how we can ensure that.
If old code is messed up people will just keep propagating that.
I would like to fix old code so that new code is correct.

> 
> As raised, it's not just qemu+linux, it's *at least* also cloud-hypervisor.
> 
> -- 
> Cheers,
> 
> David / dhildenb

There's a slippery slope here in that people will come to us
with buggy devices and ask to change the spec.




-- 
MST




Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread Michael S. Tsirkin
On Fri, Apr 04, 2025 at 05:39:10PM +0200, Halil Pasic wrote:
> That basically means that if I were, for the sake of fun, to do
> 
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -1197,7 +1197,6 @@ static unsigned int features[] = {
> VIRTIO_BALLOON_F_MUST_TELL_HOST,
> VIRTIO_BALLOON_F_STATS_VQ,
> VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> -   VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> VIRTIO_BALLOON_F_PAGE_POISON,
> VIRTIO_BALLOON_F_REPORTING,
>  };
> 
> I would end up with virtio_check_driver_offered_feature() calling
> BUG().


I mean, yes, this is exactly to catch drivers that use
features without negotiating them first. 
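
As a tiny userspace sketch (the names are hypothetical, not the virtio core
API), the guard amounts to refusing any feature-specific operation unless
that feature was actually negotiated:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_vdev { unsigned long long negotiated; };

static bool has_feature(const struct fake_vdev *vdev, unsigned int fbit)
{
	return vdev->negotiated & (1ULL << fbit);
}

#define FAKE_F_REPORTING 5	/* hypothetical feature bit number */

static void use_reporting_vq(const struct fake_vdev *vdev)
{
	/* Trip loudly, like the BUG() above, if the driver touches a
	 * feature it never negotiated. */
	assert(has_feature(vdev, FAKE_F_REPORTING));
	printf("submitting to the reporting vq\n");
}

int main(void)
{
	struct fake_vdev vdev = { .negotiated = 1ULL << FAKE_F_REPORTING };

	use_reporting_vq(&vdev);	/* fine: the feature was negotiated */
	return 0;
}

Dropping the bit from the driver's feature table while still touching the
corresponding vq is exactly the misuse this check is meant to trip on.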


-- 
MST




[PATCH] selftest/mm: Make hugetlb_reparenting_test tolerant to async reparenting

2025-04-07 Thread Li Wang
In cgroup v2, memory and hugetlb usage reparenting is asynchronous.
This can cause test flakiness when immediately asserting usage after
deleting a child cgroup. To address this, add a helper function
`assert_with_retry()` that checks usage values with a timeout-based retry.
This improves test stability without relying on fixed sleep delays.

Also bump up the tolerance size to 7MB.

To avoid false positives such as:
  ...
  # Assert memory charged correctly for child only use.
  # actual a = 11 MB
  # expected a = 0 MB
  # fail
  # cleanup
  # [FAIL]
  not ok 11 hugetlb_reparenting_test.sh -cgroup-v2 # exit=1
  # 0
  # SUMMARY: PASS=10 SKIP=0 FAIL=1

Signed-off-by: Li Wang 
Cc: Waiman Long 
Cc: Anshuman Khandual 
Cc: Dev Jain 
Cc: Kirill A. Shutemov 
Cc: Shuah Khan 
---
 .../selftests/mm/hugetlb_reparenting_test.sh  | 96 ---
 1 file changed, 41 insertions(+), 55 deletions(-)

diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
index 11f9bbe7dc22..1c172c6999f4 100755
--- a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
+++ b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
@@ -36,7 +36,7 @@ else
 do_umount=1
   fi
 fi
-MNT='/mnt/huge/'
+MNT='/mnt/huge'
 
 function get_machine_hugepage_size() {
   hpz=$(grep -i hugepagesize /proc/meminfo)
@@ -60,6 +60,41 @@ function cleanup() {
   set -e
 }
 
+function assert_with_retry() {
+  local actual_path="$1"
+  local expected="$2"
+  local tolerance=$((7 * 1024 * 1024))
+  local timeout=20
+  local interval=1
+  local start_time
+  local now
+  local elapsed
+  local actual
+
+  start_time=$(date +%s)
+
+  while true; do
+    actual="$(cat "$actual_path")"
+
+    if [[ $actual -ge $(($expected - $tolerance)) ]] &&
+       [[ $actual -le $(($expected + $tolerance)) ]]; then
+      return 0
+    fi
+
+    now=$(date +%s)
+    elapsed=$((now - start_time))
+
+    if [[ $elapsed -ge $timeout ]]; then
+      echo "actual = $((${actual%% *} / 1024 / 1024)) MB"
+      echo "expected = $((${expected%% *} / 1024 / 1024)) MB"
+      cleanup
+      exit 1
+    fi
+
+    sleep $interval
+  done
+}
+
 function assert_state() {
   local expected_a="$1"
   local expected_a_hugetlb="$2"
@@ -70,58 +105,13 @@ function assert_state() {
 expected_b="$3"
 expected_b_hugetlb="$4"
   fi
-  local tolerance=$((5 * 1024 * 1024))
-
-  local actual_a
-  actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)"
-  if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] ||
-     [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then
-    echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB
-    echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB
-    echo fail
-
-    cleanup
-    exit 1
-  fi
-
-  local actual_a_hugetlb
-  actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)"
-  if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] ||
-     [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then
-    echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB
-    echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB
-    echo fail
-
-    cleanup
-    exit 1
-  fi
-
-  if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then
-    return
-  fi
-
-  local actual_b
-  actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)"
-  if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] ||
-     [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then
-    echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB
-    echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB
-    echo fail
-
-    cleanup
-    exit 1
-  fi
 
-  local actual_b_hugetlb
-  actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)"
-  if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] ||
-     [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then
-    echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB
-    echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB
-    echo fail
+  assert_with_retry "$CGROUP_ROOT/a/memory.$usage_file" "$expected_a"
+  assert_with_retry "$CGROUP_ROOT/a/hugetlb.${MB}MB.$usage_file" "$expected_a_hugetlb"
 
-    cleanup
-    exit 1
+  if [[ -n "$expected_b" && -n "$expected_b_hugetlb" ]]; then
+    assert_with_retry "$CGROUP_ROOT/a/b/memory.$usage_file" "$expected_b"
+    assert_with_retry "$CGROUP_ROOT/a/b/hugetlb.${MB}MB.$usage_file" "$expected_b_hugetlb"
   fi
 }
 
@@ -174,7 +164,6 @@ size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages.
 
 cleanup
 
-echo
 echo
 echo Test charge, rmdir, uncharge
 setup
@@ -195,7 +184,6 @@ cleanup
 
 echo done
 echo
-echo
 if [[ ! $cgroup2 ]]; then
   echo "Test parent and child hugetlb usage"
   setup
@@ -212,7 +200,6 @@ if [[ ! $cgroup2 ]]; then
   assert_state 0 $(($size * 2)) 0 $size
 
   rmdir "$CGROUP_ROOT"/a/b
-  sleep 5
   echo Assert memory reparent correctly.
   assert_state 0 $(($

Re: [PATCH v1] s390/virtio_ccw: don't allocate/assign airqs for non-existing queues

2025-04-07 Thread Michael S. Tsirkin
On Mon, Apr 07, 2025 at 10:44:21AM +0200, David Hildenbrand wrote:
> > 
> > 
> > 
> > > Whoever adds new feat_X *must be aware* about all previous features,
> > > otherwise we'd be reusing feature bits and everything falls to pieces.
> > 
> > 
> > The knowledge is supposed to be limited to which feature bit to use.
> 
> I think we also have to know which virtqueue bits can be used, right?
> 

what are virtqueue bits? vq number?


-- 
MST



