[PATCH] selftests: bpf: Replace sizeof(arr)/sizeof(arr[0]) with ARRAY_SIZE
From: Feng Yang The ARRAY_SIZE macro is more compact and more formal in linux source. Signed-off-by: Feng Yang --- tools/testing/selftests/bpf/prog_tests/fexit_stress.c| 3 ++- tools/testing/selftests/bpf/prog_tests/log_buf.c | 5 +++-- .../testing/selftests/bpf/prog_tests/module_fentry_shadow.c | 3 ++- .../bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c | 3 ++- .../selftests/bpf/prog_tests/raw_tp_writable_test_run.c | 5 +++-- tools/testing/selftests/bpf/prog_tests/tc_opts.c | 2 +- tools/testing/selftests/bpf/prog_tests/unpriv_bpf_disabled.c | 3 ++- tools/testing/selftests/bpf/progs/syscall.c | 3 ++- tools/testing/selftests/bpf/progs/test_rdonly_maps.c | 3 ++- tools/testing/selftests/bpf/progs/verifier_bits_iter.c | 2 +- 10 files changed, 20 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c index 49b1ffc9af1f..14c91b6f1e83 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include +#include "bpf_util.h" void serial_test_fexit_stress(void) { @@ -36,7 +37,7 @@ void serial_test_fexit_stress(void) for (i = 0; i < bpf_max_tramp_links; i++) { fexit_fd[i] = bpf_prog_load(BPF_PROG_TYPE_TRACING, NULL, "GPL", trace_program, - sizeof(trace_program) / sizeof(struct bpf_insn), + ARRAY_SIZE(trace_program), &trace_opts); if (!ASSERT_GE(fexit_fd[i], 0, "fexit load")) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/log_buf.c b/tools/testing/selftests/bpf/prog_tests/log_buf.c index 0f7ea4d7d9f6..46611040dec3 100644 --- a/tools/testing/selftests/bpf/prog_tests/log_buf.c +++ b/tools/testing/selftests/bpf/prog_tests/log_buf.c @@ -5,6 +5,7 @@ #include #include "test_log_buf.skel.h" +#include "bpf_util.h" static size_t libbpf_log_pos; static char libbpf_log_buf[1024 * 1024]; @@ -143,11 +144,11 @@ static void 
bpf_prog_load_log_buf(void) BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }; - const size_t good_prog_insn_cnt = sizeof(good_prog_insns) / sizeof(struct bpf_insn); + const size_t good_prog_insn_cnt = ARRAY_SIZE(good_prog_insns); const struct bpf_insn bad_prog_insns[] = { BPF_EXIT_INSN(), }; - size_t bad_prog_insn_cnt = sizeof(bad_prog_insns) / sizeof(struct bpf_insn); + size_t bad_prog_insn_cnt = ARRAY_SIZE(bad_prog_insns); LIBBPF_OPTS(bpf_prog_load_opts, opts); const size_t log_buf_sz = 1024 * 1024; char *log_buf; diff --git a/tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c b/tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c index aa9f67eb1c95..bea05f78de5f 100644 --- a/tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c +++ b/tools/testing/selftests/bpf/prog_tests/module_fentry_shadow.c @@ -4,6 +4,7 @@ #include #include "bpf/libbpf_internal.h" #include "cgroup_helpers.h" +#include "bpf_util.h" static const char *module_name = "bpf_testmod"; static const char *symbol_name = "bpf_fentry_shadow_test"; @@ -100,7 +101,7 @@ void test_module_fentry_shadow(void) load_opts.attach_btf_obj_fd = btf_fd[i]; prog_fd[i] = bpf_prog_load(BPF_PROG_TYPE_TRACING, NULL, "GPL", trace_program, - sizeof(trace_program) / sizeof(struct bpf_insn), + ARRAY_SIZE(trace_program), &load_opts); if (!ASSERT_GE(prog_fd[i], 0, "bpf_prog_load")) goto out; diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c index e2f1445b0e10..216b0dfac0fe 100644 --- a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c +++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c @@ -2,6 +2,7 @@ #include #include +#include "bpf_util.h" void test_raw_tp_writable_reject_nbd_invalid(void) { @@ -25,7 +26,7 @@ void test_raw_tp_writable_reject_nbd_invalid(void) ); bpf_fd = bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, NULL, 
"GPL v2", - program, sizeof(program) / sizeof(struct bpf_insn), + program, ARRAY_SIZE(program), &opts); if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writab
Re: [PATCH vhost v2 00/10] vdpa/mlx5: Parallelize device suspend/resume
On Mon, Sep 2, 2024 at 7:05 PM Dragos Tatulea wrote: > > Hi Lei, > > On 02.09.24 12:03, Lei Yang wrote: > > Hi Dragos > > > > QE tested this series with mellanox nic, it failed with [1] when > > booting guest, and host dmesg also will print messages [2]. This bug > > can be reproduced boot guest with vhost-vdpa device. > > > > [1] qemu) qemu-kvm: vhost VQ 1 ring restore failed: -1: Operation not > > permitted (1) > > qemu-kvm: vhost VQ 0 ring restore failed: -1: Operation not permitted (1) > > qemu-kvm: unable to start vhost net: 5: falling back on userspace virtio > > qemu-kvm: vhost_set_features failed: Device or resource busy (16) > > qemu-kvm: unable to start vhost net: 16: falling back on userspace virtio > > > > [2] Host dmesg: > > [ 1406.187977] mlx5_core :0d:00.2: > > mlx5_vdpa_compat_reset:3267:(pid 8506): performing device reset > > [ 1406.189221] mlx5_core :0d:00.2: > > mlx5_vdpa_compat_reset:3267:(pid 8506): performing device reset > > [ 1406.190354] mlx5_core :0d:00.2: > > mlx5_vdpa_show_mr_leaks:573:(pid 8506) warning: mkey still alive after > > resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2 > > [ 1471.538487] mlx5_core :0d:00.2: cb_timeout_handler:938:(pid > > 428): cmd[13]: MODIFY_GENERAL_OBJECT(0xa01) Async, timeout. Will cause > > a leak of a command resource > > [ 1471.539486] mlx5_core :0d:00.2: cb_timeout_handler:938:(pid > > 428): cmd[12]: MODIFY_GENERAL_OBJECT(0xa01) Async, timeout. 
Will cause > > a leak of a command resource > > [ 1471.540351] mlx5_core :0d:00.2: modify_virtqueues:1617:(pid > > 8511) error: modify vq 0 failed, state: 0 -> 0, err: 0 > > [ 1471.541433] mlx5_core :0d:00.2: modify_virtqueues:1617:(pid > > 8511) error: modify vq 1 failed, state: 0 -> 0, err: -110 > > [ 1471.542388] mlx5_core :0d:00.2: mlx5_vdpa_set_status:3203:(pid > > 8511) warning: failed to resume VQs > > [ 1471.549778] mlx5_core :0d:00.2: > > mlx5_vdpa_show_mr_leaks:573:(pid 8511) warning: mkey still alive after > > resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2 > > [ 1512.929854] mlx5_core :0d:00.2: > > mlx5_vdpa_compat_reset:3267:(pid 8565): performing device reset > > [ 1513.100290] mlx5_core :0d:00.2: > > mlx5_vdpa_show_mr_leaks:573:(pid 8565) warning: mkey still alive after > > resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2 > > Hi Dragos > Can you provide more details about the qemu version and the vdpa device > options used? > > Also, which FW version are you using? There is a relevant bug in FW > 22.41.1000 which was fixed in the latest FW (22.42.1000). Did you > encounter any FW syndromes in the host dmesg log? This problem has gone when I updated the firmware version to 22.42.1000, and I tested it with regression tests using mellanox nic, everything works well. Tested-by: Lei Yang > > Thanks, > Dragos >
[PATCH] selftests: net: convert comma to semicolon
Replace a comma between expression statements by a semicolon. Signed-off-by: Chen Ni --- tools/testing/selftests/net/psock_fanout.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c index 1a736f700be4..4f31e92ebd96 100644 --- a/tools/testing/selftests/net/psock_fanout.c +++ b/tools/testing/selftests/net/psock_fanout.c @@ -165,9 +165,9 @@ static void sock_fanout_set_ebpf(int fd) attr.insns = (unsigned long) prog; attr.insn_cnt = ARRAY_SIZE(prog); attr.license = (unsigned long) "GPL"; - attr.log_buf = (unsigned long) log_buf, - attr.log_size = sizeof(log_buf), - attr.log_level = 1, + attr.log_buf = (unsigned long) log_buf; + attr.log_size = sizeof(log_buf); + attr.log_level = 1; pfd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); if (pfd < 0) { -- 2.25.1
Re: [PATCH vhost v2 00/10] vdpa/mlx5: Parallelize device suspend/resume
On 03.09.24 09:40, Lei Yang wrote: > On Mon, Sep 2, 2024 at 7:05 PM Dragos Tatulea wrote: >> >> Hi Lei, >> >> On 02.09.24 12:03, Lei Yang wrote: >>> Hi Dragos >>> >>> QE tested this series with mellanox nic, it failed with [1] when >>> booting guest, and host dmesg also will print messages [2]. This bug >>> can be reproduced boot guest with vhost-vdpa device. >>> >>> [1] qemu) qemu-kvm: vhost VQ 1 ring restore failed: -1: Operation not >>> permitted (1) >>> qemu-kvm: vhost VQ 0 ring restore failed: -1: Operation not permitted (1) >>> qemu-kvm: unable to start vhost net: 5: falling back on userspace virtio >>> qemu-kvm: vhost_set_features failed: Device or resource busy (16) >>> qemu-kvm: unable to start vhost net: 16: falling back on userspace virtio >>> >>> [2] Host dmesg: >>> [ 1406.187977] mlx5_core :0d:00.2: >>> mlx5_vdpa_compat_reset:3267:(pid 8506): performing device reset >>> [ 1406.189221] mlx5_core :0d:00.2: >>> mlx5_vdpa_compat_reset:3267:(pid 8506): performing device reset >>> [ 1406.190354] mlx5_core :0d:00.2: >>> mlx5_vdpa_show_mr_leaks:573:(pid 8506) warning: mkey still alive after >>> resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2 >>> [ 1471.538487] mlx5_core :0d:00.2: cb_timeout_handler:938:(pid >>> 428): cmd[13]: MODIFY_GENERAL_OBJECT(0xa01) Async, timeout. Will cause >>> a leak of a command resource >>> [ 1471.539486] mlx5_core :0d:00.2: cb_timeout_handler:938:(pid >>> 428): cmd[12]: MODIFY_GENERAL_OBJECT(0xa01) Async, timeout. 
Will cause >>> a leak of a command resource >>> [ 1471.540351] mlx5_core :0d:00.2: modify_virtqueues:1617:(pid >>> 8511) error: modify vq 0 failed, state: 0 -> 0, err: 0 >>> [ 1471.541433] mlx5_core :0d:00.2: modify_virtqueues:1617:(pid >>> 8511) error: modify vq 1 failed, state: 0 -> 0, err: -110 >>> [ 1471.542388] mlx5_core :0d:00.2: mlx5_vdpa_set_status:3203:(pid >>> 8511) warning: failed to resume VQs >>> [ 1471.549778] mlx5_core :0d:00.2: >>> mlx5_vdpa_show_mr_leaks:573:(pid 8511) warning: mkey still alive after >>> resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2 >>> [ 1512.929854] mlx5_core :0d:00.2: >>> mlx5_vdpa_compat_reset:3267:(pid 8565): performing device reset >>> [ 1513.100290] mlx5_core :0d:00.2: >>> mlx5_vdpa_show_mr_leaks:573:(pid 8565) warning: mkey still alive after >>> resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2 >>> > > Hi Dragos > >> Can you provide more details about the qemu version and the vdpa device >> options used? >> >> Also, which FW version are you using? There is a relevant bug in FW >> 22.41.1000 which was fixed in the latest FW (22.42.1000). Did you >> encounter any FW syndromes in the host dmesg log? > > This problem has gone when I updated the firmware version to > 22.42.1000, and I tested it with regression tests using mellanox nic, > everything works well. > > Tested-by: Lei Yang Good to hear. Thanks for the quick reaction. Thanks, Dragos
Re: [RFC 05/31] x86/compiler: Tweak __UNIQUE_ID naming
On Mon, Sep 02, 2024 at 08:59:48PM -0700, Josh Poimboeuf wrote: > Add an underscore between the "name" and the counter so tooling can > distinguish between the non-unique and unique portions of the symbol > name. > > This will come in handy for "objtool klp diff". > > Signed-off-by: Josh Poimboeuf > --- > include/linux/compiler.h | 8 ++-- > 1 file changed, 6 insertions(+), 2 deletions(-) > > diff --git a/include/linux/compiler.h b/include/linux/compiler.h > index 8c252e073bd8..d3f100821d45 100644 > --- a/include/linux/compiler.h > +++ b/include/linux/compiler.h > @@ -186,7 +186,11 @@ void ftrace_likely_update(struct ftrace_likely_data *f, > int val, > __asm__ ("" : "=r" (var) : "0" (var)) > #endif > > -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), > __COUNTER__) > +/* Format: __UNIQUE_ID__<__COUNTER__> */ > +#define __UNIQUE_ID(name)\ > + __PASTE(__UNIQUE_ID_, \ > + __PASTE(name, \ > + __PASTE(_, __COUNTER__))) OK, that's just painful to read; how about so? __PASTE(__UNIQUE_ID_, \ __PASTE(name, \ __PASTE(_, __COUNTER))) > > /** > * data_race - mark an expression as containing intentional data races > @@ -218,7 +222,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, > int val, > */ > #define ___ADDRESSABLE(sym, __attrs) \ > static void * __used __attrs \ > - __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; > + __UNIQUE_ID(__PASTE(addressable_, sym)) = (void *)(uintptr_t)&sym; This change doesn't get mention ? > #define __ADDRESSABLE(sym) \ > ___ADDRESSABLE(sym, __section(".discard.addressable")) > > -- > 2.45.2 >
Re: [RFC 07/31] kbuild: Remove "kmod" prefix from __KBUILD_MODNAME
On Mon, Sep 02, 2024 at 08:59:50PM -0700, Josh Poimboeuf wrote: > Remove the arbitrary "kmod" prefix from __KBUILD_MODNAME and add it back > manually in the __initcall_id() macro. > > This makes it more consistent, now __KBUILD_MODNAME is just the > non-stringified version of KBUILD_MODNAME. It will come in handy for > the upcoming "objtool klp diff". > > Signed-off-by: Josh Poimboeuf > --- > include/linux/init.h | 3 ++- > scripts/Makefile.lib | 2 +- > 2 files changed, 3 insertions(+), 2 deletions(-) > > diff --git a/include/linux/init.h b/include/linux/init.h > index 58cef4c2e59a..b1921f4a7b7e 100644 > --- a/include/linux/init.h > +++ b/include/linux/init.h > @@ -206,12 +206,13 @@ extern struct module __this_module; > > /* Format: */ > #define __initcall_id(fn)\ > + __PASTE(kmod_, \ > __PASTE(__KBUILD_MODNAME, \ > __PASTE(__, \ > __PASTE(__COUNTER__,\ > __PASTE(_, \ > __PASTE(__LINE__, \ > - __PASTE(_, fn)) > + __PASTE(_, fn))) :-(
Re: [PATCH vhost v2 00/10] vdpa/mlx5: Parallelize device suspend/resume
On Tue, Sep 3, 2024 at 9:48 AM Dragos Tatulea wrote: > > > > On 03.09.24 09:40, Lei Yang wrote: > > On Mon, Sep 2, 2024 at 7:05 PM Dragos Tatulea wrote: > >> > >> Hi Lei, > >> > >> On 02.09.24 12:03, Lei Yang wrote: > >>> Hi Dragos > >>> > >>> QE tested this series with mellanox nic, it failed with [1] when > >>> booting guest, and host dmesg also will print messages [2]. This bug > >>> can be reproduced boot guest with vhost-vdpa device. > >>> > >>> [1] qemu) qemu-kvm: vhost VQ 1 ring restore failed: -1: Operation not > >>> permitted (1) > >>> qemu-kvm: vhost VQ 0 ring restore failed: -1: Operation not permitted (1) > >>> qemu-kvm: unable to start vhost net: 5: falling back on userspace virtio > >>> qemu-kvm: vhost_set_features failed: Device or resource busy (16) > >>> qemu-kvm: unable to start vhost net: 16: falling back on userspace virtio > >>> > >>> [2] Host dmesg: > >>> [ 1406.187977] mlx5_core :0d:00.2: > >>> mlx5_vdpa_compat_reset:3267:(pid 8506): performing device reset > >>> [ 1406.189221] mlx5_core :0d:00.2: > >>> mlx5_vdpa_compat_reset:3267:(pid 8506): performing device reset > >>> [ 1406.190354] mlx5_core :0d:00.2: > >>> mlx5_vdpa_show_mr_leaks:573:(pid 8506) warning: mkey still alive after > >>> resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2 > >>> [ 1471.538487] mlx5_core :0d:00.2: cb_timeout_handler:938:(pid > >>> 428): cmd[13]: MODIFY_GENERAL_OBJECT(0xa01) Async, timeout. Will cause > >>> a leak of a command resource > >>> [ 1471.539486] mlx5_core :0d:00.2: cb_timeout_handler:938:(pid > >>> 428): cmd[12]: MODIFY_GENERAL_OBJECT(0xa01) Async, timeout. 
Will cause > >>> a leak of a command resource > >>> [ 1471.540351] mlx5_core :0d:00.2: modify_virtqueues:1617:(pid > >>> 8511) error: modify vq 0 failed, state: 0 -> 0, err: 0 > >>> [ 1471.541433] mlx5_core :0d:00.2: modify_virtqueues:1617:(pid > >>> 8511) error: modify vq 1 failed, state: 0 -> 0, err: -110 > >>> [ 1471.542388] mlx5_core :0d:00.2: mlx5_vdpa_set_status:3203:(pid > >>> 8511) warning: failed to resume VQs > >>> [ 1471.549778] mlx5_core :0d:00.2: > >>> mlx5_vdpa_show_mr_leaks:573:(pid 8511) warning: mkey still alive after > >>> resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2 > >>> [ 1512.929854] mlx5_core :0d:00.2: > >>> mlx5_vdpa_compat_reset:3267:(pid 8565): performing device reset > >>> [ 1513.100290] mlx5_core :0d:00.2: > >>> mlx5_vdpa_show_mr_leaks:573:(pid 8565) warning: mkey still alive after > >>> resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2 > >>> > > > > Hi Dragos > > > >> Can you provide more details about the qemu version and the vdpa device > >> options used? > >> > >> Also, which FW version are you using? There is a relevant bug in FW > >> 22.41.1000 which was fixed in the latest FW (22.42.1000). Did you > >> encounter any FW syndromes in the host dmesg log? > > > > This problem has gone when I updated the firmware version to > > 22.42.1000, and I tested it with regression tests using mellanox nic, > > everything works well. > > > > Tested-by: Lei Yang > Good to hear. Thanks for the quick reaction. > Is it possible to add a check so it doesn't use the async fashion in old FW?
Re: [PATCH vhost v2 00/10] vdpa/mlx5: Parallelize device suspend/resume
On 03.09.24 10:10, Eugenio Perez Martin wrote: > On Tue, Sep 3, 2024 at 9:48 AM Dragos Tatulea wrote: >> >> >> >> On 03.09.24 09:40, Lei Yang wrote: >>> On Mon, Sep 2, 2024 at 7:05 PM Dragos Tatulea wrote: Hi Lei, On 02.09.24 12:03, Lei Yang wrote: > Hi Dragos > > QE tested this series with mellanox nic, it failed with [1] when > booting guest, and host dmesg also will print messages [2]. This bug > can be reproduced boot guest with vhost-vdpa device. > > [1] qemu) qemu-kvm: vhost VQ 1 ring restore failed: -1: Operation not > permitted (1) > qemu-kvm: vhost VQ 0 ring restore failed: -1: Operation not permitted (1) > qemu-kvm: unable to start vhost net: 5: falling back on userspace virtio > qemu-kvm: vhost_set_features failed: Device or resource busy (16) > qemu-kvm: unable to start vhost net: 16: falling back on userspace virtio > > [2] Host dmesg: > [ 1406.187977] mlx5_core :0d:00.2: > mlx5_vdpa_compat_reset:3267:(pid 8506): performing device reset > [ 1406.189221] mlx5_core :0d:00.2: > mlx5_vdpa_compat_reset:3267:(pid 8506): performing device reset > [ 1406.190354] mlx5_core :0d:00.2: > mlx5_vdpa_show_mr_leaks:573:(pid 8506) warning: mkey still alive after > resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2 > [ 1471.538487] mlx5_core :0d:00.2: cb_timeout_handler:938:(pid > 428): cmd[13]: MODIFY_GENERAL_OBJECT(0xa01) Async, timeout. Will cause > a leak of a command resource > [ 1471.539486] mlx5_core :0d:00.2: cb_timeout_handler:938:(pid > 428): cmd[12]: MODIFY_GENERAL_OBJECT(0xa01) Async, timeout. 
Will cause > a leak of a command resource > [ 1471.540351] mlx5_core :0d:00.2: modify_virtqueues:1617:(pid > 8511) error: modify vq 0 failed, state: 0 -> 0, err: 0 > [ 1471.541433] mlx5_core :0d:00.2: modify_virtqueues:1617:(pid > 8511) error: modify vq 1 failed, state: 0 -> 0, err: -110 > [ 1471.542388] mlx5_core :0d:00.2: mlx5_vdpa_set_status:3203:(pid > 8511) warning: failed to resume VQs > [ 1471.549778] mlx5_core :0d:00.2: > mlx5_vdpa_show_mr_leaks:573:(pid 8511) warning: mkey still alive after > resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2 > [ 1512.929854] mlx5_core :0d:00.2: > mlx5_vdpa_compat_reset:3267:(pid 8565): performing device reset > [ 1513.100290] mlx5_core :0d:00.2: > mlx5_vdpa_show_mr_leaks:573:(pid 8565) warning: mkey still alive after > resource delete: mr: 0c5ccca2, mkey: 0x4000, refcount: 2 > >>> >>> Hi Dragos >>> Can you provide more details about the qemu version and the vdpa device options used? Also, which FW version are you using? There is a relevant bug in FW 22.41.1000 which was fixed in the latest FW (22.42.1000). Did you encounter any FW syndromes in the host dmesg log? >>> >>> This problem has gone when I updated the firmware version to >>> 22.42.1000, and I tested it with regression tests using mellanox nic, >>> everything works well. >>> >>> Tested-by: Lei Yang >> Good to hear. Thanks for the quick reaction. >> > > Is it possible to add a check so it doesn't use the async fashion in old FW? > Unfortunately not, it would have been there otherwise. Note that this affects only FW version 22.41.1000. Older versions are not affected because VQ resume is not supported. Thanks, Dragos
Re: [RFC 20/31] objtool: Add UD1 detection
On Mon, Sep 02, 2024 at 09:00:03PM -0700, Josh Poimboeuf wrote: > A UD1 isn't a BUG and shouldn't be treated like one. > > Signed-off-by: Josh Poimboeuf > --- > tools/objtool/arch/x86/decode.c | 10 +- > 1 file changed, 9 insertions(+), 1 deletion(-) > > diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c > index 6b34b058a821..72d55dcd3d7f 100644 > --- a/tools/objtool/arch/x86/decode.c > +++ b/tools/objtool/arch/x86/decode.c > @@ -528,11 +528,19 @@ int arch_decode_instruction(struct objtool_file *file, > const struct section *sec > /* sysenter, sysret */ > insn->type = INSN_CONTEXT_SWITCH; > > - } else if (op2 == 0x0b || op2 == 0xb9) { > + } else if (op2 == 0x0b) { > > /* ud2 */ > insn->type = INSN_BUG; > > + } else if (op2 == 0xb9) { > + > + /* > + * ud1 - only used for the static call trampoline to > + * stop speculation. Basically used like an int3. > + */ > + insn->type = INSN_TRAP; > + > } else if (op2 == 0x0d || op2 == 0x1f) { > > /* nopl/nopw */ We recently grew more UD1 usage... --- commit 7424fc6b86c8980a87169e005f5cd4438d18efe6 Author: Gatlin Newhouse Date: Wed Jul 24 00:01:55 2024 + x86/traps: Enable UBSAN traps on x86 Currently ARM64 extracts which specific sanitizer has caused a trap via encoded data in the trap instruction. Clang on x86 currently encodes the same data in the UD1 instruction but x86 handle_bug() and is_valid_bugaddr() currently only look at UD2. Bring x86 to parity with ARM64, similar to commit 25b84002afb9 ("arm64: Support Clang UBSAN trap codes for better reporting"). See the llvm links for information about the code generation. Enable the reporting of UBSAN sanitizer details on x86 compiled with clang when CONFIG_UBSAN_TRAP=y by analysing UD1 and retrieving the type immediate which is encoded by the compiler after the UD1. 
[ tglx: Simplified it by moving the printk() into handle_bug() ] Signed-off-by: Gatlin Newhouse Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Kees Cook Link: https://lore.kernel.org/all/20240724000206.451425-1-gatlin.newho...@gmail.com Link: https://github.com/llvm/llvm-project/commit/c5978f42ec8e9#diff-bb68d7cd885f41cfc35843998b0f9f534adb60b415f647109e597ce448e92d9f Link: https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/X86/X86InstrSystem.td#L27 diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index a3ec87d198ac..806649c7f23d 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -13,6 +13,18 @@ #define INSN_UD2 0x0b0f #define LEN_UD22 +/* + * In clang we have UD1s reporting UBSAN failures on X86, 64 and 32bit. + */ +#define INSN_ASOP 0x67 +#define OPCODE_ESCAPE 0x0f +#define SECOND_BYTE_OPCODE_UD1 0xb9 +#define SECOND_BYTE_OPCODE_UD2 0x0b + +#define BUG_NONE 0x +#define BUG_UD10xfffe +#define BUG_UD20xfffd + #ifdef CONFIG_GENERIC_BUG #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4fa0b17e5043..415881607c5d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -91,6 +92,47 @@ __always_inline int is_valid_bugaddr(unsigned long addr) return *(unsigned short *)addr == INSN_UD2; } +/* + * Check for UD1 or UD2, accounting for Address Size Override Prefixes. + * If it's a UD1, get the ModRM byte to pass along to UBSan. 
+ */ +__always_inline int decode_bug(unsigned long addr, u32 *imm) +{ + u8 v; + + if (addr < TASK_SIZE_MAX) + return BUG_NONE; + + v = *(u8 *)(addr++); + if (v == INSN_ASOP) + v = *(u8 *)(addr++); + if (v != OPCODE_ESCAPE) + return BUG_NONE; + + v = *(u8 *)(addr++); + if (v == SECOND_BYTE_OPCODE_UD2) + return BUG_UD2; + + if (!IS_ENABLED(CONFIG_UBSAN_TRAP) || v != SECOND_BYTE_OPCODE_UD1) + return BUG_NONE; + + /* Retrieve the immediate (type value) for the UBSAN UD1 */ + v = *(u8 *)(addr++); + if (X86_MODRM_RM(v) == 4) + addr++; + + *imm = 0; + if (X86_MODRM_MOD(v) == 1) + *imm = *(u8 *)addr; + else if (X86_MODRM_MOD(v) == 2) + *imm = *(u32 *)addr; + else + WARN_ONCE(1, "Unexpected MODRM_MOD: %u\n", X86_MODRM_MOD(v)); + + return BUG_UD1; +} + + static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
Re: [RFC 27/31] objtool: Fix weak symbol detection
On Mon, Sep 02, 2024 at 09:00:10PM -0700, Josh Poimboeuf wrote:
> @@ -433,9 +433,13 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym)
> 	/*
> 	 * Don't store empty STT_NOTYPE symbols in the rbtree. They
> 	 * can exist within a function, confusing the sorting.
> 	 *
> 	 * TODO: is this still true?

a2e38dffcd93 ("objtool: Don't add empty symbols to the rbtree")

I don't think that changed.
Re: [RFC 28/31] x86/alternative: Create symbols for special section entries
On Mon, Sep 02, 2024 at 09:00:11PM -0700, Josh Poimboeuf wrote:
> Create a symbol for each special section entry. This helps objtool
> extract needed entries.

A little more explanation would be nice...
Re: [PATCH v4 1/2] virtiofs: use pages instead of pointer for kernel direct IO
On 8/31/24 5:37 PM, Hou Tao wrote: > From: Hou Tao > > When trying to insert a 10MB kernel module kept in a virtio-fs with cache > disabled, the following warning was reported: > > [ cut here ] > WARNING: CPU: 1 PID: 404 at mm/page_alloc.c:4551 .. > Modules linked in: > CPU: 1 PID: 404 Comm: insmod Not tainted 6.9.0-rc5+ #123 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) .. > RIP: 0010:__alloc_pages+0x2bf/0x380 > .. > Call Trace: > >? __warn+0x8e/0x150 >? __alloc_pages+0x2bf/0x380 >__kmalloc_large_node+0x86/0x160 >__kmalloc+0x33c/0x480 >virtio_fs_enqueue_req+0x240/0x6d0 >virtio_fs_wake_pending_and_unlock+0x7f/0x190 >queue_request_and_unlock+0x55/0x60 >fuse_simple_request+0x152/0x2b0 >fuse_direct_io+0x5d2/0x8c0 >fuse_file_read_iter+0x121/0x160 >__kernel_read+0x151/0x2d0 >kernel_read+0x45/0x50 >kernel_read_file+0x1a9/0x2a0 >init_module_from_file+0x6a/0xe0 >idempotent_init_module+0x175/0x230 >__x64_sys_finit_module+0x5d/0xb0 >x64_sys_call+0x1c3/0x9e0 >do_syscall_64+0x3d/0xc0 >entry_SYSCALL_64_after_hwframe+0x4b/0x53 >.. > > ---[ end trace ]--- > > The warning is triggered as follows: > > 1) syscall finit_module() handles the module insertion and it invokes > kernel_read_file() to read the content of the module first. > > 2) kernel_read_file() allocates a 10MB buffer by using vmalloc() and > passes it to kernel_read(). kernel_read() constructs a kvec iter by > using iov_iter_kvec() and passes it to fuse_file_read_iter(). > > 3) virtio-fs disables the cache, so fuse_file_read_iter() invokes > fuse_direct_io(). As for now, the maximal read size for kvec iter is > only limited by fc->max_read. For virtio-fs, max_read is UINT_MAX, so > fuse_direct_io() doesn't split the 10MB buffer. It saves the address and > the size of the 10MB-sized buffer in out_args[0] of a fuse request and > passes the fuse request to virtio_fs_wake_pending_and_unlock(). > > 4) virtio_fs_wake_pending_and_unlock() uses virtio_fs_enqueue_req() to > queue the request. 
Because virtiofs need DMA-able address, so > virtio_fs_enqueue_req() uses kmalloc() to allocate a bounce buffer for > all fuse args, copies these args into the bounce buffer and passed the > physical address of the bounce buffer to virtiofsd. The total length of > these fuse args for the passed fuse request is about 10MB, so > copy_args_to_argbuf() invokes kmalloc() with a 10MB size parameter and > it triggers the warning in __alloc_pages(): > > if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp)) > return NULL; > > 5) virtio_fs_enqueue_req() will retry the memory allocation in a > kworker, but it won't help, because kmalloc() will always return NULL > due to the abnormal size and finit_module() will hang forever. > > A feasible solution is to limit the value of max_read for virtio-fs, so > the length passed to kmalloc() will be limited. However it will affect > the maximal read size for normal read. And for virtio-fs write initiated > from kernel, it has the similar problem but now there is no way to limit > fc->max_write in kernel. > > So instead of limiting both the values of max_read and max_write in > kernel, introducing use_pages_for_kvec_io in fuse_conn and setting it as > true in virtiofs. When use_pages_for_kvec_io is enabled, fuse will use > pages instead of pointer to pass the KVEC_IO data. > > After switching to pages for KVEC_IO data, these pages will be used for > DMA through virtio-fs. If these pages are backed by vmalloc(), > {flush|invalidate}_kernel_vmap_range() are necessary to flush or > invalidate the cache before the DMA operation. So add two new fields in > fuse_args_pages to record the base address of vmalloc area and the > condition indicating whether invalidation is needed. Perform the flush > in fuse_get_user_pages() for write operations and the invalidation in > fuse_release_user_pages() for read operations. 
> > It may seem necessary to introduce another field in fuse_conn to > indicate that these KVEC_IO pages are used for DMA, However, considering > that virtio-fs is currently the only user of use_pages_for_kvec_io, just > reuse use_pages_for_kvec_io to indicate that these pages will be used > for DMA. > > Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") > Signed-off-by: Hou Tao Tested-by: Jingbo Xu > --- > fs/fuse/file.c | 62 +++-- > fs/fuse/fuse_i.h| 6 + > fs/fuse/virtio_fs.c | 1 + > 3 files changed, 50 insertions(+), 19 deletions(-) > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index f39456c65ed7..331208d3e4d1 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -645,7 +645,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct > file *file, loff_t pos, > args->out_args[0].size = count; > } > > -static void fuse_release_user_pages(struct fuse_
Re: [PATCH v4 2/2] virtiofs: use GFP_NOFS when enqueuing request through kworker
On 8/31/24 5:37 PM, Hou Tao wrote: > From: Hou Tao > > When invoking virtio_fs_enqueue_req() through kworker, both the > allocation of the sg array and the bounce buffer still use GFP_ATOMIC. > Considering the size of the sg array may be greater than PAGE_SIZE, use > GFP_NOFS instead of GFP_ATOMIC to lower the possibility of memory > allocation failure and to avoid unnecessarily depleting the atomic > reserves. GFP_NOFS is not passed to virtio_fs_enqueue_req() directly, > GFP_KERNEL and memalloc_nofs_{save|restore} helpers are used instead. > > It may seem OK to pass GFP_NOFS to virtio_fs_enqueue_req() as well when > queuing the request for the first time, but this is not the case. The > reason is that fuse_request_queue_background() may call > ->queue_request_and_unlock() while holding fc->bg_lock, which is a > spin-lock. Therefore, still use GFP_ATOMIC for it. Actually, .wake_pending_and_unlock() is called under fiq->lock and GFP_ATOMIC is requisite. > > Signed-off-by: Hou Tao > --- > fs/fuse/virtio_fs.c | 24 +++- > 1 file changed, 15 insertions(+), 9 deletions(-) > > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c > index 43d66ab5e891..9bc48b3ca384 100644 > --- a/fs/fuse/virtio_fs.c > +++ b/fs/fuse/virtio_fs.c > @@ -95,7 +95,8 @@ struct virtio_fs_req_work { > }; > > static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, > - struct fuse_req *req, bool in_flight); > + struct fuse_req *req, bool in_flight, > + gfp_t gfp); > > static const struct constant_table dax_param_enums[] = { > {"always", FUSE_DAX_ALWAYS }, > @@ -439,6 +440,8 @@ static void virtio_fs_request_dispatch_work(struct > work_struct *work) > > /* Dispatch pending requests */ > while (1) { > + unsigned int flags; > + > spin_lock(&fsvq->lock); > req = list_first_entry_or_null(&fsvq->queued_reqs, > struct fuse_req, list); > @@ -449,7 +452,9 @@ static void virtio_fs_request_dispatch_work(struct > work_struct *work) > list_del_init(&req->list); > spin_unlock(&fsvq->lock); > > - ret = 
virtio_fs_enqueue_req(fsvq, req, true); > + flags = memalloc_nofs_save(); > + ret = virtio_fs_enqueue_req(fsvq, req, true, GFP_KERNEL); > + memalloc_nofs_restore(flags); > if (ret < 0) { > if (ret == -ENOSPC) { > spin_lock(&fsvq->lock); > @@ -550,7 +555,7 @@ static void virtio_fs_hiprio_dispatch_work(struct > work_struct *work) > } > > /* Allocate and copy args into req->argbuf */ > -static int copy_args_to_argbuf(struct fuse_req *req) > +static int copy_args_to_argbuf(struct fuse_req *req, gfp_t gfp) > { > struct fuse_args *args = req->args; > unsigned int offset = 0; > @@ -564,7 +569,7 @@ static int copy_args_to_argbuf(struct fuse_req *req) > len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) + > fuse_len_args(num_out, args->out_args); > > - req->argbuf = kmalloc(len, GFP_ATOMIC); > + req->argbuf = kmalloc(len, gfp); > if (!req->argbuf) > return -ENOMEM; > > @@ -1239,7 +1244,8 @@ static unsigned int sg_init_fuse_args(struct > scatterlist *sg, > > /* Add a request to a virtqueue and kick the device */ > static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, > - struct fuse_req *req, bool in_flight) > + struct fuse_req *req, bool in_flight, > + gfp_t gfp) > { > /* requests need at least 4 elements */ > struct scatterlist *stack_sgs[6]; > @@ -1260,8 +1266,8 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq > *fsvq, > /* Does the sglist fit on the stack? 
*/ > total_sgs = sg_count_fuse_req(req); > if (total_sgs > ARRAY_SIZE(stack_sgs)) { > - sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC); > - sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC); > + sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), gfp); > + sg = kmalloc_array(total_sgs, sizeof(sg[0]), gfp); > if (!sgs || !sg) { > ret = -ENOMEM; > goto out; > @@ -1269,7 +1275,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq > *fsvq, > } > > /* Use a bounce buffer since stack args cannot be mapped */ > - ret = copy_args_to_argbuf(req); > + ret = copy_args_to_argbuf(req, gfp); > if (ret < 0) > goto out; > > @@ -1367,7 +1373,7 @@ __releases(fiq->lock) >queue_id); > > fsvq = &fs->vqs[queue_id]; > - ret = virtio_fs_enqueue_req(fsvq, req, false); > + ret = virtio_fs_enqueue_req(fsvq, req, false, GFP_ATOMIC); > if (ret
[PATCH] selftests/net: do_setcpu function does not need a return value
In do_setcpu, the function does not need to have a return value; the returned value is meaningless. Signed-off-by: Liu Jing --- tools/testing/selftests/net/msg_zerocopy.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c index bdc03a2097e8..0b54f2011449 100644 --- a/tools/testing/selftests/net/msg_zerocopy.c +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -118,7 +118,7 @@ static uint16_t get_ip_csum(const uint16_t *start, int num_words) return ~sum; } -static int do_setcpu(int cpu) +static void do_setcpu(int cpu) { cpu_set_t mask; @@ -129,7 +129,6 @@ static int do_setcpu(int cpu) else if (cfg_verbose) fprintf(stderr, "cpu: %u\n", cpu); - return 0; } static void do_setsockopt(int fd, int level, int optname, int val) -- 2.33.0
Re: [PATCH v3 3/3] kselftest: Provide __cpuid_count() stub on non-x86 archs
On Thu, 29 Aug 2024, Ilpo Järvinen wrote: > Building resctrl selftest fails on ARM because it uses __cpuid_count() > that fails the build with error: > > CC resctrl_tests > In file included from resctrl.h:24, > from cat_test.c:11: > In function 'arch_supports_noncont_cat', > inlined from 'noncont_cat_run_test' at cat_test.c:323:6: > ../kselftest.h:74:9: error: impossible constraint in 'asm' >74 | __asm__ __volatile__ ("cpuid\n\t" \ > | ^~~ > cat_test.c:301:17: note: in expansion of macro '__cpuid_count' > 301 | __cpuid_count(0x10, 1, eax, ebx, ecx, edx); > | ^ > ../kselftest.h:74:9: error: impossible constraint in 'asm' >74 | __asm__ __volatile__ ("cpuid\n\t" \ > | ^~~ > cat_test.c:303:17: note: in expansion of macro '__cpuid_count' > 303 | __cpuid_count(0x10, 2, eax, ebx, ecx, edx); > | ^ > > The resctrl selftest would run that code only on Intel CPUs but as is, > the code cannot be build at all. > > Define HAVE_CPUID in lib.mk based on ARCH (x86 or x86_64). If ARCH is > not set, acquire it using uname -m. > > Provide a stub for __cpuid_count() if HAVE_CPUID is not present to > allow build to succeed. The stub casts its arguments to void to avoid > causing "unused variable" or "set but not used" warnings. 
> > Fixes: ae638551ab64 ("selftests/resctrl: Add non-contiguous CBMs CAT test") > Reported-by: Muhammad Usama Anjum > Signed-off-by: Ilpo Järvinen > --- > v3: > - Remove "empty" wording > - Also cast input parameters to void > - Initialize ARCH from uname -m if not set (this might allow cleaning > up some other makefiles but that is left as future work) > v2: > - Removed RFC & added Fixes and Tested-by > - Fixed the error message's line splits > - Noted down the reason for void casts in the stub > --- > tools/testing/selftests/kselftest.h | 6 ++ > tools/testing/selftests/lib.mk | 6 ++ > 2 files changed, 12 insertions(+) > > diff --git a/tools/testing/selftests/kselftest.h > b/tools/testing/selftests/kselftest.h > index b8967b6e29d5..9c4bfbf107f1 100644 > --- a/tools/testing/selftests/kselftest.h > +++ b/tools/testing/selftests/kselftest.h > @@ -70,10 +70,16 @@ > * have __cpuid_count(). > */ > #ifndef __cpuid_count > +#ifdef HAVE_CPUID > #define __cpuid_count(level, count, a, b, c, d) > \ > __asm__ __volatile__ ("cpuid\n\t" \ > : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ > : "0" (level), "2" (count)) > +#else > +#define __cpuid_count(level, count, a, b, c, d) do { > \ > + (void)level; (void)count; (void)a; (void)b; (void)c; (void)d; \ > +} while (0) > +#endif > #endif > > /* define kselftest exit codes */ > diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk > index d6edcfcb5be8..8e3069926153 100644 > --- a/tools/testing/selftests/lib.mk > +++ b/tools/testing/selftests/lib.mk > @@ -23,6 +23,8 @@ CLANG_TARGET_FLAGS_x86_64 := x86_64-linux-gnu > > # Default to host architecture if ARCH is not explicitly given. 
> ifeq ($(ARCH),) > +ARCH := $(shell uname -m 2>/dev/null || echo not) > +ARCH := $(shell echo $(ARCH) | sed -e s/i.86/x86/) > CLANG_TARGET_FLAGS := $(shell $(CLANG) -print-target-triple) > else > CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) > @@ -199,6 +201,10 @@ clean: $(if $(TEST_GEN_MODS_DIR),clean_mods_dir) > # Build with _GNU_SOURCE by default > CFLAGS += -D_GNU_SOURCE= > > +ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64)) > +CFLAGS += -DHAVE_CPUID= > +endif Hpmf, scratch this. CFLAGS are overwritten by x86 selftest makefile so I need to reorder things there before making this change. -- i.
Re: [PATCH v2] remoteproc: k3-r5: Delay notification of wakeup event
Hi Mathieu, On 20-08-2024 16:20, Beleswar Padhi wrote: From: Udit Kumar Few times, core1 was scheduled to boot first before core0, which leads to error: 'k3_r5_rproc_start: can not start core 1 before core 0'. This was happening due to some scheduling between prepare and start callback. The probe function waits for event, which is getting triggered by prepare callback. To avoid above condition move event trigger to start instead of prepare callback. Fixes: 61f6f68447ab ("remoteproc: k3-r5: Wait for core0 power-up before powering up core1") Please put this patch on hold. I have some additional changelog that should go in v3. Thanks, Beleswar Signed-off-by: Udit Kumar [ Applied wakeup event trigger only for Split-Mode booted rprocs ] Signed-off-by: Beleswar Padhi --- v2: Changelog: * Mathieu 1) Rebased changes on top of -next-20240820 tag. Link to v1: https://lore.kernel.org/all/20240809060132.308642-1-b-pa...@ti.com/ drivers/remoteproc/ti_k3_r5_remoteproc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/ti_k3_r5_remoteproc.c b/drivers/remoteproc/ti_k3_r5_remoteproc.c index 8a63a9360c0f..e61e53381abc 100644 --- a/drivers/remoteproc/ti_k3_r5_remoteproc.c +++ b/drivers/remoteproc/ti_k3_r5_remoteproc.c @@ -469,8 +469,6 @@ static int k3_r5_rproc_prepare(struct rproc *rproc) ret); return ret; } - core->released_from_reset = true; - wake_up_interruptible(&cluster->core_transition); /* * Newer IP revisions like on J7200 SoCs support h/w auto-initialization @@ -587,6 +585,9 @@ static int k3_r5_rproc_start(struct rproc *rproc) ret = k3_r5_core_run(core); if (ret) return ret; + + core->released_from_reset = true; + wake_up_interruptible(&cluster->core_transition); } return 0;
Re: [PATCH] module: abort module loading when sysfs setup suffer errors
On 8/30/24 07:43, Chunhui Li wrote: > When insmod a kernel module, if fails in add_notes_attrs or > add_sysfs_attrs such as memory allocation fail, mod_sysfs_setup > will still return success, but we can't access user interface > on android device. > > Patch for make mod_sysfs_setup can check the error of > add_notes_attrs and add_sysfs_attrs s/add_sysfs_attrs/add_sect_attrs/ I think it makes sense to propagate errors from these functions upward, although I wonder if the authors of this code didn't intentionally make the errors silent, possibly because the interface was mostly intended for debugging only? The original commits which added add_sect_attrs() and add_notes_attrs() don't mention anything explicitly in this regard: https://github.com/mpe/linux-fullhistory/commit/db939b519bea9b88ae1c95c3b479c0b07145f2a0 https://github.com/torvalds/linux/commit/6d76013381ed28979cd122eb4b249a88b5e384fa > > Signed-off-by: Xion Wang > Signed-off-by: Chunhui Li > --- > kernel/module/sysfs.c | 49 ++- > 1 file changed, 35 insertions(+), 14 deletions(-) > > diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c > index 26efe1305c12..a9ee650d995d 100644 > --- a/kernel/module/sysfs.c > +++ b/kernel/module/sysfs.c > @@ -69,12 +69,13 @@ static void free_sect_attrs(struct module_sect_attrs > *sect_attrs) > kfree(sect_attrs); > } > > -static void add_sect_attrs(struct module *mod, const struct load_info *info) > +static int add_sect_attrs(struct module *mod, const struct load_info *info) > { > unsigned int nloaded = 0, i, size[2]; > struct module_sect_attrs *sect_attrs; > struct module_sect_attr *sattr; > struct bin_attribute **gattr; > + int ret = 0; Nit: It isn't necessary to initialize this variable to 0 because the code explicitly does "return 0;" on success. While on the error path, the variable is always assigned. 
> > /* Count loaded sections and allocate structures */ > for (i = 0; i < info->hdr->e_shnum; i++) > @@ -85,7 +86,7 @@ static void add_sect_attrs(struct module *mod, const struct > load_info *info) > size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); > sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); > if (!sect_attrs) > - return; > + return -ENOMEM; > > /* Setup section attributes. */ > sect_attrs->grp.name = "sections"; > @@ -103,8 +104,10 @@ static void add_sect_attrs(struct module *mod, const > struct load_info *info) > sattr->address = sec->sh_addr; > sattr->battr.attr.name = > kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); > - if (!sattr->battr.attr.name) > + if (!sattr->battr.attr.name) { > + ret = -ENOMEM; > goto out; > + } > sect_attrs->nsections++; > sattr->battr.read = module_sect_read; > sattr->battr.size = MODULE_SECT_READ_SIZE; > @@ -113,13 +116,16 @@ static void add_sect_attrs(struct module *mod, const > struct load_info *info) > } > *gattr = NULL; > > - if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) > + if (sysfs_create_group(&mod->mkobj.kobj, §_attrs->grp)) { > + ret = -EIO; > goto out; > + } Why does the logic return -EIO instead of propagating the error code from sysfs_create_group()? > > mod->sect_attrs = sect_attrs; > - return; > + return 0; > out: > free_sect_attrs(sect_attrs); > + return ret; > } > > static void remove_sect_attrs(struct module *mod) > @@ -158,15 +164,16 @@ static void free_notes_attrs(struct module_notes_attrs > *notes_attrs, > kfree(notes_attrs); > } > > -static void add_notes_attrs(struct module *mod, const struct load_info *info) > +static int add_notes_attrs(struct module *mod, const struct load_info *info) > { > unsigned int notes, loaded, i; > struct module_notes_attrs *notes_attrs; > struct bin_attribute *nattr; > + int ret = 0; Similarly here, the initialization is not necessary. 
> > /* failed to create section attributes, so can't create notes */ > if (!mod->sect_attrs) > - return; > + return -EINVAL; Since the patch modifies mod_sysfs_setup() to bail out when registering section attributes fails, this condition can no longer be true and the check can be removed. > > /* Count notes sections and allocate structures. */ > notes = 0; > @@ -176,12 +183,12 @@ static void add_notes_attrs(struct module *mod, const > struct load_info *info) > ++notes; > > if (notes == 0) > - return; > + return 0; > > notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), > GFP_KERNEL); > if (!notes_attrs) > - return; > + retur
[PATCH v2] selftests/futex: Create test for robust list
Create a test for the robust list mechanism. Signed-off-by: André Almeida --- Changes from v1: - Change futex type from int to _Atomic(unsigned int) - Use old futex(FUTEX_WAIT) instead of the new sys_futex_wait() --- .../selftests/futex/functional/.gitignore | 1 + .../selftests/futex/functional/Makefile | 3 +- .../selftests/futex/functional/robust_list.c | 448 ++ 3 files changed, 451 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/futex/functional/robust_list.c diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore index fbcbdb6963b3..4726e1be7497 100644 --- a/tools/testing/selftests/futex/functional/.gitignore +++ b/tools/testing/selftests/futex/functional/.gitignore @@ -9,3 +9,4 @@ futex_wait_wouldblock futex_wait futex_requeue futex_waitv +robust_list diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index f79f9bac7918..b8635a1ac7f6 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -17,7 +17,8 @@ TEST_GEN_PROGS := \ futex_wait_private_mapped_file \ futex_wait \ futex_requeue \ - futex_waitv + futex_waitv \ + robust_list TEST_PROGS := run.sh diff --git a/tools/testing/selftests/futex/functional/robust_list.c b/tools/testing/selftests/futex/functional/robust_list.c new file mode 100644 index ..9308eb189d48 --- /dev/null +++ b/tools/testing/selftests/futex/functional/robust_list.c @@ -0,0 +1,448 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024 Igalia S.L. + * + * Robust list test by André Almeida + * + * The robust list uAPI allows userspace to create "robust" locks, in the sense + * that if the lock holder thread dies, the remaining threads that are waiting + * for the lock won't block forever, waiting for a lock that will never be + * released. 
+ * + * This is achieved by userspace setting a list where a thread can enter all the + * locks (futexes) that it is holding. The robust list is a linked list, and + * userspace registers the start of the list with the syscall set_robust_list(). + * If such thread eventually dies, the kernel will walk this list, waking up one + * thread waiting for each futex and marking the futex word with the flag + * FUTEX_OWNER_DIED. + * + * See also + * man set_robust_list + * Documentation/locking/robust-futex-ABI.rst + * Documentation/locking/robust-futexes.rst + */ + +#define _GNU_SOURCE + +#include "../../kselftest_harness.h" + +#include "futextest.h" + +#include +#include +#include + +#define STACK_SIZE (1024 * 1024) + +#define FUTEX_TIMEOUT 3 + +static pthread_barrier_t barrier, barrier2; + +int set_robust_list(struct robust_list_head *head, size_t len) +{ + return syscall(SYS_set_robust_list, head, len); +} + +int get_robust_list(int pid, struct robust_list_head **head, size_t *len_ptr) +{ + return syscall(SYS_get_robust_list, pid, head, len_ptr); +} + +/* + * Basic lock struct, contains just the futex word and the robust list element + * Real implementations have also a *prev to easily walk in the list + */ +struct lock_struct { + _Atomic(unsigned int) futex; + struct robust_list list; +}; + +/* + * Helper function to spawn a child thread.
Returns -1 on error, pid on success + */ +static int create_child(int (*fn)(void *arg), void *arg) +{ + char *stack; + pid_t pid; + + stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE, +MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (stack == MAP_FAILED) + return -1; + + stack += STACK_SIZE; + + pid = clone(fn, stack, CLONE_VM | SIGCHLD, arg); + + if (pid == -1) + return -1; + + return pid; +} + +/* + * Helper function to prepare and register a robust list + */ +static int set_list(struct robust_list_head *head) +{ + int ret; + + ret = set_robust_list(head, sizeof(struct robust_list_head)); + if (ret) + return ret; + + head->futex_offset = (size_t) offsetof(struct lock_struct, futex) - +(size_t) offsetof(struct lock_struct, list); + head->list.next = &head->list; + head->list_op_pending = NULL; + + return 0; +} + +/* + * A basic (and incomplete) mutex lock function with robustness + */ +static int mutex_lock(struct lock_struct *lock, struct robust_list_head *head, bool error_inject) +{ + _Atomic(unsigned int) *futex = &lock->futex; + int zero = 0, ret = -1; + pid_t tid = gettid(); + + /* +* Set list_op_pending before starting the lock, so the kernel can catch +* the case where the thread died during the lock operation +*/ + head->list_op_pending = &lock->list; + + if (atomic_compare_exchange_
Re: [PATCH v5] ptp: Add support for the AMZNC10C 'vmclock' device
On 8/23/24 12:09, David Woodhouse wrote: diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 604541dcb320..e98c9767e0ef 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -131,6 +131,19 @@ config PTP_1588_CLOCK_KVM To compile this driver as a module, choose M here: the module will be called ptp_kvm. +config PTP_1588_CLOCK_VMCLOCK + tristate "Virtual machine PTP clock" + depends on X86_TSC || ARM_ARCH_TIMER + depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128 + default y + help + This driver adds support for using a virtual precision clock + advertised by the hypervisor. This clock is only useful in virtual + machines where such a device is present. + + To compile this driver as a module, choose M here: the module + will be called ptp_vmclock. + config PTP_1588_CLOCK_IDT82P33 tristate "IDT 82P33xxx PTP clock" depends on PTP_1588_CLOCK && I2C diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile index 68bf02078053..01b5cd91eb61 100644 --- a/drivers/ptp/Makefile +++ b/drivers/ptp/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_PTP_1588_CLOCK_DTE) += ptp_dte.o obj-$(CONFIG_PTP_1588_CLOCK_INES) += ptp_ines.o obj-$(CONFIG_PTP_1588_CLOCK_PCH) += ptp_pch.o obj-$(CONFIG_PTP_1588_CLOCK_KVM) += ptp_kvm.o +obj-$(CONFIG_PTP_1588_CLOCK_VMCLOCK) += ptp_vmclock.o obj-$(CONFIG_PTP_1588_CLOCK_QORIQ)+= ptp-qoriq.o ptp-qoriq-y += ptp_qoriq.o ptp-qoriq-$(CONFIG_DEBUG_FS) += ptp_qoriq_debugfs.o diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c new file mode 100644 index ..f772bcb23599 --- /dev/null +++ b/drivers/ptp/ptp_vmclock.c @@ -0,0 +1,607 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtual PTP 1588 clock for use with LM-safe VMclock device. + * + * Copyright © 2024 Amazon.com, Inc. or its affiliates. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#ifdef CONFIG_X86 +#include +#include +#endif + +#ifdef CONFIG_KVM_GUEST +#define SUPPORT_KVMCLOCK +#endif + +static DEFINE_IDA(vmclock_ida); + +ACPI_MODULE_NAME("vmclock"); + +struct vmclock_state { + struct resource res; + struct vmclock_abi *clk; + struct miscdevice miscdev; + struct ptp_clock_info ptp_clock_info; + struct ptp_clock *ptp_clock; + enum clocksource_ids cs_id, sys_cs_id; + int index; + char *name; +}; + +#define VMCLOCK_MAX_WAIT ms_to_ktime(100) + +/* Require at least the flags field to be present. All else can be optional. */ +#define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad) + +#define VMCLOCK_FIELD_PRESENT(_c, _f)\ + le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) +\ + sizeof((_c)->_f)) + +/* + * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64 + * and add the fractional second part of the reference time. + * + * The result is a 128-bit value, the top 64 bits of which are seconds, and + * the low 64 bits are (seconds >> 64). + */ +static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta, + uint64_t period, uint8_t shift, + uint64_t frac_sec) I'm sorry for the late feedback. uint64_t should be u64 (in a lot of places), mutatis mutandis uint32_t, etc... 
+{ + unsigned __int128 res = (unsigned __int128)delta * period; + + res >>= shift; + res += frac_sec; + *res_hi = res >> 64; + return (uint64_t)res; +} + +static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec) +{ + if (likely(clk->time_type == VMCLOCK_TIME_UTC)) + return true; + + if (clk->time_type == VMCLOCK_TIME_TAI && + (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) { + if (sec) + *sec += (int16_t)le16_to_cpu(clk->tai_offset_sec); + return true; + } + return false; +} + +static int vmclock_get_crosststamp(struct vmclock_state *st, + struct ptp_system_timestamp *sts, + struct system_counterval_t *system_counter, + struct timespec64 *tspec) +{ + ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); + struct system_time_snapshot systime_snapshot; + uint64_t cycle, delta, seq, frac_sec; + +#ifdef CONFIG_X86 + /* +* We'd expect the hypervisor to know this and to report the clock +* status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid. +*/ + if (check_tsc_unstable()) + return -EINVAL; +#endif + + while (1) { + seq = le32_to_cpu(st->clk->seq_count) & ~1ULL; +
Re: [PATCH v2] modpost: compile constant module information only once
On Mon, Sep 2, 2024 at 2:56 AM Thomas Weißschuh wrote: > > Various information about modules is compiled into the info sections. > For that a dedicated .mod.c file is generated by modpost for each module > and then linked into the module. > However most of the information in the .mod.c is the same for all > modules, internal and external. > Split the shared information into a dedicated source file that is > compiled once and then linked into all modules. > > This avoids frequent rebuilds for all .mod.c files when using > CONFIG_LOCALVERSION_AUTO because the local version ends up in .mod.c > through UTS_RELEASE and VERMAGIC_STRING. > The modules are still relinked in this case. > > The code is also easier to maintain as it's now in a proper source file > instead of an inline string literal. > > Signed-off-by: Thomas Weißschuh Applied to linux-kbuild. Thanks! > --- > Changes in v2: > - Remove RFC status > - Incorporate Masahiro's proposals > - Rename modinfo.o to .module-common.o > - Build a dedicated .module-common.o for external modules > - Link to v1: > https://lore.kernel.org/r/20240824-modinfo-const-v1-1-485f9c64b...@weissschuh.net > --- > Masahiro, feel free to add some attribution for yourself when applying. > The new appraoch is pleasantly simpler. 
> --- > scripts/Makefile.modfinal | 7 +-- > scripts/mod/modpost.c | 23 --- > scripts/module-common.c | 25 + > 3 files changed, 30 insertions(+), 25 deletions(-) > > diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal > index 306a6bb86e4d..6b1b72257b29 100644 > --- a/scripts/Makefile.modfinal > +++ b/scripts/Makefile.modfinal > @@ -30,6 +30,9 @@ quiet_cmd_cc_o_c = CC [M] $@ > %.mod.o: %.mod.c FORCE > $(call if_changed_dep,cc_o_c) > > +$(extmod_prefix).module-common.o: $(srctree)/scripts/module-common.c FORCE > + $(call if_changed_dep,cc_o_c) > + > quiet_cmd_ld_ko_o = LD [M] $@ >cmd_ld_ko_o += \ > $(LD) -r $(KBUILD_LDFLAGS) \ > @@ -54,13 +57,13 @@ if_changed_except = $(if $(call > newer_prereqs_except,$(2))$(cmd-check), \ > printf '%s\n' 'savedcmd_$@ := $(make-cmd)' > $(dot-target).cmd, @:) > > # Re-generate module BTFs if either module's .ko or vmlinux changed > -%.ko: %.o %.mod.o scripts/module.lds $(and > $(CONFIG_DEBUG_INFO_BTF_MODULES),$(KBUILD_BUILTIN),vmlinux) FORCE > +%.ko: %.o %.mod.o $(extmod_prefix).module-common.o scripts/module.lds $(and > $(CONFIG_DEBUG_INFO_BTF_MODULES),$(KBUILD_BUILTIN),vmlinux) FORCE > +$(call if_changed_except,ld_ko_o,vmlinux) > ifdef CONFIG_DEBUG_INFO_BTF_MODULES > +$(if $(newer-prereqs),$(call cmd,btf_ko)) > endif > > -targets += $(modules:%.o=%.ko) $(modules:%.o=%.mod.o) > +targets += $(modules:%.o=%.ko) $(modules:%.o=%.mod.o) > $(extmod_prefix).module-common.o > > # Add FORCE to the prerequisites of a target to force it to be always > rebuilt. > # --- > diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c > index c8cd5d822bb6..107393a8c48a 100644 > --- a/scripts/mod/modpost.c > +++ b/scripts/mod/modpost.c > @@ -1755,26 +1755,9 @@ static void check_modname_len(struct module *mod) > static void add_header(struct buffer *b, struct module *mod) > { > buf_printf(b, "#include \n"); > - /* > -* Include build-salt.h after module.h in order to > -* inherit the definitions. 
> -*/ > - buf_printf(b, "#define INCLUDE_VERMAGIC\n"); > - buf_printf(b, "#include \n"); > - buf_printf(b, "#include \n"); > buf_printf(b, "#include \n"); > - buf_printf(b, "#include \n"); > buf_printf(b, "#include \n"); > buf_printf(b, "\n"); > - buf_printf(b, "#ifdef CONFIG_UNWINDER_ORC\n"); > - buf_printf(b, "#include \n"); > - buf_printf(b, "ORC_HEADER;\n"); > - buf_printf(b, "#endif\n"); > - buf_printf(b, "\n"); > - buf_printf(b, "BUILD_SALT;\n"); > - buf_printf(b, "BUILD_LTO_INFO;\n"); > - buf_printf(b, "\n"); > - buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n"); > buf_printf(b, "MODULE_INFO(name, KBUILD_MODNAME);\n"); > buf_printf(b, "\n"); > buf_printf(b, "__visible struct module __this_module\n"); > @@ -1792,12 +1775,6 @@ static void add_header(struct buffer *b, struct module > *mod) > if (!external_module) > buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n"); > > - buf_printf(b, > - "\n" > - "#ifdef CONFIG_MITIGATION_RETPOLINE\n" > - "MODULE_INFO(retpoline, \"Y\");\n" > - "#endif\n"); > - > if (strstarts(mod->name, "drivers/staging")) > buf_printf(b, "\nMODULE_INFO(staging, \"Y\");\n"); > > diff --git a/scripts/module-common.c b/scripts/modul
Re: [PATCH v2] remoteproc: k3-r5: Delay notification of wakeup event
On Tue, 3 Sept 2024 at 04:15, Beleswar Prasad Padhi wrote: > > Hi Mathieu, > > On 20-08-2024 16:20, Beleswar Padhi wrote: > > From: Udit Kumar > > > > Few times, core1 was scheduled to boot first before core0, which leads > > to error: > > > > 'k3_r5_rproc_start: can not start core 1 before core 0'. > > > > This was happening due to some scheduling between prepare and start > > callback. The probe function waits for event, which is getting > > triggered by prepare callback. To avoid above condition move event > > trigger to start instead of prepare callback. > > > > Fixes: 61f6f68447ab ("remoteproc: k3-r5: Wait for core0 power-up before > > powering up core1") > > > Please put this patch on hold. I have some additional changelog that > should go in v3. > I applied this patch a couple of weeks ago - are those changes to the code? If so please send another patch on top of rproc-next. > Thanks, > Beleswar > > > Signed-off-by: Udit Kumar > > [ Applied wakeup event trigger only for Split-Mode booted rprocs ] > > Signed-off-by: Beleswar Padhi > > --- > > v2: Changelog: > > * Mathieu > > 1) Rebased changes on top of -next-20240820 tag. 
> > > > Link to v1: > > https://lore.kernel.org/all/20240809060132.308642-1-b-pa...@ti.com/ > > > > drivers/remoteproc/ti_k3_r5_remoteproc.c | 5 +++-- > > 1 file changed, 3 insertions(+), 2 deletions(-) > > > > diff --git a/drivers/remoteproc/ti_k3_r5_remoteproc.c > > b/drivers/remoteproc/ti_k3_r5_remoteproc.c > > index 8a63a9360c0f..e61e53381abc 100644 > > --- a/drivers/remoteproc/ti_k3_r5_remoteproc.c > > +++ b/drivers/remoteproc/ti_k3_r5_remoteproc.c > > @@ -469,8 +469,6 @@ static int k3_r5_rproc_prepare(struct rproc *rproc) > > ret); > > return ret; > > } > > - core->released_from_reset = true; > > - wake_up_interruptible(&cluster->core_transition); > > > > /* > >* Newer IP revisions like on J7200 SoCs support h/w > > auto-initialization > > @@ -587,6 +585,9 @@ static int k3_r5_rproc_start(struct rproc *rproc) > > ret = k3_r5_core_run(core); > > if (ret) > > return ret; > > + > > + core->released_from_reset = true; > > + wake_up_interruptible(&cluster->core_transition); > > } > > > > return 0;
[PATCH v4 0/4] selftests: Fix cpuid / vendor checking build issues
This series first generalizes resctrl selftest non-contiguous CAT check to not assume non-AMD vendor implies Intel. Second, it improves selftests such that the use of __cpuid_count() does not lead into a build failure (happens at least on ARM). While ARM does not currently support resctrl features, there's an ongoing work to enable resctrl support also for it on the kernel side. In any case, a common header such as kselftest.h should have a proper fallback in place for what it provides, thus it seems justified to fix this common level problem on the common level rather than e.g. disabling build for resctrl selftest for archs lacking resctrl support. I've dropped reviewed and tested by tags from the last patch in v3 due to major changes into the makefile logic. So it would be helpful if Muhammad could retest with this version. Acquiring ARCH in lib.mk will likely allow some cleanup into some subdirectory makefiles but that is left as future work because this series focuses in fixing cpuid/build. v4: - New patch to reorder x86 selftest makefile to avoid clobbering CFLAGS (would cause __cpuid_count() related build fail otherwise) v3: - Remove "empty" wording - Also cast input parameters to void - Initialize ARCH from uname -m if not set (this might allow cleaning up some other makefiles but that is left as future work) v2: - Removed RFC from the last patch & added Fixes and tags - Fixed the error message's line splits - Noted down the reason for void casts in the stub Ilpo Järvinen (4): selftests/resctrl: Generalize non-contiguous CAT check selftests/resctrl: Always initialize ecx to avoid build warnings selftests/x86: don't clobber CFLAGS kselftest: Provide __cpuid_count() stub on non-x86 archs tools/testing/selftests/kselftest.h| 6 + tools/testing/selftests/lib.mk | 6 + tools/testing/selftests/resctrl/cat_test.c | 28 +- tools/testing/selftests/x86/Makefile | 4 +++- 4 files changed, 32 insertions(+), 12 deletions(-) -- 2.39.2
[PATCH v4 1/4] selftests/resctrl: Generalize non-contiguous CAT check
arch_supports_noncont_cat() checks if vendor is ARCH_AMD and if that is not true, ARCH_INTEL is assumed which might not be true either because get_vendor() can also return zero if neither AMD nor Intel is detected. Generalize the vendor check using switch/case logic and return false for unknown vendors. Signed-off-by: Ilpo Järvinen Reviewed-by: Muhammad Usama Anjum --- tools/testing/selftests/resctrl/cat_test.c | 26 +- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 742782438ca3..51a1cb6aac34 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -292,19 +292,25 @@ static bool arch_supports_noncont_cat(const struct resctrl_test *test) { unsigned int eax, ebx, ecx, edx; - /* AMD always supports non-contiguous CBM. */ - if (get_vendor() == ARCH_AMD) + switch (get_vendor()) { + case ARCH_AMD: + /* AMD always supports non-contiguous CBM. */ return true; - /* Intel support for non-contiguous CBM needs to be discovered. */ - if (!strcmp(test->resource, "L3")) - __cpuid_count(0x10, 1, eax, ebx, ecx, edx); - else if (!strcmp(test->resource, "L2")) - __cpuid_count(0x10, 2, eax, ebx, ecx, edx); - else - return false; + case ARCH_INTEL: + /* Intel support for non-contiguous CBM needs to be discovered. */ + if (!strcmp(test->resource, "L3")) + __cpuid_count(0x10, 1, eax, ebx, ecx, edx); + else if (!strcmp(test->resource, "L2")) + __cpuid_count(0x10, 2, eax, ebx, ecx, edx); + else + return false; + + return ((ecx >> 3) & 1); - return ((ecx >> 3) & 1); + default: + return false; + } } static int noncont_cat_run_test(const struct resctrl_test *test, -- 2.39.2
[PATCH v4 2/4] selftests/resctrl: Always initialize ecx to avoid build warnings
To avoid warnings when __cpuid_count() is an empty stub, always initialize ecx because it is used in the return statement. Signed-off-by: Ilpo Järvinen Reviewed-by: Muhammad Usama Anjum --- tools/testing/selftests/resctrl/cat_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c index 51a1cb6aac34..9882c5d19408 100644 --- a/tools/testing/selftests/resctrl/cat_test.c +++ b/tools/testing/selftests/resctrl/cat_test.c @@ -290,7 +290,7 @@ static int cat_run_test(const struct resctrl_test *test, const struct user_param static bool arch_supports_noncont_cat(const struct resctrl_test *test) { - unsigned int eax, ebx, ecx, edx; + unsigned int eax, ebx, ecx = 0, edx; switch (get_vendor()) { case ARCH_AMD: -- 2.39.2
[PATCH v4 3/4] selftests/x86: don't clobber CFLAGS
The x86 selftests makefile clobbers CFLAGS, preventing lib.mk from making the necessary adjustments to CFLAGS. This would lead to a build failure after an upcoming change which wants to add -DHAVE_CPUID= into CFLAGS. Reorder CFLAGS initialization in x86 selftest. Place the constant part of CFLAGS initialization before inclusion of lib.mk but leave adding KHDR_INCLUDES to CFLAGS in the existing position because KHDR_INCLUDES might be generated inside lib.mk. Signed-off-by: Ilpo Järvinen --- v4: - New patch tools/testing/selftests/x86/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 5c8757a25998..88a6576de92f 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 +CFLAGS := -O2 -g -std=gnu99 -pthread -Wall + all: include ../lib.mk @@ -35,7 +37,7 @@ BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) -CFLAGS := -O2 -g -std=gnu99 -pthread -Wall $(KHDR_INCLUDES) +CFLAGS += $(KHDR_INCLUDES) # call32_from_64 in thunks.S uses absolute addresses. ifeq ($(CAN_BUILD_WITH_NOPIE),1) -- 2.39.2
[PATCH v4 4/4] kselftest: Provide __cpuid_count() stub on non-x86 archs
Building resctrl selftest fails on ARM because it uses __cpuid_count() that fails the build with error: CC resctrl_tests In file included from resctrl.h:24, from cat_test.c:11: In function 'arch_supports_noncont_cat', inlined from 'noncont_cat_run_test' at cat_test.c:323:6: ../kselftest.h:74:9: error: impossible constraint in 'asm' 74 | __asm__ __volatile__ ("cpuid\n\t" \ | ^~~ cat_test.c:301:17: note: in expansion of macro '__cpuid_count' 301 | __cpuid_count(0x10, 1, eax, ebx, ecx, edx); | ^ ../kselftest.h:74:9: error: impossible constraint in 'asm' 74 | __asm__ __volatile__ ("cpuid\n\t" \ | ^~~ cat_test.c:303:17: note: in expansion of macro '__cpuid_count' 303 | __cpuid_count(0x10, 2, eax, ebx, ecx, edx); | ^ The resctrl selftest would run that code only on Intel CPUs but as is, the code cannot be build at all. Define HAVE_CPUID in lib.mk based on ARCH (x86 or x86_64). If ARCH is not set, acquire it using uname -m. Provide a stub for __cpuid_count() if HAVE_CPUID is not present to allow build to succeed. The stub casts its arguments to void to avoid causing "unused variable" or "set but not used" warnings. 
Fixes: ae638551ab64 ("selftests/resctrl: Add non-contiguous CBMs CAT test") Reported-by: Muhammad Usama Anjum Signed-off-by: Ilpo Järvinen Tested-by: Muhammad Usama Anjum Reviewed-by: Muhammad Usama Anjum --- v3: - Remove "empty" wording - Also cast input parameters to void - Initialize ARCH from uname -m if not set (this might allow cleaning up some other makefiles but that is left as future work) v2: - Removed RFC & added Fixes and Tested-by - Fixed the error message's line splits - Noted down the reason for void casts in the stub --- tools/testing/selftests/kselftest.h | 6 ++ tools/testing/selftests/lib.mk | 6 ++ 2 files changed, 12 insertions(+) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index b8967b6e29d5..9c4bfbf107f1 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -70,10 +70,16 @@ * have __cpuid_count(). */ #ifndef __cpuid_count +#ifdef HAVE_CPUID #define __cpuid_count(level, count, a, b, c, d) \ __asm__ __volatile__ ("cpuid\n\t" \ : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \ : "0" (level), "2" (count)) +#else +#define __cpuid_count(level, count, a, b, c, d)do { \ + (void)level; (void)count; (void)a; (void)b; (void)c; (void)d; \ +} while (0) +#endif #endif /* define kselftest exit codes */ diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index d6edcfcb5be8..8e3069926153 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -23,6 +23,8 @@ CLANG_TARGET_FLAGS_x86_64 := x86_64-linux-gnu # Default to host architecture if ARCH is not explicitly given. 
ifeq ($(ARCH),) +ARCH := $(shell uname -m 2>/dev/null || echo not) +ARCH := $(shell echo $(ARCH) | sed -e s/i.86/x86/) CLANG_TARGET_FLAGS := $(shell $(CLANG) -print-target-triple) else CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) @@ -199,6 +201,10 @@ clean: $(if $(TEST_GEN_MODS_DIR),clean_mods_dir) # Build with _GNU_SOURCE by default CFLAGS += -D_GNU_SOURCE= +ifeq ($(ARCH),$(filter $(ARCH),x86 x86_64)) +CFLAGS += -DHAVE_CPUID= +endif + # Enables to extend CFLAGS and LDFLAGS from command line, e.g. # make USERCFLAGS=-Werror USERLDFLAGS=-static CFLAGS += $(USERCFLAGS) -- 2.39.2
Re: [PATCH v2 08/19] gendwarfksyms: Expand subroutine_type
On 8/15/24 19:39, Sami Tolvanen wrote: > Add support for expanding DW_TAG_subroutine_type and the parameters > in DW_TAG_formal_parameter. Use this to also expand subprograms. > > Example output with --debug: > > subprogram( > formal_parameter base_type usize byte_size(8), > formal_parameter base_type usize byte_size(8), > ) > -> base_type void; > > Signed-off-by: Sami Tolvanen > --- > scripts/gendwarfksyms/dwarf.c | 57 ++- > scripts/gendwarfksyms/gendwarfksyms.h | 1 + > 2 files changed, 57 insertions(+), 1 deletion(-) > > diff --git a/scripts/gendwarfksyms/dwarf.c b/scripts/gendwarfksyms/dwarf.c > index 82185737fa2a..c81652426be8 100644 > --- a/scripts/gendwarfksyms/dwarf.c > +++ b/scripts/gendwarfksyms/dwarf.c > [...] > > +static int __process_subroutine_type(struct state *state, struct die *cache, > + Dwarf_Die *die, const char *type) > +{ > + check(process(state, cache, type)); > + check(process(state, cache, "(")); > + check(process_linebreak(cache, 1)); > + /* Parameters */ > + check(process_die_container(state, cache, die, process_type, > + match_formal_parameter_type)); > + check(process_linebreak(cache, -1)); > + check(process(state, cache, ")")); > + process_linebreak(cache, 0); > + /* Return type */ > + check(process(state, cache, "-> ")); > + return check(process_type_attr(state, cache, die)); > +} If I understand correctly, this formatting logic also affects the symtypes output. Looking at its format, I would like to propose a few minor changes. 
Example of the current symtypes output: kprobe_event_cmd_init subprogram( formal_parameter pointer_type { s#dynevent_cmd } byte_size(8), formal_parameter pointer_type { base_type char byte_size(1) encoding(8) } byte_size(8), formal_parameter base_type int byte_size(4) encoding(5), ) -> base_type void Proposed changes: kprobe_event_cmd_init subprogram ( formal_parameter pointer_type { s#dynevent_cmd } byte_size(8) , formal_parameter pointer_type { base_type char byte_size(1) encoding(8) } byte_size(8) , formal_parameter base_type int byte_size(4) encoding(5) ) -> base_type void ^- (1) ^- (2) ^- (3) (1) "subprogram(" is split to "subprogram (". (2) A space is added prior to ",". (3) String ", " is removed after the last parameter. Separating each token with a whitespace matches the current genksyms format, makes the data trivially parsable and easy to pretty-print by additional tools. If some tokens are merged, as in "subprogram(", then such a tool needs to have extra logic to parse each word and split it into tokens. For attributes with one value, such as "byte_size(4)", I think the current format is probably ok. -- Thanks, Petr
Re: [PATCH v2 11/19] gendwarfksyms: Limit structure expansion
On 8/15/24 19:39, Sami Tolvanen wrote: > Expand each structure type only once per exported symbol. This > is necessary to support self-referential structures, which would > otherwise result in infinite recursion, but is still sufficient for > catching ABI changes. > > For pointers to structure types, limit type expansion inside the > pointer to two levels. This should be plenty for detecting ABI > differences, but it stops us from pulling in half the kernel for > types that contain pointers to large kernel data structures, like > task_struct, for example. I'm quite worried about this optimization for pointer types. It could result in some kABI changes not being recognized. I assume the goal of the optimization is to speed up the tool's runtime. How much does it improve the processing time and is there any other way how it could be done? > diff --git a/scripts/gendwarfksyms/dwarf.c b/scripts/gendwarfksyms/dwarf.c > index 92b6ca4c5c91..2f1601015c4e 100644 > --- a/scripts/gendwarfksyms/dwarf.c > +++ b/scripts/gendwarfksyms/dwarf.c > [...] > @@ -651,6 +742,7 @@ static int process_exported_symbols(struct state *state, > struct die *cache, > else > check(process_variable(state, &state->die)); > > + cache_clear_expanded(&state->expansion_cache); > return 0; > default: > return 0; I wonder if it would make sense to share the cache between processing individual exported symbols. The hard case looks to be the following: s#A struct A { int ; } s#B struct B { s#A ; } foo void foo ( s#B ) bar void bar ( s#A , s#B ) When processing foo, the code would cache s#B with expanded s#A. However, when processing bar and reaching s#B, the cache should report a miss because s#B with unexpanded s#A is required. So the code would need to track which types were already expanded and have each cache entry accordingly tagged with similar data. Hm, it might be that doing all this additional tracking would then be actually slower than processing the types repeatedly for each symbol. I'm not sure. 
-- Thanks, Petr
[PATCH v4 15/15] fs/fuse/virtio_fs: allow idmapped mounts
Allow idmapped mounts for virtiofs. It's absolutely safe as for virtiofs we have the same feature negotiation mechanism as for classical fuse filesystems. This does not affect any existing setups anyhow. virtiofsd support: https://gitlab.com/virtio-fs/virtiofsd/-/merge_requests/245 Cc: Christian Brauner Cc: Seth Forshee Cc: Miklos Szeredi Cc: Vivek Goyal Cc: German Maglione Cc: Amir Goldstein Cc: Bernd Schubert Cc: Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner --- v3: - this commit added --- fs/fuse/virtio_fs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index dd5260141615..7e5bbaef6f76 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1628,6 +1628,7 @@ static struct file_system_type virtio_fs_type = { .name = "virtiofs", .init_fs_context = virtio_fs_init_fs_context, .kill_sb= virtio_kill_sb, + .fs_flags = FS_ALLOW_IDMAP, }; static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) -- 2.34.1
Re: [PATCH 5/5] KVM: VMX: Always honor guest PAT on CPUs that support self-snoop
On Mon, Sep 02, 2024, Vitaly Kuznetsov wrote: > FWIW, I use QEMU-9.0 from the same C10S (qemu-kvm-9.0.0-7.el10.x86_64) > but I don't think it matters in this case. My CPU is "Intel(R) Xeon(R) > Silver 4410Y". Has this been reproduced on any other hardware besides SPR? I.e. did we stumble on another hardware issue?
Re: [PATCH 2/2] vdpa: Add support to update speed/duplex in vDPA/mlx5_vnet
Hello, On 9/1/24 11:27 PM, Jason Wang wrote: > On Fri, Aug 30, 2024 at 9:15 PM Carlos Bilbao > wrote: >> Hello, >> >> On 8/29/24 9:31 PM, Jason Wang wrote: >>> On Fri, Aug 30, 2024 at 5:08 AM Dragos Tatulea wrote: (resending as I accidentally replied only to Carlos) On 29.08.24 18:16, Carlos Bilbao wrote: > From: Carlos Bilbao > > Include support to update the vDPA configuration fields of speed and > duplex (as needed by VHOST_VDPA_SET_CONFIG). This includes function > mlx5_vdpa_set_config() as well as changes in vdpa.c to fill the initial > values to UNKNOWN. Also add a warning message for when > mlx5_vdpa_get_config() receives offset and length out of bounds. > > Signed-off-by: Carlos Bilbao > --- > drivers/vdpa/mlx5/net/mlx5_vnet.c | 34 ++- > drivers/vdpa/vdpa.c | 27 > include/uapi/linux/vdpa.h | 2 ++ > 3 files changed, 62 insertions(+), 1 deletion(-) > > diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c > b/drivers/vdpa/mlx5/net/mlx5_vnet.c > index c47009a8b472..a44bb2072eec 100644 > --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c > +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c > @@ -3221,12 +3221,44 @@ static void mlx5_vdpa_get_config(struct > vdpa_device *vdev, unsigned int offset, > > if (offset + len <= sizeof(struct virtio_net_config)) > memcpy(buf, (u8 *)&ndev->config + offset, len); > + else > + mlx5_vdpa_warn(mvdev, "Offset and length out of bounds\n"); > } > > static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int > offset, const void *buf, >unsigned int len) > { > - /* not supported */ > + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); > + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); > + > + if (offset + len > sizeof(struct virtio_net_config)) { > + mlx5_vdpa_warn(mvdev, "Offset and length out of bounds\n"); > + return; > + } > + > + /* > + * Note that this will update the speed/duplex configuration fields > + * but the hardware support to actually perform this change does > + * not exist yet. 
> + */ > + switch (offset) { > + case offsetof(struct virtio_net_config, speed): > + if (len == sizeof(((struct virtio_net_config *) 0)->speed)) > + memcpy(&ndev->config.speed, buf, len); > + else > + mlx5_vdpa_warn(mvdev, "Invalid length for > speed.\n"); > + break; > + > + case offsetof(struct virtio_net_config, duplex): > + if (len == sizeof(((struct virtio_net_config *)0)->duplex)) > + memcpy(&ndev->config.duplex, buf, len); > + else > + mlx5_vdpa_warn(mvdev, "Invalid length for > duplex.\n"); > + break; > + > + default: > + mlx5_vdpa_warn(mvdev, "Configuration field not > supported.\n"); This will trigger noise in dmesg because there is a MAC configuration here. > + } I would prefer that the .set_config remains a stub TBH. Setting the fields here is misleading: the user might deduce that the configuration worked when they read the values and see that they were updated. >>> Yes, and actually, those fields are read-only according to the spec: >>> >>> """ >>> The network device has the following device configuration layout. All >>> of the device configuration fields are read-only for the driver. >>> """ >>> >>> Thanks >> >> Should I go ahead and remove the ioctl then? > If you meant mlx5_vdpa_set_config, I think yes. Ack, I will send a new patch set in which the second commit gets rid of the ioctl -- but not only for mlx5 but for all vDPA implementations. > > Thanks > Thanks, Carlos
Re: [PATCH] x86/sgx: Fix deadloop in __sgx_alloc_epc_page()
On Fri Aug 30, 2024 at 9:14 AM EEST, Aaron Lu wrote: > On Thu, Aug 29, 2024 at 07:44:13PM +0300, Jarkko Sakkinen wrote: > > On Thu Aug 29, 2024 at 5:38 AM EEST, Aaron Lu wrote: > > > When current node doesn't have an EPC section configured by firmware and > > > all other EPC sections' memory is used up, CPU can get stuck inside the > > > while loop in __sgx_alloc_epc_page() forever and soft lockup will happen. > > > Note how nid_of_current will never equal to nid in that while loop because > > > > > > Oh *that* while loop ;-) Please be more specific. > > What about: > Note how nid_of_current will never be equal to nid in the while loop that > searches an available EPC page from remote nodes because nid_of_current is > not set in sgx_numa_mask. That would work I think! > > > > nid_of_current is not set in sgx_numa_mask. > > > > > > Also worth mentioning is that it's perfectly fine for firmware to not > > > set up an EPC section on a node. Setting an EPC section on each node can > > > be good for performance but that's not a requirement functionality-wise. > > > > This lacks any description of what is done to __sgx_alloc_epc_page(). > > Will add what Dave suggested on how the problem is fixed to the changelog. Great. I think the code change is correct reflecting these additions. I'll look the next version as a whole but with high probability I can ack that as long as the commit message has these updates. > > > > > > > Fixes: 901ddbb9ecf5 ("x86/sgx: Add a basic NUMA allocation scheme to > > > sgx_alloc_epc_page()") > > > Reported-by: Zhimin Luo > > > Tested-by: Zhimin Luo > > > Signed-off-by: Aaron Lu > > Thanks, > Aaron BR, Jarkko
Re: [PATCH] selftests: net: convert comma to semicolon
On Tue, Sep 03, 2024 at 03:45:19PM +0800, Chen Ni wrote: > Replace a comma between expression statements by a semicolon. > > Signed-off-by: Chen Ni As mentioned at [1] I think it would be nice if the patch description was a bit more descriptive. [1] https://lore.kernel.org/all/20240903152125.ga4...@kernel.org/
Re: [PATCH 5/5] KVM: VMX: Always honor guest PAT on CPUs that support self-snoop
Sean Christopherson writes: > On Mon, Sep 02, 2024, Vitaly Kuznetsov wrote: >> FWIW, I use QEMU-9.0 from the same C10S (qemu-kvm-9.0.0-7.el10.x86_64) >> but I don't think it matters in this case. My CPU is "Intel(R) Xeon(R) >> Silver 4410Y". > > Has this been reproduced on any other hardware besides SPR? I.e. did we > stumble > on another hardware issue? Very possible, as according to Yan Zhao this doesn't reproduce on at least "Coffee Lake-S". Let me try to grab some random hardware around and I'll be back with my observations. -- Vitaly
Re: [PATCH net-next] virtio_net: Fix napi_skb_cache_put warning
On Mon, Jul 15, 2024 at 04:25:06AM -0700, Breno Leitao wrote: > Hello Michael, > > On Sun, Jul 14, 2024 at 03:38:42AM -0400, Michael S. Tsirkin wrote: > > On Fri, Jul 12, 2024 at 04:53:25AM -0700, Breno Leitao wrote: > > > After the commit bdacf3e34945 ("net: Use nested-BH locking for > > > napi_alloc_cache.") was merged, the following warning began to appear: > > > > > >WARNING: CPU: 5 PID: 1 at net/core/skbuff.c:1451 > > > napi_skb_cache_put+0x82/0x4b0 > > > > > > __warn+0x12f/0x340 > > > napi_skb_cache_put+0x82/0x4b0 > > > napi_skb_cache_put+0x82/0x4b0 > > > report_bug+0x165/0x370 > > > handle_bug+0x3d/0x80 > > > exc_invalid_op+0x1a/0x50 > > > asm_exc_invalid_op+0x1a/0x20 > > > __free_old_xmit+0x1c8/0x510 > > > napi_skb_cache_put+0x82/0x4b0 > > > __free_old_xmit+0x1c8/0x510 > > > __free_old_xmit+0x1c8/0x510 > > > __pfx___free_old_xmit+0x10/0x10 > > > > > > The issue arises because virtio is assuming it's running in NAPI context > > > even when it's not, such as in the netpoll case. > > > > > > To resolve this, modify virtnet_poll_tx() to only set NAPI when budget > > > is available. Same for virtnet_poll_cleantx(), which always assumed that > > > it was in a NAPI context. > > > > > > Fixes: df133f3f9625 ("virtio_net: bulk free tx skbs") > > > Suggested-by: Jakub Kicinski > > > Signed-off-by: Breno Leitao > > > > Acked-by: Michael S. Tsirkin > > > > though I'm not sure I understand the connection with bdacf3e34945. > > The warning above appeared after bdacf3e34945 landed. Hi Breno, Thanks for fixing this! I think the confusion is around the fact that the commit on Fixes (df133f3f9625) tag is different from the commit in the commit message (bdacf3e34945). Please help me check if the following is correct: ### Any tree which includes df133f3f9625 ("virtio_net: bulk free tx skbs") should also include your patch, since it fixes stuff in there. 
The fact that the warning was only made visible in bdacf3e34945 ("net: Use nested-BH locking for napi_alloc_cache.") does not change the fact that it was already present before. Also, having bdacf3e34945 is not necessary for the backport, since it only made the bug visible. ### Are above statements right? It's important to make it clear since this helps the backporting process. Thanks! Leo
[PATCH rcu 0/11] Add light-weight readers for SRCU
Hello! This series provides light-weight readers for SRCU. This lightness is selected by the caller by using the new srcu_read_lock_lite() and srcu_read_unlock_lite() flavors instead of the usual srcu_read_lock() and srcu_read_unlock() flavors. Although this passes significant rcutorture testing, this should still be considered to be experimental. There are a few restrictions: (1) If srcu_read_lock_lite() is called on a given srcu_struct structure, then no other flavor may be used on that srcu_struct structure, before, during, or after. (2) The _lite() readers may only be invoked from regions of code where RCU is watching (as in those regions in which rcu_is_watching() returns true). (3) There is no auto-expediting for srcu_struct structures that have been passed to _lite() readers. (4) SRCU grace periods for _lite() srcu_struct structures invoke synchronize_rcu() at least twice, thus having longer latencies than their non-_lite() counterparts. (5) Even with synchronize_srcu_expedited(), the resulting SRCU grace period will invoke synchronize_rcu() at least twice, as opposed to invoking the IPI-happy synchronize_rcu_expedited() function. (6) Just as with srcu_read_lock() and srcu_read_unlock(), the srcu_read_lock_lite() and srcu_read_unlock_lite() functions may not (repeat, *not*) be invoked from NMI handlers (that is what the _nmisafe() interface are for). Although one could imagine readers that were both _lite() and _nmisafe(), one might also imagine that the read-modify-write atomic operations that are needed by any NMI-safe SRCU read marker would make this unhelpful from a performance perspective. All that said, the patches in this series are as follows: 1. Rename srcu_might_be_idle() to srcu_should_expedite(). 2. Introduce srcu_gp_is_expedited() helper function. 3. Renaming in preparation for additional reader flavor. 4. Bit manipulation changes for additional reader flavor. 5. Standardize srcu_data pointers to "sdp" and similar. 6. 
Convert srcu_data ->srcu_reader_flavor to bit field. 7. Add srcu_read_lock_lite() and srcu_read_unlock_lite(). 8. rcutorture: Expand RCUTORTURE_RDR_MASK_[12] to eight bits. 9. rcutorture: Add reader_flavor parameter for SRCU readers. 10. rcutorture: Add srcu_read_lock_lite() support to rcutorture.reader_flavor. 11. refscale: Add srcu_read_lock_lite() support using "srcu-lite". Thanx, Paul Documentation/admin-guide/kernel-parameters.txt |4 b/Documentation/admin-guide/kernel-parameters.txt |8 + b/include/linux/srcu.h| 21 +- b/include/linux/srcutree.h|2 b/kernel/rcu/rcutorture.c | 28 +-- b/kernel/rcu/refscale.c | 54 +-- b/kernel/rcu/srcutree.c | 16 +- include/linux/srcu.h | 86 +-- include/linux/srcutree.h |5 kernel/rcu/rcutorture.c | 37 +++- kernel/rcu/srcutree.c | 168 +++--- 11 files changed, 308 insertions(+), 121 deletions(-)
[PATCH rcu 01/11] srcu: Rename srcu_might_be_idle() to srcu_should_expedite()
SRCU auto-expedites grace periods that follow a sufficiently long idle period, and the srcu_might_be_idle() function is used to make this decision. However, the upcoming light-weight SRCU readers will not do auto-expediting because doing so would cause the grace-period machinery to invoke synchronize_rcu_expedited() twice, with IPIs all around. However, software-engineering considerations force this determination to remain in srcu_might_be_idle(). This commit therefore changes the name of srcu_might_be_idle() to srcu_should_expedite(), thus moving from what it currently does to why it does it, this latter being more future-proof. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: --- kernel/rcu/srcutree.c | 16 +--- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 78afaffd1b262..2fe0abade9c06 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1139,7 +1139,8 @@ static void srcu_flip(struct srcu_struct *ssp) } /* - * If SRCU is likely idle, return true, otherwise return false. + * If SRCU is likely idle, in other words, the next SRCU grace period + * should be expedited, return true, otherwise return false. * * Note that it is OK for several current from-idle requests for a new * grace period from idle to specify expediting because they will all end @@ -1159,7 +1160,7 @@ static void srcu_flip(struct srcu_struct *ssp) * negligible when amortized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ -static bool srcu_might_be_idle(struct srcu_struct *ssp) +static bool srcu_should_expedite(struct srcu_struct *ssp) { unsigned long curseq; unsigned long flags; @@ -1469,14 +1470,15 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * Implementation of these memory-ordering guarantees is similar to * that of synchronize_rcu(). 
* - * If SRCU is likely idle, expedite the first request. This semantic - * was provided by Classic SRCU, and is relied upon by its users, so TREE - * SRCU must also provide it. Note that detecting idleness is heuristic - * and subject to both false positives and negatives. + * If SRCU is likely idle as determined by srcu_should_expedite(), + * expedite the first request. This semantic was provided by Classic SRCU, + * and is relied upon by its users, so TREE SRCU must also provide it. + * Note that detecting idleness is heuristic and subject to both false + * positives and negatives. */ void synchronize_srcu(struct srcu_struct *ssp) { - if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited()) + if (srcu_should_expedite(ssp) || rcu_gp_is_expedited()) synchronize_srcu_expedited(ssp); else __synchronize_srcu(ssp, true); -- 2.40.1
[PATCH rcu 03/11] srcu: Renaming in preparation for additional reader flavor
Currently, there are only two flavors of readers, normal and NMI-safe. A number of fields, functions, and types reflect this restriction. This renaming-only commit prepares for the addition of light-weight (as in memory-barrier-free) readers. OK, OK, there is also a drive-by white-space fixeup! Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: --- include/linux/srcu.h | 21 ++--- include/linux/srcutree.h | 2 +- kernel/rcu/srcutree.c| 22 +++--- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 835bbb2d1f88a..06728ef6f32a4 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -181,10 +181,9 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) #define SRCU_NMI_SAFE 0x2 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) -void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe); +void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); #else -static inline void srcu_check_nmi_safety(struct srcu_struct *ssp, -bool nmi_safe) { } +static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { } #endif @@ -245,7 +244,7 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); retval = __srcu_read_lock(ssp); srcu_lock_acquire(&ssp->dep_map); return retval; @@ -262,7 +261,7 @@ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp { int retval; - srcu_check_nmi_safety(ssp, true); + srcu_check_read_flavor(ssp, true); retval = __srcu_read_lock_nmisafe(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; @@ -274,7 +273,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); retval = __srcu_read_lock(ssp); return retval; } @@ -303,7 
+302,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) { WARN_ON_ONCE(in_nmi()); - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); return __srcu_read_lock(ssp); } @@ -318,7 +317,7 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock(ssp, idx); } @@ -334,7 +333,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_nmi_safety(ssp, true); + srcu_check_read_flavor(ssp, true); rcu_lock_release(&ssp->dep_map); __srcu_read_unlock_nmisafe(ssp, idx); } @@ -343,7 +342,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) static inline notrace void srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) { - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); __srcu_read_unlock(ssp, idx); } @@ -360,7 +359,7 @@ static inline void srcu_up_read(struct srcu_struct *ssp, int idx) { WARN_ON_ONCE(idx & ~0x1); WARN_ON_ONCE(in_nmi()); - srcu_check_nmi_safety(ssp, false); + srcu_check_read_flavor(ssp, false); __srcu_read_unlock(ssp, idx); } diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index ed57598394de3..ab7d8d215b84b 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -25,7 +25,7 @@ struct srcu_data { /* Read-side state. */ atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ - int srcu_nmi_safety;/* NMI-safe srcu_struct structure? */ + int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? */ /* Update-side state. 
*/ spinlock_t __private lock cacheline_internodealigned_in_smp; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5b1a315f77bc6..f259dd8342721 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -460,7 +460,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]); if (IS_ENABLED(CONFIG_PROVE_RCU)) - mask = mask | READ_ONCE(cpuc->srcu_nmi_safety); + mask = m
[PATCH rcu 02/11] srcu: Introduce srcu_gp_is_expedited() helper function
Even though the open-coded expressions usually fit on one line, this commit replaces them with a call to a new srcu_gp_is_expedited() helper function in order to improve readability. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: --- kernel/rcu/srcutree.c | 14 -- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 2fe0abade9c06..5b1a315f77bc6 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -418,6 +418,16 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } +/* + * Is the current or any upcoming grace period to be expedited? + */ +static bool srcu_gp_is_expedited(struct srcu_struct *ssp) +{ + struct srcu_usage *sup = ssp->srcu_sup; + + return ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)); +} + /* * Returns approximate total of the readers' ->srcu_lock_count[] values * for the rank of per-CPU counters specified by idx. @@ -622,7 +632,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long jbase = SRCU_INTERVAL; struct srcu_usage *sup = ssp->srcu_sup; - if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) + if (srcu_gp_is_expedited(ssp)) jbase = 0; if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { j = jiffies - 1; @@ -867,7 +877,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(sup); idx = rcu_seq_state(sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp))) + if (srcu_gp_is_expedited(ssp)) cbdelay = 0; WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns()); -- 2.40.1
[PATCH rcu 04/11] srcu: Bit manipulation changes for additional reader flavor
Currently, there are only two flavors of readers, normal and NMI-safe. Very straightforward state updates suffice to check for erroneous mixing of reader flavors on a given srcu_struct structure. This commit upgrades the checking in preparation for the addition of light-weight (as in memory-barrier-free) readers. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: --- kernel/rcu/srcutree.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index f259dd8342721..06ade8ca7804f 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -462,7 +462,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) if (IS_ENABLED(CONFIG_PROVE_RCU)) mask = mask | READ_ONCE(cpuc->srcu_reader_flavor); } - WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)), + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp); return sum; } @@ -712,8 +712,10 @@ void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) sdp = raw_cpu_ptr(ssp->sda); old_reader_flavor_mask = READ_ONCE(sdp->srcu_reader_flavor); if (!old_reader_flavor_mask) { - WRITE_ONCE(sdp->srcu_reader_flavor, reader_flavor_mask); - return; + old_reader_flavor_mask = cmpxchg(&sdp->srcu_reader_flavor, 0, reader_flavor_mask); + if (!old_reader_flavor_mask) { + return; + } } WARN_ONCE(old_reader_flavor_mask != reader_flavor_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_reader_flavor_mask, reader_flavor_mask); } -- 2.40.1
[PATCH rcu 05/11] srcu: Standardize srcu_data pointers to "sdp" and similar
This commit changes a few "cpuc" variables to "sdp" to align with usage elsewhere. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: --- kernel/rcu/srcutree.c | 20 ++-- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 06ade8ca7804f..54ca2ea2b7d68 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -438,9 +438,9 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_lock_count[idx]); + sum += atomic_long_read(&sdp->srcu_lock_count[idx]); } return sum; } @@ -456,11 +456,11 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]); + sum += atomic_long_read(&sdp->srcu_unlock_count[idx]); if (IS_ENABLED(CONFIG_PROVE_RCU)) - mask = mask | READ_ONCE(cpuc->srcu_reader_flavor); + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp); @@ -564,12 +564,12 @@ static bool srcu_readers_active(struct srcu_struct *ssp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_lock_count[0]); + sum += atomic_long_read(&sdp->srcu_lock_count[0]); + sum += 
atomic_long_read(&sdp->srcu_lock_count[1]); + sum -= atomic_long_read(&sdp->srcu_unlock_count[0]); + sum -= atomic_long_read(&sdp->srcu_unlock_count[1]); } return sum; } -- 2.40.1
[PATCH rcu 11/11] refscale: Add srcu_read_lock_lite() support using "srcu-lite"
This commit creates a new srcu-lite option for the refscale.scale_type module parameter that selects srcu_read_lock_lite() and srcu_read_unlock_lite(). Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: --- kernel/rcu/refscale.c | 54 --- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index be66e5a67ee19..9a392822cd5a7 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -216,6 +216,36 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static void srcu_lite_ref_scale_read_section(const int nloops) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock_lite(srcu_ctlp); + srcu_read_unlock_lite(srcu_ctlp, idx); + } +} + +static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock_lite(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_lite(srcu_ctlp, idx); + } +} + +static const struct ref_scale_ops srcu_lite_ops = { + .init = rcu_sync_scale_init, + .readsection= srcu_lite_ref_scale_read_section, + .delaysection = srcu_lite_ref_scale_delay_section, + .name = "srcu-lite" +}; + #ifdef CONFIG_TASKS_RCU // Definitions for RCU Tasks ref scale testing: Empty read markers. 
@@ -1133,27 +1163,25 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, - &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &sched_clock_ops, &clock_ops, - &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, + &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &sched_clock_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, + &typesafe_seqlock_ops, }; if (!torture_init_begin(scale_type, verbose)) return -EBUSY; for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { - cur_ops = scale_ops[i]; - if (strcmp(scale_type, cur_ops->name) == 0) + cur_ops = scale_ops[i]; if (strcmp(scale_type, + cur_ops->name) == 0) break; - } - if (i == ARRAY_SIZE(scale_ops)) { - pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); - pr_alert("rcu-scale types:"); - for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + } if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", + scale_type); pr_alert("rcu-scale types:"); for (i = 0; + i < ARRAY_SIZE(scale_ops); i++) pr_cont(" %s", scale_ops[i]->name); - pr_cont("\n"); - firsterr = -EINVAL; - cur_ops = NULL; + pr_cont("\n"); firsterr = -EINVAL; cur_ops = NULL; goto unwind; } if (cur_ops->init) -- 2.40.1
[PATCH rcu 07/11] srcu: Add srcu_read_lock_lite() and srcu_read_unlock_lite()
This patch adds srcu_read_lock_lite() and srcu_read_unlock_lite(), which dispense with the read-side smp_mb() but also are restricted to code regions that RCU is watching. If a given srcu_struct structure uses srcu_read_lock_lite() and srcu_read_unlock_lite(), it is not permitted to use any other SRCU read-side marker, before, during, or after. Another price of light-weight readers is heavier weight grace periods. Such readers mean that SRCU grace periods on srcu_struct structures used by light-weight readers will incur at least two calls to synchronize_rcu(). In addition, normal SRCU grace periods for light-weight-reader srcu_struct structures never auto-expedite. Note that expedited SRCU grace periods for light-weight-reader srcu_struct structures still invoke synchronize_rcu(), not synchronize_srcu_expedited(). Something about wishing to keep the IPIs down to a dull roar. The srcu_read_lock_lite() and srcu_read_unlock_lite() functions may not (repeat, *not*) be used from NMI handlers, but if this is needed, an additional flavor of SRCU reader can be added by some future commit. [ paulmck: Apply Alexei Starovoitov expediting feedback. ] [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. 
McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: --- include/linux/srcu.h | 51 - include/linux/srcutree.h | 1 + kernel/rcu/srcutree.c| 82 ++-- 3 files changed, 122 insertions(+), 12 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 84daaa33ea0ab..4ba96e2cfa405 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -56,6 +56,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void cleanup_srcu_struct(struct srcu_struct *ssp); int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); +#ifdef CONFIG_TINY_SRCU +#define __srcu_read_lock_lite __srcu_read_lock +#define __srcu_read_unlock_lite __srcu_read_unlock +#else // #ifdef CONFIG_TINY_SRCU +int __srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp); +void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) __releases(ssp); +#endif // #else // #ifdef CONFIG_TINY_SRCU void synchronize_srcu(struct srcu_struct *ssp); #define SRCU_GET_STATE_COMPLETED 0x1 @@ -179,7 +186,7 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); #else -static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { } +#define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) #endif @@ -249,6 +256,32 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) return retval; } +/** + * srcu_read_lock_lite - register a new reader for an SRCU-protected structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section, but for a light-weight + * smp_mb()-free reader. See srcu_read_lock() for more information. 
+ * + * If srcu_read_lock_lite() is ever used on an srcu_struct structure, + * then none of the other flavors may be used, whether before, during, + * or after. Note that grace-period auto-expediting is disabled for _lite + * srcu_struct structures because auto-expedited grace periods invoke + * synchronize_rcu_expedited(), IPIs and all. + * + * Note that srcu_read_lock_lite() can be invoked only from those contexts + * where RCU is watching. Otherwise, lockdep will complain. + */ +static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp) +{ + int retval; + + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); + retval = __srcu_read_lock_lite(ssp); + rcu_try_lock_acquire(&ssp->dep_map); + return retval; +} + /** * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. @@ -325,6 +358,22 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __srcu_read_unlock(ssp, idx); } +/** + * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @idx: return value from corresponding srcu_read_lock(). + * + * Exit a light-weight SRCU read-side critical section. + */ +static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) + __releases(ssp) +{ + WARN_ON_ONCE(idx & ~0x1); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); + srcu_lock_release(&ssp->dep_map); + __srcu_read_unlock(ssp, idx); +} + /** * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old
[PATCH rcu 06/11] srcu: Convert srcu_data ->srcu_reader_flavor to bit field
This commit adds SRCU_READ_FLAVOR_NORMAL and SRCU_READ_FLAVOR_NMI bit definitions and uses them in preparation for adding a third SRCU reader flavor. While in the area, improve a few comments. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: --- include/linux/srcu.h | 35 +++ include/linux/srcutree.h | 4 kernel/rcu/srcutree.c| 22 +++--- 3 files changed, 34 insertions(+), 27 deletions(-) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 06728ef6f32a4..84daaa33ea0ab 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -176,10 +176,6 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -#define SRCU_NMI_UNKNOWN 0x0 -#define SRCU_NMI_UNSAFE0x1 -#define SRCU_NMI_SAFE 0x2 - #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); #else @@ -235,16 +231,19 @@ static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flav * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). * - * Note that srcu_read_lock() and the matching srcu_read_unlock() must - * occur in the same context, for example, it is illegal to invoke - * srcu_read_unlock() in an irq handler if the matching srcu_read_lock() - * was invoked in process context. + * The return value from srcu_read_lock() must be passed unaltered + * to the matching srcu_read_unlock(). Note that srcu_read_lock() and + * the matching srcu_read_unlock() must occur in the same context, for + * example, it is illegal to invoke srcu_read_unlock() in an irq handler + * if the matching srcu_read_lock() was invoked in process context. Or, + * for that matter to invoke srcu_read_unlock() from one task and the + * matching srcu_read_lock() from another. 
*/ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); retval = __srcu_read_lock(ssp); srcu_lock_acquire(&ssp->dep_map); return retval; @@ -256,12 +255,16 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) * * Enter an SRCU read-side critical section, but in an NMI-safe manner. * See srcu_read_lock() for more information. + * + * If srcu_read_lock_nmisafe() is ever used on an srcu_struct structure, + * then none of the other flavors may be used, whether before, during, + * or after. */ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_read_flavor(ssp, true); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI); retval = __srcu_read_lock_nmisafe(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; @@ -273,7 +276,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) { int retval; - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); retval = __srcu_read_lock(ssp); return retval; } @@ -302,7 +305,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) { WARN_ON_ONCE(in_nmi()); - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); return __srcu_read_lock(ssp); } @@ -317,7 +320,7 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock(ssp, idx); } @@ -333,7 +336,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); - srcu_check_read_flavor(ssp, true); + srcu_check_read_flavor(ssp, 
SRCU_READ_FLAVOR_NMI); rcu_lock_release(&ssp->dep_map); __srcu_read_unlock_nmisafe(ssp, idx); } @@ -342,7 +345,7 @@ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) static inline notrace void srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) { - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL); __srcu_read_unlock(ssp, idx); } @@ -359,7 +362,7 @@ static inline void srcu_up_read(struct srcu_struct *ssp, int idx) { WARN_ON_ONCE(idx & ~0x1); WARN_ON_ONCE(in_nmi()); - srcu_check_read_flavor(ssp, false); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
[PATCH rcu 08/11] rcutorture: Expand RCUTORTURE_RDR_MASK_[12] to eight bits
This commit prepares for testing of multiple SRCU reader flavors by expanding RCUTORTURE_RDR_MASK_1 and RCUTORTURE_RDR_MASK_2 from a single bit to eight bits, allowing them to accommodate the return values from multiple calls to srcu_read_lock*(). This will in turn permit better testing coverage for these SRCU reader flavors, including testing of the diagnostics for improper use of mixed reader flavors. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: --- kernel/rcu/rcutorture.c | 28 ++-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b4cb7623a8bfc..d883f01407178 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -57,9 +57,9 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1) -#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2) +#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) #define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ #define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ #define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ @@ -71,6 +71,9 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett > RCUTORTURE_RDR_SHIFT_2) > 1); + WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. 
*/ @@ -1845,9 +1848,9 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesnew & RCUTORTURE_RDR_SCHED) rcu_read_lock_sched(); if (statesnew & RCUTORTURE_RDR_RCU_1) - idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1; + idxnew1 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; if (statesnew & RCUTORTURE_RDR_RCU_2) - idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2; + idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; /* * Next, remove old protection, in decreasing order of strength @@ -1867,7 +1870,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesold & RCUTORTURE_RDR_RBH) rcu_read_unlock_bh(); if (statesold & RCUTORTURE_RDR_RCU_2) { - cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1); + cur_ops->readunlock((idxold2 & RCUTORTURE_RDR_MASK_2) >> RCUTORTURE_RDR_SHIFT_2); WARN_ON_ONCE(idxnew2 != -1); idxold2 = 0; } @@ -1877,7 +1880,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0x); if (lockit) raw_spin_lock_irqsave(¤t->pi_lock, flags); - cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1); + cur_ops->readunlock((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1); WARN_ON_ONCE(idxnew1 != -1); idxold1 = 0; if (lockit) @@ -1892,16 +1895,13 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (idxnew1 == -1) idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1; WARN_ON_ONCE(idxnew1 < 0); - if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1)) - pr_info("Unexpected idxnew1 value of %#x\n", idxnew1); if (idxnew2 == -1) idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2; WARN_ON_ONCE(idxnew2 < 0); - WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1); *readstate = idxnew1 | idxnew2 | newstate; WARN_ON_ONCE(*readstate < 0); - if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1)) - pr_info("Unexpected 
idxnew2 value of %#x\n", idxnew2); + if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS)) + pr_info("Unexpected readstate value of %#x\n", *readstate); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -1926,7 +1926,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); + WARN_ON_ONCE(mask >> RC
[PATCH rcu 10/11] rcutorture: Add srcu_read_lock_lite() support to rcutorture.reader_flavor
This commit causes bit 0x4 of rcutorture.reader_flavor to select the new srcu_read_lock_lite() and srcu_read_unlock_lite() functions. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: --- Documentation/admin-guide/kernel-parameters.txt | 4 ++-- kernel/rcu/rcutorture.c | 7 +++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e107c82f0b21b..39bf8ce17e992 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5357,8 +5357,8 @@ If there is more than one bit set, the readers are entered from low-order bit up, and are exited in the opposite order. For SRCU, the - 0x1 bit is normal readers and the 0x2 bit is - for NMI-safe readers. + 0x1 bit is normal readers, 0x2 NMI-safe readers, + and 0x4 light-weight readers. rcutorture.shuffle_interval= [KNL] Set task-shuffle interval (s). Shuffling tasks diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1a3e0fdca7139..306a449bbad87 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -660,6 +660,11 @@ static int srcu_torture_read_lock(void) WARN_ON_ONCE(idx & ~0x1); ret += idx << 1; } + if (reader_flavor & 0x4) { + idx = srcu_read_lock_lite(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 2; + } return ret; } @@ -685,6 +690,8 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & 0x4) + srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); if (reader_flavor & 0x2) srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) -- 2.40.1
[PATCH rcu 09/11] rcutorture: Add reader_flavor parameter for SRCU readers
This commit adds an rcutorture.reader_flavor parameter whose bits correspond to reader flavors. For example, SRCU's readers are 0x1 for normal and 0x2 for NMI-safe. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Peter Zijlstra Cc: Kent Overstreet Cc: --- .../admin-guide/kernel-parameters.txt | 8 + kernel/rcu/rcutorture.c | 30 ++- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f8f0800852585..e107c82f0b21b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5352,6 +5352,14 @@ The delay, in seconds, between successive read-then-exit testing episodes. + rcutorture.reader_flavor= [KNL] + A bit mask indicating which readers to use. + If there is more than one bit set, the readers + are entered from low-order bit up, and are + exited in the opposite order. For SRCU, the + 0x1 bit is normal readers and the 0x2 bit is + for NMI-safe readers. + rcutorture.shuffle_interval= [KNL] Set task-shuffle interval (s). 
Shuffling tasks allows some CPUs to go into dyntick-idle mode diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d883f01407178..1a3e0fdca7139 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -111,6 +111,7 @@ torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disab torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable"); +torture_param(int, reader_flavor, 0x1, "Reader flavors to use, one per bit."); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -646,10 +647,20 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) static int srcu_torture_read_lock(void) { - if (cur_ops == &srcud_ops) - return srcu_read_lock_nmisafe(srcu_ctlp); - else - return srcu_read_lock(srcu_ctlp); + int idx; + int ret = 0; + + if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) { + idx = srcu_read_lock(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx; + } + if (reader_flavor & 0x2) { + idx = srcu_read_lock_nmisafe(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 1; + } + return ret; } static void @@ -673,10 +684,11 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { - if (cur_ops == &srcud_ops) - srcu_read_unlock_nmisafe(srcu_ctlp, idx); - else - srcu_read_unlock(srcu_ctlp, idx); + WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & 0x2) + srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); + if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) + srcu_read_unlock(srcu_ctlp, 
idx & 0x1); } static int torture_srcu_read_lock_held(void) @@ -2399,6 +2411,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " "read_exit_delay=%d read_exit_burst=%d " +"reader_flavor=%x " "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -2411,6 +2424,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) n_barrier_cbs, onoff_interval, onoff_holdoff, read_exit_delay, read_exit_burst, +reader_flavor, nocbs_nthreads, nocbs_toggle, test_nmis); } -- 2.40.1
Re: [PATCH] selftests/net: do_setcpu function not need to have a return value
On Tue, Sep 03, 2024 at 05:51:11PM +0800, Liu Jing wrote: > in the do_setcpu, this function does not need to have a return value, > which is meaningless > > Signed-off-by: Liu Jing Thanks, I also see that the caller does not check the return value. Reviewed-by: Simon Horman
[PATCH v2 0/2] Properly initialize speed/duplex and remove vDPA config updates
From: Carlos Bilbao Initialize speed and duplex for virtio_net_config to UNKNOWN (mlx5_vdpa vDPA devices currently do not support VIRTIO_NET_F_SPEED_DUPLEX). Remove ioctl VHOST_VDPA_SET_CONFIG and its related logic as it is not supported; see: https://docs.oasis-open.org/virtio/virtio/v1.3/virtio-v1.3.html Carlos: vdpa/mlx5: Set speed and duplex of vDPA devices to UNKNOWN vdpa: Remove ioctl VHOST_VDPA_SET_CONFIG per spec compliance --- Changes since v1: Link: https://lkml.org/lkml/2024/8/29/1368 - Fix prefix of the first commit and add Reviewed-By tag. - Redo second commit completely: instead of attempting to add support to set configuration fields, remove ioctl and support entirely from vDPA implementations -- because it's not allowed by spec. --- drivers/vdpa/alibaba/eni_vdpa.c| 17 - drivers/vdpa/ifcvf/ifcvf_main.c| 10 -- drivers/vdpa/mlx5/net/mlx5_vnet.c | 19 --- drivers/vdpa/pds/vdpa_dev.c| 16 drivers/vdpa/solidrun/snet_main.c | 18 -- drivers/vdpa/vdpa.c| 16 drivers/vdpa/vdpa_sim/vdpa_sim.c | 16 drivers/vdpa/vdpa_sim/vdpa_sim.h | 1 - drivers/vdpa/vdpa_user/vduse_dev.c | 7 --- drivers/vdpa/virtio_pci/vp_vdpa.c | 14 -- drivers/vhost/vdpa.c | 26 -- drivers/virtio/virtio_vdpa.c | 9 - include/linux/vdpa.h | 9 - include/uapi/linux/vhost.h | 8 14 files changed, 16 insertions(+), 170 deletions(-)
[PATCH v2 1/2] vdpa/mlx5: Set speed and duplex of vDPA devices to UNKNOWN
From: Carlos Bilbao Initialize the speed and duplex fields in virtio_net_config to UNKNOWN. This is needed because mlx5_vdpa vDPA devices currently do not support the VIRTIO_NET_F_SPEED_DUPLEX feature which reports speed and duplex. Add needed helper cpu_to_mlx5vdpa32() to convert endianness of speed. Signed-off-by: Carlos Bilbao Reviewed-by: Dragos Tatulea --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 12 1 file changed, 12 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index b56aae3f7be3..5fce6d62af4f 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -173,6 +173,11 @@ static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val) return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val); } +static __virtio32 cpu_to_mlx5vdpa32(struct mlx5_vdpa_dev *mvdev, u32 val) +{ +return __cpu_to_virtio32(mlx5_vdpa_is_little_endian(mvdev), val); +} + static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev) { if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) @@ -3433,6 +3438,13 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, init_rwsem(&ndev->reslock); config = &ndev->config; + /* +* mlx5_vdpa vDPA devices currently don't support reporting or +* setting the speed or duplex. +*/ + config->speed = cpu_to_mlx5vdpa32(mvdev, SPEED_UNKNOWN); + config->duplex = DUPLEX_UNKNOWN; + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) { err = config_func_mtu(mdev, add_config->net.mtu); if (err) -- 2.34.1
[PATCH v2 2/2] vdpa: Remove ioctl VHOST_VDPA_SET_CONFIG per spec compliance
From: Carlos Bilbao Remove invalid ioctl VHOST_VDPA_SET_CONFIG and all its implementations with vdpa_config_ops->set_config(). This is needed per virtio spec requirements; virtio-spec v3.1 Sec 5.1.4 states that "All of the device configuration fields are read-only for the driver." Signed-off-by: Carlos Bilbao --- drivers/vdpa/alibaba/eni_vdpa.c| 17 - drivers/vdpa/ifcvf/ifcvf_main.c| 10 -- drivers/vdpa/mlx5/net/mlx5_vnet.c | 7 --- drivers/vdpa/pds/vdpa_dev.c| 16 drivers/vdpa/solidrun/snet_main.c | 18 -- drivers/vdpa/vdpa.c| 16 drivers/vdpa/vdpa_sim/vdpa_sim.c | 16 drivers/vdpa/vdpa_sim/vdpa_sim.h | 1 - drivers/vdpa/vdpa_user/vduse_dev.c | 7 --- drivers/vdpa/virtio_pci/vp_vdpa.c | 14 -- drivers/vhost/vdpa.c | 26 -- drivers/virtio/virtio_vdpa.c | 9 - include/linux/vdpa.h | 9 - include/uapi/linux/vhost.h | 8 14 files changed, 4 insertions(+), 170 deletions(-) diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c index cce3d1837104..d8f70b385661 100644 --- a/drivers/vdpa/alibaba/eni_vdpa.c +++ b/drivers/vdpa/alibaba/eni_vdpa.c @@ -383,22 +383,6 @@ static void eni_vdpa_get_config(struct vdpa_device *vdpa, *p++ = ioread8(ioaddr + i); } -static void eni_vdpa_set_config(struct vdpa_device *vdpa, - unsigned int offset, const void *buf, - unsigned int len) -{ - struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa); - struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev; - void __iomem *ioaddr = ldev->ioaddr + - VIRTIO_PCI_CONFIG_OFF(eni_vdpa->vectors) + - offset; - const u8 *p = buf; - int i; - - for (i = 0; i < len; i++) - iowrite8(*p++, ioaddr + i); -} - static void eni_vdpa_set_config_cb(struct vdpa_device *vdpa, struct vdpa_callback *cb) { @@ -429,7 +413,6 @@ static const struct vdpa_config_ops eni_vdpa_ops = { .get_vq_align = eni_vdpa_get_vq_align, .get_config_size = eni_vdpa_get_config_size, .get_config = eni_vdpa_get_config, - .set_config = eni_vdpa_set_config, .set_config_cb = eni_vdpa_set_config_cb, .get_vq_irq = eni_vdpa_get_vq_irq, }; diff --git 
a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index e98fa8100f3c..7cbac787ad5f 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -568,15 +568,6 @@ static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev, ifcvf_read_dev_config(vf, offset, buf, len); } -static void ifcvf_vdpa_set_config(struct vdpa_device *vdpa_dev, - unsigned int offset, const void *buf, - unsigned int len) -{ - struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); - - ifcvf_write_dev_config(vf, offset, buf, len); -} - static void ifcvf_vdpa_set_config_cb(struct vdpa_device *vdpa_dev, struct vdpa_callback *cb) { @@ -640,7 +631,6 @@ static const struct vdpa_config_ops ifc_vdpa_ops = { .get_vq_group = ifcvf_vdpa_get_vq_group, .get_config_size= ifcvf_vdpa_get_config_size, .get_config = ifcvf_vdpa_get_config, - .set_config = ifcvf_vdpa_set_config, .set_config_cb = ifcvf_vdpa_set_config_cb, .get_vq_notification = ifcvf_get_vq_notification, }; diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 5fce6d62af4f..fd0db86ba2cf 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2918,12 +2918,6 @@ static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, memcpy(buf, (u8 *)&ndev->config + offset, len); } -static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf, -unsigned int len) -{ - /* not supported */ -} - static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); @@ -3218,7 +3212,6 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .reset = mlx5_vdpa_reset, .get_config_size = mlx5_vdpa_get_config_size, .get_config = mlx5_vdpa_get_config, - .set_config = mlx5_vdpa_set_config, .get_generation = mlx5_vdpa_get_generation, .set_map = mlx5_vdpa_set_map, .set_group_asid = mlx5_set_group_asid, diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c index 
25c0fe5ec3d5..553dcd2aa065 100644 --- a/drivers/vdpa/pds/vdpa_dev.c +++ b/drivers/vdpa/pds/vdpa_dev.c @@ -553,21 +553,6 @@ static void pds_vdpa_get_config(struct
Re: [RFC 00/31] objtool, livepatch: Livepatch module generation
Hi Josh, Thanks for the patchset! We really need this work so that we can undo our hack for LTO enabled kernels. On Mon, Sep 2, 2024 at 9:00 PM Josh Poimboeuf wrote: > > Hi, > > Here's a new way to build livepatch modules called klp-build. > > I started working on it when I realized that objtool already does 99% of > the work needed for detecting function changes. > > This is similar in concept to kpatch-build, but the implementation is > much cleaner. > > Personally I still have reservations about the "source-based" approach > (klp-convert and friends), including the fragility and performance > concerns of -flive-patching. I would submit that klp-build might be > considered the "official" way to make livepatch modules. > > Please try it out and let me know what you think. Based on v6.10. > > Also available at: > > git://git.kernel.org/pub/scm/linux/kernel/git/jpoimboe/linux.git > klp-build-rfc I tried to compile the code in this branch with gcc-12 and llvm-18. Some of these errors are easy to fix (attached below). But some are trickier, for example: with gcc-12: ... 
BTFIDS vmlinux NM System.map SORTTAB vmlinux incomplete ORC unwind tables in file: vmlinux Failed to sort kernel tables with clang-18: :4:1: error: symbol '__alt_0' is already defined 4 | __alt_0: | ^ :4:1: error: symbol '__alt_1' is already defined 4 | __alt_1: | ^ Thanks, Song Fix/hack I have on top of this branch: diff --git i/tools/objtool/check.c w/tools/objtool/check.c index f55dec2932de..5c4152d60780 100644 --- i/tools/objtool/check.c +++ w/tools/objtool/check.c @@ -2,7 +2,7 @@ /* * Copyright (C) 2015-2017 Josh Poimboeuf */ - +#define _GNU_SOURCE #include #include #include @@ -1519,7 +1519,7 @@ static void add_jump_destinations(struct objtool_file *file) struct reloc *reloc; for_each_insn(file, insn) { - struct instruction *dest_insn; + struct instruction *dest_insn = NULL; struct section *dest_sec = NULL; struct symbol *dest_sym = NULL; unsigned long dest_off; diff --git i/tools/objtool/elf.c w/tools/objtool/elf.c index 7960921996bd..462ce897ff29 100644 --- i/tools/objtool/elf.c +++ w/tools/objtool/elf.c @@ -468,10 +468,8 @@ static void elf_add_symbol(struct elf *elf, struct symbol *sym) * * TODO: is this still true? */ -#if 0 - if (sym->type == STT_NOTYPE && !sym->len) + if (sym->type == STT_NOTYPE && !sym->len && false) __sym_remove(sym, &sym->sec->symbol_tree); -#endif sym->demangled_name = demangle_name(sym); } diff --git i/tools/objtool/klp-diff.c w/tools/objtool/klp-diff.c index 76296e38f9ff..4a3f4172f4a5 100644 --- i/tools/objtool/klp-diff.c +++ w/tools/objtool/klp-diff.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2024 Josh Poimboeuf */ +#define _GNU_SOURCE #include #include #include @@ -1109,4 +1110,3 @@ int cmd_klp_diff(int argc, const char **argv) elf_write(elf_out); return 0; } -
Re: [PATCH v2 1/2] vdpa/mlx5: Set speed and duplex of vDPA devices to UNKNOWN
On 9/3/2024 10:15 AM, Carlos Bilbao wrote: From: Carlos Bilbao Initialize the speed and duplex fields in virtio_net_config to UNKNOWN. This is needed because mlx5_vdpa vDPA devicess currently do not support the VIRTIO_NET_F_SPEED_DUPLEX feature which reports speed and duplex. Add needed helper cpu_to_mlx5vdpa32() to convert endianness of speed. Signed-off-by: Carlos Bilbao Reviewed-by: Dragos Tatulea --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 12 1 file changed, 12 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index b56aae3f7be3..5fce6d62af4f 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -173,6 +173,11 @@ static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val) return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val); } +static __virtio32 cpu_to_mlx5vdpa32(struct mlx5_vdpa_dev *mvdev, u32 val) +{ +return __cpu_to_virtio32(mlx5_vdpa_is_little_endian(mvdev), val); I hate picking at little things like this, but this should be a tab rather than 4 spaces. sln +} + static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev) { if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) @@ -3433,6 +3438,13 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, init_rwsem(&ndev->reslock); config = &ndev->config; + /* +* mlx5_vdpa vDPA devices currently don't support reporting or +* setting the speed or duplex. +*/ + config->speed = cpu_to_mlx5vdpa32(mvdev, SPEED_UNKNOWN); + config->duplex = DUPLEX_UNKNOWN; + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) { err = config_func_mtu(mdev, add_config->net.mtu); if (err) -- 2.34.1
Re: [PATCH] selftests: filesystems: fix warn_unused_result build warnings
On 8/16/24 07:11, Shuah Khan wrote: On 8/10/24 07:53, Abhinav Jain wrote: Add return value checks for read & write calls in test_listmount_ns function. This patch resolves below compilation warnings: ``` statmount_test_ns.c: In function ‘test_listmount_ns’: statmount_test_ns.c:322:17: warning: ignoring return value of ‘write’ declared with attribute ‘warn_unused_result’ [-Wunused-result] statmount_test_ns.c:323:17: warning: ignoring return value of ‘read’ declared with attribute ‘warn_unused_result’ [-Wunused-result] ``` Signed-off-by: Abhinav Jain --- Hi Christian, Let me know if it is okay to take this patch through kselftest tree. The change looks good to me. thanks, -- Shuah Thank you for the patch. Applied to linux-kselftest next for Linux 6.12-rc1 thanks, -- Shuah
Re: [PATCH v4 3/4] selftests/x86: don't clobber CFLAGS
Hi Ilpo, On 9/3/24 7:45 AM, Ilpo Järvinen wrote: The x86 selftests makefile clobbers CFLAGS preventing lib.mk from making the necessary adjustments into CFLAGS. This would lead to a build failure after upcoming change which wants to add -DHAVE_CPUID= into CFLAGS. Reorder CFLAGS initialization in x86 selftest. Place the constant part of CFLAGS initialization before inclusion of lib.mk but leave adding KHDR_INCLUDES into CFLAGS into the existing position because KHDR_INCLUDES might be generated inside lib.mk. Signed-off-by: Ilpo Järvinen --- v4: - New patch tools/testing/selftests/x86/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 5c8757a25998..88a6576de92f 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 +CFLAGS := -O2 -g -std=gnu99 -pthread -Wall + all: include ../lib.mk @@ -35,7 +37,7 @@ BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) -CFLAGS := -O2 -g -std=gnu99 -pthread -Wall $(KHDR_INCLUDES) +CFLAGS += $(KHDR_INCLUDES) # call32_from_64 in thunks.S uses absolute addresses. ifeq ($(CAN_BUILD_WITH_NOPIE),1) These changes are becoming less obvious to me. The first two red flags are: - Most other top level Makefiles assign KHDR_INCLUDES to CFLAGS *before* including lib.mk - What current Makefiles do matches the guidance from Documentation/dev-tools/kselftest.rst as per example (verbatim copy): CFLAGS = $(KHDR_INCLUDES) TEST_GEN_PROGS := close_range_test include ../lib.mk Looking closer it is not clear to me why lib.mk sets KHDR_INCLUDES. 
As I understand the usage is intended to be: make TARGETS="target" -C tools/testing/selftests This means that it is tools/testing/selftests/Makefile that will run first and it ensures that KHDR_INCLUDES is set and supports the usage documented in Documentation/dev-tools/kselftest.rst One usage that a change like in this patch could support is for users to be able to run "make" from within the test directory self ... but considering the current KHDR_INCLUDES custom this does not seem to be supported? (but we cannot know which tests/users rely on this behavior) Looking further I also noticed that tools/testing/selftests/Makefile even sets ARCH (but does not export it). When considering the next patch it almost looks like what is missing is for tools/testing/selftests/Makefile to "export ARCH" ... but that potentially impacts the Makefiles that do manipulation of ARCH. I initially started to look at this because of the resctrl impact, but I clearly am not familiar enough with the kselftest build files to understand the impacts nor provide guidance. I do hope the kselftest folks can help here. Reinette
Re: [PATCH] nvdimm: Use of_property_present() and of_property_read_bool()
On Wed, Jul 31, 2024 at 2:14 PM Rob Herring (Arm) wrote: > > Use of_property_present() and of_property_read_bool() to test > property presence and read boolean properties rather than > of_(find|get)_property(). This is part of a larger effort to remove > callers of of_find_property() and similar functions. > of_(find|get)_property() leak the DT struct property and data pointers > which is a problem for dynamically allocated nodes which may be freed. > > Signed-off-by: Rob Herring (Arm) > --- > drivers/nvdimm/of_pmem.c | 2 +- > drivers/nvmem/layouts.c | 2 +- > 2 files changed, 2 insertions(+), 2 deletions(-) Ping > diff --git a/drivers/nvdimm/of_pmem.c b/drivers/nvdimm/of_pmem.c > index 403384f25ce3..b4a1cf70e8b7 100644 > --- a/drivers/nvdimm/of_pmem.c > +++ b/drivers/nvdimm/of_pmem.c > @@ -47,7 +47,7 @@ static int of_pmem_region_probe(struct platform_device > *pdev) > } > platform_set_drvdata(pdev, priv); > > - is_volatile = !!of_find_property(np, "volatile", NULL); > + is_volatile = of_property_read_bool(np, "volatile"); > dev_dbg(&pdev->dev, "Registering %s regions from %pOF\n", > is_volatile ? "volatile" : "non-volatile", np); > > diff --git a/drivers/nvmem/layouts.c b/drivers/nvmem/layouts.c > index 77a4119efea8..65d39e19f6ec 100644 > --- a/drivers/nvmem/layouts.c > +++ b/drivers/nvmem/layouts.c > @@ -123,7 +123,7 @@ static int nvmem_layout_bus_populate(struct nvmem_device > *nvmem, > int ret; > > /* Make sure it has a compatible property */ > - if (!of_get_property(layout_dn, "compatible", NULL)) { > + if (!of_property_present(layout_dn, "compatible")) { > pr_debug("%s() - skipping %pOF, no compatible prop\n", > __func__, layout_dn); > return 0; > -- > 2.43.0 >
Re: [PATCH rcu 07/11] srcu: Add srcu_read_lock_lite() and srcu_read_unlock_lite()
On Tue, Sep 3, 2024 at 9:33 AM Paul E. McKenney wrote: > > diff --git a/include/linux/srcu.h b/include/linux/srcu.h > index 84daaa33ea0ab..4ba96e2cfa405 100644 > --- a/include/linux/srcu.h > +++ b/include/linux/srcu.h ... > +static inline int srcu_read_lock_lite(struct srcu_struct *ssp) > __acquires(ssp) > +{ > + int retval; > + > + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); > + retval = __srcu_read_lock_lite(ssp); > + rcu_try_lock_acquire(&ssp->dep_map); > + return retval; > +} ... > diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c > index 602b4b8c4b891..bab888e86b9bb 100644 > --- a/kernel/rcu/srcutree.c > +++ b/kernel/rcu/srcutree.c > +int __srcu_read_lock_lite(struct srcu_struct *ssp) > +{ > + int idx; > + > + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching > srcu_read_lock_lite()."); > + idx = READ_ONCE(ssp->srcu_idx) & 0x1; > + this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */ > + barrier(); /* Avoid leaking the critical section. */ > + return idx; > +} > +EXPORT_SYMBOL_GPL(__srcu_read_lock_lite); The use cases where smp_mb() penalty is noticeable probably will notice the cost of extra call too. Can the main part be in srcu.h as well to make it truly "lite" ? Otherwise we'd have to rely on compilers doing LTO which may or may not happen.
[PATCH RESEND] selftests/futex: Order calls in futex_requeue
Similar to fbf4dec70277 ("selftests/futex: Order calls to futex_lock_pi"), which fixed a flake in futex_lock_pi due to racing between the parent and child threads. The same issue can occur in the futex_requeue test, because it expects waiterfn to make progress to futex_wait before the parent starts to requeue. This is mitigated by the parent sleeping for WAKE_WAIT_US, but it still fails occasionally. This can be reproduced by adding a sleep in the waiterfn before futex_wait: TAP version 13 1..2 not ok 1 futex_requeue simple returned: 0 not ok 2 futex_requeue simple returned: 0 not ok 3 futex_requeue many returned: 0 not ok 4 futex_requeue many returned: 0 Instead, replace the sleep with barriers to make the sequencing explicit. Fixes: 7cb5dd8e2c8c ("selftests: futex: Add futex compare requeue test") Reviewed-by: Muhammad Usama Anjum Signed-off-by: Edward Liaw --- .../selftests/futex/functional/futex_requeue.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_requeue.c b/tools/testing/selftests/futex/functional/futex_requeue.c index 51485be6eb2f..8f7d3e8bf32a 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue.c +++ b/tools/testing/selftests/futex/functional/futex_requeue.c @@ -12,9 +12,9 @@ #define TEST_NAME "futex-requeue" #define timeout_ns 3000 -#define WAKE_WAIT_US 1 volatile futex_t *f1; +static pthread_barrier_t barrier; void usage(char *prog) { @@ -32,6 +32,8 @@ void *waiterfn(void *arg) to.tv_sec = 0; to.tv_nsec = timeout_ns; + pthread_barrier_wait(&barrier); + if (futex_wait(f1, *f1, &to, 0)) printf("waiter failed errno %d\n", errno); @@ -70,13 +72,15 @@ int main(int argc, char *argv[]) ksft_print_msg("%s: Test futex_requeue\n", basename(argv[0])); + pthread_barrier_init(&barrier, NULL, 2); /* * Requeue a waiter from f1 to f2, and wake f2. 
*/ if (pthread_create(&waiter[0], NULL, waiterfn, NULL)) error("pthread_create failed\n", errno); - usleep(WAKE_WAIT_US); + pthread_barrier_wait(&barrier); + pthread_barrier_destroy(&barrier); info("Requeuing 1 futex from f1 to f2\n"); res = futex_cmp_requeue(f1, 0, &f2, 0, 1, 0); @@ -99,6 +103,7 @@ int main(int argc, char *argv[]) ksft_test_result_pass("futex_requeue simple succeeds\n"); } + pthread_barrier_init(&barrier, NULL, 11); /* * Create 10 waiters at f1. At futex_requeue, wake 3 and requeue 7. @@ -109,7 +114,8 @@ int main(int argc, char *argv[]) error("pthread_create failed\n", errno); } - usleep(WAKE_WAIT_US); + pthread_barrier_wait(&barrier); + pthread_barrier_destroy(&barrier); info("Waking 3 futexes at f1 and requeuing 7 futexes from f1 to f2\n"); res = futex_cmp_requeue(f1, 0, &f2, 3, 7, 0); -- 2.46.0.469.g59c65b2a67-goog
Re: [PATCH] selftests/vDSO: support DT_GNU_HASH
On 8/27/24 07:37, Fangrui Song wrote: On Tue, Aug 27, 2024 at 10:12 PM Shuah Khan wrote: On 8/26/24 00:07, Xi Ruoyao wrote: On Wed, 2024-08-14 at 20:26 -0700, Fangrui Song wrote: glibc added support for DT_GNU_HASH in 2006 and DT_HASH has been obsoleted for more than one decade in many Linux distributions. Many vDSOs support DT_GNU_HASH. This patch adds selftests support. Signed-off-by: Fangrui Song --- Ping. Some context: I'd change LoongArch vDSO to use the toolchain default instead of forcing DT_HASH (note that LoongArch is launched decades after all major distros switched to DT_GNU_HASH), but without the selftest support we'll lose test coverage. And now ARM64 has already lost test coverage after commit 48f6430505c0. I am seeing several checkpatch errors - please fix them and send me v2. thanks, -- Shuah The applicable change is: --- i/tools/testing/selftests/vDSO/parse_vdso.c +++ w/tools/testing/selftests/vDSO/parse_vdso.c @@ -177,7 +177,7 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base) if (vdso_info.gnu_hash) { vdso_info.nbucket = vdso_info.gnu_hash[0]; /* The bucket array is located after the header (4 uint32) and the bloom - filter (size_t array of gnu_hash[2] elements). */ +* filter (size_t array of gnu_hash[2] elements). */ vdso_info.bucket = vdso_info.gnu_hash + 4 + sizeof(size_t) / 4 * vdso_info.gnu_hash[2]; } else { Other checkpatch.pl output is not actionable. `ELF(Sym) *sym` instead of `ELF(Sym) * sym` has the correct spacing (used in this file and elsewhere ElfW in the code base). Okay. Send v2 with the actionable change. thanks, -- Shuah
[PATCH v2 0/1] Add KUnit tests for kfifo
Hi all, This is part of a hackathon organized by LKCAMP [1], focused on writing tests using KUnit. We reached out a while ago asking for advice on what would be a useful contribution [2] and ended up choosing data structures that did not yet have tests. This patch series depends on the patch that moves the KUnit tests on lib/ into lib/tests/ [3]. This patch adds tests for the kfifo data structure, defined in include/linux/kfifo.h, and is inspired by the KUnit tests for the doubly linked list in lib/tests/list-test.c (previously at lib/list-test.c) [4]. [1] https://lkcamp.dev/about/ [2] https://lore.kernel.org/all/Zktnt7rjKryTh9-N@arch/ [3] https://lore.kernel.org/all/20240720181025.work.002-k...@kernel.org/ [4] https://elixir.bootlin.com/linux/latest/source/lib/list-test.c --- Changes in v2: - Add MODULE_DESCRIPTION() - Move the tests from lib/kfifo-test.c to lib/tests/kfifo_kunit.c Diego Vieira (1): lib/tests/kfifo_kunit.c: add tests for the kfifo structure lib/Kconfig.debug | 14 +++ lib/tests/Makefile | 1 + lib/tests/kfifo_kunit.c | 224 3 files changed, 239 insertions(+) create mode 100644 lib/tests/kfifo_kunit.c -- 2.34.1
[PATCH v2 1/1] lib/tests/kfifo_kunit.c: add tests for the kfifo structure
Add KUnit tests for the kfifo data structure. They test the vast majority of macros defined in the kfifo header (include/linux/kfifo.h). These are inspired by the existing tests for the doubly linked list in lib/tests/list-test.c (previously at lib/list-test.c) [1]. Note that this patch depends on the patch that moves the KUnit tests on lib/ into lib/tests/ [2]. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/list-test.c?h=v6.11-rc6 [2] https://lore.kernel.org/all/20240720181025.work.002-k...@kernel.org/ Signed-off-by: Diego Vieira --- lib/Kconfig.debug | 14 +++ lib/tests/Makefile | 1 + lib/tests/kfifo_kunit.c | 224 3 files changed, 239 insertions(+) create mode 100644 lib/tests/kfifo_kunit.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 59b6765d86b8..d7a4b996d731 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2646,6 +2646,20 @@ config SYSCTL_KUNIT_TEST If unsure, say N. +config KFIFO_KUNIT_TEST + tristate "KUnit Test for the generic kernel FIFO implementation" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This builds the generic FIFO implementation KUnit test suite. + It tests that the API and basic functionality of the kfifo type + and associated macros. + + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. 
+ config LIST_KUNIT_TEST tristate "KUnit Test for Kernel Linked-list structures" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/tests/Makefile b/lib/tests/Makefile index c6a14cc8663e..42699c7ee638 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_TEST_IOV_ITER) += kunit_iov_iter.o obj-$(CONFIG_IS_SIGNED_TYPE_KUNIT_TEST) += is_signed_type_kunit.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o +obj-$(CONFIG_KFIFO_KUNIT_TEST) += kfifo_kunit.o obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o obj-$(CONFIG_MEMCPY_KUNIT_TEST) += memcpy_kunit.o diff --git a/lib/tests/kfifo_kunit.c b/lib/tests/kfifo_kunit.c new file mode 100644 index ..a85eedc3195a --- /dev/null +++ b/lib/tests/kfifo_kunit.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test for the generic kernel FIFO implementation. + * + * Copyright (C) 2024 Diego Vieira + */ +#include + +#include + +#define KFIFO_SIZE 32 +#define N_ELEMENTS 5 + +static void kfifo_test_reset_should_clear_the_fifo(struct kunit *test) +{ + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + kfifo_put(&my_fifo, 1); + kfifo_put(&my_fifo, 2); + kfifo_put(&my_fifo, 3); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 3); + + kfifo_reset(&my_fifo); + + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 0); + KUNIT_EXPECT_TRUE(test, kfifo_is_empty(&my_fifo)); +} + +static void kfifo_test_define_should_define_an_empty_fifo(struct kunit *test) +{ + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + KUNIT_EXPECT_TRUE(test, kfifo_initialized(&my_fifo)); + KUNIT_EXPECT_TRUE(test, kfifo_is_empty(&my_fifo)); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 0); +} + +static void kfifo_test_len_should_ret_n_of_stored_elements(struct kunit *test) +{ + u8 buffer1[N_ELEMENTS]; + + for (int i = 0; i < N_ELEMENTS; i++) + buffer1[i] = i + 1; + + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + KUNIT_EXPECT_EQ(test, 
kfifo_len(&my_fifo), 0); + + kfifo_in(&my_fifo, buffer1, N_ELEMENTS); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), N_ELEMENTS); + + kfifo_in(&my_fifo, buffer1, N_ELEMENTS); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), N_ELEMENTS * 2); + + kfifo_reset(&my_fifo); + KUNIT_EXPECT_EQ(test, kfifo_len(&my_fifo), 0); +} + +static void kfifo_test_put_should_insert_and_get_should_pop(struct kunit *test) +{ + u8 out_data = 0; + int processed_elements; + u8 elements[] = { 3, 5, 11 }; + + DEFINE_KFIFO(my_fifo, u8, KFIFO_SIZE); + + // If the fifo is empty, get returns 0 + processed_elements = kfifo_get(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 0); + KUNIT_EXPECT_EQ(test, out_data, 0); + + for (int i = 0; i < 3; i++) + kfifo_put(&my_fifo, elements[i]); + + for (int i = 0; i < 3; i++) { + processed_elements = kfifo_get(&my_fifo, &out_data); + KUNIT_EXPECT_EQ(test, processed_elements, 1); + KUNIT_EXPECT_EQ(test, out_data, elements[i]); + } +} + +static void kfifo_test_in_should_insert_multiple_elements(struct kunit *test) +{ + u8 in_buffer[] = { 11, 25, 65 }; + u8 out_data; + int processed_
Re: [PATCH rcu 0/11] Add light-weight readers for SRCU
On Tue, Sep 03, 2024 at 09:32:51AM GMT, Paul E. McKenney wrote: > Hello! > > This series provides light-weight readers for SRCU. This lightness > is selected by the caller by using the new srcu_read_lock_lite() and > srcu_read_unlock_lite() flavors instead of the usual srcu_read_lock() and > srcu_read_unlock() flavors. Although this passes significant rcutorture > testing, this should still be considered to be experimental. This avoids memory barriers, correct? > There are a few restrictions: (1) If srcu_read_lock_lite() is called > on a given srcu_struct structure, then no other flavor may be used on > that srcu_struct structure, before, during, or after. (2) The _lite() > readers may only be invoked from regions of code where RCU is watching > (as in those regions in which rcu_is_watching() returns true). (3) > There is no auto-expediting for srcu_struct structures that have > been passed to _lite() readers. (4) SRCU grace periods for _lite() > srcu_struct structures invoke synchronize_rcu() at least twice, thus > having longer latencies than their non-_lite() counterparts. (5) Even > with synchronize_srcu_expedited(), the resulting SRCU grace period > will invoke synchronize_rcu() at least twice, as opposed to invoking > the IPI-happy synchronize_rcu_expedited() function. (6) Just as with > srcu_read_lock() and srcu_read_unlock(), the srcu_read_lock_lite() and > srcu_read_unlock_lite() functions may not (repeat, *not*) be invoked > from NMI handlers (that is what the _nmisafe() interface are for). > Although one could imagine readers that were both _lite() and _nmisafe(), > one might also imagine that the read-modify-write atomic operations that > are needed by any NMI-safe SRCU read marker would make this unhelpful > from a performance perspective. So if I'm following, this should work fine for bcachefs, and be a nifty small performance boost. Can I give you an account for my test cluster? If you'd like, we can convert bcachefs to it and point it at your branch.
[PATCH v2 0/1] Add KUnit tests for llist
Hi all, This is part of a hackathon organized by LKCAMP[1], focused on writing tests using KUnit. We reached out a while ago asking for advice on what would be a useful contribution[2] and ended up choosing data structures that did not yet have tests. This patch adds tests for the llist data structure, defined in include/linux/llist.h, and is inspired by the KUnit tests for the doubly linked list in lib/list-test.c[3]. It is important to note that this patch depends on the patch referenced in [4], as it utilizes the newly created lib/tests/ subdirectory. [1] https://lkcamp.dev/about/ [2] https://lore.kernel.org/all/Zktnt7rjKryTh9-N@arch/ [3] https://elixir.bootlin.com/linux/latest/source/lib/list-test.c [4] https://lore.kernel.org/all/20240720181025.work.002-k...@kernel.org/ --- Changes in v2: - Add MODULE_DESCRIPTION() - Move the tests from lib/llist_kunit.c to lib/tests/llist_kunit.c - Change the license from "GPL v2" to "GPL" Artur Alves (1): lib/llist_kunit.c: add KUnit tests for llist lib/Kconfig.debug | 11 ++ lib/tests/Makefile | 1 + lib/tests/llist_kunit.c | 361 3 files changed, 373 insertions(+) create mode 100644 lib/tests/llist_kunit.c -- 2.46.0
[PATCH v2 1/1] lib/llist_kunit.c: add KUnit tests for llist
Add KUnit tests for the llist data structure. They test the vast majority of methods and macros defined in include/linux/llist.h. These are inspired by the existing tests for the 'list' doubly linked in lib/list-test.c [1]. Each test case (llist_test_x) tests the behaviour of the llist function/macro 'x'. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/list-test.c?h=v6.11-rc6 Signed-off-by: Artur Alves --- lib/Kconfig.debug | 11 ++ lib/tests/Makefile | 1 + lib/tests/llist_kunit.c | 361 3 files changed, 373 insertions(+) create mode 100644 lib/tests/llist_kunit.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a30c03a66172..b2725daccc52 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2813,6 +2813,17 @@ config USERCOPY_KUNIT_TEST on the copy_to/from_user infrastructure, making sure basic user/kernel boundary testing is working. +config LLIST_KUNIT_TEST + tristate "KUnit tests for lib/llist" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This option builds the "llist_kunit" test module that + helps to verify the correctness of the functions and + macros defined in (). + + If unsure, say N. + config TEST_UDELAY tristate "udelay test driver" help diff --git a/lib/tests/Makefile b/lib/tests/Makefile index c6a14cc8663e..8d7c40a73110 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -34,4 +34,5 @@ CFLAGS_stackinit_kunit.o += $(call cc-disable-warning, switch-unreachable) obj-$(CONFIG_STACKINIT_KUNIT_TEST) += stackinit_kunit.o obj-$(CONFIG_STRING_KUNIT_TEST) += string_kunit.o obj-$(CONFIG_STRING_HELPERS_KUNIT_TEST) += string_helpers_kunit.o +obj-$(CONFIG_LLIST_KUNIT_TEST) += llist_kunit.o diff --git a/lib/tests/llist_kunit.c b/lib/tests/llist_kunit.c new file mode 100644 index ..f273c0d175c7 --- /dev/null +++ b/lib/tests/llist_kunit.c @@ -0,0 +1,361 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test for the Kernel lock-less linked-list structure. 
+ * + * Author: Artur Alves + */ + +#include +#include + +#define ENTRIES_SIZE 3 + +struct llist_test_struct { + int data; + struct llist_node node; +}; + +static void llist_test_init_llist(struct kunit *test) +{ + /* test if the llist is correctly initialized */ + struct llist_head llist1 = LLIST_HEAD_INIT(llist1); + LLIST_HEAD(llist2); + struct llist_head llist3, *llist4, *llist5; + + KUNIT_EXPECT_TRUE(test, llist_empty(&llist1)); + + KUNIT_EXPECT_TRUE(test, llist_empty(&llist2)); + + init_llist_head(&llist3); + KUNIT_EXPECT_TRUE(test, llist_empty(&llist3)); + + llist4 = kzalloc(sizeof(*llist4), GFP_KERNEL | __GFP_NOFAIL); + init_llist_head(llist4); + KUNIT_EXPECT_TRUE(test, llist_empty(llist4)); + kfree(llist4); + + llist5 = kmalloc(sizeof(*llist5), GFP_KERNEL | __GFP_NOFAIL); + memset(llist5, 0xFF, sizeof(*llist5)); + init_llist_head(llist5); + KUNIT_EXPECT_TRUE(test, llist_empty(llist5)); + kfree(llist5); +} + +static void llist_test_init_llist_node(struct kunit *test) +{ + struct llist_node a; + + init_llist_node(&a); + + KUNIT_EXPECT_PTR_EQ(test, a.next, &a); +} + +static void llist_test_llist_entry(struct kunit *test) +{ + struct llist_test_struct test_struct, *aux; + struct llist_node *llist = &test_struct.node; + + aux = llist_entry(llist, struct llist_test_struct, node); + KUNIT_EXPECT_PTR_EQ(test, &test_struct, aux); +} + +static void llist_test_add(struct kunit *test) +{ + struct llist_node a, b; + LLIST_HEAD(llist); + + init_llist_node(&a); + init_llist_node(&b); + + /* The first assertion must be true, given that llist is empty */ + KUNIT_EXPECT_TRUE(test, llist_add(&a, &llist)); + KUNIT_EXPECT_FALSE(test, llist_add(&b, &llist)); + + /* Should be [List] -> b -> a */ + KUNIT_EXPECT_PTR_EQ(test, llist.first, &b); + KUNIT_EXPECT_PTR_EQ(test, b.next, &a); +} + +static void llist_test_add_batch(struct kunit *test) +{ + struct llist_node a, b, c; + LLIST_HEAD(llist); + LLIST_HEAD(llist2); + + init_llist_node(&a); + init_llist_node(&b); + 
init_llist_node(&c); + + llist_add(&a, &llist2); + llist_add(&b, &llist2); + llist_add(&c, &llist2); + + /* This assertion must be true, given that llist is empty */ + KUNIT_EXPECT_TRUE(test, llist_add_batch(&c, &a, &llist)); + + /* should be [List] -> c -> b -> a */ + KUNIT_EXPECT_PTR_EQ(test, llist.first, &c); + KUNIT_EXPECT_PTR_EQ(test, c.next, &b); + KUNIT_EXPECT_PTR_EQ(test, b.next, &a); +} + +static void llist_test_llist_next(struct kunit *test) +{ + struct llist_node a, b; + LLIST_HEAD(llist); + + init_llist_node(&a); + init_l
Re: [PATCH v6 1/2] selftests: Rename sigaltstack to generic signal
On 8/30/24 10:29, Dev Jain wrote: On 8/27/24 17:16, Dev Jain wrote: On 8/27/24 17:14, Shuah Khan wrote: On 8/22/24 06:14, Dev Jain wrote: Rename sigaltstack to generic signal directory, to allow adding more signal tests in the future. Sorry - I think I mentioned I don't like this test renamed. Why are you sending this rename still included in the patch series? I am not renaming the test, just the directory. The directory name is changed to signal, and I have retained the name of the test - sas.c. Gentle ping: I guess there was a misunderstanding; in v5, I was also changing the name of the test, to which you objected, and I agreed. But, we need to change the name of the directory since the new test has no relation to the current directory name, "sigaltstack". The patch description explains that the directory should be generically named. Right. You are no longer changing the test name. You are still changing the directory name. The problem I mentioned stays the same. Any fixes to the existing tests in this directory can no longer be auto-applied to stable releases. Other than the desire to rename the directory to generic, what other value does this change bring? thanks, -- Shuah
Re: [PATCH v2] selftests: splice: Add usage() to splice_read.c
On 8/30/24 23:14, Rong Tao wrote: From: Rong Tao Give the programmer more help information to inform the program on how to use it. Signed-off-by: Rong Tao --- tools/testing/selftests/splice/splice_read.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/splice/splice_read.c b/tools/testing/selftests/splice/splice_read.c index 46dae6a25cfb..73a8bc146f97 100644 --- a/tools/testing/selftests/splice/splice_read.c +++ b/tools/testing/selftests/splice/splice_read.c @@ -9,6 +9,12 @@ #include #include +void usage(const char *prog) +{ + fprintf(stderr, "Usage: %s INPUT [BYTES]\n", prog); + fprintf(stderr, " %s /etc/os-release | cat\n", prog); How does replacing %s INPUT [BYTES]\n", prog with /etc/os-release | cat\n" do? Also what happens on distros that don't have os-release file? +} + int main(int argc, char *argv[]) { int fd; @@ -16,7 +22,7 @@ int main(int argc, char *argv[]) ssize_t spliced; if (argc < 2) { - fprintf(stderr, "Usage: %s INPUT [BYTES]\n", argv[0]); + usage(argv[0]); return EXIT_FAILURE; } @@ -49,6 +55,7 @@ int main(int argc, char *argv[]) size, SPLICE_F_MOVE); if (spliced < 0) { perror("splice"); + usage(argv[0]); return EXIT_FAILURE; } I don't understand how this change helps user. thanks, -- Shuah
Re: [PATCH rcu 0/11] Add light-weight readers for SRCU
On Tue, Sep 3, 2024 at 9:32 AM Paul E. McKenney wrote: > > Hello! > > This series provides light-weight readers for SRCU. This lightness > is selected by the caller by using the new srcu_read_lock_lite() and > srcu_read_unlock_lite() flavors instead of the usual srcu_read_lock() and > srcu_read_unlock() flavors. Although this passes significant rcutorture > testing, this should still be considered to be experimental. > > There are a few restrictions: (1) If srcu_read_lock_lite() is called > on a given srcu_struct structure, then no other flavor may be used on > that srcu_struct structure, before, during, or after. (2) The _lite() > readers may only be invoked from regions of code where RCU is watching > (as in those regions in which rcu_is_watching() returns true). (3) > There is no auto-expediting for srcu_struct structures that have > been passed to _lite() readers. (4) SRCU grace periods for _lite() > srcu_struct structures invoke synchronize_rcu() at least twice, thus > having longer latencies than their non-_lite() counterparts. (5) Even > with synchronize_srcu_expedited(), the resulting SRCU grace period > will invoke synchronize_rcu() at least twice, as opposed to invoking > the IPI-happy synchronize_rcu_expedited() function. (6) Just as with > srcu_read_lock() and srcu_read_unlock(), the srcu_read_lock_lite() and > srcu_read_unlock_lite() functions may not (repeat, *not*) be invoked > from NMI handlers (that is what the _nmisafe() interface are for). > Although one could imagine readers that were both _lite() and _nmisafe(), > one might also imagine that the read-modify-write atomic operations that > are needed by any NMI-safe SRCU read marker would make this unhelpful > from a performance perspective. > > All that said, the patches in this series are as follows: > > 1. Rename srcu_might_be_idle() to srcu_should_expedite(). > > 2. Introduce srcu_gp_is_expedited() helper function. > > 3. Renaming in preparation for additional reader flavor. > > 4. 
Bit manipulation changes for additional reader flavor. > > 5. Standardize srcu_data pointers to "sdp" and similar. > > 6. Convert srcu_data ->srcu_reader_flavor to bit field. > > 7. Add srcu_read_lock_lite() and srcu_read_unlock_lite(). > > 8. rcutorture: Expand RCUTORTURE_RDR_MASK_[12] to eight bits. > > 9. rcutorture: Add reader_flavor parameter for SRCU readers. > > 10. rcutorture: Add srcu_read_lock_lite() support to > rcutorture.reader_flavor. > > 11. refscale: Add srcu_read_lock_lite() support using "srcu-lite". > > Thanx, Paul > Thanks Paul for working on this! I applied your patches on top of all my uprobe changes (including the RFC patches that remove locks, optimize VMA to inode resolution, etc, etc; basically the fastest uprobe/uretprobe state I can get to). And then tested a few changes: - A) baseline (no SRCU-lite, RCU Tasks Trace for uprobe, normal SRCU for uretprobes) - B) A + SRCU-lite for uretprobes (i.e., SRCU to SRCU-lite conversion) - C) B + RCU Tasks Trace converted to SRCU-lite - D) I also pessimized baseline by reverting RCU Tasks Trace, so both uprobes and uretprobes are SRCU protected. This allowed me to see a pure gain of SRCU-lite over SRCU for uprobes, taking RCU Tasks Trace performance out of the equation. In uprobes I used basically two benchmarks. One, uprobe-nop, that benchmarks entry uprobes (which are the fastest most optimized case, using RCU Tasks Trace in A and SRCU in D), and another that benchmarks return uprobes (uretprobes), called uretprobe-nop, which is normal SRCU both in A) and D). The latter uretprobe-nop benchmark basically combines entry and return probe overheads, because that's how uretprobes work. So, below are the most meaningful comparisons. 
First, SRCU vs SRCU-lite for uretprobes: BASELINE (A) uretprobe-nop ( 1 cpus):1.941 ± 0.002M/s ( 1.941M/s/cpu) uretprobe-nop ( 2 cpus):3.731 ± 0.001M/s ( 1.866M/s/cpu) uretprobe-nop ( 3 cpus):5.492 ± 0.002M/s ( 1.831M/s/cpu) uretprobe-nop ( 4 cpus):7.234 ± 0.003M/s ( 1.808M/s/cpu) uretprobe-nop ( 8 cpus): 13.448 ± 0.098M/s ( 1.681M/s/cpu) uretprobe-nop (16 cpus): 22.905 ± 0.009M/s ( 1.432M/s/cpu) uretprobe-nop (32 cpus): 44.760 ± 0.069M/s ( 1.399M/s/cpu) uretprobe-nop (40 cpus): 52.986 ± 0.104M/s ( 1.325M/s/cpu) uretprobe-nop (64 cpus): 43.650 ± 0.435M/s ( 0.682M/s/cpu) uretprobe-nop (80 cpus): 46.831 ± 0.938M/s ( 0.585M/s/cpu) SRCU-lite for uretprobe (B) === uretprobe-nop ( 1 cpus):2.014 ± 0.014M/s ( 2.014M/s/cpu) uretprobe-nop ( 2 cpus):3.820 ± 0.002M/s ( 1.910M/s/cpu) uretprobe-nop ( 3 cpus):5.640 ± 0.003M/s ( 1.880M/s/cpu) uretprobe-nop ( 4 cpus):7.410 ± 0.003M/s ( 1.852M/s/cpu) uretprobe-nop ( 8 cpus): 1
Re: [PATCH rcu 07/11] srcu: Add srcu_read_lock_lite() and srcu_read_unlock_lite()
On Tue, Sep 03, 2024 at 12:45:23PM -0700, Alexei Starovoitov wrote: > On Tue, Sep 3, 2024 at 9:33 AM Paul E. McKenney wrote: > > > > diff --git a/include/linux/srcu.h b/include/linux/srcu.h > > index 84daaa33ea0ab..4ba96e2cfa405 100644 > > --- a/include/linux/srcu.h > > +++ b/include/linux/srcu.h > ... > > > +static inline int srcu_read_lock_lite(struct srcu_struct *ssp) > > __acquires(ssp) > > +{ > > + int retval; > > + > > + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); > > + retval = __srcu_read_lock_lite(ssp); > > + rcu_try_lock_acquire(&ssp->dep_map); > > + return retval; > > +} > > ... > > > diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c > > index 602b4b8c4b891..bab888e86b9bb 100644 > > --- a/kernel/rcu/srcutree.c > > +++ b/kernel/rcu/srcutree.c > > +int __srcu_read_lock_lite(struct srcu_struct *ssp) > > +{ > > + int idx; > > + > > + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching > > srcu_read_lock_lite()."); > > + idx = READ_ONCE(ssp->srcu_idx) & 0x1; > > + this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */ > > + barrier(); /* Avoid leaking the critical section. */ > > + return idx; > > +} > > +EXPORT_SYMBOL_GPL(__srcu_read_lock_lite); > > The use cases where smp_mb() penalty is noticeable probably will notice > the cost of extra call too. > Can the main part be in srcu.h as well to make it truly "lite" ? > Otherwise we'd have to rely on compilers doing LTO which may or may not > happen. I vaguely recall #include issues for old-times srcu_read_lock() and srcu_read_unlock(), but I will try it and see what happens. In the meantime, if you are too curious for your own good... ;-) One way to check the performance is to work in the other direction. For example, add ndelay(10) or similar to the current srcu_read_lock_lite() implementation and seeing if this is visible at the system level. Thanx, Paul
Re: [PATCH rcu 0/11] Add light-weight readers for SRCU
On Tue, Sep 03, 2024 at 05:38:05PM -0400, Kent Overstreet wrote: > On Tue, Sep 03, 2024 at 09:32:51AM GMT, Paul E. McKenney wrote: > > Hello! > > > > This series provides light-weight readers for SRCU. This lightness > > is selected by the caller by using the new srcu_read_lock_lite() and > > srcu_read_unlock_lite() flavors instead of the usual srcu_read_lock() and > > srcu_read_unlock() flavors. Although this passes significant rcutorture > > testing, this should still be considered to be experimental. > > This avoids memory barriers, correct? Yes, there are no smp_mb() invocations in either srcu_read_lock_lite() or srcu_read_unlock_lite(). As usual, nothing comes for free, so the overhead is moved to the update side, and amplified, in the guise of the at least two calls to synchronize_rcu(). > > There are a few restrictions: (1) If srcu_read_lock_lite() is called > > on a given srcu_struct structure, then no other flavor may be used on > > that srcu_struct structure, before, during, or after. (2) The _lite() > > readers may only be invoked from regions of code where RCU is watching > > (as in those regions in which rcu_is_watching() returns true). (3) > > There is no auto-expediting for srcu_struct structures that have > > been passed to _lite() readers. (4) SRCU grace periods for _lite() > > srcu_struct structures invoke synchronize_rcu() at least twice, thus > > having longer latencies than their non-_lite() counterparts. (5) Even > > with synchronize_srcu_expedited(), the resulting SRCU grace period > > will invoke synchronize_rcu() at least twice, as opposed to invoking > > the IPI-happy synchronize_rcu_expedited() function. (6) Just as with > > srcu_read_lock() and srcu_read_unlock(), the srcu_read_lock_lite() and > > srcu_read_unlock_lite() functions may not (repeat, *not*) be invoked > > from NMI handlers (that is what the _nmisafe() interface are for). 
> > Although one could imagine readers that were both _lite() and _nmisafe(), > > one might also imagine that the read-modify-write atomic operations that > > are needed by any NMI-safe SRCU read marker would make this unhelpful > > from a performance perspective. > > So if I'm following, this should work fine for bcachefs, and be a nifty > small perforance boost. Glad you like it! > Can I give you an account for my test cluster? If you'd like, we can > convert bcachefs to it and point it at your branch. Thank you, but I will pass on more accounts. I have a fair amount of hardware at my disposal. ;-) Thanx, Paul
Re: [PATCH rcu 0/11] Add light-weight readers for SRCU
On Tue, Sep 03, 2024 at 03:08:21PM -0700, Andrii Nakryiko wrote: > On Tue, Sep 3, 2024 at 9:32 AM Paul E. McKenney wrote: > > > > Hello! > > > > This series provides light-weight readers for SRCU. This lightness > > is selected by the caller by using the new srcu_read_lock_lite() and > > srcu_read_unlock_lite() flavors instead of the usual srcu_read_lock() and > > srcu_read_unlock() flavors. Although this passes significant rcutorture > > testing, this should still be considered to be experimental. > > > > There are a few restrictions: (1) If srcu_read_lock_lite() is called > > on a given srcu_struct structure, then no other flavor may be used on > > that srcu_struct structure, before, during, or after. (2) The _lite() > > readers may only be invoked from regions of code where RCU is watching > > (as in those regions in which rcu_is_watching() returns true). (3) > > There is no auto-expediting for srcu_struct structures that have > > been passed to _lite() readers. (4) SRCU grace periods for _lite() > > srcu_struct structures invoke synchronize_rcu() at least twice, thus > > having longer latencies than their non-_lite() counterparts. (5) Even > > with synchronize_srcu_expedited(), the resulting SRCU grace period > > will invoke synchronize_rcu() at least twice, as opposed to invoking > > the IPI-happy synchronize_rcu_expedited() function. (6) Just as with > > srcu_read_lock() and srcu_read_unlock(), the srcu_read_lock_lite() and > > srcu_read_unlock_lite() functions may not (repeat, *not*) be invoked > > from NMI handlers (that is what the _nmisafe() interface are for). > > Although one could imagine readers that were both _lite() and _nmisafe(), > > one might also imagine that the read-modify-write atomic operations that > > are needed by any NMI-safe SRCU read marker would make this unhelpful > > from a performance perspective. > > > > All that said, the patches in this series are as follows: > > > > 1. Rename srcu_might_be_idle() to srcu_should_expedite(). 
> > > > 2. Introduce srcu_gp_is_expedited() helper function. > > > > 3. Renaming in preparation for additional reader flavor. > > > > 4. Bit manipulation changes for additional reader flavor. > > > > 5. Standardize srcu_data pointers to "sdp" and similar. > > > > 6. Convert srcu_data ->srcu_reader_flavor to bit field. > > > > 7. Add srcu_read_lock_lite() and srcu_read_unlock_lite(). > > > > 8. rcutorture: Expand RCUTORTURE_RDR_MASK_[12] to eight bits. > > > > 9. rcutorture: Add reader_flavor parameter for SRCU readers. > > > > 10. rcutorture: Add srcu_read_lock_lite() support to > > rcutorture.reader_flavor. > > > > 11. refscale: Add srcu_read_lock_lite() support using "srcu-lite". > > > > Thanx, Paul > > > > Thanks Paul for working on this! > > I applied your patches on top of all my uprobe changes (including the > RFC patches that remove locks, optimize VMA to inode resolution, etc, > etc; basically the fastest uprobe/uretprobe state I can get to). And > then tested a few changes: > > - A) baseline (no SRCU-lite, RCU Tasks Trace for uprobe, normal SRCU > for uretprobes) > - B) A + SRCU-lite for uretprobes (i.e., SRCU to SRCU-lite conversion) > - C) B + RCU Tasks Trace converted to SRCU-lite > - D) I also pessimized baseline by reverting RCU Tasks Trace, so > both uprobes and uretprobes are SRCU protected. This allowed me to see > a pure gain of SRCU-lite over SRCU for uprobes, taking RCU Tasks Trace > performance out of the equation. > > In uprobes I used basically two benchmarks. One, uprobe-nop, that > benchmarks entry uprobes (which are the fastest most optimized case, > using RCU Tasks Trace in A and SRCU in D), and another that benchmarks > return uprobes (uretprobes), called uretprobe-nop, which is normal > SRCU both in A) and D). The latter uretprobe-nop benchmark basically > combines entry and return probe overheads, because that's how > uretprobes work. > > So, below are the most meaningful comparisons. 
First, SRCU vs > SRCU-lite for uretprobes: > > BASELINE (A) > > uretprobe-nop ( 1 cpus):1.941 ± 0.002M/s ( 1.941M/s/cpu) > uretprobe-nop ( 2 cpus):3.731 ± 0.001M/s ( 1.866M/s/cpu) > uretprobe-nop ( 3 cpus):5.492 ± 0.002M/s ( 1.831M/s/cpu) > uretprobe-nop ( 4 cpus):7.234 ± 0.003M/s ( 1.808M/s/cpu) > uretprobe-nop ( 8 cpus): 13.448 ± 0.098M/s ( 1.681M/s/cpu) > uretprobe-nop (16 cpus): 22.905 ± 0.009M/s ( 1.432M/s/cpu) > uretprobe-nop (32 cpus): 44.760 ± 0.069M/s ( 1.399M/s/cpu) > uretprobe-nop (40 cpus): 52.986 ± 0.104M/s ( 1.325M/s/cpu) > uretprobe-nop (64 cpus): 43.650 ± 0.435M/s ( 0.682M/s/cpu) > uretprobe-nop (80 cpus): 46.831 ± 0.938M/s ( 0.585M/s/cpu) > > SRCU-lite for uretprobe (B) > === > uretprobe-nop ( 1 cpus):2.014 ± 0.014M/s
Re: [PATCH net-next 00/11] mptcp: MIB counters for MPJ TX + misc improvements
Hello: This series was applied to netdev/net-next.git (main) by Jakub Kicinski : On Mon, 02 Sep 2024 12:45:51 +0200 you wrote: > Recently, a few issues have been discovered around the creation of > additional subflows. Without these counters, it was difficult to point > out the reason why some subflows were not created as expected. > > In patch 3, all error paths from __mptcp_subflow_connect() are covered, > except the one related to the 'fully established mode', because it can > only happen with the userspace PM, which will propagate the error to the > userspace in this case (ENOTCONN). > > [...] Here is the summary with links: - [net-next,01/11] mptcp: pm: rename helpers linked to 'flush' https://git.kernel.org/netdev/net-next/c/7bcf4d8022f9 - [net-next,02/11] mptcp: pm: reduce entries iterations on connect https://git.kernel.org/netdev/net-next/c/b83fbca1b4c9 - [net-next,03/11] mptcp: MIB counters for sent MP_JOIN https://git.kernel.org/netdev/net-next/c/1bd1788b6cab - [net-next,04/11] selftests: mptcp: join: reduce join_nr params https://git.kernel.org/netdev/net-next/c/1b2965a8cd8d - [net-next,05/11] selftests: mptcp: join: one line for join check https://git.kernel.org/netdev/net-next/c/ba8a664004da - [net-next,06/11] selftests: mptcp: join: validate MPJ SYN TX MIB counters https://git.kernel.org/netdev/net-next/c/004125c251a6 - [net-next,07/11] selftests: mptcp: join: more explicit check name https://git.kernel.org/netdev/net-next/c/6ed495345be8 - [net-next,08/11] selftests: mptcp: join: specify host being checked https://git.kernel.org/netdev/net-next/c/8d328dbcf61b - [net-next,09/11] selftests: mptcp: join: mute errors when ran in the background https://git.kernel.org/netdev/net-next/c/08eecd7e7fe7 - [net-next,10/11] selftests: mptcp: join: simplify checksum_tests https://git.kernel.org/netdev/net-next/c/0e2b4584d61a - [net-next,11/11] selftests: mptcp: pm_nl_ctl: remove re-definition https://git.kernel.org/netdev/net-next/c/38dc0708bcc8 You are awesome, 
thank you! -- Deet-doot-dot, I am a bot. https://korg.docs.kernel.org/patchwork/pwbot.html
Re: [PATCH rcu 0/11] Add light-weight readers for SRCU
On Tue, Sep 03, 2024 at 03:13:40PM GMT, Paul E. McKenney wrote: > On Tue, Sep 03, 2024 at 05:38:05PM -0400, Kent Overstreet wrote: > > On Tue, Sep 03, 2024 at 09:32:51AM GMT, Paul E. McKenney wrote: > > > Hello! > > > > > > This series provides light-weight readers for SRCU. This lightness > > > is selected by the caller by using the new srcu_read_lock_lite() and > > > srcu_read_unlock_lite() flavors instead of the usual srcu_read_lock() and > > > srcu_read_unlock() flavors. Although this passes significant rcutorture > > > testing, this should still be considered to be experimental. > > > > This avoids memory barriers, correct? > > Yes, there are no smp_mb() invocations in either srcu_read_lock_lite() > or srcu_read_unlock_lite(). As usual, nothing comes for free, so the > overhead is moved to the update side, and amplified, in the guise of > the at least two calls to synchronize_rcu(). > > > > There are a few restrictions: (1) If srcu_read_lock_lite() is called > > > on a given srcu_struct structure, then no other flavor may be used on > > > that srcu_struct structure, before, during, or after. (2) The _lite() > > > readers may only be invoked from regions of code where RCU is watching > > > (as in those regions in which rcu_is_watching() returns true). (3) > > > There is no auto-expediting for srcu_struct structures that have > > > been passed to _lite() readers. (4) SRCU grace periods for _lite() > > > srcu_struct structures invoke synchronize_rcu() at least twice, thus > > > having longer latencies than their non-_lite() counterparts. (5) Even > > > with synchronize_srcu_expedited(), the resulting SRCU grace period > > > will invoke synchronize_rcu() at least twice, as opposed to invoking > > > the IPI-happy synchronize_rcu_expedited() function. 
(6) Just as with > > > srcu_read_lock() and srcu_read_unlock(), the srcu_read_lock_lite() and > > > srcu_read_unlock_lite() functions may not (repeat, *not*) be invoked > > > from NMI handlers (that is what the _nmisafe() interface are for). > > > Although one could imagine readers that were both _lite() and _nmisafe(), > > > one might also imagine that the read-modify-write atomic operations that > > > are needed by any NMI-safe SRCU read marker would make this unhelpful > > > from a performance perspective. > > > > So if I'm following, this should work fine for bcachefs, and be a nifty > > small perforance boost. > > Glad you like it! > > > Can I give you an account for my test cluster? If you'd like, we can > > convert bcachefs to it and point it at your branch. > > Thank you, but I will pass on more accounts. I have a fair amount of > hardware at my disposal. ;-) Well - bcachefs might be a good torture test; if you patch bcachefs to use the new API point me at a branch and I'll point the CI at it
Re: [PATCH v4 0/4] selftests: Fix cpuid / vendor checking build issues
On 9/3/24 08:45, Ilpo Järvinen wrote: This series first generalizes resctrl selftest non-contiguous CAT check to not assume non-AMD vendor implies Intel. Second, it improves selftests such that the use of __cpuid_count() does not lead into a build failure (happens at least on ARM). While ARM does not currently support resctrl features, there's an ongoing work to enable resctrl support also for it on the kernel side. In any case, a common header such as kselftest.h should have a proper fallback in place for what it provides, thus it seems justified to fix this common level problem on the common level rather than e.g. disabling build for resctrl selftest for archs lacking resctrl support. I've dropped reviewed and tested by tags from the last patch in v3 due to major changes into the makefile logic. So it would be helpful if Muhammad could retest with this version. Acquiring ARCH in lib.mk will likely allow some cleanup into some subdirectory makefiles but that is left as future work because this series focuses in fixing cpuid/build. v4: - New patch to reorder x86 selftest makefile to avoid clobbering CFLAGS (would cause __cpuid_count() related build fail otherwise) I don't like the way this patch series is mushrooming. I am not convinced that changes to lib.mk and x86 Makefile are necessary. I will take a look at this to see if this can be simplified. thanks, -- Shuah
Re: [PATCH net-next 0/3] selftests: mptcp: add time per subtests in TAP output
On Mon, 02 Sep 2024 13:13:03 +0200 Matthieu Baerts (NGI0) wrote: > Patches here add 'time=ms' in the diagnostic data of the TAP output, > e.g. > > ok 1 - pm_netlink: defaults addr list # time=9ms Looking closer, this: # ok 3 - mptcp[...] MPTCP # time=7184ms # ok 4 - mptcp[...] TCP # time=6458ms Makes NIPA unhappy. The match results for regexps look like this: (None, '4', ' -', 'mptcp[...] MPTCP', ' # ', 'time=6173ms') (None, '4', ' -', 'mptcp[...] TC', None, 'P # time=6173ms') IOW the first one is neat, second one gepooped. The regex really wants there to be no more than a single space before the #. KTAP definition doesn't say that description must not have trailing white space. Best I could come up with is: diff --git a/contest/remote/vmksft-p.py b/contest/remote/vmksft-p.py index fe9e87abdb5c..a37245bd5b30 100755 --- a/contest/remote/vmksft-p.py +++ b/contest/remote/vmksft-p.py @@ -73,7 +73,7 @@ group3 testV skip tests = [] nested_tests = False -result_re = re.compile(r"(not )?ok (\d+)( -)? ([^#]*[^ ])( # )?([^ ].*)?$") +result_re = re.compile(r"(not )?ok (\d+)( -)? ([^#]*[^ ])( +# )?([^ ].*)?$") time_re = re.compile(r"time=(\d+)ms") for line in full_run.split('\n'): Thoughts?
Re: [PATCH net-next 0/3] selftests: mptcp: add time per subtests in TAP output
On Tue, 3 Sep 2024 16:22:17 -0700 Jakub Kicinski wrote: > (None, '4', ' -', 'mptcp[...] MPTCP', ' # ', 'time=6173ms') > (None, '4', ' -', 'mptcp[...] TC', None, 'P # time=6173ms') Sorry I fumbled copy/pasting the results, ignore the exact time values in the last group, but the mis-parsing stands.
Re: [PATCH net-next 2/3] sefltests: mptcp: connect: remote time in TAP output
On Mon, 02 Sep 2024 13:13:05 +0200 Matthieu Baerts (NGI0) wrote: > Subject: [PATCH net-next 2/3] sefltests: mptcp: connect: remote time in TAP > output nit: typo in the subject -- pw-bot: cr
[PATCH] mm/damon/tests/vaddr-kunit: don't use mas_lock for MM_MT_FLAGS-initialized maple tree
damon_test_three_regions_in_vmas() initializes a maple tree with MM_MT_FLAGS. The flags contains MT_FLAGS_LOCK_EXTERN, which means mt_lock of the maple tree will not be used. And therefore the maple tree initialization code skips initialization of the mt_lock. However, __link_vmas(), which adds vmas for test to the maple tree, uses the mt_lock. In other words, the uninitialized spinlock is used. The problem becomes clear when spinlock debugging is turned on, since it reports spinlock bad magic bug. Fix the issue by not using the mt_lock as promised. Reported-by: Guenter Roeck Closes: https://lore.kernel.org/1453b2b2-6119-4082-ad9e-f3c5239bf...@roeck-us.net Fixes: d0cf3dd47f0d ("damon: convert __damon_va_three_regions to use the VMA iterator") Signed-off-by: SeongJae Park --- mm/damon/tests/vaddr-kunit.h | 10 +++--- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index 83626483f82b..c6c7e0e0ab07 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -17,23 +17,19 @@ static int __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, ssize_t nr_vmas) { - int i, ret = -ENOMEM; + int i; MA_STATE(mas, mt, 0, 0); if (!nr_vmas) return 0; - mas_lock(&mas); for (i = 0; i < nr_vmas; i++) { mas_set_range(&mas, vmas[i].vm_start, vmas[i].vm_end - 1); if (mas_store_gfp(&mas, &vmas[i], GFP_KERNEL)) - goto failed; + return -ENOMEM; } - ret = 0; -failed: - mas_unlock(&mas); - return ret; + return 0; } /* -- 2.39.2
Re: [PATCH] mm/damon/tests/vaddr-kunit: don't use mas_lock for MM_MT_FLAGS-initialized maple tree
* SeongJae Park [240903 20:45]: > damon_test_three_regions_in_vmas() initializes a maple tree with > MM_MT_FLAGS. The flags contains MT_FLAGS_LOCK_EXTERN, which means > mt_lock of the maple tree will not be used. And therefore the maple > tree initialization code skips initialization of the mt_lock. However, > __link_vmas(), which adds vmas for test to the maple tree, uses the > mt_lock. In other words, the uninitialized spinlock is used. The > problem becomes celar when spinlock debugging is turned on, since it > reports spinlock bad magic bug. Fix the issue by not using the mt_lock > as promised. You can't do this, lockdep will tell you this is wrong. We need a lock and to use the lock for writes. I'd suggest using different flags so the spinlock is used. > > Reported-by: Guenter Roeck > Closes: > https://lore.kernel.org/1453b2b2-6119-4082-ad9e-f3c5239bf...@roeck-us.net > Fixes: d0cf3dd47f0d ("damon: convert __damon_va_three_regions to use the VMA > iterator") > Signed-off-by: SeongJae Park > --- > mm/damon/tests/vaddr-kunit.h | 10 +++--- > 1 file changed, 3 insertions(+), 7 deletions(-) > > diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h > index 83626483f82b..c6c7e0e0ab07 100644 > --- a/mm/damon/tests/vaddr-kunit.h > +++ b/mm/damon/tests/vaddr-kunit.h > @@ -17,23 +17,19 @@ > static int __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, > ssize_t nr_vmas) > { > - int i, ret = -ENOMEM; > + int i; > MA_STATE(mas, mt, 0, 0); > > if (!nr_vmas) > return 0; > > - mas_lock(&mas); > for (i = 0; i < nr_vmas; i++) { > mas_set_range(&mas, vmas[i].vm_start, vmas[i].vm_end - 1); > if (mas_store_gfp(&mas, &vmas[i], GFP_KERNEL)) > - goto failed; > + return -ENOMEM; > } > > - ret = 0; > -failed: > - mas_unlock(&mas); > - return ret; > + return 0; > } > > /* > -- > 2.39.2 >
Re: [PATCH] mm/damon/tests/vaddr-kunit: don't use mas_lock for MM_MT_FLAGS-initialized maple tree
On Tue, 3 Sep 2024 20:48:53 -0400 "Liam R. Howlett" wrote: > * SeongJae Park [240903 20:45]: > > damon_test_three_regions_in_vmas() initializes a maple tree with > > MM_MT_FLAGS. The flags contains MT_FLAGS_LOCK_EXTERN, which means > > mt_lock of the maple tree will not be used. And therefore the maple > > tree initialization code skips initialization of the mt_lock. However, > > __link_vmas(), which adds vmas for test to the maple tree, uses the > > mt_lock. In other words, the uninitialized spinlock is used. The > > problem becomes celar when spinlock debugging is turned on, since it > > reports spinlock bad magic bug. Fix the issue by not using the mt_lock > > as promised. > > You can't do this, lockdep will tell you this is wrong. Hmm, but lockdep was silence on my setup? > We need a lock and to use the lock for writes. This code is executed by a single-thread test code. Do we still need the lock? > > I'd suggest using different flags so the spinlock is used. The reporter mentioned simply dropping MT_FLAGS_LOCK_EXTERN from the flags causes suspicious RCU usage message. May I ask if you have a suggestion of better flags? 
Thanks, SJ > > > > > Reported-by: Guenter Roeck > > Closes: > > https://lore.kernel.org/1453b2b2-6119-4082-ad9e-f3c5239bf...@roeck-us.net > > Fixes: d0cf3dd47f0d ("damon: convert __damon_va_three_regions to use the > > VMA iterator") > > Signed-off-by: SeongJae Park > > --- > > mm/damon/tests/vaddr-kunit.h | 10 +++--- > > 1 file changed, 3 insertions(+), 7 deletions(-) > > > > diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h > > index 83626483f82b..c6c7e0e0ab07 100644 > > --- a/mm/damon/tests/vaddr-kunit.h > > +++ b/mm/damon/tests/vaddr-kunit.h > > @@ -17,23 +17,19 @@ > > static int __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, > > ssize_t nr_vmas) > > { > > - int i, ret = -ENOMEM; > > + int i; > > MA_STATE(mas, mt, 0, 0); > > > > if (!nr_vmas) > > return 0; > > > > - mas_lock(&mas); > > for (i = 0; i < nr_vmas; i++) { > > mas_set_range(&mas, vmas[i].vm_start, vmas[i].vm_end - 1); > > if (mas_store_gfp(&mas, &vmas[i], GFP_KERNEL)) > > - goto failed; > > + return -ENOMEM; > > } > > > > - ret = 0; > > -failed: > > - mas_unlock(&mas); > > - return ret; > > + return 0; > > } > > > > /* > > -- > > 2.39.2 > >
Re: [PATCH] mm/damon/tests/vaddr-kunit: don't use mas_lock for MM_MT_FLAGS-initialized maple tree
On Tue, 3 Sep 2024 17:58:15 -0700 SeongJae Park wrote: > On Tue, 3 Sep 2024 20:48:53 -0400 "Liam R. Howlett" > wrote: > > > * SeongJae Park [240903 20:45]: > > > damon_test_three_regions_in_vmas() initializes a maple tree with > > > MM_MT_FLAGS. The flags contains MT_FLAGS_LOCK_EXTERN, which means > > > mt_lock of the maple tree will not be used. And therefore the maple > > > tree initialization code skips initialization of the mt_lock. However, > > > __link_vmas(), which adds vmas for test to the maple tree, uses the > > > mt_lock. In other words, the uninitialized spinlock is used. The > > > problem becomes celar when spinlock debugging is turned on, since it > > > reports spinlock bad magic bug. Fix the issue by not using the mt_lock > > > as promised. > > > > You can't do this, lockdep will tell you this is wrong. > > Hmm, but lockdep was silence on my setup? > > > We need a lock and to use the lock for writes. > > This code is executed by a single-thread test code. Do we still need the > lock? > > > > > I'd suggest using different flags so the spinlock is used. > > The reporter mentioned simply dropping MT_FLAGS_LOCK_EXTERN from the flags > causes suspicious RCU usage message. May I ask if you have a suggestion of > better flags? I was actually thinking replacing the mt_init_flags() with mt_init(), which same to mt_init_flags() with zero flag, like below. ``` --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -77,7 +77,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, }; - mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); + mt_init(&mm.mm_mt); if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas))) kunit_skip(test, "Failed to create VMA tree"); ``` And just confirmed it also convinces the reproducer. But because I'm obviously not familiar with maple tree, would like to hear some comments from Liam or others first. 
FYI, I ended up writing v1 to simply remove lock usage based on my humble understanding of the documentation. The maple tree uses a spinlock by default, but external locks can be used for tree updates as well. To use an external lock, the tree must be initialized with the ``MT_FLAGS_LOCK_EXTERN flag``, this is usually done with the MTREE_INIT_EXT() #define, which takes an external lock as an argument. (from Documentation/core-api/maple_tree.rst) I was thinking that the fact that the test code is executed in a single thread is the same as using an external lock. I will be happy to learn if I missed something. Thanks, SJ > > > Thanks, > SJ > > > > > > > > > Reported-by: Guenter Roeck > > > Closes: > > > https://lore.kernel.org/1453b2b2-6119-4082-ad9e-f3c5239bf...@roeck-us.net > > > Fixes: d0cf3dd47f0d ("damon: convert __damon_va_three_regions to use the > > > VMA iterator") > > > Signed-off-by: SeongJae Park > > > --- > > > mm/damon/tests/vaddr-kunit.h | 10 +++--- > > > 1 file changed, 3 insertions(+), 7 deletions(-) > > > > > > diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h > > > index 83626483f82b..c6c7e0e0ab07 100644 > > > --- a/mm/damon/tests/vaddr-kunit.h > > > +++ b/mm/damon/tests/vaddr-kunit.h > > > @@ -17,23 +17,19 @@ > > > static int __link_vmas(struct maple_tree *mt, struct vm_area_struct > > > *vmas, > > > ssize_t nr_vmas) > > > { > > > - int i, ret = -ENOMEM; > > > + int i; > > > MA_STATE(mas, mt, 0, 0); > > > > > > if (!nr_vmas) > > > return 0; > > > > > > - mas_lock(&mas); > > > for (i = 0; i < nr_vmas; i++) { > > > mas_set_range(&mas, vmas[i].vm_start, vmas[i].vm_end - 1); > > > if (mas_store_gfp(&mas, &vmas[i], GFP_KERNEL)) > > > - goto failed; > > > + return -ENOMEM; > > > } > > > > > > - ret = 0; > > > -failed: > > > - mas_unlock(&mas); > > > - return ret; > > > + return 0; > > > } > > > > > > /* > > > -- > > > 2.39.2 > > >
Re: [PATCH] mm/damon/tests/vaddr-kunit: don't use mas_lock for MM_MT_FLAGS-initialized maple tree
On 9/3/24 17:58, SeongJae Park wrote: On Tue, 3 Sep 2024 20:48:53 -0400 "Liam R. Howlett" wrote: * SeongJae Park [240903 20:45]: damon_test_three_regions_in_vmas() initializes a maple tree with MM_MT_FLAGS. The flags contains MT_FLAGS_LOCK_EXTERN, which means mt_lock of the maple tree will not be used. And therefore the maple tree initialization code skips initialization of the mt_lock. However, __link_vmas(), which adds vmas for test to the maple tree, uses the mt_lock. In other words, the uninitialized spinlock is used. The problem becomes celar when spinlock debugging is turned on, since it Just in case you need to resend: s/celar/clear/ reports spinlock bad magic bug. Fix the issue by not using the mt_lock as promised. You can't do this, lockdep will tell you this is wrong. Hmm, but lockdep was silence on my setup? We need a lock and to use the lock for writes. This code is executed by a single-thread test code. Do we still need the lock? I'd suggest using different flags so the spinlock is used. The reporter mentioned simply dropping MT_FLAGS_LOCK_EXTERN from the flags causes suspicious RCU usage message. May I ask if you have a suggestion of better flags? Correct. I don't see those messages with your patch. From my perspective, this is Tested-by: Guenter Roeck Thanks, Guenter
Re: [PATCH] x86/sgx: Fix deadloop in __sgx_alloc_epc_page()
On Tue, Sep 03, 2024 at 07:05:40PM +0300, Jarkko Sakkinen wrote: > On Fri Aug 30, 2024 at 9:14 AM EEST, Aaron Lu wrote: > > On Thu, Aug 29, 2024 at 07:44:13PM +0300, Jarkko Sakkinen wrote: > > > On Thu Aug 29, 2024 at 5:38 AM EEST, Aaron Lu wrote: > > > > When current node doesn't have a EPC section configured by firmware and > > > > all other EPC sections memory are used up, CPU can stuck inside the > > > > while loop in __sgx_alloc_epc_page() forever and soft lockup will > > > > happen. > > > > Note how nid_of_current will never equal to nid in that while loop > > > > because > > > > > > > > > Oh *that* while loop ;-) Please be more specific. > > > > What about: > > Note how nid_of_current will never be equal to nid in the while loop that > > searches an available EPC page from remote nodes because nid_of_current is > > not set in sgx_numa_mask. > > That would work I think! While rewriting the changelog, I find it more natural to explain this "while loop" when I first mentioned it, i.e. When the current node doesn't have an EPC section configured by firmware and all other EPC sections are used up, CPU can get stuck inside the while loop that looks for an available EPC page from remote nodes indefinitely, leading to a soft lockup. Note how nid_of_current will never be equal to nid in that while loop because nid_of_current is not set in sgx_numa_mask. I hope this looks fine to you. > > > > > > nid_of_current is not set in sgx_numa_mask. > > > > > > > > Also worth mentioning is that it's perfectly fine for firmware to not > > > > seup an EPC section on a node. Setting an EPC section on each node can > > > > be good for performance but that's not a requirement functionality wise. > > > > > > This lacks any description of what is done to __sgx_alloc_epc_page(). > > > > Will add what Dave suggested on how the problem is fixed to the changelog. > > Great. I think the code change is correct reflecting these additions. 
> I'll look the next version as a whole but with high probability I can > ack that as long as the commit message has these updates. Thanks. > > > > > > > > > > Fixes: 901ddbb9ecf5 ("x86/sgx: Add a basic NUMA allocation scheme to > > > > sgx_alloc_epc_page()") > > > > Reported-by: Zhimin Luo > > > > Tested-by: Zhimin Luo > > > > Signed-off-by: Aaron Lu > > > > Thanks, > > Aaron > > BR, Jarkko
[PATCH] selftest: drivers: Add support to check duplicate hwirq
Validate there are no duplicate hwirq from the irq debug file system /sys/kernel/debug/irq/irqs/* per chip name. One example log show 2 duplicated hwirq in the irq debug file system. $ sudo cat /sys/kernel/debug/irq/irqs/163 handler: handle_fasteoi_irq device: 0019:00:00.0 node: 1 affinity: 72-143 effectiv: 76 domain: irqchip@0x10002204-3 hwirq: 0xc800 chip:ITS-MSI flags: 0x20 $ sudo cat /sys/kernel/debug/irq/irqs/174 handler: handle_fasteoi_irq device: 0039:00:00.0 node: 3 affinity: 216-287 effectiv: 221 domain: irqchip@0x30002204-3 hwirq: 0xc800 chip:ITS-MSI flags: 0x20 The irq-check.sh can help to collect hwirq and chip name from /sys/kernel/debug/irq/irqs/* and print error log when find duplicate hwirq per chip name. Kernel patch ("PCI/MSI: Fix MSI hwirq truncation") [1] fix above issue. [1]: https://lore.kernel.org/all/20240115135649.708536-1-vid...@nvidia.com/ Signed-off-by: Joseph Jang Reviewed-by: Matthew R. Ochs --- tools/testing/selftests/drivers/irq/Makefile | 5 +++ tools/testing/selftests/drivers/irq/config| 2 + .../selftests/drivers/irq/irq-check.sh| 39 +++ 3 files changed, 46 insertions(+) create mode 100644 tools/testing/selftests/drivers/irq/Makefile create mode 100644 tools/testing/selftests/drivers/irq/config create mode 100755 tools/testing/selftests/drivers/irq/irq-check.sh diff --git a/tools/testing/selftests/drivers/irq/Makefile b/tools/testing/selftests/drivers/irq/Makefile new file mode 100644 index ..d6998017c861 --- /dev/null +++ b/tools/testing/selftests/drivers/irq/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_PROGS := irq-check.sh + +include ../../lib.mk diff --git a/tools/testing/selftests/drivers/irq/config b/tools/testing/selftests/drivers/irq/config new file mode 100644 index ..a53d3b713728 --- /dev/null +++ b/tools/testing/selftests/drivers/irq/config @@ -0,0 +1,2 @@ +CONFIG_GENERIC_IRQ_DEBUGFS=y +CONFIG_GENERIC_IRQ_INJECTION=y diff --git a/tools/testing/selftests/drivers/irq/irq-check.sh 
b/tools/testing/selftests/drivers/irq/irq-check.sh new file mode 100755 index ..e784777043a1 --- /dev/null +++ b/tools/testing/selftests/drivers/irq/irq-check.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This script need root permission +uid=$(id -u) +if [ $uid -ne 0 ]; then + echo "SKIP: Must be run as root" + exit 4 +fi + +# Ensure debugfs is mounted +mount -t debugfs nodev /sys/kernel/debug 2>/dev/null +if [ ! -d "/sys/kernel/debug/irq/irqs" ]; then + echo "SKIP: irq debugfs not found" + exit 4 +fi + +# Traverse the irq debug file system directory to collect chip_name and hwirq +hwirq_list=$(for irq_file in /sys/kernel/debug/irq/irqs/*; do + # Read chip name and hwirq from the irq_file + chip_name=$(cat "$irq_file" | grep -m 1 'chip:' | awk '{print $2}') + hwirq=$(cat "$irq_file" | grep -m 1 'hwirq:' | awk '{print $2}' ) + + if [ -z "$chip_name" ] || [ -z "$hwirq" ]; then + continue + fi + + echo "$chip_name $hwirq" +done) + +dup_hwirq_list=$(echo "$hwirq_list" | sort | uniq -cd) + +if [ -n "$dup_hwirq_list" ]; then + echo "ERROR: Found duplicate hwirq" + echo "$dup_hwirq_list" + exit 1 +fi + +exit 0 -- 2.34.1
[PATCH v2] selftests: net: convert comma to semicolon
Replace comma between expressions with semicolons. Using a ',' in place of a ';' can have unintended side effects. Although that is not the case here, it seems best to use ';' unless ',' is intended. Found by inspection. No functional change intended. Compile tested only. Signed-off-by: Chen Ni --- Changelog: v1 -> v2: 1. Update commit message. --- tools/testing/selftests/net/psock_fanout.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c index 1a736f700be4..4f31e92ebd96 100644 --- a/tools/testing/selftests/net/psock_fanout.c +++ b/tools/testing/selftests/net/psock_fanout.c @@ -165,9 +165,9 @@ static void sock_fanout_set_ebpf(int fd) attr.insns = (unsigned long) prog; attr.insn_cnt = ARRAY_SIZE(prog); attr.license = (unsigned long) "GPL"; - attr.log_buf = (unsigned long) log_buf, - attr.log_size = sizeof(log_buf), - attr.log_level = 1, + attr.log_buf = (unsigned long) log_buf; + attr.log_size = sizeof(log_buf); + attr.log_level = 1; pfd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); if (pfd < 0) { -- 2.25.1
Re: [PATCH v4 2/2] livepatch: Add using attribute to klp_func for using function show
> On Aug 28, 2024, at 10:23, Wardenjohn wrote: > > One system may contain more than one livepatch module. We can see > which patch is enabled. If some patches applied to one system > modifying the same function, livepatch will use the function enabled > on top of the function stack. However, we can not exactly know > which function of which patch is now enabling. > > This patch introduces one sysfs attribute of "using" to klp_func. > For example, if there are several patches make changes to function > "meminfo_proc_show", the attribute "enabled" of all the patch is 1. > With this attribute, we can easily know the version enabling belongs > to which patch. > > The "using" is set as three states. 0 is disabled, it means that this > version of function is not used. 1 is running, it means that this > version of function is now running. -1 is unknown, it means that > this version of function is under transition, some task is still > changing their running version of this function. > > cat /sys/kernel/livepatchusing -> 0 > means that the function1 of patch1 is disabled. > > cat /sys/kernel/livepatchusing -> 1 > means that the function1 of patchN is enabled. > > cat /sys/kernel/livepatchusing -> -1 > means that the function1 of patchN is under transition and unknown. > > Signed-off-by: Wardenjohn > > Hi, Maintainers. How about this patch? I am looking for your suggestions. Regards, Wardenjohn.
Re: [PATCH] mm/damon/tests/vaddr-kunit: don't use mas_lock for MM_MT_FLAGS-initialized maple tree
On 9/3/24 18:18, SeongJae Park wrote: On Tue, 3 Sep 2024 17:58:15 -0700 SeongJae Park wrote: On Tue, 3 Sep 2024 20:48:53 -0400 "Liam R. Howlett" wrote: * SeongJae Park [240903 20:45]: damon_test_three_regions_in_vmas() initializes a maple tree with MM_MT_FLAGS. The flags contains MT_FLAGS_LOCK_EXTERN, which means mt_lock of the maple tree will not be used. And therefore the maple tree initialization code skips initialization of the mt_lock. However, __link_vmas(), which adds vmas for test to the maple tree, uses the mt_lock. In other words, the uninitialized spinlock is used. The problem becomes celar when spinlock debugging is turned on, since it reports spinlock bad magic bug. Fix the issue by not using the mt_lock as promised. You can't do this, lockdep will tell you this is wrong. Hmm, but lockdep was silence on my setup? We need a lock and to use the lock for writes. This code is executed by a single-thread test code. Do we still need the lock? I'd suggest using different flags so the spinlock is used. The reporter mentioned simply dropping MT_FLAGS_LOCK_EXTERN from the flags causes suspicious RCU usage message. May I ask if you have a suggestion of better flags? I was actually thinking replacing the mt_init_flags() with mt_init(), which same to mt_init_flags() with zero flag, like below. ``` --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -77,7 +77,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, }; - mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); + mt_init(&mm.mm_mt); if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas))) kunit_skip(test, "Failed to create VMA tree"); ``` And just confirmed it also convinces the reproducer. But because I'm obviously not familiar with maple tree, would like to hear some comments from Liam or others first. Same here. That is why I gave up after trying MT_FLAGS_ALLOC_RANGE and "MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU". 
After all, I really don't know what I am doing and was just playing around ... and there isn't really a good explanation why initializing the maple tree with MT_FLAGS_ALLOC_RANGE (but not MT_FLAGS_USE_RCU) would trigger rcu warnings. Guenter
Re: [RFC 05/31] x86/compiler: Tweak __UNIQUE_ID naming
On Tue, Sep 03, 2024 at 09:56:34AM +0200, Peter Zijlstra wrote: > > -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), > > __COUNTER__) > > +/* Format: __UNIQUE_ID__<__COUNTER__> */ > > +#define __UNIQUE_ID(name) \ > > + __PASTE(__UNIQUE_ID_, \ > > + __PASTE(name, \ > > + __PASTE(_, __COUNTER__))) > > OK, that's just painful to read; how about so? > > __PASTE(__UNIQUE_ID_, \ > __PASTE(name, \ > __PASTE(_, __COUNTER))) Sure. > > /** > > * data_race - mark an expression as containing intentional data races > > @@ -218,7 +222,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, > > int val, > > */ > > #define ___ADDRESSABLE(sym, __attrs) \ > > static void * __used __attrs \ > > - __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; > > + __UNIQUE_ID(__PASTE(addressable_, sym)) = (void *)(uintptr_t)&sym; > > This change doesn't get mention ? Hm, I have no idea why I did that... I'll drop it. -- Josh
Re: [RFC 07/31] kbuild: Remove "kmod" prefix from __KBUILD_MODNAME
On Tue, Sep 03, 2024 at 09:58:13AM +0200, Peter Zijlstra wrote: > On Mon, Sep 02, 2024 at 08:59:50PM -0700, Josh Poimboeuf wrote: > > Remove the arbitrary "kmod" prefix from __KBUILD_MODNAME and add it back > > manually in the __initcall_id() macro. > > > > This makes it more consistent, now __KBUILD_MODNAME is just the > > non-stringified version of KBUILD_MODNAME. It will come in handy for > > the upcoming "objtool klp diff". > > > > Signed-off-by: Josh Poimboeuf > > --- > > include/linux/init.h | 3 ++- > > scripts/Makefile.lib | 2 +- > > 2 files changed, 3 insertions(+), 2 deletions(-) > > > > diff --git a/include/linux/init.h b/include/linux/init.h > > index 58cef4c2e59a..b1921f4a7b7e 100644 > > --- a/include/linux/init.h > > +++ b/include/linux/init.h > > @@ -206,12 +206,13 @@ extern struct module __this_module; > > > > /* Format: */ > > #define __initcall_id(fn) \ > > + __PASTE(kmod_, \ > > __PASTE(__KBUILD_MODNAME, \ > > __PASTE(__, \ > > __PASTE(__COUNTER__,\ > > __PASTE(_, \ > > __PASTE(__LINE__, \ > > - __PASTE(_, fn)) > > + __PASTE(_, fn))) > > :-( Yeah, I was just keeping the existing format here. But actually, I strongly prefer it compared to this: /* Format: */ #define __initcall_id(fn) \ __PASTE(kmod_, \ __PASTE(__KBUILD_MODNAME, \ __PASTE(__, \ __PASTE(__COUNTER__,\ __PASTE(_, \ __PASTE(__LINE__, \ __PASTE(_, fn))) That gives headaches. Sure, the vertically aligned version is a bit unorthodox but it *much* easier to read if you know how to read it: just scan down. And the "Format:" comment at the top clarifies the result. -- Josh
Re: [RFC 20/31] objtool: Add UD1 detection
On Tue, Sep 03, 2024 at 10:17:48AM +0200, Peter Zijlstra wrote: > On Mon, Sep 02, 2024 at 09:00:03PM -0700, Josh Poimboeuf wrote: > > A UD1 isn't a BUG and shouldn't be treated like one. > > > > Signed-off-by: Josh Poimboeuf > > --- > > tools/objtool/arch/x86/decode.c | 10 +- > > 1 file changed, 9 insertions(+), 1 deletion(-) > > > > diff --git a/tools/objtool/arch/x86/decode.c > > b/tools/objtool/arch/x86/decode.c > > index 6b34b058a821..72d55dcd3d7f 100644 > > --- a/tools/objtool/arch/x86/decode.c > > +++ b/tools/objtool/arch/x86/decode.c > > @@ -528,11 +528,19 @@ int arch_decode_instruction(struct objtool_file > > *file, const struct section *sec > > /* sysenter, sysret */ > > insn->type = INSN_CONTEXT_SWITCH; > > > > - } else if (op2 == 0x0b || op2 == 0xb9) { > > + } else if (op2 == 0x0b) { > > > > /* ud2 */ > > insn->type = INSN_BUG; > > > > + } else if (op2 == 0xb9) { > > + > > + /* > > +* ud1 - only used for the static call trampoline to > > +* stop speculation. Basically used like an int3. > > +*/ > > + insn->type = INSN_TRAP; > > + > > } else if (op2 == 0x0d || op2 == 0x1f) { > > > > /* nopl/nopw */ > > We recently grew more UD1 usage... > > --- > commit 7424fc6b86c8980a87169e005f5cd4438d18efe6 > Author: Gatlin Newhouse > Date: Wed Jul 24 00:01:55 2024 + Interesting, thanks. I'm having a senior moment as I can't remember why this patch exists -- maybe it was code inspection? My patch description is awful. I'll just drop it for now. -- Josh
Re: [PATCH 5/5] KVM: VMX: Always honor guest PAT on CPUs that support self-snoop
On Tue, Sep 03, 2024 at 06:20:27PM +0200, Vitaly Kuznetsov wrote: > Sean Christopherson writes: > > > On Mon, Sep 02, 2024, Vitaly Kuznetsov wrote: > >> FWIW, I use QEMU-9.0 from the same C10S (qemu-kvm-9.0.0-7.el10.x86_64) > >> but I don't think it matters in this case. My CPU is "Intel(R) Xeon(R) > >> Silver 4410Y". > > > > Has this been reproduced on any other hardware besides SPR? I.e. did we > > stumble > > on another hardware issue? > > Very possible, as according to Yan Zhao this doesn't reproduce on at > least "Coffee Lake-S". Let me try to grab some random hardware around > and I'll be back with my observations. Update some new findings from my side: BAR 0 of bochs VGA (fb_map) is used for frame buffer, covering phys range from 0xfd00 to 0xfe00. On "Sapphire Rapids XCC": 1. If KVM forces this fb_map range to be WC+IPAT, installer/gdm can launch correctly. i.e. if (gfn >= 0xfd000 && gfn < 0xfe000) { return (MTRR_TYPE_WRCOMB << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; } return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT; 2. If KVM forces this fb_map range to be UC+IPAT, installer failes to show / gdm restarts endlessly. (though on Coffee Lake-S, installer/gdm can launch correctly in this case). 3. On starting GDM, ttm_kmap_iter_linear_io_init() in guest is called to set this fb_map range as WC, with iosys_map_set_vaddr_iomem(&iter_io->dmap, ioremap_wc(mem->bus.offset, mem->size)); However, during bochs_pci_probe()-->bochs_load()-->bochs_hw_init(), pfns for this fb_map has been reserved as uc- by ioremap(). Then, the ioremap_wc() during starting GDM will only map guest PAT with UC-. So, with KVM setting WB (no IPAT) to this fb_map range, the effective memory type is UC- and installer/gdm restarts endlessly. 4. If KVM sets WB (no IPAT) to this fb_map range, and changes guest bochs driver to call ioremap_wc() instead in bochs_hw_init(), gdm can launch correctly. (didn't verify the installer's case as I can't update the driver in that case). 
The reason is that the ioremap_wc() called while starting GDM will no longer encounter a conflict and can map guest PAT as WC. WIP to find out why effective UC in the fb_map range will make gdm restart endlessly.
Re: [PATCH] mm/damon/tests/vaddr-kunit: don't use mas_lock for MM_MT_FLAGS-initialized maple tree
* SeongJae Park [240903 21:18]: > On Tue, 3 Sep 2024 17:58:15 -0700 SeongJae Park wrote: > > > On Tue, 3 Sep 2024 20:48:53 -0400 "Liam R. Howlett" > > wrote: > > > > > * SeongJae Park [240903 20:45]: > > > > damon_test_three_regions_in_vmas() initializes a maple tree with > > > > MM_MT_FLAGS. The flags contains MT_FLAGS_LOCK_EXTERN, which means > > > > mt_lock of the maple tree will not be used. And therefore the maple > > > > tree initialization code skips initialization of the mt_lock. However, > > > > __link_vmas(), which adds vmas for test to the maple tree, uses the > > > > mt_lock. In other words, the uninitialized spinlock is used. The > > > > problem becomes celar when spinlock debugging is turned on, since it > > > > reports spinlock bad magic bug. Fix the issue by not using the mt_lock > > > > as promised. > > > > > > You can't do this, lockdep will tell you this is wrong. > > > > Hmm, but lockdep was silence on my setup? > > > > > We need a lock and to use the lock for writes. > > > > This code is executed by a single-thread test code. Do we still need the > > lock? > > > > > > > > I'd suggest using different flags so the spinlock is used. > > > > The reporter mentioned simply dropping MT_FLAGS_LOCK_EXTERN from the flags > > causes suspicious RCU usage message. May I ask if you have a suggestion of > > better flags? That would be the lockdep complaining, so that's good. > > I was actually thinking replacing the mt_init_flags() with mt_init(), which > same to mt_init_flags() with zero flag, like below. Yes. This will use the spinlock which should fix your issue, but it will use a different style of maple tree. Perhaps use MT_FLAGS_ALLOC_RANGE to use the same type of maple tree, if you ever add threading you will want the rcu flag as well (MT_FLAGS_USE_RCU). I would recommend those two and just use the spinlock. 
> > ``` > --- a/mm/damon/tests/vaddr-kunit.h > +++ b/mm/damon/tests/vaddr-kunit.h > @@ -77,7 +77,7 @@ static void damon_test_three_regions_in_vmas(struct kunit > *test) > (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, > }; > > - mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); > + mt_init(&mm.mm_mt); > if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas))) > kunit_skip(test, "Failed to create VMA tree"); > ``` > > And just confirmed it also convinces the reproducer. But because I'm > obviously > not familiar with maple tree, would like to hear some comments from Liam or > others first. > > FYI, I ended up writing v1 to simply remove lock usage based on my humble > understanding of the documetnation. > > The maple tree uses a spinlock by default, but external locks can be used > for > tree updates as well. To use an external lock, the tree must be > initialized > with the ``MT_FLAGS_LOCK_EXTERN flag``, this is usually done with the > MTREE_INIT_EXT() #define, which takes an external lock as an argument. > > (from Documentation/core-api/maple_tree.rst) > > I was thinking the fact that the test code is executed in single thread is > same > to use of external lock. I will be happy to learn if I missed something. 
> > > Thanks, > SJ > > > > > > > Thanks, > > SJ > > > > > > > > > > > > > Reported-by: Guenter Roeck > > > > Closes: > > > > https://lore.kernel.org/1453b2b2-6119-4082-ad9e-f3c5239bf...@roeck-us.net > > > > Fixes: d0cf3dd47f0d ("damon: convert __damon_va_three_regions to use > > > > the VMA iterator") > > > > Signed-off-by: SeongJae Park > > > > --- > > > > mm/damon/tests/vaddr-kunit.h | 10 +++--- > > > > 1 file changed, 3 insertions(+), 7 deletions(-) > > > > > > > > diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h > > > > index 83626483f82b..c6c7e0e0ab07 100644 > > > > --- a/mm/damon/tests/vaddr-kunit.h > > > > +++ b/mm/damon/tests/vaddr-kunit.h > > > > @@ -17,23 +17,19 @@ > > > > static int __link_vmas(struct maple_tree *mt, struct vm_area_struct > > > > *vmas, > > > > ssize_t nr_vmas) > > > > { > > > > - int i, ret = -ENOMEM; > > > > + int i; > > > > MA_STATE(mas, mt, 0, 0); > > > > > > > > if (!nr_vmas) > > > > return 0; > > > > > > > > - mas_lock(&mas); > > > > for (i = 0; i < nr_vmas; i++) { > > > > mas_set_range(&mas, vmas[i].vm_start, vmas[i].vm_end - > > > > 1); > > > > if (mas_store_gfp(&mas, &vmas[i], GFP_KERNEL)) > > > > - goto failed; > > > > + return -ENOMEM; > > > > } > > > > > > > > - ret = 0; > > > > -failed: > > > > - mas_unlock(&mas); > > > > - return ret; > > > > + return 0; > > > > } > > > > > > > > /* > > > > -- > > > > 2.39.2 > > > >
Re: [PATCH] mm/damon/tests/vaddr-kunit: don't use mas_lock for MM_MT_FLAGS-initialized maple tree
On 9/3/24 19:31, Liam R. Howlett wrote: * SeongJae Park [240903 21:18]: On Tue, 3 Sep 2024 17:58:15 -0700 SeongJae Park wrote: On Tue, 3 Sep 2024 20:48:53 -0400 "Liam R. Howlett" wrote: * SeongJae Park [240903 20:45]: damon_test_three_regions_in_vmas() initializes a maple tree with MM_MT_FLAGS. The flags contains MT_FLAGS_LOCK_EXTERN, which means mt_lock of the maple tree will not be used. And therefore the maple tree initialization code skips initialization of the mt_lock. However, __link_vmas(), which adds vmas for test to the maple tree, uses the mt_lock. In other words, the uninitialized spinlock is used. The problem becomes celar when spinlock debugging is turned on, since it reports spinlock bad magic bug. Fix the issue by not using the mt_lock as promised. You can't do this, lockdep will tell you this is wrong. Hmm, but lockdep was silence on my setup? We need a lock and to use the lock for writes. This code is executed by a single-thread test code. Do we still need the lock? I'd suggest using different flags so the spinlock is used. The reporter mentioned simply dropping MT_FLAGS_LOCK_EXTERN from the flags causes suspicious RCU usage message. May I ask if you have a suggestion of better flags? That would be the lockdep complaining, so that's good. I was actually thinking replacing the mt_init_flags() with mt_init(), which same to mt_init_flags() with zero flag, like below. Yes. This will use the spinlock which should fix your issue, but it will use a different style of maple tree. Perhaps use MT_FLAGS_ALLOC_RANGE to use the same type of maple tree, if you ever add threading you will want the rcu flag as well (MT_FLAGS_USE_RCU). I would recommend those two and just use the spinlock. I tried that (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU). it also triggers the suspicious RCU usage message. 
Guenter ``` --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -77,7 +77,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, }; - mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); + mt_init(&mm.mm_mt); if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas))) kunit_skip(test, "Failed to create VMA tree"); ``` And just confirmed it also convinces the reproducer. But because I'm obviously not familiar with maple tree, would like to hear some comments from Liam or others first. FYI, I ended up writing v1 to simply remove lock usage based on my humble understanding of the documetnation. The maple tree uses a spinlock by default, but external locks can be used for tree updates as well. To use an external lock, the tree must be initialized with the ``MT_FLAGS_LOCK_EXTERN flag``, this is usually done with the MTREE_INIT_EXT() #define, which takes an external lock as an argument. (from Documentation/core-api/maple_tree.rst) I was thinking the fact that the test code is executed in single thread is same to use of external lock. I will be happy to learn if I missed something. 
Thanks, SJ Thanks, SJ Reported-by: Guenter Roeck Closes: https://lore.kernel.org/1453b2b2-6119-4082-ad9e-f3c5239bf...@roeck-us.net Fixes: d0cf3dd47f0d ("damon: convert __damon_va_three_regions to use the VMA iterator") Signed-off-by: SeongJae Park --- mm/damon/tests/vaddr-kunit.h | 10 +++--- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index 83626483f82b..c6c7e0e0ab07 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -17,23 +17,19 @@ static int __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, ssize_t nr_vmas) { - int i, ret = -ENOMEM; + int i; MA_STATE(mas, mt, 0, 0); if (!nr_vmas) return 0; - mas_lock(&mas); for (i = 0; i < nr_vmas; i++) { mas_set_range(&mas, vmas[i].vm_start, vmas[i].vm_end - 1); if (mas_store_gfp(&mas, &vmas[i], GFP_KERNEL)) - goto failed; + return -ENOMEM; } - ret = 0; -failed: - mas_unlock(&mas); - return ret; + return 0; } /* -- 2.39.2
Re: [PATCH] mm/damon/tests/vaddr-kunit: don't use mas_lock for MM_MT_FLAGS-initialized maple tree
* Guenter Roeck [240903 21:54]: > On 9/3/24 18:18, SeongJae Park wrote: > > On Tue, 3 Sep 2024 17:58:15 -0700 SeongJae Park wrote: > > > > > On Tue, 3 Sep 2024 20:48:53 -0400 "Liam R. Howlett" > > > wrote: > > > > > > > * SeongJae Park [240903 20:45]: > > > > > damon_test_three_regions_in_vmas() initializes a maple tree with > > > > > MM_MT_FLAGS. The flags contains MT_FLAGS_LOCK_EXTERN, which means > > > > > mt_lock of the maple tree will not be used. And therefore the maple > > > > > tree initialization code skips initialization of the mt_lock. > > > > > However, > > > > > __link_vmas(), which adds vmas for test to the maple tree, uses the > > > > > mt_lock. In other words, the uninitialized spinlock is used. The > > > > > problem becomes celar when spinlock debugging is turned on, since it > > > > > reports spinlock bad magic bug. Fix the issue by not using the > > > > > mt_lock > > > > > as promised. > > > > > > > > You can't do this, lockdep will tell you this is wrong. > > > > > > Hmm, but lockdep was silence on my setup? > > > > > > > We need a lock and to use the lock for writes. > > > > > > This code is executed by a single-thread test code. Do we still need the > > > lock? > > > > > > > > > > > I'd suggest using different flags so the spinlock is used. > > > > > > The reporter mentioned simply dropping MT_FLAGS_LOCK_EXTERN from the flags > > > causes suspicious RCU usage message. May I ask if you have a suggestion > > > of > > > better flags? > > > > I was actually thinking replacing the mt_init_flags() with mt_init(), which > > same to mt_init_flags() with zero flag, like below. 
> > > > ``` > > --- a/mm/damon/tests/vaddr-kunit.h > > +++ b/mm/damon/tests/vaddr-kunit.h > > @@ -77,7 +77,7 @@ static void damon_test_three_regions_in_vmas(struct kunit > > *test) > > (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, > > }; > > > > - mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); > > + mt_init(&mm.mm_mt); > > if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas))) > > kunit_skip(test, "Failed to create VMA tree"); > > ``` > > > > And just confirmed it also convinces the reproducer. But because I'm > > obviously > > not familiar with maple tree, would like to hear some comments from Liam or > > others first. Again, I'd use the flags "MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU" because that gets you the gap tracking that may be necessary for tests in the future - it's closer to the MM_MT_FLAGS, so maybe some mm function you use depends on that. > > > Same here. That is why I gave up after trying MT_FLAGS_ALLOC_RANGE and > "MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU". After all, I really don't know what > I am doing and was just playing around ... and there isn't really a good > explanation why initializing the maple tree with MT_FLAGS_ALLOC_RANGE (but not > MT_FLAGS_USE_RCU) would trigger rcu warnings. Thanks, I'll add that to my list of things to do. Regards, Liam
Re: [PATCH] mm/damon/tests/vaddr-kunit: don't use mas_lock for MM_MT_FLAGS-initialized maple tree
* Guenter Roeck [240903 22:38]: > On 9/3/24 19:31, Liam R. Howlett wrote: > > * SeongJae Park [240903 21:18]: > > > On Tue, 3 Sep 2024 17:58:15 -0700 SeongJae Park wrote: > > > > > > > On Tue, 3 Sep 2024 20:48:53 -0400 "Liam R. Howlett" > > > > wrote: > > > > > > > > > * SeongJae Park [240903 20:45]: > > > > > > damon_test_three_regions_in_vmas() initializes a maple tree with > > > > > > MM_MT_FLAGS. The flags contains MT_FLAGS_LOCK_EXTERN, which means > > > > > > mt_lock of the maple tree will not be used. And therefore the maple > > > > > > tree initialization code skips initialization of the mt_lock. > > > > > > However, > > > > > > __link_vmas(), which adds vmas for test to the maple tree, uses the > > > > > > mt_lock. In other words, the uninitialized spinlock is used. The > > > > > > problem becomes celar when spinlock debugging is turned on, since it > > > > > > reports spinlock bad magic bug. Fix the issue by not using the > > > > > > mt_lock > > > > > > as promised. > > > > > > > > > > You can't do this, lockdep will tell you this is wrong. > > > > > > > > Hmm, but lockdep was silence on my setup? > > > > > > > > > We need a lock and to use the lock for writes. > > > > > > > > This code is executed by a single-thread test code. Do we still need > > > > the lock? > > > > > > > > > > > > > > I'd suggest using different flags so the spinlock is used. > > > > > > > > The reporter mentioned simply dropping MT_FLAGS_LOCK_EXTERN from the > > > > flags > > > > causes suspicious RCU usage message. May I ask if you have a > > > > suggestion of > > > > better flags? > > > > That would be the lockdep complaining, so that's good. > > > > > > > > I was actually thinking replacing the mt_init_flags() with mt_init(), > > > which > > > same to mt_init_flags() with zero flag, like below. > > > > Yes. This will use the spinlock which should fix your issue, but it > > will use a different style of maple tree. 
> > > > Perhaps use MT_FLAGS_ALLOC_RANGE to use the same type of maple tree, if > > you ever add threading you will want the rcu flag as well > > (MT_FLAGS_USE_RCU). > > > > I would recommend those two and just use the spinlock. > > > > I tried that (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU). it also triggers > the suspicious RCU usage message. Just to be clear, you changed the init flags and kept the mas_lock() and mas_unlock() ? Thanks, Liam
[PATCH] selftests: futex: Fix missing free in main
From: zhang jiao Free string allocated by asprintf(). Signed-off-by: zhang jiao --- tools/testing/selftests/futex/functional/futex_requeue_pi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi.c b/tools/testing/selftests/futex/functional/futex_requeue_pi.c index 215c6cb539b4..fb2dab46087f 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi.c @@ -416,5 +416,8 @@ int main(int argc, char *argv[]) ret = unit_test(broadcast, locked, owner, timeout_ns); print_result(test_name, ret); + if (strlen(test_name) > strlen(TEST_NAME)) + free(test_name); + return ret; } -- 2.33.0
Re: [PATCH] virtio_net: Fix mismatched buf address when unmapping for small packets
On Wed, Sep 4, 2024 at 10:51 AM Wenbo Li wrote: > > Currently, the virtio-net driver will perform a pre-dma-mapping for > small or mergeable RX buffer. But for small packets, a mismatched address > without VIRTNET_RX_PAD and xdp_headroom is used for unmapping. > > That will result in unsynchronized buffers when SWIOTLB is enabled, for > example, when running as a TDX guest. > > This patch handles small and mergeable packets separately and fixes > the mismatched buffer address. > Missing fixes tag. > Signed-off-by: Wenbo Li > Signed-off-by: Jiahui Cen > Signed-off-by: Ying Fang > --- > drivers/net/virtio_net.c | 26 +- > 1 file changed, 25 insertions(+), 1 deletion(-) > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c > index c6af18948..6215b66d8 100644 > --- a/drivers/net/virtio_net.c > +++ b/drivers/net/virtio_net.c > @@ -891,6 +891,20 @@ static void *virtnet_rq_get_buf(struct receive_queue > *rq, u32 *len, void **ctx) > return buf; > } > > +static void *virtnet_rq_get_buf_small(struct receive_queue *rq, > + u32 *len, > + void **ctx, > + unsigned int header_offset) > +{ > + void *buf; > + > + buf = virtqueue_get_buf_ctx(rq->vq, len, ctx); > + if (buf) > + virtnet_rq_unmap(rq, buf + header_offset, *len); > + > + return buf; > +} > + > static void virtnet_rq_init_one_sg(struct receive_queue *rq, void *buf, u32 > len) > { > struct virtnet_rq_dma *dma; > @@ -2692,13 +2706,23 @@ static int virtnet_receive_packets(struct > virtnet_info *vi, > int packets = 0; > void *buf; > > - if (!vi->big_packets || vi->mergeable_rx_bufs) { > + if (vi->mergeable_rx_bufs) { > void *ctx; > while (packets < budget && >(buf = virtnet_rq_get_buf(rq, &len, &ctx))) { > receive_buf(vi, rq, buf, len, ctx, xdp_xmit, stats); > packets++; > } > + } else if (!vi->big_packets) { > + void *ctx; > + unsigned int xdp_headroom = virtnet_get_headroom(vi); I wonder if this is safe. The headroom is stored as the context, it looks to me we should fetch the headroom there. 
The rx buffer could be allocated before XDP is disabled. For example we had this unsigned int xdp_headroom = (unsigned long)ctx; at the beginning of receive_small(). > + unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom; > + > + while (packets < budget && > + (buf = virtnet_rq_get_buf_small(rq, &len, &ctx, > header_offset))) { > + receive_buf(vi, rq, buf, len, ctx, xdp_xmit, stats); > + packets++; > + } > } else { > while (packets < budget && >(buf = virtqueue_get_buf(rq->vq, &len)) != NULL) { > -- > 2.20.1 > Thanks
Re: [PATCH] mm/damon/tests/vaddr-kunit: don't use mas_lock for MM_MT_FLAGS-initialized maple tree
* Guenter Roeck [240903 22:38]: > On 9/3/24 19:31, Liam R. Howlett wrote: > > * SeongJae Park [240903 21:18]: > > > On Tue, 3 Sep 2024 17:58:15 -0700 SeongJae Park wrote: > > > > > > > On Tue, 3 Sep 2024 20:48:53 -0400 "Liam R. Howlett" > > > > wrote: > > > > > > > > > * SeongJae Park [240903 20:45]: > > > > > > damon_test_three_regions_in_vmas() initializes a maple tree with > > > > > > MM_MT_FLAGS. The flags contains MT_FLAGS_LOCK_EXTERN, which means > > > > > > mt_lock of the maple tree will not be used. And therefore the maple > > > > > > tree initialization code skips initialization of the mt_lock. > > > > > > However, > > > > > > __link_vmas(), which adds vmas for test to the maple tree, uses the > > > > > > mt_lock. In other words, the uninitialized spinlock is used. The > > > > > > problem becomes clear when spinlock debugging is turned on, since it > > > > > > reports spinlock bad magic bug. Fix the issue by not using the > > > > > > mt_lock > > > > > > as promised. > > > > > > > > > > You can't do this, lockdep will tell you this is wrong. > > > > > > > > Hmm, but lockdep was silent on my setup? > > > > > > > > > We need a lock and to use the lock for writes. > > > > > > > > This code is executed by a single-thread test code. Do we still need > > > > the lock? > > > > > > > > > > > > > > I'd suggest using different flags so the spinlock is used. > > > > > > > > The reporter mentioned simply dropping MT_FLAGS_LOCK_EXTERN from the > > > > flags > > > > causes suspicious RCU usage message. May I ask if you have a > > > > suggestion of > > > > better flags? > > > > That would be the lockdep complaining, so that's good. > > > > > > > > I was actually thinking replacing the mt_init_flags() with mt_init(), > > > which > > > same to mt_init_flags() with zero flag, like below. > > > > Yes. This will use the spinlock which should fix your issue, but it > > will use a different style of maple tree. 
> > > > Perhaps use MT_FLAGS_ALLOC_RANGE to use the same type of maple tree, if > > you ever add threading you will want the rcu flag as well > > (MT_FLAGS_USE_RCU). > > > > I would recommend those two and just use the spinlock. > > > > I tried that (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU). it also triggers > the suspicious RCU usage message. > I am running ./tools/testing/kunit/kunit.py run '*damon*' --arch x86_64 --raw with: CONFIG_LOCKDEP=y CONFIG_DEBUG_SPINLOCK=y and I don't have any issue with locking in the existing code. How do I recreate this issue? Thanks, Liam
Re: [PATCH v2 2/2] vdpa: Remove ioctl VHOST_VDPA_SET_CONFIG per spec compliance
On Wed, Sep 4, 2024 at 1:15 AM Carlos Bilbao wrote: > > From: Carlos Bilbao > > Remove invalid ioctl VHOST_VDPA_SET_CONFIG and all its implementations > with vdpa_config_ops->set_config(). This is needed per virtio spec > requirements; virtio-spec v3.1 Sec 5.1.4 states that "All of the device > configuration fields are read-only for the driver." > > Signed-off-by: Carlos Bilbao Note that only the config space of the modern device is read only. So it should be fine to remove vp_vdpa which only works for modern devices. And for eni, it is a legacy only device, so we should not move the set_config there. For the rest, we need the acks for those maintainers. Thanks
Re: [PATCH v4 1/2] virtiofs: use pages instead of pointer for kernel direct IO
Hi, On 9/3/2024 4:44 PM, Jingbo Xu wrote: > > On 8/31/24 5:37 PM, Hou Tao wrote: >> From: Hou Tao >> >> When trying to insert a 10MB kernel module kept in a virtio-fs with cache >> disabled, the following warning was reported: >> SNIP >> >> Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") >> Signed-off-by: Hou Tao > Tested-by: Jingbo Xu Thanks for the test. > > >> --- >> fs/fuse/file.c | 62 +++-- >> fs/fuse/fuse_i.h| 6 + >> fs/fuse/virtio_fs.c | 1 + >> 3 files changed, 50 insertions(+), 19 deletions(-) >> >> diff --git a/fs/fuse/file.c b/fs/fuse/file.c >> index f39456c65ed7..331208d3e4d1 100644 >> --- a/fs/fuse/file.c >> +++ b/fs/fuse/file.c >> @@ -645,7 +645,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct >> file *file, loff_t pos, >> args->out_args[0].size = count; >> } >> >> - SNIP >> static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter >> *ii, >> size_t *nbytesp, int write, >> - unsigned int max_pages) >> + unsigned int max_pages, >> + bool use_pages_for_kvec_io) >> { >> +bool flush_or_invalidate = false; >> size_t nbytes = 0; /* # bytes already packed in req */ >> ssize_t ret = 0; >> >> -/* Special case for kernel I/O: can copy directly into the buffer */ >> +/* Special case for kernel I/O: can copy directly into the buffer. >> + * However if the implementation of fuse_conn requires pages instead of >> + * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead. 
>> + */ >> if (iov_iter_is_kvec(ii)) { >> -unsigned long user_addr = fuse_get_user_addr(ii); >> -size_t frag_size = fuse_get_frag_size(ii, *nbytesp); >> +void *user_addr = (void *)fuse_get_user_addr(ii); >> >> -if (write) >> -ap->args.in_args[1].value = (void *) user_addr; >> -else >> -ap->args.out_args[0].value = (void *) user_addr; >> +if (!use_pages_for_kvec_io) { >> +size_t frag_size = fuse_get_frag_size(ii, *nbytesp); >> >> -iov_iter_advance(ii, frag_size); >> -*nbytesp = frag_size; >> -return 0; >> +if (write) >> +ap->args.in_args[1].value = user_addr; >> +else >> +ap->args.out_args[0].value = user_addr; >> + >> +iov_iter_advance(ii, frag_size); >> +*nbytesp = frag_size; >> +return 0; >> +} >> + >> +if (is_vmalloc_addr(user_addr)) { >> +ap->args.vmap_base = user_addr; >> +flush_or_invalidate = true; > Could we move flush_kernel_vmap_range() upon here, so that > flush_or_invalidate is not needed anymore and the code looks cleaner? flush_kernel_vmap_range() needs to know the length of the flushed area, if moving it here(), the length will be unknown. > >> +} >> } >> >> while (nbytes < *nbytesp && ap->num_pages < max_pages) { >> @@ -1513,6 +1533,10 @@ static int fuse_get_user_pages(struct fuse_args_pages >> *ap, struct iov_iter *ii, >> (PAGE_SIZE - ret) & (PAGE_SIZE - 1); >> } >> >> +if (write && flush_or_invalidate) >> +flush_kernel_vmap_range(ap->args.vmap_base, nbytes); >> + >> +ap->args.invalidate_vmap = !write && flush_or_invalidate; > How about initializing vmap_base only when the data buffer is vmalloced > and it's a read request? In this case invalidate_vmap is no longer needed. You mean using the value of vmap_base to indicate whether invalidation is needed or not, right ? I prefer to keep it, because the extra variable invalidate_vmap indicates the required action for the vmap area and it doesn't increase the size of fuse_args. 
> >> ap->args.is_pinned = iov_iter_extract_will_pin(ii); >> ap->args.user_pages = true; >> if (write) >> @@ -1581,7 +1605,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct >> iov_iter *iter, >> size_t nbytes = min(count, nmax); >> >> err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, >> - max_pages); >> + max_pages, fc->use_pages_for_kvec_io); >> if (err && !nbytes) >> break; >> >> @@ -1595,7 +1619,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct >> iov_iter *iter, >> } >> >> if (!io->async || nres < 0) { >> -fuse_release_user_pages(&ia->ap, io->should_dirty); >> +fuse_release_user_pages(&ia->ap, nres, >> io->should_dirty); >> fuse_io_free(ia); >> } >> ia = NULL; >> diff --git a/f
Re: [PATCH v4 2/2] virtiofs: use GFP_NOFS when enqueuing request through kworker
On 9/3/2024 5:34 PM, Jingbo Xu wrote: > > On 8/31/24 5:37 PM, Hou Tao wrote: >> From: Hou Tao >> >> When invoking virtio_fs_enqueue_req() through kworker, both the >> allocation of the sg array and the bounce buffer still use GFP_ATOMIC. >> Considering the size of the sg array may be greater than PAGE_SIZE, use >> GFP_NOFS instead of GFP_ATOMIC to lower the possibility of memory >> allocation failure and to avoid unnecessarily depleting the atomic >> reserves. GFP_NOFS is not passed to virtio_fs_enqueue_req() directly, >> GFP_KERNEL and memalloc_nofs_{save|restore} helpers are used instead. >> >> It may seem OK to pass GFP_NOFS to virtio_fs_enqueue_req() as well when >> queuing the request for the first time, but this is not the case. The >> reason is that fuse_request_queue_background() may call >> ->queue_request_and_unlock() while holding fc->bg_lock, which is a >> spin-lock. Therefore, still use GFP_ATOMIC for it. > Actually, .wake_pending_and_unlock() is called under fiq->lock and > GFP_ATOMIC is requisite. Er, but virtio_fs_wake_pending_and_unlock() unlocks fiq->lock before queuing the request. 
> > >> Signed-off-by: Hou Tao >> --- >> fs/fuse/virtio_fs.c | 24 +++- >> 1 file changed, 15 insertions(+), 9 deletions(-) >> >> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c >> index 43d66ab5e891..9bc48b3ca384 100644 >> --- a/fs/fuse/virtio_fs.c >> +++ b/fs/fuse/virtio_fs.c >> @@ -95,7 +95,8 @@ struct virtio_fs_req_work { >> }; >> >> static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, >> - struct fuse_req *req, bool in_flight); >> + struct fuse_req *req, bool in_flight, >> + gfp_t gfp); >> >> static const struct constant_table dax_param_enums[] = { >> {"always", FUSE_DAX_ALWAYS }, >> @@ -439,6 +440,8 @@ static void virtio_fs_request_dispatch_work(struct >> work_struct *work) >> >> /* Dispatch pending requests */ >> while (1) { >> +unsigned int flags; >> + >> spin_lock(&fsvq->lock); >> req = list_first_entry_or_null(&fsvq->queued_reqs, >> struct fuse_req, list); >> @@ -449,7 +452,9 @@ static void virtio_fs_request_dispatch_work(struct >> work_struct *work) >> list_del_init(&req->list); >> spin_unlock(&fsvq->lock); >> >> -ret = virtio_fs_enqueue_req(fsvq, req, true); >> +flags = memalloc_nofs_save(); >> +ret = virtio_fs_enqueue_req(fsvq, req, true, GFP_KERNEL); >> +memalloc_nofs_restore(flags); >> if (ret < 0) { >> if (ret == -ENOSPC) { >> spin_lock(&fsvq->lock); >> @@ -550,7 +555,7 @@ static void virtio_fs_hiprio_dispatch_work(struct >> work_struct *work) >> } >> >> /* Allocate and copy args into req->argbuf */ >> -static int copy_args_to_argbuf(struct fuse_req *req) >> +static int copy_args_to_argbuf(struct fuse_req *req, gfp_t gfp) >> { >> struct fuse_args *args = req->args; >> unsigned int offset = 0; >> @@ -564,7 +569,7 @@ static int copy_args_to_argbuf(struct fuse_req *req) >> len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) + >>fuse_len_args(num_out, args->out_args); >> >> -req->argbuf = kmalloc(len, GFP_ATOMIC); >> +req->argbuf = kmalloc(len, gfp); >> if (!req->argbuf) >> return -ENOMEM; >> >> @@ -1239,7 +1244,8 @@ static 
unsigned int sg_init_fuse_args(struct >> scatterlist *sg, >> >> /* Add a request to a virtqueue and kick the device */ >> static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, >> - struct fuse_req *req, bool in_flight) >> + struct fuse_req *req, bool in_flight, >> + gfp_t gfp) >> { >> /* requests need at least 4 elements */ >> struct scatterlist *stack_sgs[6]; >> @@ -1260,8 +1266,8 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq >> *fsvq, >> /* Does the sglist fit on the stack? */ >> total_sgs = sg_count_fuse_req(req); >> if (total_sgs > ARRAY_SIZE(stack_sgs)) { >> -sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC); >> -sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC); >> +sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), gfp); >> +sg = kmalloc_array(total_sgs, sizeof(sg[0]), gfp); >> if (!sgs || !sg) { >> ret = -ENOMEM; >> goto out; >> @@ -1269,7 +1275,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq >> *fsvq, >> } >> >> /* Use a bounce buffer since stack args cannot be mapped */ >> -ret = copy_args_to_argbuf(req); >> +ret = copy_args_to_argbuf(req, gfp); >> if (ret < 0) >> goto out; >> >> @@ -1367,7 +1373,7 @@ __releases(fiq->lock) >>