[PATCH] module: Add CONFIG_MODULE_LOAD_IN_SEQUENCE option

2023-10-08 Thread Joey Jiao
When modprobe commands are executed one by one, the modules do not end up
loaded in a fixed sequence as expected.

Add an option to make sure modules are loaded in a fixed sequence across reboots.

Signed-off-by: Joey Jiao 
---
 kernel/module/Kconfig | 11 +++
 kernel/module/main.c  |  6 ++
 2 files changed, 17 insertions(+)

diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 33a2e991f608..b45a45f31d6d 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -389,4 +389,15 @@ config MODULES_TREE_LOOKUP
 	def_bool y
 	depends on PERF_EVENTS || TRACING || CFI_CLANG
 
+config MODULE_LOAD_IN_SEQUENCE
+	bool "Load module in sequence"
+	default n
+	help
+	  By default, modules are loaded in a random sequence depending on
+	  when modprobe is executed.
+
+	  This option makes modules load in a fixed sequence when modprobe
+	  commands are executed one by one in order. It helps syzkaller
+	  fuzzing keep each module at a fixed address across device reboots.
+
 endif # MODULES
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 98fedfdb8db5..587fd84083ae 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2593,11 +2593,17 @@ static noinline int do_init_module(struct module *mod)
 	 * be cleaned up needs to sync with the queued work - ie
 	 * rcu_barrier()
 	 */
+#ifdef CONFIG_MODULE_LOAD_IN_SEQUENCE
+	llist_add(&freeinit->node, &init_free_list);
+#else
 	if (llist_add(&freeinit->node, &init_free_list))
 		schedule_work(&init_free_wq);
+#endif
 
 	mutex_unlock(&module_mutex);
+#ifdef CONFIG_MODULE_LOAD_IN_SEQUENCE
 	wake_up_all(&module_wq);
+#endif
 
 	mod_stat_add_long(text_size, &total_text_size);
 	mod_stat_add_long(total_size, &total_mod_size);
-- 
2.42.0
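
For context on why load addresses vary at all: without this option, freeing of
each module's init sections is deferred to a workqueue, so the exact moment the
memory returns to the allocator depends on scheduling, and the next module can
land at a different address from boot to boot. A simplified sketch of that
deferred path (modeled on the do_free_init()/init_free_list machinery in
kernel/module/main.c; field names may differ slightly between kernel versions):

static void do_free_init(struct work_struct *w)
{
	struct llist_node *pos, *n, *list;

	/* Grab everything queued by do_init_module() ... */
	list = llist_del_all(&init_free_list);

	/* ... wait out any lockless walkers of the init text ... */
	synchronize_rcu();

	/* ... then hand the init sections back to the allocator. When this
	 * loop runs relative to the next modprobe is what makes the final
	 * module addresses non-deterministic. */
	llist_for_each_safe(pos, n, list) {
		struct mod_initfree *initfree =
			container_of(pos, struct mod_initfree, node);

		module_memfree(initfree->init_text);
		module_memfree(initfree->init_data);
		module_memfree(initfree->init_rodata);
		kfree(initfree);
	}
}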




Re: [PATCH net-next v7] net/core: Introduce netdev_core_stats_inc()

2023-10-08 Thread Yajun Deng



On 2023/10/8 17:12, Yajun Deng wrote:


On 2023/10/8 16:53, Eric Dumazet wrote:

On Sun, Oct 8, 2023 at 10:44 AM Yajun Deng  wrote:


On 2023/10/8 15:18, Eric Dumazet wrote:
On Sun, Oct 8, 2023 at 9:00 AM Yajun Deng  wrote:

On 2023/10/8 14:45, Eric Dumazet wrote:
On Sat, Oct 7, 2023 at 8:34 AM Yajun Deng  wrote:

On 2023/10/7 13:29, Eric Dumazet wrote:
On Sat, Oct 7, 2023 at 7:06 AM Yajun Deng  wrote:
Although there is a kfree_skb_reason() helper function that can be used to
find the reason why this skb is dropped, most callers didn't increase
one of rx_dropped, tx_dropped, rx_nohandler and rx_otherhost_dropped.



...


+
+void netdev_core_stats_inc(struct net_device *dev, u32 offset)
+{
+	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
+	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
+	unsigned long *field;
+
+	if (unlikely(!p))
+		p = netdev_core_stats_alloc(dev);
+
+	if (p) {
+		field = (unsigned long *)((void *)this_cpu_ptr(p) + offset);
+		WRITE_ONCE(*field, READ_ONCE(*field) + 1);

This is broken...

As I explained earlier, dev_core_stats_*_inc(dev) can be called from
many different contexts:

1) process contexts, where preemption and migration are allowed.
2) interrupt contexts.

Adding WRITE_ONCE()/READ_ONCE() is not solving potential races.

I _think_ I already gave you how to deal with this ?

Yes, I replied in v6.

https://lore.kernel.org/all/e25b5f3c-bd97-56f0-de86-b93a31728...@linux.dev/ 




Please try instead:

+void netdev_core_stats_inc(struct net_device *dev, u32 offset)
+{
+	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
+	struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
+	unsigned long __percpu *field;
+
+	if (unlikely(!p)) {
+		p = netdev_core_stats_alloc(dev);
+		if (!p)
+			return;
+	}
+	field = (__force unsigned long __percpu *)((__force void *)p + offset);
+	this_cpu_inc(*field);
+}
This wouldn't trace anything even when rx_dropped is increasing. It
needs to add an extra operation, such as:

I honestly do not know what you are talking about.

Have you even tried to change your patch to use

field = (__force unsigned long __percpu *)((__force void *)p + offset);
this_cpu_inc(*field);
Yes, I tested this code. But the following couldn't show anything even
if the rx_dropped is increasing.

'sudo python3 /usr/share/bcc/tools/trace netdev_core_stats_inc'

Well, I am not sure about this, "bpftrace" worked for me.

Make sure your toolchain generates something that looks like what I got:


ef20 <netdev_core_stats_inc>:
    ef20: f3 0f 1e fa            endbr64
    ef24: e8 00 00 00 00         call   ef29 <netdev_core_stats_inc+0x9>
                 ef25: R_X86_64_PLT32    __fentry__-0x4
    ef29: 55                     push   %rbp
    ef2a: 48 89 e5               mov    %rsp,%rbp
    ef2d: 53                     push   %rbx
    ef2e: 89 f3                  mov    %esi,%ebx
    ef30: 48 8b 87 f0 01 00 00   mov    0x1f0(%rdi),%rax
    ef37: 48 85 c0               test   %rax,%rax
    ef3a: 74 0b                  je     ef47 <netdev_core_stats_inc+0x27>
    ef3c: 89 d9                  mov    %ebx,%ecx
    ef3e: 65 48 ff 04 08         incq   %gs:(%rax,%rcx,1)
    ef43: 5b                     pop    %rbx
    ef44: 5d                     pop    %rbp
    ef45: c3                     ret
    ef46: cc                     int3
    ef47: e8 00 00 00 00         call   ef4c <netdev_core_stats_inc+0x2c>
                 ef48: R_X86_64_PLT32    .text.unlikely.+0x13c
    ef4c: 48 85 c0               test   %rax,%rax
    ef4f: 75 eb                  jne    ef3c <netdev_core_stats_inc+0x1c>
    ef51: eb f0                  jmp    ef43 <netdev_core_stats_inc+0x23>
    ef53: 66 66 66 66 2e 0f 1f   data16 data16 data16 cs nopw 0x0(%rax,%rax,1)
    ef5a: 84 00 00 00 00 00

I'll share what I can see.

1. objdump -D vmlinux

81b2f170 <netdev_core_stats_inc>:
81b2f170:    e8 8b ea 55 ff          callq  8108dc00 <__fentry__>
81b2f175:    55                      push   %rbp
81b2f176:    48 89 e5                mov    %rsp,%rbp
81b2f179:    48 83 ec 08             sub    $0x8,%rsp
81b2f17d:    48 8b 87 e8 01 00 00    mov    0x1e8(%rdi),%rax
81b2f184:    48 85 c0                test   %rax,%rax
81b2f187:    74 0d                   je     81b2f196 <netdev_core_stats_inc+0x26>
81b2f189:    89 f6                   mov    %esi,%esi
81b2f18b:    65 48 ff 04 30          incq   %gs:(%rax,%rsi,1)
81b2f190:    c9                      leaveq
81b2f191:    e9 aa 31 6d 00          jmpq   82202340 <__x86_return_thunk>
81b2f196:    89 75 fc                mov    %esi,-0x4(%rbp)
81b2f199:    e8 82 ff ff ff          callq  81b2f120 <netdev_core_stats_alloc>
81b2f19e:    8b 75 fc                mov    -0x4(%rbp),%esi
81b2f1a1:    48 85 c0                test   %rax,%rax
81b2f1a4:    75 e3                   jne    81b2f189 <netdev_core_stats_inc+0x19>
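
To make the u32 offset argument concrete: the callers are expected to be thin
inline wrappers that pass offsetof() of the field they bump. This is a sketch
of that wrapper shape as discussed in this thread (the exact macro in the
merged patch may differ):

#define DEV_CORE_STATS_INC(FIELD)					\
static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \
{									\
	netdev_core_stats_inc(dev,					\
		offsetof(struct net_device_core_stats, FIELD));		\
}
DEV_CORE_STATS_INC(rx_dropped)
DEV_CORE_STATS_INC(tx_dropped)
DEV_CORE_STATS_INC(rx_nohandler)
DEV_CORE_STATS_INC(rx_otherhost_dropped)

Because the increment happens inside one out-of-line function, there is a
single symbol to attach a tracer to, which is what both disassemblies above
show: load dev->core_stats, and if non-NULL do a %gs-relative incq at the
given offset.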

Re: [PATCH v9 3/5] kprobes: kretprobe scalability improvement with objpool

2023-10-08 Thread Google
On Mon, 9 Oct 2023 02:31:34 +0800
wuqiang  wrote:

> On 2023/10/7 10:02, Masami Hiramatsu (Google) wrote:
> > On Tue,  5 Sep 2023 09:52:53 +0800
> > "wuqiang.matt"  wrote:
> > 
> >> kretprobe is using freelist to manage return-instances, but freelist,
> >> as a LIFO queue based on a singly linked list, scales badly and reduces
> >> the overall throughput of kretprobed routines, especially for high
> >> contention scenarios.
> >>
> >> Here's a typical throughput test of sys_prctl (counts in 10 seconds,
> >> measured with perf stat -a -I 1 -e syscalls:sys_enter_prctl):
> >>
> >> OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
> >> HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s
> >>
> >>       1T       2T       4T       8T      16T      24T
> >> 24150045 29317964 15446741 12494489 18287272 18287272
> >>      32T      48T      64T      72T      96T     128T
> >> 16200682  1620068 11645677 11269858 10470118  9931051
> >>
> >> This patch introduces objpool to kretprobe and rethook, with the original
> >> freelist replaced, and brings near-linear scalability to kretprobed
> >> routines. Tests of kretprobe throughput show the biggest ratio as
> >> 166.7x of the original freelist. Here's the comparison:
> >>
> >>                 1T         2T         4T         8T        16T
> >> native:   41186213   82336866  164250978  328662645  658810299
> >> freelist: 24150045   29317964   15446741   12494489   18287272
> >> objpool:  24663700   49410571   98544977  197725940  396294399
> >>                32T        48T        64T        96T       128T
> >> native: 1330338351 1969957941 2512291791 1514690434 2671040914
> >> freelist: 16200682   13737658   11645677   10470118    9931051
> >> objpool:  78673470 1177354156 1514690434 1604472348 1655086824
> >>
> >> Tests on 96-core ARM64 system output similarly, but with the biggest
> >> ratio up to 454.8x:
> >>
> >> OS: Debian 10 AARCH64, Linux 6.5rc7
> >> HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s
> >>
> >>                 1T         2T         4T         8T        16T
> >> native:   30066096   13733813  126194076  257447289  505800181
> >> freelist: 16152090   11064397   11124068    7215768    5663013
> >> objpool:  13733813   27749031   56540679  112291770  223482778
> >>                24T        32T        48T        64T        96T
> >> native:  763305277 1015925192 1521075123 2033009392 3021013752
> >> freelist:  5015810    4602893    3766792    3382478    2945292
> >> objpool: 334605663  448310646  675018951  903449904 1339693418
> >>
> > 
> > This looks good to me (and I have tested with updated objpool)
> > 
> > Acked-by: Masami Hiramatsu (Google) 
> > 
> > Wuqiang, can you update the above number with the simplified
> > objpool? I got better number (always 80% of the native performance)
> > with 128 node/probe.
> > (*) 
> > https://lore.kernel.org/all/20231003003923.eabc33bb3f4ffb8eac71f...@kernel.org/
> 
> That's great. I'll prepare a new patch and try to spare the testbeds for
> another round of testing.

Thanks for updating! I'm looking forward to the next version :)

Thank you,

> 
> > Thank you,
> 
> Thanks for your effort. Sorry for the late response, just back from
> a 'long' vacation.
> 
> > 
> >> Signed-off-by: wuqiang.matt 
> >> ---
> >>   include/linux/kprobes.h | 11 ++---
> >>   include/linux/rethook.h | 16 ++-
> >>   kernel/kprobes.c| 93 +
> >>   kernel/trace/fprobe.c   | 32 ++
> >>   kernel/trace/rethook.c  | 90 ++-
> >>   5 files changed, 98 insertions(+), 144 deletions(-)
> >>
> >> diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
> >> index 85a64cb95d75..365eb092e9c4 100644
> >> --- a/include/linux/kprobes.h
> >> +++ b/include/linux/kprobes.h
> >> @@ -26,8 +26,7 @@
> >>   #include <linux/rcupdate.h>
> >>   #include <linux/mutex.h>
> >>   #include <linux/ftrace.h>
> >> -#include <linux/refcount.h>
> >> -#include <linux/freelist.h>
> >> +#include <linux/objpool.h>
> >>   #include <linux/rethook.h>
> >>   #include <asm/kprobes.h>
> >>   
> >> @@ -141,7 +140,7 @@ static inline bool kprobe_ftrace(struct kprobe *p)
> >>*/
> >>   struct kretprobe_holder {
> >>struct kretprobe*rp;
> >> -  refcount_t  ref;
> >> +  struct objpool_head pool;
> >>   };
> >>   
> >>   struct kretprobe {
> >> @@ -154,7 +153,6 @@ struct kretprobe {
> >>   #ifdef CONFIG_KRETPROBE_ON_RETHOOK
> >>struct rethook *rh;
> >>   #else
> >> -  struct freelist_head freelist;
> >>struct kretprobe_holder *rph;
> >>   #endif
> >>   };
> >> @@ -165,10 +163,7 @@ struct kretprobe_instance {
> >>   #ifdef CONFIG_KRETPROBE_ON_RETHOOK
> >>struct rethook_node node;
> >>   #else
> >> -  union {
> >> -  struct freelist_node freelist;
> >> -  struct rcu_head rcu;
> >> -  };
> >> +  struct rcu_head rcu;
> >>struct llist_node llist;
> >>struct kretprobe_holder *rph;
> >>kprobe_opcode_t *ret_addr;
> >> diff --git a/include/linux/rethook.h b/include/linux/rethook.h
> >> index 

Re: [PATCH v9 0/5] lib,kprobes: kretprobe scalability improvement

2023-10-08 Thread Google
On Mon, 9 Oct 2023 02:33:09 +0800
wuqiang  wrote:

> On 2023/9/23 16:57, Masami Hiramatsu (Google) wrote:
> > Hi Wuqiang,
> > 
> > I dug my mail box and found this. Sorry for replying late.
> > 
> > On Tue,  5 Sep 2023 09:52:50 +0800
> > "wuqiang.matt"  wrote:
> > 
> >> This patch series introduces a scalable and lockless ring-array based
> >> object pool and replaces the original freelist (a LIFO queue based on
> >> singly linked list) to improve scalability of kretprobed routines.
> >>
> >> v9:
> >>1) objpool: raw_local_irq_save/restore added to prevent interruption
> >>
> >>   To avoid possible ABA issues, we must ensure objpool_try_get_slot
> >>   and objpool_try_add_slot are uninterruptible. If these operations
> >>   are blocked or interrupted in the middle, other cores could overrun
> >>   the same slot's ages[] of uint32, then after resuming back, the
> >>   interrupted pop() or push() could see the same value of ages[], which
> >>   is a typical ABA problem though the possibility is small.
> >>
> >>   The pair of pop()/push() costs about 8.53 cpu cycles, measured
> >>   by IACA (Intel Architecture Code Analyzer). That is, on a 4GHz
> >>   core dedicated to pop() & push(), theoretically it would only
> >>   need 8.53 seconds to overflow a 32-bit value. Tests on an Intel
> >>   i7-10700 (2.90GHz) took 71.88 seconds to overrun a 32-bit integer.
> > 
> > What does this mean? This sounds like "There is a timing issue if it's
> > fast enough".
> 
> Yes, that's why local irqs must be disabled. If push()/pop() is interrupted or
> preempted long enough (> 10 seconds for the extreme cases), other nodes could
> overrun the same 32-bit ages[] entry; then, after resuming execution, the push()
> or pop() would see the same value without noticing the overrun, which is a
> typical ABA.

Yeah, indeed.

> 
> Changing ages[] to 64-bit could be a solution, but it's inappropriate for a
> 32-bit OS and looks too heavy. With local irqs disabled, push() or pop() is
> uninterruptible, thus the ABA is avoided.

As I found, ages[] can be removed. In that case, you only need to update the
head and tail to 64-bit (but in that case the cmpxchg will be more complicated)

Thank you,

> 
> push() or pop() consumes only ~4 cycles to complete (most of the use cases), 
> so raw_local_irq_save/restore are used instead of local_irq_save/restore to
> minimize the overhead.
> 
> > Let me review the patch itself.
> > 
> > Thanks,
> > 
> >>
> >>    2) code improvements: thanks to Masami for the thorough inspection
> >>
> >> v8:
> >>1) objpool: refcount added for objpool lifecycle management
> >>
> >> wuqiang.matt (5):
> >>lib: objpool added: ring-array based lockless MPMC
> >>lib: objpool test module added
> >>kprobes: kretprobe scalability improvement with objpool
> >>kprobes: freelist.h removed
> >>MAINTAINERS: objpool added
> >>
> >>   MAINTAINERS  |   7 +
> >>   include/linux/freelist.h | 129 
> >>   include/linux/kprobes.h  |  11 +-
> >>   include/linux/objpool.h  | 174 ++
> >>   include/linux/rethook.h  |  16 +-
> >>   kernel/kprobes.c |  93 +++---
> >>   kernel/trace/fprobe.c|  32 +-
> >>   kernel/trace/rethook.c   |  90 +++--
> >>   lib/Kconfig.debug|  11 +
> >>   lib/Makefile |   4 +-
> >>   lib/objpool.c| 338 +++
> >>   lib/test_objpool.c   | 689 +++
> >>   12 files changed, 1320 insertions(+), 274 deletions(-)
> >>   delete mode 100644 include/linux/freelist.h
> >>   create mode 100644 include/linux/objpool.h
> >>   create mode 100644 lib/objpool.c
> >>   create mode 100644 lib/test_objpool.c
> >>
> >> -- 
> >> 2.40.1
> >>
> > 
> > 
> 
> 


-- 
Masami Hiramatsu (Google) 

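For readers who want the ABA scenario in concrete form, here is a stand-alone
user-space sketch (illustrative only, not the objpool code): a 32-bit version
tag guards a slot, and a reader stalled across a multiple of 2^32 reuses sees
a matching tag on stale state.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* High 32 bits: version tag ("age"), low 32 bits: payload. */
static _Atomic uint64_t slot;

static bool pop_versioned(uint32_t *out)
{
	uint64_t old = atomic_load(&slot);
	uint32_t age = (uint32_t)(old >> 32);
	/* Bump the age and clear the payload in a single word. */
	uint64_t next = (uint64_t)(age + 1) << 32;

	/* If this thread stalls right here long enough for other CPUs to
	 * reuse the slot a multiple of 2^32 times, the 32-bit age wraps to
	 * the same value and the CAS below succeeds on stale state -- the
	 * ABA window. Bounding the stall (disabling local irqs, as v9
	 * does) closes it. */
	if (atomic_compare_exchange_strong(&slot, &old, next)) {
		*out = (uint32_t)old;
		return true;
	}
	return false;
}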


Re: [PATCH v9 1/5] lib: objpool added: ring-array based lockless MPMC

2023-10-08 Thread wuqiang

On 2023/9/25 17:42, Masami Hiramatsu (Google) wrote:

Hi Wuqiang,

On Tue,  5 Sep 2023 09:52:51 +0800
"wuqiang.matt"  wrote:


The object pool is a scalable implementation of a high-performance queue
for object allocation and reclamation, such as kretprobe instances.

By leveraging a percpu ring-array to mitigate the hot spots of memory
contention, it can deliver near-linear scalability for highly parallel
scenarios. The objpool is best suited for the following cases:
1) Memory allocation or reclamation is prohibited or too expensive
2) Consumers are of different priorities, such as irqs and threads

Limitations:
1) Maximum objects (capacity) is determined during pool initialization
and can't be modified (extended) after objpool creation
2) The memory of objects won't be freed until the objpool is finalized
3) Object allocation (pop) may fail after trying all cpu slots


I made a simplifying patch on this by (mainly) removing the ages array.
I also renamed local variables to use more readable names, like slot,
pool, and obj.

Here are the results from running test_objpool.ko.

Original:
[   50.500235] Summary of testcases:
[   50.503139] duration: 1027135us  hits: 30628293  miss:      0  sync: percpu objpool
[   50.510416] duration: 1047977us  hits: 30453023  miss:      0  sync: percpu objpool from vmalloc
[   50.517421] duration: 1047098us  hits: 31145034  miss:      0  sync & hrtimer: percpu objpool
[   50.524671] duration: 1053233us  hits: 30919640  miss:      0  sync & hrtimer: percpu objpool from vmalloc
[   50.531382] duration: 1055822us  hits:  3407221  miss: 830219  sync overrun: percpu objpool
[   50.538135] duration: 1055998us  hits:  3404624  miss: 854160  sync overrun: percpu objpool from vmalloc
[   50.546686] duration: 1046104us  hits: 19464798  miss:      0  async: percpu objpool
[   50.552989] duration: 1033054us  hits: 18957836  miss:      0  async: percpu objpool from vmalloc
[   50.560289] duration: 1041778us  hits: 33806868  miss:      0  async & hrtimer: percpu objpool
[   50.567425] duration: 1048901us  hits: 34211860  miss:      0  async & hrtimer: percpu objpool from vmalloc

Simplified:
[   48.393236] Summary of testcases:
[   48.395384] duration: 1013002us  hits: 29661448  miss:      0  sync: percpu objpool
[   48.400351] duration: 1057231us  hits: 31035578  miss:      0  sync: percpu objpool from vmalloc
[   48.405660] duration: 1043196us  hits: 30546652  miss:      0  sync & hrtimer: percpu objpool
[   48.411216] duration: 1047128us  hits: 30601306  miss:      0  sync & hrtimer: percpu objpool from vmalloc
[   48.417231] duration: 1051695us  hits:  3468287  miss: 892881  sync overrun: percpu objpool
[   48.422405] duration: 1054003us  hits:  3549491  miss: 898141  sync overrun: percpu objpool from vmalloc
[   48.428425] duration: 1052946us  hits: 19005228  miss:      0  async: percpu objpool
[   48.433597] duration: 1051757us  hits: 19670866  miss:      0  async: percpu objpool from vmalloc
[   48.439280] duration: 1042951us  hits: 37135332  miss:      0  async & hrtimer: percpu objpool
[   48.445085] duration: 1029803us  hits: 37093248  miss:      0  async & hrtimer: percpu objpool from vmalloc

Can you test it too?


Sure, I'll do some testing locally, and also a performance comparison with
'entries' managed in objpool_head.



Thanks,


Thank you for your time, I appreciate it.



 From f1f442ff653e329839e5452b8b88463a80a12ff3 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" 
Date: Mon, 25 Sep 2023 16:07:12 +0900
Subject: [PATCH] objpool: Simplify objpool by removing ages array

Simplify the objpool code by removing the ages array from the per-cpu slot.
It chooses a large enough capacity (a power of 2 rounded up from
nr_objects + 1) for the entries array, so the tail never catches up to
the head in a per-cpu slot. Thus tail == head means the slot is empty.
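
In other words, the capacity choice buys an unambiguous emptiness test. A
minimal sketch of the invariant (illustrative names, not the patch's code):
since capacity is a power of two strictly greater than the number of objects,
no slot can ever be completely full, so:

static inline bool slot_empty(uint32_t head, uint32_t tail)
{
	return head == tail;	/* can never also mean "full" */
}

static inline uint32_t slot_used(uint32_t head, uint32_t tail)
{
	return tail - head;	/* well-defined across u32 wrap-around */
}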

This also uses consistent local variable names for pool, slot and obj.

Signed-off-by: Masami Hiramatsu (Google) 
---
  include/linux/objpool.h |  61 
  lib/objpool.c   | 310 
  2 files changed, 147 insertions(+), 224 deletions(-)

diff --git a/include/linux/objpool.h b/include/linux/objpool.h
index 33c832216b98..ecd5ecaffcd3 100644
--- a/include/linux/objpool.h
+++ b/include/linux/objpool.h
@@ -38,33 +38,23 @@
   * struct objpool_slot - percpu ring array of objpool
   * @head: head of the local ring array (to retrieve at)
   * @tail: tail of the local ring array (to append at)
- * @bits: log2 of capacity (for bitwise 

Re: [PATCH v9 1/5] lib: objpool added: ring-array based lockless MPMC

2023-10-08 Thread wuqiang

On 2023/9/23 17:48, Masami Hiramatsu (Google) wrote:

Hi Wuqiang,

Sorry for replying later.

On Tue,  5 Sep 2023 09:52:51 +0800
"wuqiang.matt"  wrote:


The object pool is a scalable implementation of a high-performance queue
for object allocation and reclamation, such as kretprobe instances.

By leveraging a percpu ring-array to mitigate the hot spots of memory
contention, it can deliver near-linear scalability for highly parallel
scenarios. The objpool is best suited for the following cases:
1) Memory allocation or reclamation is prohibited or too expensive
2) Consumers are of different priorities, such as irqs and threads

Limitations:
1) Maximum objects (capacity) is determined during pool initialization
and can't be modified (extended) after objpool creation


So the pool size is fixed in initialization.


Right. The array size will be rounded up to a power of 2, but the
actual number of objects (to be allocated) is the exact value specified
by the user.




2) The memory of objects won't be freed until objpool is finalized
3) Object allocation (pop) may fail after trying all cpu slots


This means that object allocation will fail if all the pools are empty,
right?


Yes, pop() will return NULL in this case. pop() does the checking for
only 1 round of all cpu nodes.

The objpool might not be empty, since new objects could be inserted back
in the meantime by other nodes, which is natural for lockless queues.





Signed-off-by: wuqiang.matt 
---
  include/linux/objpool.h | 174 +
  lib/Makefile|   2 +-
  lib/objpool.c   | 338 
  3 files changed, 513 insertions(+), 1 deletion(-)
  create mode 100644 include/linux/objpool.h
  create mode 100644 lib/objpool.c

diff --git a/include/linux/objpool.h b/include/linux/objpool.h
new file mode 100644
index ..33c832216b98
--- /dev/null
+++ b/include/linux/objpool.h
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_OBJPOOL_H
+#define _LINUX_OBJPOOL_H
+
+#include 
+#include 
+
+/*
+ * objpool: ring-array based lockless MPMC queue
+ *
+ * Copyright: wuqiang.m...@bytedance.com
+ *
+ * The object pool is a scalable implementation of a high performance queue
+ * for object allocation and reclamation, such as kretprobe instances.
+ *
+ * By leveraging per-cpu ring-arrays to mitigate the hot spots of memory
+ * contention, it can deliver near-linear scalability for highly parallel
+ * scenarios. The ring-array is compactly managed in a single cache-line
+ * to benefit from a warmed L1 cache for most cases (<= 4 objects per-core).
+ * The body of pre-allocated objects is stored in continuous cache-lines
+ * just after the ring-array.


I consider that the size of entries may be big if we have a larger number of
CPU cores, like 72 cores, and if the user specifies (2^n) + 1 entries.
In this case, each CPU has (2^n - 1)/72 objects, but has 2^(n + 1)
entries in its ring buffer. So it should be noted.


Yes for the array size, since it's rounded up to a power of 2, but the
actual number of pre-allocated objects will stay the same as the user
specified.


+ *
+ * The object pool is interrupt safe. Both allocation and reclamation
+ * (object pop and push operations) can be preemptible or interruptable.


You've added raw_local_irq_save/restore(), so it is not preemptible
or interruptible anymore. (Anyway, the caller doesn't take care of that)


Sure, this description is improper and unnecessary. Will be removed.


+ *
+ * It's best suited for following cases:
+ * 1) Memory allocation or reclamation are prohibited or too expensive
+ * 2) Consumers are of different priorities, such as irqs and threads
+ *
+ * Limitations:
+ * 1) Maximum objects (capacity) is determined during pool initializing
+ * 2) The memory of objects won't be freed until the pool is finalized
+ * 3) Object allocation (pop) may fail after trying all cpu slots
+ */
+
+/**
+ * struct objpool_slot - percpu ring array of objpool
+ * @head: head of the local ring array (to retrieve at)
+ * @tail: tail of the local ring array (to append at)
+ * @bits: log2 of capacity (for bitwise operations)
+ * @mask: capacity - 1


These descriptions do not give an idea of what those roles are.


I'll refine the description. objpool_slot is totally internal to objpool.




+ *
+ * Represents a cpu-local array-based ring buffer, its size is specialized
+ * during initialization of object pool. The percpu objpool slot is to be
+ * allocated from local memory for NUMA system, and to be kept compact in
+ * continuous memory: ages[] is stored just after the body of objpool_slot,
+ * and then entries[]. ages[] describes revision of each item, solely used
+ * to avoid ABA; entries[] contains pointers of the actual objects
+ *
+ * Layout of objpool_slot in memory:
+ *
+ * 64bit:
+ *        4      8     12     16        32                64
+ * | head | tail | bits | mask | ages[4] | ents[4]: (8 * 4) | objects
+ *
+ * 32bit:
+ *        4      8

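The compact layout sketched above translates into pointer arithmetic along
these lines (a sketch with assumed helper names and uint32_t standing in for
the kernel's u32; the real code derives the offsets from the slot header):

struct objpool_slot {
	uint32_t head;	/* ring index to retrieve (pop) at */
	uint32_t tail;	/* ring index to append (push) at */
	uint32_t bits;	/* log2 of capacity */
	uint32_t mask;	/* capacity - 1 */
};

static inline uint32_t *slot_ages(struct objpool_slot *slot)
{
	/* ages[] sits immediately after the slot header */
	return (uint32_t *)((char *)slot + sizeof(*slot));
}

static inline void **slot_ents(struct objpool_slot *slot)
{
	/* ents[] follows ages[capacity]; the objects follow ents[] */
	return (void **)((char *)slot_ages(slot) +
			 sizeof(uint32_t) * (slot->mask + 1));
}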
Re: [PATCH v9 0/5] lib,kprobes: kretprobe scalability improvement

2023-10-08 Thread wuqiang

On 2023/9/23 16:57, Masami Hiramatsu (Google) wrote:

Hi Wuqiang,

I dug my mail box and found this. Sorry for replying late.

On Tue,  5 Sep 2023 09:52:50 +0800
"wuqiang.matt"  wrote:


This patch series introduces a scalable and lockless ring-array based
object pool and replaces the original freelist (a LIFO queue based on
singly linked list) to improve scalability of kretprobed routines.

v9:
   1) objpool: raw_local_irq_save/restore added to prevent interruption

   To avoid possible ABA issues, we must ensure objpool_try_get_slot
   and objpool_try_add_slot are uninterruptible. If these operations
   are blocked or interrupted in the middle, other cores could overrun
   the same slot's ages[] of uint32, then after resuming back, the
   interrupted pop() or push() could see the same value of ages[], which
   is a typical ABA problem though the possibility is small.

   The pair of pop()/push() costs about 8.53 cpu cycles, measured
   by IACA (Intel Architecture Code Analyzer). That is, on a 4GHz
   core dedicated to pop() & push(), theoretically it would only
   need 8.53 seconds to overflow a 32-bit value. Tests on an Intel
   i7-10700 (2.90GHz) took 71.88 seconds to overrun a 32-bit integer.


What does this mean? This sounds like "There is a timing issue if it's fast
enough".


Yes, that's why local irqs must be disabled. If push()/pop() is interrupted or
preempted long enough (> 10 seconds for the extreme cases), other nodes could
overrun the same 32-bit ages[] entry; then, after resuming execution, the push()
or pop() would see the same value without noticing the overrun, which is a
typical ABA.

Changing ages[] to 64-bit could be a solution, but it's inappropriate for a
32-bit OS and looks too heavy. With local irqs disabled, push() or pop() is
uninterruptible, thus the ABA is avoided.

push() or pop() consumes only ~4 cycles to complete (in most use cases),
so raw_local_irq_save/restore are used instead of local_irq_save/restore to
minimize the overhead.


Let me review the patch itself.

Thanks,



   2) code improvements: thanks to Masami for the thorough inspection

v8:
   1) objpool: refcount added for objpool lifecycle management

wuqiang.matt (5):
   lib: objpool added: ring-array based lockless MPMC
   lib: objpool test module added
   kprobes: kretprobe scalability improvement with objpool
   kprobes: freelist.h removed
   MAINTAINERS: objpool added

  MAINTAINERS  |   7 +
  include/linux/freelist.h | 129 
  include/linux/kprobes.h  |  11 +-
  include/linux/objpool.h  | 174 ++
  include/linux/rethook.h  |  16 +-
  kernel/kprobes.c |  93 +++---
  kernel/trace/fprobe.c|  32 +-
  kernel/trace/rethook.c   |  90 +++--
  lib/Kconfig.debug|  11 +
  lib/Makefile |   4 +-
  lib/objpool.c| 338 +++
  lib/test_objpool.c   | 689 +++
  12 files changed, 1320 insertions(+), 274 deletions(-)
  delete mode 100644 include/linux/freelist.h
  create mode 100644 include/linux/objpool.h
  create mode 100644 lib/objpool.c
  create mode 100644 lib/test_objpool.c

--
2.40.1
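
Putting the irq-disabling rationale above into code shape, the fast path looks
roughly like this (a sketch; objpool_pop_sketch() and try_pop_from_slots() are
hypothetical stand-ins, not the series' function names):

static void *objpool_pop_sketch(struct objpool_head *pool)
{
	unsigned long flags;
	void *obj;

	/* The whole pop is only a few cycles, so the raw_ variants are used
	 * to skip irq-tracing overhead; disabling local irqs bounds how
	 * long this CPU can stall mid-update, which is what keeps ages[]
	 * from wrapping underneath it (the ABA case discussed above). */
	raw_local_irq_save(flags);
	obj = try_pop_from_slots(pool);	/* hypothetical helper */
	raw_local_irq_restore(flags);

	return obj;
}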










Re: [PATCH v9 3/5] kprobes: kretprobe scalability improvement with objpool

2023-10-08 Thread wuqiang

On 2023/10/7 10:02, Masami Hiramatsu (Google) wrote:

On Tue,  5 Sep 2023 09:52:53 +0800
"wuqiang.matt"  wrote:


kretprobe is using freelist to manage return-instances, but freelist,
as a LIFO queue based on a singly linked list, scales badly and reduces
the overall throughput of kretprobed routines, especially for high
contention scenarios.

Here's a typical throughput test of sys_prctl (counts in 10 seconds,
measured with perf stat -a -I 1 -e syscalls:sys_enter_prctl):

OS: Debian 10 X86_64, Linux 6.5rc7 with freelist
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s

       1T       2T       4T       8T      16T      24T
24150045 29317964 15446741 12494489 18287272 18287272
     32T      48T      64T      72T      96T     128T
16200682  1620068 11645677 11269858 10470118  9931051

This patch introduces objpool to kretprobe and rethook, with the original
freelist replaced, and brings near-linear scalability to kretprobed
routines. Tests of kretprobe throughput show the biggest ratio as
166.7x of the original freelist. Here's the comparison:

                1T         2T         4T         8T        16T
native:   41186213   82336866  164250978  328662645  658810299
freelist: 24150045   29317964   15446741   12494489   18287272
objpool:  24663700   49410571   98544977  197725940  396294399
               32T        48T        64T        96T       128T
native: 1330338351 1969957941 2512291791 1514690434 2671040914
freelist: 16200682   13737658   11645677   10470118    9931051
objpool:  78673470 1177354156 1514690434 1604472348 1655086824

Tests on 96-core ARM64 system output similarly, but with the biggest
ratio up to 454.8x:

OS: Debian 10 AARCH64, Linux 6.5rc7
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s

                1T         2T         4T         8T        16T
native:   30066096   13733813  126194076  257447289  505800181
freelist: 16152090   11064397   11124068    7215768    5663013
objpool:  13733813   27749031   56540679  112291770  223482778
               24T        32T        48T        64T        96T
native:  763305277 1015925192 1521075123 2033009392 3021013752
freelist:  5015810    4602893    3766792    3382478    2945292
objpool: 334605663  448310646  675018951  903449904 1339693418



This looks good to me (and I have tested with updated objpool)

Acked-by: Masami Hiramatsu (Google) 

Wuqiang, can you update the above number with the simplified
objpool? I got better number (always 80% of the native performance)
with 128 node/probe.
(*) 
https://lore.kernel.org/all/20231003003923.eabc33bb3f4ffb8eac71f...@kernel.org/


That's great. I'll prepare a new patch and try to spare the testbeds for
another round of testing.


Thank you,


Thanks for your effort. Sorry for the late response, just back from
a 'long' vacation.




Signed-off-by: wuqiang.matt 
---
  include/linux/kprobes.h | 11 ++---
  include/linux/rethook.h | 16 ++-
  kernel/kprobes.c| 93 +
  kernel/trace/fprobe.c   | 32 ++
  kernel/trace/rethook.c  | 90 ++-
  5 files changed, 98 insertions(+), 144 deletions(-)

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 85a64cb95d75..365eb092e9c4 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -26,8 +26,7 @@
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
 #include <linux/ftrace.h>
-#include <linux/refcount.h>
-#include <linux/freelist.h>
+#include <linux/objpool.h>
 #include <linux/rethook.h>
 #include <asm/kprobes.h>
  
@@ -141,7 +140,7 @@ static inline bool kprobe_ftrace(struct kprobe *p)

   */
  struct kretprobe_holder {
struct kretprobe*rp;
-   refcount_t  ref;
+   struct objpool_head pool;
  };
  
  struct kretprobe {

@@ -154,7 +153,6 @@ struct kretprobe {
  #ifdef CONFIG_KRETPROBE_ON_RETHOOK
struct rethook *rh;
  #else
-   struct freelist_head freelist;
struct kretprobe_holder *rph;
  #endif
  };
@@ -165,10 +163,7 @@ struct kretprobe_instance {
  #ifdef CONFIG_KRETPROBE_ON_RETHOOK
struct rethook_node node;
  #else
-   union {
-   struct freelist_node freelist;
-   struct rcu_head rcu;
-   };
+   struct rcu_head rcu;
struct llist_node llist;
struct kretprobe_holder *rph;
kprobe_opcode_t *ret_addr;
diff --git a/include/linux/rethook.h b/include/linux/rethook.h
index 26b6f3c81a76..ce69b2b7bc35 100644
--- a/include/linux/rethook.h
+++ b/include/linux/rethook.h
@@ -6,11 +6,10 @@
  #define _LINUX_RETHOOK_H
  
 #include <linux/compiler.h>
-#include <linux/freelist.h>
+#include <linux/objpool.h>
 #include <linux/kallsyms.h>
 #include <linux/llist.h>
 #include <linux/rcupdate.h>
-#include <linux/refcount.h>
  
  struct rethook_node;
  
@@ -30,14 +29,12 @@ typedef void (*rethook_handler_t) (struct rethook_node *, void *, unsigned long,

  struct rethook {
void*data;
rethook_handler_t   handler;
-   struct freelist_headpool;
-   refcount_t  ref;
+   struct objpool_head pool;
struct rcu_head rcu;
  };
  
  /**

   *