Re: [PATCH] tools/virtio: Use the __GFP_ZERO flag of kmalloc to complete the memory initialization.

2024-06-06 Thread cuitao

Sorry, maybe I'm wrong.

I wonder if the gfp parameter in static inline void *kmalloc(size_t s, 
gfp_t gfp) can be deleted if it is not used.


Or would it be better to move the memset into kmalloc()?

Like this:
#define __GFP_ZERO 0x1
static inline void *kmalloc(size_t s, gfp_t gfp)
{
	void *p;

	if (__kmalloc_fake)
		return __kmalloc_fake;

	p = malloc(s);
	if (!p)
		return p;

	if (gfp & __GFP_ZERO)
		memset(p, 0, s);
	return p;
}
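
As a sanity check, here is a standalone userspace build of that proposed stub (a sketch only: the gfp_t typedef is local and the __kmalloc_fake hook is stubbed out, neither is part of the original mail), showing the buffer is zeroed exactly when __GFP_ZERO is passed:

#include <assert.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned int gfp_t;	/* local stand-in for the kernel typedef */
#define __GFP_ZERO 0x1

static inline void *kmalloc(size_t s, gfp_t gfp)
{
	void *p = malloc(s);	/* __kmalloc_fake hook elided in this sketch */

	if (!p)
		return p;
	if (gfp & __GFP_ZERO)
		memset(p, 0, s);
	return p;
}

static inline void *kzalloc(size_t s, gfp_t gfp)
{
	/* With the flag honored above, the patched kzalloc() is sound. */
	return kmalloc(s, gfp | __GFP_ZERO);
}

int main(void)
{
	unsigned char *p = kzalloc(64, 0);

	assert(p);
	for (int i = 0; i < 64; i++)
		assert(p[i] == 0);	/* zeroed with no explicit memset here */
	free(p);
	return 0;
}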

On 2024/6/6 15:18, Michael S. Tsirkin wrote:

On Wed, Jun 05, 2024 at 09:52:45PM +0800, cuitao wrote:

Use the __GFP_ZERO flag of kmalloc to initialize memory while allocating it,
without the need for an additional memset call.

Signed-off-by: cuitao 
---
  tools/virtio/linux/kernel.h | 5 +----
  1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h
index 6702008f7f5c..9e401fb7c215 100644
--- a/tools/virtio/linux/kernel.h
+++ b/tools/virtio/linux/kernel.h
@@ -66,10 +66,7 @@ static inline void *kmalloc_array(unsigned n, size_t s, gfp_t gfp)
 
  static inline void *kzalloc(size_t s, gfp_t gfp)
  {
-   void *p = kmalloc(s, gfp);
-
-   memset(p, 0, s);
-   return p;
+   return kmalloc(s, gfp | __GFP_ZERO);
  }


Why do we care? It's just here to make things compile. The simpler the
better.


  static inline void *alloc_pages_exact(size_t s, gfp_t gfp)
--
2.25.1




Re: [PATCH] tools/virtio: Use the __GFP_ZERO flag of kmalloc to complete the memory initialization.

2024-06-06 Thread Michael S. Tsirkin
On Wed, Jun 05, 2024 at 09:52:45PM +0800, cuitao wrote:
> Use the __GFP_ZERO flag of kmalloc to initialize memory while allocating it,
> without the need for an additional memset call.
> 
> Signed-off-by: cuitao 
> ---
>  tools/virtio/linux/kernel.h | 5 +----
>  1 file changed, 1 insertion(+), 4 deletions(-)
> 
> diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h
> index 6702008f7f5c..9e401fb7c215 100644
> --- a/tools/virtio/linux/kernel.h
> +++ b/tools/virtio/linux/kernel.h
> @@ -66,10 +66,7 @@ static inline void *kmalloc_array(unsigned n, size_t s, gfp_t gfp)
>  
>  static inline void *kzalloc(size_t s, gfp_t gfp)
>  {
> -	void *p = kmalloc(s, gfp);
> -
> -	memset(p, 0, s);
> -	return p;
> + return kmalloc(s, gfp | __GFP_ZERO);
>  }


Why do we care? It's just here to make things compile. The simpler the
better.

>  static inline void *alloc_pages_exact(size_t s, gfp_t gfp)
> -- 
> 2.25.1




Re: [PATCH] tools/virtio: Use the __GFP_ZERO flag of kmalloc to complete the memory initialization.

2024-06-05 Thread 崔涛
Sorry, this is a stupid mistake.
 
I wonder if the gfp parameter in static inline void *kmalloc(size_t s, gfp_t gfp) can be deleted if it is not used.
 
Or would it be better to move the memset into kmalloc()?
Like this:
#define __GFP_ZERO 0x1
static inline void *kmalloc(size_t s, gfp_t gfp)
{
	void *p;

	if (__kmalloc_fake)
		return __kmalloc_fake;

	p = malloc(s);
	if (!p)
		return p;

	if (gfp & __GFP_ZERO)
		memset(p, 0, s);
	return p;
}


 

Subject: Re: [PATCH] tools/virtio: Use the __GFP_ZERO flag of kmalloc to complete the memory initialization.
Date: 2024-06-06 08:29
From: jasowang
To: 崔涛



On Wed, Jun 5, 2024 at 9:56 PM cuitao wrote:
>
> Use the __GFP_ZERO flag of kmalloc to initialize memory while allocating it,
> without the need for an additional memset call.
>
> Signed-off-by: cuitao 
> ---
>  tools/virtio/linux/kernel.h | 5 +----
>  1 file changed, 1 insertion(+), 4 deletions(-)
>
> diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h
> index 6702008f7f5c..9e401fb7c215 100644
> --- a/tools/virtio/linux/kernel.h
> +++ b/tools/virtio/linux/kernel.h
> @@ -66,10 +66,7 @@ static inline void *kmalloc_array(unsigned n, size_t s, gfp_t gfp)
>
>  static inline void *kzalloc(size_t s, gfp_t gfp)
>  {
> -	void *p = kmalloc(s, gfp);
> -
> -	memset(p, 0, s);
> -	return p;
> +	return kmalloc(s, gfp | __GFP_ZERO);
>  }
>
>  static inline void *alloc_pages_exact(size_t s, gfp_t gfp)
> --
> 2.25.1
>

Does this really work?

extern void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
static inline void *kmalloc(size_t s, gfp_t gfp)
{
	if (__kmalloc_fake)
		return __kmalloc_fake;
	return malloc(s);
}

Thanks




Re: [PATCH] tools/virtio: Use the __GFP_ZERO flag of kmalloc to complete the memory initialization.

2024-06-05 Thread Jason Wang
On Wed, Jun 5, 2024 at 9:56 PM cuitao  wrote:
>
> Use the __GFP_ZERO flag of kmalloc to initialize memory while allocating it,
> without the need for an additional memset call.
>
> Signed-off-by: cuitao 
> ---
>  tools/virtio/linux/kernel.h | 5 +----
>  1 file changed, 1 insertion(+), 4 deletions(-)
>
> diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h
> index 6702008f7f5c..9e401fb7c215 100644
> --- a/tools/virtio/linux/kernel.h
> +++ b/tools/virtio/linux/kernel.h
> @@ -66,10 +66,7 @@ static inline void *kmalloc_array(unsigned n, size_t s, gfp_t gfp)
>
>  static inline void *kzalloc(size_t s, gfp_t gfp)
>  {
> -   void *p = kmalloc(s, gfp);
> -
> -   memset(p, 0, s);
> -   return p;
> +   return kmalloc(s, gfp | __GFP_ZERO);
>  }
>
>  static inline void *alloc_pages_exact(size_t s, gfp_t gfp)
> --
> 2.25.1
>

Does this really work?

extern void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
static inline void *kmalloc(size_t s, gfp_t gfp)
{
	if (__kmalloc_fake)
		return __kmalloc_fake;
	return malloc(s);
}

Thanks
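
A compact userspace illustration of the objection (again a sketch, with a local gfp_t typedef and __kmalloc_fake left NULL): since the stub never looks at gfp, the patched kzalloc() degenerates to a bare malloc() and the zeroing the deleted memset() used to provide is lost.

#include <stdio.h>
#include <stdlib.h>

typedef unsigned int gfp_t;
#define __GFP_ZERO 0x1

static void *__kmalloc_fake;	/* NULL: no fake allocation hooked */

static inline void *kmalloc(size_t s, gfp_t gfp)
{
	(void)gfp;		/* the tools/virtio stub ignores gfp entirely */
	if (__kmalloc_fake)
		return __kmalloc_fake;
	return malloc(s);
}

static inline void *kzalloc(size_t s, gfp_t gfp)
{
	return kmalloc(s, gfp | __GFP_ZERO);	/* no memset anywhere */
}

int main(void)
{
	unsigned char *p = kzalloc(64, 0);

	/* May well print a nonzero byte: nothing zeroed the buffer. */
	printf("first byte: %02x\n", p ? p[0] : 0);
	free(p);
	return 0;
}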




[PATCH] tools/virtio: Use the __GFP_ZERO flag of kmalloc to complete the memory initialization.

2024-06-05 Thread cuitao
Use the __GFP_ZERO flag of kmalloc to initialize memory while allocating it,
without the need for an additional memset call.

Signed-off-by: cuitao 
---
 tools/virtio/linux/kernel.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h
index 6702008f7f5c..9e401fb7c215 100644
--- a/tools/virtio/linux/kernel.h
+++ b/tools/virtio/linux/kernel.h
@@ -66,10 +66,7 @@ static inline void *kmalloc_array(unsigned n, size_t s, gfp_t gfp)
 
 static inline void *kzalloc(size_t s, gfp_t gfp)
 {
-   void *p = kmalloc(s, gfp);
-
-   memset(p, 0, s);
-   return p;
+   return kmalloc(s, gfp | __GFP_ZERO);
 }
 
 static inline void *alloc_pages_exact(size_t s, gfp_t gfp)
-- 
2.25.1




Re: WARNING: kmalloc bug in bpf_uprobe_multi_link_attach

2024-05-15 Thread Jiri Olsa
On Wed, May 15, 2024 at 02:30:37PM -0700, Alexei Starovoitov wrote:
> On Tue, May 14, 2024 at 12:33 AM Ubisectech Sirius
>  wrote:
> >
> > Hello.
> > We are Ubisectech Sirius Team, the vulnerability lab of China ValiantSec. 
> > Recently, our team has discovered an issue in Linux kernel 6.7. Attached to
> > the email was a PoC file for the issue.
> 
> Jiri,
> 
> please take a look.
> 
> > Stack dump:
> >
> > loop3: detected capacity change from 0 to 8
> > MTD: Attempt to mount non-MTD device "/dev/loop3"
> > [ cut here ]
> > WARNING: CPU: 1 PID: 10075 at mm/util.c:632 kvmalloc_node+0x199/0x1b0 
> > mm/util.c:632

hi,
this should be already fixed via:
  https://lore.kernel.org/bpf/20231215100708.2265609-2-hou...@huaweicloud.com/

original report was in here:
  
https://lore.kernel.org/bpf/CABOYnLwwJY=yfagie59lfsusbaghfrovqbzz5edaxbfe3yi...@mail.gmail.com/

the fix should be in v6.7, can you check if your kernel has:
  8b2efe51ba85 bpf: Limit the number of uprobes when attaching program to 
multiple uprobes

thanks,
jirka


> > Modules linked in:
> > CPU: 1 PID: 10075 Comm: syz-executor.3 Not tainted 6.7.0 #2
> > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 
> > 04/01/2014
> > RIP: 0010:kvmalloc_node+0x199/0x1b0 mm/util.c:632
> > Code: 02 1d 00 eb aa e8 a7 49 c6 ff 41 81 e5 00 20 00 00 31 ff 44 89 ee e8 
> > 36 45 c6 ff 45 85 ed 0f 85 1b ff ff ff e8 88 49 c6 ff 90 <0f> 0b 90 e9 dd 
> > fe ff ff 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40
> > RSP: 0018:c90002007b60 EFLAGS: 00010212
> > RAX: 23e4 RBX: 0400 RCX: c90003aaa000
> > RDX: 0004 RSI: 81c3acc8 RDI: 0005
> > RBP: 0037cec8 R08: 0005 R09: 
> > R10:  R11:  R12: 
> > R13:  R14:  R15: 88805ff6e1b8
> > FS:  7fc62205f640() GS:88807ec0() knlGS:
> > CS:  0010 DS:  ES:  CR0: 80050033
> > CR2: 001b2e026000 CR3: 5f338000 CR4: 00750ef0
> > DR0:  DR1:  DR2: 
> > DR3:  DR6: fffe0ff0 DR7: 0400
> > PKRU: 5554
> > Call Trace:
> >  
> >  kvmalloc include/linux/slab.h:738 [inline]
> >  kvmalloc_array include/linux/slab.h:756 [inline]
> >  kvcalloc include/linux/slab.h:761 [inline]
> >  bpf_uprobe_multi_link_attach+0x3fe/0xf60 kernel/trace/bpf_trace.c:3239
> >  link_create kernel/bpf/syscall.c:5012 [inline]
> >  __sys_bpf+0x2e85/0x4e00 kernel/bpf/syscall.c:5453
> >  __do_sys_bpf kernel/bpf/syscall.c:5487 [inline]
> >  __se_sys_bpf kernel/bpf/syscall.c:5485 [inline]
> >  __x64_sys_bpf+0x78/0xc0 kernel/bpf/syscall.c:5485
> >  do_syscall_x64 arch/x86/entry/common.c:52 [inline]
> >  do_syscall_64+0x43/0x120 arch/x86/entry/common.c:83
> >  entry_SYSCALL_64_after_hwframe+0x6f/0x77
> > RIP: 0033:0x7fc62128fd6d
> > Code: c3 e8 97 2b 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 
> > 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff 
> > ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
> > RSP: 002b:7fc62205f028 EFLAGS: 0246 ORIG_RAX: 0141
> > RAX: ffda RBX: 7fc6213cbf80 RCX: 7fc62128fd6d
> > RDX: 0040 RSI: 21c0 RDI: 001c
> > RBP: 7fc6212f14cd R08:  R09: 
> > R10:  R11: 0246 R12: 
> > R13: 000b R14: 7fc6213cbf80 R15: 7fc62203f000
> >  
> >
> > Thank you for taking the time to read this email and we look forward to 
> > working with you further.
> >
> >
> >



Re: WARNING: kmalloc bug in bpf_uprobe_multi_link_attach

2024-05-15 Thread Alexei Starovoitov
On Tue, May 14, 2024 at 12:33 AM Ubisectech Sirius
 wrote:
>
> Hello.
> We are Ubisectech Sirius Team, the vulnerability lab of China ValiantSec. 
> Recently, our team has discovered an issue in Linux kernel 6.7. Attached to
> the email was a PoC file for the issue.

Jiri,

please take a look.

> Stack dump:
>
> loop3: detected capacity change from 0 to 8
> MTD: Attempt to mount non-MTD device "/dev/loop3"
> [ cut here ]
> WARNING: CPU: 1 PID: 10075 at mm/util.c:632 kvmalloc_node+0x199/0x1b0 
> mm/util.c:632
> Modules linked in:
> CPU: 1 PID: 10075 Comm: syz-executor.3 Not tainted 6.7.0 #2
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 
> 04/01/2014
> RIP: 0010:kvmalloc_node+0x199/0x1b0 mm/util.c:632
> Code: 02 1d 00 eb aa e8 a7 49 c6 ff 41 81 e5 00 20 00 00 31 ff 44 89 ee e8 36 
> 45 c6 ff 45 85 ed 0f 85 1b ff ff ff e8 88 49 c6 ff 90 <0f> 0b 90 e9 dd fe ff 
> ff 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40
> RSP: 0018:c90002007b60 EFLAGS: 00010212
> RAX: 23e4 RBX: 0400 RCX: c90003aaa000
> RDX: 0004 RSI: 81c3acc8 RDI: 0005
> RBP: 0037cec8 R08: 0005 R09: 
> R10:  R11:  R12: 
> R13:  R14:  R15: 88805ff6e1b8
> FS:  7fc62205f640() GS:88807ec0() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 001b2e026000 CR3: 5f338000 CR4: 00750ef0
> DR0:  DR1:  DR2: 
> DR3:  DR6: fffe0ff0 DR7: 0400
> PKRU: 5554
> Call Trace:
>  
>  kvmalloc include/linux/slab.h:738 [inline]
>  kvmalloc_array include/linux/slab.h:756 [inline]
>  kvcalloc include/linux/slab.h:761 [inline]
>  bpf_uprobe_multi_link_attach+0x3fe/0xf60 kernel/trace/bpf_trace.c:3239
>  link_create kernel/bpf/syscall.c:5012 [inline]
>  __sys_bpf+0x2e85/0x4e00 kernel/bpf/syscall.c:5453
>  __do_sys_bpf kernel/bpf/syscall.c:5487 [inline]
>  __se_sys_bpf kernel/bpf/syscall.c:5485 [inline]
>  __x64_sys_bpf+0x78/0xc0 kernel/bpf/syscall.c:5485
>  do_syscall_x64 arch/x86/entry/common.c:52 [inline]
>  do_syscall_64+0x43/0x120 arch/x86/entry/common.c:83
>  entry_SYSCALL_64_after_hwframe+0x6f/0x77
> RIP: 0033:0x7fc62128fd6d
> Code: c3 e8 97 2b 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 
> 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 
> 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
> RSP: 002b:7fc62205f028 EFLAGS: 0246 ORIG_RAX: 0141
> RAX: ffda RBX: 7fc6213cbf80 RCX: 7fc62128fd6d
> RDX: 0040 RSI: 21c0 RDI: 001c
> RBP: 7fc6212f14cd R08:  R09: 
> R10:  R11: 0246 R12: 
> R13: 000b R14: 7fc6213cbf80 R15: 7fc62203f000
>  
>
> Thank you for taking the time to read this email and we look forward to 
> working with you further.
>
>
>



WARNING: kmalloc bug in bpf_uprobe_multi_link_attach

2024-05-14 Thread Ubisectech Sirius
Hello.
We are Ubisectech Sirius Team, the vulnerability lab of China ValiantSec. 
Recently, our team has discovered an issue in Linux kernel 6.7. Attached to the
email was a PoC file for the issue.

Stack dump:

loop3: detected capacity change from 0 to 8
MTD: Attempt to mount non-MTD device "/dev/loop3"
[ cut here ]
WARNING: CPU: 1 PID: 10075 at mm/util.c:632 kvmalloc_node+0x199/0x1b0 
mm/util.c:632
Modules linked in:
CPU: 1 PID: 10075 Comm: syz-executor.3 Not tainted 6.7.0 #2
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
RIP: 0010:kvmalloc_node+0x199/0x1b0 mm/util.c:632
Code: 02 1d 00 eb aa e8 a7 49 c6 ff 41 81 e5 00 20 00 00 31 ff 44 89 ee e8 36 
45 c6 ff 45 85 ed 0f 85 1b ff ff ff e8 88 49 c6 ff 90 <0f> 0b 90 e9 dd fe ff ff 
66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40
RSP: 0018:c90002007b60 EFLAGS: 00010212
RAX: 23e4 RBX: 0400 RCX: c90003aaa000
RDX: 0004 RSI: 81c3acc8 RDI: 0005
RBP: 0037cec8 R08: 0005 R09: 
R10:  R11:  R12: 
R13:  R14:  R15: 88805ff6e1b8
FS:  7fc62205f640() GS:88807ec0() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 001b2e026000 CR3: 5f338000 CR4: 00750ef0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400
PKRU: 5554
Call Trace:
 
 kvmalloc include/linux/slab.h:738 [inline]
 kvmalloc_array include/linux/slab.h:756 [inline]
 kvcalloc include/linux/slab.h:761 [inline]
 bpf_uprobe_multi_link_attach+0x3fe/0xf60 kernel/trace/bpf_trace.c:3239
 link_create kernel/bpf/syscall.c:5012 [inline]
 __sys_bpf+0x2e85/0x4e00 kernel/bpf/syscall.c:5453
 __do_sys_bpf kernel/bpf/syscall.c:5487 [inline]
 __se_sys_bpf kernel/bpf/syscall.c:5485 [inline]
 __x64_sys_bpf+0x78/0xc0 kernel/bpf/syscall.c:5485
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0x43/0x120 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x6f/0x77
RIP: 0033:0x7fc62128fd6d
Code: c3 e8 97 2b 00 00 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 
89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 
c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:7fc62205f028 EFLAGS: 0246 ORIG_RAX: 0141
RAX: ffda RBX: 7fc6213cbf80 RCX: 7fc62128fd6d
RDX: 0040 RSI: 21c0 RDI: 001c
RBP: 7fc6212f14cd R08:  R09: 
R10:  R11: 0246 R12: 
R13: 000b R14: 7fc6213cbf80 R15: 7fc62203f000
 

Thank you for taking the time to read this email and we look forward to working 
with you further.





poc.c
Description: Binary data


Re: [PATCH v6.1.y-v4.19.y] vhost: use kzalloc() instead of kmalloc() followed by memset()

2024-02-13 Thread Greg KH
On Mon, Feb 05, 2024 at 10:49:37AM +0530, Ajay Kaher wrote:
> From: Prathu Baronia 
> 
> From: Prathu Baronia 
> 
> commit 4d8df0f5f79f747d75a7d356d9b9ea40a4e4c8a9 upstream
> 
> Use kzalloc() to allocate new zeroed out msg node instead of
> memsetting a node allocated with kmalloc().
> 
> Signed-off-by: Prathu Baronia 
> Message-Id: <20230522085019.42914-1-prathubaronia2...@gmail.com>
> Signed-off-by: Michael S. Tsirkin 
> Reviewed-by: Stefano Garzarella 
> [Ajay: This is a security fix as per CVE-2024-0340]

And this is why I am so grumpy about Red Hat and CVEs, they know how to
let us know about stuff like this, but no.  Hopefully, someday soon, they
will not be allowed to do this anymore.

{sigh}

Now queued up, thanks.

greg k-h



[PATCH v6.1.y-v4.19.y] vhost: use kzalloc() instead of kmalloc() followed by memset()

2024-02-04 Thread Ajay Kaher
From: Prathu Baronia 

From: Prathu Baronia 

commit 4d8df0f5f79f747d75a7d356d9b9ea40a4e4c8a9 upstream

Use kzalloc() to allocate new zeroed out msg node instead of
memsetting a node allocated with kmalloc().

Signed-off-by: Prathu Baronia 
Message-Id: <20230522085019.42914-1-prathubaronia2...@gmail.com>
Signed-off-by: Michael S. Tsirkin 
Reviewed-by: Stefano Garzarella 
[Ajay: This is a security fix as per CVE-2024-0340]
Signed-off-by: Ajay Kaher 
---
 drivers/vhost/vhost.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 07427302084955..ecb3b397bb3888 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2563,12 +2563,11 @@ EXPORT_SYMBOL_GPL(vhost_disable_notify);
 /* Create a new message. */
 struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type)
 {
-	struct vhost_msg_node *node = kmalloc(sizeof *node, GFP_KERNEL);
+	/* Make sure all padding within the structure is initialized. */
+	struct vhost_msg_node *node = kzalloc(sizeof(*node), GFP_KERNEL);
 	if (!node)
 		return NULL;
 
-	/* Make sure all padding within the structure is initialized. */
-	memset(&node->msg, 0, sizeof node->msg);
 	node->vq = vq;
 	node->msg.type = type;
 	return node;
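
Why whole-struct zeroing matters here: the msg node is later handed back to userspace (this is the info leak CVE-2024-0340 references), and compiler-inserted padding would otherwise carry stale heap bytes. A standalone userspace sketch of the effect (the struct layout below is hypothetical, not vhost's):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct msg {
	int type;	/* 4 bytes */
			/* 4 bytes of compiler padding on LP64 */
	long payload;
};

int main(void)
{
	struct msg *m = malloc(sizeof(*m));

	if (!m)
		return 1;
	m->type = 1;
	memset(&m->payload, 0, sizeof(m->payload));	/* member-wise zeroing */
	/* The padding bytes between type and payload still hold whatever
	 * malloc() returned; copying *m to userspace would leak them.
	 * calloc()/kzalloc()-style whole-object zeroing avoids this. */
	printf("a padding byte: %02x\n", ((unsigned char *)m)[4]);
	free(m);
	return 0;
}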



Re: WARNING: kmalloc bug in bpf_uprobe_multi_link_attach

2023-12-11 Thread Jiri Olsa
On Mon, Dec 11, 2023 at 02:01:43PM +0100, Jiri Olsa wrote:
> On Mon, Dec 11, 2023 at 07:29:40PM +0800, Hou Tao wrote:
> 
> SNIP
> 
> > 
> > It seems a big attr->link_create.uprobe_multi.cnt is passed to
> > bpf_uprobe_multi_link_attach(). Could you please try the first patch in
> > the following patch set ?
> > 
> > https://lore.kernel.org/bpf/2023122843.4147157-1-hou...@huaweicloud.com/T/#t
> > > [   68.389633][ T8223]  ? __might_fault+0x13f/0x1a0
> > > [   68.390129][ T8223]  ? bpf_kprobe_multi_link_attach+0x10/0x10
> > 
> > SNIP
> > >   res = syscall(__NR_bpf, /*cmd=*/5ul, /*arg=*/0x2140ul, 
> > > /*size=*/0x90ul);
> > >   if (res != -1) r[0] = res;
> > >   memcpy((void*)0x2000, "./file0\000", 8);
> > >   syscall(__NR_creat, /*file=*/0x2000ul, /*mode=*/0ul);
> > >   *(uint32_t*)0x2340 = r[0];
> > >   *(uint32_t*)0x2344 = 0;
> > >   *(uint32_t*)0x2348 = 0x30;
> > >   *(uint32_t*)0x234c = 0;
> > >   *(uint64_t*)0x2350 = 0x2080;
> > >   memcpy((void*)0x2080, "./file0\000", 8);
> > 
> > 0x2350 is the address of attr->link_create.uprobe_multi.path.
> > >   *(uint64_t*)0x2358 = 0x20c0;
> > >   *(uint64_t*)0x20c0 = 0;
> > >   *(uint64_t*)0x2360 = 0;
> > >   *(uint64_t*)0x2368 = 0;
> > >   *(uint32_t*)0x2370 = 0xff1f;
> > 
> > The value of attr->link_create.uprobe_multi.cnt is 0xff1f, so 
> > 0xff1f * sizeof(bpf_uprobe) will be greater than INT_MAX, and
> > triggers the warning in mm/util.c:
> > 
> 	/* Don't even allow crazy sizes */
> 	if (unlikely(size > INT_MAX)) {
> 		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
> 		return NULL;
> 	}
> > 
> > Adding __GFP_NOWARN when doing kvcalloc() can fix the warning.
> 
> hi,
> looks like that's the case.. thanks for fixing that
> 
> btw while checking on that I found kprobe_multi bench attach test
> takes forever on latest bpf-next/master
> 
>   
> test_kprobe_multi_bench_attach:PASS:bpf_program__attach_kprobe_multi_opts 0 
> nsec
>   test_kprobe_multi_bench_attach: found 56140 functions
>   test_kprobe_multi_bench_attach: attached in  89.174s
>   test_kprobe_multi_bench_attach: detached in  13.245s
>   #113/1   kprobe_multi_bench_attach/kernel:OK
> 
> Masami,
> any idea of any change on fprobe/ftrace side recently? I'm going to check ;-)

nah sry, I had IBT enabled.. forgot the reason, but it's slow ;-)

jirka



Re: WARNING: kmalloc bug in bpf_uprobe_multi_link_attach

2023-12-11 Thread Jiri Olsa
On Mon, Dec 11, 2023 at 07:29:40PM +0800, Hou Tao wrote:

SNIP

> 
> It seems a big attr->link_create.uprobe_multi.cnt is passed to
> bpf_uprobe_multi_link_attach(). Could you please try the first patch in
> the following patch set ?
> 
> https://lore.kernel.org/bpf/2023122843.4147157-1-hou...@huaweicloud.com/T/#t
> > [   68.389633][ T8223]  ? __might_fault+0x13f/0x1a0
> > [   68.390129][ T8223]  ? bpf_kprobe_multi_link_attach+0x10/0x10
> 
> SNIP
> >   res = syscall(__NR_bpf, /*cmd=*/5ul, /*arg=*/0x2140ul, 
> > /*size=*/0x90ul);
> >   if (res != -1) r[0] = res;
> >   memcpy((void*)0x2000, "./file0\000", 8);
> >   syscall(__NR_creat, /*file=*/0x2000ul, /*mode=*/0ul);
> >   *(uint32_t*)0x2340 = r[0];
> >   *(uint32_t*)0x2344 = 0;
> >   *(uint32_t*)0x2348 = 0x30;
> >   *(uint32_t*)0x234c = 0;
> >   *(uint64_t*)0x2350 = 0x2080;
> >   memcpy((void*)0x2080, "./file0\000", 8);
> 
> 0x2350 is the address of attr->link_create.uprobe_multi.path.
> >   *(uint64_t*)0x2358 = 0x20c0;
> >   *(uint64_t*)0x20c0 = 0;
> >   *(uint64_t*)0x2360 = 0;
> >   *(uint64_t*)0x2368 = 0;
> >   *(uint32_t*)0x2370 = 0xff1f;
> 
> The value of attr->link_create.uprobe_multi.cnt is 0xff1f, so 
> 0xff1f * sizeof(bpf_uprobe) will be greater than INT_MAX, and
> triggers the warning in mm/util.c:
> 
	/* Don't even allow crazy sizes */
	if (unlikely(size > INT_MAX)) {
		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
		return NULL;
	}
> 
> Adding __GFP_NOWARN when doing kvcalloc() can fix the warning.

hi,
looks like that's the case.. thanks for fixing that

btw while checking on that I found kprobe_multi bench attach test
takes forever on latest bpf-next/master


test_kprobe_multi_bench_attach:PASS:bpf_program__attach_kprobe_multi_opts 0 nsec
test_kprobe_multi_bench_attach: found 56140 functions
test_kprobe_multi_bench_attach: attached in  89.174s
test_kprobe_multi_bench_attach: detached in  13.245s
#113/1   kprobe_multi_bench_attach/kernel:OK

Masami,
any idea of any change on fprobe/ftrace side recently? I'm going to check ;-)

thanks,
jirka



Re: WARNING: kmalloc bug in bpf_uprobe_multi_link_attach

2023-12-11 Thread Hou Tao
Hi,

On 12/11/2023 4:12 PM, xingwei lee wrote:
> Sorry, the earlier mail contained an HTML part; resending it here.
> Hello, I found a bug in net/bpf in the latest upstream linux and
> latest net tree.
> WARNING: kmalloc bug in bpf_uprobe_multi_link_attach
>
> kernel: net 28a7cb045ab700de5554193a1642917602787784
> Kernel config: 
> https://github.com/google/syzkaller/commits/fc59b78e3174009510ed15f20665e7ab2435ebee
>
> in the latest net tree, the crash looks like:
>
> [   68.363836][ T8223] [ cut here ]
> [   68.364967][ T8223] WARNING: CPU: 2 PID: 8223 at mm/util.c:632
> kvmalloc_node+0x18a/0x1a0
> [   68.366527][ T8223] Modules linked in:
> [   68.367882][ T8223] CPU: 2 PID: 8223 Comm: 36d Not tainted
> 6.7.0-rc4-00146-g28a7cb045ab7 #2
> [   68.369260][ T8223] Hardware name: QEMU Standard PC (i440FX + PIIX,
> 1996), BIOS 1.16.2-1.fc38 04/014
> [   68.370811][ T8223] RIP: 0010:kvmalloc_node+0x18a/0x1a0
> [   68.371689][ T8223] Code: dc 1c 00 eb aa e8 86 33 c6 ff 41 81 e4 00
> 20 00 00 31 ff 44 89 e6 e8 e5 20
> [   68.375001][ T8223] RSP: 0018:c9001088fb68 EFLAGS: 00010293
> [   68.375989][ T8223] RAX:  RBX: 0037cec8
> RCX: 81c1a32b
> [   68.377154][ T8223] RDX: 88802cc00040 RSI: 81c1a339
> RDI: 0005
> [   68.377950][ T8223] RBP: 0400 R08: 0005
> R09: 
> [   68.378744][ T8223] R10:  R11: 
> R12: 
> [   68.379523][ T8223] R13:  R14: 888017eb4a28
> R15: 
> [   68.380307][ T8223] FS:  00827380()
> GS:8880b990() knlGS:
> [   68.381185][ T8223] CS:  0010 DS:  ES:  CR0: 80050033
> [   68.381843][ T8223] CR2: 2140 CR3: 204d2000
> CR4: 00750ef0
> [   68.382624][ T8223] PKRU: 5554
> [   68.382978][ T8223] Call Trace:
> [   68.383312][ T8223]  
> [   68.383608][ T8223]  ? show_regs+0x8f/0xa0
> [   68.384052][ T8223]  ? __warn+0xe6/0x390
> [   68.384470][ T8223]  ? kvmalloc_node+0x18a/0x1a0
> [   68.385111][ T8223]  ? report_bug+0x3b9/0x580
> [   68.385585][ T8223]  ? handle_bug+0x67/0x90
> [   68.386032][ T8223]  ? exc_invalid_op+0x17/0x40
> [   68.386503][ T8223]  ? asm_exc_invalid_op+0x1a/0x20
> [   68.387065][ T8223]  ? kvmalloc_node+0x17b/0x1a0
> [   68.387551][ T8223]  ? kvmalloc_node+0x189/0x1a0
> [   68.388051][ T8223]  ? kvmalloc_node+0x18a/0x1a0
> [   68.388537][ T8223]  ? kvmalloc_node+0x189/0x1a0
> [   68.389038][ T8223]  bpf_uprobe_multi_link_attach+0x436/0xfb0

It seems a big attr->link_create.uprobe_multi.cnt is passed to
bpf_uprobe_multi_link_attach(). Could you please try the first patch in
the following patch set ?

https://lore.kernel.org/bpf/2023122843.4147157-1-hou...@huaweicloud.com/T/#t
> [   68.389633][ T8223]  ? __might_fault+0x13f/0x1a0
> [   68.390129][ T8223]  ? bpf_kprobe_multi_link_attach+0x10/0x10

SNIP
>   res = syscall(__NR_bpf, /*cmd=*/5ul, /*arg=*/0x2140ul, /*size=*/0x90ul);
>   if (res != -1) r[0] = res;
>   memcpy((void*)0x2000, "./file0\000", 8);
>   syscall(__NR_creat, /*file=*/0x2000ul, /*mode=*/0ul);
>   *(uint32_t*)0x2340 = r[0];
>   *(uint32_t*)0x2344 = 0;
>   *(uint32_t*)0x2348 = 0x30;
>   *(uint32_t*)0x234c = 0;
>   *(uint64_t*)0x2350 = 0x2080;
>   memcpy((void*)0x2080, "./file0\000", 8);

0x2350 is the address of attr->link_create.uprobe_multi.path.
>   *(uint64_t*)0x2358 = 0x20c0;
>   *(uint64_t*)0x20c0 = 0;
>   *(uint64_t*)0x2360 = 0;
>   *(uint64_t*)0x2368 = 0;
>   *(uint32_t*)0x2370 = 0xff1f;

The value of attr->link_create.uprobe_multi.cnt is 0xff1f, so 
0xff1f * sizeof(bpf_uprobe) will be greater than INT_MAX, and
triggers the warning in mm/util.c:

	/* Don't even allow crazy sizes */
	if (unlikely(size > INT_MAX)) {
		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
		return NULL;
	}

Adding __GFP_NOWARN when doing kvcalloc() can fix the warning.
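
A worked userspace check of that arithmetic (a sketch with assumed values: the hex constant quoted above looks truncated by the archive, so a 32-bit value of that order is used here, and sizeof(struct bpf_uprobe) is a made-up stand-in; the only point is that cnt * size crosses INT_MAX):

#include <limits.h>
#include <stdio.h>

int main(void)
{
	/* assumed full value of attr->link_create.uprobe_multi.cnt */
	unsigned long cnt  = 0xffffff1ful;
	/* assumed stand-in for sizeof(struct bpf_uprobe) */
	unsigned long size = 64;
	unsigned long total = cnt * size;	/* 64-bit math, as kvcalloc() does on LP64 */

	printf("%lu * %lu = %lu, INT_MAX = %d: %s\n", cnt, size, total, INT_MAX,
	       total > INT_MAX ? "WARN_ON_ONCE fires unless __GFP_NOWARN is set"
			       : "ok");
	return 0;
}
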
>   *(uint32_t*)0x2374 = 0;
>   *(uint32_t*)0x2378 = 0;
>   syscall(__NR_bpf, /*cmd=*/0x1cul, /*arg=*/0x2340ul, /*size=*/0x40ul);
>   return 0;
> }
>
> =* repro.txt =*
> r0 = bpf$PROG_LOAD(0x5, &(0x7f000140)={0x2, 0x3,
> &(0x7f000200)=@framed, &(0x7f000240)='GPL\x00', 0x0, 0x0, 0x0,
> 0x0, 0x0, '\x00', 0x0, 0x30, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
> 0x0, 0x0, 0x0, 0x0, 0x0}, 0x90)
> creat(&(0x7f00)='./file0\x00', 0x0)
> bpf$BPF_LINK_CREATE_XDP(0x1c, &(0x7f000340)={r0, 0x0, 0x30, 0x0,
> @val=@uprobe_multi={&(0x7f80)='./file0\x00',
> &(0x7fc0)=[0x0], 0x0, 0x0, 0xff1f}}, 0x40
>
>
> See also https://gist.github.com/xrivendell7/15d43946c73aa13247b4b20b68798aaa
>
> .




Re: WARNING: kmalloc bug in bpf_uprobe_multi_link_attach

2023-12-11 Thread Google
On Mon, 11 Dec 2023 16:10:32 +0800
xingwei lee  wrote:

> Hello, I found a bug in net/bpf in the latest upstream linux and latest
> net tree.
> WARNING: kmalloc bug in bpf_uprobe_multi_link_attach

Hmm, uprobe_multi was introduced recently and it seems to use normal
uprobes, unlike kprobe_multi (which uses fprobe instead of kprobe).

The warning warns that the required size is bigger than INT_MAX. Maybe
too many links or uprobes were going to be allocated?

Thanks,

> 
> kernel: net 28a7cb045ab700de5554193a1642917602787784
> Kernel config:
> https://github.com/google/syzkaller/commits/fc59b78e3174009510ed15f20665e7ab2435ebee
> 
> in the latest net tree, the crash looks like:
> 
> [   68.363836][ T8223] [ cut here ]
> [   68.364967][ T8223] WARNING: CPU: 2 PID: 8223 at mm/util.c:632
> kvmalloc_node+0x18a/0x1a0
> [   68.366527][ T8223] Modules linked in:
> [   68.367882][ T8223] CPU: 2 PID: 8223 Comm: 36d Not tainted
> 6.7.0-rc4-00146-g28a7cb045ab7 #2
> [   68.369260][ T8223] Hardware name: QEMU Standard PC (i440FX + PIIX,
> 1996), BIOS 1.16.2-1.fc38 04/014
> [   68.370811][ T8223] RIP: 0010:kvmalloc_node+0x18a/0x1a0
> [   68.371689][ T8223] Code: dc 1c 00 eb aa e8 86 33 c6 ff 41 81 e4 00 20
> 00 00 31 ff 44 89 e6 e8 e5 20
> [   68.375001][ T8223] RSP: 0018:c9001088fb68 EFLAGS: 00010293
> [   68.375989][ T8223] RAX:  RBX: 0037cec8 RCX:
> 81c1a32b
> [   68.377154][ T8223] RDX: 88802cc00040 RSI: 81c1a339 RDI:
> 0005
> [   68.377950][ T8223] RBP: 0400 R08: 0005 R09:
> 
> [   68.378744][ T8223] R10:  R11:  R12:
> 
> [   68.379523][ T8223] R13:  R14: 888017eb4a28 R15:
> 
> [   68.380307][ T8223] FS:  00827380()
> GS:8880b990() knlGS:
> [   68.381185][ T8223] CS:  0010 DS:  ES:  CR0: 80050033
> [   68.381843][ T8223] CR2: 2140 CR3: 204d2000 CR4:
> 00750ef0
> [   68.382624][ T8223] PKRU: 5554
> [   68.382978][ T8223] Call Trace:
> [   68.383312][ T8223]  
> [   68.383608][ T8223]  ? show_regs+0x8f/0xa0
> [   68.384052][ T8223]  ? __warn+0xe6/0x390
> [   68.384470][ T8223]  ? kvmalloc_node+0x18a/0x1a0
> [   68.385111][ T8223]  ? report_bug+0x3b9/0x580
> [   68.385585][ T8223]  ? handle_bug+0x67/0x90
> [   68.386032][ T8223]  ? exc_invalid_op+0x17/0x40
> [   68.386503][ T8223]  ? asm_exc_invalid_op+0x1a/0x20
> [   68.387065][ T8223]  ? kvmalloc_node+0x17b/0x1a0
> [   68.387551][ T8223]  ? kvmalloc_node+0x189/0x1a0
> [   68.388051][ T8223]  ? kvmalloc_node+0x18a/0x1a0
> [   68.388537][ T8223]  ? kvmalloc_node+0x189/0x1a0
> [   68.389038][ T8223]  bpf_uprobe_multi_link_attach+0x436/0xfb0
> [   68.389633][ T8223]  ? __might_fault+0x13f/0x1a0
> [   68.390129][ T8223]  ? bpf_kprobe_multi_link_attach+0x10/0x10
> [   68.390731][ T8223]  ? __fget_light+0x1fc/0x260
> [   68.391206][ T8223]  ? __sanitizer_cov_trace_switch+0x54/0x90
> [   68.391812][ T8223]  __sys_bpf+0x3ea0/0x4840
> [   68.392267][ T8223]  ? slab_free_freelist_hook+0x114/0x1e0
> [   68.393032][ T8223]  ? bpf_perf_link_attach+0x540/0x540
> [   68.393580][ T8223]  ? putname+0x12e/0x170
> [   68.394015][ T8223]  ? kmem_cache_free+0xf8/0x350
> [   68.394509][ T8223]  ? putname+0x12e/0x170
> [   68.394948][ T8223]  ? do_sys_openat2+0xb1/0x1e0
> [   68.395442][ T8223]  ? __x64_sys_creat+0xcd/0x120
> [   68.395945][ T8223]  __x64_sys_bpf+0x78/0xc0
> [   68.396393][ T8223]  ? syscall_enter_from_user_mode+0x7f/0x120
> [   68.397040][ T8223]  do_syscall_64+0x41/0x110
> [   68.397502][ T8223]  entry_SYSCALL_64_after_hwframe+0x63/0x6b
> [   68.398098][ T8223] RIP: 0033:0x410ead
> [   68.398498][ T8223] Code: b3 66 2e 0f 1f 84 00 00 00 00 00 66 90 f3 0f
> 1e fa 48 89 f8 48 89 f7 48 88
> [   68.400432][ T8223] RSP: 002b:7ffdbabd7098 EFLAGS: 0246
> ORIG_RAX: 0141
> [   68.401271][ T8223] RAX: ffda RBX: 7ffdbabd7298 RCX:
> 00410ead
> [   68.402063][ T8223] RDX: 0040 RSI: 2340 RDI:
> 001c
> [   68.402864][ T8223] RBP: 7ffdbabd70b0 R08:  R09:
> 
> [   68.403649][ T8223] R10:  R11: 0246 R12:
> 0049be68
> [   68.404786][ T8223] R13: 0001 R14: 0001 R15:
> 0001
> [   68.405574][ T8223]  
> [   68.405890][ T8223] Kernel panic - not syncing: kernel: panic_on_warn
> set ...
> [   68.406611][ T8223] CPU: 2 PID: 8223 Comm: 36d Not tainted
> 6.7.0-rc4-00146-g28a7cb045ab7 #2
> [   68.407453][ T8223] Hardware name: QEMU Standard PC (i440FX + PIIX,
> 1996

WARNING: kmalloc bug in bpf_uprobe_multi_link_attach

2023-12-11 Thread xingwei lee
Sorry, the earlier mail contained an HTML part; resending it here.
Hello, I found a bug in net/bpf in the latest upstream linux and
latest net tree.
WARNING: kmalloc bug in bpf_uprobe_multi_link_attach

kernel: net 28a7cb045ab700de5554193a1642917602787784
Kernel config: 
https://github.com/google/syzkaller/commits/fc59b78e3174009510ed15f20665e7ab2435ebee

in the latest net tree, the crash looks like:

[   68.363836][ T8223] [ cut here ]
[   68.364967][ T8223] WARNING: CPU: 2 PID: 8223 at mm/util.c:632
kvmalloc_node+0x18a/0x1a0
[   68.366527][ T8223] Modules linked in:
[   68.367882][ T8223] CPU: 2 PID: 8223 Comm: 36d Not tainted
6.7.0-rc4-00146-g28a7cb045ab7 #2
[   68.369260][ T8223] Hardware name: QEMU Standard PC (i440FX + PIIX,
1996), BIOS 1.16.2-1.fc38 04/014
[   68.370811][ T8223] RIP: 0010:kvmalloc_node+0x18a/0x1a0
[   68.371689][ T8223] Code: dc 1c 00 eb aa e8 86 33 c6 ff 41 81 e4 00
20 00 00 31 ff 44 89 e6 e8 e5 20
[   68.375001][ T8223] RSP: 0018:c9001088fb68 EFLAGS: 00010293
[   68.375989][ T8223] RAX:  RBX: 0037cec8
RCX: 81c1a32b
[   68.377154][ T8223] RDX: 88802cc00040 RSI: 81c1a339
RDI: 0005
[   68.377950][ T8223] RBP: 0400 R08: 0005
R09: 
[   68.378744][ T8223] R10:  R11: 
R12: 
[   68.379523][ T8223] R13:  R14: 888017eb4a28
R15: 
[   68.380307][ T8223] FS:  00827380()
GS:8880b990() knlGS:
[   68.381185][ T8223] CS:  0010 DS:  ES:  CR0: 80050033
[   68.381843][ T8223] CR2: 2140 CR3: 204d2000
CR4: 00750ef0
[   68.382624][ T8223] PKRU: 5554
[   68.382978][ T8223] Call Trace:
[   68.383312][ T8223]  
[   68.383608][ T8223]  ? show_regs+0x8f/0xa0
[   68.384052][ T8223]  ? __warn+0xe6/0x390
[   68.384470][ T8223]  ? kvmalloc_node+0x18a/0x1a0
[   68.385111][ T8223]  ? report_bug+0x3b9/0x580
[   68.385585][ T8223]  ? handle_bug+0x67/0x90
[   68.386032][ T8223]  ? exc_invalid_op+0x17/0x40
[   68.386503][ T8223]  ? asm_exc_invalid_op+0x1a/0x20
[   68.387065][ T8223]  ? kvmalloc_node+0x17b/0x1a0
[   68.387551][ T8223]  ? kvmalloc_node+0x189/0x1a0
[   68.388051][ T8223]  ? kvmalloc_node+0x18a/0x1a0
[   68.388537][ T8223]  ? kvmalloc_node+0x189/0x1a0
[   68.389038][ T8223]  bpf_uprobe_multi_link_attach+0x436/0xfb0
[   68.389633][ T8223]  ? __might_fault+0x13f/0x1a0
[   68.390129][ T8223]  ? bpf_kprobe_multi_link_attach+0x10/0x10
[   68.390731][ T8223]  ? __fget_light+0x1fc/0x260
[   68.391206][ T8223]  ? __sanitizer_cov_trace_switch+0x54/0x90
[   68.391812][ T8223]  __sys_bpf+0x3ea0/0x4840
[   68.392267][ T8223]  ? slab_free_freelist_hook+0x114/0x1e0
[   68.393032][ T8223]  ? bpf_perf_link_attach+0x540/0x540
[   68.393580][ T8223]  ? putname+0x12e/0x170
[   68.394015][ T8223]  ? kmem_cache_free+0xf8/0x350
[   68.394509][ T8223]  ? putname+0x12e/0x170
[   68.394948][ T8223]  ? do_sys_openat2+0xb1/0x1e0
[   68.395442][ T8223]  ? __x64_sys_creat+0xcd/0x120
[   68.395945][ T8223]  __x64_sys_bpf+0x78/0xc0
[   68.396393][ T8223]  ? syscall_enter_from_user_mode+0x7f/0x120
[   68.397040][ T8223]  do_syscall_64+0x41/0x110
[   68.397502][ T8223]  entry_SYSCALL_64_after_hwframe+0x63/0x6b
[   68.398098][ T8223] RIP: 0033:0x410ead
[   68.398498][ T8223] Code: b3 66 2e 0f 1f 84 00 00 00 00 00 66 90 f3
0f 1e fa 48 89 f8 48 89 f7 48 88
[   68.400432][ T8223] RSP: 002b:7ffdbabd7098 EFLAGS: 0246
ORIG_RAX: 0141
[   68.401271][ T8223] RAX: ffda RBX: 7ffdbabd7298
RCX: 00410ead
[   68.402063][ T8223] RDX: 0040 RSI: 2340
RDI: 001c
[   68.402864][ T8223] RBP: 7ffdbabd70b0 R08: 
R09: 
[   68.403649][ T8223] R10:  R11: 0246
R12: 0049be68
[   68.404786][ T8223] R13: 0001 R14: 0001
R15: 0001
[   68.405574][ T8223]  
[   68.405890][ T8223] Kernel panic - not syncing: kernel: panic_on_warn set ...
[   68.406611][ T8223] CPU: 2 PID: 8223 Comm: 36d Not tainted
6.7.0-rc4-00146-g28a7cb045ab7 #2
[   68.407453][ T8223] Hardware name: QEMU Standard PC (i440FX + PIIX,
1996), BIOS 1.16.2-1.fc38 04/014
[   68.408386][ T8223] Call Trace:
[   68.408723][ T8223]  
[   68.409023][ T8223]  dump_stack_lvl+0xd3/0x1b0
[   68.409484][ T8223]  panic+0x6dc/0x790
[   68.409884][ T8223]  ? panic_smp_self_stop+0xa0/0xa0
[   68.410408][ T8223]  ? show_trace_log_lvl+0x363/0x4f0
[   68.410947][ T8223]  ? check_panic_on_warn+0x1f/0xb0
[   68.411458][ T8223]  ? kvmalloc_node+0x18a/0x1a0
[   68.411955][ T8223]  check_panic_on_warn+0xab/0xb0
[   68.412453][ T8223]  __warn+0xf2/0x390
[   68.412855][ T8223]  ? kvmalloc_node+0x18a/0x1a0
[   68.413334][ T8223]  report_bug+0x3b9/0x580
[   68.413778][ T8223]  handle_bug+0x67/0x90
[   68.414195][ T8223]  exc_invalid_op+0x17/0x40
[   68.414651][ T8223

[PATCH v3 3/5] params: Use size_add() for kmalloc()

2023-11-20 Thread Andy Shevchenko
Prevent allocations from integer overflow by using size_add().

Reviewed-by: Luis Chamberlain 
Reviewed-by: Kees Cook 
Signed-off-by: Andy Shevchenko 
---
 kernel/params.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/params.c b/kernel/params.c
index f8e3c4139854..c3a029fe183d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -11,6 +11,7 @@
 #include <linux/kstrtox.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/overflow.h>
 #include <linux/security.h>
 #include <linux/slab.h>
 #include <linux/string.h>
@@ -48,7 +49,7 @@ static void *kmalloc_parameter(unsigned int size)
 {
struct kmalloced_param *p;
 
-   p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
+   p = kmalloc(size_add(sizeof(*p), size), GFP_KERNEL);
if (!p)
return NULL;
 
-- 
2.43.0.rc1.1.gbec44491f096
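
For illustration, a userspace sketch of what size_add() buys (semantics modeled on include/linux/overflow.h, where the helper saturates to SIZE_MAX on overflow so the following kmalloc() fails cleanly instead of returning an undersized buffer; the 16-byte header is a made-up stand-in for sizeof(struct kmalloced_param)):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static size_t size_add_sketch(size_t a, size_t b)
{
	size_t sum = a + b;

	return (sum < a) ? SIZE_MAX : sum;	/* saturate on wraparound */
}

int main(void)
{
	size_t hdr  = 16;		/* stand-in header size */
	size_t size = SIZE_MAX - 8;	/* hostile, huge length */

	printf("naive sum: %zu\n", hdr + size);			 /* wraps to 7 */
	printf("size_add:  %zu\n", size_add_sketch(hdr, size));	 /* SIZE_MAX */
	return 0;
}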




[PATCH v2 3/5] params: Use size_add() for kmalloc()

2023-10-02 Thread Andy Shevchenko
Prevent allocations from integer overflow by using size_add().

Signed-off-by: Andy Shevchenko 
---
 kernel/params.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/params.c b/kernel/params.c
index f8e3c4139854..c3a029fe183d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -11,6 +11,7 @@
 #include <linux/kstrtox.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/overflow.h>
 #include <linux/security.h>
 #include <linux/slab.h>
 #include <linux/string.h>
@@ -48,7 +49,7 @@ static void *kmalloc_parameter(unsigned int size)
 {
struct kmalloced_param *p;
 
-   p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
+   p = kmalloc(size_add(sizeof(*p), size), GFP_KERNEL);
if (!p)
return NULL;
 
-- 
2.40.0.1.gaa8946217a0b




Re: [PATCH v5] Randomized slab caches for kmalloc()

2023-09-11 Thread Kees Cook
On Mon, Sep 11, 2023 at 11:18:15PM +0200, jvoisin wrote:
> I wrote a small blogpost[1] about this series, and was told[2] that it
> would be interesting to share it on this thread, so here it is, copied
> verbatim:

Thanks for posting!

> Ruiqi Gong and Xiu Jianfeng got their
> [Randomized slab caches for
> kmalloc()](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3c6152940584290668b35fa0800026f6a1ae05fe)
> patch series merged upstream, and I've had enough discussions about it to
> warrant summarising them into a small blogpost.
> 
> The main idea is to have multiple slab caches, and pick one at random
> based on
> the address of code calling `kmalloc()` and a per-boot seed, to make
> heap-spraying harder.
> It's a great idea, but comes with some shortcomings for now:
> 
> - Objects being allocated via wrappers around `kmalloc()`, like
> `sock_kmalloc`,
>   `f2fs_kmalloc`, `aligned_kmalloc`, … will end up in the same slab cache.

I'd love to see some way to "unwrap" these kinds of allocators. Right
now we try to manually mark them so the debugging options can figure out
what did the allocation, but it's not complete by any means.

I'd kind of like to see a common front end that specified some set of
"do stuff" routines. e.g. to replace devm_kmalloc(), we could have:

void *alloc(size_t usable, gfp_t flags,
	    size_t (*prepare)(size_t, gfp_t, void *ctx),
	    void * (*finish)(size_t, gfp_t, void *ctx, void *allocated),
	    void *ctx)

ssize_t devm_prep(size_t usable, gfp_t *flags, void *ctx)
{
	ssize_t tot_size;

	if (unlikely(check_add_overflow(sizeof(struct devres),
					usable, &tot_size)))
		return -ENOMEM;

	tot_size = kmalloc_size_roundup(tot_size);
	*flags |= __GFP_ZERO;

	return tot_size;
}

void *devm_finish(size_t usable, gfp_t flags, void *ctx, void *allocated)
{
	struct devres *dr = allocated;
	struct device *dev = ctx;

	INIT_LIST_HEAD(&dr->node.entry);
	dr->node.release = devm_kmalloc_release;

	set_node_dbginfo(&dr->node, "devm_kzalloc_release", usable);
	devres_add(dev, dr->data);
	return dr->data;
}

#define devm_kmalloc(dev, size, gfp)\
alloc(size, gfp, devm_prep, devm_finish, dev)

And now there's no wrapper any more, just a routine to get the actual
size, and a routine to set up the memory and return the "usable"
pointer.
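
A sketch of the generic alloc() front end those prototypes imply (reconciling the two variants above: prepare() takes gfp_t * so it can OR in flags and returns ssize_t so it can fail; kernel context assumed, not a tested implementation):

static void *alloc(size_t usable, gfp_t flags,
		   ssize_t (*prepare)(size_t usable, gfp_t *flags, void *ctx),
		   void *(*finish)(size_t usable, gfp_t flags, void *ctx,
				   void *allocated),
		   void *ctx)
{
	ssize_t tot_size;
	void *allocated;

	tot_size = prepare(usable, &flags, ctx);
	if (tot_size < 0)
		return NULL;

	allocated = kmalloc(tot_size, flags);
	if (!allocated)
		return NULL;

	/* finish() takes ownership and returns the usable pointer. */
	return finish(usable, flags, ctx, allocated);
}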

> - The slabs need to be pinned, otherwise an attacker could
> [feng-shui](https://en.wikipedia.org/wiki/Heap_feng_shui) their way
>   into having the whole slab free'ed, garbage-collected, and have a slab for
>   another type allocated at the same VA. [Jann Horn](https://thejh.net/)
> and [Matteo Rizzo](https://infosec.exchange/@nspace) have a [nice
>   set of
> patches](https://github.com/torvalds/linux/compare/master...thejh:linux:slub-virtual-upstream),
>   discussed a bit in [this Project Zero
> blogpost](https://googleprojectzero.blogspot.com/2021/10/how-simple-linux-kernel-memory.html),
>   for a feature called [`SLAB_VIRTUAL`](
> https://github.com/torvalds/linux/commit/f3afd3a2152353be355b90f5fd4367adbf6a955e),
>   implementing precisely this.

I'm hoping this will get posted to LKML soon.

> - There are 16 slabs by default, so one chance out of 16 to end up in
> the same
>   slab cache as the target.

Future work can make this more deterministic.

> - There are no guard pages between caches, so inter-caches overflows are
>   possible.

This may be addressed by SLAB_VIRTUAL.

> - As pointed by
> [andreyknvl](https://twitter.com/andreyknvl/status/1700267669336080678)
>   and [minipli](https://infosec.exchange/@minipli/111045336853055793),
>   the fewer allocations hitting a given cache means less noise,
>   so it might even help with some heap feng-shui.

That may be true, but I suspect it'll be mitigated by the overall
reduction in shared caches.

> - minipli also pointed that "randomized caches still freely
>   mix kernel allocations with user controlled ones (`xattr`, `keyctl`,
> `msg_msg`, …).
>   So even though merging is disabled for these caches, i.e. no direct
> overlap
>   with `cred_jar` etc., other object types can still be targeted (`struct
>   pipe_buffer`, BPF maps, its verifier state objects,…). It’s just a
> matter of
>   probing which allocation index the targeted object falls into.",
>   but I considered this out of scope, since it's much more involved;
>   albeit something like
> [`CONFIG_KMALLOC_SPLIT_VARSIZE`](https://github.com/thejh/linux/b

Re: [PATCH v5] Randomized slab caches for kmalloc()

2023-09-11 Thread jvoisin
I wrote a small blogpost[1] about this series, and was told[2] that it
would be interesting to share it on this thread, so here it is, copied
verbatim:

Ruiqi Gong and Xiu Jianfeng got their
[Randomized slab caches for
kmalloc()](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3c6152940584290668b35fa0800026f6a1ae05fe)
patch series merged upstream, and I've had enough discussions about it to
warrant summarising them into a small blogpost.

The main idea is to have multiple slab caches, and pick one at random
based on
the address of code calling `kmalloc()` and a per-boot seed, to make
heap-spraying harder.
It's a great idea, but comes with some shortcomings for now:

- Objects being allocated via wrappers around `kmalloc()`, like
`sock_kmalloc`,
  `f2fs_kmalloc`, `aligned_kmalloc`, … will end up in the same slab cache.
- The slabs need to be pinned, otherwise an attacker could
[feng-shui](https://en.wikipedia.org/wiki/Heap_feng_shui) their way
  into having the whole slab free'ed, garbage-collected, and have a slab for
  another type allocated at the same VA. [Jann Horn](https://thejh.net/)
and [Matteo Rizzo](https://infosec.exchange/@nspace) have a [nice
  set of
patches](https://github.com/torvalds/linux/compare/master...thejh:linux:slub-virtual-upstream),
  discussed a bit in [this Project Zero
blogpost](https://googleprojectzero.blogspot.com/2021/10/how-simple-linux-kernel-memory.html),
  for a feature called [`SLAB_VIRTUAL`](
https://github.com/torvalds/linux/commit/f3afd3a2152353be355b90f5fd4367adbf6a955e),
  implementing precisely this.
- There are 16 slabs by default, so one chance out of 16 to end up in
the same
  slab cache as the target.
- There are no guard pages between caches, so inter-caches overflows are
  possible.
- As pointed by
[andreyknvl](https://twitter.com/andreyknvl/status/1700267669336080678)
  and [minipli](https://infosec.exchange/@minipli/111045336853055793),
  the fewer allocations hitting a given cache means less noise,
  so it might even help with some heap feng-shui.
- minipli also pointed that "randomized caches still freely
  mix kernel allocations with user controlled ones (`xattr`, `keyctl`,
`msg_msg`, …).
  So even though merging is disabled for these caches, i.e. no direct
overlap
  with `cred_jar` etc., other object types can still be targeted (`struct
  pipe_buffer`, BPF maps, its verifier state objects,…). It’s just a
matter of
  probing which allocation index the targeted object falls into.",
  but I considered this out of scope, since it's much more involved;
  albeit something like
[`CONFIG_KMALLOC_SPLIT_VARSIZE`](https://github.com/thejh/linux/blob/slub-virtual/MITIGATION_README)
  wouldn't significantly increase complexity.

Also, while code addresses as a source of entropy have historically been a
great way to provide [KASLR](https://lwn.net/Articles/569635/) bypasses,
`hash_64(caller ^ random_kmalloc_seed, ilog2(RANDOM_KMALLOC_CACHES_NR + 1))`
shouldn't trivially leak offsets.
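
For concreteness, a userspace sketch of that bucket selection (the multiplicative hash mirrors the kernel's hash_64() from include/linux/hash.h; the seed and caller values are made up):

#include <stdint.h>
#include <stdio.h>

#define RANDOM_KMALLOC_CACHES_NR 15	/* 16 copies per size class */
#define GOLDEN_RATIO_64 0x61C8864680B583EBull

static uint64_t hash_64(uint64_t val, unsigned int bits)
{
	return (val * GOLDEN_RATIO_64) >> (64 - bits);
}

int main(void)
{
	uint64_t seed   = 0xdeadbeefcafef00dull;	/* per-boot random_kmalloc_seed */
	uint64_t caller = 0xffffffff81234567ull;	/* _RET_IP_ of a kmalloc() call site */

	/* ilog2(15 + 1) == 4, so the index is 4 bits wide: one of 16 caches. */
	printf("cache index: %llu\n",
	       (unsigned long long)hash_64(caller ^ seed, 4));
	return 0;
}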

The segregation technique is a bit like a weaker version of grsecurity's
[AUTOSLAB](https://grsecurity.net/how_autoslab_changes_the_memory_unsafety_game),
or a weaker kernel-land version of
[PartitionAlloc](https://chromium.googlesource.com/chromium/src/+/master/base/allocator/partition_allocator/PartitionAlloc.md),
but to be fair, making use-after-free exploitation harder, and significantly
harder once pinning lands, with only ~150 lines of code and negligible
performance impact is amazing and should be praised. Moreover, I wouldn't be
surprised if this was backported in [Google's
KernelCTF](https://google.github.io/security-research/kernelctf/rules.html)
soon, so we should see if my analysis is correct.

1.
https://dustri.org/b/some-notes-on-randomized-slab-caches-for-kmalloc.html
2. https://infosec.exchange/@vba...@social.kernel.org/111046740392510260

-- 
Julien (jvoisin) Voisin
GPG: 04D041E8171901CC
dustri.org



Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-18 Thread Dave Chinner
On Fri, Apr 16, 2021 at 10:14:39AM +0530, Bharata B Rao wrote:
> On Wed, Apr 07, 2021 at 08:28:07AM +1000, Dave Chinner wrote:
> > On Mon, Apr 05, 2021 at 11:18:48AM +0530, Bharata B Rao wrote:
> > 
> > > As an alternative approach, I have this below hack that does lazy
> > > list_lru creation. The memcg-specific list is created and initialized
> > > only when there is a request to add an element to that particular
> > > list. Though I am not sure about the full impact of this change
> > > on the owners of the lists and also the performance impact of this,
> > > the overall savings look good.
> > 
> > Avoiding memory allocation in list_lru_add() was one of the main
> > reasons for up-front static allocation of memcg lists. We cannot do
> > memory allocation while callers are holding multiple spinlocks in
> > core system algorithms (e.g. dentry_kill -> retain_dentry ->
> > d_lru_add -> list_lru_add), let alone while holding an internal
> > spinlock.
> > 
> > Putting a GFP_ATOMIC allocation inside 3-4 nested spinlocks in a
> > path we know might have memory demand in the *hundreds of GB* range
> > gets an NACK from me. It's a great idea, but it's just not a
> 
> I do understand that GFP_ATOMIC allocations are really not preferrable
> but want to point out that the allocations in the range of hundreds of
> GBs get reduced to tens of MBs when we do lazy list_lru head allocations
> under GFP_ATOMIC.

That does not make GFP_ATOMIC allocations safe or desirable. In
general, using GFP_ATOMIC outside of interrupt context indicates
something is being done incorrectly. Especially if it can be
triggered from userspace, which is likely in this particular case...



> As shown earlier, this is what I see in my experimental setup with
> 10k containers:
> 
> Number of kmalloc-32 allocations
>   Before  During  After
> W/o patch 178176  3442409472  388933632
> W/  patch 190464  468992  468992

So now we have an additional half million GFP_ATOMIC allocations
when we currently have none. That's not an improvement, that rings
loud alarm bells.

> This does really depend and vary on the type of the container and
> the number of mounts it does, but I suspect we are looking
> at GFP_ATOMIC allocations in the MB range. Also the number of
> GFP_ATOMIC slab allocation requests matter I suppose.

They are slab allocations, which mean every single one of them
could require a new slab backing page (pages!) to be allocated.
Hence the likely memory demand might be a lot higher than the
optimal case you are considering here...

> There are other users of list_lru, but I was just looking at
> dentry and inode list_lru usecase. It appears to me that for both
> dentry and inode, we can tolerate the failure from list_lru_add
> due to GFP_ATOMIC allocation failure. The failure to add dentry
> or inode to the lru list means that they won't be retained in
> the lru list, but would be freed immediately. Is this understanding
> correct?

No. Both retain_dentry() and iput_final() would currently leak
objects that fail insertion into the LRU. They don't check for
insertion success at all.

But, really, this is irrelevant - GFP_ATOMIC usage is the problem,
and allowing it to fail doesn't avoid the problems that unbound
GFP_ATOMIC allocation can have on the stability of the rest of the
system when low on memory. Being able to handle a GFP_ATOMIC memory
allocation failure doesn't change the fact that you should not be
doing GFP_ATOMIC allocation in the first place...

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-15 Thread Bharata B Rao
On Wed, Apr 07, 2021 at 08:28:07AM +1000, Dave Chinner wrote:
> On Mon, Apr 05, 2021 at 11:18:48AM +0530, Bharata B Rao wrote:
> 
> > As an alternative approach, I have this below hack that does lazy
> > list_lru creation. The memcg-specific list is created and initialized
> > only when there is a request to add an element to that particular
> > list. Though I am not sure about the full impact of this change
> > on the owners of the lists and also the performance impact of this,
> > the overall savings look good.
> 
> Avoiding memory allocation in list_lru_add() was one of the main
> reasons for up-front static allocation of memcg lists. We cannot do
> memory allocation while callers are holding multiple spinlocks in
> core system algorithms (e.g. dentry_kill -> retain_dentry ->
> d_lru_add -> list_lru_add), let alone while holding an internal
> spinlock.
> 
> Putting a GFP_ATOMIC allocation inside 3-4 nested spinlocks in a
> path we know might have memory demand in the *hundreds of GB* range
> gets an NACK from me. It's a great idea, but it's just not a

I do understand that GFP_ATOMIC allocations are really not preferrable
but want to point out that the allocations in the range of hundreds of
GBs get reduced to tens of MBs when we do lazy list_lru head allocations
under GFP_ATOMIC.

As shown earlier, this is what I see in my experimental setup with
10k containers:

Number of kmalloc-32 allocations
Before  During  After
W/o patch   178176  3442409472  388933632
W/  patch   190464  468992  468992

So 3442409472*32=102GB upfront list_lru creation-time GFP_KERNEL allocations
get reduced to 468992*32=14MB dynamic list_lru addition-time GFP_ATOMIC
allocations.

This does really depend and vary on the type of the container and
the number of mounts it does, but I suspect we are looking
at GFP_ATOMIC allocations in the MB range. Also the number of
GFP_ATOMIC slab allocation requests matter I suppose.

There are other users of list_lru, but I was just looking at
dentry and inode list_lru usecase. It appears to me that for both
dentry and inode, we can tolerate the failure from list_lru_add
due to GFP_ATOMIC allocation failure. The failure to add dentry
or inode to the lru list means that they won't be retained in
the lru list, but would be freed immediately. Is this understanding
correct?

If so, would that likely impact the subsequent lookups adversely?
We failed to retain a dentry or inode in the lru list because
we failed to allocate memory, presumably under memory pressure.
Even in such a scenario, is failure to add dentry or inode to
lru list so bad and not tolerable? 

Regards,
Bharata.


Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-15 Thread Bharata B Rao
On Thu, Apr 15, 2021 at 08:54:43AM +0200, Michal Hocko wrote:
> On Thu 15-04-21 10:53:00, Bharata B Rao wrote:
> > On Wed, Apr 07, 2021 at 08:28:07AM +1000, Dave Chinner wrote:
> > > 
> > > Another approach may be to identify filesystem types that do not
> > > need memcg awareness and feed that into alloc_super() to set/clear
> > > the SHRINKER_MEMCG_AWARE flag. This could be based on fstype - most
> > > virtual filesystems that expose system information do not really
> > > need full memcg awareness because they are generally only visible to
> > > a single memcg instance...
> > 
> > Would something like below be appropriate?
> 
> No. First of all you are defining yet another way to say
> SHRINKER_MEMCG_AWARE which is messy.

Ok.

> And secondly why would shmem, proc
> and ramfs be any special and they would be ok to opt out? There is no
> single word about that reasoning in your changelog.

Right, I am only checking if the suggestion given by David (see above)
is indeed this. There are a few other things to take care of
which I shall if the overall direction of the patch turns
out to be acceptable.

Regards,
Bharata.


Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-15 Thread Michal Hocko
On Thu 15-04-21 10:53:00, Bharata B Rao wrote:
> On Wed, Apr 07, 2021 at 08:28:07AM +1000, Dave Chinner wrote:
> > 
> > Another approach may be to identify filesystem types that do not
> > need memcg awareness and feed that into alloc_super() to set/clear
> > the SHRINKER_MEMCG_AWARE flag. This could be based on fstype - most
> > virtual filesystems that expose system information do not really
> > need full memcg awareness because they are generally only visible to
> > a single memcg instance...
> 
> Would something like below be appropriate?

No. First of all you are defining yet another way to say
SHRINKER_MEMCG_AWARE which is messy. And secondly why would shmem, proc
and ramfs be any special and they would be ok to opt out? There is no
single word about that reasoning in your changelog.

> From f314083ad69fde2a420a1b74febd6d3f7a25085f Mon Sep 17 00:00:00 2001
> From: Bharata B Rao 
> Date: Wed, 14 Apr 2021 11:21:24 +0530
> Subject: [PATCH 1/1] fs: Let filesystems opt out of memcg awareness
> 
> All filesystem mounts by default are memcg aware and hence
> end up creating shrinker list_lrus for all the memcgs. Due to
> the way the memcg_nr_cache_ids grow and the list_lru heads are
> allocated for all memcgs, huge amount of memory gets consumed
> by kmalloc-32 slab cache when running thousands of containers.
> 
> Improve this situation by allowing filesystems to opt out
> of memcg awareness. In this patch, tmpfs, proc and ramfs
> opt out of memcg awareness. This leads to considerable memory
> savings when running 10k containers.
> 
> Signed-off-by: Bharata B Rao 
> ---
>  fs/proc/root.c |  1 +
>  fs/ramfs/inode.c   |  1 +
>  fs/super.c | 27 +++++++++++++++++++--------
>  include/linux/fs_context.h |  2 ++
>  mm/shmem.c |  1 +
>  5 files changed, 24 insertions(+), 8 deletions(-)

[...]
-- 
Michal Hocko
SUSE Labs


Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-14 Thread Bharata B Rao
On Wed, Apr 07, 2021 at 08:28:07AM +1000, Dave Chinner wrote:
> 
> Another approach may be to identify filesystem types that do not
> need memcg awareness and feed that into alloc_super() to set/clear
> the SHRINKER_MEMCG_AWARE flag. This could be based on fstype - most
> virtual filesystems that expose system information do not really
> need full memcg awareness because they are generally only visible to
> a single memcg instance...

Would something like below be appropriate?

From f314083ad69fde2a420a1b74febd6d3f7a25085f Mon Sep 17 00:00:00 2001
From: Bharata B Rao 
Date: Wed, 14 Apr 2021 11:21:24 +0530
Subject: [PATCH 1/1] fs: Let filesystems opt out of memcg awareness

All filesystem mounts by default are memcg aware and hence
end up creating shrinker list_lrus for all the memcgs. Due to
the way the memcg_nr_cache_ids grow and the list_lru heads are
allocated for all memcgs, huge amount of memory gets consumed
by kmalloc-32 slab cache when running thousands of containers.

Improve this situation by allowing filesystems to opt out
of memcg awareness. In this patch, tmpfs, proc and ramfs
opt out of memcg awareness. This leads to considerable memory
savings when running 10k containers.

Signed-off-by: Bharata B Rao 
---
 fs/proc/root.c |  1 +
 fs/ramfs/inode.c   |  1 +
 fs/super.c | 27 +++++++++++++++++++--------
 include/linux/fs_context.h |  2 ++
 mm/shmem.c |  1 +
 5 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/fs/proc/root.c b/fs/proc/root.c
index c7e3b1350ef8..7856bc2ca9f4 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -257,6 +257,7 @@ static int proc_init_fs_context(struct fs_context *fc)
 	fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
 	fc->fs_private = ctx;
 	fc->ops = &proc_fs_context_ops;
+	fc->memcg_optout = true;
 	return 0;
 }
 
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 9ebd17d7befb..576a88bb7407 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -278,6 +278,7 @@ int ramfs_init_fs_context(struct fs_context *fc)
 	fsi->mount_opts.mode = RAMFS_DEFAULT_MODE;
 	fc->s_fs_info = fsi;
 	fc->ops = &ramfs_context_ops;
+	fc->memcg_optout = true;
 	return 0;
 }
 
diff --git a/fs/super.c b/fs/super.c
index 8c1baca35c16..59aa22c678e6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -198,7 +198,8 @@ static void destroy_unused_super(struct super_block *s)
  * returns a pointer new superblock or %NULL if allocation had failed.
  */
static struct super_block *alloc_super(struct file_system_type *type, int flags,
-  struct user_namespace *user_ns)
+  struct user_namespace *user_ns,
+  bool memcg_optout)
 {
struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
static const struct super_operations default_op;
@@ -266,13 +267,22 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
s->s_shrink.scan_objects = super_cache_scan;
s->s_shrink.count_objects = super_cache_count;
s->s_shrink.batch = 1024;
-       s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
+       s->s_shrink.flags = SHRINKER_NUMA_AWARE;
+       if (!memcg_optout)
+               s->s_shrink.flags |= SHRINKER_MEMCG_AWARE;
        if (prealloc_shrinker(&s->s_shrink))
                goto fail;
-       if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
-               goto fail;
-       if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
-               goto fail;
+       if (memcg_optout) {
+               if (list_lru_init(&s->s_dentry_lru))
+                       goto fail;
+               if (list_lru_init(&s->s_inode_lru))
+                       goto fail;
+       } else {
+               if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink))
+                       goto fail;
+               if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink))
+                       goto fail;
+       }
return s;
 
 fail:
@@ -527,7 +537,8 @@ struct super_block *sget_fc(struct fs_context *fc,
}
if (!s) {
spin_unlock(&sb_lock);
-   s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
+   s = alloc_super(fc->fs_type, fc->sb_flags, user_ns,
+   fc->memcg_optout);
if (!s)
return ERR_PTR(-ENOMEM);
goto retry;
@@ -610,7 +621,7 @@ struct super_block *sget(struct file_system_type *type,
}
if (!s) {
spin_unlock(&sb_lock);
-   s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns);
+   s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns, false);
if (!s)
return ERR_PTR(-ENOMEM);

Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-07 Thread Shakeel Butt
On Wed, Apr 7, 2021 at 4:55 AM Michal Hocko  wrote:
>
> On Mon 05-04-21 11:18:48, Bharata B Rao wrote:
> > Hi,
> >
> > When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> > server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> > consumption increases quite a lot (around 172G) when the containers are
> > running. Most of it comes from slab (149G) and within slab, the majority of
> > it comes from kmalloc-32 cache (102G)
>
> Is this 10k cgroups a testing environment or does anybody really use that
> in production? I would be really curious to hear how that behaves when
> those containers are not idle. E.g. global memory reclaim iterating over
> 10k memcgs will likely be very visible. I do remember playing with
> similar setups few years back and the overhead was very high.
> --

I can tell about our environment. Couple of thousands of memcgs (~2k)
are very normal on our machines as machines can be running 100+ jobs
(and each job can manage their own sub-memcgs). However each job can
have a high number of mounts. There is no local disk and each package
of the job is remotely mounted (a bit more complicated).

We do have issues with global memory reclaim but mostly the proactive
reclaim makes the global reclaim a tail issue (and at tail it often
does create havoc).


Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-07 Thread Christian Brauner
On Wed, Apr 07, 2021 at 08:28:07AM +1000, Dave Chinner wrote:
> On Mon, Apr 05, 2021 at 11:18:48AM +0530, Bharata B Rao wrote:
> > Hi,
> > 
> > When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> > server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> > consumption increases quite a lot (around 172G) when the containers are
> > running. Most of it comes from slab (149G) and within slab, the majority of
> > it comes from kmalloc-32 cache (102G)
> > 
> > The major allocator of kmalloc-32 slab cache happens to be the list_head
> > allocations of list_lru_one list. These lists are created whenever a
> > FS mount happens. Specially two such lists are registered by alloc_super(),
> > one for dentry and another for inode shrinker list. And these lists
> > are created for all possible NUMA nodes and for all given memcgs
> > (memcg_nr_cache_ids to be particular)
> > 
> > If,
> > 
> > A = Nr allocation request per mount: 2 (one for dentry and inode list)
> > B = Nr NUMA possible nodes
> > C = memcg_nr_cache_ids
> > D = size of each kmalloc-32 object: 32 bytes,
> > 
> > then for every mount, the amount of memory consumed by kmalloc-32 slab
> > cache for list_lru creation is A*B*C*D bytes.
> > 
> > Following factors contribute to the excessive allocations:
> > 
> > - Lists are created for possible NUMA nodes.
> > - memcg_nr_cache_ids grows in bulk (see memcg_alloc_cache_id() and 
> > additional
> >   list_lrus are created when it grows. Thus we end up creating list_lru_one
> >   list_heads even for those memcgs which are yet to be created.
> >   For example, when 10000 memcgs are created, memcg_nr_cache_ids reaches
> >   a value of 12286.
> 
> So, by your numbers, we have 2 * 2 * 12286 * 32 = 1.5MB per mount.
> 
> So for that to make up 100GB of RAM, you must have somewhere over
> 500,000 mounted superblocks on the machine?
> 
> That implies 50+ unique mounted superblocks per container, which
> seems like an awful lot.
> 
> > - When a memcg goes offline, the list elements are drained to the parent
> >   memcg, but the list_head entry remains.
> > - The lists are destroyed only when the FS is unmounted. So list_heads
> >   for non-existing memcgs remain and continue to contribute to the
> >   kmalloc-32 allocation. This is presumably done for performance
> >   reason as they get reused when new memcgs are created, but they end up
> >   consuming slab memory until then.
> > - In case of containers, a few file systems get mounted and are specific
> >   to the container namespace and hence to a particular memcg, but we
> >   end up creating lists for all the memcgs.
> >   As an example, if 7 FS mounts are done for every container and when
> >   10k containers are created, we end up creating 2*7*12286 list_lru_one
> >   lists for each NUMA node. It appears that no elements will get added
> >   to other than 2*7=14 of them in the case of containers.
> 
> Yeah, at first glance this doesn't strike me as a problem with the
> list_lru structure, it smells more like a problem resulting from a
> huge number of superblock instantiations on the machine. Which,
> probably, mostly have no significant need for anything other than a
> single memcg awareness?
> 
> Can you post a typical /proc/self/mounts output from one of these
> idle/empty containers so we can see exactly how many mounts and
> their type are being instantiated in each container?

Similar to Michal I wonder how much of that is really used in production
environments. From our experience it really depends on the type of
container we're talking about.
For a regular app container that essentially serves as an application
isolator the number of mounts could be fairly limited and essentially be
restricted to:

tmpfs
devptfs
sysfs
[cgroupfs]
and a few bind-mounts of standard devices such as
/dev/null
/dev/zero
/dev/full
.
.
.
from the host's devtmpfs into the container.

Then there are containers that behave like regular systems and are
managed like regular systems and those might have quite a bit more. For
example, here is the output of a regular unprivileged Fedora 33
container I created out of the box:

[root@f33 ~]# findmnt
TARGET  SOURCE                                                                                       FSTYPE  OPTIONS
/       /dev/mapper/ubuntu--vg-ubuntu--lv[/var/lib/lxd/storage-pools/default/containers/f33/rootfs]  xfs     rw,relatime,attr2,inode64,logbufs=8,logbsize=32k,noquota
├─/run

Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-07 Thread Michal Hocko
On Wed 07-04-21 19:13:42, Bharata B Rao wrote:
> On Wed, Apr 07, 2021 at 01:54:48PM +0200, Michal Hocko wrote:
> > On Mon 05-04-21 11:18:48, Bharata B Rao wrote:
> > > Hi,
> > > 
> > > When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> > > server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> > > consumption increases quite a lot (around 172G) when the containers are
> > > running. Most of it comes from slab (149G) and within slab, the majority 
> > > of
> > > it comes from kmalloc-32 cache (102G)
> > 
> > Is this 10k cgroups a testing environment or does anybody really use that
> > in production? I would be really curious to hear how that behaves when
> > those containers are not idle. E.g. global memory reclaim iterating over
> > 10k memcgs will likely be very visible. I do remember playing with
> > similar setups few years back and the overhead was very high.
> 
> This 10k containers is only a test scenario that we are looking at.

OK, this is good to know. I would definitely recommend looking at the
runtime aspect of such a large scale deployment before optimizing for
memory footprint. I do agree that the latter is an interesting topic on
its own but I do not expect such a deployment on small machines so the
overhead shouldn't be a showstopper. I would be definitely interested
to hear about the runtime overhead. I do expect some interesting
findings there.

Btw. I do expect that memory controller will not be the only one
deployed right?

-- 
Michal Hocko
SUSE Labs


Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-07 Thread Bharata B Rao
On Wed, Apr 07, 2021 at 01:54:48PM +0200, Michal Hocko wrote:
> On Mon 05-04-21 11:18:48, Bharata B Rao wrote:
> > Hi,
> > 
> > When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> > server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> > consumption increases quite a lot (around 172G) when the containers are
> > running. Most of it comes from slab (149G) and within slab, the majority of
> > it comes from kmalloc-32 cache (102G)
> 
> Is this 10k cgroups a testing environment or does anybody really use that
> in production? I would be really curious to hear how that behaves when
> those containers are not idle. E.g. global memory reclaim iterating over
> 10k memcgs will likely be very visible. I do remember playing with
> similar setups few years back and the overhead was very high.

This 10k containers is only a test scenario that we are looking at.

Regards,
Bharata.


Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-07 Thread Christian Brauner
On Wed, Apr 07, 2021 at 01:54:48PM +0200, Michal Hocko wrote:
> On Mon 05-04-21 11:18:48, Bharata B Rao wrote:
> > Hi,
> > 
> > When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> > server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> > consumption increases quite a lot (around 172G) when the containers are
> > running. Most of it comes from slab (149G) and within slab, the majority of
> > it comes from kmalloc-32 cache (102G)
> 
> Is this 10k cgroups a testing environment or does anybody really use that
> in production? I would be really curious to hear how that behaves when
> those containers are not idle. E.g. global memory reclaim iterating over
> 10k memcgs will likely be very visible. I do remember playing with
> similar setups few years back and the overhead was very high.

Ccing Stéphane Graber who has experience/insight about stuff like this.

Christian


Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-07 Thread Kirill Tkhai
On 07.04.2021 14:47, Bharata B Rao wrote:
> On Wed, Apr 07, 2021 at 01:07:27PM +0300, Kirill Tkhai wrote:
>>> Here is how the calculation turns out to be in my setup:
>>>
>>> Number of possible NUMA nodes = 2
>>> Number of mounts per container = 7 (Check below to see which are these)
>>> Number of list creation requests per mount = 2
>>> Number of containers = 10000
>>> memcg_nr_cache_ids for 10k containers = 12286
>>
>> Luckily, we have "+1" in memcg_nr_cache_ids formula: size = 2 * (id + 1).
>> If we had only multiplied it, you would have had memcg_nr_cache_ids=20000.
> 
> Not really, it would grow like this for size = 2 * id
> 
> id 0 size 4
> id 4 size 8
> id 8 size 16
> id 16 size 32
> id 32 size 64
> id 64 size 128
> id 128 size 256
> id 256 size 512
> id 512 size 1024
> id 1024 size 2048
> id 2048 size 4096
> id 4096 size 8192
> id 8192 size 16384
> 
> Currently (size = 2 * (id + 1)), it grows like this:
> 
> id 0 size 4
> id 4 size 10
> id 10 size 22
> id 22 size 46
> id 46 size 94
> id 94 size 190
> id 190 size 382
> id 382 size 766
> id 766 size 1534
> id 1534 size 3070
> id 3070 size 6142
> id 6142 size 12286

Oh, thanks, I forgot what power of two is :)
 
>>
>> Maybe we need to change that formula so that memcg_nr_cache_ids grows
>> more accurately for larger container counts. Say,
>>
>> size = id < 2000 ? 2 * (id + 1) : id + 2000
> 
> For the above, it would only be marginally better like this:
> 
> id 0 size 4
> id 4 size 10
> id 10 size 22
> id 22 size 46
> id 46 size 94
> id 94 size 190
> id 190 size 382
> id 382 size 766
> id 766 size 1534
> id 1534 size 3070
> id 3070 size 5070
> id 5070 size 7070
> id 7070 size 9070
> id 9070 size 11070
> 
> All the above numbers are for 10k memcgs.

I mean a number of containers bigger than your 10000.


Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-07 Thread Michal Hocko
On Mon 05-04-21 11:18:48, Bharata B Rao wrote:
> Hi,
> 
> When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> consumption increases quite a lot (around 172G) when the containers are
> running. Most of it comes from slab (149G) and within slab, the majority of
> it comes from kmalloc-32 cache (102G)

Is this 10k cgroups a testing environment or does anybody really use that
in production? I would be really curious to hear how that behaves when
those containers are not idle. E.g. global memory reclaim iterating over
10k memcgs will likely be very visible. I do remember playing with
similar setups few years back and the overhead was very high.
-- 
Michal Hocko
SUSE Labs


Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-07 Thread Bharata B Rao
On Wed, Apr 07, 2021 at 01:07:27PM +0300, Kirill Tkhai wrote:
> > Here is how the calculation turns out to be in my setup:
> > 
> > Number of possible NUMA nodes = 2
> > Number of mounts per container = 7 (Check below to see which are these)
> > Number of list creation requests per mount = 2
> > Number of containers = 10000
> > memcg_nr_cache_ids for 10k containers = 12286
> 
> Luckily, we have "+1" in memcg_nr_cache_ids formula: size = 2 * (id + 1).
> If we had only multiplied it, you would have had memcg_nr_cache_ids=20000.

Not really, it would grow like this for size = 2 * id

id 0 size 4
id 4 size 8
id 8 size 16
id 16 size 32
id 32 size 64
id 64 size 128
id 128 size 256
id 256 size 512
id 512 size 1024
id 1024 size 2048
id 2048 size 4096
id 4096 size 8192
id 8192 size 16384

Currently (size = 2 * (id + 1)), it grows like this:

id 0 size 4
id 4 size 10
id 10 size 22
id 22 size 46
id 46 size 94
id 94 size 190
id 190 size 382
id 382 size 766
id 766 size 1534
id 1534 size 3070
id 3070 size 6142
id 6142 size 12286

> 
> Maybe we need to change that formula so that memcg_nr_cache_ids grows
> more accurately for larger container counts. Say,
> 
> size = id < 2000 ? 2 * (id + 1) : id + 2000

For the above, it would only be marginally better like this:

id 0 size 4
id 4 size 10
id 10 size 22
id 22 size 46
id 46 size 94
id 94 size 190
id 190 size 382
id 382 size 766
id 766 size 1534
id 1534 size 3070
id 3070 size 5070
id 5070 size 7070
id 7070 size 9070
id 9070 size 11070

All the above numbers are for 10k memcgs.
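To make the two growth policies easy to compare, here is a small standalone
C sketch; the formulas and the floor of 4 are taken from the tables in this
thread, so treat it as an illustration rather than the kernel's exact code.
It reproduces both tables:

#include <stdio.h>

/* growth step: current kernel policy vs. the proposed one */
static int grow(int id, int proposed)
{
	int size = (proposed && id >= 2000) ? id + 2000 : 2 * (id + 1);

	return size < 4 ? 4 : size;	/* floor of 4, as in the tables above */
}

int main(void)
{
	for (int proposed = 0; proposed <= 1; proposed++) {
		int size = 0;

		printf("%s policy:\n", proposed ? "proposed" : "current");
		for (int id = 0; id < 10000; id++) {
			if (id >= size) {	/* array is full: grow it */
				size = grow(id, proposed);
				printf("id %d size %d\n", id, size);
			}
		}
	}
	return 0;
}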

Regards,
Bharata.


Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-07 Thread Kirill Tkhai
On 07.04.2021 08:05, Bharata B Rao wrote:
> On Wed, Apr 07, 2021 at 08:28:07AM +1000, Dave Chinner wrote:
>> On Mon, Apr 05, 2021 at 11:18:48AM +0530, Bharata B Rao wrote:
>>> Hi,
>>>
>>> When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
>>> server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
>>> consumption increases quite a lot (around 172G) when the containers are
>>> running. Most of it comes from slab (149G) and within slab, the majority of
>>> it comes from kmalloc-32 cache (102G)
>>>
>>> The major allocator of kmalloc-32 slab cache happens to be the list_head
>>> allocations of list_lru_one list. These lists are created whenever a
>>> FS mount happens. Specially two such lists are registered by alloc_super(),
>>> one for dentry and another for inode shrinker list. And these lists
>>> are created for all possible NUMA nodes and for all given memcgs
>>> (memcg_nr_cache_ids to be particular)
>>>
>>> If,
>>>
>>> A = Nr allocation request per mount: 2 (one for dentry and inode list)
>>> B = Nr NUMA possible nodes
>>> C = memcg_nr_cache_ids
>>> D = size of each kmalloc-32 object: 32 bytes,
>>>
>>> then for every mount, the amount of memory consumed by kmalloc-32 slab
>>> cache for list_lru creation is A*B*C*D bytes.
>>>
>>> Following factors contribute to the excessive allocations:
>>>
>>> - Lists are created for possible NUMA nodes.
>>> - memcg_nr_cache_ids grows in bulk (see memcg_alloc_cache_id() and 
>>> additional
>>>   list_lrus are created when it grows. Thus we end up creating list_lru_one
>>>   list_heads even for those memcgs which are yet to be created.
>>>   For example, when 10000 memcgs are created, memcg_nr_cache_ids reaches
>>>   a value of 12286.
>>
>> So, by your numbers, we have 2 * 2 * 12286 * 32 = 1.5MB per mount.
>>
>> So for that to make up 100GB of RAM, you must have somewhere over
>> 500,000 mounted superblocks on the machine?
>>
>> That implies 50+ unique mounted superblocks per container, which
>> seems like an awful lot.
> 
> Here is how the calculation turns out to be in my setup:
> 
> Number of possible NUMA nodes = 2
> Number of mounts per container = 7 (Check below to see which are these)
> Number of list creation requests per mount = 2
> Number of containers = 10000
> memcg_nr_cache_ids for 10k containers = 12286

Luckily, we have "+1" in memcg_nr_cache_ids formula: size = 2 * (id + 1).
If we had only multiplied it, you would have had memcg_nr_cache_ids=20000.

Maybe we need to change that formula so that memcg_nr_cache_ids grows more
accurately for larger container counts. Say,

size = id < 2000 ? 2 * (id + 1) : id + 2000

> size of kmalloc-32 = 32 bytes
> 
> 2*7*2*10000*12286*32 = 110082560000 bytes = 102.5G
> 
>>
>>> - When a memcg goes offline, the list elements are drained to the parent
>>>   memcg, but the list_head entry remains.
>>> - The lists are destroyed only when the FS is unmounted. So list_heads
>>>   for non-existing memcgs remain and continue to contribute to the
>>>   kmalloc-32 allocation. This is presumably done for performance
>>>   reason as they get reused when new memcgs are created, but they end up
>>>   consuming slab memory until then.
>>> - In case of containers, a few file systems get mounted and are specific
>>>   to the container namespace and hence to a particular memcg, but we
>>>   end up creating lists for all the memcgs.
>>>   As an example, if 7 FS mounts are done for every container and when
>>>   10k containers are created, we end up creating 2*7*12286 list_lru_one
>>>   lists for each NUMA node. It appears that no elements will get added
>>>   to other than 2*7=14 of them in the case of containers.
>>
>> Yeah, at first glance this doesn't strike me as a problem with the
>> list_lru structure, it smells more like a problem resulting from a
>> huge number of superblock instantiations on the machine. Which,
>> probably, mostly have no significant need for anything other than a
>> single memcg awareness?
>>
>> Can you post a typical /proc/self/mounts output from one of these
>> idle/empty containers so we can see exactly how many mounts and
>> their type are being instantiated in each container?
> 
> Tracing type->name in alloc_super() lists these 7 types for
> every container.
> 
> 3-2691[041]    222.761041: alloc_super: fstype: mqueue
> 3-2692[072] ..

Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-06 Thread Bharata B Rao
On Wed, Apr 07, 2021 at 08:28:07AM +1000, Dave Chinner wrote:
> On Mon, Apr 05, 2021 at 11:18:48AM +0530, Bharata B Rao wrote:
> > Hi,
> > 
> > When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> > server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> > consumption increases quite a lot (around 172G) when the containers are
> > running. Most of it comes from slab (149G) and within slab, the majority of
> > it comes from kmalloc-32 cache (102G)
> > 
> > The major allocator of kmalloc-32 slab cache happens to be the list_head
> > allocations of list_lru_one list. These lists are created whenever a
> > FS mount happens. Specially two such lists are registered by alloc_super(),
> > one for dentry and another for inode shrinker list. And these lists
> > are created for all possible NUMA nodes and for all given memcgs
> > (memcg_nr_cache_ids to be particular)
> > 
> > If,
> > 
> > A = Nr allocation request per mount: 2 (one for dentry and inode list)
> > B = Nr NUMA possible nodes
> > C = memcg_nr_cache_ids
> > D = size of each kmalloc-32 object: 32 bytes,
> > 
> > then for every mount, the amount of memory consumed by kmalloc-32 slab
> > cache for list_lru creation is A*B*C*D bytes.
> > 
> > Following factors contribute to the excessive allocations:
> > 
> > - Lists are created for possible NUMA nodes.
> > - memcg_nr_cache_ids grows in bulk (see memcg_alloc_cache_id() and 
> > additional
> >   list_lrus are created when it grows. Thus we end up creating list_lru_one
> >   list_heads even for those memcgs which are yet to be created.
> >   For example, when 10000 memcgs are created, memcg_nr_cache_ids reaches
> >   a value of 12286.
> 
> So, by your numbers, we have 2 * 2 * 12286 * 32 = 1.5MB per mount.
> 
> So for that to make up 100GB of RAM, you must have somewhere over
> 500,000 mounted superblocks on the machine?
> 
> That implies 50+ unique mounted superblocks per container, which
> seems like an awful lot.

Here is how the calculation turns out to be in my setup:

Number of possible NUMA nodes = 2
Number of mounts per container = 7 (Check below to see which are these)
Number of list creation requests per mount = 2
Number of containers = 10000
memcg_nr_cache_ids for 10k containers = 12286
size of kmalloc-32 = 32 bytes

2*7*2*10000*12286*32 = 110082560000 bytes = 102.5G
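As a sanity check, the arithmetic above can be reproduced with a small
standalone C program (numbers hard-coded from this thread, nothing
kernel-specific):

#include <stdio.h>

int main(void)
{
	unsigned long long lists_per_mount = 2;	/* dentry + inode list_lru */
	unsigned long long numa_nodes = 2;	/* possible NUMA nodes */
	unsigned long long mounts = 7;		/* mounts per container */
	unsigned long long containers = 10000;
	unsigned long long cache_ids = 12286;	/* memcg_nr_cache_ids */
	unsigned long long obj_size = 32;	/* kmalloc-32 object size */

	unsigned long long bytes = lists_per_mount * numa_nodes * mounts *
				   containers * cache_ids * obj_size;

	printf("%llu bytes (~%.1f GiB)\n", bytes,
	       bytes / (1024.0 * 1024 * 1024));
	return 0;
}

This prints 110082560000 bytes (~102.5 GiB), matching the measured
kmalloc-32 consumption.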

> 
> > - When a memcg goes offline, the list elements are drained to the parent
> >   memcg, but the list_head entry remains.
> > - The lists are destroyed only when the FS is unmounted. So list_heads
> >   for non-existing memcgs remain and continue to contribute to the
> >   kmalloc-32 allocation. This is presumably done for performance
> >   reason as they get reused when new memcgs are created, but they end up
> >   consuming slab memory until then.
> > - In case of containers, a few file systems get mounted and are specific
> >   to the container namespace and hence to a particular memcg, but we
> >   end up creating lists for all the memcgs.
> >   As an example, if 7 FS mounts are done for every container and when
> >   10k containers are created, we end up creating 2*7*12286 list_lru_one
> >   lists for each NUMA node. It appears that no elements will get added
> >   to other than 2*7=14 of them in the case of containers.
> 
> Yeah, at first glance this doesn't strike me as a problem with the
> list_lru structure, it smells more like a problem resulting from a
> huge number of superblock instantiations on the machine. Which,
> probably, mostly have no significant need for anything other than a
> single memcg awareness?
> 
> Can you post a typical /proc/self/mounts output from one of these
> idle/empty containers so we can see exactly how many mounts and
> their type are being instantiated in each container?

Tracing type->name in alloc_super() lists these 7 types for
every container.

3-2691[041]    222.761041: alloc_super: fstype: mqueue
3-2692[072]    222.812187: alloc_super: fstype: proc
3-2692[072]    222.812261: alloc_super: fstype: tmpfs
3-2692[072]    222.812329: alloc_super: fstype: devpts
3-2692[072]    222.812392: alloc_super: fstype: tmpfs
3-2692[072]    222.813102: alloc_super: fstype: tmpfs
3-2692[072]    222.813159: alloc_super: fstype: tmpfs

> 
> > One straight forward way to prevent this excessive list_lru_one
> > allocations is to limit the list_lru_one creation only to the
> > relevant memcg. However I don't see an easy way to figure out
> > that relevant memcg from FS mount path (alloc_super())
> 
> Superblocks have to support an unknown number

Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-06 Thread Yang Shi
On Tue, Apr 6, 2021 at 3:05 AM Bharata B Rao  wrote:
>
> On Mon, Apr 05, 2021 at 11:08:26AM -0700, Yang Shi wrote:
> > On Sun, Apr 4, 2021 at 10:49 PM Bharata B Rao  wrote:
> > >
> > > Hi,
> > >
> > > When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> > > server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> > > consumption increases quite a lot (around 172G) when the containers are
> > > running. Most of it comes from slab (149G) and within slab, the majority 
> > > of
> > > it comes from kmalloc-32 cache (102G)
> > >
> > > The major allocator of kmalloc-32 slab cache happens to be the list_head
> > > allocations of list_lru_one list. These lists are created whenever a
> > > FS mount happens. Specially two such lists are registered by 
> > > alloc_super(),
> > > one for dentry and another for inode shrinker list. And these lists
> > > are created for all possible NUMA nodes and for all given memcgs
> > > (memcg_nr_cache_ids to be particular)
> > >
> > > If,
> > >
> > > A = Nr allocation request per mount: 2 (one for dentry and inode list)
> > > B = Nr NUMA possible nodes
> > > C = memcg_nr_cache_ids
> > > D = size of each kmalloc-32 object: 32 bytes,
> > >
> > > then for every mount, the amount of memory consumed by kmalloc-32 slab
> > > cache for list_lru creation is A*B*C*D bytes.
> >
> > Yes, this is exactly what the current implementation does.
> >
> > >
> > > Following factors contribute to the excessive allocations:
> > >
> > > - Lists are created for possible NUMA nodes.
> >
> > Yes, because filesystem caches (dentry and inode) are NUMA aware.
>
> True, but creating lists for possible nodes as against online nodes
> can hurt platforms where possible is typically higher than online.

I suppose that's just because hotplug doesn't handle memcg list_lru
creation/deletion.

We get a much simpler and less error-prone implementation by wasting
some memory.
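To make the trade-off concrete, a kernel-style fragment (illustrative only,
with names loosely borrowed from mm/list_lru.c; not a tested patch):

	/* current behaviour: one list per *possible* node; simple and needs
	 * no hotplug handling, at the cost of memory on sparse systems */
	for_each_node(nid)
		init_one_lru(&lru->node[nid].lru);

	/* the cheaper alternative: *online* nodes only, which would require
	 * a memory-hotplug callback to create lists for late-onlined nodes */
	for_each_online_node(nid)
		init_one_lru(&lru->node[nid].lru);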

>
> >
> > > - memcg_nr_cache_ids grows in bulk (see memcg_alloc_cache_id() and 
> > > additional
> > >   list_lrus are created when it grows. Thus we end up creating 
> > > list_lru_one
> > >   list_heads even for those memcgs which are yet to be created.
> > >   For example, when 10000 memcgs are created, memcg_nr_cache_ids reaches
> > >   a value of 12286.
> > > - When a memcg goes offline, the list elements are drained to the parent
> > >   memcg, but the list_head entry remains.
> > > - The lists are destroyed only when the FS is unmounted. So list_heads
> > >   for non-existing memcgs remain and continue to contribute to the
> > >   kmalloc-32 allocation. This is presumably done for performance
> > >   reason as they get reused when new memcgs are created, but they end up
> > >   consuming slab memory until then.
> >
> > The current implementation has list_lrus attached with super_block. So
> > the list can't be freed until the super block is unmounted.
> >
> > I'm looking into consolidating list_lrus more closely with memcgs. It
> > means the list_lrus will have the same life cycles as memcgs rather
> than filesystems. This may improve things somewhat. But I suppose
> > the filesystem will be unmounted once the container exits and the
> > memcgs will get offlined for your usecase.
>
> Yes, but when the containers are still running, the lists that get
> created for non-existing memcgs and non-relavent memcgs are the main
> cause of increased memory consumption.

Since the kernel doesn't know about containers, it simply doesn't
know which memcgs are non-relevant.

>
> >
> > > - In case of containers, a few file systems get mounted and are specific
> > >   to the container namespace and hence to a particular memcg, but we
> > >   end up creating lists for all the memcgs.
> >
> > Yes, because the kernel is *NOT* aware of containers.
> >
> > >   As an example, if 7 FS mounts are done for every container and when
> > >   10k containers are created, we end up creating 2*7*12286 list_lru_one
> > >   lists for each NUMA node. It appears that no elements will get added
> > >   to other than 2*7=14 of them in the case of containers.
> > >
> > > One straight forward way to prevent this excessive list_lru_one
> > > allocations is to limit the list_lru_one creation only to the
> > > relevant memcg. However I don't see an easy way to figure out
> > > that relevant memcg from F

Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-06 Thread Dave Chinner
On Mon, Apr 05, 2021 at 11:18:48AM +0530, Bharata B Rao wrote:
> Hi,
> 
> When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> consumption increases quite a lot (around 172G) when the containers are
> running. Most of it comes from slab (149G) and within slab, the majority of
> it comes from kmalloc-32 cache (102G)
> 
> The major allocator of kmalloc-32 slab cache happens to be the list_head
> allocations of list_lru_one list. These lists are created whenever a
> FS mount happens. Specially two such lists are registered by alloc_super(),
> one for dentry and another for inode shrinker list. And these lists
> are created for all possible NUMA nodes and for all given memcgs
> (memcg_nr_cache_ids to be particular)
> 
> If,
> 
> A = Nr allocation request per mount: 2 (one for dentry and inode list)
> B = Nr NUMA possible nodes
> C = memcg_nr_cache_ids
> D = size of each kmalloc-32 object: 32 bytes,
> 
> then for every mount, the amount of memory consumed by kmalloc-32 slab
> cache for list_lru creation is A*B*C*D bytes.
> 
> Following factors contribute to the excessive allocations:
> 
> - Lists are created for possible NUMA nodes.
> - memcg_nr_cache_ids grows in bulk (see memcg_alloc_cache_id() and additional
>   list_lrus are created when it grows. Thus we end up creating list_lru_one
>   list_heads even for those memcgs which are yet to be created.
>   For example, when 10000 memcgs are created, memcg_nr_cache_ids reaches
>   a value of 12286.

So, by your numbers, we have 2 * 2 * 12286 * 32 = 1.5MB per mount.

So for that to make up 100GB of RAM, you must have somewhere over
500,000 mounted superblocks on the machine?

That implies 50+ unique mounted superblocks per container, which
seems like an awful lot.

> - When a memcg goes offline, the list elements are drained to the parent
>   memcg, but the list_head entry remains.
> - The lists are destroyed only when the FS is unmounted. So list_heads
>   for non-existing memcgs remain and continue to contribute to the
>   kmalloc-32 allocation. This is presumably done for performance
>   reason as they get reused when new memcgs are created, but they end up
>   consuming slab memory until then.
> - In case of containers, a few file systems get mounted and are specific
>   to the container namespace and hence to a particular memcg, but we
>   end up creating lists for all the memcgs.
>   As an example, if 7 FS mounts are done for every container and when
>   10k containers are created, we end up creating 2*7*12286 list_lru_one
>   lists for each NUMA node. It appears that no elements will get added
>   to other than 2*7=14 of them in the case of containers.

Yeah, at first glance this doesn't strike me as a problem with the
list_lru structure, it smells more like a problem resulting from a
huge number of superblock instantiations on the machine. Which,
probably, mostly have no significant need for anything other than a
single memcg awareness?

Can you post a typical /proc/self/mounts output from one of these
idle/empty containers so we can see exactly how many mounts and
their type are being instantiated in each container?

> One straight forward way to prevent this excessive list_lru_one
> allocations is to limit the list_lru_one creation only to the
> relevant memcg. However I don't see an easy way to figure out
> that relevant memcg from FS mount path (alloc_super())

Superblocks have to support an unknown number of memcgs after they
have been mounted. bind mounts, child memcgs, etc, all mean that we
can't just have a static, single mount time memcg instantiation.

> As an alternative approach, I have this below hack that does lazy
> list_lru creation. The memcg-specific list is created and initialized
> only when there is a request to add an element to that particular
> list. Though I am not sure about the full impact of this change
> on the owners of the lists and also the performance impact of this,
> the overall savings look good.

Avoiding memory allocation in list_lru_add() was one of the main
reasons for up-front static allocation of memcg lists. We cannot do
memory allocation while callers are holding multiple spinlocks in
core system algorithms (e.g. dentry_kill -> retain_dentry ->
d_lru_add -> list_lru_add), let alone while holding an internal
spinlock.

Putting a GFP_ATOMIC allocation inside 3-4 nested spinlocks in a
path we know might have memory demand in the *hundreds of GB* range
gets an NACK from me. It's a great idea, but it's just not a
feasible, robust solution as proposed. Work out how to put the
memory allocation outside all the locks (including caller locks) and
it might be ok, but that's messy.
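For reference, here is the shape of that pattern sketched in userspace with
pthreads (all names illustrative; a sketch rather than kernel code): allocate
speculatively with no locks held, publish the pointer under the lock, and
free it if another thread won the race. In the kernel the speculative
allocation would have to happen before the callers' own locks are taken,
which is exactly the messy part.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_spinlock_t lock;
static int *slot;	/* stands in for memcg_lrus->lru[idx] */

static void ensure_slot(void)
{
	/* speculative allocation: no locks held, sleeping is allowed here */
	int *new_lru = malloc(sizeof(*new_lru));

	pthread_spin_lock(&lock);
	if (!slot && new_lru) {
		*new_lru = 0;		/* init_one_lru() equivalent */
		slot = new_lru;
		new_lru = NULL;		/* ownership transferred */
	}
	pthread_spin_unlock(&lock);

	free(new_lru);	/* lost the race, or allocation failed: no-op */
}

int main(void)
{
	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
	ensure_slot();
	printf("slot %s\n", slot ? "installed" : "missing");
	return 0;
}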

Another approach may be to identify filesystem types that do not
need mem

Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-06 Thread Bharata B Rao
On Mon, Apr 05, 2021 at 11:38:44AM -0700, Roman Gushchin wrote:
> > > @@ -534,7 +521,17 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
> > > spin_lock_irq(&nlru->lock);
> > >
> > > src = list_lru_from_memcg_idx(nlru, src_idx);
> > > +   if (!src)
> > > +   goto out;
> > > +
> > > dst = list_lru_from_memcg_idx(nlru, dst_idx);
> > > +   if (!dst) {
> > > +   /* TODO: Use __GFP_NOFAIL? */
> > > +   dst = kmalloc(sizeof(struct list_lru_one), GFP_ATOMIC);
> > > +   init_one_lru(dst);
> > > +   memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
> > > +   memcg_lrus->lru[dst_idx] = dst;
> > > +   }
> 
> Hm, can't we just reuse src as dst in this case?
> We don't need src anymore and we're basically allocating dst to move all data 
> from src.

Yes, we can do that and it would be much simpler.
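A hedged kernel-style sketch of that simplification, modeled on the hunk
quoted above (untested; locking unchanged from the original function):
instead of allocating dst, move src into the empty dst slot.

	src = list_lru_from_memcg_idx(nlru, src_idx);
	if (!src)
		goto out;

	dst = list_lru_from_memcg_idx(nlru, dst_idx);
	if (!dst) {
		/* reparent by moving the whole list_lru_one across slots:
		 * no allocation, and src's items already belong to dst */
		memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
		memcg_lrus->lru[dst_idx] = src;
		memcg_lrus->lru[src_idx] = NULL;
		goto out;
	}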

> If not, we can allocate up to the root memcg every time to avoid having
> the !dst case and fiddling with __GFP_NOFAIL.
> 
> Otherwise I like the idea and I think it might reduce the memory overhead
> especially on (very) big machines.

Yes, I will however have to check if the callers of list_lru_add() are capable
of handling failure which can happen with this approach if kmalloc fails.

Regards,
Bharata.


Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-06 Thread Bharata B Rao
On Mon, Apr 05, 2021 at 11:08:26AM -0700, Yang Shi wrote:
> On Sun, Apr 4, 2021 at 10:49 PM Bharata B Rao  wrote:
> >
> > Hi,
> >
> > When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> > server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> > consumption increases quite a lot (around 172G) when the containers are
> > running. Most of it comes from slab (149G) and within slab, the majority of
> > it comes from kmalloc-32 cache (102G)
> >
> > The major allocator of kmalloc-32 slab cache happens to be the list_head
> > allocations of list_lru_one list. These lists are created whenever a
> > FS mount happens. Specially two such lists are registered by alloc_super(),
> > one for dentry and another for inode shrinker list. And these lists
> > are created for all possible NUMA nodes and for all given memcgs
> > (memcg_nr_cache_ids to be particular)
> >
> > If,
> >
> > A = Nr allocation request per mount: 2 (one for dentry and inode list)
> > B = Nr NUMA possible nodes
> > C = memcg_nr_cache_ids
> > D = size of each kmalloc-32 object: 32 bytes,
> >
> > then for every mount, the amount of memory consumed by kmalloc-32 slab
> > cache for list_lru creation is A*B*C*D bytes.
> 
> Yes, this is exactly what the current implementation does.
> 
> >
> > Following factors contribute to the excessive allocations:
> >
> > - Lists are created for possible NUMA nodes.
> 
> Yes, because filesystem caches (dentry and inode) are NUMA aware.

True, but creating lists for possible nodes as against online nodes
can hurt platforms where possible is typically higher than online.

> 
> > - memcg_nr_cache_ids grows in bulk (see memcg_alloc_cache_id() and 
> > additional
> >   list_lrus are created when it grows. Thus we end up creating list_lru_one
> >   list_heads even for those memcgs which are yet to be created.
> >   For example, when 10000 memcgs are created, memcg_nr_cache_ids reaches
> >   a value of 12286.
> > - When a memcg goes offline, the list elements are drained to the parent
> >   memcg, but the list_head entry remains.
> > - The lists are destroyed only when the FS is unmounted. So list_heads
> >   for non-existing memcgs remain and continue to contribute to the
> >   kmalloc-32 allocation. This is presumably done for performance
> >   reason as they get reused when new memcgs are created, but they end up
> >   consuming slab memory until then.
> 
> The current implementation has list_lrus attached with super_block. So
> the list can't be freed until the super block is unmounted.
> 
> I'm looking into consolidating list_lrus more closely with memcgs. It
> means the list_lrus will have the same life cycles as memcgs rather
> than filesystems. This may improve things somewhat. But I suppose
> the filesystem will be unmounted once the container exits and the
> memcgs will get offlined for your usecase.

Yes, but when the containers are still running, the lists that get
created for non-existing memcgs and non-relavent memcgs are the main
cause of increased memory consumption.

> 
> > - In case of containers, a few file systems get mounted and are specific
> >   to the container namespace and hence to a particular memcg, but we
> >   end up creating lists for all the memcgs.
> 
> Yes, because the kernel is *NOT* aware of containers.
> 
> >   As an example, if 7 FS mounts are done for every container and when
> >   10k containers are created, we end up creating 2*7*12286 list_lru_one
> >   lists for each NUMA node. It appears that no elements will get added
> >   to other than 2*7=14 of them in the case of containers.
> >
> > One straight forward way to prevent this excessive list_lru_one
> > allocations is to limit the list_lru_one creation only to the
> > relevant memcg. However I don't see an easy way to figure out
> > that relevant memcg from FS mount path (alloc_super())
> >
> > As an alternative approach, I have this below hack that does lazy
> > list_lru creation. The memcg-specific list is created and initialized
> > only when there is a request to add an element to that particular
> > list. Though I am not sure about the full impact of this change
> > on the owners of the lists and also the performance impact of this,
> > the overall savings look good.
> 
> It is fine to reduce the memory consumption for your usecase, but I'm
> not sure if this would incur any noticeable overhead for vfs
> operations since list_lru_add() should be called quite often, but it
> just needs to allocate the list once (for each memcg +
> filesystem), so the

Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-05 Thread Roman Gushchin
On Mon, Apr 05, 2021 at 11:08:26AM -0700, Yang Shi wrote:
> On Sun, Apr 4, 2021 at 10:49 PM Bharata B Rao  wrote:
> >
> > Hi,
> >
> > When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> > server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> > consumption increases quite a lot (around 172G) when the containers are
> > running. Most of it comes from slab (149G) and within slab, the majority of
> > it comes from kmalloc-32 cache (102G)
> >
> > The major allocator of kmalloc-32 slab cache happens to be the list_head
> > allocations of list_lru_one list. These lists are created whenever a
> > FS mount happens. Specially two such lists are registered by alloc_super(),
> > one for dentry and another for inode shrinker list. And these lists
> > are created for all possible NUMA nodes and for all given memcgs
> > (memcg_nr_cache_ids to be particular)
> >
> > If,
> >
> > A = Nr allocation request per mount: 2 (one for dentry and inode list)
> > B = Nr NUMA possible nodes
> > C = memcg_nr_cache_ids
> > D = size of each kmalloc-32 object: 32 bytes,
> >
> > then for every mount, the amount of memory consumed by kmalloc-32 slab
> > cache for list_lru creation is A*B*C*D bytes.
> 
> Yes, this is exactly what the current implementation does.
> 
> >
> > Following factors contribute to the excessive allocations:
> >
> > - Lists are created for possible NUMA nodes.
> 
> Yes, because filesystem caches (dentry and inode) are NUMA aware.
> 
> > - memcg_nr_cache_ids grows in bulk (see memcg_alloc_cache_id() and 
> > additional
> >   list_lrus are created when it grows. Thus we end up creating list_lru_one
> >   list_heads even for those memcgs which are yet to be created.
> >   For example, when 10000 memcgs are created, memcg_nr_cache_ids reaches
> >   a value of 12286.
> > - When a memcg goes offline, the list elements are drained to the parent
> >   memcg, but the list_head entry remains.
> > - The lists are destroyed only when the FS is unmounted. So list_heads
> >   for non-existing memcgs remain and continue to contribute to the
> >   kmalloc-32 allocation. This is presumably done for performance
> >   reason as they get reused when new memcgs are created, but they end up
> >   consuming slab memory until then.
> 
> The current implementation has list_lrus attached with super_block. So
> the list can't be freed until the super block is unmounted.
> 
> I'm looking into consolidating list_lrus more closely with memcgs. It
> means the list_lrus will have the same life cycles as memcgs rather
> than filesystems. This may improve things somewhat. But I suppose
> the filesystem will be unmounted once the container exits and the
> memcgs will get offlined for your usecase.
> 
> > - In case of containers, a few file systems get mounted and are specific
> >   to the container namespace and hence to a particular memcg, but we
> >   end up creating lists for all the memcgs.
> 
> Yes, because the kernel is *NOT* aware of containers.
> 
> >   As an example, if 7 FS mounts are done for every container and when
> >   10k containers are created, we end up creating 2*7*12286 list_lru_one
> >   lists for each NUMA node. It appears that no elements will get added
> >   to other than 2*7=14 of them in the case of containers.
> >
> > One straight forward way to prevent this excessive list_lru_one
> > allocations is to limit the list_lru_one creation only to the
> > relevant memcg. However I don't see an easy way to figure out
> > that relevant memcg from FS mount path (alloc_super())
> >
> > As an alternative approach, I have this below hack that does lazy
> > list_lru creation. The memcg-specific list is created and initialized
> > only when there is a request to add an element to that particular
> > list. Though I am not sure about the full impact of this change
> > on the owners of the lists and also the performance impact of this,
> > the overall savings look good.
> 
> It is fine to reduce the memory consumption for your usecase, but I'm
> not sure if this would incur any noticeable overhead for vfs
> operations since list_lru_add() should be called quite often, but it
> just needs to allocate the list once (for each memcg +
> filesystem), so the overhead might be fine.
> 
> And I'm wondering how much memory can be saved for real life workload.
> I don't expect most containers are idle in production environments.
> 
> Added some more memcg/list_lru experts in this loop, they may have better 
> ideas.
> 
> >
> > Used memory

Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-05 Thread Yang Shi
On Sun, Apr 4, 2021 at 10:49 PM Bharata B Rao  wrote:
>
> Hi,
>
> When running 10000 (more-or-less-empty-)containers on a bare-metal Power9
> server(160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
> consumption increases quite a lot (around 172G) when the containers are
> running. Most of it comes from slab (149G) and within slab, the majority of
> it comes from kmalloc-32 cache (102G)
>
> The major allocator of kmalloc-32 slab cache happens to be the list_head
> allocations of list_lru_one list. These lists are created whenever a
> FS mount happens. Specially two such lists are registered by alloc_super(),
> one for dentry and another for inode shrinker list. And these lists
> are created for all possible NUMA nodes and for all given memcgs
> (memcg_nr_cache_ids to be particular)
>
> If,
>
> A = Nr allocation request per mount: 2 (one for dentry and inode list)
> B = Nr NUMA possible nodes
> C = memcg_nr_cache_ids
> D = size of each kmalloc-32 object: 32 bytes,
>
> then for every mount, the amount of memory consumed by kmalloc-32 slab
> cache for list_lru creation is A*B*C*D bytes.

Yes, this is exactly what the current implementation does.
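For orientation, the data structures under discussion have roughly this
shape (a trimmed sketch of include/linux/list_lru.h from that era; several
fields omitted). Each list_lru_one is the small object that lands in the
kmalloc-32 cache:

struct list_lru_one {
	struct list_head	list;
	long			nr_items;
};	/* 24 bytes on 64-bit, served from the kmalloc-32 cache */

struct list_lru_memcg {
	struct rcu_head		rcu;
	/* one pointer per memcg kmem id: memcg_nr_cache_ids entries */
	struct list_lru_one	*lru[];
};

struct list_lru_node {
	spinlock_t		lock;
	struct list_lru_one	lru;		/* global (non-memcg) list */
	struct list_lru_memcg __rcu *memcg_lrus;
};

struct list_lru {
	struct list_lru_node	*node;		/* one per possible NUMA node */
};

With memcg_nr_cache_ids at 12286 and two possible NUMA nodes, the two
list_lrus of a single super block account for 2 * 2 * 12286 of these
objects, roughly the 1.5MB per mount computed elsewhere in this thread.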

>
> Following factors contribute to the excessive allocations:
>
> - Lists are created for possible NUMA nodes.

Yes, because filesystem caches (dentry and inode) are NUMA aware.

> - memcg_nr_cache_ids grows in bulk (see memcg_alloc_cache_id() and additional
>   list_lrus are created when it grows. Thus we end up creating list_lru_one
>   list_heads even for those memcgs which are yet to be created.
>   For example, when 10000 memcgs are created, memcg_nr_cache_ids reaches
>   a value of 12286.
> - When a memcg goes offline, the list elements are drained to the parent
>   memcg, but the list_head entry remains.
> - The lists are destroyed only when the FS is unmounted. So list_heads
>   for non-existing memcgs remain and continue to contribute to the
>   kmalloc-32 allocation. This is presumably done for performance
>   reason as they get reused when new memcgs are created, but they end up
>   consuming slab memory until then.

The current implementation has list_lrus attached with super_block. So
the list can't be freed until the super block is unmounted.

I'm looking into consolidating list_lrus more closely with memcgs. It
means the list_lrus will have the same life cycles as memcgs rather
than filesystems. This may improve things somewhat. But I suppose
the filesystem will be unmounted once the container exits and the
memcgs will get offlined for your usecase.

> - In case of containers, a few file systems get mounted and are specific
>   to the container namespace and hence to a particular memcg, but we
>   end up creating lists for all the memcgs.

Yes, because the kernel is *NOT* aware of containers.

>   As an example, if 7 FS mounts are done for every container and when
>   10k containers are created, we end up creating 2*7*12286 list_lru_one
>   lists for each NUMA node. It appears that no elements will get added
>   to other than 2*7=14 of them in the case of containers.
>
> One straight forward way to prevent this excessive list_lru_one
> allocations is to limit the list_lru_one creation only to the
> relevant memcg. However I don't see an easy way to figure out
> that relevant memcg from FS mount path (alloc_super())
>
> As an alternative approach, I have this below hack that does lazy
> list_lru creation. The memcg-specific list is created and initialized
> only when there is a request to add an element to that particular
> list. Though I am not sure about the full impact of this change
> on the owners of the lists and also the performance impact of this,
> the overall savings look good.

It is fine to reduce the memory consumption for your usecase, but I'm
not sure if this would incur any noticeable overhead for vfs
operations since list_lru_add() should be called quite often, but it
just needs to allocate the list once (for each memcg +
filesystem), so the overhead might be fine.

And I'm wondering how much memory can be saved for real life workload.
I don't expect most containers are idle in production environments.

Added some more memcg/list_lru experts in this loop, they may have better ideas.

>
> Used memory
>               Before  During  After
> W/o patch     23G     172G    40G
> W/  patch     23G     69G     29G
>
> Slab consumption
>               Before  During  After
> W/o patch     1.5G    149G    22G
> W/  patch     1.5G    45G     10G
>
> Number of kmalloc-32 allocations
>               Before  During      After
> W/o patch     178176  3442409472  388933632
> 

[PATCH 5.11 006/152] rpc: fix NULL dereference on kmalloc failure

2021-04-05 Thread Greg Kroah-Hartman
From: J. Bruce Fields 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index bd4678db9d76..6dff64374bfe 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1825,11 +1825,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1870,10 +1873,10 @@ out_err:
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1





[PATCH 5.10 007/126] rpc: fix NULL dereference on kmalloc failure

2021-04-05 Thread Greg Kroah-Hartman
From: J. Bruce Fields 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index bd4678db9d76..6dff64374bfe 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1825,11 +1825,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1870,10 +1873,10 @@ out_err:
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1





[PATCH 5.4 09/74] rpc: fix NULL dereference on kmalloc failure

2021-04-05 Thread Greg Kroah-Hartman
From: J. Bruce Fields 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index cf4d6d7e7282..d5470c7fe879 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1782,11 +1782,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1827,10 +1830,10 @@ out_err:
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1





[PATCH 4.19 05/56] rpc: fix NULL dereference on kmalloc failure

2021-04-05 Thread Greg Kroah-Hartman
From: J. Bruce Fields 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index ab086081be9c..a85d78d2bdb7 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1766,11 +1766,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1811,10 +1814,10 @@ out_err:
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1





[PATCH 4.14 04/52] rpc: fix NULL dereference on kmalloc failure

2021-04-05 Thread Greg Kroah-Hartman
From: J. Bruce Fields 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index 03043d5221e9..27dfd85830d8 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1713,11 +1713,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1758,10 +1761,10 @@ out_err:
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1





[PATCH 4.9 04/35] rpc: fix NULL dereference on kmalloc failure

2021-04-05 Thread Greg Kroah-Hartman
From: J. Bruce Fields 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index fd897d900d12..85ad23d9a8a9 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1705,11 +1705,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1750,10 +1753,10 @@ out_err:
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1





[PATCH 4.4 04/28] rpc: fix NULL dereference on kmalloc failure

2021-04-05 Thread Greg Kroah-Hartman
From: J. Bruce Fields 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index 91263d6a103b..bb8b0ef5de82 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1697,11 +1697,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1742,10 +1745,10 @@ out_err:
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1





Re: High kmalloc-32 slab cache consumption with 10k containers

2021-04-05 Thread kernel test robot
Hi Bharata,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linux/master]
[also build test ERROR on linus/master v5.12-rc6 next-20210401]
[cannot apply to hnaz-linux-mm/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Bharata-B-Rao/High-kmalloc-32-slab-cache-consumption-with-10k-containers/20210405-135124
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 
5e46d1b78a03d52306f21f77a4e4a144b6d31486
config: s390-randconfig-r032-20210405 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 
2760a808b9916a2839513b7fd7314a464f52481e)
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# install s390 cross compiling tool for clang build
# apt-get install binutils-s390x-linux-gnu
# 
https://github.com/0day-ci/linux/commit/b7ba3522029771bd74c8b6324de9ce20b5a06593
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Bharata-B-Rao/High-kmalloc-32-slab-cache-consumption-with-10k-containers/20210405-135124
git checkout b7ba3522029771bd74c8b6324de9ce20b5a06593
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=s390 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

   In file included from mm/list_lru.c:14:
   In file included from include/linux/memcontrol.h:22:
   In file included from include/linux/writeback.h:14:
   In file included from include/linux/blk-cgroup.h:23:
   In file included from include/linux/blkdev.h:26:
   In file included from include/linux/scatterlist.h:9:
   In file included from arch/s390/include/asm/io.h:80:
   include/asm-generic/io.h:464:31: warning: performing pointer arithmetic on a 
null pointer has undefined behavior [-Wnull-pointer-arithmetic]
   val = __raw_readb(PCI_IOBASE + addr);
 ~~ ^
   include/asm-generic/io.h:477:61: warning: performing pointer arithmetic on a 
null pointer has undefined behavior [-Wnull-pointer-arithmetic]
   val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
   ~~ ^
   include/uapi/linux/byteorder/big_endian.h:36:59: note: expanded from macro 
'__le16_to_cpu'
   #define __le16_to_cpu(x) __swab16((__force __u16)(__le16)(x))
 ^
   include/uapi/linux/swab.h:102:54: note: expanded from macro '__swab16'
   #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x))
^
   In file included from mm/list_lru.c:14:
   In file included from include/linux/memcontrol.h:22:
   In file included from include/linux/writeback.h:14:
   In file included from include/linux/blk-cgroup.h:23:
   In file included from include/linux/blkdev.h:26:
   In file included from include/linux/scatterlist.h:9:
   In file included from arch/s390/include/asm/io.h:80:
   include/asm-generic/io.h:490:61: warning: performing pointer arithmetic on a 
null pointer has undefined behavior [-Wnull-pointer-arithmetic]
   val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
   ~~ ^
   include/uapi/linux/byteorder/big_endian.h:34:59: note: expanded from macro 
'__le32_to_cpu'
   #define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x))
 ^
   include/uapi/linux/swab.h:115:54: note: expanded from macro '__swab32'
   #define __swab32(x) (__u32)__builtin_bswap32((__u32)(x))
^
   In file included from mm/list_lru.c:14:
   In file included from include/linux/memcontrol.h:22:
   In file included from include/linux/writeback.h:14:
   In file included from include/linux/blk-cgroup.h:23:
   In file included from include/linux/blkdev.h:26:
   In file included from include/linux/scatterlist.h:9:
   In file included from arch/s390/include/asm/io.h:80:
   include/asm-generic/io.h:501:33: warning: performing pointer arithmetic on a 
null pointer has undefined behavior [-Wnull-pointer-arithmetic]
   __raw_writeb(value, PCI_IOBASE + addr);
   ~~ ^
   include/asm-generic/io.h:511:59: warning: performing pointer arithmetic on a 
null pointer has undefined behavior [-Wnull-pointer-arithmetic]
   __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE 

High kmalloc-32 slab cache consumption with 10k containers

2021-04-04 Thread Bharata B Rao
Hi,

When running 10000 (more-or-less-empty) containers on a bare-metal Power9
server (160 CPUs, 2 NUMA nodes, 256G memory), it is seen that memory
consumption increases quite a lot (around 172G) when the containers are
running. Most of it comes from slab (149G) and within slab, the majority of
it comes from the kmalloc-32 cache (102G).

The major consumer of the kmalloc-32 slab cache happens to be the list_head
allocations of the list_lru_one lists. These lists are created whenever a
FS mount happens. Specifically, two such lists are registered by alloc_super(),
one for dentry and another for inode shrinker list. And these lists
are created for all possible NUMA nodes and for all given memcgs
(memcg_nr_cache_ids of them, to be precise).

If,

A = Nr of allocation requests per mount: 2 (one for the dentry and one for the inode list)
B = Nr NUMA possible nodes
C = memcg_nr_cache_ids
D = size of each kmalloc-32 object: 32 bytes,

then for every mount, the amount of memory consumed by kmalloc-32 slab
cache for list_lru creation is A*B*C*D bytes.
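
As a rough sanity check of that formula against the numbers reported
below (B = 2 NUMA nodes, C = 12286, 7 mounts per container, 10k
containers):

    per mount:   2 * 2 * 12286 * 32 bytes ~= 1.5 MB
    total:       1.5 MB * 7 * 10000       ~= 110 GB
    allocations: 70000 * 2 * 2 * 12286    ~= 3.44e9

which lines up with the ~102G of kmalloc-32 and the 3442409472
allocations measured during the run.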

Following factors contribute to the excessive allocations:

- Lists are created for all possible NUMA nodes.
- memcg_nr_cache_ids grows in bulk (see memcg_alloc_cache_id()) and additional
  list_lrus are created when it grows. Thus we end up creating list_lru_one
  list_heads even for those memcgs which are yet to be created.
  For example, when 10000 memcgs are created, memcg_nr_cache_ids reaches
  a value of 12286.
- When a memcg goes offline, the list elements are drained to the parent
  memcg, but the list_head entry remains.
- The lists are destroyed only when the FS is unmounted. So list_heads
  for non-existing memcgs remain and continue to contribute to the
  kmalloc-32 allocation. This is presumably done for performance
  reason as they get reused when new memcgs are created, but they end up
  consuming slab memory until then.
- In case of containers, a few file systems get mounted and are specific
  to the container namespace and hence to a particular memcg, but we
  end up creating lists for all the memcgs.
  As an example, if 7 FS mounts are done for every container and when
  10k containers are created, we end up creating 2*7*12286 list_lru_one
  lists for each NUMA node. It appears that elements will get added
  to only 2*7=14 of them in the case of containers.

One straightforward way to prevent these excessive list_lru_one
allocations is to limit list_lru_one creation to the relevant
memcg only. However, I don't see an easy way to figure out that
relevant memcg from the FS mount path (alloc_super()).

As an alternative approach, I have the below hack that does lazy
list_lru creation. The memcg-specific list is created and initialized
only when there is a request to add an element to that particular
list. Though I am not sure about the full impact of this change on
the owners of the lists, nor about its performance impact, the
overall savings look good.

Used memory
            Before      During      After
W/o patch   23G         172G        40G
W/  patch   23G         69G         29G

Slab consumption
            Before      During      After
W/o patch   1.5G        149G        22G
W/  patch   1.5G        45G         10G

Number of kmalloc-32 allocations
            Before      During      After
W/o patch   178176      3442409472  388933632
W/  patch   190464      468992      468992

Any thoughts on other approaches to address this scenario and
any specific comments about the approach that I have taken is
appreciated. Meanwhile the patch looks like below:

From 9444a0c6734c2853057b1f486f85da2c409fdc84 Mon Sep 17 00:00:00 2001
From: Bharata B Rao 
Date: Wed, 31 Mar 2021 18:21:45 +0530
Subject: [PATCH 1/1] mm: list_lru: Allocate list_lru_one only when required.

Don't pre-allocate list_lru_one list heads for all memcg_cache_ids.
Instead, allocate and initialize them only when required.

Signed-off-by: Bharata B Rao 
---
 mm/list_lru.c | 79 +--
 1 file changed, 38 insertions(+), 41 deletions(-)

diff --git a/mm/list_lru.c b/mm/list_lru.c
index 6f067b6b935f..b453fa5008cc 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -112,16 +112,32 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
 }
 #endif /* CONFIG_MEMCG_KMEM */
 
+static void init_one_lru(struct list_lru_one *l)
+{
+   INIT_LIST_HEAD(&l->list);
+   l->nr_items = 0;
+}
+
 bool list_lru_add(struct list_lru *lru, struct list_head *item)
 {
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
struct mem_cgroup *memcg;
struct list_lru_one *l;
+   struct list_lru_memcg *memcg_lrus;
 
spin_lock(&nlru->lock);
if (list_empty(item)) {
l = list_lru_from_kmem(nlru, item, &memcg);
+   if (!l) {
+   l = kmalloc(sizeof(struct list_lru_o

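The hunk above is cut off by the archive. As a rough user-space model of
the lazy-creation idea (illustrative names, no locking; not the kernel
list_lru code):

#include <stdio.h>
#include <stdlib.h>

#define NR_MEMCGS 12286

struct item { struct item *next; };

struct one_list {
	struct item *head;
	long nr_items;
};

/* all slots start NULL: nothing is allocated up front */
static struct one_list *lists[NR_MEMCGS];

static int lazy_add(int memcg_id, struct item *it)
{
	struct one_list *l = lists[memcg_id];

	if (!l) {			/* first use: allocate only now */
		l = calloc(1, sizeof(*l));
		if (!l)
			return -1;
		lists[memcg_id] = l;
	}
	it->next = l->head;
	l->head = it;
	l->nr_items++;
	return 0;
}

int main(void)
{
	struct item a, b;

	lazy_add(7, &a);	/* only slot 7 is ever allocated */
	lazy_add(7, &b);
	printf("slot 7 holds %ld items; the other %d slots cost nothing\n",
	       lists[7]->nr_items, NR_MEMCGS - 1);
	return 0;
}
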
Re: [PATCH] drm/amd: use kmalloc_array over kmalloc with multiply

2021-04-01 Thread Alex Deucher
Applied.  Thanks!

On Wed, Mar 31, 2021 at 9:36 AM Bernard Zhao  wrote:
>
> Fix patch check warning:
> WARNING: Prefer kmalloc_array over kmalloc with multiply
> +   buf = kmalloc(MAX_KFIFO_SIZE * sizeof(*buf), GFP_KERNEL);
>
> Signed-off-by: Bernard Zhao 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> index 17d1736367ea..246522423559 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
> @@ -81,7 +81,7 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char 
> __user *user,
> struct kfd_smi_client *client = filep->private_data;
> unsigned char *buf;
>
> -   buf = kmalloc(MAX_KFIFO_SIZE * sizeof(*buf), GFP_KERNEL);
> +   buf = kmalloc_array(MAX_KFIFO_SIZE, sizeof(*buf), GFP_KERNEL);
> if (!buf)
> return -ENOMEM;
>
> --
> 2.31.0
>
> ___
> amd-gfx mailing list
> amd-...@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
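
For context, the reason checkpatch prefers kmalloc_array() here is the
overflow check; a sketch of what it does internally (simplified from the
mainline helper in include/linux/slab.h):

#include <linux/overflow.h>
#include <linux/slab.h>

static inline void *kmalloc_array_sketch(size_t n, size_t size, gfp_t flags)
{
	size_t bytes;

	/* fail cleanly if n * size would wrap instead of under-allocating */
	if (check_mul_overflow(n, size, &bytes))
		return NULL;
	return kmalloc(bytes, flags);
}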


RE: [PATCH] cifsd: use kfree to free memory allocated by kmalloc or kzalloc

2021-04-01 Thread Namjae Jeon
> 
> kfree should be used to free memory allocated by kmalloc or kzalloc to avoid 
> any overhead and for
> maintaining consistency.
> 
> Fixes: 5dfeb6d945 ("cifsd: use kmalloc() for small allocations")
> Signed-off-by: Muhammad Usama Anjum 
Looks good. I will apply. Thanks for your patch!



[PATCH] cifsd: use kfree to free memory allocated by kmalloc or kzalloc

2021-04-01 Thread Muhammad Usama Anjum
kfree should be used to free memory allocated by kmalloc or kzalloc to
avoid any overhead and for maintaining consistency.

Fixes: 5dfeb6d945 ("cifsd: use kmalloc() for small allocations")
Signed-off-by: Muhammad Usama Anjum 
---
 fs/cifsd/buffer_pool.c   | 4 ++--
 fs/cifsd/mgmt/share_config.c | 2 +-
 fs/cifsd/mgmt/user_config.c  | 8 
 fs/cifsd/mgmt/user_session.c | 6 +++---
 fs/cifsd/smb2pdu.c   | 4 ++--
 fs/cifsd/vfs_cache.c | 2 +-
 6 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/fs/cifsd/buffer_pool.c b/fs/cifsd/buffer_pool.c
index ad2a2c885a2c..a9ef3e703232 100644
--- a/fs/cifsd/buffer_pool.c
+++ b/fs/cifsd/buffer_pool.c
@@ -78,7 +78,7 @@ static int register_wm_size_class(size_t sz)
list_for_each_entry(l, &wm_lists, list) {
if (l->sz == sz) {
write_unlock(&wm_lists_lock);
-   kvfree(nl);
+   kfree(nl);
return 0;
}
}
@@ -181,7 +181,7 @@ static void wm_list_free(struct wm_list *l)
list_del(&wm->list);
kvfree(wm);
}
-   kvfree(l);
+   kfree(l);
 }
 
 static void wm_lists_destroy(void)
diff --git a/fs/cifsd/mgmt/share_config.c b/fs/cifsd/mgmt/share_config.c
index db780febd692..e3d459c4dbb2 100644
--- a/fs/cifsd/mgmt/share_config.c
+++ b/fs/cifsd/mgmt/share_config.c
@@ -102,7 +102,7 @@ static int parse_veto_list(struct ksmbd_share_config *share,
 
p->pattern = kstrdup(veto_list, GFP_KERNEL);
if (!p->pattern) {
-   ksmbd_free(p);
+   kfree(p);
return -ENOMEM;
}
 
diff --git a/fs/cifsd/mgmt/user_config.c b/fs/cifsd/mgmt/user_config.c
index f0c2f8994a6b..c31e2c4d2d6f 100644
--- a/fs/cifsd/mgmt/user_config.c
+++ b/fs/cifsd/mgmt/user_config.c
@@ -46,8 +46,8 @@ struct ksmbd_user *ksmbd_alloc_user(struct 
ksmbd_login_response *resp)
 
if (!user->name || !user->passkey) {
kfree(user->name);
-   ksmbd_free(user->passkey);
-   ksmbd_free(user);
+   kfree(user->passkey);
+   kfree(user);
user = NULL;
}
return user;
@@ -57,8 +57,8 @@ void ksmbd_free_user(struct ksmbd_user *user)
 {
ksmbd_ipc_logout_request(user->name);
kfree(user->name);
-   ksmbd_free(user->passkey);
-   ksmbd_free(user);
+   kfree(user->passkey);
+   kfree(user);
 }
 
 int ksmbd_anonymous_user(struct ksmbd_user *user)
diff --git a/fs/cifsd/mgmt/user_session.c b/fs/cifsd/mgmt/user_session.c
index 5a2113bf18ef..fa2140d4755a 100644
--- a/fs/cifsd/mgmt/user_session.c
+++ b/fs/cifsd/mgmt/user_session.c
@@ -53,7 +53,7 @@ static void __session_rpc_close(struct ksmbd_session *sess,
 
ksmbd_free(resp);
ksmbd_rpc_id_free(entry->id);
-   ksmbd_free(entry);
+   kfree(entry);
 }
 
 static void ksmbd_session_rpc_clear_list(struct ksmbd_session *sess)
@@ -119,7 +119,7 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char 
*rpc_name)
return entry->id;
 error:
list_del(&entry->list);
-   ksmbd_free(entry);
+   kfree(entry);
return -EINVAL;
 }
 
@@ -174,7 +174,7 @@ void ksmbd_session_destroy(struct ksmbd_session *sess)
ksmbd_release_id(session_ida, sess->id);
 
ksmbd_ida_free(sess->tree_conn_ida);
-   ksmbd_free(sess);
+   kfree(sess);
 }
 
 static struct ksmbd_session *__session_lookup(unsigned long long id)
diff --git a/fs/cifsd/smb2pdu.c b/fs/cifsd/smb2pdu.c
index 139041768f65..a4bcf6a85f02 100644
--- a/fs/cifsd/smb2pdu.c
+++ b/fs/cifsd/smb2pdu.c
@@ -1611,7 +1611,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
 
ksmbd_conn_set_good(work);
sess->state = SMB2_SESSION_VALID;
-   ksmbd_free(sess->Preauth_HashValue);
+   kfree(sess->Preauth_HashValue);
sess->Preauth_HashValue = NULL;
} else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) {
rc = generate_preauth_hash(work);
@@ -1637,7 +1637,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
 
ksmbd_conn_set_good(work);
sess->state = SMB2_SESSION_VALID;
-   ksmbd_free(sess->Preauth_HashValue);
+   kfree(sess->Preauth_HashValue);
sess->Preauth_HashValue = NULL;
}
} else {
diff --git a/fs/cifsd/vfs_cache.c b/fs/cifsd/vfs_cache.c
index ec631dc6f1fb..f2a863542dc7 100644
--- a/fs/cifsd/vfs_cache.c
+++ b/fs/cifsd/vfs_cache.c
@@ -829,6 +829,6 @@ void ksmbd_destroy_file_table(struct ksmbd_file_table *ft)
 
__close_file_table_ids(ft, NULL, session_
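
The diff is cut off by the archive. For context, a sketch of why kfree()
is the cheaper match for kmalloc()ed memory: kvfree() first has to test
which allocator the address came from (essentially what mm/util.c does):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

void kvfree_sketch(const void *addr)
{
	if (is_vmalloc_addr(addr))	/* extra check plain kfree() callers skip */
		vfree(addr);
	else
		kfree(addr);
}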

[PATCH] drm/amd: use kmalloc_array over kmalloc with multiply

2021-03-31 Thread Bernard Zhao
Fix patch check warning:
WARNING: Prefer kmalloc_array over kmalloc with multiply
+   buf = kmalloc(MAX_KFIFO_SIZE * sizeof(*buf), GFP_KERNEL);

Signed-off-by: Bernard Zhao 
---
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index 17d1736367ea..246522423559 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -81,7 +81,7 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char 
__user *user,
struct kfd_smi_client *client = filep->private_data;
unsigned char *buf;
 
-   buf = kmalloc(MAX_KFIFO_SIZE * sizeof(*buf), GFP_KERNEL);
+   buf = kmalloc_array(MAX_KFIFO_SIZE, sizeof(*buf), GFP_KERNEL);
if (!buf)
return -ENOMEM;
 
-- 
2.31.0



[PATCH AUTOSEL 4.4 02/10] rpc: fix NULL dereference on kmalloc failure

2021-03-25 Thread Sasha Levin
From: "J. Bruce Fields" 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index 91263d6a103b..bb8b0ef5de82 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1697,11 +1697,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1742,10 +1745,10 @@ svcauth_gss_release(struct svc_rqst *rqstp)
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1



[PATCH AUTOSEL 4.9 02/13] rpc: fix NULL dereference on kmalloc failure

2021-03-25 Thread Sasha Levin
From: "J. Bruce Fields" 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index fd897d900d12..85ad23d9a8a9 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1705,11 +1705,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1750,10 +1753,10 @@ svcauth_gss_release(struct svc_rqst *rqstp)
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1



[PATCH AUTOSEL 4.14 02/16] rpc: fix NULL dereference on kmalloc failure

2021-03-25 Thread Sasha Levin
From: "J. Bruce Fields" 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index 03043d5221e9..27dfd85830d8 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1713,11 +1713,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1758,10 +1761,10 @@ svcauth_gss_release(struct svc_rqst *rqstp)
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1



[PATCH AUTOSEL 4.19 02/20] rpc: fix NULL dereference on kmalloc failure

2021-03-25 Thread Sasha Levin
From: "J. Bruce Fields" 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index ab086081be9c..a85d78d2bdb7 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1766,11 +1766,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1811,10 +1814,10 @@ svcauth_gss_release(struct svc_rqst *rqstp)
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1



[PATCH AUTOSEL 5.4 04/24] rpc: fix NULL dereference on kmalloc failure

2021-03-25 Thread Sasha Levin
From: "J. Bruce Fields" 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index cf4d6d7e7282..d5470c7fe879 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1782,11 +1782,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1827,10 +1830,10 @@ svcauth_gss_release(struct svc_rqst *rqstp)
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1



[PATCH AUTOSEL 5.10 07/39] rpc: fix NULL dereference on kmalloc failure

2021-03-25 Thread Sasha Levin
From: "J. Bruce Fields" 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index bd4678db9d76..6dff64374bfe 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1825,11 +1825,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1870,10 +1873,10 @@ svcauth_gss_release(struct svc_rqst *rqstp)
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1



[PATCH AUTOSEL 5.11 07/44] rpc: fix NULL dereference on kmalloc failure

2021-03-25 Thread Sasha Levin
From: "J. Bruce Fields" 

[ Upstream commit 0ddc942394013f08992fc379ca04cffacbbe3dae ]

I think this is unlikely but possible:

svc_authenticate sets rq_authop and calls svcauth_gss_accept.  The
kmalloc(sizeof(*svcdata), GFP_KERNEL) fails, leaving rq_auth_data NULL,
and returning SVC_DENIED.

This causes svc_process_common to go to err_bad_auth, and eventually
call svc_authorise.  That calls ->release == svcauth_gss_release, which
tries to dereference rq_auth_data.

Signed-off-by: J. Bruce Fields 
Link: 
https://lore.kernel.org/linux-nfs/3f1b347f-b809-478f-a1e9-0be98e22b...@oracle.com/T/#t
Signed-off-by: Chuck Lever 
Signed-off-by: Sasha Levin 
---
 net/sunrpc/auth_gss/svcauth_gss.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/auth_gss/svcauth_gss.c 
b/net/sunrpc/auth_gss/svcauth_gss.c
index bd4678db9d76..6dff64374bfe 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1825,11 +1825,14 @@ static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
-   struct rpc_gss_wire_cred *gc = &gsd->clcred;
+   struct rpc_gss_wire_cred *gc;
struct xdr_buf *resbuf = &rqstp->rq_res;
int stat = -EINVAL;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
 
+   if (!gsd)
+   goto out;
+   gc = &gsd->clcred;
if (gc->gc_proc != RPC_GSS_PROC_DATA)
goto out;
/* Release can be called twice, but we only wrap once. */
@@ -1870,10 +1873,10 @@ svcauth_gss_release(struct svc_rqst *rqstp)
if (rqstp->rq_cred.cr_group_info)
put_group_info(rqstp->rq_cred.cr_group_info);
rqstp->rq_cred.cr_group_info = NULL;
-   if (gsd->rsci)
+   if (gsd && gsd->rsci) {
cache_put(&gsd->rsci->h, sn->rsc_cache);
-   gsd->rsci = NULL;
-
+   gsd->rsci = NULL;
+   }
return stat;
 }
 
-- 
2.30.1



Re: [PATCH] mm/slab: kmalloc with GFP_DMA32 allocate from SLAB_CACHE_DMA32

2021-03-12 Thread kernel test robot
Hi Jianqun,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on hnaz-linux-mm/master]

url:
https://github.com/0day-ci/linux/commits/Jianqun-Xu/mm-slab-kmalloc-with-GFP_DMA32-allocate-from-SLAB_CACHE_DMA32/20210312-160426
base:   https://github.com/hnaz/linux-mm master
config: x86_64-randconfig-a006-20210312 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 
7b153b43d3a14d76975039408c4b922beb576735)
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# install x86_64 cross compiling tool for clang build
# apt-get install binutils-x86-64-linux-gnu
# 
https://github.com/0day-ci/linux/commit/93abfa9fa97332c9d5c1727fcf76b604b6da33de
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Jianqun-Xu/mm-slab-kmalloc-with-GFP_DMA32-allocate-from-SLAB_CACHE_DMA32/20210312-160426
git checkout 93abfa9fa97332c9d5c1727fcf76b604b6da33de
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All error/warnings (new ones prefixed by >>):

>> mm/slab_common.c:812:24: error: implicit declaration of function 
>> 'kmalloc_size' [-Werror,-Wimplicit-function-declaration]
   unsigned int size = kmalloc_size(i);
   ^
>> mm/slab_common.c:813:20: error: implicit declaration of function 
>> 'kmalloc_cache_name' [-Werror,-Wimplicit-function-declaration]
   const char *n = kmalloc_cache_name("dma32-kmalloc", 
size);
   ^
>> mm/slab_common.c:813:16: warning: incompatible integer to pointer conversion 
>> initializing 'const char *' with an expression of type 'int' 
>> [-Wint-conversion]
       const char *n = kmalloc_cache_name("dma32-kmalloc", 
size);
   ^   
~
   1 warning and 2 errors generated.


vim +/kmalloc_size +812 mm/slab_common.c

   761  
   762  /*
   763   * Create the kmalloc array. Some of the regular kmalloc arrays
   764   * may already have been created because they were needed to
   765   * enable allocations for slab creation.
   766   */
   767  void __init create_kmalloc_caches(slab_flags_t flags)
   768  {
   769  int i;
   770  enum kmalloc_cache_type type;
   771  
   772  for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
   773  for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; 
i++) {
   774  if (!kmalloc_caches[type][i])
   775  new_kmalloc_cache(i, type, flags);
   776  
   777  /*
   778   * Caches that are not of the 
two-to-the-power-of size.
   779   * These have to be created immediately after 
the
   780   * earlier power of two caches
   781   */
   782  if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
   783  !kmalloc_caches[type][1])
   784  new_kmalloc_cache(1, type, flags);
   785  if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
   786  !kmalloc_caches[type][2])
   787  new_kmalloc_cache(2, type, flags);
   788  }
   789  }
   790  
   791  /* Kmalloc array is now usable */
   792  slab_state = UP;
   793  
   794  #ifdef CONFIG_ZONE_DMA
   795  for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
   796  struct kmem_cache *s = 
kmalloc_caches[KMALLOC_NORMAL][i];
   797  
   798  if (s) {
   799  kmalloc_caches[KMALLOC_DMA][i] = 
create_kmalloc_cache(
   800  kmalloc_info[i].name[KMALLOC_DMA],
   801  kmalloc_info[i].size,
   802  SLAB_CACHE_DMA | flags, 0,
   803  kmalloc_info[i].size);
   804  }
   805  }
   806  #endif
   807  #ifdef CONFIG_ZONE_DMA32
   808  for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
   809  struct kmem_cache *s = 
kmalloc_caches[KMALLOC_NORMAL][i];
   810  
   811  if (s) {
 > 812  unsigned int size = kmalloc_size(i);
 > 813

Re: [PATCH] mm/slab: kmalloc with GFP_DMA32 allocate from SLAB_CACHE_DMA32

2021-03-12 Thread Christoph Hellwig
On Fri, Mar 12, 2021 at 04:03:20PM +0800, Jianqun Xu wrote:
> The flag GFP_DMA32 only takes effect in kmalloc_large currently.
> 
> This patch will create caches with GFP_DMA32 to support kmalloc with
> sizes under KMALLOC_MAX_CACHE_SIZE.

No.  No new code should use GFP_DMA32, never mind through slab.
Please use the proper DMA APIs for your addressing needs.
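
For illustration, a sketch of what using the proper DMA API looks like
for a device limited to 32-bit addressing (assumed device pointer and
size; dma_alloc_coherent() honours the device's mask, so no DMA32 slab
cache is needed):

#include <linux/dma-mapping.h>

static void *alloc_dev_buf(struct device *dev, size_t size, dma_addr_t *dma)
{
	/* declare the device's addressing limit once */
	if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)))
		return NULL;
	/* the allocation is guaranteed to be reachable by the device */
	return dma_alloc_coherent(dev, size, dma, GFP_KERNEL);
}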


[PATCH] mm/slab: kmalloc with GFP_DMA32 allocate from SLAB_CACHE_DMA32

2021-03-12 Thread Jianqun Xu
The flag GFP_DMA32 only takes effect in kmalloc_large currently.

This patch will create caches with GFP_DMA32 to support kmalloc with
sizes under KMALLOC_MAX_CACHE_SIZE.

Signed-off-by: Jianqun Xu 
---
 include/linux/slab.h |  7 +++
 mm/slab_common.c | 14 ++
 2 files changed, 21 insertions(+)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index be4ba5867ac5..f4317663d148 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -307,6 +307,9 @@ enum kmalloc_cache_type {
KMALLOC_RECLAIM,
 #ifdef CONFIG_ZONE_DMA
KMALLOC_DMA,
+#endif
+#ifdef CONFIG_ZONE_DMA32
+   KMALLOC_DMA32,
 #endif
NR_KMALLOC_TYPES
 };
@@ -331,6 +334,10 @@ static __always_inline enum kmalloc_cache_type 
kmalloc_type(gfp_t flags)
 */
return flags & __GFP_DMA ? KMALLOC_DMA : KMALLOC_RECLAIM;
 #else
+#ifdef CONFIG_ZONE_DMA32
+   if (unlikely(flags & __GFP_DMA32))
+   return KMALLOC_DMA32;
+#endif
return flags & __GFP_RECLAIMABLE ? KMALLOC_RECLAIM : KMALLOC_NORMAL;
 #endif
 }
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e981c80d216c..2a04736fe8f5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -805,6 +805,20 @@ void __init create_kmalloc_caches(slab_flags_t flags)
}
}
 #endif
+#ifdef CONFIG_ZONE_DMA32
+   for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
+   struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
+
+   if (s) {
+   unsigned int size = kmalloc_size(i);
+   const char *n = kmalloc_cache_name("dma32-kmalloc", 
size);
+
+   BUG_ON(!n);
+   kmalloc_caches[KMALLOC_DMA32][i] = create_kmalloc_cache(
+   n, size, SLAB_CACHE_DMA32 | flags, 0, 0);
+   }
+   }
+#endif
 }
 #endif /* !CONFIG_SLOB */
 
-- 
2.25.1





Re: [PATCH] ethernet: ucc_geth: Use kmemdup instead of kmalloc and memcpy

2021-03-09 Thread Rasmus Villemoes
On 05/03/2021 15.27, angkery wrote:
> From: Junlin Yang 
> 
> Fixes coccicheck warnings:
> ./drivers/net/ethernet/freescale/ucc_geth.c:3594:11-18:
> WARNING opportunity for kmemdup
> 
> Signed-off-by: Junlin Yang 
> ---
>  drivers/net/ethernet/freescale/ucc_geth.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/freescale/ucc_geth.c 
> b/drivers/net/ethernet/freescale/ucc_geth.c
> index ef4e2fe..2c079ad 100644
> --- a/drivers/net/ethernet/freescale/ucc_geth.c
> +++ b/drivers/net/ethernet/freescale/ucc_geth.c
> @@ -3591,10 +3591,9 @@ static int ucc_geth_probe(struct platform_device* 
> ofdev)
>   if ((ucc_num < 0) || (ucc_num > 7))
>   return -ENODEV;
>  
> - ug_info = kmalloc(sizeof(*ug_info), GFP_KERNEL);
> + ug_info = kmemdup(&ugeth_primary_info, sizeof(*ug_info), GFP_KERNEL);
>   if (ug_info == NULL)
>   return -ENOMEM;
> - memcpy(ug_info, &ugeth_primary_info, sizeof(*ug_info));
>  
>   ug_info->uf_info.ucc_num = ucc_num;
>  
> 

Ah, yes, of course, I should have used that.

Acked-by: Rasmus Villemoes 


Re: usb: cdc-acm: BUG kmalloc-128 Poison overwritten

2021-03-08 Thread Oliver Neukum
On Monday, 08.03.2021 at 09:50 +0100, Bruno Thomsen wrote:
> 
> Tested-by: Bruno Thomsen 
> 
> I have not observed any oops with patches applied. Patches have seen
> more than 10 weeks of runtime testing across multiple devices.

Hi,

that is good news. I shall send them upstream.

Regards
Oliver




Re: usb: cdc-acm: BUG kmalloc-128 Poison overwritten

2021-03-08 Thread Bruno Thomsen
On Fri, 26 Feb 2021 at 15:14, Bruno Thomsen wrote:
>
> On Thu, 25 Feb 2021 at 10:57, Oliver Neukum wrote:
> >
> > On Wednesday, 24.02.2021 at 16:21 +0100, Bruno Thomsen wrote:
> >
> > Hi,
> >
> > > No, this is not a regression from 5.10. It seems that many attempts to
> > > fix cdc-acm in the 5.x kernel series have failed to fix the root cause of
> > > these oops. I have not seen this on 4.14 and 4.19, but I have observed
> > > it on at least 5.3 and newer kernels in slight variations.
> > > I guess this is because cdc-acm is very common in the embedded
> > > ARM world and rarely used on servers or laptops. Combined with
> > > ARM devices still commonly use 4.x LTS kernels. Not sure if
> > hardening options in the kernel have increased the chance of reproducing
> > > oops.
> >
> > OK, so this is not an additional problem.
> > According to your logs, an URB that should have been killed wasn't.
>
> Thanks for looking into this bug report.
>
> > > I am ready to test new patches and will continue to report oops
> >
> > Could you test the attached patches?
>
> Yes, I am already running tests on the patches.
> I have not seen any oops yet and it seems the USB cdc-acm driver is still
> working as intended.
>
> The only notable trace I have seen is this new error from the cdc-acm driver
> but everything kept on working.
> kernel: cdc_acm 1-1.1:1.7: acm_start_wb - usb_submit_urb(write bulk) failed: 
> -19
>
> Other than that I see this common error (should probably be a warning) during
> device enumeration:
> kernel: cdc_acm 1-1.2:1.0: failed to set dtr/rts
>
> I will post an update next week when the patches have survived some
> more runtime.

Tested-by: Bruno Thomsen 

I have not observed any oops with patches applied. Patches have seen
more than 10 weeks of runtime testing across multiple devices.

/Bruno


[PATCH] ethernet: ucc_geth: Use kmemdup instead of kmalloc and memcpy

2021-03-05 Thread angkery
From: Junlin Yang 

Fixes coccicheck warnings:
./drivers/net/ethernet/freescale/ucc_geth.c:3594:11-18:
WARNING opportunity for kmemdup

Signed-off-by: Junlin Yang 
---
 drivers/net/ethernet/freescale/ucc_geth.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/freescale/ucc_geth.c 
b/drivers/net/ethernet/freescale/ucc_geth.c
index ef4e2fe..2c079ad 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.c
+++ b/drivers/net/ethernet/freescale/ucc_geth.c
@@ -3591,10 +3591,9 @@ static int ucc_geth_probe(struct platform_device* ofdev)
if ((ucc_num < 0) || (ucc_num > 7))
return -ENODEV;
 
-   ug_info = kmalloc(sizeof(*ug_info), GFP_KERNEL);
+   ug_info = kmemdup(&ugeth_primary_info, sizeof(*ug_info), GFP_KERNEL);
if (ug_info == NULL)
return -ENOMEM;
-   memcpy(ug_info, &ugeth_primary_info, sizeof(*ug_info));
 
ug_info->uf_info.ucc_num = ucc_num;
 
-- 
1.9.1
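
For reference, kmemdup() wraps exactly the pair this patch removes; a
sketch of its internals (simplified from mm/util.c, which also tracks
the calling site for slab debugging):

#include <linux/slab.h>
#include <linux/string.h>

static void *kmemdup_sketch(const void *src, size_t len, gfp_t gfp)
{
	void *p = kmalloc(len, gfp);

	if (p)
		memcpy(p, src, len);
	return p;
}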




Re: usb: cdc-acm: BUG kmalloc-128 Poison overwritten

2021-02-26 Thread Bruno Thomsen
On Thu, 25 Feb 2021 at 10:57, Oliver Neukum wrote:
>
> On Wednesday, 24.02.2021 at 16:21 +0100, Bruno Thomsen wrote:
>
> Hi,
>
> > No, this is not a regression from 5.10. It seems that many attempts to
> > fix cdc-acm in the 5.x kernel series have failed to fix the root cause of
> > these oops. I have not seen this on 4.14 and 4.19, but I have observed
> > it on at least 5.3 and newer kernels in slight variations.
> > I guess this is because cdc-acm is very common in the embedded
> > ARM world and rarely used on servers or laptops. Combined with
> > ARM devices still commonly use 4.x LTS kernels. Not sure if
> > hardening options in the kernel have increased the chance of reproducing
> > oops.
>
> OK, so this is not an additional problem.
> According to your logs, an URB that should have been killed wasn't.

Thanks for looking into this bug report.

> > I am ready to test new patches and will continue to report oops
>
> Could you test the attached patches?

Yes, I am already running tests on the patches.
I have not seen any oops yet and it seems the USB cdc-acm driver is still
working as intended.

The only notable trace I have seen is this new error from the cdc-acm driver
but everything kept on working.
kernel: cdc_acm 1-1.1:1.7: acm_start_wb - usb_submit_urb(write bulk) failed: -19

Other than that I see this common error (should probably be a warning) during
device enumeration:
kernel: cdc_acm 1-1.2:1.0: failed to set dtr/rts

I will post an update next week when the patches have survived some
more runtime.

/Bruno


Re: usb: cdc-acm: BUG kmalloc-128 Poison overwritten

2021-02-25 Thread Oliver Neukum
On Wednesday, 24.02.2021 at 16:21 +0100, Bruno Thomsen wrote:

Hi,

> No, this is not a regression from 5.10. It seems that many attempts to
> fix cdc-acm in the 5.x kernel series have failed to fix the root cause of
> these oops. I have not seen this on 4.14 and 4.19, but I have observed
> it on at least 5.3 and newer kernels in slight variations.
> I guess this is because cdc-acm is very common in the embedded
> ARM world and rarely used on servers or laptops. Combined with
> ARM devices still commonly use 4.x LTS kernels. Not sure if
> hardening options in the kernel have increased the chance of reproducing
> oops.

OK, so this is not an additional problem.
According to your logs, an URB that should have been killed wasn't.

> I am ready to test new patches and will continue to report oops

Could you test the attached patches?

Regards
Oliver

From 307097e80657ca44ac99da8efc8397070b1aff3f Mon Sep 17 00:00:00 2001
From: Oliver Neukum 
Date: Thu, 18 Feb 2021 13:42:40 +0100
Subject: [PATCH 1/2] cdc-wdm: untangle a circular dependency between callback
 and softint

We have a cycle of callbacks scheduling works which submit
URBs with those callbacks. This needs to be blocked, stopped
and unblocked to untangle the circle.

Signed-off-by: Oliver Neukum 
---
 drivers/usb/class/cdc-wdm.c | 30 ++
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 508b1c3f8b73..d1e4a7379beb 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -321,12 +321,23 @@ static void wdm_int_callback(struct urb *urb)
 
 }
 
-static void kill_urbs(struct wdm_device *desc)
+static void poison_urbs(struct wdm_device *desc)
 {
 	/* the order here is essential */
-	usb_kill_urb(desc->command);
-	usb_kill_urb(desc->validity);
-	usb_kill_urb(desc->response);
+	usb_poison_urb(desc->command);
+	usb_poison_urb(desc->validity);
+	usb_poison_urb(desc->response);
+}
+
+static void unpoison_urbs(struct wdm_device *desc)
+{
+	/*
+	 *  the order here is not essential
+	 *  it is symmetrical just to be nice
+	 */
+	usb_unpoison_urb(desc->response);
+	usb_unpoison_urb(desc->validity);
+	usb_unpoison_urb(desc->command);
 }
 
 static void free_urbs(struct wdm_device *desc)
@@ -741,11 +752,12 @@ static int wdm_release(struct inode *inode, struct file *file)
 	if (!desc->count) {
 		if (!test_bit(WDM_DISCONNECTING, &desc->flags)) {
 			dev_dbg(&desc->intf->dev, "wdm_release: cleanup\n");
-			kill_urbs(desc);
+			poison_urbs(desc);
 			spin_lock_irq(&desc->iuspin);
 			desc->resp_count = 0;
 			spin_unlock_irq(&desc->iuspin);
 			desc->manage_power(desc->intf, 0);
+			unpoison_urbs(desc);
 		} else {
 			/* must avoid dev_printk here as desc->intf is invalid */
 			pr_debug(KBUILD_MODNAME " %s: device gone - cleaning up\n", __func__);
@@ -1037,9 +1049,9 @@ static void wdm_disconnect(struct usb_interface *intf)
 	wake_up_all(&desc->wait);
 	mutex_lock(&desc->rlock);
 	mutex_lock(&desc->wlock);
+	poison_urbs(desc);
 	cancel_work_sync(&desc->rxwork);
 	cancel_work_sync(&desc->service_outs_intr);
-	kill_urbs(desc);
 	mutex_unlock(&desc->wlock);
 	mutex_unlock(&desc->rlock);
 
@@ -1080,9 +1092,10 @@ static int wdm_suspend(struct usb_interface *intf, pm_message_t message)
 		set_bit(WDM_SUSPENDING, &desc->flags);
 		spin_unlock_irq(&desc->iuspin);
 		/* callback submits work - order is essential */
-		kill_urbs(desc);
+		poison_urbs(desc);
 		cancel_work_sync(&desc->rxwork);
 		cancel_work_sync(&desc->service_outs_intr);
+		unpoison_urbs(desc);
 	}
 	if (!PMSG_IS_AUTO(message)) {
 		mutex_unlock(&desc->wlock);
@@ -1140,7 +1153,7 @@ static int wdm_pre_reset(struct usb_interface *intf)
 	wake_up_all(&desc->wait);
 	mutex_lock(&desc->rlock);
 	mutex_lock(&desc->wlock);
-	kill_urbs(desc);
+	poison_urbs(desc);
 	cancel_work_sync(&desc->rxwork);
 	cancel_work_sync(&desc->service_outs_intr);
 	return 0;
@@ -1151,6 +1164,7 @@ static int wdm_post_reset(struct usb_interface *intf)
 	struct wdm_device *desc = wdm_find_device(intf);
 	int rv;
 
+	unpoison_urbs(desc);
 	clear_bit(WDM_OVERFLOW, &desc->flags);
 	clear_bit(WDM_RESETTING, &desc->flags);
 	rv = recover_from_urb_loss(desc);
-- 
2.26.2
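
The property the patch relies on, as an ordering sketch (illustrative;
see the poison_urbs()/unpoison_urbs() helpers above): usb_kill_urb()
only cancels, so a work item still in flight can resubmit behind it,
while a poisoned URB also rejects new submissions until it is unpoisoned.

	usb_poison_urb(desc->command);	 /* cancel and reject resubmission */
	cancel_work_sync(&desc->rxwork); /* queued work can no longer resubmit */
	/* ... teardown or suspend work ... */
	usb_unpoison_urb(desc->command); /* resume normal operation */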

From 3eeb644af140174ebad6ddce5526bcaf42ccd9c9 Mon Sep 17 00:00:00 2001
From: Oliver Neukum 
Date: Thu, 18 Feb 2021 13:52:28 +0100
Subject: [PATCH 2/2] cdc-acm: untangle a circular dependency between callback
 and softint

We have a cycle of callbacks scheduling works which submit
URBs with those callbacks. This needs to be blocked, stopped
and unblocked to untangle the circle.

Signed-off-by: Oliver Neukum 
---
 drivers/usb/class/cdc-acm.c | 41 +
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 781905745812..235fd1f654a4 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -147,17 +147,29 @@ static inline int acm_set_control(struct acm *acm, int control)
 #define acm_send_break(acm, ms) \
 	acm_ctrl_msg(acm, USB_CDC_REQ_SEND_BREAK, ms, 

Re: usb: cdc-acm: BUG kmalloc-128 Poison overwritten

2021-02-24 Thread Bruno Thomsen
On Mon, 22 Feb 2021 at 10:36, Oliver Neukum wrote:
>
> On Thursday, 18.02.2021 at 16:52 +0100, Bruno Thomsen wrote:
> > On Fri, 12 Feb 2021 at 16:33, Bruno Thomsen wrote:
> > > Hi,
> > >
> > > I have been experience random kernel oops in the cdc-acm driver on
> > > imx7 (arm arch). Normally it happens during the first 1-3min runtime
> > > after power-on. Below oops is from 5.8.17 mainline kernel with an
> > > extra patch back-ported in an attempt to fix it:
> > > 38203b8385 ("usb: cdc-acm: fix cooldown mechanism")
> >
> > I can now boot board with 5.11 kernel without any extra patches and
> > it produce similar issue. Hopefully that make the oops more useful.
> > Issue has been seen on multiple devices, so I don't think it's a bad
> > hardware issue.
>
> is this a regression from 5.10?

Hi Oliver

No, this is not a regression from 5.10. It seems that many attempts to
fix cdc-acm in the 5.x kernel series have failed to fix the root cause of
these oops. I have not seen this on 4.14 and 4.19, but I have observed
it on at least 5.3 and newer kernels in slight variations.
I guess this is because cdc-acm is very common in the embedded
ARM world and rarely used on servers or laptops. Combined with
ARM devices still commonly use 4.x LTS kernels. Not sure if
hardening options in the kernel have increased the chance of reproducing
oops.

I am ready to test new patches and will continue to report oops

/Bruno


Re: usb: cdc-acm: BUG kmalloc-128 Poison overwritten

2021-02-22 Thread Oliver Neukum
On Thursday, 18.02.2021 at 16:52 +0100, Bruno Thomsen wrote:
> On Fri, 12 Feb 2021 at 16:33, Bruno Thomsen wrote:
> > Hi,
> > 
> > I have been experiencing random kernel oops in the cdc-acm driver on
> > imx7 (arm arch). Normally it happens during the first 1-3min runtime
> > after power-on. Below oops is from 5.8.17 mainline kernel with an
> > extra patch back-ported in an attempt to fix it:
> > 38203b8385 ("usb: cdc-acm: fix cooldown mechanism")
> 
> I can now boot board with 5.11 kernel without any extra patches and
> it produce similar issue. Hopefully that make the oops more useful.
> Issue has been seen on multiple devices, so I don't think it's a bad
> hardware issue.

Hi,

is this a regression from 5.10?

Regards
Oliver




[PATCH] media: atomisp: do not free kmalloc memory by vfree

2021-02-19 Thread Jiri Slaby
fw_minibuffer[i].buffer is allocated by kmalloc in sh_css_load_blob_info
and by vmalloc in setup_binary. So use kvfree to decide which of those
allocators to use for freeing.

Also remove the useless cast.

Signed-off-by: Jiri Slaby 
Cc: Mauro Carvalho Chehab 
---
 drivers/staging/media/atomisp/pci/sh_css_firmware.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/media/atomisp/pci/sh_css_firmware.c 
b/drivers/staging/media/atomisp/pci/sh_css_firmware.c
index db25e39bea88..f4ce8ace9d50 100644
--- a/drivers/staging/media/atomisp/pci/sh_css_firmware.c
+++ b/drivers/staging/media/atomisp/pci/sh_css_firmware.c
@@ -366,7 +366,7 @@ void sh_css_unload_firmware(void)
if (fw_minibuffer[i].name)
kfree((void *)fw_minibuffer[i].name);
if (fw_minibuffer[i].buffer)
-   vfree((void *)fw_minibuffer[i].buffer);
+   kvfree(fw_minibuffer[i].buffer);
}
kfree(fw_minibuffer);
fw_minibuffer = NULL;
-- 
2.30.1
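
When a single field can come from either allocator, as here, the
matching allocation-side idiom is kvmalloc(), which pairs with kvfree();
a hedged sketch of its behaviour (it tries the slab first and falls back
to vmalloc() for large or fragmented requests):

	buf = kvmalloc(size, GFP_KERNEL);	/* kmalloc, vmalloc fallback */
	/* ... use buf ... */
	kvfree(buf);				/* frees either kind correctly */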



Re: usb: cdc-acm: BUG kmalloc-128 Poison overwritten

2021-02-18 Thread Bruno Thomsen
On Fri, 12 Feb 2021 at 16:33, Bruno Thomsen wrote:
>
> Hi,
>
> I have been experiencing random kernel oops in the cdc-acm driver on
> imx7 (arm arch). Normally it happens during the first 1-3min runtime
> after power-on. Below oops is from 5.8.17 mainline kernel with an
> extra patch back-ported in an attempt to fix it:
> 38203b8385 ("usb: cdc-acm: fix cooldown mechanism")

I can now boot board with 5.11 kernel without any extra patches and
it produce similar issue. Hopefully that make the oops more useful.
Issue has been seen on multiple devices, so I don't think it's a bad
hardware issue.

[ 76.458010] 8<--- cut here ---
[ 76.461178] Unable to handle kernel paging request at virtual address 6b6b6b93
[ 76.472958] pgd = f805d813
[ 76.475788] [6b6b6b93] *pgd=
[ 76.488068] Internal error: Oops: 5 [#1] PREEMPT SMP ARM
[ 76.493441] Modules linked in: xt_TCPMSS xt_tcpmss xt_hl nf_log_ipv6
nf_log_ipv4 nf_log_common xt_policy xt_limit xt_conntrack xt_tcpudp
xt_pkttype ip6table_mangle iptable_nat nf_nat nf_conntrack
nf_defrag_ipv6 nf_defrag_ipv4 iptable_mangle ip6table_filter
ip6_tables iptable_filter ip_tables des_generic md5 sch_fq_codel
cdc_mbim cdc_wdm cdc_ncm cdc_ether usbnet mii cdc_acm usb_storage
ip_tunnel xfrm_user xfrm6_tunnel tunnel6 xfrm4_tunnel tunnel4 esp6
esp4 ah6 ah4 xfrm_algo xt_LOG xt_LED xt_comment x_tables ipv6
[ 76.539032] CPU: 0 PID: 5 Comm: kworker/0:0 Tainted: G T 5.11.0 #1
[ 76.546539] Hardware name: Freescale i.MX7 Dual (Device Tree)
[ 76.552295] Workqueue: events acm_softint [cdc_acm]
[ 76.557223] PC is at usb_kill_urb+0x8/0x24
[ 76.561337] LR is at acm_softint+0x4c/0x10c [cdc_acm]
[ 76.566415] pc : [<805911a8>] lr : [<7f1168c4>] psr: 200e0113
[ 76.572689] sp : 84113f08 ip : 8575de7c fp : 840e92bc
[ 76.577920] r10:  r9 : 893222a8 r8 : 89322008
[ 76.583151] r7 : 89322000 r6 : 89322438 r5 : 89322448 r4 : 000a
[ 76.589686] r3 : 6b6b6b6b r2 : 12d79029 r1 : 800e0113 r0 : 6b6b6b6b
[ 76.596223] Flags: nzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none
[ 76.603369] Control: 10c5387d Table: 8933406a DAC: 0051
[ 76.609120] Process kworker/0:0 (pid: 5, stack limit = 0x8fb8cf7e)
[ 76.615315] Stack: (0x84113f08 to 0x84114000)
[ 76.619685] 3f00: 89322448 840e9280 bf6caf40 bf6ce100  
[ 76.627875] 3f20:  8013f14c 84112000 bf6caf40 bf6caf58
840e9280 bf6caf40 840e9294
[ 76.636065] 3f40: bf6caf58 80c03d00 0008 84112000 bf6caf40
8013f3e8  80d0bbb0
[ 76.644255] 3f60:  840a7640 840a77c0 84112000 840ede7c
8013f3a4 840e9280 840a7664
[ 76.652445] 3f80:  801467d8  840a77c0 80146694
  
[ 76.660634] 3fa0:    80100150 
  
[ 76.668823] 3fc0:     
  
[ 76.677012] 3fe0:     0013
  
[ 76.685203] [<805911a8>] (usb_kill_urb) from [<7f1168c4>]
(acm_softint+0x4c/0x10c [cdc_acm])
[ 76.693690] [<7f1168c4>] (acm_softint [cdc_acm]) from [<8013f14c>]
(process_one_work+0x1bc/0x414)
[ 76.702605] [<8013f14c>] (process_one_work) from [<8013f3e8>]
(worker_thread+0x44/0x4dc)
[ 76.710719] [<8013f3e8>] (worker_thread) from [<801467d8>]
(kthread+0x144/0x180)
[ 76.718139] [<801467d8>] (kthread) from [<80100150>] (ret_from_fork+0x14/0x24)
[ 76.725380] Exception stack(0x84113fb0 to 0x84113ff8)
[ 76.730443] 3fa0:    
[ 76.738632] 3fc0:     
  
[ 76.746819] 3fe0:     0013 
[ 76.753448] Code: eae0 eb081505 e2503000 012fff1e (e5932028)
[ 76.761647] ---[ end trace 05b398f82b2a04b9 ]---


[PATCH 3/3] misc: lis3lv02d: Do not log an error when kmalloc fails

2021-02-17 Thread Hans de Goede
Logging an error when kmalloc fails is not necessary (and in general
should be avoided) because the failed allocation will already complain
loudly by itself.
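
The preferred pattern, sketched generically (not this driver's code):

	buf = kzalloc(size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;	/* no printk needed: the allocator already
				 * warns on failure unless __GFP_NOWARN
				 * was passed */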

Signed-off-by: Hans de Goede 
---
 drivers/misc/lis3lv02d/lis3lv02d.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/misc/lis3lv02d/lis3lv02d.c 
b/drivers/misc/lis3lv02d/lis3lv02d.c
index 22dacfaad02f..70c5bb1e6f49 100644
--- a/drivers/misc/lis3lv02d/lis3lv02d.c
+++ b/drivers/misc/lis3lv02d/lis3lv02d.c
@@ -1179,10 +1179,8 @@ int lis3lv02d_init_device(struct lis3lv02d *lis3)
lis3->reg_cache = kzalloc(max(sizeof(lis3_wai8_regs),
 sizeof(lis3_wai12_regs)), GFP_KERNEL);
 
-   if (lis3->reg_cache == NULL) {
-   printk(KERN_ERR DRIVER_NAME "out of memory\n");
+   if (lis3->reg_cache == NULL)
return -ENOMEM;
-   }
 
	mutex_init(&lis3->mutex);
	atomic_set(&lis3->wake_thread, 0);
-- 
2.30.1



Re: [PATCH] staging:wlan-ng: use memdup_user instead of kmalloc/copy_from_user

2021-02-15 Thread Dan Carpenter
On Mon, Feb 15, 2021 at 09:44:24AM +0100, Michal Hocko wrote:
> On Sat 13-02-21 15:05:28, Ivan Safonov wrote:
> > memdup_user() is a shorter and safer equivalent
> > of the kmalloc()/copy_from_user() pair.
> > 
> > Signed-off-by: Ivan Safonov 
> > ---
> >  drivers/staging/wlan-ng/p80211netdev.c | 28 --
> >  1 file changed, 13 insertions(+), 15 deletions(-)
> > 
> > diff --git a/drivers/staging/wlan-ng/p80211netdev.c 
> > b/drivers/staging/wlan-ng/p80211netdev.c
> > index a15abb2c8f54..6f9666dc0277 100644
> > --- a/drivers/staging/wlan-ng/p80211netdev.c
> > +++ b/drivers/staging/wlan-ng/p80211netdev.c
> > @@ -569,24 +569,22 @@ static int p80211knetdev_do_ioctl(struct net_device 
> > *dev,
> >     goto bail;
> > }
> >  
> > -   /* Allocate a buf of size req->len */
> > -   msgbuf = kmalloc(req->len, GFP_KERNEL);
> > -   if (msgbuf) {
> > -   if (copy_from_user(msgbuf, (void __user *)req->data, req->len))
> > -   result = -EFAULT;
> > -   else
> > -   result = p80211req_dorequest(wlandev, msgbuf);
> > +   msgbuf = memdup_user(req->data, req->len);
> 
> The move to memdup_user() is definitely a step in the right direction.
> What is the range of req->len, though? If it can be larger than
> PAGE_SIZE, then vmemdup_user() would be a better alternative.

req->len shouldn't be anywhere close to PAGE_SIZE, but it's actually
important to check req->len, and this code does not do that, which leads
to memory corruption:

drivers/staging/wlan-ng/p80211netdev.c
   566  goto bail;
   567  } else if (cmd != P80211_IFREQ) {
   568  result = -EINVAL;
   569  goto bail;
   570  }
   571  
   572  msgbuf = memdup_user(req->data, req->len);
   573  if (IS_ERR(msgbuf)) {
   574  result = PTR_ERR(msgbuf);
   575  goto bail;
   576  }
   577  
   578  result = p80211req_dorequest(wlandev, msgbuf);

We don't know that "req->len" is >= sizeof(*msgbuf), and then we pass
msgbuf to p80211req_dorequest(), which calls p80211req_handlemsg().  In
p80211req_handlemsg(), "req->len" then has to be larger than
sizeof(struct p80211msg_lnxreq_hostwep).

   579  
   580  if (result == 0) {
   581  if (copy_to_user
   582  ((void __user *)req->data, msgbuf, req->len)) {
   583  result = -EFAULT;
   584  }
   585  }
   586  kfree(msgbuf);
   587  
   588  bail:
   589  /* If allocate,copyfrom or copyto fails, return errno */
   590  return result;
   591  }
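
A minimal sketch of the missing validation (the upper bound is a
hypothetical constant, not something from the driver):

	if (req->len < sizeof(struct p80211msg))
		return -EINVAL;	/* too short for the message header */
	if (req->len > WLAN_NG_MAX_MSGLEN)	/* hypothetical cap */
		return -EINVAL;

	msgbuf = memdup_user(req->data, req->len);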

Smatch has a problem parsing this code because struct ifreq *ifr is a
union and Smatch gets confused.  :/

regards,
dan carpenter


Re: [PATCH] staging:wlan-ng: use memdup_user instead of kmalloc/copy_from_user

2021-02-15 Thread Michal Hocko
On Sat 13-02-21 15:05:28, Ivan Safonov wrote:
> memdup_user() is a shorter and safer equivalent
> of the kmalloc()/copy_from_user() pair.
> 
> Signed-off-by: Ivan Safonov 
> ---
>  drivers/staging/wlan-ng/p80211netdev.c | 28 --
>  1 file changed, 13 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/staging/wlan-ng/p80211netdev.c 
> b/drivers/staging/wlan-ng/p80211netdev.c
> index a15abb2c8f54..6f9666dc0277 100644
> --- a/drivers/staging/wlan-ng/p80211netdev.c
> +++ b/drivers/staging/wlan-ng/p80211netdev.c
> @@ -569,24 +569,22 @@ static int p80211knetdev_do_ioctl(struct net_device 
> *dev,
>   goto bail;
>   }
>  
> -     /* Allocate a buf of size req->len */
> - msgbuf = kmalloc(req->len, GFP_KERNEL);
> - if (msgbuf) {
> - if (copy_from_user(msgbuf, (void __user *)req->data, req->len))
> - result = -EFAULT;
> - else
> - result = p80211req_dorequest(wlandev, msgbuf);
> + msgbuf = memdup_user(req->data, req->len);

The move to memdup_user() is definitely a step in the right direction.
What is the range of req->len, though? If it can be larger than
PAGE_SIZE, then vmemdup_user() would be a better alternative.
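
Roughly what the helper does, as a simplified sketch (the real code
lives in mm/util.c):

	void *memdup_user(const void __user *src, size_t len)
	{
		void *p = kmalloc(len, GFP_USER | __GFP_NOWARN);

		if (!p)
			return ERR_PTR(-ENOMEM);
		if (copy_from_user(p, src, len)) {
			kfree(p);
			return ERR_PTR(-EFAULT);
		}
		return p;
	}

vmemdup_user() is the same idea but kvmalloc()-backed, so it copes with
sizes beyond what kmalloc() comfortably serves; it must be paired with
kvfree() instead of kfree().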

-- 
Michal Hocko
SUSE Labs


[PATCH] staging:wlan-ng: use memdup_user instead of kmalloc/copy_from_user

2021-02-13 Thread Ivan Safonov
memdup_user() is a shorter and safer equivalent
of the kmalloc()/copy_from_user() pair.

Signed-off-by: Ivan Safonov 
---
 drivers/staging/wlan-ng/p80211netdev.c | 28 --
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/drivers/staging/wlan-ng/p80211netdev.c 
b/drivers/staging/wlan-ng/p80211netdev.c
index a15abb2c8f54..6f9666dc0277 100644
--- a/drivers/staging/wlan-ng/p80211netdev.c
+++ b/drivers/staging/wlan-ng/p80211netdev.c
@@ -569,24 +569,22 @@ static int p80211knetdev_do_ioctl(struct net_device *dev,
goto bail;
}
 
-   /* Allocate a buf of size req->len */
-   msgbuf = kmalloc(req->len, GFP_KERNEL);
-   if (msgbuf) {
-   if (copy_from_user(msgbuf, (void __user *)req->data, req->len))
-   result = -EFAULT;
-   else
-   result = p80211req_dorequest(wlandev, msgbuf);
+   msgbuf = memdup_user(req->data, req->len);
+   if (IS_ERR(msgbuf)) {
+   result = PTR_ERR(msgbuf);
+   goto bail;
+   }
 
-   if (result == 0) {
-   if (copy_to_user
-   ((void __user *)req->data, msgbuf, req->len)) {
-   result = -EFAULT;
-   }
+   result = p80211req_dorequest(wlandev, msgbuf);
+
+   if (result == 0) {
+   if (copy_to_user
+   ((void __user *)req->data, msgbuf, req->len)) {
+   result = -EFAULT;
}
-   kfree(msgbuf);
-   } else {
-   result = -ENOMEM;
}
+   kfree(msgbuf);
+
 bail:
/* If allocate,copyfrom or copyto fails, return errno */
return result;
-- 
2.26.2



usb: cdc-acm: BUG kmalloc-128 Poison overwritten

2021-02-12 Thread Bruno Thomsen
Hi,

I have been experiencing random kernel oopses in the cdc-acm driver on
imx7 (arm arch). Normally it happens during the first 1-3 minutes of
runtime after power-on. The oops below is from a 5.8.17 mainline kernel
with an extra patch back-ported in an attempt to fix it:
38203b8385 ("usb: cdc-acm: fix cooldown mechanism")

Output from kconfig_hardened_check in version 2020-10-30-g2f8e7a4dc57a
has been included below the oops, as it might be related to the hardened
kernel.

Currently I cannot update to the latest mainline kernel, as our board
isn't able to boot due to SPI errors on ecspi4 in versions 5.9-5.11rc6.
I am trying to bisect that issue, but I can still apply test patches if
anyone has an idea as to why this is happening.

[   55.065305] 8<--- cut here ---
[   55.068392] Unable to handle kernel paging request at virtual
address 6b6b6c03
[   55.075624] pgd = be866494
[   55.078335] [6b6b6c03] *pgd=
[   55.081924] Internal error: Oops: 5 [#1] PREEMPT SMP ARM
[   55.087238] Modules linked in: ppp_async crc_ccitt ppp_generic slhc
xt_TCPMSS xt_tcpmss xt_hl nf_log_ipv6 nf_log_ipv4 nf_log_common
xt_policy xt_limit xt_conntrack xt_tcpudp xt_pkttype ip6table_mangle
iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4
iptable_mangle ip6table_filter ip6_tables iptable_filter ip_tables
des_generic md5 sch_fq_codel cdc_mbim cdc_wdm cdc_ncm usbnet mii
cdc_acm usb_storage ip_tunnel xfrm_user xfrm6_tunnel tunnel6
xfrm4_tunnel tunnel4 esp6 esp4 ah6 ah4 xfrm_algo xt_LOG xt_LED
xt_comment x_tables ipv6
[   55.134954] CPU: 0 PID: 82 Comm: kworker/0:2 Tainted: G
   T 5.8.17 #1
[   55.142526] Hardware name: Freescale i.MX7 Dual (Device Tree)
[   55.148304] Workqueue: events acm_softint [cdc_acm]
[   55.153196] PC is at kobject_get+0x10/0xa4
[   55.157302] LR is at usb_get_dev+0x14/0x1c
[   55.161402] pc : [<8047c06c>]lr : [<80560448>]psr: 2193
[   55.167671] sp : bca39ea8  ip : 7374  fp : bf6cbd80
[   55.172899] r10:   r9 : bdd92284  r8 : bdd92008
[   55.178128] r7 : 6b6b6b6b  r6 : fffe  r5 : 6113  r4 : 6b6b6be3
[   55.184658] r3 : 6b6b6b6b  r2 : 0111  r1 :   r0 : 6b6b6be3
[   55.191191] Flags: nzCv  IRQs off  FIQs on  Mode SVC_32  ISA ARM
Segment none
[   55.198417] Control: 10c5387d  Table: bcf0c06a  DAC: 0051
[   55.204168] Process kworker/0:2 (pid: 82, stack limit = 0x9bdd2a89)
[   55.210439] Stack: (0xbca39ea8 to 0xbca3a000)
[   55.214805] 9ea0:   bf6cbd80 80769a50 6b6b6b6b
80560448 bdeb0500 8056bfe8
[   55.222991] 9ec0: 0002 b76da000  bdeb0500 bdd92448
bca38000 bdeb0510 8056d69c
[   55.231177] 9ee0: bca38000  80c050fc  bca39f44
09d42015  0001
[   55.239363] 9f00: bdd92448 bdd92438 bdd92000 7f1158c4 bdd92448
bca2ee00 bf6cbd80 bf6cef00
[   55.247549] 9f20:    801412d8 bf6cbd98
80c03d00 bca2ee00 bf6cbd80
[   55.255735] 9f40: bca2ee14 bf6cbd98 80c03d00 0008 bca38000
80141568  80c446ae
[   55.263921] 9f60:  bc9ed880 bc9f0700 bca38000 bc117eb4
80141524 bca2ee00 bc9ed8a4
[   55.272107] 9f80:  80147cc8  bc9f0700 80147b84
  
[   55.280292] 9fa0:    80100148 
  
[   55.288477] 9fc0:     
  
[   55.296662] 9fe0:     0013
  
[   55.304860] [<8047c06c>] (kobject_get) from [<80560448>]
(usb_get_dev+0x14/0x1c)
[   55.312271] [<80560448>] (usb_get_dev) from [<8056bfe8>]
(usb_hcd_unlink_urb+0x50/0xd8)
[   55.320286] [<8056bfe8>] (usb_hcd_unlink_urb) from [<8056d69c>]
(usb_kill_urb.part.0+0x44/0xd0)
[   55.329004] [<8056d69c>] (usb_kill_urb.part.0) from [<7f1158c4>]
(acm_softint+0x4c/0x10c [cdc_acm])
[   55.338082] [<7f1158c4>] (acm_softint [cdc_acm]) from [<801412d8>]
(process_one_work+0x19c/0x3e8)
[   55.346969] [<801412d8>] (process_one_work) from [<80141568>]
(worker_thread+0x44/0x4dc)
[   55.355072] [<80141568>] (worker_thread) from [<80147cc8>]
(kthread+0x144/0x180)
[   55.362481] [<80147cc8>] (kthread) from [<80100148>]
(ret_from_fork+0x14/0x2c)
[   55.369706] Exception stack(0xbca39fb0 to 0xbca39ff8)
[   55.374764] 9fa0: 
  
[   55.382949] 9fc0:     
  
[   55.391133] 9fe0:     0013 
[   55.397757] Code: e92d4010 e2504000 e24dd008 0a0e (e5d43020)
[   55.403857] ---[ end trace 1ec2a82c3635d550 ]---
[   55.408479] note: kworker/0:2[82] exited with preempt_count 1
[   60.237377] 
=
[   60.245593] BUG kmalloc-128 (Tainted: G  D T): Poison overwritten
[   60.252737] 
---
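
The 0x6b bytes are the key clue here: 0x6b is POISON_FREE, the pattern
SLUB writes into freed objects, so faulting addresses such as
0x6b6b6b93 or 0x6b6b6c03 mean a pointer was loaded from already-freed,
poisoned slab memory and then dereferenced -- a use-after-free,
consistent with the "Poison overwritten" report above. For reference:

	/* include/linux/poison.h */
	#define POISON_INUSE	0x5a	/* for use-uninitialized poisoning */
	#define POISON_FREE	0x6b	/* for use-after-free poisoning */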

[PATCH v2 03/12] kasan: optimize large kmalloc poisoning

2021-02-05 Thread Andrey Konovalov
Similarly to kasan_kmalloc(), kasan_kmalloc_large() doesn't need
to unpoison the object, as it is already unpoisoned by alloc_pages()
(or by ksize() for krealloc()).

This patch changes kasan_kmalloc_large() to only poison the redzone.
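
The redzone arithmetic, as a standalone sketch (values illustrative):

	/* kmalloc_large object: `size` bytes at page-aligned `ptr` */
	redzone_start = round_up((unsigned long)(ptr + size),
				 KASAN_GRANULE_SIZE);
	redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr));

	/* e.g. size = 100, PAGE_SIZE = 4096, granule = 8:
	 * redzone_start = ptr + 104, redzone_end = ptr + 4096,
	 * so only the redzone is poisoned and nothing is unpoisoned. */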

Reviewed-by: Marco Elver 
Signed-off-by: Andrey Konovalov 
---
 mm/kasan/common.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 00edbc3eb32e..f2a6bae13053 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -494,7 +494,6 @@ EXPORT_SYMBOL(__kasan_kmalloc);
 void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
gfp_t flags)
 {
-   struct page *page;
unsigned long redzone_start;
unsigned long redzone_end;
 
@@ -504,12 +503,23 @@ void * __must_check __kasan_kmalloc_large(const void 
*ptr, size_t size,
if (unlikely(ptr == NULL))
return NULL;
 
-   page = virt_to_page(ptr);
+   /*
+* The object has already been unpoisoned by kasan_alloc_pages() for
+* alloc_pages() or by ksize() for krealloc().
+*/
+
+   /*
+* The redzone has byte-level precision for the generic mode.
+* Partially poison the last object granule to cover the unaligned
+* part of the redzone.
+*/
+   if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+   kasan_poison_last_granule(ptr, size);
+
+   /* Poison the aligned part of the redzone. */
redzone_start = round_up((unsigned long)(ptr + size),
KASAN_GRANULE_SIZE);
-   redzone_end = (unsigned long)ptr + page_size(page);
-
-   kasan_unpoison(ptr, size);
+   redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr));
kasan_poison((void *)redzone_start, redzone_end - redzone_start,
 KASAN_PAGE_REDZONE);
 
-- 
2.30.0.365.g02bc693789-goog



[PATCH v2 02/12] kasan, mm: optimize kmalloc poisoning

2021-02-05 Thread Andrey Konovalov
For allocations from kmalloc caches, kasan_kmalloc() always follows
kasan_slab_alloc(). Currently, both of them unpoison the whole object,
which is unnecessary.

This patch provides separate implementations for both annotations:
kasan_slab_alloc() unpoisons the whole object, and kasan_kmalloc()
only poisons the redzone.

For generic KASAN, the redzone start might not be aligned to
KASAN_GRANULE_SIZE. Therefore, the poisoning is split into two parts:
kasan_poison_last_granule() poisons the unaligned part, and then
kasan_poison() poisons the rest.

This patch also clarifies alignment guarantees of each of the poisoning
functions and drops the unnecessary round_up() call for redzone_end.

With this change, the early SLUB cache annotation needs to be changed to
kasan_slab_alloc(), as kasan_kmalloc() doesn't unpoison objects now.
The number of poisoned bytes for objects in this cache stays the same, as
kmem_cache_node->object_size is equal to sizeof(struct kmem_cache_node).
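
The partial-granule case, as a worked sketch (generic KASAN, granule =
8 bytes):

	/* e.g. a kmalloc(20) object: two full granules + 4 valid bytes */
	if (size & KASAN_GRANULE_MASK) {
		u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size);
		*shadow = size & KASAN_GRANULE_MASK;	/* 20 & 7 == 4 */
	}
	/* the rest of the redzone starts granule-aligned and is poisoned
	 * by kasan_poison(), which requires an aligned start and size */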

Reviewed-by: Marco Elver 
Signed-off-by: Andrey Konovalov 
---
 mm/kasan/common.c | 93 +++
 mm/kasan/kasan.h  | 43 +-
 mm/kasan/shadow.c | 28 +++---
 mm/slub.c |  3 +-
 4 files changed, 119 insertions(+), 48 deletions(-)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index bfdf5464f4ef..00edbc3eb32e 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -278,21 +278,11 @@ void __kasan_poison_object_data(struct kmem_cache *cache, 
void *object)
  *based on objects indexes, so that objects that are next to each other
  *get different tags.
  */
-static u8 assign_tag(struct kmem_cache *cache, const void *object,
-   bool init, bool keep_tag)
+static u8 assign_tag(struct kmem_cache *cache, const void *object, bool init)
 {
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
return 0xff;
 
-   /*
-* 1. When an object is kmalloc()'ed, two hooks are called:
-*kasan_slab_alloc() and kasan_kmalloc(). We assign the
-*tag only in the first one.
-* 2. We reuse the same tag for krealloc'ed objects.
-*/
-   if (keep_tag)
-   return get_tag(object);
-
/*
 * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
 * set, assign a tag when the object is being allocated (init == false).
@@ -325,7 +315,7 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache 
*cache,
}
 
/* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */
-   object = set_tag(object, assign_tag(cache, object, true, false));
+   object = set_tag(object, assign_tag(cache, object, true));
 
return (void *)object;
 }
@@ -413,12 +403,46 @@ static void set_alloc_info(struct kmem_cache *cache, void 
*object,
	kasan_set_track(&alloc_meta->alloc_track, flags);
 }
 
+void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
+   void *object, gfp_t flags)
+{
+   u8 tag;
+   void *tagged_object;
+
+   if (gfpflags_allow_blocking(flags))
+   kasan_quarantine_reduce();
+
+   if (unlikely(object == NULL))
+   return NULL;
+
+   if (is_kfence_address(object))
+   return (void *)object;
+
+   /*
+* Generate and assign random tag for tag-based modes.
+* Tag is ignored in set_tag() for the generic mode.
+*/
+   tag = assign_tag(cache, object, false);
+   tagged_object = set_tag(object, tag);
+
+   /*
+* Unpoison the whole object.
+* For kmalloc() allocations, kasan_kmalloc() will do precise poisoning.
+*/
+   kasan_unpoison(tagged_object, cache->object_size);
+
+   /* Save alloc info (if possible) for non-kmalloc() allocations. */
+   if (kasan_stack_collection_enabled())
+   set_alloc_info(cache, (void *)object, flags, false);
+
+   return tagged_object;
+}
+
 static void *kasan_kmalloc(struct kmem_cache *cache, const void *object,
-   size_t size, gfp_t flags, bool is_kmalloc)
+   size_t size, gfp_t flags)
 {
unsigned long redzone_start;
unsigned long redzone_end;
-   u8 tag;
 
if (gfpflags_allow_blocking(flags))
kasan_quarantine_reduce();
@@ -429,33 +453,41 @@ static void *kasan_kmalloc(struct kmem_cache *cache, 
const void *object,
if (is_kfence_address(kasan_reset_tag(object)))
return (void *)object;
 
+   /*
+* The object has already been unpoisoned by kasan_slab_alloc() for
+    * kmalloc() or by ksize() for krealloc().
+*/
+
+   /*
+* The redzone has byte-level precision for the generic mode.
+* Partially poison the last object granule to cover the unaligned
+* part of the redzone.
+*/
+   if (IS_ENABLED(CONFIG_KASA

[PATCH v3 mm 03/13] kasan: optimize large kmalloc poisoning

2021-02-05 Thread Andrey Konovalov
Similarly to kasan_kmalloc(), kasan_kmalloc_large() doesn't need
to unpoison the object, as it is already unpoisoned by alloc_pages()
(or by ksize() for krealloc()).

This patch changes kasan_kmalloc_large() to only poison the redzone.

Reviewed-by: Marco Elver 
Signed-off-by: Andrey Konovalov 
---
 mm/kasan/common.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 00edbc3eb32e..f2a6bae13053 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -494,7 +494,6 @@ EXPORT_SYMBOL(__kasan_kmalloc);
 void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
gfp_t flags)
 {
-   struct page *page;
unsigned long redzone_start;
unsigned long redzone_end;
 
@@ -504,12 +503,23 @@ void * __must_check __kasan_kmalloc_large(const void 
*ptr, size_t size,
if (unlikely(ptr == NULL))
return NULL;
 
-   page = virt_to_page(ptr);
+   /*
+* The object has already been unpoisoned by kasan_alloc_pages() for
+* alloc_pages() or by ksize() for krealloc().
+*/
+
+   /*
+* The redzone has byte-level precision for the generic mode.
+* Partially poison the last object granule to cover the unaligned
+* part of the redzone.
+*/
+   if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+   kasan_poison_last_granule(ptr, size);
+
+   /* Poison the aligned part of the redzone. */
redzone_start = round_up((unsigned long)(ptr + size),
KASAN_GRANULE_SIZE);
-   redzone_end = (unsigned long)ptr + page_size(page);
-
-   kasan_unpoison(ptr, size);
+   redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr));
kasan_poison((void *)redzone_start, redzone_end - redzone_start,
 KASAN_PAGE_REDZONE);
 
-- 
2.30.0.365.g02bc693789-goog



[PATCH v3 mm 02/13] kasan, mm: optimize kmalloc poisoning

2021-02-05 Thread Andrey Konovalov
For allocations from kmalloc caches, kasan_kmalloc() always follows
kasan_slab_alloc(). Currently, both of them unpoison the whole object,
which is unnecessary.

This patch provides separate implementations for both annotations:
kasan_slab_alloc() unpoisons the whole object, and kasan_kmalloc()
only poisons the redzone.

For generic KASAN, the redzone start might not be aligned to
KASAN_GRANULE_SIZE. Therefore, the poisoning is split in two parts:
kasan_poison_last_granule() poisons the unaligned part, and then
kasan_poison() poisons the rest.

This patch also clarifies alignment guarantees of each of the poisoning
functions and drops the unnecessary round_up() call for redzone_end.

With this change, the early SLUB cache annotation needs to be changed to
kasan_slab_alloc(), as kasan_kmalloc() doesn't unpoison objects now.
The number of poisoned bytes for objects in this cache stays the same, as
kmem_cache_node->object_size is equal to sizeof(struct kmem_cache_node).

Reviewed-by: Marco Elver 
Signed-off-by: Andrey Konovalov 
---
 mm/kasan/common.c | 93 +++
 mm/kasan/kasan.h  | 43 +-
 mm/kasan/shadow.c | 28 +++---
 mm/slub.c |  3 +-
 4 files changed, 119 insertions(+), 48 deletions(-)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index bfdf5464f4ef..00edbc3eb32e 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -278,21 +278,11 @@ void __kasan_poison_object_data(struct kmem_cache *cache, 
void *object)
  *based on objects indexes, so that objects that are next to each other
  *get different tags.
  */
-static u8 assign_tag(struct kmem_cache *cache, const void *object,
-   bool init, bool keep_tag)
+static u8 assign_tag(struct kmem_cache *cache, const void *object, bool init)
 {
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
return 0xff;
 
-   /*
-* 1. When an object is kmalloc()'ed, two hooks are called:
-*kasan_slab_alloc() and kasan_kmalloc(). We assign the
-*tag only in the first one.
-* 2. We reuse the same tag for krealloc'ed objects.
-*/
-   if (keep_tag)
-   return get_tag(object);
-
/*
 * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
 * set, assign a tag when the object is being allocated (init == false).
@@ -325,7 +315,7 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache 
*cache,
}
 
/* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */
-   object = set_tag(object, assign_tag(cache, object, true, false));
+   object = set_tag(object, assign_tag(cache, object, true));
 
return (void *)object;
 }
@@ -413,12 +403,46 @@ static void set_alloc_info(struct kmem_cache *cache, void 
*object,
	kasan_set_track(&alloc_meta->alloc_track, flags);
 }
 
+void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
+   void *object, gfp_t flags)
+{
+   u8 tag;
+   void *tagged_object;
+
+   if (gfpflags_allow_blocking(flags))
+   kasan_quarantine_reduce();
+
+   if (unlikely(object == NULL))
+   return NULL;
+
+   if (is_kfence_address(object))
+   return (void *)object;
+
+   /*
+* Generate and assign random tag for tag-based modes.
+* Tag is ignored in set_tag() for the generic mode.
+*/
+   tag = assign_tag(cache, object, false);
+   tagged_object = set_tag(object, tag);
+
+   /*
+* Unpoison the whole object.
+* For kmalloc() allocations, kasan_kmalloc() will do precise poisoning.
+*/
+   kasan_unpoison(tagged_object, cache->object_size);
+
+   /* Save alloc info (if possible) for non-kmalloc() allocations. */
+   if (kasan_stack_collection_enabled())
+   set_alloc_info(cache, (void *)object, flags, false);
+
+   return tagged_object;
+}
+
 static void *kasan_kmalloc(struct kmem_cache *cache, const void *object,
-   size_t size, gfp_t flags, bool is_kmalloc)
+   size_t size, gfp_t flags)
 {
unsigned long redzone_start;
unsigned long redzone_end;
-   u8 tag;
 
if (gfpflags_allow_blocking(flags))
kasan_quarantine_reduce();
@@ -429,33 +453,41 @@ static void *kasan_kmalloc(struct kmem_cache *cache, 
const void *object,
if (is_kfence_address(kasan_reset_tag(object)))
return (void *)object;
 
+   /*
+* The object has already been unpoisoned by kasan_slab_alloc() for
+    * kmalloc() or by ksize() for krealloc().
+*/
+
+   /*
+* The redzone has byte-level precision for the generic mode.
+* Partially poison the last object granule to cover the unaligned
+* part of the redzone.
+*/
+   if (IS_ENABLED(CONFIG_KASA

Re: [PATCH] net/qrtr: replaced useless kzalloc with kmalloc in qrtr_tun_write_iter()

2021-02-04 Thread Jakub Kicinski
On Thu, 4 Feb 2021 10:51:59 -0800 Jakub Kicinski wrote:
> On Thu,  4 Feb 2021 15:02:30 +0600 Sabyrzhan Tasbolatov wrote:
> > Replaced kzalloc() with kmalloc(); there is no need for zeroed-out
> > memory for a simple void *kbuf.
> 
> There is no need for zeroed-out memory because it's immediately
> overwritten by copy_from_iter...

Also, if you don't mind, please wait a week until the fixes get merged
back into net-next and then repost; otherwise this patch will not apply
cleanly. (Fixes are merged into a different tree than cleanups.)

> > Signed-off-by: Sabyrzhan Tasbolatov 
> > ---
> >  net/qrtr/tun.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c
> > index b238c40a9984..9b607c7614de 100644
> > --- a/net/qrtr/tun.c
> > +++ b/net/qrtr/tun.c
> > @@ -86,7 +86,7 @@ static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, 
> > struct iov_iter *from)
> > if (len > KMALLOC_MAX_SIZE)
> > return -ENOMEM;
> >  
> > -   kbuf = kzalloc(len, GFP_KERNEL);
> > +   kbuf = kmalloc(len, GFP_KERNEL);
> > if (!kbuf)
> > return -ENOMEM;
> >
> 



Re: [PATCH] net/qrtr: replaced useless kzalloc with kmalloc in qrtr_tun_write_iter()

2021-02-04 Thread Jakub Kicinski
On Thu,  4 Feb 2021 15:02:30 +0600 Sabyrzhan Tasbolatov wrote:
> Replaced kzalloc() with kmalloc(); there is no need for zeroed-out
> memory for a simple void *kbuf.

There is no need for zeroed-out memory because it's immediately
overwritten by copy_from_iter...
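
The write path, sketched (simplified from qrtr_tun_write_iter(), error
handling condensed):

	kbuf = kmalloc(len, GFP_KERNEL);	/* zeroing would be wasted */
	if (!kbuf)
		return -ENOMEM;
	if (!copy_from_iter_full(kbuf, len, from)) {	/* fills all len bytes */
		kfree(kbuf);
		return -EFAULT;
	}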

> >For potential, separate clean up - this is followed 
> >by copy_from_iter_full(len) kzalloc() can probably 
> >be replaced by kmalloc()?
> >  
> >>if (!kbuf)
> >>return -ENOMEM;  
> 
> Signed-off-by: Sabyrzhan Tasbolatov 
> ---
>  net/qrtr/tun.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c
> index b238c40a9984..9b607c7614de 100644
> --- a/net/qrtr/tun.c
> +++ b/net/qrtr/tun.c
> @@ -86,7 +86,7 @@ static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, 
> struct iov_iter *from)
>   if (len > KMALLOC_MAX_SIZE)
>   return -ENOMEM;
>  
> - kbuf = kzalloc(len, GFP_KERNEL);
> + kbuf = kmalloc(len, GFP_KERNEL);
>   if (!kbuf)
>   return -ENOMEM;
>  



[PATCH] net/qrtr: replaced useless kzalloc with kmalloc in qrtr_tun_write_iter()

2021-02-04 Thread Sabyrzhan Tasbolatov
Replaced kzalloc() with kmalloc(); there is no need for zeroed-out
memory for a simple void *kbuf.

>For potential, separate clean up - this is followed 
>by copy_from_iter_full(len) kzalloc() can probably 
>be replaced by kmalloc()?
>
>>  if (!kbuf)
>>  return -ENOMEM;

Signed-off-by: Sabyrzhan Tasbolatov 
---
 net/qrtr/tun.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c
index b238c40a9984..9b607c7614de 100644
--- a/net/qrtr/tun.c
+++ b/net/qrtr/tun.c
@@ -86,7 +86,7 @@ static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct 
iov_iter *from)
if (len > KMALLOC_MAX_SIZE)
return -ENOMEM;
 
-   kbuf = kzalloc(len, GFP_KERNEL);
+   kbuf = kmalloc(len, GFP_KERNEL);
if (!kbuf)
return -ENOMEM;
 
-- 
2.25.1



Re: [PATCH 02/12] kasan, mm: optimize kmalloc poisoning

2021-02-02 Thread Marco Elver
On Tue, 2 Feb 2021 at 18:16, Andrey Konovalov  wrote:
>
> On Tue, Feb 2, 2021 at 5:25 PM Marco Elver  wrote:
> >
> > > +#ifdef CONFIG_KASAN_GENERIC
> > > +
> > > +/**
> > > + * kasan_poison_last_granule - mark the last granule of the memory range 
> > > as
> > > + * unaccessible
> > > + * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
> > > + * @size - range size
> > > + *
> > > + * This function is only available for the generic mode, as it's the 
> > > only mode
> > > + * that has partially poisoned memory granules.
> > > + */
> > > +void kasan_poison_last_granule(const void *address, size_t size);
> > > +
> > > +#else /* CONFIG_KASAN_GENERIC */
> > > +
> > > +static inline void kasan_poison_last_granule(const void *address, size_t 
> > > size) { }
>
> ^
>
> > > +
> > > +#endif /* CONFIG_KASAN_GENERIC */
> > > +
> > >  /*
> > >   * Exported functions for interfaces called from assembly or from 
> > > generated
> > >   * code. Declarations here to avoid warning about missing declarations.
>
> > > @@ -96,6 +92,16 @@ void kasan_poison(const void *address, size_t size, u8 
> > > value)
> > >  }
> > >  EXPORT_SYMBOL(kasan_poison);
> > >
> > > +#ifdef CONFIG_KASAN_GENERIC
> > > +void kasan_poison_last_granule(const void *address, size_t size)
> > > +{
> > > + if (size & KASAN_GRANULE_MASK) {
> > > + u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
> > > + *shadow = size & KASAN_GRANULE_MASK;
> > > + }
> > > +}
> > > +#endif
> >
> > The function declaration still needs to exist in the dead branch if
> > !IS_ENABLED(CONFIG_KASAN_GENERIC). It appears in that case it's declared
> > (in kasan.h), but not defined.  We shouldn't get linker errors because
> > the optimizer should remove the dead branch. Nevertheless, is this code
> > generally acceptable?
>
> The function is defined as empty when !CONFIG_KASAN_GENERIC, see above.

I missed that, thanks.

Reviewed-by: Marco Elver 


Re: [PATCH 02/12] kasan, mm: optimize kmalloc poisoning

2021-02-02 Thread Andrey Konovalov
On Tue, Feb 2, 2021 at 5:25 PM Marco Elver  wrote:
>
> > +#ifdef CONFIG_KASAN_GENERIC
> > +
> > +/**
> > + * kasan_poison_last_granule - mark the last granule of the memory range as
> > + * unaccessible
> > + * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
> > + * @size - range size
> > + *
> > + * This function is only available for the generic mode, as it's the only 
> > mode
> > + * that has partially poisoned memory granules.
> > + */
> > +void kasan_poison_last_granule(const void *address, size_t size);
> > +
> > +#else /* CONFIG_KASAN_GENERIC */
> > +
> > +static inline void kasan_poison_last_granule(const void *address, size_t 
> > size) { }

^

> > +
> > +#endif /* CONFIG_KASAN_GENERIC */
> > +
> >  /*
> >   * Exported functions for interfaces called from assembly or from generated
> >   * code. Declarations here to avoid warning about missing declarations.

> > @@ -96,6 +92,16 @@ void kasan_poison(const void *address, size_t size, u8 
> > value)
> >  }
> >  EXPORT_SYMBOL(kasan_poison);
> >
> > +#ifdef CONFIG_KASAN_GENERIC
> > +void kasan_poison_last_granule(const void *address, size_t size)
> > +{
> > + if (size & KASAN_GRANULE_MASK) {
> > + u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
> > + *shadow = size & KASAN_GRANULE_MASK;
> > + }
> > +}
> > +#endif
>
> The function declaration still needs to exist in the dead branch if
> !IS_ENABLED(CONFIG_KASAN_GENERIC). It appears in that case it's declared
> (in kasan.h), but not defined.  We shouldn't get linker errors because
> the optimizer should remove the dead branch. Nevertheless, is this code
> generally acceptable?

The function is defined as empty when !CONFIG_KASAN_GENERIC, see above.


Re: [PATCH 03/12] kasan: optimize large kmalloc poisoning

2021-02-02 Thread Marco Elver
On Mon, Feb 01, 2021 at 08:43PM +0100, Andrey Konovalov wrote:
> Similarly to kasan_kmalloc(), kasan_kmalloc_large() doesn't need
> to unpoison the object, as it is already unpoisoned by alloc_pages()
> (or by ksize() for krealloc()).
> 
> This patch changes kasan_kmalloc_large() to only poison the redzone.
> 
> Signed-off-by: Andrey Konovalov 

Reviewed-by: Marco Elver 

> ---
>  mm/kasan/common.c | 20 +++-
>  1 file changed, 15 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/kasan/common.c b/mm/kasan/common.c
> index 128cb330ca73..a7eb553c8e91 100644
> --- a/mm/kasan/common.c
> +++ b/mm/kasan/common.c
> @@ -494,7 +494,6 @@ EXPORT_SYMBOL(__kasan_kmalloc);
>  void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
>   gfp_t flags)
>  {
> - struct page *page;
>   unsigned long redzone_start;
>   unsigned long redzone_end;
>  
> @@ -504,12 +503,23 @@ void * __must_check __kasan_kmalloc_large(const void 
> *ptr, size_t size,
>   if (unlikely(ptr == NULL))
>   return NULL;
>  
> - page = virt_to_page(ptr);
> + /*
> +  * The object has already been unpoisoned by kasan_alloc_pages() for
> +  * alloc_pages() or by ksize() for krealloc().
> +  */
> +
> + /*
> +  * The redzone has byte-level precision for the generic mode.
> +  * Partially poison the last object granule to cover the unaligned
> +  * part of the redzone.
> +  */
> + if (IS_ENABLED(CONFIG_KASAN_GENERIC))
> + kasan_poison_last_granule(ptr, size);
> +
> + /* Poison the aligned part of the redzone. */
>   redzone_start = round_up((unsigned long)(ptr + size),
>   KASAN_GRANULE_SIZE);
> - redzone_end = (unsigned long)ptr + page_size(page);
> -
> - kasan_unpoison(ptr, size);
> + redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr));
>   kasan_poison((void *)redzone_start, redzone_end - redzone_start,
>KASAN_PAGE_REDZONE);
>  
> -- 
> 2.30.0.365.g02bc693789-goog
> 


Re: [PATCH 02/12] kasan, mm: optimize kmalloc poisoning

2021-02-02 Thread Marco Elver
On Mon, Feb 01, 2021 at 08:43PM +0100, Andrey Konovalov wrote:
> For allocations from kmalloc caches, kasan_kmalloc() always follows
> kasan_slab_alloc(). Currently, both of them unpoison the whole object,
> which is unnecessary.
> 
> This patch provides separate implementations for both annotations:
> kasan_slab_alloc() unpoisons the whole object, and kasan_kmalloc()
> only poisons the redzone.
> 
> For generic KASAN, the redzone start might not be aligned to
> KASAN_GRANULE_SIZE. Therefore, the poisoning is split in two parts:
> kasan_poison_last_granule() poisons the unaligned part, and then
> kasan_poison() poisons the rest.
> 
> This patch also clarifies alignment guarantees of each of the poisoning
> functions and drops the unnecessary round_up() call for redzone_end.
> 
> With this change, the early SLUB cache annotation needs to be changed to
> kasan_slab_alloc(), as kasan_kmalloc() doesn't unpoison objects now.
> The number of poisoned bytes for objects in this cache stays the same, as
> kmem_cache_node->object_size is equal to sizeof(struct kmem_cache_node).
> 
> Signed-off-by: Andrey Konovalov 
> ---
>  mm/kasan/common.c | 93 +++
>  mm/kasan/kasan.h  | 43 +-
>  mm/kasan/shadow.c | 28 +++---
>  mm/slub.c |  3 +-
>  4 files changed, 119 insertions(+), 48 deletions(-)
> 
> diff --git a/mm/kasan/common.c b/mm/kasan/common.c
> index 374049564ea3..128cb330ca73 100644
> --- a/mm/kasan/common.c
> +++ b/mm/kasan/common.c
> @@ -278,21 +278,11 @@ void __kasan_poison_object_data(struct kmem_cache 
> *cache, void *object)
>   *based on objects indexes, so that objects that are next to each other
>   *get different tags.
>   */
> -static u8 assign_tag(struct kmem_cache *cache, const void *object,
> - bool init, bool keep_tag)
> +static u8 assign_tag(struct kmem_cache *cache, const void *object, bool init)
>  {
>   if (IS_ENABLED(CONFIG_KASAN_GENERIC))
>   return 0xff;
>  
> - /*
> -  * 1. When an object is kmalloc()'ed, two hooks are called:
> -  *kasan_slab_alloc() and kasan_kmalloc(). We assign the
> -  *tag only in the first one.
> -  * 2. We reuse the same tag for krealloc'ed objects.
> -  */
> - if (keep_tag)
> - return get_tag(object);
> -
>   /*
>* If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
>* set, assign a tag when the object is being allocated (init == false).
> @@ -325,7 +315,7 @@ void * __must_check __kasan_init_slab_obj(struct 
> kmem_cache *cache,
>   }
>  
>   /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */
> - object = set_tag(object, assign_tag(cache, object, true, false));
> + object = set_tag(object, assign_tag(cache, object, true));
>  
>   return (void *)object;
>  }
> @@ -413,12 +403,46 @@ static void set_alloc_info(struct kmem_cache *cache, 
> void *object,
> 	kasan_set_track(&alloc_meta->alloc_track, flags);
>  }
>  
> +void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
> + void *object, gfp_t flags)
> +{
> + u8 tag;
> + void *tagged_object;
> +
> + if (gfpflags_allow_blocking(flags))
> + kasan_quarantine_reduce();
> +
> + if (unlikely(object == NULL))
> + return NULL;
> +
> + if (is_kfence_address(object))
> + return (void *)object;
> +
> + /*
> +  * Generate and assign random tag for tag-based modes.
> +  * Tag is ignored in set_tag() for the generic mode.
> +  */
> + tag = assign_tag(cache, object, false);
> + tagged_object = set_tag(object, tag);
> +
> + /*
> +  * Unpoison the whole object.
> +  * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning.
> +  */
> + kasan_unpoison(tagged_object, cache->object_size);
> +
> + /* Save alloc info (if possible) for non-kmalloc() allocations. */
> + if (kasan_stack_collection_enabled())
> + set_alloc_info(cache, (void *)object, flags, false);
> +
> + return tagged_object;
> +}
> +
>  static void *kasan_kmalloc(struct kmem_cache *cache, const void *object,
> - size_t size, gfp_t flags, bool kmalloc)
> + size_t size, gfp_t flags)
>  {
>   unsigned long redzone_start;
>   unsigned long redzone_end;
> - u8 tag;
>  
>   if (gfpflags_allow_blocking(flags))
>   kasan_quarantine_reduce();
> @@ -429,33 +453,41 @@ static void *kasan_kmalloc(struct kmem_c

[PATCH 02/12] kasan, mm: optimize kmalloc poisoning

2021-02-01 Thread Andrey Konovalov
For allocations from kmalloc caches, kasan_kmalloc() always follows
kasan_slab_alloc(). Currently, both of them unpoison the whole object,
which is unnecessary.

This patch provides separate implementations for both annotations:
kasan_slab_alloc() unpoisons the whole object, and kasan_kmalloc()
only poisons the redzone.

For generic KASAN, the redzone start might not be aligned to
KASAN_GRANULE_SIZE. Therefore, the poisoning is split into two parts:
kasan_poison_last_granule() poisons the unaligned part, and then
kasan_poison() poisons the rest.

This patch also clarifies alignment guarantees of each of the poisoning
functions and drops the unnecessary round_up() call for redzone_end.

With this change, the early SLUB cache annotation needs to be changed to
kasan_slab_alloc(), as kasan_kmalloc() doesn't unpoison objects now.
The number of poisoned bytes for objects in this cache stays the same, as
kmem_cache_node->object_size is equal to sizeof(struct kmem_cache_node).

Signed-off-by: Andrey Konovalov 
---
 mm/kasan/common.c | 93 +++
 mm/kasan/kasan.h  | 43 +-
 mm/kasan/shadow.c | 28 +++---
 mm/slub.c |  3 +-
 4 files changed, 119 insertions(+), 48 deletions(-)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 374049564ea3..128cb330ca73 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -278,21 +278,11 @@ void __kasan_poison_object_data(struct kmem_cache *cache, 
void *object)
  *based on objects indexes, so that objects that are next to each other
  *get different tags.
  */
-static u8 assign_tag(struct kmem_cache *cache, const void *object,
-   bool init, bool keep_tag)
+static u8 assign_tag(struct kmem_cache *cache, const void *object, bool init)
 {
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
return 0xff;
 
-   /*
-* 1. When an object is kmalloc()'ed, two hooks are called:
-*kasan_slab_alloc() and kasan_kmalloc(). We assign the
-*tag only in the first one.
-* 2. We reuse the same tag for krealloc'ed objects.
-*/
-   if (keep_tag)
-   return get_tag(object);
-
/*
 * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
 * set, assign a tag when the object is being allocated (init == false).
@@ -325,7 +315,7 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache 
*cache,
}
 
/* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */
-   object = set_tag(object, assign_tag(cache, object, true, false));
+   object = set_tag(object, assign_tag(cache, object, true));
 
return (void *)object;
 }
@@ -413,12 +403,46 @@ static void set_alloc_info(struct kmem_cache *cache, void 
*object,
	kasan_set_track(&alloc_meta->alloc_track, flags);
 }
 
+void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
+   void *object, gfp_t flags)
+{
+   u8 tag;
+   void *tagged_object;
+
+   if (gfpflags_allow_blocking(flags))
+   kasan_quarantine_reduce();
+
+   if (unlikely(object == NULL))
+   return NULL;
+
+   if (is_kfence_address(object))
+   return (void *)object;
+
+   /*
+* Generate and assign random tag for tag-based modes.
+* Tag is ignored in set_tag() for the generic mode.
+*/
+   tag = assign_tag(cache, object, false);
+   tagged_object = set_tag(object, tag);
+
+   /*
+* Unpoison the whole object.
+* For kmalloc() allocations, kasan_kmalloc() will do precise poisoning.
+*/
+   kasan_unpoison(tagged_object, cache->object_size);
+
+   /* Save alloc info (if possible) for non-kmalloc() allocations. */
+   if (kasan_stack_collection_enabled())
+   set_alloc_info(cache, (void *)object, flags, false);
+
+   return tagged_object;
+}
+
 static void *kasan_kmalloc(struct kmem_cache *cache, const void *object,
-   size_t size, gfp_t flags, bool kmalloc)
+   size_t size, gfp_t flags)
 {
unsigned long redzone_start;
unsigned long redzone_end;
-   u8 tag;
 
if (gfpflags_allow_blocking(flags))
kasan_quarantine_reduce();
@@ -429,33 +453,41 @@ static void *kasan_kmalloc(struct kmem_cache *cache, 
const void *object,
if (is_kfence_address(kasan_reset_tag(object)))
return (void *)object;
 
+   /*
+* The object has already been unpoisoned by kasan_slab_alloc() for
+    * kmalloc() or by ksize() for krealloc().
+*/
+
+   /*
+* The redzone has byte-level precision for the generic mode.
+* Partially poison the last object granule to cover the unaligned
+* part of the redzone.
+*/
+   if (IS_ENABLED(CONFIG_KASA

[PATCH 03/12] kasan: optimize large kmalloc poisoning

2021-02-01 Thread Andrey Konovalov
Similarly to kasan_kmalloc(), kasan_kmalloc_large() doesn't need
to unpoison the object, as it is already unpoisoned by alloc_pages()
(or by ksize() for krealloc()).

This patch changes kasan_kmalloc_large() to only poison the redzone.

Signed-off-by: Andrey Konovalov 
---
 mm/kasan/common.c | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 128cb330ca73..a7eb553c8e91 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -494,7 +494,6 @@ EXPORT_SYMBOL(__kasan_kmalloc);
 void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
gfp_t flags)
 {
-   struct page *page;
unsigned long redzone_start;
unsigned long redzone_end;
 
@@ -504,12 +503,23 @@ void * __must_check __kasan_kmalloc_large(const void 
*ptr, size_t size,
if (unlikely(ptr == NULL))
return NULL;
 
-   page = virt_to_page(ptr);
+   /*
+* The object has already been unpoisoned by kasan_alloc_pages() for
+* alloc_pages() or by ksize() for krealloc().
+*/
+
+   /*
+* The redzone has byte-level precision for the generic mode.
+* Partially poison the last object granule to cover the unaligned
+* part of the redzone.
+*/
+   if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+   kasan_poison_last_granule(ptr, size);
+
+   /* Poison the aligned part of the redzone. */
redzone_start = round_up((unsigned long)(ptr + size),
KASAN_GRANULE_SIZE);
-   redzone_end = (unsigned long)ptr + page_size(page);
-
-   kasan_unpoison(ptr, size);
+   redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr));
kasan_poison((void *)redzone_start, redzone_end - redzone_start,
 KASAN_PAGE_REDZONE);
 
-- 
2.30.0.365.g02bc693789-goog



Re: [PATCH v15 09/10] arm64: Call kmalloc() to allocate DTB buffer

2021-02-01 Thread Thiago Jung Bauermann


Joe Perches  writes:

> On Thu, 2021-01-28 at 00:52 -0300, Thiago Jung Bauermann wrote:
>> The problem is that this patch implements only part of the suggestion,
>> which isn't useful in itself. So the patch series should either drop
>> this patch or consolidate the FDT allocation between the arches.
>> 
>> I just tested on powernv and pseries platforms and powerpc can use
>> vmalloc for the FDT buffer.
>
> Perhaps more sensible to use kvmalloc/kvfree.

That's true. Converting both arm64 and powerpc to kvmalloc()/kvfree() is
a good option. I don't think it's that much better, though, because
kexec_file_load() is called infrequently and doesn't need to be fast, so
the vmalloc() overhead isn't important in practice.
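
A minimal sketch of what the consolidated allocation could look like
(hypothetical helper, not part of the posted series):

	void *kexec_fdt_alloc(size_t buf_size)
	{
		/* kmalloc-backed when possible, falls back to vmalloc */
		return kvmalloc(buf_size, GFP_KERNEL);
	}

	/* matching cleanup in arch_kimage_file_post_load_cleanup(): */
	kvfree(image->arch.dtb);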

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [Intel-gfx] v5.11-rc5 BUG kmalloc-1k (Not tainted): Redzone overwritten

2021-02-01 Thread Chris Wilson
Quoting Jani Nikula (2021-01-28 13:23:48)
> 
> A number of our CI systems are hitting redzone overwritten errors after
> s2idle, with the errors introduced between v5.11-rc4 and v5.11-rc5. See
> snippet below, full logs for one affected machine at [1].
> 
> Known issue?

Fwiw, I think this should be fixed by

commit 08d60e5999540110576e7c1346d486220751b7f9
Author: John Ogness 
Date:   Sun Jan 24 21:33:28 2021 +0106

printk: fix string termination for record_print_text()

Commit f0e386ee0c0b ("printk: fix buffer overflow potential for
print_text()") added string termination in record_print_text().
However it used the wrong base pointer for adding the terminator.
This led to a 0-byte being written somewhere beyond the buffer.

Use the correct base pointer when adding the terminator.

Fixes: f0e386ee0c0b ("printk: fix buffer overflow potential for 
print_text()")
Reported-by: Sven Schnelle 
Signed-off-by: John Ogness 
Signed-off-by: Petr Mladek 
Link: 
https://lore.kernel.org/r/20210124202728.4718-1-john.ogn...@linutronix.de
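
The bug class, sketched generically (illustrative only, not the printk
code): an offset that is valid for one buffer gets applied through a
different base pointer.

	char a[64], b[16];
	size_t len = 32;	/* a valid index into a[], not into b[] */

	a[len] = '\0';		/* correct: terminator lands inside a[] */
	b[len] = '\0';		/* wrong base: 0-byte written past b[] */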

din should be rolled forward, but there's yet another regression in rc6
breaking suspend on all machines.
-Chris


v5.11-rc5 BUG kmalloc-1k (Not tainted): Redzone overwritten

2021-01-28 Thread Jani Nikula


A number of our CI systems are hitting redzone overwritten errors after
s2idle, with the errors introduced between v5.11-rc4 and v5.11-rc5. See
snippet below, full logs for one affected machine at [1].

Known issue?

BR,
Jani.


[1] 
https://intel-gfx-ci.01.org/tree/drm-intel-fixes/CI_DIF_549/fi-tgl-u2/igt@gem_exec_susp...@basic-s0.html


<6> [71.947160] Restarting tasks ... done.
<3> [71.948035] 
=
<3> [71.948545] BUG kmalloc-1k (Not tainted): Redzone overwritten
<3> [71.948577] 
-
<4> [71.948625] Disabling lock debugging due to kernel taint
<3> [71.948626] INFO: 0xde6e27d6-0xeaa949e9 @offset=29696. 
First byte 0x0 instead of 0xcc
<3> [71.948633] INFO: Allocated in syslog_print+0x39/0x200 age=1 cpu=6 pid=427
<3> [71.948642] __slab_alloc.isra.86.constprop.94+0x7e/0x90
<3> [71.948647] kmem_cache_alloc_trace+0x337/0x420
<3> [71.948651] syslog_print+0x39/0x200
<3> [71.948654] do_syslog.part.23+0x31a/0x480
<3> [71.948658] kmsg_read+0x3c/0x50
<3> [71.948663] vfs_read+0xa8/0x1b0
<3> [71.948667] ksys_read+0x5a/0xd0
<3> [71.948670] do_syscall_64+0x33/0x80
<3> [71.948674] entry_SYSCALL_64_after_hwframe+0x44/0xa9
<3> [71.948679] INFO: Freed in kfree_rcu_work+0x2ef/0x320 age=658 cpu=3 pid=195
<3> [71.948685] kmem_cache_free_bulk+0xbeb/0xcb0
<3> [71.948689] kfree_rcu_work+0x2ef/0x320
<3> [71.948693] process_one_work+0x270/0x5c0
<3> [71.948697] worker_thread+0x37/0x380
<3> [71.948701] kthread+0x146/0x170
<3> [71.948705] ret_from_fork+0x1f/0x30
<3> [71.948709] INFO: Slab 0x97533187 objects=10 used=9 
fp=0x2183a6cf flags=0x80010201
<3> [71.948714] INFO: Object 0xb43421a9 @offset=28672 
fp=0x
<3> [71.948720] Redzone 34747f33: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948726] Redzone 1c57bf27: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948731] Redzone 6763eea0: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948736] Redzone 28c40de8: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948742] Redzone c8e197cc: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948747] Redzone 8b77f05a: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948752] Redzone 3e1a5f65: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948757] Redzone 568b4b04: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948762] Redzone 183f376e: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948767] Redzone 3aafec8b: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948773] Redzone ec8d8c96: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948779] Redzone 54f062b1: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948784] Redzone 44f67988: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948790] Redzone f7a07bd6: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948796] Redzone 4719a4f1: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948801] Redzone 06a35936: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948807] Redzone f8aefa64: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948812] Redzone 8550e102: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948817] Redzone 9189a7a1: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948823] Redzone 7a3b9eea: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948828] Redzone ab035e44: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948834] Redzone 01d5ee40: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948840] Redzone 9c676a9c: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948845] Redzone 309ce2cf: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948850] Redzone f620b753: cc cc cc cc cc cc cc cc cc cc cc cc 
cc cc cc cc  
<3> [71.948855] Redzone

Re: [PATCH v15 09/10] arm64: Call kmalloc() to allocate DTB buffer

2021-01-27 Thread Joe Perches
On Thu, 2021-01-28 at 00:52 -0300, Thiago Jung Bauermann wrote:
> The problem is that this patch implements only part of the suggestion,
> which isn't useful in itself. So the patch series should either drop
> this patch or consolidate the FDT allocation between the arches.
> 
> I just tested on powernv and pseries platforms and powerpc can use
> vmalloc for the FDT buffer.

Perhaps more sensible to use kvmalloc/kvfree.





Re: [PATCH v15 09/10] arm64: Call kmalloc() to allocate DTB buffer

2021-01-27 Thread Lakshmi Ramasubramanian

On 1/27/21 8:14 PM, Thiago Jung Bauermann wrote:


Lakshmi Ramasubramanian  writes:


On 1/27/21 7:52 PM, Thiago Jung Bauermann wrote:

Will Deacon  writes:


On Wed, Jan 27, 2021 at 09:59:38AM -0800, Lakshmi Ramasubramanian wrote:

On 1/27/21 8:52 AM, Will Deacon wrote:

Hi Will,


On Fri, Jan 15, 2021 at 09:30:16AM -0800, Lakshmi Ramasubramanian wrote:

create_dtb() function allocates kernel virtual memory for
the device tree blob (DTB).  This is not consistent with other
architectures, such as powerpc, which calls kmalloc() for allocating
memory for the DTB.

Call kmalloc() to allocate memory for the DTB, and kfree() to free
the allocated memory.

Co-developed-by: Prakhar Srivastava 
Signed-off-by: Prakhar Srivastava 
Signed-off-by: Lakshmi Ramasubramanian 
---
arch/arm64/kernel/machine_kexec_file.c | 12 +++-
1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/kernel/machine_kexec_file.c 
b/arch/arm64/kernel/machine_kexec_file.c
index 7de9c47dee7c..51c40143d6fa 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -29,7 +29,7 @@ const struct kexec_file_ops * const kexec_file_loaders[] = {
int arch_kimage_file_post_load_cleanup(struct kimage *image)
{
-   vfree(image->arch.dtb);
+   kfree(image->arch.dtb);
image->arch.dtb = NULL;
vfree(image->arch.elf_headers);
@@ -59,19 +59,21 @@ static int create_dtb(struct kimage *image,
+ cmdline_len + DTB_EXTRA_SPACE;
for (;;) {
-   buf = vmalloc(buf_size);
+   buf = kmalloc(buf_size, GFP_KERNEL);


Is there a functional need for this patch? I build the 'dtbs' target just
now and sdm845-db845c.dtb is approaching 100K, which feels quite large
for kmalloc().


Changing the allocation from vmalloc() to kmalloc() would help us further
consolidate the DTB setup code for powerpc and arm64.


Ok, but at the risk of allocation failure. Can powerpc use vmalloc()
instead?

I believe this patch stems from this suggestion by Rob Herring:


This could be taken a step further and do the allocation of the new
FDT. The difference is arm64 uses vmalloc and powerpc uses kmalloc. The
arm64 version also retries with a bigger allocation. That seems
unnecessary.

in
https://lore.kernel.org/linux-integrity/20201211221006.1052453-3-r...@kernel.org/
The problem is that this patch implements only part of the suggestion,
which isn't useful in itself. So the patch series should either drop
this patch or consolidate the FDT allocation between the arches.
I just tested on powernv and pseries platforms and powerpc can use
vmalloc for the FDT buffer.



Thanks for verifying on powerpc platform Thiago.

I'll update the patch to do the following:

=> Use vmalloc for FDT buffer allocation on powerpc
=> Keep vmalloc for arm64, but remove the retry on allocation.
=> Also, there was a memory leak of FDT buffer in the error code path on arm64,
which I'll fix as well.

Did I miss anything?


Yes, you missed the second part of Rob's suggestion I was mentioning,
which is factoring out the code which allocates the new FDT from both
arm64 and powerpc.



Sure - I'll address that.

thanks,
 -lakshmi



Re: [PATCH v15 09/10] arm64: Call kmalloc() to allocate DTB buffer

2021-01-27 Thread Thiago Jung Bauermann


Will Deacon  writes:

> On Wed, Jan 27, 2021 at 09:59:38AM -0800, Lakshmi Ramasubramanian wrote:
>> On 1/27/21 8:52 AM, Will Deacon wrote:
>> 
>> Hi Will,
>> 
>> > On Fri, Jan 15, 2021 at 09:30:16AM -0800, Lakshmi Ramasubramanian wrote:
>> > > create_dtb() function allocates kernel virtual memory for
>> > > the device tree blob (DTB).  This is not consistent with other
>> > > architectures, such as powerpc, which calls kmalloc() for allocating
>> > > memory for the DTB.
>> > > 
>> > > Call kmalloc() to allocate memory for the DTB, and kfree() to free
>> > > the allocated memory.
>> > > 
>> > > Co-developed-by: Prakhar Srivastava 
>> > > Signed-off-by: Prakhar Srivastava 
>> > > Signed-off-by: Lakshmi Ramasubramanian 
>> > > ---
>> > >   arch/arm64/kernel/machine_kexec_file.c | 12 +++-
>> > >   1 file changed, 7 insertions(+), 5 deletions(-)
>> > > 
>> > > diff --git a/arch/arm64/kernel/machine_kexec_file.c 
>> > > b/arch/arm64/kernel/machine_kexec_file.c
>> > > index 7de9c47dee7c..51c40143d6fa 100644
>> > > --- a/arch/arm64/kernel/machine_kexec_file.c
>> > > +++ b/arch/arm64/kernel/machine_kexec_file.c
>> > > @@ -29,7 +29,7 @@ const struct kexec_file_ops * const 
>> > > kexec_file_loaders[] = {
>> > >   int arch_kimage_file_post_load_cleanup(struct kimage *image)
>> > >   {
>> > > -vfree(image->arch.dtb);
>> > > +kfree(image->arch.dtb);
>> > >  image->arch.dtb = NULL;
>> > >  vfree(image->arch.elf_headers);
>> > > @@ -59,19 +59,21 @@ static int create_dtb(struct kimage *image,
>> > >      + cmdline_len + DTB_EXTRA_SPACE;
>> > >  for (;;) {
>> > > -buf = vmalloc(buf_size);
>> > > +buf = kmalloc(buf_size, GFP_KERNEL);
>> > 
>> > Is there a functional need for this patch? I build the 'dtbs' target just
>> > now and sdm845-db845c.dtb is approaching 100K, which feels quite large
>> > for kmalloc().
>> 
>> Changing the allocation from vmalloc() to kmalloc() would help us further
>> consolidate the DTB setup code for powerpc and arm64.
>
> Ok, but at the risk of allocation failure. Can powerpc use vmalloc()
> instead?

I believe this patch stems from this suggestion by Rob Herring:

> This could be taken a step further and do the allocation of the new
> FDT. The difference is arm64 uses vmalloc and powerpc uses kmalloc. The
> arm64 version also retries with a bigger allocation. That seems
> unnecessary.

in 
https://lore.kernel.org/linux-integrity/20201211221006.1052453-3-r...@kernel.org/

The problem is that this patch implements only part of the suggestion,
which isn't useful in itself. So the patch series should either drop
this patch or consolidate the FDT allocation between the arches.

I just tested on powernv and pseries platforms and powerpc can use
vmalloc for the FDT buffer.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH v15 09/10] arm64: Call kmalloc() to allocate DTB buffer

2021-01-27 Thread Thiago Jung Bauermann


Lakshmi Ramasubramanian  writes:

> On 1/27/21 7:52 PM, Thiago Jung Bauermann wrote:
>> Will Deacon  writes:
>> 
>>> On Wed, Jan 27, 2021 at 09:59:38AM -0800, Lakshmi Ramasubramanian wrote:
>>>> On 1/27/21 8:52 AM, Will Deacon wrote:
>>>>
>>>> Hi Will,
>>>>
>>>>> On Fri, Jan 15, 2021 at 09:30:16AM -0800, Lakshmi Ramasubramanian wrote:
>>>>>> The create_dtb() function allocates kernel virtual memory for
>>>>>> the device tree blob (DTB).  This is not consistent with other
>>>>>> architectures, such as powerpc, which calls kmalloc() for allocating
>>>>>> memory for the DTB.
>>>>>>
>>>>>> Call kmalloc() to allocate memory for the DTB, and kfree() to free
>>>>>> the allocated memory.
>>>>>>
>>>>>> Co-developed-by: Prakhar Srivastava 
>>>>>> Signed-off-by: Prakhar Srivastava 
>>>>>> Signed-off-by: Lakshmi Ramasubramanian 
>>>>>> ---
>>>>>>arch/arm64/kernel/machine_kexec_file.c | 12 +++-
>>>>>>1 file changed, 7 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/arch/arm64/kernel/machine_kexec_file.c 
>>>>>> b/arch/arm64/kernel/machine_kexec_file.c
>>>>>> index 7de9c47dee7c..51c40143d6fa 100644
>>>>>> --- a/arch/arm64/kernel/machine_kexec_file.c
>>>>>> +++ b/arch/arm64/kernel/machine_kexec_file.c
>>>>>> @@ -29,7 +29,7 @@ const struct kexec_file_ops * const 
>>>>>> kexec_file_loaders[] = {
>>>>>>int arch_kimage_file_post_load_cleanup(struct kimage *image)
>>>>>>{
>>>>>> -vfree(image->arch.dtb);
>>>>>> +kfree(image->arch.dtb);
>>>>>>  image->arch.dtb = NULL;
>>>>>>  vfree(image->arch.elf_headers);
>>>>>> @@ -59,19 +59,21 @@ static int create_dtb(struct kimage *image,
>>>>>>  + cmdline_len + DTB_EXTRA_SPACE;
>>>>>>  for (;;) {
>>>>>> -buf = vmalloc(buf_size);
>>>>>> +buf = kmalloc(buf_size, GFP_KERNEL);
>>>>>
>>>>> Is there a functional need for this patch? I built the 'dtbs' target just
>>>>> now and sdm845-db845c.dtb is approaching 100K, which feels quite large
>>>>> for kmalloc().
>>>>
>>>> Changing the allocation from vmalloc() to kmalloc() would help us further
>>>> consolidate the DTB setup code for powerpc and arm64.
>>>
>>> Ok, but at the risk of allocation failure. Can powerpc use vmalloc()
>>> instead?
>> I believe this patch stems from this suggestion by Rob Herring:
>> 
>>> This could be taken a step further and do the allocation of the new
>>> FDT. The difference is arm64 uses vmalloc and powerpc uses kmalloc. The
>>> arm64 version also retries with a bigger allocation. That seems
>>> unnecessary.
>> in
>> https://lore.kernel.org/linux-integrity/20201211221006.1052453-3-r...@kernel.org/
>> The problem is that this patch implements only part of the suggestion,
>> which isn't useful in itself. So the patch series should either drop
>> this patch or consolidate the FDT allocation between the arches.
>> I just tested on powernv and pseries platforms and powerpc can use
>> vmalloc for the FDT buffer.
>> 
>
> Thanks for verifying on the powerpc platform, Thiago.
>
> I'll update the patch to do the following:
>
> => Use vmalloc for FDT buffer allocation on powerpc
> => Keep vmalloc for arm64, but remove the retry on allocation.
> => Also, there was a memory leak of the FDT buffer in the error code path on
> arm64, which I'll fix as well.
>
> Did I miss anything?

Yes, you missed the second part of Rob's suggestion I was mentioning,
which is factoring out the code that allocates the new FDT from both
arm64 and powerpc.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH v15 09/10] arm64: Call kmalloc() to allocate DTB buffer

2021-01-27 Thread Lakshmi Ramasubramanian

On 1/27/21 7:52 PM, Thiago Jung Bauermann wrote:


Will Deacon  writes:


On Wed, Jan 27, 2021 at 09:59:38AM -0800, Lakshmi Ramasubramanian wrote:

On 1/27/21 8:52 AM, Will Deacon wrote:

Hi Will,


On Fri, Jan 15, 2021 at 09:30:16AM -0800, Lakshmi Ramasubramanian wrote:

The create_dtb() function allocates kernel virtual memory for
the device tree blob (DTB).  This is not consistent with other
architectures, such as powerpc, which calls kmalloc() for allocating
memory for the DTB.

Call kmalloc() to allocate memory for the DTB, and kfree() to free
the allocated memory.

Co-developed-by: Prakhar Srivastava 
Signed-off-by: Prakhar Srivastava 
Signed-off-by: Lakshmi Ramasubramanian 
---
   arch/arm64/kernel/machine_kexec_file.c | 12 +++-
   1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/kernel/machine_kexec_file.c 
b/arch/arm64/kernel/machine_kexec_file.c
index 7de9c47dee7c..51c40143d6fa 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -29,7 +29,7 @@ const struct kexec_file_ops * const kexec_file_loaders[] = {
   int arch_kimage_file_post_load_cleanup(struct kimage *image)
   {
-   vfree(image->arch.dtb);
+   kfree(image->arch.dtb);
image->arch.dtb = NULL;
vfree(image->arch.elf_headers);
@@ -59,19 +59,21 @@ static int create_dtb(struct kimage *image,
+ cmdline_len + DTB_EXTRA_SPACE;
for (;;) {
-   buf = vmalloc(buf_size);
+   buf = kmalloc(buf_size, GFP_KERNEL);


Is there a functional need for this patch? I built the 'dtbs' target just
now and sdm845-db845c.dtb is approaching 100K, which feels quite large
for kmalloc().


Changing the allocation from vmalloc() to kmalloc() would help us further
consolidate the DTB setup code for powerpc and arm64.


Ok, but at the risk of allocation failure. Can powerpc use vmalloc()
instead?


I believe this patch stems from this suggestion by Rob Herring:


This could be taken a step further and do the allocation of the new
FDT. The difference is arm64 uses vmalloc and powerpc uses kmalloc. The
arm64 version also retries with a bigger allocation. That seems
unnecessary.


in 
https://lore.kernel.org/linux-integrity/20201211221006.1052453-3-r...@kernel.org/

The problem is that this patch implements only part of the suggestion,
which isn't useful in itself. So the patch series should either drop
this patch or consolidate the FDT allocation between the arches.

I just tested on powernv and pseries platforms and powerpc can use
vmalloc for the FDT buffer.



Thanks for verifying on the powerpc platform, Thiago.

I'll update the patch to do the following:

=> Use vmalloc for FDT buffer allocation on powerpc
=> Keep vmalloc for arm64, but remove the retry on allocation.
=> Also, there was a memory leak of the FDT buffer in the error code path on
arm64, which I'll fix as well.


Did I miss anything?

thanks,
 -lakshmi
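
The arm64 error-path leak mentioned above, shown in pattern form. The
function and the place_dtb_segment() call are hypothetical stand-ins for the
real caller, used only to illustrate where the missing vfree() belongs:

static int load_dtb_sketch(struct kimage *image,
			   unsigned long initrd_load_addr,
			   unsigned long initrd_len, char *cmdline)
{
	void *dtb = NULL;
	int ret;

	ret = create_dtb(image, initrd_load_addr, initrd_len, cmdline, &dtb);
	if (ret)
		return ret;

	ret = place_dtb_segment(image, dtb);	/* hypothetical helper */
	if (ret)
		goto out_err;

	image->arch.dtb = dtb;	/* ownership kept; freed in cleanup */
	return 0;

out_err:
	vfree(dtb);	/* the free that the error path was missing */
	return ret;
}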


Re: [PATCH v15 09/10] arm64: Call kmalloc() to allocate DTB buffer

2021-01-27 Thread Will Deacon
On Wed, Jan 27, 2021 at 09:59:38AM -0800, Lakshmi Ramasubramanian wrote:
> On 1/27/21 8:52 AM, Will Deacon wrote:
> 
> Hi Will,
> 
> > On Fri, Jan 15, 2021 at 09:30:16AM -0800, Lakshmi Ramasubramanian wrote:
> > > The create_dtb() function allocates kernel virtual memory for
> > > the device tree blob (DTB).  This is not consistent with other
> > > architectures, such as powerpc, which calls kmalloc() for allocating
> > > memory for the DTB.
> > > 
> > > Call kmalloc() to allocate memory for the DTB, and kfree() to free
> > > the allocated memory.
> > > 
> > > Co-developed-by: Prakhar Srivastava 
> > > Signed-off-by: Prakhar Srivastava 
> > > Signed-off-by: Lakshmi Ramasubramanian 
> > > ---
> > >   arch/arm64/kernel/machine_kexec_file.c | 12 +++-
> > >   1 file changed, 7 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/arch/arm64/kernel/machine_kexec_file.c 
> > > b/arch/arm64/kernel/machine_kexec_file.c
> > > index 7de9c47dee7c..51c40143d6fa 100644
> > > --- a/arch/arm64/kernel/machine_kexec_file.c
> > > +++ b/arch/arm64/kernel/machine_kexec_file.c
> > > @@ -29,7 +29,7 @@ const struct kexec_file_ops * const 
> > > kexec_file_loaders[] = {
> > >   int arch_kimage_file_post_load_cleanup(struct kimage *image)
> > >   {
> > > - vfree(image->arch.dtb);
> > > + kfree(image->arch.dtb);
> > >   image->arch.dtb = NULL;
> > >   vfree(image->arch.elf_headers);
> > > @@ -59,19 +59,21 @@ static int create_dtb(struct kimage *image,
> > >   + cmdline_len + DTB_EXTRA_SPACE;
> > >   for (;;) {
> > > - buf = vmalloc(buf_size);
> > > + buf = kmalloc(buf_size, GFP_KERNEL);
> > 
> > Is there a functional need for this patch? I built the 'dtbs' target just
> > now and sdm845-db845c.dtb is approaching 100K, which feels quite large
> > for kmalloc().
> 
> Changing the allocation from vmalloc() to kmalloc() would help us further
> consolidate the DTB setup code for powerpc and arm64.

Ok, but at the risk of allocation failure. Can powerpc use vmalloc()
instead?

Will
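
Will's size concern in concrete terms: with 4K pages, a ~100K DTB means an
order-5 (128K) physically contiguous allocation for kmalloc(), which can
fail on a fragmented machine, while vmalloc() only needs individual pages.
A sketch of the usual compromise follows; the threshold here is arbitrary,
and the in-tree kvmalloc() helper already implements this kind of fallback:

static void *alloc_fdt_buf(size_t size)
{
	void *buf = NULL;

	/* Small buffers: try physically contiguous memory first. */
	if (size <= (PAGE_SIZE << 2))
		buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);

	/* Large (e.g. a ~100K DTB) or fragmented case: page-by-page. */
	if (!buf)
		buf = vmalloc(size);

	return buf;
}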

