Re: [PATCH v2 1/2] hugetlbfs: support tracepoint

2024-07-03 Thread Matthew Wilcox
On Thu, Jul 04, 2024 at 11:07:03AM +0800, Hongbo Li wrote:
> + TP_printk("dev = (%d,%d), ino = %lu, dir = %lu, mode = 0%o",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + (unsigned long) __entry->ino,
> + (unsigned long) __entry->dir, __entry->mode)

erofs and f2fs are the only two places that print devices like this.

"dev=%d:%d inode=%lx"

Why do we need dir and mode?

Actually, why do we need a tracepoint on alloc_inode at all?  What
does it help us debug, and why does no other filesystem need an
alloc_inode tracepoint?



[PATCH v2 2/2] hugetlbfs: use tracepoints in hugetlbfs functions.

2024-07-03 Thread Hongbo Li
Here we use the hugetlbfs tracepoint to track the call stack. And
the output in trace is as follows:

```
touch-5250[002] .   123.557640: hugetlbfs_alloc_inode: dev = (0,49), 
ino = 29897, dir = 24079, mode = 0100644
touch-5250[002] .   123.557650: hugetlbfs_setattr: dev = (0,49), ino = 
29897, name = test1, ia_valid = 131184, ia_mode = 036720, ia_uid = 2544251907, 
ia_gid = 65534, old_size = 0, ia_size = 4064
truncate-5251[002] .   142.939424: hugetlbfs_setattr: dev = (0,49), ino 
= 29897, name = test1, ia_valid = 8296, ia_mode = 00, ia_uid = 0, ia_gid = 0, 
old_size = 0, ia_size = 2097152
rm-5273[002] .   412.618383: hugetlbfs_evict_inode: dev = (0,49), ino = 
29897, i_mode = 0100644, i_size = 2097152, i_nlink = 0, seals = 1, i_blocks = 0
-0   [002] ..s1.   412.634518: hugetlbfs_free_inode: dev = (0,49), 
ino = 29897, i_mode = 0100644, i_size = 2097152, i_nlink = 0, seals = 1, 
i_blocks = 0
```

Signed-off-by: Hongbo Li 
---
 fs/hugetlbfs/inode.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 412f295acebe..2e826bbcb6ed 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -39,6 +39,9 @@
 #include 
 #include 
 
+#define CREATE_TRACE_POINTS
+#include 
+
 static const struct address_space_operations hugetlbfs_aops;
 static const struct file_operations hugetlbfs_file_operations;
 static const struct inode_operations hugetlbfs_dir_inode_operations;
@@ -686,6 +689,7 @@ static void hugetlbfs_evict_inode(struct inode *inode)
 {
struct resv_map *resv_map;
 
+   trace_hugetlbfs_evict_inode(inode);
remove_inode_hugepages(inode, 0, LLONG_MAX);
 
/*
@@ -813,8 +817,10 @@ static long hugetlbfs_fallocate(struct file *file, int 
mode, loff_t offset,
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
 
-   if (mode & FALLOC_FL_PUNCH_HOLE)
-   return hugetlbfs_punch_hole(inode, offset, len);
+   if (mode & FALLOC_FL_PUNCH_HOLE) {
+   error = hugetlbfs_punch_hole(inode, offset, len);
+   goto out_nolock;
+   }
 
/*
 * Default preallocate case.
@@ -918,6 +924,9 @@ static long hugetlbfs_fallocate(struct file *file, int 
mode, loff_t offset,
inode_set_ctime_current(inode);
 out:
inode_unlock(inode);
+
+out_nolock:
+   trace_hugetlbfs_fallocate(inode, mode, offset, len, error);
return error;
 }
 
@@ -934,6 +943,8 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
if (error)
return error;
 
+   trace_hugetlbfs_setattr(inode, dentry, attr);
+
if (ia_valid & ATTR_SIZE) {
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
@@ -1032,6 +1043,7 @@ static struct inode *hugetlbfs_get_inode(struct 
super_block *sb,
break;
}
lockdep_annotate_inode_mutex_key(inode);
+   trace_hugetlbfs_alloc_inode(inode, dir, mode);
} else {
if (resv_map)
kref_put(_map->refs, resv_map_release);
@@ -1274,6 +1286,7 @@ static struct inode *hugetlbfs_alloc_inode(struct 
super_block *sb)
 
 static void hugetlbfs_free_inode(struct inode *inode)
 {
+   trace_hugetlbfs_free_inode(inode);
kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
 }
 
-- 
2.34.1




[PATCH v2 0/2] Introduce tracepoint for hugetlbfs

2024-07-03 Thread Hongbo Li
Here we add some basic tracepoints for debugging hugetlbfs: {alloc, free,
evict}_inode, setattr and fallocate.

v1 can be found at:
https://lore.kernel.org/linux-mm/20240701194906.3a9b6...@gandalf.local.home/T/

Changes since v1:
  - Decrease the parameters for setattr tracer suggested by Steve and Mathieu.
  - Replace current_user_ns() with init_user_ns when translate uid/gid.

Hongbo Li (2):
  hugetlbfs: support tracepoint
  hugetlbfs: use tracepoints in hugetlbfs functions.

 MAINTAINERS  |   1 +
 fs/hugetlbfs/inode.c |  17 +++-
 include/trace/events/hugetlbfs.h | 160 +++
 3 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/hugetlbfs.h

-- 
2.34.1




[PATCH v2 1/2] hugetlbfs: support tracepoint

2024-07-03 Thread Hongbo Li
Add basic tracepoints for {alloc, evict, free}_inode, setattr and
fallocate. These can help users to debug hugetlbfs more conveniently.

Signed-off-by: Hongbo Li 
---
 MAINTAINERS  |   1 +
 include/trace/events/hugetlbfs.h | 160 +++
 2 files changed, 161 insertions(+)
 create mode 100644 include/trace/events/hugetlbfs.h

diff --git a/MAINTAINERS b/MAINTAINERS
index cd2ca0c3158e..865c48e92d40 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10188,6 +10188,7 @@ F:  Documentation/mm/hugetlbfs_reserv.rst
 F: Documentation/mm/vmemmap_dedup.rst
 F: fs/hugetlbfs/
 F: include/linux/hugetlb.h
+F: include/trace/events/hugetlbfs.h
 F: mm/hugetlb.c
 F: mm/hugetlb_vmemmap.c
 F: mm/hugetlb_vmemmap.h
diff --git a/include/trace/events/hugetlbfs.h b/include/trace/events/hugetlbfs.h
new file mode 100644
index ..975f584f6f51
--- /dev/null
+++ b/include/trace/events/hugetlbfs.h
@@ -0,0 +1,160 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hugetlbfs
+
+#if !defined(_TRACE_HUGETLBFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HUGETLBFS_H
+
+#include 
+
+TRACE_EVENT(hugetlbfs_alloc_inode,
+
+   TP_PROTO(struct inode *inode, struct inode *dir, int mode),
+
+   TP_ARGS(inode, dir, mode),
+
+   TP_STRUCT__entry(
+   __field(dev_t,  dev)
+   __field(ino_t,  ino)
+   __field(ino_t,  dir)
+   __field(__u16,  mode)
+   ),
+
+   TP_fast_assign(
+   __entry->dev= inode->i_sb->s_dev;
+   __entry->ino= inode->i_ino;
+   __entry->dir= dir->i_ino;
+   __entry->mode   = mode;
+   ),
+
+   TP_printk("dev = (%d,%d), ino = %lu, dir = %lu, mode = 0%o",
+   MAJOR(__entry->dev), MINOR(__entry->dev),
+   (unsigned long) __entry->ino,
+   (unsigned long) __entry->dir, __entry->mode)
+);
+
+DECLARE_EVENT_CLASS(hugetlbfs__inode,
+
+   TP_PROTO(struct inode *inode),
+
+   TP_ARGS(inode),
+
+   TP_STRUCT__entry(
+   __field(dev_t,  dev)
+   __field(ino_t,  ino)
+   __field(__u16,  mode)
+   __field(loff_t, size)
+   __field(unsigned int,   nlink)
+   __field(unsigned int,   seals)
+   __field(blkcnt_t,   blocks)
+   ),
+
+   TP_fast_assign(
+   __entry->dev= inode->i_sb->s_dev;
+   __entry->ino= inode->i_ino;
+   __entry->mode   = inode->i_mode;
+   __entry->size   = inode->i_size;
+   __entry->nlink  = inode->i_nlink;
+   __entry->seals  = HUGETLBFS_I(inode)->seals;
+   __entry->blocks = inode->i_blocks;
+   ),
+
+   TP_printk("dev = (%d,%d), ino = %lu, i_mode = 0%o, i_size = %lld, 
i_nlink = %u, seals = %u, i_blocks = %llu",
+   MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) 
__entry->ino,
+   __entry->mode, __entry->size, __entry->nlink, __entry->seals,
+   (unsigned long long)__entry->blocks)
+);
+
+DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_evict_inode,
+
+   TP_PROTO(struct inode *inode),
+
+   TP_ARGS(inode)
+);
+
+DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_free_inode,
+
+   TP_PROTO(struct inode *inode),
+
+   TP_ARGS(inode)
+);
+
+TRACE_EVENT(hugetlbfs_setattr,
+
+   TP_PROTO(struct inode *inode, struct dentry *dentry,
+   struct iattr *attr),
+
+   TP_ARGS(inode, dentry, attr),
+
+   TP_STRUCT__entry(
+   __field(dev_t,  dev)
+   __field(ino_t,  ino)
+   __field(unsigned int,   d_len)
+   __string(d_name,dentry->d_name.name)
+   __field(unsigned int,   ia_valid)
+   __field(unsigned int,   ia_mode)
+   __field(unsigned int,   ia_uid)
+   __field(unsigned int,   ia_gid)
+   __field(loff_t, old_size)
+   __field(loff_t, ia_size)
+   ),
+
+   TP_fast_assign(
+   __entry->dev= inode->i_sb->s_dev;
+   __entry->ino= inode->i_ino;
+   __entry->d_len  = dentry->d_name.len;
+   __assign_str(d_name);
+   __entry->ia_valid   = attr->ia_valid;
+   __entry->ia_mode= attr->ia_mode;
+   __entry->ia_uid = from_kuid_munged(current_user_ns(), 
attr->ia_uid);
+   __entry->ia_gid = from_kgid_munged(current_user_ns(), 
attr->ia_gid);
+   __entry->old_size   = inode->i_size;
+   __entry->ia_size= attr->ia_size;
+   ),
+
+   TP_printk("dev = (%d,%d), ino = %lu, 

Re: [PATCH v2 01/12] uprobes: update outdated comment

2024-07-03 Thread Andrii Nakryiko
On Wed, Jul 3, 2024 at 4:40 AM Oleg Nesterov  wrote:
>
> Sorry for the late reply. I'll try to read this version/discussion
> when I have time... yes, I have already promised this before, sorry :/
>

No worries, there will be v3 next week (I'm going on short PTO
starting tomorrow). So you still have time. I appreciate you looking
at it, though!

> On 07/01, Andrii Nakryiko wrote:
> >
> > There is no task_struct passed into get_user_pages_remote() anymore,
> > drop the parts of comment mentioning NULL tsk, it's just confusing at
> > this point.
>
> Agreed.
>
> > @@ -2030,10 +2030,8 @@ static int is_trap_at_addr(struct mm_struct *mm, 
> > unsigned long vaddr)
> >   goto out;
> >
> >   /*
> > -  * The NULL 'tsk' here ensures that any faults that occur here
> > -  * will not be accounted to the task.  'mm' *is* current->mm,
> > -  * but we treat this as a 'remote' access since it is
> > -  * essentially a kernel access to the memory.
> > +  * 'mm' *is* current->mm, but we treat this as a 'remote' access since
> > +  * it is essentially a kernel access to the memory.
> >*/
> >   result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, , NULL);
>
> OK, this makes it less confusing, so
>
> Acked-by: Oleg Nesterov 
>
>
> -
> but it still looks confusing to me. This code used to pass tsk = NULL
> only to avoid tsk->maj/min_flt++ in faultin_page().
>
> But today mm_account_fault() increments these counters without checking
> FAULT_FLAG_REMOTE, mm == current->mm, so it seems it would be better to
> just use get_user_pages() and remove this comment?
>
> Oleg.
>



Re: [PATCH v2 02/12] uprobes: correct mmap_sem locking assumptions in uprobe_write_opcode()

2024-07-03 Thread Google
On Wed, 3 Jul 2024 11:25:35 -0700
Andrii Nakryiko  wrote:

> On Wed, Jul 3, 2024 at 6:15 AM Masami Hiramatsu  wrote:
> >
> > On Mon,  1 Jul 2024 15:39:25 -0700
> > Andrii Nakryiko  wrote:
> >
> > > It seems like uprobe_write_opcode() doesn't require writer locked
> > > mmap_sem, any lock (reader or writer) should be sufficient. This was
> > > established in a discussion in [0] and looking through existing code
> > > seems to confirm that there is no need for write-locked mmap_sem.
> > >
> > > Fix the comment to state this clearly.
> > >
> > >   [0] 
> > > https://lore.kernel.org/linux-trace-kernel/20240625190748.gc14...@redhat.com/
> > >
> > > Fixes: 29dedee0e693 ("uprobes: Add mem_cgroup_charge_anon() into 
> > > uprobe_write_opcode()")
> >
> > nit: why this has Fixes but [01/12] doesn't?
> >
> > Should I pick both to fixes branch?
> 
> I'd keep both of them in probes/for-next, tbh. They are not literally
> fixing anything, just cleaning up comments. I can drop the Fixes tag
> from this one, if you'd like.

Got it. Usually cleanup patch will not have Fixed tag, so if you don't mind,
please drop it.

Thank you,

> 
> >
> > Thank you,
> >
> > > Signed-off-by: Andrii Nakryiko 
> > > ---
> > >  kernel/events/uprobes.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > >
> > > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > > index 081821fd529a..f87049c08ee9 100644
> > > --- a/kernel/events/uprobes.c
> > > +++ b/kernel/events/uprobes.c
> > > @@ -453,7 +453,7 @@ static int update_ref_ctr(struct uprobe *uprobe, 
> > > struct mm_struct *mm,
> > >   * @vaddr: the virtual address to store the opcode.
> > >   * @opcode: opcode to be written at @vaddr.
> > >   *
> > > - * Called with mm->mmap_lock held for write.
> > > + * Called with mm->mmap_lock held for read or write.
> > >   * Return 0 (success) or a negative errno.
> > >   */
> > >  int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct 
> > > *mm,
> > > --
> > > 2.43.0
> > >
> >
> >
> > --
> > Masami Hiramatsu (Google) 


-- 
Masami Hiramatsu (Google) 



Re: [PATCH v2 04/12] uprobes: revamp uprobe refcounting and lifetime management

2024-07-03 Thread Andrii Nakryiko
On Wed, Jul 3, 2024 at 6:36 AM Peter Zijlstra  wrote:
>
> On Mon, Jul 01, 2024 at 03:39:27PM -0700, Andrii Nakryiko wrote:
>
> > One, attempted initially, way to solve this is through using
> > atomic_inc_not_zero() approach, turning get_uprobe() into
> > try_get_uprobe(),
>
> This is the canonical thing to do. Everybody does this.

Sure, and I provided arguments why I don't do it. Can you provide your
counter argument, please? "Everybody does this." is hardly one.

>
> > which can fail to bump refcount if uprobe is already
> > destined to be destroyed. This, unfortunately, turns out to be a rather
> > expensive due to underlying cmpxchg() operation in
> > atomic_inc_not_zero() and scales rather poorly with increased amount of
> > parallel threads triggering uprobes.
>
> Different archs different trade-offs. You'll not see this on LL/SC archs
> for example.

Clearly x86-64 is the highest priority target, and I've shown that it
benefits from atomic addition vs cmpxchg. Sure, other architecture
might benefit less. But will atomic addition be slower than cmpxchg on
any other architecture?

It's clearly beneficial for x86-64 and not regressing other
architectures, right?

>
> > Furthermore, CPU profiling showed the following overall CPU usage:
> >   - try_get_uprobe (19.3%) + put_uprobe (8.2%) = 27.5% CPU usage for
> > atomic_inc_not_zero approach;
> >   - __get_uprobe (12.3%) + put_uprobe (9.9%) = 22.2% CPU usage for
> > atomic_add_and_return approach implemented by this patch.
>
> I think those numbers suggest trying to not have a refcount in the first
> place. Both are pretty terrible, yes one is less terrible than the
> other, but still terrible.

Good, we are on the same page here, yes.

>
> Specifically, I'm thinking it is the refcounting in handlw_swbp() that
> is actually the problem, all the other stuff is noise.
>
> So if you have SRCU protected consumers, what is the reason for still
> having a refcount in handlw_swbp() ? Simply have the whole of it inside
> a single SRCU critical section, then all consumers you find get a hit.

That's the goal (except SRCU vs RCU Tasks Trace) and that's the next
step. I didn't want to add all that complexity to an already pretty
big and complex patch set. I do believe that batch APIs are the first
necessary step.

Your innocuous "// XXX amortize / batch" comment below is *the major
point of this patch set*. Try to appreciate that. It's not a small
todo, it took this entire patch set to allow for that.

Now, if you are so against percpu RW semapshore, I can just drop the
last patch for now, but the rest is necessary regardless.

Note how I didn't really touch locking *at all*. uprobes_treelock used
to be a spinlock, which we 1-to-1 replaced with rw_spinlock. And now
I'm replacing it, again 1-to-1, with percpu RW semaphore. Specifically
not to entangle batching with the locking schema changes.

>
> Hmm, return probes are a pain, they require the uprobe to stay extant
> between handle_swbp() and handle_trampoline(). I'm thinking we can do
> that with SRCU as well.

I don't think we can, and I'm surprised you don't think that way.

uretprobe might never be triggered for various reasons. Either user
space never returns from the function, or uretprobe was never
installed in the right place (and so uprobe part will trigger, but
there will never be returning probe triggering). I don't think it's
acceptable to delay whole global uprobes SRCU unlocking indefinitely
and keep that at user space's code will.

So, with that, I think refcounting *for return probe* will have to
stay. And will have to be fast.

>
> When I cobble all that together (it really shouldn't be one patch, but
> you get the idea I hope) it looks a little something like the below.
>
> I *think* it should work, but perhaps I've missed something?

Well, at the very least you missed that we can't delay SRCU (or any
other sleepable RCU flavor) potentially indefinitely for uretprobes,
which are completely under user space control.

>
> TL;DR replace treelock with seqcount+SRCU
>   replace register_rwsem with SRCU
>   replace handle_swbp() refcount with SRCU
>   replace return_instance refcount with a second SRCU

So, as I mentioned. I haven't considered seqcount just yet, and I will
think that through. This patch set was meant to add batched API to
unblock all of the above you describe. Percpu RW semaphore switch was
a no-brainer with batched APIs, so I went for that to get more
performance with zero added effort and complexity. If you hate that
part, I can drop it. But batching APIs are unavoidable, no matter what
specific RCU-protected locking schema we end up doing.

Can we agree on that and move this forward, please?

>
> Paul, I had to do something vile with SRCU. The basic problem is that we
> want to keep a SRCU critical section across fork(), which leads to both
> parent and child doing srcu_read_unlock(, idx). As such, I need an
> extra increment on the @idx ssp counter to even 

Re: [PATCH v2 02/12] uprobes: correct mmap_sem locking assumptions in uprobe_write_opcode()

2024-07-03 Thread Andrii Nakryiko
On Wed, Jul 3, 2024 at 6:15 AM Masami Hiramatsu  wrote:
>
> On Mon,  1 Jul 2024 15:39:25 -0700
> Andrii Nakryiko  wrote:
>
> > It seems like uprobe_write_opcode() doesn't require writer locked
> > mmap_sem, any lock (reader or writer) should be sufficient. This was
> > established in a discussion in [0] and looking through existing code
> > seems to confirm that there is no need for write-locked mmap_sem.
> >
> > Fix the comment to state this clearly.
> >
> >   [0] 
> > https://lore.kernel.org/linux-trace-kernel/20240625190748.gc14...@redhat.com/
> >
> > Fixes: 29dedee0e693 ("uprobes: Add mem_cgroup_charge_anon() into 
> > uprobe_write_opcode()")
>
> nit: why this has Fixes but [01/12] doesn't?
>
> Should I pick both to fixes branch?

I'd keep both of them in probes/for-next, tbh. They are not literally
fixing anything, just cleaning up comments. I can drop the Fixes tag
from this one, if you'd like.

>
> Thank you,
>
> > Signed-off-by: Andrii Nakryiko 
> > ---
> >  kernel/events/uprobes.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> > index 081821fd529a..f87049c08ee9 100644
> > --- a/kernel/events/uprobes.c
> > +++ b/kernel/events/uprobes.c
> > @@ -453,7 +453,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct 
> > mm_struct *mm,
> >   * @vaddr: the virtual address to store the opcode.
> >   * @opcode: opcode to be written at @vaddr.
> >   *
> > - * Called with mm->mmap_lock held for write.
> > + * Called with mm->mmap_lock held for read or write.
> >   * Return 0 (success) or a negative errno.
> >   */
> >  int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
> > --
> > 2.43.0
> >
>
>
> --
> Masami Hiramatsu (Google) 



Re: [PATCH v2 01/12] uprobes: update outdated comment

2024-07-03 Thread Andrii Nakryiko
On Wed, Jul 3, 2024 at 4:40 AM Oleg Nesterov  wrote:
>
> Sorry for the late reply. I'll try to read this version/discussion
> when I have time... yes, I have already promised this before, sorry :/
>
> On 07/01, Andrii Nakryiko wrote:
> >
> > There is no task_struct passed into get_user_pages_remote() anymore,
> > drop the parts of comment mentioning NULL tsk, it's just confusing at
> > this point.
>
> Agreed.
>
> > @@ -2030,10 +2030,8 @@ static int is_trap_at_addr(struct mm_struct *mm, 
> > unsigned long vaddr)
> >   goto out;
> >
> >   /*
> > -  * The NULL 'tsk' here ensures that any faults that occur here
> > -  * will not be accounted to the task.  'mm' *is* current->mm,
> > -  * but we treat this as a 'remote' access since it is
> > -  * essentially a kernel access to the memory.
> > +  * 'mm' *is* current->mm, but we treat this as a 'remote' access since
> > +  * it is essentially a kernel access to the memory.
> >*/
> >   result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, , NULL);
>
> OK, this makes it less confusing, so
>
> Acked-by: Oleg Nesterov 
>
>
> -
> but it still looks confusing to me. This code used to pass tsk = NULL
> only to avoid tsk->maj/min_flt++ in faultin_page().
>
> But today mm_account_fault() increments these counters without checking
> FAULT_FLAG_REMOTE, mm == current->mm, so it seems it would be better to
> just use get_user_pages() and remove this comment?

I don't know, it was a drive-by cleanup which I'm starting to regret already :)

>
> Oleg.
>



Re: [PATCH v2 05/12] uprobes: move offset and ref_ctr_offset into uprobe_consumer

2024-07-03 Thread Andrii Nakryiko
On Wed, Jul 3, 2024 at 3:13 AM Masami Hiramatsu
 wrote:
>
> On Wed, 3 Jul 2024 10:13:15 +0200
> Peter Zijlstra  wrote:
>
> > On Mon, Jul 01, 2024 at 03:39:28PM -0700, Andrii Nakryiko wrote:
> > > Simplify uprobe registration/unregistration interfaces by making offset
> > > and ref_ctr_offset part of uprobe_consumer "interface". In practice, all
> > > existing users already store these fields somewhere in uprobe_consumer's
> > > containing structure, so this doesn't pose any problem. We just move
> > > some fields around.
> > >
> > > On the other hand, this simplifies uprobe_register() and
> > > uprobe_unregister() API by having only struct uprobe_consumer as one
> > > thing representing attachment/detachment entity. This makes batched
> > > versions of uprobe_register() and uprobe_unregister() simpler.
> > >
> > > This also makes uprobe_register_refctr() unnecessary, so remove it and
> > > simplify consumers.
> > >
> > > No functional changes intended.
> > >
> > > Acked-by: Masami Hiramatsu (Google) 
> > > Signed-off-by: Andrii Nakryiko 
> > > ---
> > >  include/linux/uprobes.h   | 18 +++
> > >  kernel/events/uprobes.c   | 19 ++-
> > >  kernel/trace/bpf_trace.c  | 21 +++-
> > >  kernel/trace/trace_uprobe.c   | 53 ---
> > >  .../selftests/bpf/bpf_testmod/bpf_testmod.c   | 22 
> > >  5 files changed, 55 insertions(+), 78 deletions(-)
> > >
> > > diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> > > index b503fafb7fb3..a75ba37ce3c8 100644
> > > --- a/include/linux/uprobes.h
> > > +++ b/include/linux/uprobes.h
> > > @@ -42,6 +42,11 @@ struct uprobe_consumer {
> > > enum uprobe_filter_ctx ctx,
> > > struct mm_struct *mm);
> > >
> > > +   /* associated file offset of this probe */
> > > +   loff_t offset;
> > > +   /* associated refctr file offset of this probe, or zero */
> > > +   loff_t ref_ctr_offset;
> > > +   /* for internal uprobe infra use, consumers shouldn't touch fields 
> > > below */
> > > struct uprobe_consumer *next;
> > >  };
> > >
> > > @@ -110,10 +115,9 @@ extern bool is_trap_insn(uprobe_opcode_t *insn);
> > >  extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
> > >  extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
> > >  extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct 
> > > mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
> > > -extern int uprobe_register(struct inode *inode, loff_t offset, struct 
> > > uprobe_consumer *uc);
> > > -extern int uprobe_register_refctr(struct inode *inode, loff_t offset, 
> > > loff_t ref_ctr_offset, struct uprobe_consumer *uc);
> > > +extern int uprobe_register(struct inode *inode, struct uprobe_consumer 
> > > *uc);
> > >  extern int uprobe_apply(struct inode *inode, loff_t offset, struct 
> > > uprobe_consumer *uc, bool);
> > > -extern void uprobe_unregister(struct inode *inode, loff_t offset, struct 
> > > uprobe_consumer *uc);
> > > +extern void uprobe_unregister(struct inode *inode, struct 
> > > uprobe_consumer *uc);
> >
> > It seems very weird and unnatural to split inode and offset like this.
> > The whole offset thing only makes sense within the context of an inode.
>
> Hm, so would you mean we should have inode inside the uprobe_consumer?
> If so, I think it is reasonable.
>

I don't think so, for at least two reasons.

1) We will be wasting 8 bytes per consumer saving exactly the same
inode pointer for no good reason, while uprobe itself already stores
this inode. One can argue that having offset and ref_ctr_offset inside
uprobe_consumer is wasteful, in principle, and I agree. But all
existing users already store them in the same struct that contains
uprobe_consumer, so we are not regressing anything, while making the
interface simpler. We can always optimize that further, if necessary,
but right now that would give us nothing.

But moving inode into uprobe_consumer will regress memory usage.

2) In the context of batched APIs, offset and ref_ctr_offset varies
with each uprobe_consumer, while inode is explicitly the same for all
consumers in that batch. This API makes it very clear.

Technically, we can remove inode completely from *uprobe_unregister*,
because we now can access it from uprobe_consumer->uprobe->inode, but
it still feels right for symmetry and explicitly making a point that
callers should ensure that inode is kept alive.


> Thank you,
>
> >
> > So yeah, lets not do this.
>
>
> --
> Masami Hiramatsu



Re: [PATCH v2 04/12] uprobes: revamp uprobe refcounting and lifetime management

2024-07-03 Thread Peter Zijlstra
On Mon, Jul 01, 2024 at 03:39:27PM -0700, Andrii Nakryiko wrote:

> One, attempted initially, way to solve this is through using
> atomic_inc_not_zero() approach, turning get_uprobe() into
> try_get_uprobe(),

This is the canonical thing to do. Everybody does this.

> which can fail to bump refcount if uprobe is already
> destined to be destroyed. This, unfortunately, turns out to be a rather
> expensive due to underlying cmpxchg() operation in
> atomic_inc_not_zero() and scales rather poorly with increased amount of
> parallel threads triggering uprobes.

Different archs different trade-offs. You'll not see this on LL/SC archs
for example.

> Furthermore, CPU profiling showed the following overall CPU usage:
>   - try_get_uprobe (19.3%) + put_uprobe (8.2%) = 27.5% CPU usage for
> atomic_inc_not_zero approach;
>   - __get_uprobe (12.3%) + put_uprobe (9.9%) = 22.2% CPU usage for
> atomic_add_and_return approach implemented by this patch.

I think those numbers suggest trying to not have a refcount in the first
place. Both are pretty terrible, yes one is less terrible than the
other, but still terrible.

Specifically, I'm thinking it is the refcounting in handlw_swbp() that
is actually the problem, all the other stuff is noise. 

So if you have SRCU protected consumers, what is the reason for still
having a refcount in handlw_swbp() ? Simply have the whole of it inside
a single SRCU critical section, then all consumers you find get a hit.

Hmm, return probes are a pain, they require the uprobe to stay extant
between handle_swbp() and handle_trampoline(). I'm thinking we can do
that with SRCU as well.

When I cobble all that together (it really shouldn't be one patch, but
you get the idea I hope) it looks a little something like the below.

I *think* it should work, but perhaps I've missed something?

TL;DR replace treelock with seqcount+SRCU
  replace register_rwsem with SRCU
  replace handle_swbp() refcount with SRCU
  replace return_instance refcount with a second SRCU

Paul, I had to do something vile with SRCU. The basic problem is that we
want to keep a SRCU critical section across fork(), which leads to both
parent and child doing srcu_read_unlock(, idx). As such, I need an
extra increment on the @idx ssp counter to even things out, see
__srcu_read_clone_lock().

---
 include/linux/rbtree.h  |  45 +
 include/linux/srcu.h|   2 +
 include/linux/uprobes.h |   2 +
 kernel/events/uprobes.c | 166 +++-
 kernel/rcu/srcutree.c   |   5 ++
 5 files changed, 161 insertions(+), 59 deletions(-)

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index f7edca369eda..9847fa58a287 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -244,6 +244,31 @@ rb_find_add(struct rb_node *node, struct rb_root *tree,
return NULL;
 }
 
+static __always_inline struct rb_node *
+rb_find_add_rcu(struct rb_node *node, struct rb_root *tree,
+   int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+   struct rb_node **link = >rb_node;
+   struct rb_node *parent = NULL;
+   int c;
+
+   while (*link) {
+   parent = *link;
+   c = cmp(node, parent);
+
+   if (c < 0)
+   link = >rb_left;
+   else if (c > 0)
+   link = >rb_right;
+   else
+   return parent;
+   }
+
+   rb_link_node_rcu(node, parent, link);
+   rb_insert_color(node, tree);
+   return NULL;
+}
+
 /**
  * rb_find() - find @key in tree @tree
  * @key: key to match
@@ -272,6 +297,26 @@ rb_find(const void *key, const struct rb_root *tree,
return NULL;
 }
 
+static __always_inline struct rb_node *
+rb_find_rcu(const void *key, const struct rb_root *tree,
+   int (*cmp)(const void *key, const struct rb_node *))
+{
+   struct rb_node *node = tree->rb_node;
+
+   while (node) {
+   int c = cmp(key, node);
+
+   if (c < 0)
+   node = rcu_dereference_raw(node->rb_left);
+   else if (c > 0)
+   node = rcu_dereference_raw(node->rb_right);
+   else
+   return node;
+   }
+
+   return NULL;
+}
+
 /**
  * rb_find_first() - find the first @key in @tree
  * @key: key to match
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 236610e4a8fa..9b14acecbb9d 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -55,7 +55,9 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *head,
void (*func)(struct rcu_head *head));
 void cleanup_srcu_struct(struct srcu_struct *ssp);
 int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
+void __srcu_read_clone_lock(struct srcu_struct *ssp, int idx);
 void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
+
 void synchronize_srcu(struct srcu_struct *ssp);
 unsigned long 

Re: [PATCH v2 02/12] uprobes: correct mmap_sem locking assumptions in uprobe_write_opcode()

2024-07-03 Thread Google
On Mon,  1 Jul 2024 15:39:25 -0700
Andrii Nakryiko  wrote:

> It seems like uprobe_write_opcode() doesn't require writer locked
> mmap_sem, any lock (reader or writer) should be sufficient. This was
> established in a discussion in [0] and looking through existing code
> seems to confirm that there is no need for write-locked mmap_sem.
> 
> Fix the comment to state this clearly.
> 
>   [0] 
> https://lore.kernel.org/linux-trace-kernel/20240625190748.gc14...@redhat.com/
> 
> Fixes: 29dedee0e693 ("uprobes: Add mem_cgroup_charge_anon() into 
> uprobe_write_opcode()")

nit: why this has Fixes but [01/12] doesn't?

Should I pick both to fixes branch?

Thank you,

> Signed-off-by: Andrii Nakryiko 
> ---
>  kernel/events/uprobes.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 081821fd529a..f87049c08ee9 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -453,7 +453,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct 
> mm_struct *mm,
>   * @vaddr: the virtual address to store the opcode.
>   * @opcode: opcode to be written at @vaddr.
>   *
> - * Called with mm->mmap_lock held for write.
> + * Called with mm->mmap_lock held for read or write.
>   * Return 0 (success) or a negative errno.
>   */
>  int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
> -- 
> 2.43.0
> 


-- 
Masami Hiramatsu (Google) 



Re: [PATCH v2 02/12] uprobes: correct mmap_sem locking assumptions in uprobe_write_opcode()

2024-07-03 Thread Oleg Nesterov
On 07/01, Andrii Nakryiko wrote:
>
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -453,7 +453,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct 
> mm_struct *mm,
>   * @vaddr: the virtual address to store the opcode.
>   * @opcode: opcode to be written at @vaddr.
>   *
> - * Called with mm->mmap_lock held for write.
> + * Called with mm->mmap_lock held for read or write.
>   * Return 0 (success) or a negative errno.

Thanks,

Acked-by: Oleg Nesterov 


I'll try to send the patch which explains the reasons for mmap_write_lock()
in register_for_each_vma() later.

Oleg.




Re: [PATCH v2 01/12] uprobes: update outdated comment

2024-07-03 Thread Oleg Nesterov
Sorry for the late reply. I'll try to read this version/discussion
when I have time... yes, I have already promised this before, sorry :/

On 07/01, Andrii Nakryiko wrote:
>
> There is no task_struct passed into get_user_pages_remote() anymore,
> drop the parts of comment mentioning NULL tsk, it's just confusing at
> this point.

Agreed.

> @@ -2030,10 +2030,8 @@ static int is_trap_at_addr(struct mm_struct *mm, 
> unsigned long vaddr)
>   goto out;
>
>   /*
> -  * The NULL 'tsk' here ensures that any faults that occur here
> -  * will not be accounted to the task.  'mm' *is* current->mm,
> -  * but we treat this as a 'remote' access since it is
> -  * essentially a kernel access to the memory.
> +  * 'mm' *is* current->mm, but we treat this as a 'remote' access since
> +  * it is essentially a kernel access to the memory.
>*/
>   result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, , NULL);

OK, this makes it less confusing, so

Acked-by: Oleg Nesterov 


-
but it still looks confusing to me. This code used to pass tsk = NULL
only to avoid tsk->maj/min_flt++ in faultin_page().

But today mm_account_fault() increments these counters without checking
FAULT_FLAG_REMOTE, mm == current->mm, so it seems it would be better to
just use get_user_pages() and remove this comment?

Oleg.




Re: [PATCH v2 05/12] uprobes: move offset and ref_ctr_offset into uprobe_consumer

2024-07-03 Thread Masami Hiramatsu
On Wed, 3 Jul 2024 10:13:15 +0200
Peter Zijlstra  wrote:

> On Mon, Jul 01, 2024 at 03:39:28PM -0700, Andrii Nakryiko wrote:
> > Simplify uprobe registration/unregistration interfaces by making offset
> > and ref_ctr_offset part of uprobe_consumer "interface". In practice, all
> > existing users already store these fields somewhere in uprobe_consumer's
> > containing structure, so this doesn't pose any problem. We just move
> > some fields around.
> > 
> > On the other hand, this simplifies uprobe_register() and
> > uprobe_unregister() API by having only struct uprobe_consumer as one
> > thing representing attachment/detachment entity. This makes batched
> > versions of uprobe_register() and uprobe_unregister() simpler.
> > 
> > This also makes uprobe_register_refctr() unnecessary, so remove it and
> > simplify consumers.
> > 
> > No functional changes intended.
> > 
> > Acked-by: Masami Hiramatsu (Google) 
> > Signed-off-by: Andrii Nakryiko 
> > ---
> >  include/linux/uprobes.h   | 18 +++
> >  kernel/events/uprobes.c   | 19 ++-
> >  kernel/trace/bpf_trace.c  | 21 +++-
> >  kernel/trace/trace_uprobe.c   | 53 ---
> >  .../selftests/bpf/bpf_testmod/bpf_testmod.c   | 22 
> >  5 files changed, 55 insertions(+), 78 deletions(-)
> > 
> > diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> > index b503fafb7fb3..a75ba37ce3c8 100644
> > --- a/include/linux/uprobes.h
> > +++ b/include/linux/uprobes.h
> > @@ -42,6 +42,11 @@ struct uprobe_consumer {
> > enum uprobe_filter_ctx ctx,
> > struct mm_struct *mm);
> >  
> > +   /* associated file offset of this probe */
> > +   loff_t offset;
> > +   /* associated refctr file offset of this probe, or zero */
> > +   loff_t ref_ctr_offset;
> > +   /* for internal uprobe infra use, consumers shouldn't touch fields 
> > below */
> > struct uprobe_consumer *next;
> >  };
> >  
> > @@ -110,10 +115,9 @@ extern bool is_trap_insn(uprobe_opcode_t *insn);
> >  extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
> >  extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
> >  extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct 
> > mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
> > -extern int uprobe_register(struct inode *inode, loff_t offset, struct 
> > uprobe_consumer *uc);
> > -extern int uprobe_register_refctr(struct inode *inode, loff_t offset, 
> > loff_t ref_ctr_offset, struct uprobe_consumer *uc);
> > +extern int uprobe_register(struct inode *inode, struct uprobe_consumer 
> > *uc);
> >  extern int uprobe_apply(struct inode *inode, loff_t offset, struct 
> > uprobe_consumer *uc, bool);
> > -extern void uprobe_unregister(struct inode *inode, loff_t offset, struct 
> > uprobe_consumer *uc);
> > +extern void uprobe_unregister(struct inode *inode, struct uprobe_consumer 
> > *uc);
> 
> It seems very weird and unnatural to split inode and offset like this.
> The whole offset thing only makes sense within the context of an inode.

Hm, so would you mean we should have inode inside the uprobe_consumer?
If so, I think it is reasonable.

Thank you,

> 
> So yeah, lets not do this.


-- 
Masami Hiramatsu



Re: [PATCH v2 05/12] uprobes: move offset and ref_ctr_offset into uprobe_consumer

2024-07-03 Thread Peter Zijlstra
On Mon, Jul 01, 2024 at 03:39:28PM -0700, Andrii Nakryiko wrote:
> Simplify uprobe registration/unregistration interfaces by making offset
> and ref_ctr_offset part of uprobe_consumer "interface". In practice, all
> existing users already store these fields somewhere in uprobe_consumer's
> containing structure, so this doesn't pose any problem. We just move
> some fields around.
> 
> On the other hand, this simplifies uprobe_register() and
> uprobe_unregister() API by having only struct uprobe_consumer as one
> thing representing attachment/detachment entity. This makes batched
> versions of uprobe_register() and uprobe_unregister() simpler.
> 
> This also makes uprobe_register_refctr() unnecessary, so remove it and
> simplify consumers.
> 
> No functional changes intended.
> 
> Acked-by: Masami Hiramatsu (Google) 
> Signed-off-by: Andrii Nakryiko 
> ---
>  include/linux/uprobes.h   | 18 +++
>  kernel/events/uprobes.c   | 19 ++-
>  kernel/trace/bpf_trace.c  | 21 +++-
>  kernel/trace/trace_uprobe.c   | 53 ---
>  .../selftests/bpf/bpf_testmod/bpf_testmod.c   | 22 
>  5 files changed, 55 insertions(+), 78 deletions(-)
> 
> diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
> index b503fafb7fb3..a75ba37ce3c8 100644
> --- a/include/linux/uprobes.h
> +++ b/include/linux/uprobes.h
> @@ -42,6 +42,11 @@ struct uprobe_consumer {
>   enum uprobe_filter_ctx ctx,
>   struct mm_struct *mm);
>  
> + /* associated file offset of this probe */
> + loff_t offset;
> + /* associated refctr file offset of this probe, or zero */
> + loff_t ref_ctr_offset;
> + /* for internal uprobe infra use, consumers shouldn't touch fields 
> below */
>   struct uprobe_consumer *next;
>  };
>  
> @@ -110,10 +115,9 @@ extern bool is_trap_insn(uprobe_opcode_t *insn);
>  extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
>  extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
>  extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct 
> *mm, unsigned long vaddr, uprobe_opcode_t);
> -extern int uprobe_register(struct inode *inode, loff_t offset, struct 
> uprobe_consumer *uc);
> -extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t 
> ref_ctr_offset, struct uprobe_consumer *uc);
> +extern int uprobe_register(struct inode *inode, struct uprobe_consumer *uc);
>  extern int uprobe_apply(struct inode *inode, loff_t offset, struct 
> uprobe_consumer *uc, bool);
> -extern void uprobe_unregister(struct inode *inode, loff_t offset, struct 
> uprobe_consumer *uc);
> +extern void uprobe_unregister(struct inode *inode, struct uprobe_consumer 
> *uc);

It seems very weird and unnatural to split inode and offset like this.
The whole offset thing only makes sense within the context of an inode.

So yeah, lets not do this.