When uprobes are active, MADV_DONTNEED can discard file-backed pages
that contain uprobe software breakpoint instructions.  Because the
uprobe infrastructure does not re-instrument pages on individual page
faults (uprobe_mmap() is only called during VMA creation, not on
page-in), the breakpoints are silently lost once the discarded pages are
re-read from the backing file.  The probes stop firing with no error
indication, and the only recovery is to unregister and re-register the
affected uprobes.

Note that MADV_FREE is not affected: it only operates on anonymous VMAs
(madvise_free_single_vma() rejects non-anonymous VMAs with -EINVAL),
while uprobes only instrument file-backed mappings, so the two can never
overlap.

A concrete example is a userspace memory reclamation subsystem that
periodically calls madvise(MADV_DONTNEED) on file-backed text pages to
release memory.  This silently clears uprobe breakpoints placed by
eBPF-based security and tracing tools that use uprobes to attach eBPF
programs to user-space functions, causing those tools to stop
functioning within seconds of the first reclamation pass.

Add a check in madvise_dontneed_free(), which handles MADV_DONTNEED,
MADV_DONTNEED_LOCKED and MADV_FREE, that when CONFIG_UPROBES is enabled
detects whether the target range contains active uprobes:

  - Fast path: if no uprobes are registered system-wide, or the VMA is
    not file-backed (uprobes only instrument file-backed mappings, so
    anonymous VMAs -- including MADV_FREE targets -- can never contain
    breakpoints), or no uprobes are present in the VMA range, proceed
    with the discard as before.
  - Slow path: when uprobes are detected in the range, use
    vma_first_uprobe_addr() to jump directly to each uprobe page via
    the rbtree, zapping the clean ranges between them.  This is
    O(M * log N) where M is the number of uprobes in the range and
    N is the total uprobe count, rather than O(pages).  madvise()
    still returns success, consistent with the advisory nature of
    MADV_DONTNEED.

When CONFIG_UPROBES is not configured, the original behaviour is
preserved with no overhead.

To support the above, export vma_has_uprobes() and add new helpers
any_uprobes_registered() and vma_first_uprobe_addr() in the uprobes
subsystem.  vma_first_uprobe_addr() returns the page-aligned virtual
address of the lowest-offset uprobe in a given VMA range by leveraging
the (inode, offset)-sorted global rbtree.

Cc: [email protected]
Cc: [email protected]
Signed-off-by: Darko Tominac <[email protected]>
---
 include/linux/uprobes.h | 21 +++++++++++
 kernel/events/uprobes.c | 79 +++++++++++++++++++++++++++++++++++++++--
 mm/madvise.c            | 73 +++++++++++++++++++++++++++++++++----
 3 files changed, 164 insertions(+), 9 deletions(-)

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index f548fea2adec..9ce5c46fd2e9 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -212,6 +212,11 @@ extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consum
 extern void uprobe_unregister_sync(void);
 extern int uprobe_mmap(struct vm_area_struct *vma);
 extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
+extern bool vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end);
+extern unsigned long vma_first_uprobe_addr(struct vm_area_struct *vma,
+                                          unsigned long start,
+                                          unsigned long end);
+extern bool any_uprobes_registered(void);
 extern void uprobe_start_dup_mmap(void);
 extern void uprobe_end_dup_mmap(void);
 extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm);
@@ -278,6 +283,22 @@ static inline void
 uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
 {
 }
+static inline bool
+vma_has_uprobes(struct vm_area_struct *vma, unsigned long start,
+               unsigned long end)
+{
+       return false;
+}
+static inline unsigned long
+vma_first_uprobe_addr(struct vm_area_struct *vma, unsigned long start,
+                     unsigned long end)
+{
+       return 0;
+}
+static inline bool any_uprobes_registered(void)
+{
+       return false;
+}
 static inline void uprobe_start_dup_mmap(void)
 {
 }
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4084e926e284..0f8aea99b96f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -152,6 +152,19 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
        return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
 }
 
+/**
+ * any_uprobes_registered - check if any uprobes are currently registered
+ *
+ * Check whether the global uprobe rbtree has any entries, indicating
+ * that at least one uprobe is currently active in the system.
+ *
+ * Return: true if one or more uprobes are registered, false otherwise.
+ */
+bool any_uprobes_registered(void)
+{
+       return !no_uprobe_events();
+}
+
 /**
  * is_swbp_insn - check if instruction is breakpoint instruction.
  * @insn: instruction to be checked.
@@ -1635,8 +1648,16 @@ int uprobe_mmap(struct vm_area_struct *vma)
        return 0;
 }
 
-static bool
-vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+/**
+ * vma_has_uprobes - check whether a vma range contains any uprobes.
+ * @vma: the vma to search.
+ * @start: start address of the range (inclusive).
+ * @end: end address of the range (exclusive).
+ *
+ * Return: true if at least one uprobe is registered in [@start, @end),
+ * false otherwise.
+ */
+bool vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
 {
        loff_t min, max;
        struct inode *inode;
@@ -1654,6 +1675,60 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
        return !!n;
 }
 
+/**
+ * vma_first_uprobe_addr - find first uprobe in a vma range.
+ * @vma: the vma to search.
+ * @start: start address of the range (inclusive).
+ * @end: end address of the range (exclusive).
+ *
+ * Used by madvise to skip directly to uprobe pages.
+ *
+ * Return: the page-aligned virtual address of the first uprobe in
+ * [@start, @end), or 0 if none exists.
+ */
+unsigned long vma_first_uprobe_addr(struct vm_area_struct *vma,
+                                   unsigned long start, unsigned long end)
+{
+       loff_t min, max, first_offset;
+       struct inode *inode;
+       struct rb_node *n, *t;
+       struct uprobe *u;
+
+       /* No uprobes possible on anonymous mappings */
+       if (!vma->vm_file)
+               return 0;
+
+       /* Empty range -- nothing to search */
+       if (start >= end)
+               return 0;
+
+       inode = file_inode(vma->vm_file);
+
+       min = vaddr_to_offset(vma, start);
+       max = min + (end - start) - 1;
+
+       read_lock(&uprobes_treelock);
+       n = find_node_in_range(inode, min, max);
+       if (!n) {
+               read_unlock(&uprobes_treelock);
+               return 0;
+       }
+
+       /* Walk left to find the lowest offset in range */
+       u = rb_entry(n, struct uprobe, rb_node);
+       first_offset = u->offset;
+       for (t = rb_prev(n); t; t = rb_prev(t)) {
+               u = rb_entry(t, struct uprobe, rb_node);
+               if (u->inode != inode || u->offset < min)
+                       break;
+               first_offset = u->offset;
+       }
+       read_unlock(&uprobes_treelock);
+
+       /* Return page-aligned vaddr containing this uprobe */
+       return PAGE_ALIGN_DOWN(offset_to_vaddr(vma, first_offset));
+}
+
 /*
  * Called in context of a munmap of a vma.
  */
diff --git a/mm/madvise.c b/mm/madvise.c
index 69708e953cf5..c73f1131224b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -32,6 +32,7 @@
 #include <linux/leafops.h>
 #include <linux/shmem_fs.h>
 #include <linux/mmu_notifier.h>
+#include <linux/uprobes.h>
 
 #include <asm/tlb.h>
 
@@ -862,6 +863,30 @@ static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior)
        return 0;
 }
 
+static long madvise_dontneed_free_range(struct madvise_behavior *madv_behavior,
+                                       unsigned long start, unsigned long end)
+{
+       struct madvise_behavior_range *range = &madv_behavior->range;
+       unsigned long saved_start = range->start;
+       unsigned long saved_end = range->end;
+       int behavior = madv_behavior->behavior;
+       long ret;
+
+       range->start = start;
+       range->end = end;
+
+       if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
+               ret = madvise_dontneed_single_vma(madv_behavior);
+       else if (behavior == MADV_FREE)
+               ret = madvise_free_single_vma(madv_behavior);
+       else
+               ret = -EINVAL;
+
+       range->start = saved_start;
+       range->end = saved_end;
+       return ret;
+}
+
 static
 bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior)
 {
@@ -898,7 +923,7 @@ static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
 {
        struct mm_struct *mm = madv_behavior->mm;
        struct madvise_behavior_range *range = &madv_behavior->range;
-       int behavior = madv_behavior->behavior;
+       unsigned long cur, end, uprobe_addr;
 
        if (!madvise_dontneed_free_valid_vma(madv_behavior))
                return -EINVAL;
@@ -947,12 +972,46 @@ static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
                VM_WARN_ON(range->start > range->end);
        }
 
-       if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
-               return madvise_dontneed_single_vma(madv_behavior);
-       else if (behavior == MADV_FREE)
-               return madvise_free_single_vma(madv_behavior);
-       else
-               return -EINVAL;
+       /*
+        * Preserve uprobes: if any uprobes are active in this VMA range,
+        * avoid discarding pages that contain active breakpoints.
+        *
+        * Fast path: if no uprobes are registered system-wide, or the VMA
+        * is not file-backed (uprobes only instrument file-backed mappings,
+        * so anonymous VMAs can never contain breakpoints), or no uprobes
+        * are present in this VMA range, proceed with the full operation.
+        */
+       if (likely(!any_uprobes_registered()) ||
+           !madv_behavior->vma->vm_file ||
+           !vma_has_uprobes(madv_behavior->vma, range->start, range->end))
+               return madvise_dontneed_free_range(madv_behavior,
+                                                  range->start, range->end);
+
+       /*
+        * Slow path: jump from uprobe to uprobe via rbtree lookup, zapping
+        * the clean range before each uprobe page. This is O(M * log N)
+        * where M is the number of uprobes in the range and N is the total
+        * uprobe count, versus O(pages) for a page-by-page scan. 'cur'
+        * tracks the beginning of the current clean range.
+        */
+       cur = range->start;
+       end = range->end;
+       while (cur < end) {
+               uprobe_addr = vma_first_uprobe_addr(madv_behavior->vma,
+                                                   cur, end);
+               if (!uprobe_addr) {
+                       /* No more uprobes - zap the rest */
+                       madvise_dontneed_free_range(madv_behavior, cur, end);
+                       break;
+               }
+               /* Zap the clean range before the uprobe page */
+               if (cur < uprobe_addr)
+                       madvise_dontneed_free_range(madv_behavior, cur,
+                                                   uprobe_addr);
+               /* Skip past the uprobe page */
+               cur = uprobe_addr + PAGE_SIZE;
+       }
+       return 0;
 }
 
 static long madvise_populate(struct madvise_behavior *madv_behavior)
-- 
2.35.6


Reply via email to