When uprobes are active, MADV_DONTNEED can discard file-backed pages
that contain uprobe software breakpoint instructions. Because the
uprobe infrastructure does not re-instrument pages on individual page
faults (uprobe_mmap() is only called during VMA creation, not on
page-in), the breakpoints are silently lost once the discarded pages are
re-read from the backing file. The probes stop firing with no error
indication, and the only recovery is to unregister and re-register the
affected uprobes.
Note that MADV_FREE is not affected: it only operates on anonymous VMAs
(madvise_free_single_vma() rejects non-anonymous VMAs with -EINVAL),
while uprobes only instrument file-backed mappings, so the two can never
overlap.
A concrete example is a userspace memory reclamation subsystem that
periodically calls madvise(MADV_DONTNEED) on file-backed text pages to
release memory. This silently clears uprobe breakpoints placed by
eBPF-based security and tracing tools that use uprobes to attach eBPF
programs to user-space functions, causing those tools to stop
functioning within seconds of the first reclamation pass.
Teach madvise_dontneed_free() — the common handler for MADV_DONTNEED,
MADV_DONTNEED_LOCKED and MADV_FREE — to detect, when CONFIG_UPROBES is
enabled, whether the target range contains active uprobes:
- Fast path: if no uprobes are registered system-wide, or the VMA is
not file-backed (uprobes only instrument file-backed mappings, so
anonymous VMAs -- including MADV_FREE targets -- can never contain
breakpoints), or no uprobes are present in the VMA range, proceed
with the discard as before.
- Slow path: when uprobes are detected in the range, use
vma_first_uprobe_addr() to jump directly to each uprobe page via
the rbtree, zapping the clean ranges between them. This is
O(M * log N) where M is the number of uprobes in the range and
N is the total uprobe count, rather than O(pages). madvise()
still returns success, consistent with the advisory nature of
MADV_DONTNEED.
When CONFIG_UPROBES is not configured, the original behaviour is
preserved with no overhead.
To support the above, export vma_has_uprobes() and add new helpers
any_uprobes_registered() and vma_first_uprobe_addr() in the uprobes
subsystem. vma_first_uprobe_addr() returns the page-aligned virtual
address of the lowest-offset uprobe in a given VMA range by leveraging
the (inode, offset)-sorted global rbtree.
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Darko Tominac <[email protected]>
---
include/linux/uprobes.h | 21 +++++++++++
kernel/events/uprobes.c | 79 +++++++++++++++++++++++++++++++++++++++--
mm/madvise.c | 73 +++++++++++++++++++++++++++++++++----
3 files changed, 164 insertions(+), 9 deletions(-)
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index f548fea2adec..9ce5c46fd2e9 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -212,6 +212,11 @@ extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consum
extern void uprobe_unregister_sync(void);
extern int uprobe_mmap(struct vm_area_struct *vma);
extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
+extern bool vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end);
+extern unsigned long vma_first_uprobe_addr(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end);
+extern bool any_uprobes_registered(void);
extern void uprobe_start_dup_mmap(void);
extern void uprobe_end_dup_mmap(void);
extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm);
@@ -278,6 +283,22 @@ static inline void
uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
}
+static inline bool
+vma_has_uprobes(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+{
+ return false;
+}
+static inline unsigned long
+vma_first_uprobe_addr(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+{
+ return 0;
+}
+static inline bool any_uprobes_registered(void)
+{
+ return false;
+}
static inline void uprobe_start_dup_mmap(void)
{
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4084e926e284..0f8aea99b96f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -152,6 +152,19 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
}
+/**
+ * any_uprobes_registered - check if any uprobes are currently registered
+ *
+ * Check whether the global uprobe rbtree has any entries, indicating
+ * that at least one uprobe is currently active in the system.
+ *
+ * Return: true if one or more uprobes are registered, false otherwise.
+ */
+bool any_uprobes_registered(void)
+{
+ return !no_uprobe_events();
+}
+
/**
* is_swbp_insn - check if instruction is breakpoint instruction.
* @insn: instruction to be checked.
@@ -1635,8 +1648,16 @@ int uprobe_mmap(struct vm_area_struct *vma)
return 0;
}
-static bool
-vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+/**
+ * vma_has_uprobes - check whether a vma range contains any uprobes.
+ * @vma: the vma to search.
+ * @start: start address of the range (inclusive).
+ * @end: end address of the range (exclusive).
+ *
+ * Return: true if at least one uprobe is registered in [@start, @end),
+ * false otherwise.
+ */
+bool vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
loff_t min, max;
struct inode *inode;
@@ -1654,6 +1675,60 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
return !!n;
}
+/**
+ * vma_first_uprobe_addr - find first uprobe in a vma range.
+ * @vma: the vma to search.
+ * @start: start address of the range (inclusive).
+ * @end: end address of the range (exclusive).
+ *
+ * Used by madvise to skip directly to uprobe pages.
+ *
+ * Return: the page-aligned virtual address of the first uprobe in
+ * [@start, @end), or 0 if none exists.
+ */
+unsigned long vma_first_uprobe_addr(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ loff_t min, max, first_offset;
+ struct inode *inode;
+ struct rb_node *n, *t;
+ struct uprobe *u;
+
+ /* No uprobes possible on anonymous mappings */
+ if (!vma->vm_file)
+ return 0;
+
+ /* Empty range -- nothing to search */
+ if (start >= end)
+ return 0;
+
+ inode = file_inode(vma->vm_file);
+
+ min = vaddr_to_offset(vma, start);
+ max = min + (end - start) - 1;
+
+ read_lock(&uprobes_treelock);
+ n = find_node_in_range(inode, min, max);
+ if (!n) {
+ read_unlock(&uprobes_treelock);
+ return 0;
+ }
+
+ /* Walk left to find the lowest offset in range */
+ u = rb_entry(n, struct uprobe, rb_node);
+ first_offset = u->offset;
+ for (t = rb_prev(n); t; t = rb_prev(t)) {
+ u = rb_entry(t, struct uprobe, rb_node);
+ if (u->inode != inode || u->offset < min)
+ break;
+ first_offset = u->offset;
+ }
+ read_unlock(&uprobes_treelock);
+
+ /* Return page-aligned vaddr containing this uprobe */
+ return PAGE_ALIGN_DOWN(offset_to_vaddr(vma, first_offset));
+}
+
/*
* Called in context of a munmap of a vma.
*/
diff --git a/mm/madvise.c b/mm/madvise.c
index 69708e953cf5..c73f1131224b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -32,6 +32,7 @@
#include <linux/leafops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
+#include <linux/uprobes.h>
#include <asm/tlb.h>
@@ -862,6 +863,30 @@ static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior)
return 0;
}
+static long madvise_dontneed_free_range(struct madvise_behavior *madv_behavior,
+ unsigned long start, unsigned long end)
+{
+ struct madvise_behavior_range *range = &madv_behavior->range;
+ unsigned long saved_start = range->start;
+ unsigned long saved_end = range->end;
+ int behavior = madv_behavior->behavior;
+ long ret;
+
+ range->start = start;
+ range->end = end;
+
+ if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
+ ret = madvise_dontneed_single_vma(madv_behavior);
+ else if (behavior == MADV_FREE)
+ ret = madvise_free_single_vma(madv_behavior);
+ else
+ ret = -EINVAL;
+
+ range->start = saved_start;
+ range->end = saved_end;
+ return ret;
+}
+
static
bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior)
{
@@ -898,7 +923,7 @@ static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
{
struct mm_struct *mm = madv_behavior->mm;
struct madvise_behavior_range *range = &madv_behavior->range;
- int behavior = madv_behavior->behavior;
+ unsigned long cur, end, uprobe_addr;
if (!madvise_dontneed_free_valid_vma(madv_behavior))
return -EINVAL;
@@ -947,12 +972,46 @@ static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
VM_WARN_ON(range->start > range->end);
}
- if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
- return madvise_dontneed_single_vma(madv_behavior);
- else if (behavior == MADV_FREE)
- return madvise_free_single_vma(madv_behavior);
- else
- return -EINVAL;
+ /*
+ * Preserve uprobes: if any uprobes are active in this VMA range,
+ * avoid discarding pages that contain active breakpoints.
+ *
+ * Fast path: if no uprobes are registered system-wide, or the VMA
+ * is not file-backed (uprobes only instrument file-backed mappings,
+ * so anonymous VMAs can never contain breakpoints), or no uprobes
+ * are present in this VMA range, proceed with the full operation.
+ */
+ if (likely(!any_uprobes_registered()) ||
+ !madv_behavior->vma->vm_file ||
+ !vma_has_uprobes(madv_behavior->vma, range->start, range->end))
+ return madvise_dontneed_free_range(madv_behavior,
+ range->start, range->end);
+
+ /*
+ * Slow path: jump from uprobe to uprobe via rbtree lookup, zapping
+ * the clean range before each uprobe page. This is O(M * log N)
+ * where M is the number of uprobes in the range and N is the total
+ * uprobe count, versus O(pages) for a page-by-page scan. 'cur'
+ * tracks the beginning of the current clean range.
+ */
+ cur = range->start;
+ end = range->end;
+ while (cur < end) {
+ uprobe_addr = vma_first_uprobe_addr(madv_behavior->vma,
+ cur, end);
+ if (!uprobe_addr) {
+ /* No more uprobes - zap the rest */
+ madvise_dontneed_free_range(madv_behavior, cur, end);
+ break;
+ }
+ /* Zap the clean range before the uprobe page */
+ if (cur < uprobe_addr)
+ madvise_dontneed_free_range(madv_behavior, cur,
+ uprobe_addr);
+ /* Skip past the uprobe page */
+ cur = uprobe_addr + PAGE_SIZE;
+ }
+ return 0;
}
static long madvise_populate(struct madvise_behavior *madv_behavior)
--
2.35.6