On 30 Apr 21:25, David Hildenbrand wrote:
> > I tried to get the hugepd stuff right but this was the first I heard
> > about it :-) Afaict follow_huge_pmd and friends were already DTRT
> 
> I'll have to have a closer look at some details (the hugepd writability
> check looks a bit odd), but it's mostly what I would have expected!

Ok, in the meantime here is the uprobe change on top of your current
uprobes_cow, trying to address the comments you made in your previous
message. Some of them were not 100% clear to me, so it's a best-effort
patch :-) Again, only lightly tested.
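
For reference, a small userspace illustration of the subpage arithmetic
used by __write_opcode_hugetlb() and __copy_insn() below (illustration
only, not part of the patch; the 2 MiB huge page size and the probe
address are made up):

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))

int main(void)
{
        const unsigned long huge_page_size = 2UL << 20;   /* assumed 2 MiB hstate */
        const unsigned long huge_page_mask = ~(huge_page_size - 1);
        const unsigned long vaddr = 0x7f0000212345UL;      /* made-up probe address */

        /* Base of the huge page mapping vaddr ("vaddr & page_mask" in the patch). */
        unsigned long huge_base = vaddr & huge_page_mask;
        /* PAGE_SIZE subpage within the huge folio that contains vaddr. */
        unsigned long subpage_index = (vaddr & (huge_page_size - 1)) >> PAGE_SHIFT;

        /* Subpage base plus the in-page offset lands back on vaddr. */
        assert(huge_base + subpage_index * PAGE_SIZE + (vaddr & ~PAGE_MASK) == vaddr);

        printf("huge base 0x%lx, subpage %lu, in-page offset 0x%lx\n",
               huge_base, subpage_index, vaddr & ~PAGE_MASK);
        return 0;
}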

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -83,6 +83,10 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
        {}
 };
 
+bool hugetlbfs_mapping(struct address_space *mapping) {
+       return mapping->a_ops == &hugetlbfs_aops;
+}
+
 /*
  * Mask used when checking the page offset value passed in via system
  * calls.  This value will be converted to a loff_t which is signed.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -511,6 +511,8 @@ struct hugetlbfs_sb_info {
        umode_t mode;
 };
 
+bool hugetlbfs_mapping(struct address_space *mapping);
+
 static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 {
        return sb->s_fs_info;
@@ -557,6 +559,8 @@ static inline struct hstate *hstate_inode(struct inode *i)
 {
        return NULL;
 }
+
+static inline bool hugetlbfs_mapping(struct address_space *mapping) { return false; }
 #endif /* !CONFIG_HUGETLBFS */
 
 #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/highmem.h>
+#include <linux/hugetlb.h>
 #include <linux/pagemap.h>     /* read_mapping_page */
 #include <linux/slab.h>
 #include <linux/sched.h>
@@ -120,7 +121,7 @@ struct xol_area {
  */
 static bool valid_vma(struct vm_area_struct *vma, bool is_register)
 {
-       vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+       vm_flags_t flags = VM_MAYEXEC | VM_MAYSHARE;
 
        if (is_register)
                flags |= VM_WRITE;
@@ -177,6 +178,19 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
        kunmap_atomic(kaddr);
 }
 
+static bool compare_pages(struct page *page1, struct page *page2, unsigned long page_size)
+{
+       char *addr1, *addr2;
+       int ret;
+
+       addr1 = kmap_local_page(page1);
+       addr2 = kmap_local_page(page2);
+       ret = memcmp(addr1, addr2, page_size);
+       kunmap_local(addr2);
+       kunmap_local(addr1);
+       return ret == 0;
+}
+
 static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
 {
        uprobe_opcode_t old_opcode;
@@ -366,7 +380,9 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
 }
 
 static bool orig_page_is_identical(struct vm_area_struct *vma,
-               unsigned long vaddr, struct page *page, bool *large)
+               unsigned long vaddr, struct page *page,
+               unsigned long page_size,
+               bool *large)
 {
        const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT;
        struct page *orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
@@ -380,7 +396,7 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,
 
        *large = folio_test_large(orig_folio);
        identical = folio_test_uptodate(orig_folio) &&
-                   pages_identical(page, orig_page);
+                   compare_pages(page, orig_page, page_size);
        folio_put(orig_folio);
        return identical;
 }
@@ -396,6 +412,81 @@ struct uwo_data {
        uprobe_opcode_t opcode;
 };
 
+static int __write_opcode_hugetlb(pte_t *ptep, unsigned long page_mask,
+                                 unsigned long vaddr,
+                                 unsigned long next, struct mm_walk *walk)
+{
+       struct uwo_data *data = walk->private;
+       const bool is_register = !!is_swbp_insn(&data->opcode);
+       pte_t pte = huge_ptep_get(ptep);
+       struct folio *folio;
+       struct page *page;
+       bool large;
+       struct hstate *h = hstate_vma(walk->vma);
+       unsigned subpage_index = (vaddr & (huge_page_size(h) - 1)) >>
+               PAGE_SHIFT;
+
+       if (!pte_present(pte))
+               return UWO_RETRY;
+       page = vm_normal_page(walk->vma, vaddr, pte);
+       if (!page)
+               return UWO_RETRY;
+       folio = page_folio(page);
+
+       /* When unregistering and there is no anon folio anymore, we're done. */
+       if (!folio_test_anon(folio))
+               return is_register ? UWO_RETRY_WRITE_FAULT : UWO_DONE;
+
+       /*
+        * See can_follow_write_pte(): we'd actually prefer requiring a
+        * writable PTE here, but when unregistering we might no longer have
+        * VM_WRITE ...
+        */
+       if (!huge_pte_write(pte)) {
+               if (!PageAnonExclusive(page))
+                       return UWO_RETRY_WRITE_FAULT;
+               if (unlikely(userfaultfd_wp(walk->vma) && huge_pte_uffd_wp(pte)))
+                       return UWO_RETRY_WRITE_FAULT;
+               /* SOFTDIRTY is handled via pte_mkdirty() below. */
+       }
+
+       /* Unmap + flush the TLB, such that we can write atomically. */
+       flush_cache_page(walk->vma, vaddr & page_mask, pte_pfn(pte));
+       pte = huge_ptep_clear_flush(walk->vma, vaddr & page_mask, ptep);
+       copy_to_page(nth_page(page, subpage_index), data->opcode_vaddr,
+                    &data->opcode, UPROBE_SWBP_INSN_SIZE);
+
+       /*
+        * When unregistering, we may only zap a PTE if uffd is disabled and
+        * the folio is not pinned ...
+        */
+       if (is_register || userfaultfd_missing(walk->vma) ||
+           folio_maybe_dma_pinned(folio))
+               goto remap;
+
+       /*
+        * ... the mapped anon page is identical to the original page (that
+        * will get faulted in on next access), and we don't have GUP pins.
+        */
+       if (!orig_page_is_identical(walk->vma, vaddr & page_mask, page,
+                                   huge_page_size(h), &large))
+               goto remap;
+
+       hugetlb_remove_rmap(folio);
+       folio_put(folio);
+       return UWO_DONE;
+remap:
+       /*
+        * Make sure that our copy_to_page() changes become visible before the
+        * set_huge_pte_at() write.
+        */
+       smp_wmb();
+       /* We modified the page. Make sure to mark the PTE dirty. */
+       set_huge_pte_at(walk->mm, vaddr & page_mask, ptep,
+                       huge_pte_mkdirty(pte), huge_page_size(h));
+       return UWO_DONE;
+}
+
 static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
                unsigned long next, struct mm_walk *walk)
 {
@@ -447,7 +538,7 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
         * ... the mapped anon page is identical to the original page (that
         * will get faulted in on next access), and we don't have GUP pins.
         */
-       if (!orig_page_is_identical(walk->vma, vaddr, page, &large))
+       if (!orig_page_is_identical(walk->vma, vaddr, page, PAGE_SIZE, &large))
                goto remap;
 
        /* Zap it and try to reclaim swap space. */
@@ -473,6 +564,7 @@ static int __write_opcode_pte(pte_t *ptep, unsigned long vaddr,
 }
 
 static const struct mm_walk_ops write_opcode_ops = {
+       .hugetlb_entry          = __write_opcode_hugetlb,
        .pte_entry              = __write_opcode_pte,
        .walk_lock              = PGWALK_WRLOCK,
 };
@@ -510,6 +602,8 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
        struct mmu_notifier_range range;
        int ret, ref_ctr_updated = 0;
        struct page *page;
+       unsigned long page_size = PAGE_SIZE;
+       unsigned long page_mask = PAGE_MASK;
 
        if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
                return -EINVAL;
@@ -528,6 +622,11 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
        if (ret != 1)
                goto out;
 
+       if (is_vm_hugetlb_page(vma)) {
+               struct hstate *h = hstate_vma(vma);
+               page_size = huge_page_size(h);
+               page_mask = huge_page_mask(h);
+       }
        ret = verify_opcode(page, opcode_vaddr, &opcode);
        put_page(page);
        if (ret <= 0)
@@ -547,8 +646,9 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
                 * unregistering. So trigger MMU notifiers now, as we won't
                 * be able to do it under PTL.
                 */
+               const unsigned long start = vaddr & page_mask;
                mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
-                                       vaddr, vaddr + PAGE_SIZE);
+                                       start, start + page_size);
                mmu_notifier_invalidate_range_start(&range);
        }
 
@@ -830,8 +930,16 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
         */
        if (mapping->a_ops->read_folio)
                page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
-       else
+       else if (!is_file_hugepages(filp))
                page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
+       else {
+               struct hstate *h = hstate_file(filp);
+               unsigned long mask = huge_page_mask(h);
+               page = find_get_page(mapping, (offset & mask) >> PAGE_SHIFT);
+               if (!page)
+                       return -EFAULT;
+               page = nth_page(page, (offset & (huge_page_size(h) - 1)) >> PAGE_SHIFT);
+       }
        if (IS_ERR(page))
                return PTR_ERR(page);
 
@@ -1182,9 +1290,12 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
        if (!uc->handler && !uc->ret_handler)
                return -EINVAL;
 
-       /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+       /* copy_insn() uses read_mapping_page() or shmem/hugetlbfs-specific
+        * logic.
+        */
        if (!inode->i_mapping->a_ops->read_folio &&
-           !shmem_mapping(inode->i_mapping))
+           !shmem_mapping(inode->i_mapping) &&
+           !hugetlbfs_mapping(inode->i_mapping))
                return -EIO;
        /* Racy, just to catch the obvious mistakes */
        if (offset > i_size_read(inode))
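
For completeness, a rough userspace sketch of the kind of mapping this is
meant to cover (a sketch only, not part of the patch; the hugetlbfs path,
mapping length and probe offset are made up): a private, executable
mapping of a hugetlbfs file, which valid_vma() accepts now that VM_HUGETLB
is no longer filtered out.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        /* Made-up file on a hugetlbfs mount; 2 MiB huge pages assumed. */
        const char *path = "/dev/hugepages/probed.bin";
        const size_t len = 2UL << 20;   /* must be a multiple of the huge page size */
        int fd;
        void *p;

        fd = open(path, O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* Private + executable: the combination valid_vma() checks for. */
        p = mmap(NULL, len, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /*
         * A probe on that file could then be set up via tracefs, e.g.:
         *   echo 'p:test /dev/hugepages/probed.bin:0x1000' \
         *       >> /sys/kernel/tracing/uprobe_events
         * (event name and offset are made up for the example).
         */
        pause();

        munmap(p, len);
        close(fd);
        return 0;
}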

-- 
Guillaume Morin <guilla...@morinfr.org>
