On Fri, Mar 21, 2025 at 12:37:13PM +0100, David Hildenbrand wrote:

SNIP

> +static int __uprobe_write_opcode(struct vm_area_struct *vma,
> +             struct folio_walk *fw, struct folio *folio,
> +             unsigned long opcode_vaddr, uprobe_opcode_t opcode)
> +{
> +     const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
> +     const bool is_register = !!is_swbp_insn(&opcode);
> +     bool pmd_mappable;
> +
> +     /* For now, we'll only handle PTE-mapped folios. */
> +     if (fw->level != FW_LEVEL_PTE)
> +             return -EFAULT;
> +
> +     /*
> +      * See can_follow_write_pte(): we'd actually prefer a writable PTE here,
> +      * but the VMA might not be writable.
> +      */
> +     if (!pte_write(fw->pte)) {
> +             if (!PageAnonExclusive(fw->page))
> +                     return -EFAULT;
> +             if (unlikely(userfaultfd_pte_wp(vma, fw->pte)))
> +                     return -EFAULT;
> +             /* SOFTDIRTY is handled via pte_mkdirty() below. */
> +     }
> +
> +     /*
> +      * We'll temporarily unmap the page and flush the TLB, such that we can
> +      * modify the page atomically.
> +      */
> +     flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
> +     fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
> +     copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
> +
> +     /*
> +      * When unregistering, we may only zap a PTE if uffd is disabled and
> +      * there are no unexpected folio references ...
> +      */
> +     if (is_register || userfaultfd_missing(vma) ||
> +         (folio_ref_count(folio) != folio_mapcount(folio) + 1 +
> +          folio_test_swapcache(folio) * folio_nr_pages(folio)))
> +             goto remap;
> +
> +     /*
> +      * ... and the mapped page is identical to the original page that
> +      * would get faulted in on next access.
> +      */
> +     if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable))
> +             goto remap;
> +
> +     dec_mm_counter(vma->vm_mm, MM_ANONPAGES);
> +     folio_remove_rmap_pte(folio, fw->page, vma);
> +     if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
> +          folio_trylock(folio)) {
> +             folio_free_swap(folio);
> +             folio_unlock(folio);
> +     }
> +     folio_put(folio);

hi,
it's probably ok and I'm missing something, but why do we call folio_put
here? I'd have thought that's already covered by the folio_put call in
uprobe_write_opcode.
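
For reference, here are the two call sites as I read them; the comments are
just my guesses about which reference each folio_put is meant to drop, not
something the patch spells out:

        /* __uprobe_write_opcode(), zap path: */
        folio_remove_rmap_pte(folio, fw->page, vma);
        ...
        folio_put(folio);       /* the reference held via the PTE mapping? */

        /* uprobe_write_opcode(), after folio_walk_end(): */
        folio_put(folio);       /* the reference from get_user_pages_remote()? */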

thanks,
jirka


> +
> +     return pmd_mappable;
> +remap:
> +     /*
> +      * Make sure that our copy_to_page() changes become visible before the
> +      * set_pte_at() write.
> +      */
> +     smp_wmb();
> +     /* We modified the page. Make sure to mark the PTE dirty. */
> +     set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte));
> +     return 0;
> +}
> +
>  /*
>   * NOTE:
>   * Expect the breakpoint instruction to be the smallest size instruction for
> @@ -475,116 +480,115 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
>   * uprobe_write_opcode - write the opcode at a given virtual address.
>   * @auprobe: arch specific probepoint information.
>   * @vma: the probed virtual memory area.
> - * @vaddr: the virtual address to store the opcode.
> - * @opcode: opcode to be written at @vaddr.
> + * @opcode_vaddr: the virtual address to store the opcode.
> + * @opcode: opcode to be written at @opcode_vaddr.
>   *
>   * Called with mm->mmap_lock held for read or write.
>   * Return 0 (success) or a negative errno.
>   */
>  int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> -             unsigned long vaddr, uprobe_opcode_t opcode)
> +             const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
>  {
> +     const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
>       struct mm_struct *mm = vma->vm_mm;
>       struct uprobe *uprobe;
> -     struct page *old_page, *new_page;
>       int ret, is_register, ref_ctr_updated = 0;
> -     bool orig_page_huge = false;
>       unsigned int gup_flags = FOLL_FORCE;
> +     struct mmu_notifier_range range;
> +     struct folio_walk fw;
> +     struct folio *folio;
> +     struct page *page;
>  
>       is_register = is_swbp_insn(&opcode);
>       uprobe = container_of(auprobe, struct uprobe, arch);
>  
> -retry:
> +     if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
> +             return -EINVAL;
> +
> +     /*
> +      * When registering, we have to break COW to get an exclusive anonymous
> +      * page that we can safely modify. Use FOLL_WRITE to trigger a write
> +      * fault if required. When unregistering, we might be lucky and the
> +      * anon page is already gone. So defer write faults until really
> +      * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
> +      * cannot deal with PMDs yet.
> +      */
>       if (is_register)
> -             gup_flags |= FOLL_SPLIT_PMD;
> -     /* Read the page with vaddr into memory */
> -     ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &old_page, NULL);
> -     if (ret != 1)
> -             return ret;
> +             gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
>  
> -     ret = verify_opcode(old_page, vaddr, &opcode);
> +retry:
> +     ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL);
>       if (ret <= 0)
> -             goto put_old;
> -
> -     if (is_zero_page(old_page)) {
> -             ret = -EINVAL;
> -             goto put_old;
> -     }
> +             goto out;
> +     folio = page_folio(page);
>  
> -     if (WARN(!is_register && PageCompound(old_page),
> -              "uprobe unregister should never work on compound page\n")) {
> -             ret = -EINVAL;
> -             goto put_old;
> +     ret = verify_opcode(page, opcode_vaddr, &opcode);
> +     if (ret <= 0) {
> +             folio_put(folio);
> +             goto out;
>       }
>  
>       /* We are going to replace instruction, update ref_ctr. */
>       if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
>               ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
> -             if (ret)
> -                     goto put_old;
> +             if (ret) {
> +                     folio_put(folio);
> +                     goto out;
> +             }
>  
>               ref_ctr_updated = 1;
>       }
>  
>       ret = 0;
> -     if (!is_register && !PageAnon(old_page))
> -             goto put_old;
> -
> -     ret = anon_vma_prepare(vma);
> -     if (ret)
> -             goto put_old;
> -
> -     ret = -ENOMEM;
> -     new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
> -     if (!new_page)
> -             goto put_old;
> -
> -     __SetPageUptodate(new_page);
> -     copy_highpage(new_page, old_page);
> -     copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
> +     if (unlikely(!folio_test_anon(folio))) {
> +             VM_WARN_ON_ONCE(is_register);
> +             folio_put(folio);
> +             goto out;
> +     }
>  
>       if (!is_register) {
> -             struct page *orig_page;
> -             pgoff_t index;
> -
> -             VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
> -
> -             index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
> -             orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
> -                                       index);
> -
> -             if (orig_page) {
> -                     if (PageUptodate(orig_page) &&
> -                         pages_identical(new_page, orig_page)) {
> -                             /* let go new_page */
> -                             put_page(new_page);
> -                             new_page = NULL;
> -
> -                             if (PageCompound(orig_page))
> -                                     orig_page_huge = true;
> -                     }
> -                     put_page(orig_page);
> -             }
> +             /*
> +              * In the common case, we'll be able to zap the page when
> +              * unregistering. So trigger MMU notifiers now, as we won't
> +              * be able to do it under PTL.
> +              */
> +             mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> +                                     vaddr, vaddr + PAGE_SIZE);
> +             mmu_notifier_invalidate_range_start(&range);
> +     }
> +
> +     ret = -EAGAIN;
> +     /* Walk the page tables again, to perform the actual update. */
> +     if (folio_walk_start(&fw, vma, vaddr, 0)) {
> +             if (fw.page == page)
> +                     ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
> +             folio_walk_end(&fw, vma);
>       }
>  
> -     ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
> -     if (new_page)
> -             put_page(new_page);
> -put_old:
> -     put_page(old_page);
> +     if (!is_register)
> +             mmu_notifier_invalidate_range_end(&range);
>  
> -     if (unlikely(ret == -EAGAIN))
> +     folio_put(folio);
> +     switch (ret) {
> +     case -EFAULT:
> +             gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
> +             fallthrough;
> +     case -EAGAIN:
>               goto retry;
> +     default:
> +             break;
> +     }
>  
> +out:
>       /* Revert back reference counter if instruction update failed. */
> -     if (ret && is_register && ref_ctr_updated)
> +     if (ret < 0 && is_register && ref_ctr_updated)
>               update_ref_ctr(uprobe, mm, -1);
>  
>       /* try collapse pmd for compound page */
> -     if (!ret && orig_page_huge)
> +     if (ret > 0)
>               collapse_pte_mapped_thp(mm, vaddr, false);
>  
> -     return ret;
> +     return ret < 0 ? ret : 0;
>  }
>  
>  /**
> -- 
> 2.48.1
> 
