On (06/03/16 09:25), Michal Hocko wrote:
> > it's quite hard to trigger the bug (somehow), so I can't
> > follow up with more information as of now.

Either I did something very silly when fixing up the patch, or the
patch may be causing general protection faults on my system.

RIP collect_mm_slot() + 0x42/0x84
        khugepaged
        prepare_to_wait_event
        maybe_pmd_mkwrite
        kthread
        _raw_spin_unlock_irq
        ret_from_fork
        kthread_create_on_node

collect_mm_slot() + 0x42/0x84 is

0000000000000328 <collect_mm_slot>:
     328:       55                      push   %rbp
     329:       48 89 e5                mov    %rsp,%rbp
     32c:       53                      push   %rbx
     32d:       48 8b 5f 20             mov    0x20(%rdi),%rbx
     331:       8b 43 48                mov    0x48(%rbx),%eax
     334:       ff c8                   dec    %eax
     336:       7f 71                   jg     3a9 <collect_mm_slot+0x81>
     338:       48 8b 57 08             mov    0x8(%rdi),%rdx
     33c:       48 85 d2                test   %rdx,%rdx
     33f:       74 1e                   je     35f <collect_mm_slot+0x37>
     341:       48 8b 07                mov    (%rdi),%rax
     344:       48 85 c0                test   %rax,%rax
     347:       48 89 02                mov    %rax,(%rdx)
     34a:       74 04                   je     350 <collect_mm_slot+0x28>
     34c:       48 89 50 08             mov    %rdx,0x8(%rax)
     350:       48 c7 07 00 00 00 00    movq   $0x0,(%rdi)
     357:       48 c7 47 08 00 00 00    movq   $0x0,0x8(%rdi)
     35e:       00 
     35f:       48 8b 57 10             mov    0x10(%rdi),%rdx
     363:       48 8b 47 18             mov    0x18(%rdi),%rax
     367:       48 89 fe                mov    %rdi,%rsi
     36a:       48 89 42 08             mov    %rax,0x8(%rdx)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     36e:       48 89 10                mov    %rdx,(%rax)
     371:       48 b8 00 01 00 00 00    movabs $0xdead000000000100,%rax
     378:       00 ad de 
     37b:       48 89 47 10             mov    %rax,0x10(%rdi)
     37f:       48 b8 00 02 00 00 00    movabs $0xdead000000000200,%rax
     386:       00 ad de 
     389:       48 89 47 18             mov    %rax,0x18(%rdi)
     38d:       48 8b 3d 00 00 00 00    mov    0x0(%rip),%rdi        # 394 <collect_mm_slot+0x6c>
     394:       e8 00 00 00 00          callq  399 <collect_mm_slot+0x71>
     399:       f0 ff 4b 4c             lock decl 0x4c(%rbx)
     39d:       74 02                   je     3a1 <collect_mm_slot+0x79>
     39f:       eb 08                   jmp    3a9 <collect_mm_slot+0x81>
     3a1:       48 89 df                mov    %rbx,%rdi
     3a4:       e8 00 00 00 00          callq  3a9 <collect_mm_slot+0x81>
     3a9:       5b                      pop    %rbx
     3aa:       5d                      pop    %rbp
     3ab:       c3                      retq

which is list_del(&mm_slot->mm_node), I believe.
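
The faulting offset +0x42 is 0x36a, the "next->prev = prev" store, and
the movabs constants right below it are LIST_POISON1/LIST_POISON2 being
written back, which supports that reading. Roughly what list_del()
expands to here (simplified from include/linux/list.h, not the exact
source):

	static inline void __list_del(struct list_head *prev,
				      struct list_head *next)
	{
		next->prev = prev;	/* 36a: mov %rax,0x8(%rdx) <- faults */
		prev->next = next;	/* 36e: mov %rdx,(%rax) */
	}

	static inline void list_del(struct list_head *entry)
	{
		__list_del(entry->prev, entry->next);
		entry->next = LIST_POISON1;	/* 0xdead000000000100 */
		entry->prev = LIST_POISON2;	/* 0xdead000000000200 */
	}

LIST_POISON1 is a non-canonical address on x86-64, so if the slot had
already been unlinked once (say, __khugepaged_exit() and khugepaged
both collecting the same khugepaged_scan.mm_slot), mm_node.next would
hold the poison and the next->prev store would #GP exactly like this.
Just a guess, though.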


I attached the patch (just in case).
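
As I read it, the core idea is that khugepaged now takes its own
mm_users pin before scanning, so "exited" becomes "nobody holds the mm
besides khugepaged" and the down_write/up_write dance on mmap_sem in
__khugepaged_exit() goes away. A minimal standalone sketch of the
pattern (made-up function name, not the patch itself):

	static unsigned int scan_one_slot(struct mm_slot *mm_slot)
	{
		struct mm_struct *mm = mm_slot->mm;

		/* called under khugepaged_mm_lock */
		if (!atomic_inc_not_zero(&mm->mm_users)) {
			/* mm is already dead: unhash and free the slot */
			collect_mm_slot(mm_slot);
			return 0;
		}
		spin_unlock(&khugepaged_mm_lock);

		/* ... scan the mm's VMAs; our pin keeps the mm alive ... */

		mmput_async(mm);	/* drop the pin; may free the mm */
		return 1;
	}

mmput_async() rather than mmput(), presumably so khugepaged never ends
up doing the final __mmput() itself.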

---
 mm/huge_memory.c | 87 +++++++++++++++++++++++++-------------------------------
 1 file changed, 39 insertions(+), 48 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 292cedd..1c82fa4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1938,7 +1938,8 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
 
 static inline int khugepaged_test_exit(struct mm_struct *mm)
 {
-       return atomic_read(&mm->mm_users) == 0;
+       /* the only pin is from khugepaged_scan_mm_slot */
+       return atomic_read(&mm->mm_users) <= 1;
 }
 
 int __khugepaged_enter(struct mm_struct *mm)
@@ -1950,8 +1951,6 @@ int __khugepaged_enter(struct mm_struct *mm)
        if (!mm_slot)
                return -ENOMEM;
 
-       /* __khugepaged_exit() must not run from under us */
-       VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
        if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
                free_mm_slot(mm_slot);
                return 0;
@@ -1994,36 +1993,40 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
        return 0;
 }
 
-void __khugepaged_exit(struct mm_struct *mm)
+static void collect_mm_slot(struct mm_slot *mm_slot)
 {
-       struct mm_slot *mm_slot;
-       int free = 0;
+       struct mm_struct *mm = mm_slot->mm;
 
-       spin_lock(&khugepaged_mm_lock);
-       mm_slot = get_mm_slot(mm);
-       if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+       VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
+
+       if (khugepaged_test_exit(mm)) {
+               /* free mm_slot */
                hash_del(&mm_slot->hash);
                list_del(&mm_slot->mm_node);
-               free = 1;
-       }
-       spin_unlock(&khugepaged_mm_lock);
 
-       if (free) {
-               clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
-               free_mm_slot(mm_slot);
-               mmdrop(mm);
-       } else if (mm_slot) {
                /*
-                * This is required to serialize against
-                * khugepaged_test_exit() (which is guaranteed to run
-                * under mmap sem read mode). Stop here (after we
-                * return all pagetables will be destroyed) until
-                * khugepaged has finished working on the pagetables
-                * under the mmap_sem.
+                * Not strictly needed because the mm exited already.
+                *
+                * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
                 */
-               down_write(&mm->mmap_sem);
-               up_write(&mm->mmap_sem);
+
+               /* khugepaged_mm_lock actually not necessary for the below */
+               free_mm_slot(mm_slot);
+               mmdrop(mm);
+       }
+}
+
+void __khugepaged_exit(struct mm_struct *mm)
+{
+       struct mm_slot *mm_slot;
+
+       spin_lock(&khugepaged_mm_lock);
+       mm_slot = get_mm_slot(mm);
+       if (mm_slot) {
+               collect_mm_slot(mm_slot);
+               clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
        }
+       spin_unlock(&khugepaged_mm_lock);
 }
 
 static void release_pte_page(struct page *page)
@@ -2738,29 +2741,6 @@ out:
        return ret;
 }
 
-static void collect_mm_slot(struct mm_slot *mm_slot)
-{
-       struct mm_struct *mm = mm_slot->mm;
-
-       VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
-
-       if (khugepaged_test_exit(mm)) {
-               /* free mm_slot */
-               hash_del(&mm_slot->hash);
-               list_del(&mm_slot->mm_node);
-
-               /*
-                * Not strictly needed because the mm exited already.
-                *
-                * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
-                */
-
-               /* khugepaged_mm_lock actually not necessary for the below */
-               free_mm_slot(mm_slot);
-               mmdrop(mm);
-       }
-}
-
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                                            struct page **hpage)
        __releases(&khugepaged_mm_lock)
@@ -2782,6 +2762,16 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                khugepaged_scan.address = 0;
                khugepaged_scan.mm_slot = mm_slot;
        }
+
+       /*
+        * Do not even try to do anything if the current mm is already
+        * dead. khugepaged_mm_lock will make sure only this or
+        * __khugepaged_exit does the unhashing.
+        */
+       if (!atomic_inc_not_zero(&mm_slot->mm->mm_users)) {
+               collect_mm_slot(mm_slot);
+               return progress;
+       }
        spin_unlock(&khugepaged_mm_lock);
 
        mm = mm_slot->mm;
@@ -2865,6 +2855,7 @@ breakouterloop_mmap_sem:
 
                collect_mm_slot(mm_slot);
        }
+       mmput_async(mm);
 
        return progress;
 }
-- 
2.9.0.rc1