Hi Andrea,

A few comments below.
On Thu, Jun 26, 2008 at 08:11:16PM +0200, Andrea Arcangeli wrote:
> +	}
> +	return need_tlb_flush;
> +}
> +
> +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
> +{
> +	int i;
> +	int need_tlb_flush = 0;
> +
> +	/*
> +	 * If mmap_sem isn't taken, we can look the memslots with only
> +	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
> +	 */
> +	for (i = 0; i < kvm->nmemslots; i++) {
> +		struct kvm_memory_slot *memslot = &kvm->memslots[i];
> +		unsigned long start = memslot->userspace_addr;
> +		unsigned long end;
> +
> +		/* mmu_lock protects userspace_addr */
> +		if (!start)
> +			continue;

> +int kvm_age_hva(struct kvm *kvm, unsigned long hva)
> +{
> +	int i;
> +	int young = 0;
> +
> +	/*
> +	 * If mmap_sem isn't taken, we can look the memslots with only
> +	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
> +	 */
> +	for (i = 0; i < kvm->nmemslots; i++) {
> +		struct kvm_memory_slot *memslot = &kvm->memslots[i];
> +		unsigned long start = memslot->userspace_addr;
> +		unsigned long end;
> +
> +		/* mmu_lock protects userspace_addr */
> +		if (!start)
> +			continue;

These two functions share the same memslot iteration code; you could
avoid the duplication (rough sketch at the end of this mail).

> +	if (unlikely(atomic_read(&vcpu->kvm->mmu_notifier_count)))
> +		return;
> +	smp_rmb();

I don't think you need smp_rmb() on x86, since atomic operations
already serialize; barrier() should suffice.

>  	spin_lock(&vcpu->kvm->mmu_lock);
> +	if (unlikely(atomic_read(&vcpu->kvm->mmu_notifier_count)))
> +		goto out_unlock;
> +	smp_rmb();
> +	if (unlikely(atomic_read(&vcpu->kvm->mmu_notifier_seq) != mmu_seq))
> +		goto out_unlock;

Could you wrap this sequence in a well-documented helper function?
(Rough sketch at the end of this mail.)

> +static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> +						     struct mm_struct *mm,
> +						     unsigned long start,
> +						     unsigned long end)
> +{
> +	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	int need_tlb_flush = 0;
> +
> +	/*
> +	 * The count increase must become visible at unlock time as no
> +	 * spte can be established without taking the mmu_lock and
> +	 * count is also read inside the mmu_lock critical section.
> +	 */
> +	atomic_inc(&kvm->mmu_notifier_count);
> +
> +	spin_lock(&kvm->mmu_lock);
> +	for (; start < end; start += PAGE_SIZE)
> +		need_tlb_flush |= kvm_unmap_hva(kvm, start);
> +	spin_unlock(&kvm->mmu_lock);

You don't handle large mappings here at all, which means there might
still be external mappings even after ->range_start/->range_end have
run. This is not a problem now, because QEMU kills all the shadow
mappings before munmap() on hugetlbfs, but it will become a practical
problem if ballooning gains largepage support (which will probably
happen in the future), or with fancier hugetlb features.

> +	atomic_inc(&kvm->mmu_notifier_seq);
> +	/*
> +	 * The sequence increase must be visible before count
> +	 * decrease. The page fault has to read count before sequence
> +	 * for this write order to be effective.
> +	 */
> +	wmb();

smp_mb__after_atomic_inc()?

> +static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
> +					       struct mm_struct *mm,
> +					       unsigned long address)
> +{
> +	struct kvm *kvm = mmu_notifier_to_kvm(mn);
> +	int young;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	young = kvm_age_hva(kvm, address);
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	if (young)
> +		kvm_flush_remote_tlbs(kvm);

Is it worth flushing remote TLBs just because of the young bit? Aging
can happen often.

- mmu_notifier_count could be a non-atomic type (range_end() does not
  grab mmu_lock, but it could; a rough sketch of that variant is at
  the end of this mail).

- why does the MMU notifier API pass an mm_struct instead of a vma?
  As it stands, VM pte aging/swapping/nuking of QEMU non-guest
  mappings interferes with guest pagefault processing for no reason.

- isn't the logic susceptible to mmu_seq wraparound? :-)
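
To make the duplication point concrete, here is a rough, untested
sketch of how the memslot walk could be factored out. kvm_handle_hva(),
kvm_unmap_rmapp() and kvm_age_rmapp() are names I'm making up for
illustration only; the two *_rmapp() helpers would simply hold the
per-slot bodies of your current loops:

static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
			  int (*handler)(struct kvm *kvm,
					 struct kvm_memory_slot *memslot,
					 unsigned long gfn_offset))
{
	int i;
	int retval = 0;

	/*
	 * If mmap_sem isn't taken, we can look at the memslots with only
	 * the mmu_lock by skipping over the slots with userspace_addr == 0.
	 */
	for (i = 0; i < kvm->nmemslots; i++) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];
		unsigned long start = memslot->userspace_addr;
		unsigned long end;

		/* mmu_lock protects userspace_addr */
		if (!start)
			continue;

		end = start + (memslot->npages << PAGE_SHIFT);
		if (hva >= start && hva < end)
			retval |= handler(kvm, memslot,
					  (hva - start) >> PAGE_SHIFT);
	}

	return retval;
}

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
}

int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
	return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
}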
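
Similarly, a sketch of what I mean by wrapping the count/seq check in
a documented helper; mmu_notifier_retry() is just a name I picked, and
the comments are my reading of the ordering requirements:

/*
 * Returns non-zero if the page fault must be retried: either an
 * invalidate is in flight (count elevated), or one has completed
 * since mmu_seq was sampled (sequence changed).  Must be called with
 * mmu_lock held.
 */
static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, int mmu_seq)
{
	if (unlikely(atomic_read(&vcpu->kvm->mmu_notifier_count)))
		return 1;
	/*
	 * Read count before seq, pairing with the write side that bumps
	 * seq before dropping count in invalidate_range_end().
	 */
	smp_rmb();
	if (unlikely(atomic_read(&vcpu->kvm->mmu_notifier_seq) != mmu_seq))
		return 1;
	return 0;
}

so the fault path would become:

	spin_lock(&vcpu->kvm->mmu_lock);
	if (mmu_notifier_retry(vcpu, mmu_seq))
		goto out_unlock;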
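
Finally, to illustrate the non-atomic mmu_notifier_count idea: if
range_end() took mmu_lock like range_start() already does, the count
and sequence could be plain integers protected by that lock. Untested
sketch, purely to show the shape of it:

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * Bump seq before dropping count, both under mmu_lock, so a page
	 * fault that rechecks them under the same lock either sees the
	 * elevated count or the new sequence number.
	 */
	kvm->mmu_notifier_seq++;
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);
}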