On Wed, Mar 26, 2008 at 05:02:53PM +0200, Avi Kivity wrote:
> Andrea notes that freeing the page before flushing the tlb is a race, as the
> guest can sneak in one last write before the tlb is flushed, writing to a
> page that may belong to someone else.
> 
> Fix by reversing the order of freeing and flushing the tlb.  Since the tlb
> flush is expensive, queue the pages to be freed so we need to flush just once.
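
The race being described is, schematically (a sketch of the scenario, not
code from the patch; rmap_remove() and kvm_flush_remote_tlbs() are the
existing functions):

        /*
         * CPU 0 (host, zapping an spte)      CPU 1 (vcpu in guest mode, with
         *                                    a stale TLB entry for the gfn)
         *
         * rmap_remove()
         *   kvm_release_page_dirty(page)
         * <page is reused by someone else>
         *                                    guest store goes through the
         *                                    stale translation and scribbles
         *                                    over the page's new owner
         * kvm_flush_remote_tlbs()  <- too late
         */

The patch closes the window by queuing the released pages, issuing a single
kvm_flush_remote_tlbs(), and only then dropping the page references.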


Andrea previously wrote (which I somehow missed):

>> This case is safe because the path that frees a pte and subsequently
>> a page will take care of flushing the TLB of any remote CPUs that
>> possibly have it cached (before freeing the page, of course).
>> ptep_clear_flush->flush_tlb_page.
>>
>> Am I missing something?
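
(The ordering Andrea refers to is the usual unmap sequence in the core VM.
Roughly, simplifying what mm/rmap.c does and ignoring locking:

        /* clear the pte and flush it from every CPU that may have it cached */
        pteval = ptep_clear_flush(vma, address, ptep);
        if (pte_dirty(pteval))
                set_page_dirty(page);
        page_remove_rmap(page, vma);
        page_cache_release(page);       /* only now may the page be reused */

By the time the last reference is dropped, no CPU can still hold a hardware
translation for the page.)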

> flush_tlb_page only takes care of accesses to the page through the qemu
> task when outside vmx/svm mode. If the other physical cpu is running
> some code of some guest vcpu in vmx/svm mode after the race window
> opens and before it closes, the race will trigger.

Nope. If a physical CPU has page translations cached it _must_ be
running in the context of a qemu thread (it does not matter whether it is
in userspace or executing guest code). The bit corresponding to such a
CPU will be set in mm->cpu_vm_mask, and flush_tlb_page() will take care
of flushing the TLBs appropriately.

That is, from the perspective of the generic TLB flushing code, there is
no distinction between page translations cached via the qemu process
mappings and "shadow guest pages" (you can view shadow guest pages as a
subset of the qemu process translations in this context).
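
Concretely, the cross-CPU work is already done by the generic flush path.
A simplified sketch of the x86 SMP flush_tlb_page() (the shape of it, not
the exact arch/x86 code):

        void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
        {
                struct mm_struct *mm = vma->vm_mm;
                cpumask_t cpu_mask;

                preempt_disable();
                cpu_mask = mm->cpu_vm_mask;
                cpu_clear(smp_processor_id(), cpu_mask);

                /* flush locally if this CPU is currently using the mm */
                if (current->active_mm == mm)
                        __flush_tlb_one(va);

                /* IPI every other CPU that may have cached entries for mm */
                if (!cpus_empty(cpu_mask))
                        flush_tlb_others(cpu_mask, mm, va);

                preempt_enable();
        }

A physical CPU executing guest code is still running a qemu thread on that
mm, so its bit is set in cpu_vm_mask and it receives the invalidate IPI
like any other user of the address space.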

So the patch is unnecessary, and it adds extra IPI TLB flushes
(especially in the mmu_pte_write_zap_pte path, where the need_remote_flush
optimization essentially vanishes), as sketched below.
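
To make the mmu_pte_write_zap_pte point concrete, compare the flush
behaviour with and without the patch (call chains are schematic, taken
from the current code and the diff below):

        /* today: the pte-write path zaps without flushing; kvm_mmu_pte_write()
         * later does a remote flush only when need_remote_flush() says the
         * old spte could actually be cached somewhere. */
        mmu_pte_write_zap_pte()
            rmap_remove()                       /* no IPI here */

        /* with the patch: every zapped spte ends in an unconditional remote
         * flush, so the need_remote_flush() filtering buys nothing. */
        mmu_pte_write_zap_pte()
            rmap_remove_one()
                rmap_remove_complete()
                    kvm_flush_remote_tlbs()     /* IPIs all vcpus, every time */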

> Signed-off-by: Avi Kivity <[EMAIL PROTECTED]>
> ---
>  arch/x86/kvm/mmu.c         |   66 +++++++++++++++++++++++++++++++++++++++----
>  include/asm-x86/kvm_host.h |    8 +++++
>  2 files changed, 67 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 95c12bc..2033879 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -536,6 +536,52 @@ static void rmap_desc_remove_entry(unsigned long *rmapp,
>       mmu_free_rmap_desc(desc);
>  }
>  
> +static void rmap_remove_complete(struct kvm *kvm)
> +{
> +     int i;
> +     struct page *page;
> +
> +     if (!kvm->arch.nr_page_release_queue)
> +             return;
> +     kvm_flush_remote_tlbs(kvm);
> +     for (i = 0; i < kvm->arch.nr_page_release_queue; ++i) {
> +             page = kvm->arch.page_release_queue[i].page;
> +             if (kvm->arch.page_release_queue[i].dirty)
> +                     kvm_release_page_dirty(page);
> +             else
> +                     kvm_release_page_clean(page);
> +     }
> +     kvm->arch.nr_page_release_queue = 0;
> +}
> +
> +static void kvm_release_page_deferred(struct kvm *kvm,
> +                                   struct page *page,
> +                                   bool dirty)
> +{
> +     int i;
> +
> +     i = kvm->arch.nr_page_release_queue;
> +     if (i == KVM_MAX_PAGE_RELEASE_QUEUE) {
> +             rmap_remove_complete(kvm);
> +             i = 0;
> +     }
> +     kvm->arch.page_release_queue[i].page = page;
> +     kvm->arch.page_release_queue[i].dirty = dirty;
> +     kvm->arch.nr_page_release_queue = i + 1;
> +}
> +
> +static void kvm_release_page_dirty_deferred(struct kvm *kvm,
> +                                         struct page *page)
> +{
> +     kvm_release_page_deferred(kvm, page, true);
> +}
> +
> +static void kvm_release_page_clean_deferred(struct kvm *kvm,
> +                                         struct page *page)
> +{
> +     kvm_release_page_deferred(kvm, page, false);
> +}
> +
>  static void rmap_remove(struct kvm *kvm, u64 *spte)
>  {
>       struct kvm_rmap_desc *desc;
> @@ -551,9 +597,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
>       page = spte_to_page(*spte);
>       mark_page_accessed(page);
>       if (is_writeble_pte(*spte))
> -             kvm_release_page_dirty(page);
> +             kvm_release_page_dirty_deferred(kvm, page);
>       else
> -             kvm_release_page_clean(page);
> +             kvm_release_page_clean_deferred(kvm, page);
>       rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
>       if (!*rmapp) {
>               printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
> @@ -585,6 +631,12 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
>       }
>  }
>  
> +static void rmap_remove_one(struct kvm *kvm, u64 *spte)
> +{
> +     rmap_remove(kvm, spte);
> +     rmap_remove_complete(kvm);
> +}
> +
>  static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
>  {
>       struct kvm_rmap_desc *desc;
> @@ -650,7 +702,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
>               BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) !=
>                      (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
>               pgprintk("rmap_write_protect(large): spte %p %llx %lld\n",
>                        spte, *spte, gfn);
>               if (is_writeble_pte(*spte)) {
> -                     rmap_remove(kvm, spte);
> +                     rmap_remove_one(kvm, spte);
>                       --kvm->stat.lpages;
>                       set_shadow_pte(spte, shadow_trap_nonpresent_pte);
>                       write_protected = 1;
> @@ -872,7 +924,7 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
>                               rmap_remove(kvm, &pt[i]);
>                       pt[i] = shadow_trap_nonpresent_pte;
>               }
> -             kvm_flush_remote_tlbs(kvm);
> +             rmap_remove_complete(kvm);
>               return;
>       }
>  
> @@ -891,7 +943,7 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
>               }
>               pt[i] = shadow_trap_nonpresent_pte;
>       }
> -     kvm_flush_remote_tlbs(kvm);
> +     rmap_remove_complete(kvm);
>  }
>  
>  static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
> @@ -1048,7 +1100,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
>  
>       if (is_rmap_pte(*shadow_pte)) {
>               if (page != spte_to_page(*shadow_pte))
> -                     rmap_remove(vcpu->kvm, shadow_pte);
> +                     rmap_remove_one(vcpu->kvm, shadow_pte);
>               else
>                       was_rmapped = 1;
>       }
> @@ -1583,7 +1635,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
>       if (is_shadow_present_pte(pte)) {
>               if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
>                   is_large_pte(pte))
> -                     rmap_remove(vcpu->kvm, spte);
> +                     rmap_remove_one(vcpu->kvm, spte);
>               else {
>                       child = page_header(pte & PT64_BASE_ADDR_MASK);
>                       mmu_page_remove_parent_pte(child, spte);
> diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
> index 4382ca0..8a71616 100644
> --- a/include/asm-x86/kvm_host.h
> +++ b/include/asm-x86/kvm_host.h
> @@ -77,6 +77,8 @@
>  #define KVM_REFILL_PAGES 25
>  #define KVM_MAX_CPUID_ENTRIES 40
>  
> +#define KVM_MAX_PAGE_RELEASE_QUEUE 16
> +
>  extern spinlock_t kvm_lock;
>  extern struct list_head vm_list;
>  
> @@ -312,6 +314,12 @@ struct kvm_arch{
>       struct kvm_ioapic *vioapic;
>       struct kvm_pit *vpit;
>  
> +     int nr_page_release_queue;
> +     struct {
> +             struct page *page;
> +             bool dirty;
> +     } page_release_queue[KVM_MAX_PAGE_RELEASE_QUEUE];
> +
>       int round_robin_prev_vcpu;
>       unsigned int tss_addr;
>       struct page *apic_access_page;
> -- 
> 1.5.4.2
