On Fri, Dec 07, 2007 at 07:06:26AM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
> >Right, the patch at the end of the message restarts the walk if the
> >pte changes under the walker. The goto is pretty ugly, but I fail to
> >see any elegant way of doing that. Ideas?
> >
> >  
> 
> goto is fine for that.  But there's a subtle livelock here: suppose vcpu 
> 0 is in guest mode, continuously updating a memory location, and vcpu 1 
> is faulting with that memory location acting as a pte.  While we're in 
> kernel mode we aren't responding to signals as we should, so we need 
> to abort the walk and let the guest retry; that way we go through the 
> signal_pending() check.

I see that possibility, but why on earth would the guest be continuously
updating a page table entry?
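
That said, if we do add the abort, it could be as simple as bounding the
number of restarts. Here is a rough, compilable userspace sketch -- the
bound, all the names, and __sync_val_compare_and_swap standing in for the
kernel's cmpxchg are illustrative only, not from the patch:

/* Sketch: bound the restarts so a pathological guest can't keep us in
 * the walker forever; on overflow, bail out so the vcpu loop can run
 * its signal_pending() check and the guest retries the fault.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_WALK_RESTARTS 8		/* arbitrary illustrative bound */

static uint64_t gpte;			/* stands in for the guest pte */

static int walk_one_level(uint64_t set_bits)
{
	int restarts = 0;
	uint64_t old;

walk:
	old = gpte;			/* re-read the pte on each pass */
	if (__sync_val_compare_and_swap(&gpte, old, old | set_bits) != old) {
		/* the pte changed under us */
		if (++restarts > MAX_WALK_RESTARTS)
			return -1;	/* abort the walk */
		goto walk;
	}
	return 0;
}

int main(void)
{
	gpte = 0x1067;
	printf("walk: %d, pte now %#llx\n", walk_one_level(0x20),
	       (unsigned long long)gpte);
	return 0;
}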

> 
> However, this is an intrusive change, so let's start with the goto and 
> drop it later in favor of an abort.
> 
> >>>@@ -1510,6 +1510,9 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
> >>> {
> >>>   int ret;
> >>> 
> >>>+  /* No need for kvm_cmpxchg_guest_pte here, it's the guest's 
> >>>+   * responsibility to synchronize pte updates and page faults.
> >>>+   */
> >>>   ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
> >>>   if (ret < 0)
> >>>           return 0;
> >>>      
> >>Hmm.  What if an i386 PAE guest carefully uses cmpxchg8b to atomically 
> >>set a pte?  kvm_write_guest() doesn't guarantee atomicity, so an 
> >>intended atomic write can be seen split by the guest walker doing a 
> >>concurrent walk.
> >>    
> >
> >True, an atomic write is needed... a separate patch for that seems more
> >appropriate.
> >
> >
> >  
> 
> Yes.
> 
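
To make the tearing concrete in the meantime: a PAE pte is 64 bits, and
on i386 an ordinary copy can land as two 32-bit stores. A userspace
illustration with made-up pte values, showing the intermediate a
concurrent walker could observe:

/* Torn 64-bit pte write; all values invented for illustration. */
#include <stdint.h>
#include <stdio.h>

union pte64 {
	uint64_t whole;
	uint32_t half[2];	/* little-endian: half[0] is bits 0-31 */
};

int main(void)
{
	union pte64 pte = { .whole = 0x0000000112345067ull };	/* old pte */
	uint64_t newpte  = 0x00000002aabbc067ull;		/* intended pte */

	pte.half[0] = (uint32_t)newpte;		/* low half lands first... */
	/* ...a walker reading here sees neither the old nor the new pte: */
	printf("torn pte:  %#018llx\n", (unsigned long long)pte.whole);

	pte.half[1] = (uint32_t)(newpte >> 32);
	printf("final pte: %#018llx\n", (unsigned long long)pte.whole);
	return 0;
}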
> >+static inline bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
> >+                     gfn_t table_gfn, unsigned index, 
> >+                     pt_element_t orig_pte, pt_element_t new_pte)
> >+{
> >+    pt_element_t ret;
> >+    pt_element_t *table;
> >+    struct page *page;
> >+
> >+    page = gfn_to_page(kvm, table_gfn);
> >+    table = kmap_atomic(page, KM_USER0);
> >+    
> >+    ret = CMPXCHG(&table[index], orig_pte, new_pte);
> >+
> >+    kunmap_atomic(table, KM_USER0);
> >+
> >  
> 
> Missing kvm_release_page_dirty() here.  May also move mark_page_dirty() 
> here.

I prefer leaving that for later when the locking for mark_page_dirty()
gets sorted out...

> No need to force inlining.

Alright.

> >+    return (ret != orig_pte);
> >+}
> >+
> > /*
> >  * Fetch a guest pte for a guest virtual address
> >  */
> >@@ -91,6 +112,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
> >     gpa_t pte_gpa;
> > 
> >     pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
> >+walk:
> >     walker->level = vcpu->mmu.root_level;
> >     pte = vcpu->cr3;
> > #if PTTYPE == 64
> >@@ -135,8 +157,9 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
> > 
> >             if (!(pte & PT_ACCESSED_MASK)) {
> >                     mark_page_dirty(vcpu->kvm, table_gfn);
> >-                    pte |= PT_ACCESSED_MASK;
> >-                    kvm_write_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
> >+                    if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 
> >+                        index, pte, pte|PT_ACCESSED_MASK))
> >+                            goto walk;
> >  
> 
> We lose the accessed bit in the local variable pte here.  Not sure if it 
> matters but let's play it safe.
>
> >             }
> > 
> >             if (walker->level == PT_PAGE_TABLE_LEVEL) {
> >@@ -159,9 +182,13 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
> >     }
> > 
> >     if (write_fault && !is_dirty_pte(pte)) {
> >+            bool ret;
> >             mark_page_dirty(vcpu->kvm, table_gfn);
> >-            pte |= PT_DIRTY_MASK;
> >-            kvm_write_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
> >+            ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
> >+                        pte|PT_DIRTY_MASK);
> >+            if (ret)
> >+                    goto walk;
> >+    
> 
> Again we lose a bit in pte.  That ends up in walker->pte and is quite 
> important.

Oops!

Updated patch:


diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
index b24bc7c..2e54b02 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/drivers/kvm/paging_tmpl.h
@@ -34,7 +34,9 @@
        #define PT_LEVEL_BITS PT64_LEVEL_BITS
        #ifdef CONFIG_X86_64
        #define PT_MAX_FULL_LEVELS 4
+       #define CMPXCHG cmpxchg
        #else
+       #define CMPXCHG cmpxchg64
        #define PT_MAX_FULL_LEVELS 2
        #endif
 #elif PTTYPE == 32
@@ -48,6 +50,7 @@
        #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
        #define PT_LEVEL_BITS PT32_LEVEL_BITS
        #define PT_MAX_FULL_LEVELS 2
+       #define CMPXCHG cmpxchg
 #else
        #error Invalid PTTYPE value
 #endif
@@ -78,6 +81,26 @@ static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
        return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
 }
 
+static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
+                        gfn_t table_gfn, unsigned index,
+                        pt_element_t orig_pte, pt_element_t new_pte)
+{
+       pt_element_t ret;
+       pt_element_t *table;
+       struct page *page;
+
+       page = gfn_to_page(kvm, table_gfn);
+       table = kmap_atomic(page, KM_USER0);
+
+       ret = CMPXCHG(&table[index], orig_pte, new_pte);
+
+       kunmap_atomic(table, KM_USER0);
+
+       kvm_release_page_dirty(page);
+
+       return (ret != orig_pte);
+}
+
 /*
  * Fetch a guest pte for a guest virtual address
  */
@@ -91,6 +114,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
        gpa_t pte_gpa;
 
        pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
+walk:
        walker->level = vcpu->mmu.root_level;
        pte = vcpu->cr3;
 #if PTTYPE == 64
@@ -135,8 +159,10 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 
                if (!(pte & PT_ACCESSED_MASK)) {
                        mark_page_dirty(vcpu->kvm, table_gfn);
+                       if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 
+                           index, pte, pte|PT_ACCESSED_MASK))
+                               goto walk;
                        pte |= PT_ACCESSED_MASK;
-                       kvm_write_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
                }
 
                if (walker->level == PT_PAGE_TABLE_LEVEL) {
@@ -159,9 +185,13 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
        }
 
        if (write_fault && !is_dirty_pte(pte)) {
+               bool ret;
                mark_page_dirty(vcpu->kvm, table_gfn);
+               ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
+                           pte|PT_DIRTY_MASK);
+               if (ret)
+                       goto walk;
                pte |= PT_DIRTY_MASK;
-               kvm_write_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
                kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
        }
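
For completeness, the return convention above is "true means the pte
changed under the walker, so restart the walk". A compilable userspace
stand-in -- a plain array instead of the kmapped guest page, and
__sync_val_compare_and_swap instead of CMPXCHG; the values are made up --
behaves like this:

/* Userspace stand-in for FNAME(cmpxchg_gpte); illustrative only. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t pt_element_t;

static pt_element_t table[512];		/* stands in for the guest page */

static bool cmpxchg_gpte(unsigned index,
			 pt_element_t orig_pte, pt_element_t new_pte)
{
	pt_element_t ret;

	ret = __sync_val_compare_and_swap(&table[index], orig_pte, new_pte);
	return ret != orig_pte;		/* true: changed under us, restart */
}

int main(void)
{
	table[3] = 0x1067;
	/* orig_pte still matches: swap goes through, returns false. */
	printf("restart? %d\n", cmpxchg_gpte(3, 0x1067, 0x1067 | 0x20));
	/* stale orig_pte, as if another vcpu wrote the pte: returns true. */
	printf("restart? %d\n", cmpxchg_gpte(3, 0x1067, 0x1067 | 0x40));
	return 0;
}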
 
