On Fri, Dec 07, 2007 at 07:06:26AM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
> >Right, patch at end of the message restarts the process if the pte
> >changes under the walker. The goto is pretty ugly, but I fail to see
> >any elegant way of doing that. Ideas?
> 
> goto is fine for that. But there's a subtle livelock here: suppose vcpu
> 0 is in guest mode, continuously updating a memory location. vcpu 1 is
> faulting, with that memory location acting as a pte. While we're in
> kernel mode we aren't responding to signals like we should, so we need
> to abort the walk and let the guest retry; that way we go through the
> signal_pending() check.

I see that possibility, but why on earth would the guest be
continuously updating a pagetable entry?

> However, this is an intrusive change, so let's start with the goto and
> drop it later in favor of an abort.
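For later reference, the abort Avi suggests could look roughly like the
sketch below (untested, just to capture the idea; the retry bound and
the walker->aborted flag are invented here, not in any tree):

	int retries = 0;
walk:
	/* ... pte walk as before ... */
	if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index,
				pte, pte | PT_ACCESSED_MASK)) {
		if (++retries > 4) {
			/* Give up: the caller resumes the guest, the
			 * instruction re-faults, and on the way out we
			 * pass the signal_pending() check in the vcpu
			 * run loop. */
			walker->aborted = 1;
			return 0;
		}
		goto walk;
	}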
> >>>@@ -1510,6 +1510,9 @@ static int emulator_write_phys(struct kvm_vcpu
> >>>*vcpu, gpa_t gpa,
> >>> {
> >>> 	int ret;
> >>>
> >>>+	/* No need for kvm_cmpxchg_guest_pte here, it's the guest's
> >>>+	 * responsibility to synchronize pte updates and page faults.
> >>>+	 */
> >>> 	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
> >>> 	if (ret < 0)
> >>> 		return 0;
> >>>
> >>Hmm. What if an i386 pae guest carefully uses cmpxchg8b to atomically
> >>set a pte? kvm_write_guest() doesn't guarantee atomicity, so an
> >>intended atomic write can be seen split by the guest walker doing a
> >>concurrent walk.
> >
> >True, an atomic write is needed... a separate patch for that seems
> >more appropriate.
> 
> Yes.
> 
> >+static inline bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
> >+				       gfn_t table_gfn, unsigned index,
> >+				       pt_element_t orig_pte, pt_element_t new_pte)
> >+{
> >+	pt_element_t ret;
> >+	pt_element_t *table;
> >+	struct page *page;
> >+
> >+	page = gfn_to_page(kvm, table_gfn);
> >+	table = kmap_atomic(page, KM_USER0);
> >+
> >+	ret = CMPXCHG(&table[index], orig_pte, new_pte);
> >+
> >+	kunmap_atomic(page, KM_USER0);
> >+
> 
> Missing kvm_release_page_dirty() here. May also move mark_page_dirty()
> here.

I prefer leaving that for later, when the locking for mark_page_dirty()
gets sorted out...

> No need to force inlining.

Alright.

> >+	return (ret != orig_pte);
> >+}
> >+
> > /*
> >  * Fetch a guest pte for a guest virtual address
> >  */
> >@@ -91,6 +112,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
> > 	gpa_t pte_gpa;
> >
> > 	pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
> >+walk:
> > 	walker->level = vcpu->mmu.root_level;
> > 	pte = vcpu->cr3;
> > #if PTTYPE == 64
> >@@ -135,8 +157,9 @@ static int FNAME(walk_addr)(struct guest_walker
> >*walker,
> >
> > 	if (!(pte & PT_ACCESSED_MASK)) {
> > 		mark_page_dirty(vcpu->kvm, table_gfn);
> >-		pte |= PT_ACCESSED_MASK;
> >-		kvm_write_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
> >+		if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
> >+					index, pte, pte|PT_ACCESSED_MASK))
> >+			goto walk;
> 
> We lose the accessed bit in the local variable pte here. Not sure if
> it matters, but let's play it safe.
> 
> > 	}
> >
> > 	if (walker->level == PT_PAGE_TABLE_LEVEL) {
> >@@ -159,9 +182,13 @@ static int FNAME(walk_addr)(struct guest_walker
> >*walker,
> > 	}
> >
> > 	if (write_fault && !is_dirty_pte(pte)) {
> >+		bool ret;
> > 		mark_page_dirty(vcpu->kvm, table_gfn);
> >-		pte |= PT_DIRTY_MASK;
> >-		kvm_write_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
> >+		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
> >+					  pte|PT_DIRTY_MASK);
> >+		if (ret)
> >+			goto walk;
> >+
> 
> Again we lose a bit in pte. That ends up in walker->pte and is quite
> important.

Oops!
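To spell out the cmpxchg8b point above: with PAE a pte is 64 bits wide,
so a plain two-store update can be observed half-written by a concurrent
walker. Illustrative fragment only (ptep, new_pte and orig_pte are
placeholders, not patch code):

	u64 *ptep = &table[index];

	/* kvm_write_guest()-style update: may be split into two 32-bit
	 * stores, so a concurrent walker can see the new low word paired
	 * with the old high word (or vice versa) */
	((u32 *)ptep)[0] = (u32)new_pte;
	((u32 *)ptep)[1] = (u32)(new_pte >> 32);

	/* cmpxchg64 updates all 64 bits in one shot and returns the old
	 * value, so a pte that changed under us is detected and the walk
	 * can be restarted */
	if (cmpxchg64(ptep, orig_pte, new_pte) != orig_pte)
		goto walk;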
Updated patch:

diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
index b24bc7c..2e54b02 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/drivers/kvm/paging_tmpl.h
@@ -34,7 +34,9 @@
 #define PT_LEVEL_BITS PT64_LEVEL_BITS
 #ifdef CONFIG_X86_64
 #define PT_MAX_FULL_LEVELS 4
+#define CMPXCHG cmpxchg
 #else
+#define CMPXCHG cmpxchg64
 #define PT_MAX_FULL_LEVELS 2
 #endif
 #elif PTTYPE == 32
@@ -48,6 +50,7 @@
 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
 #define PT_LEVEL_BITS PT32_LEVEL_BITS
 #define PT_MAX_FULL_LEVELS 2
+#define CMPXCHG cmpxchg
 #else
 #error Invalid PTTYPE value
 #endif
@@ -78,6 +81,26 @@ static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
 	return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
 }
 
+static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
+				gfn_t table_gfn, unsigned index,
+				pt_element_t orig_pte, pt_element_t new_pte)
+{
+	pt_element_t ret;
+	pt_element_t *table;
+	struct page *page;
+
+	page = gfn_to_page(kvm, table_gfn);
+	table = kmap_atomic(page, KM_USER0);
+
+	ret = CMPXCHG(&table[index], orig_pte, new_pte);
+
+	kunmap_atomic(page, KM_USER0);
+
+	kvm_release_page_dirty(page);
+
+	return (ret != orig_pte);
+}
+
 /*
  * Fetch a guest pte for a guest virtual address
  */
@@ -91,6 +114,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 	gpa_t pte_gpa;
 
 	pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
+walk:
 	walker->level = vcpu->mmu.root_level;
 	pte = vcpu->cr3;
 #if PTTYPE == 64
@@ -135,8 +159,10 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 
 	if (!(pte & PT_ACCESSED_MASK)) {
 		mark_page_dirty(vcpu->kvm, table_gfn);
+		if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
+					index, pte, pte|PT_ACCESSED_MASK))
+			goto walk;
 		pte |= PT_ACCESSED_MASK;
-		kvm_write_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
 	}
 
 	if (walker->level == PT_PAGE_TABLE_LEVEL) {
@@ -159,9 +185,13 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 	}
 
 	if (write_fault && !is_dirty_pte(pte)) {
+		bool ret;
 		mark_page_dirty(vcpu->kvm, table_gfn);
+		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
+					  pte|PT_DIRTY_MASK);
+		if (ret)
+			goto walk;
 		pte |= PT_DIRTY_MASK;
-		kvm_write_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
 		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
 	}