On Fri, Dec 07, 2007 at 07:06:26AM +0200, Avi Kivity wrote:
> Marcelo Tosatti wrote:
> >Right, patch at end of the message restarts the process if the pte
> >changes under the walker. The goto is pretty ugly, but I fail to see any
> >elegant way of doing that. Ideas?
> >
> >
>
> goto is fine for that. But there's a subtle livelock here: suppose vcpu
> 0 is in guest mode, continuously updating a memory location. vcpu 1
> is faulting with that memory location acting as a pte. While we're in
> kernel mode, we aren't responding to signals like we should; so we need
> to abort the walk and let the guest retry; that way we go through the
> signal_pending() check.
I see that possibility, but why on earth would the guest be continuously
updating a pagetable entry?
>
> However, this is an intrusive change, so let's start with the goto and
> drop it later in favor of an abort.
>
> >>>@@ -1510,6 +1510,9 @@ static int emulator_write_phys(struct kvm_vcpu
> >>>*vcpu, gpa_t gpa,
> >>> {
> >>> int ret;
> >>>
> >>>+ /* No need for kvm_cmpxchg_guest_pte here, it's the guest
> >>>+ * responsibility to synchronize pte updates and page faults.
> >>>+ */
> >>> ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
> >>> if (ret < 0)
> >>> return 0;
> >>>
> >>Hmm. What if an i386 pae guest carefully uses cmpxchg8b to atomically
> >>set a pte? kvm_write_guest() doesn't guarantee atomicity, so an
> >>intended atomic write can be seen split by the guest walker doing a
> >>concurrent walk.
> >>
> >
> >True, an atomic write is needed... a separate patch for that seems more
> >appropriate.
> >
> >
> >
>
> Yes.
>
> >+static inline bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
> >+ gfn_t table_gfn, unsigned index,
> >+ pt_element_t orig_pte, pt_element_t new_pte)
> >+{
> >+ pt_element_t ret;
> >+ pt_element_t *table;
> >+ struct page *page;
> >+
> >+ page = gfn_to_page(kvm, table_gfn);
> >+ table = kmap_atomic(page, KM_USER0);
> >+
> >+ ret = CMPXCHG(&table[index], orig_pte, new_pte);
> >+
> >+ kunmap_atomic(page, KM_USER0);
> >+
> >
>
> Missing kvm_release_page_dirty() here. May also move mark_page_dirty()
> here.
I prefer leaving that for later when the locking for mark_page_dirty()
gets sorted out...
> No need to force inlining.
Alright.
> >+ return (ret != orig_pte);
> >+}
> >+
> > /*
> > * Fetch a guest pte for a guest virtual address
> > */
> >@@ -91,6 +112,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
> > gpa_t pte_gpa;
> >
> > pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
> >+walk:
> > walker->level = vcpu->mmu.root_level;
> > pte = vcpu->cr3;
> > #if PTTYPE == 64
> >@@ -135,8 +157,9 @@ static int FNAME(walk_addr)(struct guest_walker
> >*walker,
> >
> > if (!(pte & PT_ACCESSED_MASK)) {
> > mark_page_dirty(vcpu->kvm, table_gfn);
> >- pte |= PT_ACCESSED_MASK;
> >- kvm_write_guest(vcpu->kvm, pte_gpa, &pte,
> >sizeof(pte));
> >+ if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
> >+ index, pte, pte|PT_ACCESSED_MASK))
> >+ goto walk;
> >
>
> We lose the accessed bit in the local variable pte here. Not sure if it
> matters but let's play it safe.
>
> > }
> >
> > if (walker->level == PT_PAGE_TABLE_LEVEL) {
> >@@ -159,9 +182,13 @@ static int FNAME(walk_addr)(struct guest_walker
> >*walker,
> > }
> >
> > if (write_fault && !is_dirty_pte(pte)) {
> >+ bool ret;
> > mark_page_dirty(vcpu->kvm, table_gfn);
> >- pte |= PT_DIRTY_MASK;
> >- kvm_write_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
> >+ ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
> >+ pte|PT_DIRTY_MASK);
> >+ if (ret)
> >+ goto walk;
> >+
>
> Again we lose a bit in pte. That ends up in walker->pte and is quite
> important.
Oops!
Updated patch:
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
index b24bc7c..2e54b02 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/drivers/kvm/paging_tmpl.h
@@ -34,7 +34,9 @@
#define PT_LEVEL_BITS PT64_LEVEL_BITS
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS 4
+ #define CMPXCHG cmpxchg
#else
+ #define CMPXCHG cmpxchg64
#define PT_MAX_FULL_LEVELS 2
#endif
#elif PTTYPE == 32
@@ -48,6 +50,7 @@
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
+ #define CMPXCHG cmpxchg
#else
#error Invalid PTTYPE value
#endif
@@ -78,6 +81,26 @@ static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
}
+static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
+ gfn_t table_gfn, unsigned index,
+ pt_element_t orig_pte, pt_element_t new_pte)
+{
+ pt_element_t ret;
+ pt_element_t *table;
+ struct page *page;
+
+ page = gfn_to_page(kvm, table_gfn);
+ table = kmap_atomic(page, KM_USER0);
+
+ ret = CMPXCHG(&table[index], orig_pte, new_pte);
+
+ kunmap_atomic(page, KM_USER0);
+
+ kvm_release_page_dirty(page);
+
+ return (ret != orig_pte);
+}
+
/*
* Fetch a guest pte for a guest virtual address
*/
@@ -91,6 +114,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
gpa_t pte_gpa;
pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
+walk:
walker->level = vcpu->mmu.root_level;
pte = vcpu->cr3;
#if PTTYPE == 64
@@ -135,8 +159,10 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
if (!(pte & PT_ACCESSED_MASK)) {
mark_page_dirty(vcpu->kvm, table_gfn);
+ if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
+ index, pte, pte|PT_ACCESSED_MASK))
+ goto walk;
pte |= PT_ACCESSED_MASK;
- kvm_write_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
}
if (walker->level == PT_PAGE_TABLE_LEVEL) {
@@ -159,9 +185,13 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
}
if (write_fault && !is_dirty_pte(pte)) {
+ bool ret;
mark_page_dirty(vcpu->kvm, table_gfn);
+ ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
+ pte|PT_DIRTY_MASK);
+ if (ret)
+ goto walk;
pte |= PT_DIRTY_MASK;
- kvm_write_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
}
-------------------------------------------------------------------------
SF.Net email is sponsored by:
Check out the new SourceForge.net Marketplace.
It's the best place to buy or sell services for
just about anything Open Source.
http://sourceforge.net/services/buy/index.php
_______________________________________________
kvm-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/kvm-devel