Re: [PATCH 1/5] KVM: MMU: Push clean gpte write protection out of gpte_access()

2012-09-13 Thread Xiao Guangrong
On 09/12/2012 10:29 PM, Avi Kivity wrote:
> gpte_access() computes the access permissions of a guest pte and also
> write-protects clean gptes.  This is wrong when we are servicing a
> write fault (since we'll be setting the dirty bit momentarily) but
> correct when instantiating a speculative spte, or when servicing a
> read fault (since we'll want to trap a following write in order to
> set the dirty bit).
> 
> It doesn't seem to hurt in practice, but in order to make the code

In the current code, it seems we take two #PFs when the guest writes memory
through a clean pte: the first fault marks the dirty bit, then we fault again
and set the W bit.
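
To make that concrete, here is a toy, self-contained model of the pre-patch
flow described above (old_gpte_access(), handle_write_fault() and the
constants are simplified stand-ins, not the real KVM walker):

#include <stdbool.h>
#include <stdio.h>

#define PT_WRITABLE_MASK (1u << 1)
#define PT_DIRTY_MASK    (1u << 6)
#define ACC_WRITE_MASK   PT_WRITABLE_MASK

/* Pre-patch behaviour: a clean leaf gpte loses its write permission. */
static unsigned old_gpte_access(unsigned gpte, bool last)
{
	unsigned access = gpte & PT_WRITABLE_MASK;

	if (last && !(gpte & PT_DIRTY_MASK))
		access &= ~ACC_WRITE_MASK;
	return access;
}

/*
 * The walker computes the access bits first and only then sets the dirty
 * bit, so the spte installed for this fault is still read-only.
 */
static void handle_write_fault(unsigned *gpte, unsigned *spte)
{
	unsigned access = old_gpte_access(*gpte, true);

	*gpte |= PT_DIRTY_MASK;
	*spte = access;
}

int main(void)
{
	unsigned gpte = PT_WRITABLE_MASK;	/* writable but clean */
	unsigned spte = 0;
	int faults = 0;

	/* The guest's write keeps faulting until the spte allows it. */
	while (!(spte & ACC_WRITE_MASK)) {
		faults++;
		handle_write_fault(&gpte, &spte);
	}
	printf("write faults taken: %d\n", faults);	/* prints 2 */
	return 0;
}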

> readable, push the write protection out of gpte_access() and into
> a new protect_clean_gpte() which is called explicitly when needed.

Reviewed-by: Xiao Guangrong 



[PATCH 1/5] KVM: MMU: Push clean gpte write protection out of gpte_access()

2012-09-12 Thread Avi Kivity
gpte_access() computes the access permissions of a guest pte and also
write-protects clean gptes.  This is wrong when we are servicing a
write fault (since we'll be setting the dirty bit momentarily) but
correct when instantiating a speculative spte, or when servicing a
read fault (since we'll want to trap a following write in order to
set the dirty bit).

It doesn't seem to hurt in practice, but in order to make the code
readable, push the write protection out of gpte_access() and into
a new protect_clean_gpte() which is called explicitly when needed.

Signed-off-by: Avi Kivity 
---
 arch/x86/kvm/mmu.c | 12 
 arch/x86/kvm/mmu.h |  3 ++-
 arch/x86/kvm/paging_tmpl.h | 24 
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aa0b469..54c9cb4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3408,6 +3408,18 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
 }
 
+static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
+{
+   unsigned mask;
+
+   BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+   mask = (unsigned)~ACC_WRITE_MASK;
+   /* Allow write access to dirty gptes */
+   mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
+   *access &= mask;
+}
+
 static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
   int *nr_present)
 {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e374db9..2832081 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -18,7 +18,8 @@
 #define PT_PCD_MASK (1ULL << 4)
 #define PT_ACCESSED_SHIFT 5
 #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
-#define PT_DIRTY_MASK (1ULL << 6)
+#define PT_DIRTY_SHIFT 6
+#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
 #define PT_PAGE_SIZE_MASK (1ULL << 7)
 #define PT_PAT_MASK (1ULL << 7)
 #define PT_GLOBAL_MASK (1ULL << 8)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index bf8c42b..bf7b4ff 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -101,14 +101,11 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return (ret != orig_pte);
 }
 
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
-  bool last)
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
 {
unsigned access;
 
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-   if (last && !is_dirty_gpte(gpte))
-   access &= ~ACC_WRITE_MASK;
 
 #if PTTYPE == 64
if (vcpu->arch.mmu.nx)
@@ -222,8 +219,7 @@ retry_walk:
 
last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
if (last_gpte) {
-   pte_access = pt_access &
-FNAME(gpte_access)(vcpu, pte, true);
+   pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
/* check if the kernel is fetching from user page */
if (unlikely(pte_access & PT_USER_MASK) &&
kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
@@ -274,7 +270,7 @@ retry_walk:
break;
}
 
-   pt_access &= FNAME(gpte_access)(vcpu, pte, false);
+   pt_access &= FNAME(gpte_access)(vcpu, pte);
--walker->level;
}
 
@@ -283,7 +279,9 @@ retry_walk:
goto error;
}
 
-   if (write_fault && unlikely(!is_dirty_gpte(pte))) {
+   if (!write_fault)
+   protect_clean_gpte(&pte_access, pte);
+   else if (unlikely(!is_dirty_gpte(pte))) {
int ret;
 
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
@@ -368,7 +366,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;
 
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-   pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
+   pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+   protect_clean_gpte(&pte_access, gpte);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
if (mmu_invalid_pfn(pfn))
return;
@@ -441,8 +440,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
continue;
 
-   pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
- true);
+   pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+   protect_clean_gpte(&pte_access, gpte);
gfn = gpt
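
For anyone puzzling over the mask arithmetic in protect_clean_gpte() above:
shifting the gpte right by (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT) = 5 moves the
dirty bit (bit 6) into the writable-bit position (bit 1), so ACC_WRITE_MASK
survives the final AND only for dirty gptes; that is the equality the
BUILD_BUG_ON guards. A small standalone demo, with the constants redefined
locally (simplified from the patch, not part of it):

#include <stdio.h>

#define PT_WRITABLE_SHIFT 1
#define PT_WRITABLE_MASK  (1u << PT_WRITABLE_SHIFT)
#define PT_DIRTY_SHIFT    6
#define PT_DIRTY_MASK     (1u << PT_DIRTY_SHIFT)
#define ACC_WRITE_MASK    PT_WRITABLE_MASK	/* the equality the BUILD_BUG_ON checks */

static void protect_clean_gpte(unsigned *access, unsigned gpte)
{
	unsigned mask = (unsigned)~ACC_WRITE_MASK;

	/* Allow write access to dirty gptes */
	mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;
	*access &= mask;
}

int main(void)
{
	unsigned clean = PT_WRITABLE_MASK;			/* writable, dirty bit clear */
	unsigned dirty = PT_WRITABLE_MASK | PT_DIRTY_MASK;	/* writable and dirty */
	unsigned a_clean = ACC_WRITE_MASK, a_dirty = ACC_WRITE_MASK;

	protect_clean_gpte(&a_clean, clean);
	protect_clean_gpte(&a_dirty, dirty);

	printf("clean gpte keeps write: %d\n", !!(a_clean & ACC_WRITE_MASK));	/* 0 */
	printf("dirty gpte keeps write: %d\n", !!(a_dirty & ACC_WRITE_MASK));	/* 1 */
	return 0;
}

Compiled with a plain cc, this prints 0 for the clean gpte and 1 for the
dirty one, which is exactly the write protection the helper applies.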
