[PATCH v3 2/9] KVM: MMU: abstract spte write-protect

2012-04-20 Thread Xiao Guangrong
Introduce a common function to abstract spte write-protection in order
to clean up the code

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/mmu.c |   60 ++-
 1 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4a3cc18..e70ff38 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1041,6 +1041,34 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
 }

+/* Return true if the spte is dropped. */
+static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
+  bool *flush)
+{
+   u64 spte = *sptep;
+
+   if (!is_writable_pte(spte))
+   return false;
+
+   *flush |= true;
+
+   if (large) {
+   pgprintk("rmap_write_protect(large): spte %p %llx\n",
+sptep, *sptep);
+   BUG_ON(!is_large_pte(spte));
+
+   drop_spte(kvm, sptep);
+   --kvm->stat.lpages;
+   return true;
+   }
+
+   rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+   spte = spte & ~PT_WRITABLE_MASK;
+   mmu_spte_update(sptep, spte);
+
+   return false;
+}
+
 static bool
 __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 {
@@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)

for (sptep = rmap_get_first(*rmapp, iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
-   rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
-
-   if (!is_writable_pte(*sptep)) {
-   sptep = rmap_get_next(iter);
-   continue;
-   }
-
-   if (level == PT_PAGE_TABLE_LEVEL) {
-   mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
-   sptep = rmap_get_next(iter);
-   } else {
-   BUG_ON(!is_large_pte(*sptep));
-   drop_spte(kvm, sptep);
-   --kvm->stat.lpages;
+   if (spte_write_protect(kvm, sptep, level > PT_PAGE_TABLE_LEVEL,
+ &write_protected)) {
sptep = rmap_get_first(*rmapp, iter);
+   continue;
}

-   write_protected = true;
+   sptep = rmap_get_next(iter);
}

return write_protected;
@@ -3902,6 +3919,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
struct kvm_mmu_page *sp;
+   bool flush = false;

list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
int i;
@@ -3916,16 +3934,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
  !is_last_spte(pt[i], sp->role.level))
continue;

-   if (is_large_pte(pt[i])) {
-   drop_spte(kvm, pt[i]);
-   --kvm->stat.lpages;
-   continue;
-   }
-
-   /* avoid RMW */
-   if (is_writable_pte(pt[i]))
-   mmu_spte_update(&pt[i],
-   pt[i] & ~PT_WRITABLE_MASK);
+   spte_write_protect(kvm, &pt[i],
+  is_large_pte(pt[i]), &flush);
}
}
kvm_flush_remote_tlbs(kvm);
-- 
1.7.7.6



[PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit

2012-04-20 Thread Xiao Guangrong
This bit indicates whether the spte is allowed to be writable: it means
the gpte of this spte is writable and the pfn pointed to by this spte is
writable on the host

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/mmu.c |   13 ++---
 1 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e70ff38..dd984b6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
 #define CREATE_TRACE_POINTS
 #include mmutrace.h

-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_ALLOW_WRITE    (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

@@ -1177,9 +1178,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
new_spte |= (u64)new_pfn << PAGE_SHIFT;

-   new_spte &= ~PT_WRITABLE_MASK;
-   new_spte &= ~SPTE_HOST_WRITEABLE;
-   new_spte &= ~shadow_accessed_mask;
+   new_spte &= ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE |
+ shadow_accessed_mask | SPTE_ALLOW_WRITE);

mmu_spte_clear_track_bits(sptep);
mmu_spte_set(sptep, new_spte);
@@ -2316,7 +2316,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
goto done;
}

-   spte |= PT_WRITABLE_MASK;
+   spte |= PT_WRITABLE_MASK | SPTE_ALLOW_WRITE;

if (!vcpu->arch.mmu.direct_map
 && !(pte_access & ACC_WRITE_MASK)) {
@@ -2345,8 +2345,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 __func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
-   if (is_writable_pte(spte))
-   spte &= ~PT_WRITABLE_MASK;
+   spte &= ~PT_WRITABLE_MASK;
}
}

-- 
1.7.7.6



[PATCH v3 7/9] KVM: MMU: trace fast page fault

2012-04-20 Thread Xiao Guangrong
To see what happens on this path and help us optimize it

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/mmu.c  |2 ++
 arch/x86/kvm/mmutrace.h |   41 +
 2 files changed, 43 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7aea156..ac9c285 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2830,6 +2830,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
spte, gfn);

 exit:
+   trace_fast_page_fault(vcpu, gva, gfn, error_code, iterator.sptep,
+ spte, ret);
walk_shadow_page_lockless_end(vcpu);

return ret;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 89fb0e8..84da94f 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -243,6 +243,47 @@ TRACE_EVENT(
TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
  __entry->access)
 );
+
+#define __spte_satisfied(__spte)   \
+   (__entry->retry && is_writable_pte(__entry->__spte))
+
+TRACE_EVENT(
+   fast_page_fault,
+   TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, u32 error_code,
+u64 *sptep, u64 old_spte, bool retry),
+   TP_ARGS(vcpu, gva, gfn, error_code, sptep, old_spte, retry),
+
+   TP_STRUCT__entry(
+   __field(int, vcpu_id)
+   __field(gva_t, gva)
+   __field(gfn_t, gfn)
+   __field(u32, error_code)
+   __field(u64 *, sptep)
+   __field(u64, old_spte)
+   __field(u64, new_spte)
+   __field(bool, retry)
+   ),
+
+   TP_fast_assign(
+   __entry->vcpu_id = vcpu->vcpu_id;
+   __entry->gva = gva;
+   __entry->gfn = gfn;
+   __entry->error_code = error_code;
+   __entry->sptep = sptep;
+   __entry->old_spte = old_spte;
+   __entry->new_spte = *sptep;
+   __entry->retry = retry;
+   ),
+
+   TP_printk("vcpu %d gva %lx gfn %llx error_code %s sptep %p "
+ "old %#llx new %llx spurious %d fixed %d", __entry->vcpu_id,
+ __entry->gva, __entry->gfn,
+ __print_flags(__entry->error_code, "|",
+   kvm_mmu_trace_pferr_flags),
+ __entry->sptep, __entry->old_spte, __entry->new_spte,
+ __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+   )
+);
 #endif /* _TRACE_KVMMMU_H */

 #undef TRACE_INCLUDE_PATH
-- 
1.7.7.6



[PATCH v3 0/9] KVM: MMU: fast page fault

2012-04-20 Thread Xiao Guangrong
Changelog:

- split the long series for better review. This is the core
  implementation of fast page fault.
- document fast page fault in locking.txt



[PATCH v3 1/9] KVM: MMU: return bool in __rmap_write_protect

2012-04-20 Thread Xiao Guangrong
The return value of __rmap_write_protect is either 1 or 0; use
true/false instead.

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/mmu.c |   13 +++--
 1 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 29ad6f9..4a3cc18 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1041,11 +1041,12 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
 }

-static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+static bool
+__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 {
u64 *sptep;
struct rmap_iterator iter;
-   int write_protected = 0;
+   bool write_protected = false;

for (sptep = rmap_get_first(*rmapp, iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
@@ -1066,7 +1067,7 @@ static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level
sptep = rmap_get_first(*rmapp, iter);
}

-   write_protected = 1;
+   write_protected = true;
}

return write_protected;
@@ -1097,12 +1098,12 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
}
 }

-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
struct kvm_memory_slot *slot;
unsigned long *rmapp;
int i;
-   int write_protected = 0;
+   bool write_protected = false;

slot = gfn_to_memslot(kvm, gfn);

@@ -1691,7 +1692,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,

kvm_mmu_pages_init(parent, parents, pages);
while (mmu_unsync_walk(parent, pages)) {
-   int protected = 0;
+   bool protected = false;

for_each_sp(pages, sp, parents, i)
protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
-- 
1.7.7.6



[PATCH v3 6/9] KVM: MMU: fast path of handling guest page fault

2012-04-20 Thread Xiao Guangrong
If the present bit of the page fault error code is set, it indicates that
the shadow page table is populated on all levels; it means all we need to
do is modify the access bit, which can be done out of mmu-lock

Currently, in order to simplify the code, we only fix page faults caused
by write-protection on the fast path

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/mmu.c |  197 
 arch/x86/kvm/paging_tmpl.h |3 +
 2 files changed, 184 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index eb02fc4..7aea156 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -446,6 +446,13 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 }
 #endif

+static bool spte_wp_by_dirty_log(u64 spte)
+{
+   WARN_ON(is_writable_pte(spte));
+
+   return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
+}
+
 static bool spte_has_volatile_bits(u64 spte)
 {
if (!shadow_accessed_mask)
@@ -454,9 +461,18 @@ static bool spte_has_volatile_bits(u64 spte)
if (!is_shadow_present_pte(spte))
return false;

-   if ((spte & shadow_accessed_mask) &&
- (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
-   return false;
+   if (spte & shadow_accessed_mask) {
+   if (is_writable_pte(spte))
+   return !(spte & shadow_dirty_mask);
+
+   /*
+* If the spte is write-protected by dirty-log, it may
+* be marked writable on fast page fault path, then CPU
+* can modify the Dirty bit.
+*/
+   if (!spte_wp_by_dirty_log(spte))
+   return false;
+   }

return true;
 }
@@ -1043,13 +1059,6 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
 }

-static bool spte_wp_by_dirty_log(u64 spte)
-{
-   WARN_ON(is_writable_pte(spte));
-
-   return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
-}
-
 /* Return true if the spte is dropped. */
 static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
   bool *flush, bool page_table_protect)
@@ -1057,13 +1066,12 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
u64 spte = *sptep;

if (is_writable_pte(spte)) {
-   *flush |= true;
-
if (large) {
pgprintk("rmap_write_protect(large): spte %p %llx\n",
 sptep, *sptep);
BUG_ON(!is_large_pte(spte));

+   *flush |= true;
drop_spte(kvm, sptep);
--kvm->stat.lpages;
return true;
@@ -1072,6 +1080,9 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
goto reset_spte;
}

+   /* We need to flush tlbs in this case: the fast page fault path
+* can mark the spte writable after we read the sptep.
+*/
if (page_table_protect && spte_wp_by_dirty_log(spte))
goto reset_spte;

@@ -1079,6 +1090,8 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,

 reset_spte:
rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+
+   *flush |= true;
spte = spte & ~PT_WRITABLE_MASK;
if (page_table_protect)
spte |= SPTE_WRITE_PROTECT;
@@ -2676,18 +2689,164 @@ exit:
return ret;
 }

+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn,
+  u32 error_code)
+{
+   /*
+* #PF can be fast only if the shadow page table is present and it
+* is caused by write-protect, that means we just need change the
+* W bit of the spte which can be done out of mmu-lock.
+*/
+   if (!(error_code & PFERR_PRESENT_MASK) ||
+ !(error_code & PFERR_WRITE_MASK))
+   return false;
+
+   return true;
+}
+
+static bool
+fast_pf_fix_indirect_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *sptep, u64 spte, gfn_t gfn)
+{
+   pfn_t pfn;
+   bool ret = false;
+
+   /*
+* For the indirect spte, it is hard to get a stable gfn since
+* we just use a cmpxchg to avoid all the races which is not
+* enough to avoid the ABA problem: the host can arbitrarily
+* change spte and the mapping from gfn to pfn.
+*
+* What we do is call gfn_to_pfn_atomic to bind the gfn and the
+* pfn because after the call:
+* - we have held the refcount of pfn that means the pfn can not
+*   be freed and be reused for another gfn.
+* - the pfn is writable that means it can not be shared by different
+*   gfn.
+*/
+   pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+
+   /* The host page is 

[PATCH v3 3/9] KVM: VMX: export PFEC.P bit on ept

2012-04-20 Thread Xiao Guangrong
Export the present bit of the page fault error code; a later patch
will use it

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/vmx.c |9 -
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 52f6856..2c98057 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4735,6 +4735,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 {
unsigned long exit_qualification;
gpa_t gpa;
+   u32 error_code;
int gla_validity;

exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4759,7 +4760,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)

gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
-   return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
+
+   /* It is a write fault? */
+   error_code = exit_qualification & (1U << 1);
+   /* ept page table is present? */
+   error_code |= (exit_qualification >> 3) & 0x1;
+
+   return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }

 static u64 ept_rsvd_mask(u64 spte, int level)
-- 
1.7.7.6



[PATCH v3 9/9] KVM: MMU: document mmu-lock and fast page fault

2012-04-20 Thread Xiao Guangrong
Document fast page fault and mmu-lock in locking.txt

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 Documentation/virtual/kvm/locking.txt |  148 -
 1 files changed, 147 insertions(+), 1 deletions(-)

diff --git a/Documentation/virtual/kvm/locking.txt 
b/Documentation/virtual/kvm/locking.txt
index 3b4cd3b..089d462 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -6,7 +6,147 @@ KVM Lock Overview

 (to be written)

-2. Reference
+3: Exception
+
+
+Fast page fault:
+
+Fast page fault is the fast path which fixes the guest page fault out of
+the mmu-lock on x86. Currently, the page fault can be fast only if the
+shadow page table is present and it is caused by write-protection, which means
+we just need to change the W bit of the spte.
+
+What we use to avoid all the races is the SPTE_ALLOW_WRITE bit and the
+SPTE_WRITE_PROTECT bit on the spte:
+- SPTE_ALLOW_WRITE means the gfn is writable on both guest and host.
+- SPTE_WRITE_PROTECT means the gfn is write-protected due to shadow page
+  write protection.
+
+On the fast page fault path, we will use cmpxchg to atomically set the spte W
+bit if spte.SPTE_ALLOW_WRITE = 1 and spte.SPTE_WRITE_PROTECT = 0; this is
+safe because any change to these bits can be detected by the cmpxchg.
+
+But we need to carefully check these cases:
+1): The mapping from gfn to pfn
+
+The mapping from gfn to pfn may be changed since we can only ensure the pfn
+is not changed during the cmpxchg. This is an ABA problem; for example, the
+case below will happen:
+
+At the beginning:
+gpte = gfn1
+gfn1 is mapped to pfn1 on host
+spte is the shadow page table entry corresponding with gpte and
+spte = pfn1
+
+   VCPU 0   VCPU 1
+on fast page fault path:
+
+   old_spte = *spte;
+ pfn1 is swapped out:
+spte = 0;
+
+ pfn1 is re-alloced for gfn2.
+
+ gpte is changed to point to
+ gfn2 by the guest:
+spte = pfn1;
+
+   if (cmpxchg(spte, old_spte, old_spte+W)
+   mark_page_dirty(vcpu->kvm, gfn1)
+ OOPS!!!
+
+We dirty-log for gfn1, which means gfn2 is lost from the dirty-bitmap.
+
+For direct sp, we can easily avoid it since the spte of a direct sp is fixed
+to the gfn. For indirect sp, before we do the cmpxchg, we call gfn_to_pfn_atomic()
+to pin gfn to pfn, because after gfn_to_pfn_atomic():
+- We have held the refcount of the pfn, which means the pfn can not be freed
+  and reused for another gfn.
+- The pfn is writable, which means it can not be shared between different gfns
+  by KSM.
+
+Then, we can ensure the dirty bitmap is correctly set for a gfn.
+
+2): Flush tlbs due to shadow page table write-protection
+
+In rmap_write_protect(), we always need to flush tlbs if spte.SPTE_ALLOW_WRITE = 1,
+even if the current spte is read-only. The reason is that the fast page fault path
+may mark the spte writable, and the writable spte will be cached into the tlb.
+Consider the case below:
+
+At the beginning:
+spte.W = 0
+spte.SPTE_WRITE_PROTECT = 0
+spte.SPTE_ALLOW_WRITE = 1
+
+   VCPU 0  VCPU 1
+In rmap_write_protect():
+
+   flush = false;
+
+   if (spte.W == 1)
+  flush = true;
+
+   On fast page fault path:
+  old_spte = *spte
+  cmpxchg(spte, old_spte, old_spte + W)
+
+   the spte is fetched/prefetched into
+   tlb by CPU
+
+   spte = (spte | SPTE_WRITE_PROTECT) &
+  ~PT_WRITABLE_MASK;
+
+   if (flush)
+ kvm_flush_remote_tlbs(vcpu->kvm)
+   OOPS!!!
+
+The tlbs are not flushed since the spte is read-only, but an invalid writable
+spte has been cached in the tlbs by the fast page fault path.
+
+3): Dirty bit tracking
+In the original code, the spte can be fast-updated (non-atomically) if the
+spte is read-only and the Accessed bit has already been set, since the
+Accessed bit and Dirty bit can not be lost.
+
+But this is not true after fast page fault, since the spte can be marked
+writable between reading the spte and updating the spte. Consider the case below:
+
+At the beginning:
+spte.W = 0
+spte.Accessed = 1
+
+   VCPU 0   VCPU 1
+In mmu_spte_clear_track_bits():
+
+   old_spte = *spte;
+
+   /* 'if' condition is satisfied. */
+   if (old_spte.Accessed == 1 &&
+old_spte.W == 0)
+  spte = 0ull;
+ on fast page fault path:
+ spte.W = 1
+ memory write on the spte:
+ spte.Dirty = 1
+
+
+   else
+  old_spte = xchg(spte, 0ull)
+
+
+   if (old_spte.Accessed == 1)
+  kvm_set_pfn_accessed(spte.pfn);
+   if (old_spte.Dirty == 

[PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit

2012-04-20 Thread Xiao Guangrong
If this bit is set, it means the W bit of the spte is cleared due
to shadow page table protection

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/mmu.c |   56 ++-
 1 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dd984b6..eb02fc4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);

#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_ALLOW_WRITE    (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+#define SPTE_WRITE_PROTECT  (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 2))

 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

@@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
 }

+static bool spte_wp_by_dirty_log(u64 spte)
+{
+   WARN_ON(is_writable_pte(spte));
+
+   return (spte & SPTE_ALLOW_WRITE) && !(spte & SPTE_WRITE_PROTECT);
+}
+
 /* Return true if the spte is dropped. */
 static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
-  bool *flush)
+  bool *flush, bool page_table_protect)
 {
u64 spte = *sptep;

-   if (!is_writable_pte(spte))
-   return false;
+   if (is_writable_pte(spte)) {
+   *flush |= true;

-   *flush |= true;
+   if (large) {
+   pgprintk("rmap_write_protect(large): spte %p %llx\n",
+sptep, *sptep);
+   BUG_ON(!is_large_pte(spte));

-   if (large) {
-   pgprintk(rmap_write_protect(large): spte %p %llx\n,
-spte, *spte);
-   BUG_ON(!is_large_pte(spte));
+   drop_spte(kvm, sptep);
+   --kvm->stat.lpages;
+   return true;
+   }

-   drop_spte(kvm, sptep);
-   --kvm-stat.lpages;
-   return true;
+   goto reset_spte;
}

+   if (page_table_protect && spte_wp_by_dirty_log(spte))
+   goto reset_spte;
+
+   return false;
+
+reset_spte:
rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
spte = spte & ~PT_WRITABLE_MASK;
+   if (page_table_protect)
+   spte |= SPTE_WRITE_PROTECT;
mmu_spte_update(sptep, spte);
-
return false;
 }

-static bool
-__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+int level, bool page_table_protect)
 {
u64 *sptep;
struct rmap_iterator iter;
@@ -1080,7 +1096,7 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
for (sptep = rmap_get_first(*rmapp, iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
if (spte_write_protect(kvm, sptep, level > PT_PAGE_TABLE_LEVEL,
- &write_protected)) {
+ &write_protected, page_table_protect)) {
sptep = rmap_get_first(*rmapp, iter);
continue;
}
@@ -1109,7 +1125,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,

while (mask) {
rmapp = slot-rmap[gfn_offset + __ffs(mask)];
-   __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
+   __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);

/* clear the first set bit */
mask = mask - 1;
@@ -1128,7 +1144,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
for (i = PT_PAGE_TABLE_LEVEL;
 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
rmapp = __gfn_to_rmap(gfn, i, slot);
-   write_protected |= __rmap_write_protect(kvm, rmapp, i);
+   write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
}

return write_protected;
@@ -1179,7 +1195,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_spte |= (u64)new_pfn << PAGE_SHIFT;

new_spte &= ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE |
- shadow_accessed_mask | SPTE_ALLOW_WRITE);
+ shadow_accessed_mask | SPTE_ALLOW_WRITE |
+ SPTE_WRITE_PROTECT);

mmu_spte_clear_track_bits(sptep);
mmu_spte_set(sptep, new_spte);
@@ -2346,6 +2363,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
spte &= ~PT_WRITABLE_MASK;
+   spte |= SPTE_WRITE_PROTECT;
}
}

@@ 

[PATCH v3 8/9] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint

2012-04-20 Thread Xiao Guangrong
The P bit of the page fault error code is missing from this tracepoint;
fix it by passing the full error code

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/mmutrace.h|7 +++
 arch/x86/kvm/paging_tmpl.h |3 +--
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 84da94f..e762d35 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -54,8 +54,8 @@
  */
 TRACE_EVENT(
kvm_mmu_pagetable_walk,
-   TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
-   TP_ARGS(addr, write_fault, user_fault, fetch_fault),
+   TP_PROTO(u64 addr, u32 pferr),
+   TP_ARGS(addr, pferr),

TP_STRUCT__entry(
__field(__u64, addr)
@@ -64,8 +64,7 @@ TRACE_EVENT(

TP_fast_assign(
__entry->addr = addr;
-   __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
-| (!!fetch_fault << 4);
+   __entry->pferr = pferr;
),

TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 80493fb..8e6aac9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
const int fetch_fault = access & PFERR_FETCH_MASK;
u16 errcode = 0;

-   trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
-fetch_fault);
+   trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
eperm = false;
walker-level = mmu-root_level;
-- 
1.7.7.6



Re: [PATCH 0/4] Export offsets of VMCS fields as note information for kdump

2012-04-20 Thread HATAYAMA Daisuke
From: Avi Kivity a...@redhat.com
Subject: Re: [PATCH 0/4] Export offsets of VMCS fields as note information for 
kdump
Date: Thu, 19 Apr 2012 15:08:00 +0300

 On 04/19/2012 03:01 PM, HATAYAMA Daisuke wrote:
  It would be not helpful for the qemu crash case you are concerned
  about. We want to use the guest state data to look into guest
 machine's image in the crashed qemu.
  
  Why?
  

 It seems natural to check the situation from guest machine's side when
 qemu crashes. Suppose a service is running on the guest machine, and
 then the qemu crash. Then, we may need to know the details of the
 progress of the service if it's important. What has been successfully
 done, and what has not yet.
 
 How can a service on the guest be related to a qemu crash?  And how
 would guest registers help?

I don't mean the service is related to qemu's crash. When qemu
crashes, then the guest machine also crashes (although it doesn't
notice anything). What I'm interested in here is guest machine side,
not qemu side. I want to debug guest machine, not qemu.

 
 You can extract the list of running processes from a qemu crash dump
 without looking at guest registers.  And most vcpus are running
 asynchronously to qemu, so their register contents is irrelevant (if a
 vcpu is running synchronously with qemu - it just exited to qemu and is
 waiting for a response - then you'd see the details in qemu's call stack).
 

Just as you say, we can refer to guest machine's memory without guest
registers.

The data in the vmcs we definitely need are RSP and RIP, which are not
saved anywhere in the kvm module. The two registers are needed for back
tracing to determine what processing is being done on the guest
machine at the time of the qemu crash.

There are other useful data in the vmcs, but even if we don't have them,
we can still do what we want to do, at some cost in usability.

For example, we want IA32_EFER.LMA to determine whether the guest machine
is in 32-bit or 64-bit mode. But since there are only two modes, we can
perhaps try them in order in an ad-hoc way.

Other control registers are also useful, but they are not as important
as the ones we definitely need.

Thanks.
HATAYAMA, Daisuke



Re: [Qemu-devel] [PATCH 2/2] Expose tsc deadline timer cpuid to guest

2012-04-20 Thread Jan Kiszka
On 2012-04-19 22:03, Eduardo Habkost wrote:
 Jan/Avi: ping?
 
 I would like to get this ABI detail clarified so it can be implemented
 the right way on Qemu and KVM.
 
 My proposal is to simply add tsc-deadline to the data returned by
 GET_SUPPORTED_CPUID, making KVM_CAP_TSC_DEADLINE_TIMER unnecessary.
 

IIRC, there were problems with this model to exclude that the feature
gets reported this way without ensuring that in-kernel irqchip is
actually activated. Please browse the discussions, it should be
documented there.

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux


Re: [PATCH v4] KVM: PPC Use clockevent multiplier and shifter for decrementer

2012-04-20 Thread Alexander Graf

On 18.04.2012, at 18:01, Bharat Bhushan wrote:

 The time for which the hrtimer is started for decrementer emulation is calculated
 using tb_ticks_per_usec, while the hrtimer uses the clockevent for DEC
 reprogramming (if needed), which calculates timebase ticks using the
 multiplier and shifter mechanism implemented within the clockevent layer. It was
 observed that this conversion (timebase->time->timebase) is not correct
 because the mechanisms are not consistent. In our setup it adds 2% jitter.
 
 With this patch the clockevent multiplier and shifter mechanism are used when
 starting the hrtimer for decrementer emulation. Now the jitter is < 0.5%.
 
 Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com

Thanks, applied to kvm-ppc-next with fixed commit message and fixed trailing 
whitespace :).


Alex



Re: [Qemu-devel] [RFC PATCH 6/9] pc: pass paravirt info for hotplug memory slots to BIOS

2012-04-20 Thread Igor Mammedov

On 04/19/2012 04:08 PM, Vasilis Liaskovitis wrote:

  The numa_fw_cfg paravirt interface is extended to include SRAT information for
  all hotplug-able memslots. There are 3 words for each hotplug-able memory 
slot,
  denoting start address, size and node proximity. nb_numa_nodes is set to 1 by
  default (not 0), so that we always pass srat info to SeaBIOS.

  This information is used by Seabios to build hotplug memory device objects at 
runtime.

  Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com
---
  hw/pc.c |   59 +--
  vl.c|4 +++-
  2 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/hw/pc.c b/hw/pc.c
index 67f0479..f1f550a 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -46,6 +46,7 @@
  #include "ui/qemu-spice.h"
  #include "memory.h"
  #include "exec-memory.h"
+#include "memslot.h"

  /* output Bochs bios info messages */
  //#define DEBUG_BIOS
@@ -592,12 +593,15 @@ int e820_add_entry(uint64_t address, uint64_t length, 
uint32_t type)
  return index;
  }

+static void bochs_bios_setup_hp_memslots(uint64_t *fw_cfg_slots);
+
  static void *bochs_bios_init(void)
  {
  void *fw_cfg;
  uint8_t *smbios_table;
  size_t smbios_len;
  uint64_t *numa_fw_cfg;
+uint64_t *hp_memslots_fw_cfg;
  int i, j;

  register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL);
@@ -630,28 +634,71 @@ static void *bochs_bios_init(void)
  fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, (uint8_t *)hpet_cfg,
   sizeof(struct hpet_fw_config));
  /* allocate memory for the NUMA channel: one (64bit) word for the number
- * of nodes, one word for each VCPU-node and one word for each node to
- * hold the amount of memory.
+ * of nodes, one word for the number of hotplug memory slots, one word
+ * for each VCPU-node, one word for each node to hold the amount of 
memory.
+ * Finally three words for each hotplug memory slot, denoting start 
address,
+ * size and node proximity.
   */
-numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8);
+numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_memslots) * 8);
  numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
+numa_fw_cfg[1] = cpu_to_le64(nb_hp_memslots);

this will break compatibility if a guest is migrated from old->new qemu:
on reboot it will then use the old bios, which expects numa_fw_cfg[1] to be
something else.
Could the memslots info be moved to the end of the existing interface?


+
  for (i = 0; i  max_cpus; i++) {
  for (j = 0; j  nb_numa_nodes; j++) {
   if (node_cpumask[j] & (1 << i)) {
-numa_fw_cfg[i + 1] = cpu_to_le64(j);
+numa_fw_cfg[i + 2] = cpu_to_le64(j);
  break;
  }
  }
  }
  for (i = 0; i  nb_numa_nodes; i++) {
-numa_fw_cfg[max_cpus + 1 + i] = cpu_to_le64(node_mem[i]);
+numa_fw_cfg[max_cpus + 2 + i] = cpu_to_le64(node_mem[i]);
  }
+
+hp_memslots_fw_cfg = numa_fw_cfg + 2 + max_cpus + nb_numa_nodes;
+if (nb_hp_memslots)
+bochs_bios_setup_hp_memslots(hp_memslots_fw_cfg);
+
  fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg,
- (1 + max_cpus + nb_numa_nodes) * 8);
+ (2 + max_cpus + nb_numa_nodes + 3 * nb_hp_memslots) * 8);

  return fw_cfg;
  }

+static void bochs_bios_setup_hp_memslots(uint64_t *fw_cfg_slots)
+{
+int i = 0;
+Error *err = NULL;
+DeviceState *dev;
+MemSlotState *slot;
+char *type;
+BusState *bus = sysbus_get_default();
+
+QTAILQ_FOREACH(dev, &bus->children, sibling) {
+type = object_property_get_str(OBJECT(dev), "type", &err);
+if (err) {
+error_free(err);
+fprintf(stderr, "error getting device type\n");
+exit(1);
+}
+
+if (!strcmp(type, "memslot")) {
+if (!dev->id) {
+error_free(err);
+fprintf(stderr, "error getting memslot device id\n");
+exit(1);
+}
+if (!strcmp(dev->id, "initialslot")) continue;
+slot = MEMSLOT(dev);
+fw_cfg_slots[3 * slot->idx] = cpu_to_le64(slot->start);
+fw_cfg_slots[3 * slot->idx + 1] = cpu_to_le64(slot->size);
+fw_cfg_slots[3 * slot->idx + 2] = cpu_to_le64(slot->node);
+i++;
+}
+}
+assert(i == nb_hp_memslots);
+}
+
  static long get_file_size(FILE *f)
  {
  long where, size;
diff --git a/vl.c b/vl.c
index ae91a8a..50df453 100644
--- a/vl.c
+++ b/vl.c
@@ -3428,8 +3428,10 @@ int main(int argc, char **argv, char **envp)

  register_savevm_live(NULL, ram, 0, 4, NULL, ram_save_live, NULL,
   ram_load, NULL);
+if (!nb_numa_nodes)
+nb_numa_nodes = 1;

-if (nb_numa_nodes > 0) {
+{
  int i;

   if (nb_numa_nodes > MAX_NODES) {


--
-
 Igor

Re: [Qemu-devel] [RFC PATCH 2/9][SeaBIOS] Implement acpi-dsdt functions for memory hotplug.

2012-04-20 Thread Igor Mammedov

On 04/19/2012 04:08 PM, Vasilis Liaskovitis wrote:

Extend the DSDT to include methods for handling memory hot-add and hot-remove
notifications and memory device status requests. These functions are called
from the memory device SSDT methods.

Eject has only been tested with level gpe event, but will be changed to edge gpe
event soon, according to recent master patch for other ACPI hotplug events.

Signed-off-by: Vasilis Liaskovitis vasilis.liaskovi...@profitbricks.com
---
  src/acpi-dsdt.dsl |   68 +++-
  1 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/src/acpi-dsdt.dsl b/src/acpi-dsdt.dsl
index 4bdc268..184daf0 100644
--- a/src/acpi-dsdt.dsl
+++ b/src/acpi-dsdt.dsl
@@ -709,9 +709,72 @@ DefinitionBlock (
  }
  Return(One)
  }
-}

+/* Objects filled in by run-time generated SSDT */
+External(MTFY, MethodObj)
+External(MEON, PkgObj)
+
+Method (CMST, 1, NotSerialized) {
+// _STA method - return ON status of memdevice
+// Local0 = MEON flag for this cpu
+Store(DerefOf(Index(MEON, Arg0)), Local0)
+If (Local0) { Return(0xF) } Else { Return(0x0) }
+}
+/* Memory eject notify method */
+OperationRegion(MEMJ, SystemIO, 0xaf40, 32)
+Field (MEMJ, ByteAcc, NoLock, Preserve)
+{
+MPE, 256
+}
+
+Method (MPEJ, 2, NotSerialized) {
+// _EJ0 method - eject callback
+Store(ShiftLeft(1,Arg0), MPE)
+Sleep(200)
+}

MPE is write-only and only one memslot is ejected at a time. Why is a 256-bit
field here, then?
Could we use just 1 byte and write the slot number into it, saving some io
address space this way?


+
+/* Memory hotplug notify method */
+OperationRegion(MEST, SystemIO, 0xaf20, 32)

It's more of a suggestion: move it a bit farther to allow maybe 1024 cpus in the
future.
That will prevent a compatibility headache if we decide to expand support to
more than 256 cpus.

Or even better, make this address configurable at run-time and build this var
along with the SSDT (converting along the way all other hard-coded io ports to
the same generic run-time interface). This wish is out of scope of this
patch-set, but what do you think about the idea?


+Field (MEST, ByteAcc, NoLock, Preserve)
+{
+MES, 256
+}
+
+Method(MESC, 0) {
+// Local5 = active memdevice bitmap
+Store (MES, Local5)
+// Local2 = last read byte from bitmap
+Store (Zero, Local2)
+// Local0 = memory device iterator
+Store (Zero, Local0)
+While (LLess(Local0, SizeOf(MEON))) {
+// Local1 = MEON flag for this memory device
+Store(DerefOf(Index(MEON, Local0)), Local1)
+If (And(Local0, 0x07)) {
+// Shift down previously read bitmap byte
+ShiftRight(Local2, 1, Local2)
+} Else {
+// Read next byte from memdevice bitmap
+Store(DerefOf(Index(Local5, ShiftRight(Local0, 3))), Local2)
+}
+// Local3 = active state for this memory device
+Store(And(Local2, 1), Local3)

+If (LNotEqual(Local1, Local3)) {
+// State change - update MEON with new state
+Store(Local3, Index(MEON, Local0))
+// Do MEM notify
+If (LEqual(Local3, 1)) {
+MTFY(Local0, 1)
+} Else {
+MTFY(Local0, 3)
+}
+}
+Increment(Local0)
+}
+Return(One)
+}
+}
  /
   * General purpose events
   /
@@ -732,7 +795,8 @@ DefinitionBlock (
  Return(\_SB.PRSC())
  }
  Method(_L03) {
-Return(0x01)
+// Memory hotplug event
+Return(\_SB.MESC())
  }
  Method(_L04) {
  Return(0x01)


--
-
 Igor


Re: [Qemu-devel] [RFC PATCH 2/9][SeaBIOS] Implement acpi-dsdt functions for memory hotplug.

2012-04-20 Thread Vasilis Liaskovitis
Hi,

On Fri, Apr 20, 2012 at 12:55:24PM +0200, Igor Mammedov wrote:
 +/* Memory eject notify method */
 +OperationRegion(MEMJ, SystemIO, 0xaf40, 32)
 +Field (MEMJ, ByteAcc, NoLock, Preserve)
 +{
 +MPE, 256
 +}
 +
 +Method (MPEJ, 2, NotSerialized) {
 +// _EJ0 method - eject callback
 +Store(ShiftLeft(1,Arg0), MPE)
 +Sleep(200)
 +}
 MPE is write only and only one memslot is ejected at a time. Why 256 
 bit-field is here then?
 Could we use just 1 byte and write a slot number into it and save some io 
 address space this way?

good point. This was implemented similarly to the hot-add/status register only
for symmetry, but you are right, since only one slot is ejected at a time, this
can be reduced to one byte and save space. I will update for the next version.

 
 +
 +/* Memory hotplug notify method */
 +OperationRegion(MEST, SystemIO, 0xaf20, 32)
 It's more a suggestion: move it a bit farther to allow maybe 1024 cpus in the 
 future.
 That will prevent compatibility a headache, if we decide to expand support to 
 more then
 256 cpus.

ok, I will move it to 0xaf80 or higher (so cpu-hotplug could be extended to at
least 1024 cpus)

 
 Or event better to make this address configurable in run-time and build this 
 var along
 with SSDT (converting along the way all other hard-coded io ports to the same 
 generic
 run-time interface). This wish is out of scope of this patch-set, but what
 do you think about the idea?

yes, that would give more flexibility and avoid more compatibility headaches.
As you say it's not a main issue for the series, but I can work on it as we 
start
converting hardcoded i/o ports to configurable properties.

thanks,

- Vasilis


Re: Networking performance on a KVM Host (with no guests)

2012-04-20 Thread Chegu Vinod

On 4/18/2012 10:43 PM, Gleb Natapov wrote:

On Thu, Apr 19, 2012 at 03:53:39AM +, Chegu Vinod wrote:

Hello,

Perhaps this query was answered in the past. If yes kindly point me to
the same.

We noticed differences in networking performance (measured via netperf
over a 10G  NIC) on an X86_64 server between the following two
configurations :

1) Server run as a KVM Host (but with no KVM guests created on it (or) no
extra bridges created other than the default virbr0 bridge)

vs.

2) The same server running the same version of Linux but without any of the
virtualization software installed on it.

Config #2 performed much better !
Is this expected ? What are the reasons behind this ?

Is there any way to gain back the loss in performance on the KVM host
without having to uninstall the virtualization software (i.e. in Config 1).




Just an observation   :

Removing the intel_iommu=on boot time parameter  in the Config 1 case 
seemed to help
gain back the networking performance on the KVM host... i.e. performed 
nearly the same as

in Config2.   Didn't have to change any of the VT settings in the BIOS...

Trying to understand why that's the case...   Any pointers ?

Vinod


--
Gleb.





Re: [RFC PATCH 0/9] ACPI memory hotplug

2012-04-20 Thread Vasilis Liaskovitis
On Thu, Apr 19, 2012 at 04:08:38PM +0200, Vasilis Liaskovitis wrote:
 
 series is based on uq/master for qemu-kvm, and master for seabios. Can be 
 found
 also at:
forgot to paste the repo links in the original coverletter, here they are if
someone wants them:

https://github.com/vliaskov/qemu-kvm/commits/memory-hotplug 
https://github.com/vliaskov/seabios/commits/memory-hotplug

thanks,

- Vasilis


Re: [Qemu-devel] [PATCH 2/2] Expose tsc deadline timer cpuid to guest

2012-04-20 Thread Eduardo Habkost
On Fri, Apr 20, 2012 at 12:12:38PM +0200, Jan Kiszka wrote:
 On 2012-04-19 22:03, Eduardo Habkost wrote:
  Jan/Avi: ping?
  
  I would like to get this ABI detail clarified so it can be implemented
  the right way on Qemu and KVM.
  
  My proposal is to simply add tsc-deadline to the data returned by
  GET_SUPPORTED_CPUID, making KVM_CAP_TSC_DEADLINE_TIMER unnecessary.
  
 
 IIRC, there were problems with this model to exclude that the feature
 gets reported this way without ensuring that in-kernel irqchip is
 actually activated. Please browse the discussions, it should be
 documented there.

The flag was never added to GET_SUPPORTED_CPUID on any of the git
repositories I have checked, and I don't see that being done anywhere on
my KVM mailing list archives, either. So I don't see how we could have
had issues with GET_SUPPORTED_CPUID, if it was never present on the
code.

What was present in the code before the fix was code that
unconditionally enabled the flag even if Qemu never asked for it, and
that really was wrong.

GET_SUPPORTED_CPUID doesn't enable any flag: it just tells userspace
that the hardware and KVM supports the feature (and, surprise, this is
exactly what KVM_CAP_TSC_DEADLINE_TIMER means too, isn't it?). It's up
to userspace to enable the CPUID bits according to user requirements and
userspace capabilities (in other words: only when userspace knows it is
safe because the in-kernel irqchip is enabled).
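
For illustration, a minimal userspace sketch of that flow (a hypothetical
standalone example, not Qemu's actual code): query KVM_GET_SUPPORTED_CPUID on
the /dev/kvm fd, then let userspace policy decide which reported bits to
actually expose to the guest via KVM_SET_CPUID2.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define MAX_ENTRIES 100

int main(void)
{
	struct kvm_cpuid2 *cpuid;
	int kvm = open("/dev/kvm", O_RDWR);

	if (kvm < 0) {
		perror("/dev/kvm");
		return 1;
	}

	cpuid = calloc(1, sizeof(*cpuid) +
			  MAX_ENTRIES * sizeof(struct kvm_cpuid_entry2));
	cpuid->nent = MAX_ENTRIES;

	if (ioctl(kvm, KVM_GET_SUPPORTED_CPUID, cpuid) < 0) {
		perror("KVM_GET_SUPPORTED_CPUID");
		return 1;
	}

	for (unsigned int i = 0; i < cpuid->nent; i++) {
		struct kvm_cpuid_entry2 *e = &cpuid->entries[i];

		/* CPUID.01H:ECX bit 24 is the TSC-deadline timer.  Userspace
		 * would only copy it into KVM_SET_CPUID2 when it knows the
		 * in-kernel irqchip (and its APIC timer) is in use. */
		if (e->function == 1)
			printf("leaf 1: tsc-deadline %s by KVM\n",
			       (e->ecx & (1u << 24)) ? "reported" : "not reported");
	}
	return 0;
}

The ioctl only reports capability; nothing is enabled until userspace passes
the chosen bits back with KVM_SET_CPUID2.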

If the above is not what GET_SUPPORTED_CPUID means, I would like to get
that clarified, so I can submit an updated to KVM API documentation.

-- 
Eduardo


Re: Question about host CPU usage/allocation by KVM

2012-04-20 Thread Brian Jackson
On Thu, 19 Apr 2012 13:01:54 -0500, Alexander Lyakas  
alex.bols...@gmail.com wrote:



Hi Stuart,
I have been doing some experiments, and I noticed that there are
additional QEMU threads, besides the ones reported by info cpus
command. In particular, the main QEMU thread (the one whose LWP is the
same as its PID), also consumes significant CPU time. Is this
expected?


The extra threads are for various things. It can be for the vnc server if  
you are using it. Threads are used to mimic aio in certain situations.  
Etc. The main thread also does a lot of the device emulation work  
(console, network, serial, block, etc.)





Alex.


On Wed, Apr 18, 2012 at 8:24 PM, Stuart Yoder b08...@gmail.com wrote:

On Tue, Apr 17, 2012 at 4:54 PM, Alexander Lyakas
alex.bols...@gmail.com wrote:

Greetings everybody,

Can anybody please point me to code/documentation regarding the
following questions I have:

- What does it actually mean using -smp N option, in terms of CPU
sharing between the host and the guest?
- How are guest CPUs mapped to host CPUs (if at all)?


Each guest CPU (vcpu) corresponds to a QEMU thread.
You can see the thread ids in QEMU with info cpus in the
QEMU monitor.

Since a vcpu is a thread you can apply standard Linux
mechanisms to managing those threads-- CPU affinity, etc.

Stuart
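
A minimal sketch of the affinity mechanism Stuart mentions (a hypothetical
standalone helper, not part of QEMU): take a vcpu thread id as reported by
"info cpus" and pin it to one host CPU with sched_setaffinity(), which
accepts a thread id directly.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
	cpu_set_t set;
	pid_t tid;
	int cpu;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <vcpu-tid> <host-cpu>\n", argv[0]);
		return 1;
	}
	tid = atoi(argv[1]);	/* thread id shown by "info cpus" */
	cpu = atoi(argv[2]);

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);

	/* sched_setaffinity() accepts a thread id, so this pins just the
	 * one vcpu thread, not the whole QEMU process. */
	if (sched_setaffinity(tid, sizeof(set), &set) < 0) {
		perror("sched_setaffinity");
		return 1;
	}
	return 0;
}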




Re: [Qemu-devel] [PATCH 2/2] Expose tsc deadline timer cpuid to guest

2012-04-20 Thread Jan Kiszka
On 2012-04-20 17:00, Eduardo Habkost wrote:
 On Fri, Apr 20, 2012 at 12:12:38PM +0200, Jan Kiszka wrote:
 On 2012-04-19 22:03, Eduardo Habkost wrote:
 Jan/Avi: ping?

 I would like to get this ABI detail clarified so it can be implemented
 the right way on Qemu and KVM.

 My proposal is to simply add tsc-deadline to the data returned by
 GET_SUPPORTED_CPUID, making KVM_CAP_TSC_DEADLINE_TIMER unnecessary.


 IIRC, there were problems with this model to exclude that the feature
 gets reported this way without ensuring that in-kernel irqchip is
 actually activated. Please browse the discussions, it should be
 documented there.
 
 The flag was never added to GET_SUPPORTED_CPUID on any of the git
 repositories I have checked, and I don't see that being done anywhere on
 my KVM mailing list archives, either. So I don't see how we could have
 had issues with GET_SUPPORTED_CPUID, if it was never present on the
 code.
 
 What was present on the code before the fix, was coded that
 unconditinally enabled the flag even if Qemu never asked for it, and
 that really was wrong.
 
 GET_SUPPORTED_CPUID doesn't enable any flag: it just tells userspace
 that the hardware and KVM supports the feature (and, surprise, this is
 exactly what KVM_CAP_TSC_DEADLINE_TIMER means too, isn't it?). It's up
 to userspace to enable the CPUID bits according to user requirements and
 userspace capabilities (in other words: only when userspace knows it is
 safe because the in-kernel irqchip is enabled).
 
 If the above is not what GET_SUPPORTED_CPUID means, I would like to get
 that clarified, so I can submit an updated to KVM API documentation.

Does old userspace filter out flags from GET_SUPPORTED_CPUID that it
does not understand?

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux


Re: [Qemu-devel] [PATCH 2/2] Expose tsc deadline timer cpuid to guest

2012-04-20 Thread Eduardo Habkost
On Fri, Apr 20, 2012 at 05:19:17PM +0200, Jan Kiszka wrote:
 On 2012-04-20 17:00, Eduardo Habkost wrote:
  On Fri, Apr 20, 2012 at 12:12:38PM +0200, Jan Kiszka wrote:
  On 2012-04-19 22:03, Eduardo Habkost wrote:
  Jan/Avi: ping?
 
  I would like to get this ABI detail clarified so it can be implemented
  the right way on Qemu and KVM.
 
  My proposal is to simply add tsc-deadline to the data returned by
  GET_SUPPORTED_CPUID, making KVM_CAP_TSC_DEADLINE_TIMER unnecessary.
 
 
  IIRC, there were problems with this model to exclude that the feature
  gets reported this way without ensuring that in-kernel irqchip is
  actually activated. Please browse the discussions, it should be
  documented there.
  
  The flag was never added to GET_SUPPORTED_CPUID on any of the git
  repositories I have checked, and I don't see that being done anywhere on
  my KVM mailing list archives, either. So I don't see how we could have
  had issues with GET_SUPPORTED_CPUID, if it was never present on the
  code.
  
  What was present on the code before the fix, was coded that
  unconditinally enabled the flag even if Qemu never asked for it, and
  that really was wrong.
  
  GET_SUPPORTED_CPUID doesn't enable any flag: it just tells userspace
  that the hardware and KVM supports the feature (and, surprise, this is
  exactly what KVM_CAP_TSC_DEADLINE_TIMER means too, isn't it?). It's up
  to userspace to enable the CPUID bits according to user requirements and
  userspace capabilities (in other words: only when userspace knows it is
  safe because the in-kernel irqchip is enabled).
  
  If the above is not what GET_SUPPORTED_CPUID means, I would like to get
  that clarified, so I can submit an updated to KVM API documentation.
 
 Does old userspace filter out flags from GET_SUPPORTED_CPUID that it
 does not understand?

It's even more strict than that: it only enables what was explicitly
enabled on the CPU model asked by the user.

But:

The only exception is -cpu host, that tries to enable everything, even
flags Qemu never heard of, and that is something that may require some
changes on the API design (or at least documentation clarifications).

Today -cpu host can't differentiate between (A) a feature that KVM supports
and emulates by itself, which can be enabled without any support from
userspace, and (B) a feature that KVM supports but which needs support from
userspace to be enabled. I am sure we will be able to find multiple
examples of (B) inside the flags returned by GET_SUPPORTED_CPUID today.

We could decide, from now on, to never add any new flag to GET_SUPPORTED_CPUID
if it requires additional userspace support to work, and create
new KVM_CAP_* flags for them instead. But do we really want to do that for
almost every new CPUID feature bit in the future?

We also have the problem of defining what "requires support from
userspace to work" means: I am not sure this has the same meaning for
everybody. Many new features require userspace support only for
migration, and nothing else, but some users don't need migration to
work. Do those features qualify as (A), or as (B)?

-- 
Eduardo


Re: [Qemu-devel] [RFC PATCH 6/9] pc: pass paravirt info for hotplug memory slots to BIOS

2012-04-20 Thread Vasilis Liaskovitis
On Fri, Apr 20, 2012 at 12:33:57PM +0200, Igor Mammedov wrote:
 On 04/19/2012 04:08 PM, Vasilis Liaskovitis wrote:
 -numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8);
 +numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * 
 nb_hp_memslots) * 8);
   numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
 +numa_fw_cfg[1] = cpu_to_le64(nb_hp_memslots);
 this will brake compatibility if guest was migrated from old-new qemu
 than on reboot it will use old bios that expects numa_fw_cfg[1] to be 
 something else.
 Could memslots info be moved to the end of an existing interface?

right. The number of memslots can be placed at 1 + max_cpus + nb_numa_nodes,
instead of right after the number of nodes. This way the old layout is 
preserved,
and all memslot info comes at the end. I will rewrite.
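
A rough standalone sketch of that reworked layout (all names below are
hypothetical placeholders, not the patch's code; endianness conversion is
omitted for brevity):

#include <stdint.h>
#include <stdio.h>

struct hp_memslot { uint64_t start, size, node; };

static size_t fill_numa_fw_cfg(uint64_t *fw, int max_cpus, int nb_numa_nodes,
			       const uint64_t *cpu_node, const uint64_t *node_mem,
			       int nb_hp_memslots, const struct hp_memslot *slot)
{
	size_t n = 0;
	int i;

	fw[n++] = nb_numa_nodes;		/* word 0: unchanged */
	for (i = 0; i < max_cpus; i++)
		fw[n++] = cpu_node[i];		/* words 1..max_cpus: unchanged */
	for (i = 0; i < nb_numa_nodes; i++)
		fw[n++] = node_mem[i];		/* per-node memory: unchanged */

	fw[n++] = nb_hp_memslots;		/* new data appended at the end, */
	for (i = 0; i < nb_hp_memslots; i++) {	/* so the old BIOS layout holds  */
		fw[n++] = slot[i].start;
		fw[n++] = slot[i].size;
		fw[n++] = slot[i].node;
	}
	return n;				/* number of 64-bit words used */
}

int main(void)
{
	uint64_t fw[32];
	uint64_t cpu_node[2] = { 0, 1 };
	uint64_t node_mem[2] = { 1ull << 30, 1ull << 30 };
	struct hp_memslot slot[1] = { { 4ull << 30, 1ull << 30, 0 } };

	printf("%zu words\n",
	       fill_numa_fw_cfg(fw, 2, 2, cpu_node, node_mem, 1, slot));
	return 0;
}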

thanks,
- Vasilis


Re: Networking performance on a KVM Host (with no guests)

2012-04-20 Thread Nadav Har'El
On Fri, Apr 20, 2012, Chegu Vinod wrote about Re: Networking performance on a 
KVM Host (with no guests):
 Removing the intel_iommu=on boot time parameter  in the Config 1
 case seemed to help

intel_iommu=on is essential when you're mostly running guests *and*
using device assignment.

However, unfortunately, it also has a serious performance penalty for
I/O in *host* processes:

When intel_iommu=on, Linux (completely unrelated to KVM) adds a new level
of protection which didn't exist without an IOMMU - the network card, which
without an IOMMU could write (via DMA) to any memory location, now is
not allowed to - the card can only write to memory locations which the OS
wanted it to write to. Theoretically, this can protect the OS against
various kinds of attacks. But what happens now is that every time
Linux passes a new buffer to the card, it needs to change the IOMMU
mappings. This noticeably slows down I/O, unfortunately.
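
To make the per-buffer cost concrete, here is an illustrative sketch (not
taken from any real driver) of the streaming-DMA pattern a network driver
follows for each transmitted buffer; with intel_iommu=on every map/unmap
pair also has to update the IOMMU page tables and flush the IOTLB, which is
where the extra host overhead comes from:

#include <linux/dma-mapping.h>
#include <linux/device.h>
#include <linux/errno.h>

/* Illustrative only: without an IOMMU the map/unmap is little more than
 * address arithmetic; with intel_iommu=on each pair also programs the
 * IOMMU page tables and flushes the IOTLB. */
static int example_xmit_one(struct device *dev, void *buf, size_t len)
{
	dma_addr_t handle;

	handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, handle))
		return -ENOMEM;

	/* ... hand 'handle' to the NIC, wait for TX completion ... */

	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
	return 0;
}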

-- 
Nadav Har'El|Friday, Apr 20 2012, 
n...@math.technion.ac.il |-
Phone +972-523-790466, ICQ 13349191 |A bird in the hand is safer than one
http://nadav.harel.org.il   |overhead.


Re: vhost-blk development

2012-04-20 Thread Michael Baysek
Ok, so is this it?  

https://lkml.org/lkml/2011/7/28/175

And what, once it's compiled, it intercepts the virtio-blk requests?  If 
not, how is it enabled in the kvm instance?

Best,

Mike



- Original Message -
From: Liu Yuan namei.u...@gmail.com
To: Michael Baysek mbay...@liquidweb.com
Cc: kvm@vger.kernel.org
Sent: Thursday, April 19, 2012 9:28:45 PM
Subject: Re: vhost-blk development

On 04/20/2012 04:26 AM, Michael Baysek wrote:

 Can you point me to the latest revision of the code and provide some 
 guidance on how to test it?  I really would love to see if it helps.


There is no latest revision, I didn't continue the development when I
saw the sign that it wouldn't be accepted.

Thanks,
Yuan


KVM: add kvm_arch_para_features stub to asm-generic/kvm_para.h

2012-04-20 Thread Marcelo Tosatti

Needed by kvm_para_has_feature().

Reported-by: Stephen Rothwell s...@canb.auug.org.au
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

diff --git a/include/asm-generic/kvm_para.h b/include/asm-generic/kvm_para.h
index 05ef7e7..9a7bbad 100644
--- a/include/asm-generic/kvm_para.h
+++ b/include/asm-generic/kvm_para.h
@@ -11,4 +11,9 @@ static inline bool kvm_check_and_clear_guest_paused(void)
return false;
 }
 
+static inline unsigned int kvm_arch_para_features(void)
+{
+   return 0;
+}
+
 #endif


Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages

2012-04-20 Thread Andrew Morton
On Fri, 13 Apr 2012 15:38:41 -0700
Ying Han ying...@google.com wrote:

 The mmu_shrink() is heavy by itself by iterating all kvms and holding
 the kvm_lock. spotted the code w/ Rik during LSF, and it turns out we
 don't need to call the shrinker if nothing to shrink.
 

We should probably tell the kvm maintainers about this ;)

 --- a/arch/x86/kvm/mmu.c
 +++ b/arch/x86/kvm/mmu.c
 @@ -188,6 +188,11 @@ static u64 __read_mostly shadow_mmio_mask;
  
  static void mmu_spte_set(u64 *sptep, u64 spte);
  
 +static inline int get_kvm_total_used_mmu_pages()
 +{
 + return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
 +}
 +
  void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
  {
   shadow_mmio_mask = mmio_mask;
 @@ -3900,6 +3905,9 @@ static int mmu_shrink(struct shrinker *shrink, struct 
 shrink_control *sc)
   if (nr_to_scan == 0)
   goto out;
  
 + if (!get_kvm_total_used_mmu_pages())
 + return 0;
 +
   raw_spin_lock(&kvm_lock);
  
   list_for_each_entry(kvm, vm_list, vm_list) {
 @@ -3926,7 +3934,7 @@ static int mmu_shrink(struct shrinker *shrink, struct 
 shrink_control *sc)
   raw_spin_unlock(&kvm_lock);
  
  out:
 - return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
 + return get_kvm_total_used_mmu_pages();
  }
  
  static struct shrinker mmu_shrinker = {

There's a small functional change: percpu_counter_read_positive() is
approximate, so there will be cases where some pages are accounted for
only in the percpu_counter's per-cpu accumulators.  In that case
mmu_shrink() will bail out when there are in fact some freeable pages
available.  This is hopefully unimportant.
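
To illustrate the approximation, here is a minimal stand-alone C sketch of a
batched per-cpu counter; the structure, the BATCH value and the function names
are made up for illustration and are not the kernel's percpu_counter code:

/*
 * Each CPU accumulates locally and only folds into the global count
 * when its local batch reaches BATCH, so a read of the global count
 * can miss recent additions.
 */
#include <stdio.h>

#define NR_CPUS 4
#define BATCH   32

struct approx_counter {
	long global;		/* folded total */
	long local[NR_CPUS];	/* per-cpu accumulators */
};

static void counter_add(struct approx_counter *c, int cpu, long n)
{
	c->local[cpu] += n;
	if (c->local[cpu] >= BATCH || c->local[cpu] <= -BATCH) {
		c->global += c->local[cpu];	/* fold only on batch overflow */
		c->local[cpu] = 0;
	}
}

/* approximate read: ignores whatever still sits in local[] */
static long counter_read_positive(struct approx_counter *c)
{
	return c->global > 0 ? c->global : 0;
}

int main(void)
{
	struct approx_counter c = { 0 };

	counter_add(&c, 0, 5);	/* stays below BATCH, not folded yet */
	printf("approx count = %ld (5 pages exist but read as 0)\n",
	       counter_read_positive(&c));
	return 0;
}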

Do we actually know that this patch helps anything?  Any measurements? Is
kvm_total_used_mmu_pages==0 at all common?



Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages

2012-04-20 Thread Rik van Riel

On 04/20/2012 06:11 PM, Andrew Morton wrote:

On Fri, 13 Apr 2012 15:38:41 -0700
Ying Hanying...@google.com  wrote:


The mmu_shrink() is heavy by itself by iterating all kvms and holding
the kvm_lock. spotted the code w/ Rik during LSF, and it turns out we
don't need to call the shrinker if nothing to shrink.



@@ -3900,6 +3905,9 @@ static int mmu_shrink(struct shrinker *shrink, struct 
shrink_control *sc)
if (nr_to_scan == 0)
goto out;

+   if (!get_kvm_total_used_mmu_pages())
+   return 0;
+



Do we actually know that this patch helps anything?  Any measurements? Is
kvm_total_used_mmu_pages==0 at all common?



On re-reading mmu.c, it looks like even with EPT or NPT,
we end up creating mmu pages for the nested page tables.

I have not had the time to look into it more, but it would
be nice to know if the patch has any effect at all.

--
All rights reversed


Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages

2012-04-20 Thread Ying Han
On Fri, Apr 20, 2012 at 3:53 PM, Rik van Riel r...@redhat.com wrote:
 On 04/20/2012 06:11 PM, Andrew Morton wrote:

 On Fri, 13 Apr 2012 15:38:41 -0700
 Ying Hanying...@google.com  wrote:

 The mmu_shrink() is heavy by itself by iterating all kvms and holding
 the kvm_lock. spotted the code w/ Rik during LSF, and it turns out we
 don't need to call the shrinker if nothing to shrink.


 @@ -3900,6 +3905,9 @@ static int mmu_shrink(struct shrinker *shrink,
 struct shrink_control *sc)
        if (nr_to_scan == 0)
                goto out;

 +       if (!get_kvm_total_used_mmu_pages())
 +               return 0;
 +


 Do we actually know that this patch helps anything?  Any measurements? Is
 kvm_total_used_mmu_pages==0 at all common?


 On re-reading mmu.c, it looks like even with EPT or NPT,
 we end up creating mmu pages for the nested page tables.

I think you are right here. So the patch doesn't help the real pain.

My understanding of the real pain is the poor implementation of the
mmu_shrinker. It iterates over all the kvms in the registered mmu_shrink
callback and only does a little work at a time while holding two big
locks. I learned from mikew@ (also ++cc-ed) that this is causing latency
spikes and unfairness among kvm instances in some of the experiments
we've seen.

Mike might tell more on that.

--Ying


 I have not had the time to look into it more, but it would
 be nice to know if the patch has any effect at all.

 --
 All rights reversed


RE: Question about host CPU usage/allocation by KVM

2012-04-20 Thread Shergill, Gurinder


| -Original Message-
| From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On
| Behalf Of Brian Jackson
| Sent: Friday, April 20, 2012 8:01 AM
| To: Stuart Yoder; Alexander Lyakas
| Cc: kvm@vger.kernel.org
| Subject: Re: Question about host CPU usage/allocation by KVM
| 
| On Thu, 19 Apr 2012 13:01:54 -0500, Alexander Lyakas
| alex.bols...@gmail.com wrote:
| 
|  Hi Stuart,
|  I have been doing some experiments, and I noticed that there are
|  additional QEMU threads, besides the ones reported by info cpus
|  command. In particular, the main QEMU thread (the one whose LWP is the
|  same as its PID), also consumes significant CPU time. Is this
|  expected?
| 
| The extra threads are for various things. It can be for the vnc server if
| you are using it. Threads are used to mimic aio in certain situations.
| Etc. The main thread also does a lot of the device emulation work
| (console, network, serial, block, etc.)

Is there some way to know what they are for, or how many can get created?

I noticed that the number of threads goes from ~4 to ~70 if I do some I/O inside
the guest.

Sunny

| 
| 
| 
|  Alex.
| 
| 
|  On Wed, Apr 18, 2012 at 8:24 PM, Stuart Yoder b08...@gmail.com wrote:
|  On Tue, Apr 17, 2012 at 4:54 PM, Alexander Lyakas
|  alex.bols...@gmail.com wrote:
|  Greetings everybody,
| 
|  Can anybody please point me to code/documentation regarding the
|  following questions I have:
| 
|  - What does it actually mean using -smp N option, in terms of CPU
|  sharing between the host and the guest?
|  - How are guest CPUs mapped to host CPUs (if at all)?
| 
|  Each guest CPU (vcpu) corresponds to a QEMU thread.
|  You can see the thread ids in QEMU with info cpus in the QEMU
|  monitor.
| 
|  Since a vcpu is a thread you can apply standard Linux mechanisms to
|  managing those threads-- CPU affinity, etc.
| 
|  Stuart
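
As a concrete illustration of Stuart's point above about applying standard
Linux mechanisms to vcpu threads, here is a minimal C sketch that pins one
thread to a host CPU with sched_setaffinity(); the thread id below is a
made-up example (the real ones come from info cpus in the QEMU monitor):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

int main(void)
{
	pid_t vcpu_tid = 12345;		/* hypothetical vcpu thread id */
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(2, &set);		/* allow host CPU 2 only */

	if (sched_setaffinity(vcpu_tid, sizeof(set), &set) != 0) {
		perror("sched_setaffinity");
		return 1;
	}
	printf("pinned tid %d to host cpu 2\n", (int)vcpu_tid);
	return 0;
}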


Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect

2012-04-20 Thread Marcelo Tosatti
On Fri, Apr 20, 2012 at 04:17:47PM +0800, Xiao Guangrong wrote:
 Introduce a common function to abstract spte write-protect to
 cleanup the code
 
 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 ---
  arch/x86/kvm/mmu.c |   60 ++-
  1 files changed, 35 insertions(+), 25 deletions(-)
 
 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
 index 4a3cc18..e70ff38 100644
 --- a/arch/x86/kvm/mmu.c
 +++ b/arch/x86/kvm/mmu.c
 @@ -1041,6 +1041,34 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
   rmap_remove(kvm, sptep);
  }
 
 +/* Return true if the spte is dropped. */
 +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large,
 +bool *flush)
 +{
 + u64 spte = *sptep;
 +
 + if (!is_writable_pte(spte))
 + return false;
 +
 + *flush |= true;
 +
 + if (large) {
 + pgprintk(rmap_write_protect(large): spte %p %llx\n,
 +  spte, *spte);
 + BUG_ON(!is_large_pte(spte));
 +
 + drop_spte(kvm, sptep);
 + --kvm-stat.lpages;
 + return true;
 + }
 +
 + rmap_printk(rmap_write_protect: spte %p %llx\n, spte, *spte);
 + spte = spte  ~PT_WRITABLE_MASK;
 + mmu_spte_update(sptep, spte);
 +
 + return false;
 +}
 +
  static bool
  __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
  {
 @@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned long 
 *rmapp, int level)
 
   for (sptep = rmap_get_first(*rmapp, iter); sptep;) {
   BUG_ON(!(*sptep  PT_PRESENT_MASK));
 - rmap_printk(rmap_write_protect: spte %p %llx\n, sptep, 
 *sptep);
 -
 - if (!is_writable_pte(*sptep)) {
 - sptep = rmap_get_next(iter);
 - continue;
 - }
 -
 - if (level == PT_PAGE_TABLE_LEVEL) {
 - mmu_spte_update(sptep, *sptep  ~PT_WRITABLE_MASK);
 - sptep = rmap_get_next(iter);
 - } else {
 - BUG_ON(!is_large_pte(*sptep));
 - drop_spte(kvm, sptep);
 - --kvm-stat.lpages;

It is preferable to remove all large sptes, including read-only ones (the
current behaviour), rather than to verify that no read-write transition can
occur in fault paths (fault paths which are increasing in number).



Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit

2012-04-20 Thread Marcelo Tosatti
On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
 If this bit is set, it means the W bit of the spte is cleared due
 to shadow page table protection
 
 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 ---
  arch/x86/kvm/mmu.c |   56 ++-
  1 files changed, 37 insertions(+), 19 deletions(-)
 
 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
 index dd984b6..eb02fc4 100644
 --- a/arch/x86/kvm/mmu.c
 +++ b/arch/x86/kvm/mmu.c
 @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);
 
  #define SPTE_HOST_WRITEABLE  (1ULL  PT_FIRST_AVAIL_BITS_SHIFT)
  #define SPTE_ALLOW_WRITE (1ULL  (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 +#define SPTE_WRITE_PROTECT   (1ULL  (PT_FIRST_AVAIL_BITS_SHIFT + 2))
 
  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
 @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
   rmap_remove(kvm, sptep);
  }
 
 +static bool spte_wp_by_dirty_log(u64 spte)
 +{
 + WARN_ON(is_writable_pte(spte));
 +
 + return (spte  SPTE_ALLOW_WRITE)  !(spte  SPTE_WRITE_PROTECT);
 +}

Is the information accurate? Say:

- dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
- shadow gfn, rmap_write_protect finds page not WRITABLE.
- spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.

BTW,

introduce SPTE_ALLOW_WRITE bit

This bit indicates whether the spte is allow to be writable that
means the gpte of this spte is writable and the pfn pointed by
this spte is writable on host

Other than the fact that each bit should have one meaning, how
can this bit be accurate without write protection of the gpte?

As soon as guest writes to gpte, information in bit is outdated.



Re: Networking performance on a KVM Host (with no guests)

2012-04-20 Thread Chegu Vinod
Nadav Har'El nyh at math.technion.ac.il writes:

 
 On Fri, Apr 20, 2012, Chegu Vinod wrote about Re: Networking performance on a KVM Host (with no guests):
  Removing the intel_iommu=on boot time parameter  in the Config 1
  case seemed to help
 
 intel_iommu=on is essential when you're mostly running guests *and*
 using device assignment.
 

Yes. I do understand that...

 However, unfortunately, it also has a serious performance penalty for
 I/O in *host* processes:
 
 When intel_iommu=on, Linux (completely unrelated to KVM) adds a new level
 of protection which didn't exist without an IOMMU - the network card, which
 without an IOMMU could write (via DMA) to any memory location, is now
 only allowed to write to the memory locations the OS wants it to write to.
 Theoretically, this can protect the OS against various kinds of attacks.
 But what happens now is that every time Linux passes a new buffer to the
 card, it needs to change the IOMMU mappings. This noticeably slows down
 I/O, unfortunately.
 

Hmm...  So if one were to set up a private link between two hosts to drive
some traffic between them, then we can't expect to get line rate
on that private NIC.

Thanks much for your response !
Vinod





Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit

2012-04-20 Thread Marcelo Tosatti
On Fri, Apr 20, 2012 at 06:52:11PM -0300, Marcelo Tosatti wrote:
 On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
  If this bit is set, it means the W bit of the spte is cleared due
  to shadow page table protection
  
  Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
  ---
   arch/x86/kvm/mmu.c |   56 
  ++-
   1 files changed, 37 insertions(+), 19 deletions(-)
  
  diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
  index dd984b6..eb02fc4 100644
  --- a/arch/x86/kvm/mmu.c
  +++ b/arch/x86/kvm/mmu.c
  @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);
  
   #define SPTE_HOST_WRITEABLE(1ULL  PT_FIRST_AVAIL_BITS_SHIFT)
   #define SPTE_ALLOW_WRITE   (1ULL  (PT_FIRST_AVAIL_BITS_SHIFT + 1))
  +#define SPTE_WRITE_PROTECT (1ULL  (PT_FIRST_AVAIL_BITS_SHIFT + 2))
  
   #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
  
  @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
  rmap_remove(kvm, sptep);
   }
  
  +static bool spte_wp_by_dirty_log(u64 spte)
  +{
  +   WARN_ON(is_writable_pte(spte));
  +
  +   return (spte  SPTE_ALLOW_WRITE)  !(spte  SPTE_WRITE_PROTECT);
  +}
 
 Is the information accurate? Say:
 
 - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
 - shadow gfn, rmap_write_protect finds page not WRITABLE.
 - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.
 
 BTW,
 
 introduce SPTE_ALLOW_WRITE bit
 
 This bit indicates whether the spte is allow to be writable that
 means the gpte of this spte is writable and the pfn pointed by
 this spte is writable on host
 
 Other than the fact that each bit should have one meaning, how
 can this bit be accurate without write protection of the gpte?
 
 As soon as guest writes to gpte, information in bit is outdated.

Ok, i found one example where mmu_lock was expecting sptes not 
to change:


VCPU0                                   VCPU1

- read-only gpte
- read-only spte
- write fault
- spte = *sptep
                                        guest write to gpte, set writable bit
                                        spte writable
                                        parent page unsync
                                        guest write to gpte writable bit clear
                                        guest invlpg updates spte to RO
                                        sync_page
                                        enter set_spte from sync_page
- cmpxchg(spte) is now writable
  [window where another vcpu can
   cache spte with writable bit
   set]

                                        if (is_writable_pte(entry) &&
                                            !is_writable_pte(*sptep))
                                                kvm_flush_remote_tlbs(vcpu->kvm);

The flush is not executed because spte was read-only (which is 
a correct assumption as long as sptes updates are protected
by mmu_lock).

So this is an example of implicit assumptions which break if you update
spte without mmu_lock. Certainly there are more cases. :(




Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit

2012-04-20 Thread Marcelo Tosatti
On Fri, Apr 20, 2012 at 09:40:30PM -0300, Marcelo Tosatti wrote:
 On Fri, Apr 20, 2012 at 06:52:11PM -0300, Marcelo Tosatti wrote:
  On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
   If this bit is set, it means the W bit of the spte is cleared due
   to shadow page table protection
   
   Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
   ---
arch/x86/kvm/mmu.c |   56 
   ++-
1 files changed, 37 insertions(+), 19 deletions(-)
   
   diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
   index dd984b6..eb02fc4 100644
   --- a/arch/x86/kvm/mmu.c
   +++ b/arch/x86/kvm/mmu.c
   @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);
   
#define SPTE_HOST_WRITEABLE  (1ULL  PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_ALLOW_WRITE (1ULL  (PT_FIRST_AVAIL_BITS_SHIFT + 1))
   +#define SPTE_WRITE_PROTECT   (1ULL  (PT_FIRST_AVAIL_BITS_SHIFT + 
   2))
   
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
   
   @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 rmap_remove(kvm, sptep);
}
   
   +static bool spte_wp_by_dirty_log(u64 spte)
   +{
   + WARN_ON(is_writable_pte(spte));
   +
   + return (spte  SPTE_ALLOW_WRITE)  !(spte  SPTE_WRITE_PROTECT);
   +}
  
  Is the information accurate? Say:
  
  - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
  - shadow gfn, rmap_write_protect finds page not WRITABLE.
  - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.
  
  BTW,
  
  introduce SPTE_ALLOW_WRITE bit
  
  This bit indicates whether the spte is allow to be writable that
  means the gpte of this spte is writable and the pfn pointed by
  this spte is writable on host
  
  Other than the fact that each bit should have one meaning, how
  can this bit be accurate without write protection of the gpte?
  
  As soon as guest writes to gpte, information in bit is outdated.
 
 Ok, i found one example where mmu_lock was expecting sptes not 
 to change:
 
 
 VCPU0 VCPU1
 
 - read-only gpte
 - read-only spte
 - write fault
 - spte = *sptep
   guest write to gpte, set writable bit
   spte writable
   parent page unsync
   guest write to gpte writable bit clear
   guest invlpg updates spte to RO
   sync_page
   enter set_spte from sync_page
 - cmpxchg(spte) is now writable
 [window where another vcpu can
 cache spte with writable bit
 set]
 
   if (is_writable_pte(entry)  
 !is_writable_pte(*sptep))
   kvm_flush_remote_tlbs(vcpu-kvm);
 
 The flush is not executed because spte was read-only (which is 
 a correct assumption as long as sptes updates are protected
 by mmu_lock).
 
 So this is an example of implicit assumptions which break if you update
 spte without mmu_lock. Certainly there are more cases. :(

OK, i now see you mentioned a similar case in the document, for
rmap_write_protect.

More importantly than the particular flush TLB case, the point is
every piece of code that reads and writes sptes must now be aware that
mmu_lock alone does not guarantee stability. Everything must be audited.

Where does the bulk of the improvement come from, again? If there is little
or no mmu_lock contention (for which we have no consistent data, to be honest,
in your testcase), is it the bouncing of mmu_lock's cacheline that hurts?



Re: [PATCH v2 07/16] KVM: MMU: introduce for_each_pte_list_spte

2012-04-20 Thread Takuya Yoshikawa
Sorry for the delay.

On Wed, 18 Apr 2012 12:01:03 +0800
Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com wrote:

  I have checked dirty-log-perf myself with this patch [01-07].
  
  GET_DIRTY_LOG for 1GB dirty pages has become more than 15% slower.

 Thanks for your test!
 
 Unbelievable, i will do more test and check it more carefully.

GET_DIRTY_LOG now traverses rmap lists intensively.
So only a small change can affect the performance when there are many
dirty pages.

 Could you please open your tool, then i can reproduction it and find the
 real reason?

It's already in kvm unit tests!

 I will check whether your tool is better then kernbench/autotest after
 review your tool.

Let's focus on the lock-less work now, so dirty-log-perf is not needed at this point.

I think you failed to convey the real advantage of your lock-less approach!
I will write about this on the v3 threads.

Thanks,
Takuya


Re: [PATCH v2 07/16] KVM: MMU: introduce for_each_pte_list_spte

2012-04-20 Thread Takuya Yoshikawa
On Wed, 18 Apr 2012 18:03:10 +0800
Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com wrote:

 By the way, what is the case did you test? ept = 1 ?

Yes!

I am not worried about migration without EPT/NPT.

Takuya


Re: [PATCH v3 0/9] KVM: MMU: fast page fault

2012-04-20 Thread Marcelo Tosatti
On Fri, Apr 20, 2012 at 04:16:52PM +0800, Xiao Guangrong wrote:
 Changlog:
 
 - split the long series for better review. This is the core
   implementation of fast page fault.
 - document fast page fault in locking.txt

Great, this is a big improvement, thanks.



Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect

2012-04-20 Thread Takuya Yoshikawa
On Fri, 20 Apr 2012 18:33:19 -0300
Marcelo Tosatti mtosa...@redhat.com wrote:

 It is preferable to remove all large sptes including read-only ones, the
 current behaviour, then to verify that no read-write transition can
 occur in fault paths (fault paths which are increasing in number).
 

I think we should use a separate function from spte_write_protect() for
the large-page case.

Takuya


Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit

2012-04-20 Thread Takuya Yoshikawa
On Fri, 20 Apr 2012 21:55:55 -0300
Marcelo Tosatti mtosa...@redhat.com wrote:

 More importantly than the particular flush TLB case, the point is
 every piece of code that reads and writes sptes must now be aware that
 mmu_lock alone does not guarantee stability. Everything must be audited.

In addition, please give me some stress-test cases to verify these in real
environments.  Live migration with KSM, with notifier calls, etc.?

Although the current logic is verified by the dirty-log api test, the new logic
may need another api test program.

Note: the problem is that live migration can fail silently.  We cannot know
whether data loss comes from a guest-side problem or from the get_dirty side.

 Where the bulk of the improvement comes from again? If there is little
 or no mmu_lock contention (which we have no consistent data to be honest
 in your testcase) is the bouncing off mmu_lock's cacheline that hurts?

This week, I was doing some simplified worst-latency tests for my work.
It was more difficult than I thought.

But Xiao's lock-less approach should show the reduction in mmu_lock contention
more easily, if there really is some.

To make things simple, e.g., we can run the same kind of write loop in the
guest as the XBZRLE people are doing, with more VCPUs if possible; a sketch follows.
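
A minimal sketch of such a guest-side write loop (the buffer size, the
run-forever loop and touching one byte per page are all illustrative choices):

#include <stdlib.h>

#define PAGE_SIZE	4096
#define BUF_PAGES	(256UL * 1024)		/* ~1 GB of anonymous memory */

int main(void)
{
	unsigned char *buf = malloc(BUF_PAGES * PAGE_SIZE);
	unsigned long iter = 0;

	if (!buf)
		return 1;

	for (;;) {				/* run until the test ends */
		unsigned long i;

		for (i = 0; i < BUF_PAGES; i++)
			buf[i * PAGE_SIZE] = (unsigned char)iter;	/* dirty every page */
		iter++;
	}
}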

Thanks,
Takuya


Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages

2012-04-20 Thread Takuya Yoshikawa
On Fri, 20 Apr 2012 16:07:41 -0700
Ying Han ying...@google.com wrote:

 My understanding of the real pain is the poor implementation of the
 mmu_shrinker. It iterates all the registered mmu_shrink callbacks for
 each kvm and only does little work at a time while holding two big
 locks. I learned from mikew@ (also ++cc-ed) that is causing latency
 spikes and unfairness among kvm instance in some of the experiment
 we've seen.

Last year, I discussed the mmu_shrink issues on kvm ML:

[PATCH 0/4] KVM: Make mmu_shrink() scan nr_to_scan shadow pages
http://www.spinics.net/lists/kvm/msg65231.html

Sadly, we could not find any good way at that time.

Thanks,
Takuya


Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages

2012-04-20 Thread Mike Waychison
On Fri, Apr 20, 2012 at 6:56 PM, Takuya Yoshikawa
takuya.yoshik...@gmail.com wrote:
 On Fri, 20 Apr 2012 16:07:41 -0700
 Ying Han ying...@google.com wrote:

 My understanding of the real pain is the poor implementation of the
 mmu_shrinker. It iterates all the registered mmu_shrink callbacks for
 each kvm and only does little work at a time while holding two big
 locks. I learned from mikew@ (also ++cc-ed) that is causing latency
 spikes and unfairness among kvm instance in some of the experiment
 we've seen.

The pains we have with mmu_shrink are twofold:

 - Memory pressure against the shrinker applies globally.  Any task can
cause pressure within their own environment (using numa or memcg) and
cause the global shrinker to shrink all shadowed tables on the system
(regardless of how memory is isolated between tasks).
 - Massive lock contention when all these CPUs are hitting the global
lock (which backs everybody on the system up).

In our situation, we simply disable the shrinker altogether.  My
understanding is that with EPT or NPT, the amount of memory used by
these tables is bounded by the size of guest physical memory, whereas
with software-shadowed tables, it is bounded by the address spaces
in the guest.  This bound makes it reasonable to not do any reclaim
and to charge it as a system overhead tax.
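
As a rough worked example of that bound (assuming 4 KB pages and 512 entries
per table page): a 64 GB guest spans about 16M guest-physical pages, so the
last-level nested page tables need roughly 16M / 512 = 32K table pages, about
128 MB, and the upper levels add only another ~1/512 of that.  That is on the
order of 0.2% of guest memory, which is small enough to treat as a fixed tax.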

As for data, the most impressive result was a massive improvement in
round-trip latency to a webserver running in a guest while another
process on the system was thrashing through page-cache (on a dozen or
so spinning disks iirc).  We were using fake-numa, and would otherwise
not expect the antagonist to drastically affect the latency-sensitive
task (as per a lot of effort into making that work).  Unfortunately,
we saw the 99th%ile latency riding at the 140ms timeout cut-off (they
were likely tailing out much longer), with the 95%ile at over 40ms.
With the mmu_shrinker disabled, the 99th%ile latency quickly dropped
down to about 20ms.

CPU profiles were showing 30% of cpu time wasted on spinlocks, all the
mmu_list_lock iirc.

In our case, I'm much happier just disabling the damned thing altogether.


 Last year, I discussed the mmu_shrink issues on kvm ML:

        [PATCH 0/4] KVM: Make mmu_shrink() scan nr_to_scan shadow pages
        http://www.spinics.net/lists/kvm/msg65231.html

 Sadly, we could not find any good way at that time.

 Thanks,
        Takuya


Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages

2012-04-20 Thread Takuya Yoshikawa
On Fri, 20 Apr 2012 19:15:24 -0700
Mike Waychison mi...@google.com wrote:

 In our situation, we simple disable the shrinker altogether.  My
 understanding is that we EPT or NPT, the amount of memory used by
 these tables is bounded by the size of guest physical memory, whereas
 with software shadowed tables, it is bounded by the addresses spaces
 in the guest.  This bound makes it reasonable to not do any reclaim
 and charge it as a system overhead tax.

IIRC, KVM's mmu_shrink is mainly for protecting the host from pathological
guest without EPT or NPT.

You can see Avi's summary: -- http://www.spinics.net/lists/kvm/msg65671.html
===
We should aim for the following:
- normal operation causes very little shrinks (some are okay)
- high pressure mostly due to kvm results in kvm being shrunk (this is a
pathological case caused by a starting a guest with a huge amount of
memory, and mapping it all to /dev/zero (or ksm), and getting the guest
the create shadow mappings for all of it)
- general high pressure is shared among other caches like dcache and icache

The cost of reestablishing an mmu page can be as high as half a
millisecond of cpu time, which is the reason I want to be conservative.
===

Thanks,
Takuya


Re: [PATCH 0/2] Simplify RCU freeing of shadow pages

2012-04-20 Thread Marcelo Tosatti
On Thu, Apr 19, 2012 at 07:26:35PM +0300, Avi Kivity wrote:
 This patchset simplifies the freeing by RCU of mmu pages.
 
 Xiao, I'm sure you thought of always freeing by RCU.  Why didn't you choose
 this way?  I saves a couple of atomics in the fast path.
 
 Avi Kivity (2):
   KVM: MMU: Always free shadow pages using RCU
   KVM: MMU: Recover space used by rcu_head in struct kvm_mmu_page
 
  arch/x86/include/asm/kvm_host.h |9 +++---
  arch/x86/kvm/mmu.c  |   58 
 ---
  2 files changed, 15 insertions(+), 52 deletions(-)

Check Documentation/RCU/checklist.txt item 8.

a.  Keeping a count of the number of data-structure elements
used by the RCU-protected data structure, including
those waiting for a grace period to elapse.  Enforce a
limit on this number, stalling updates as needed to allow
previously deferred frees to complete.  Alternatively,
limit only the number awaiting deferred free rather than
the total number of elements.
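
A minimal stand-alone sketch of strategy (a), limiting the number of objects
awaiting deferred free; MAX_DEFERRED, defer_free() and drain_deferred() are
made-up names, and draining here stands in for waiting out a grace period:

#include <stdlib.h>

#define MAX_DEFERRED	1024

struct obj { struct obj *next; };

static struct obj *deferred_list;
static unsigned long nr_deferred;

/* stand-in for "wait for a grace period, then free everything deferred" */
static void drain_deferred(void)
{
	while (deferred_list) {
		struct obj *p = deferred_list;

		deferred_list = p->next;
		free(p);
		nr_deferred--;
	}
}

static void defer_free(struct obj *p)
{
	/* stall the updater instead of letting the deferred list grow unbounded */
	if (nr_deferred >= MAX_DEFERRED)
		drain_deferred();

	p->next = deferred_list;
	deferred_list = p;
	nr_deferred++;
}

int main(void)
{
	int i;

	for (i = 0; i < 5000; i++) {
		struct obj *p = malloc(sizeof(*p));

		if (p)
			defer_free(p);
	}
	drain_deferred();
	return 0;
}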



Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages

2012-04-20 Thread Mike Waychison
On Fri, Apr 20, 2012 at 7:29 PM, Takuya Yoshikawa
takuya.yoshik...@gmail.com wrote:
 On Fri, 20 Apr 2012 19:15:24 -0700
 Mike Waychison mi...@google.com wrote:

 In our situation, we simple disable the shrinker altogether.  My
 understanding is that we EPT or NPT, the amount of memory used by
 these tables is bounded by the size of guest physical memory, whereas
 with software shadowed tables, it is bounded by the addresses spaces
 in the guest.  This bound makes it reasonable to not do any reclaim
 and charge it as a system overhead tax.

 IIRC, KVM's mmu_shrink is mainly for protecting the host from pathological
 guest without EPT or NPT.

 You can see Avi's summary: -- http://www.spinics.net/lists/kvm/msg65671.html
 ===
 We should aim for the following:
 - normal operation causes very little shrinks (some are okay)
 - high pressure mostly due to kvm results in kvm being shrunk (this is a
 pathological case caused by a starting a guest with a huge amount of
 memory, and mapping it all to /dev/zero (or ksm), and getting the guest
 the create shadow mappings for all of it)
 - general high pressure is shared among other caches like dcache and icache

 The cost of reestablishing an mmu page can be as high as half a
 millisecond of cpu time, which is the reason I want to be conservative.

To add to that, on these systems (32-way), the fault itself isn't as
heavy-handed as a global lock in everyone's reclaim path :)

I'd be very happy if this stuff was memcg aware, but until that
happens, this code is disabled in our production builds.  Avoiding 30% of
CPU time lost to a spinlock when mixing VMs with IO is worth paying the
<1% of system RAM these pages cost if it means
tighter/more-deterministic service latencies.

 ===

 Thanks,
        Takuya


Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect

2012-04-20 Thread Xiao Guangrong
On 04/21/2012 05:33 AM, Marcelo Tosatti wrote:


  static bool
  __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
  {
 @@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned long 
 *rmapp, int level)

  for (sptep = rmap_get_first(*rmapp, iter); sptep;) {
  BUG_ON(!(*sptep  PT_PRESENT_MASK));
 -rmap_printk(rmap_write_protect: spte %p %llx\n, sptep, 
 *sptep);
 -
 -if (!is_writable_pte(*sptep)) {
 -sptep = rmap_get_next(iter);
 -continue;
 -}
 -
 -if (level == PT_PAGE_TABLE_LEVEL) {
 -mmu_spte_update(sptep, *sptep  ~PT_WRITABLE_MASK);
 -sptep = rmap_get_next(iter);
 -} else {
 -BUG_ON(!is_large_pte(*sptep));
 -drop_spte(kvm, sptep);
 ---kvm-stat.lpages;
 
 It is preferable to remove all large sptes including read-only ones, the


It can cause page faults even for reads of memory mapped by these large sptes.

Actually, Avi suggested making large writable sptes read-only
(not dropped) on this path.

 current behaviour, then to verify that no read-write transition can
 occur in fault paths (fault paths which are increasing in number).


Yes, small sptes have the same issue (finding a write-protected spte in
fault paths). Later, the second part of this patchset will introduce an
rmap.WRITE_PROTECTED bit, so we can do a fast check before calling
fast page fault.



Re: [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit

2012-04-20 Thread Xiao Guangrong
On 04/21/2012 05:39 AM, Marcelo Tosatti wrote:


 @@ -1177,9 +1178,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned 
 long *rmapp,
  new_spte = *sptep  ~PT64_BASE_ADDR_MASK;
  new_spte |= (u64)new_pfn  PAGE_SHIFT;

 -new_spte = ~PT_WRITABLE_MASK;
 -new_spte = ~SPTE_HOST_WRITEABLE;
 -new_spte = ~shadow_accessed_mask;
 +new_spte = ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE |
 +  shadow_accessed_mask | SPTE_ALLOW_WRITE);
 
 Each bit should have a distinct meaning. Here the host pte is being
 write-protected, which means only the SPTE_HOST_WRITEABLE bit
 should be cleared.


Hmm, there is no problem if SPTE_ALLOW_WRITE is not cleared.

But the meaning of SPTE_ALLOW_WRITE would become strange: we would see a
spte with spte.SPTE_ALLOW_WRITE = 1 (meaning the spte is writable on host
and guest) and spte.SPTE_HOST_WRITEABLE = 0 (meaning the spte is read-only
on the host).



Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit

2012-04-20 Thread Xiao Guangrong
On 04/21/2012 05:52 AM, Marcelo Tosatti wrote:

 On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
 If this bit is set, it means the W bit of the spte is cleared due
 to shadow page table protection

 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 ---
  arch/x86/kvm/mmu.c |   56 
 ++-
  1 files changed, 37 insertions(+), 19 deletions(-)

 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
 index dd984b6..eb02fc4 100644
 --- a/arch/x86/kvm/mmu.c
 +++ b/arch/x86/kvm/mmu.c
 @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);

  #define SPTE_HOST_WRITEABLE (1ULL  PT_FIRST_AVAIL_BITS_SHIFT)
  #define SPTE_ALLOW_WRITE(1ULL  (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 +#define SPTE_WRITE_PROTECT  (1ULL  (PT_FIRST_AVAIL_BITS_SHIFT + 2))

  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

 @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
  rmap_remove(kvm, sptep);
  }

 +static bool spte_wp_by_dirty_log(u64 spte)
 +{
 +WARN_ON(is_writable_pte(spte));
 +
 +return (spte  SPTE_ALLOW_WRITE)  !(spte  SPTE_WRITE_PROTECT);
 +}
 
 Is the information accurate? Say:
 
 - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
 - shadow gfn, rmap_write_protect finds page not WRITABLE.
 - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.


It cannot happen: rmap_write_protect will set SPTE_WRITE_PROTECT
even if the spte is not WRITABLE, please see:

+	if (page_table_protect && spte_wp_by_dirty_log(spte))
+		goto reset_spte;
+
+	return false;
+
+reset_spte:
 	rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 	spte = spte & ~PT_WRITABLE_MASK;
+	if (page_table_protect)
+		spte |= SPTE_WRITE_PROTECT;
 	mmu_spte_update(sptep, spte);
-
 	return false;
 }

 
 BTW,
 
 introduce SPTE_ALLOW_WRITE bit
 
 This bit indicates whether the spte is allow to be writable that
 means the gpte of this spte is writable and the pfn pointed by
 this spte is writable on host
 
 Other than the fact that each bit should have one meaning, how
 can this bit be accurate without write protection of the gpte?
 


Does the above explanation ensure that the meaning of this bit is accurate?
Or is there another case? :)

 As soon as guest writes to gpte, information in bit is outdated.
 


The bit will be updated when the spte is updated.

When the guest writes the gpte, the spte is not updated immediately;
yes, the bit is outdated at that time, but that is okay since the TLB
has not been flushed.

After a TLB flush, the bit becomes consistent with the gpte.



Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit

2012-04-20 Thread Xiao Guangrong
On 04/21/2012 08:40 AM, Marcelo Tosatti wrote:

 On Fri, Apr 20, 2012 at 06:52:11PM -0300, Marcelo Tosatti wrote:
 On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
 If this bit is set, it means the W bit of the spte is cleared due
 to shadow page table protection

 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 ---
  arch/x86/kvm/mmu.c |   56 
 ++-
  1 files changed, 37 insertions(+), 19 deletions(-)

 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
 index dd984b6..eb02fc4 100644
 --- a/arch/x86/kvm/mmu.c
 +++ b/arch/x86/kvm/mmu.c
 @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);

  #define SPTE_HOST_WRITEABLE(1ULL  PT_FIRST_AVAIL_BITS_SHIFT)
  #define SPTE_ALLOW_WRITE   (1ULL  (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 +#define SPTE_WRITE_PROTECT (1ULL  (PT_FIRST_AVAIL_BITS_SHIFT + 2))

  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)

 @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 rmap_remove(kvm, sptep);
  }

 +static bool spte_wp_by_dirty_log(u64 spte)
 +{
 +   WARN_ON(is_writable_pte(spte));
 +
 +   return (spte  SPTE_ALLOW_WRITE)  !(spte  SPTE_WRITE_PROTECT);
 +}

 Is the information accurate? Say:

 - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
 - shadow gfn, rmap_write_protect finds page not WRITABLE.
 - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.

 BTW,

 introduce SPTE_ALLOW_WRITE bit

 This bit indicates whether the spte is allow to be writable that
 means the gpte of this spte is writable and the pfn pointed by
 this spte is writable on host

 Other than the fact that each bit should have one meaning, how
 can this bit be accurate without write protection of the gpte?

 As soon as guest writes to gpte, information in bit is outdated.
 
 Ok, i found one example where mmu_lock was expecting sptes not 
 to change:
 
 
 VCPU0 VCPU1
 
 - read-only gpte
 - read-only spte
 - write fault


That is not true: the gpte is read-only and it is a write fault, so we should
reflect the page fault back to the guest; fast page fault is not called. :)

 - spte = *sptep
   guest write to gpte, set writable bit
   spte writable
   parent page unsync
   guest write to gpte writable bit clear
   guest invlpg updates spte to RO
   sync_page
   enter set_spte from sync_page
 - cmpxchg(spte) is now writable
 [window where another vcpu can
 cache spte with writable bit
 set]
 
   if (is_writable_pte(entry)  
 !is_writable_pte(*sptep))
   kvm_flush_remote_tlbs(vcpu-kvm);
 
 The flush is not executed because spte was read-only (which is 
 a correct assumption as long as sptes updates are protected
 by mmu_lock).
 


That is also not true: the TLB flush in set_spte is used to ensure
rmap_write_protect works correctly, but rmap_write_protect will flush TLBs
even if the spte can be changed by fast page fault.

 So this is an example of implicit assumptions which break if you update
 spte without mmu_lock. Certainly there are more cases. :(


We only need to care about the paths that depend on spte.WRITABLE == 0, since
only these sptes have a chance of being changed outside of mmu_lock.

The most troublesome case is rmap_write_protect, which needs to flush the TLB
to protect the shadow page table.

I think it is not too hard to check. :)




Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit

2012-04-20 Thread Xiao Guangrong
On 04/21/2012 08:55 AM, Marcelo Tosatti wrote:


 So this is an example of implicit assumptions which break if you update
 spte without mmu_lock. Certainly there are more cases. :(
 
 OK, i now see you mentioned a similar case in the document, for
 rmap_write_protect.
 
 More importantly than the particular flush TLB case, the point is
 every piece of code that reads and writes sptes must now be aware that
 mmu_lock alone does not guarantee stability. Everything must be audited.
 


Yes, that is true, but it is not hard to audit the code, since we only
change the spte from read-only to writable; all the information that
fast page fault depends on comes from the spte itself.
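
To make the audit target concrete, here is a minimal stand-alone C11 sketch of
the read-check-cmpxchg pattern under discussion; the bit positions and names
are illustrative, not the actual KVM spte layout:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define PT_WRITABLE_MASK	(1ull << 1)
#define SPTE_ALLOW_WRITE	(1ull << 52)	/* made-up bit position */

static bool fast_fix_writable(_Atomic uint64_t *sptep)
{
	uint64_t old = atomic_load(sptep);

	/* only sptes write-protected for dirty logging may be fixed up here */
	if (!(old & SPTE_ALLOW_WRITE) || (old & PT_WRITABLE_MASK))
		return false;

	/* if anything changed the spte meanwhile, give up and take the slow path */
	return atomic_compare_exchange_strong(sptep, &old,
					      old | PT_WRITABLE_MASK);
}

int main(void)
{
	_Atomic uint64_t spte = SPTE_ALLOW_WRITE;	/* write-protected for dirty log */

	return fast_fix_writable(&spte) ? 0 : 1;
}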



Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect

2012-04-20 Thread Xiao Guangrong
On 04/21/2012 09:10 AM, Takuya Yoshikawa wrote:

 On Fri, 20 Apr 2012 18:33:19 -0300
 Marcelo Tosatti mtosa...@redhat.com wrote:
 
 It is preferable to remove all large sptes including read-only ones, the
 current behaviour, then to verify that no read-write transition can
 occur in fault paths (fault paths which are increasing in number).

 
 I think we should use separate function than spte_write_protect() for
 the large case.


I will introduce a function to handle large sptes when I implement the
idea of making writable sptes read-only.

But let's keep it as-is in this patchset for now.


Re: [PATCH v2 07/16] KVM: MMU: introduce for_each_pte_list_spte

2012-04-20 Thread Xiao Guangrong
On 04/21/2012 09:01 AM, Takuya Yoshikawa wrote:

 Sorry for the delay.
 
 On Wed, 18 Apr 2012 12:01:03 +0800
 Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com wrote:
 
 I have checked dirty-log-perf myself with this patch [01-07].

 GET_DIRTY_LOG for 1GB dirty pages has become more than 15% slower.
 
 Thanks for your test!

 Unbelievable, i will do more test and check it more carefully.
 
 GET_DIRTY_LOG now traverses rmap lists intensively.
 So only a small change can affect the performance when there are many
 dirty pages.
 
 Could you please open your tool, then i can reproduction it and find the
 real reason?
 
 It's already in kvm unit tests!
 


Great, i will check that.

 I will check whether your tool is better then kernbench/autotest after
 review your tool.
 
 Let's focus on lock-less now: so dirty-log-perf is not needed now.
 
 I think you failed to appeal the real advantage of your lock-less approach!
 I will write about this on v3-threads.


Thank you very much, Takuya!


Re: [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit

2012-04-20 Thread Marcelo Tosatti
On Sat, Apr 21, 2012 at 11:30:55AM +0800, Xiao Guangrong wrote:
 On 04/21/2012 05:39 AM, Marcelo Tosatti wrote:
 
 
  @@ -1177,9 +1178,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, 
  unsigned long *rmapp,
 new_spte = *sptep  ~PT64_BASE_ADDR_MASK;
 new_spte |= (u64)new_pfn  PAGE_SHIFT;
 
  -  new_spte = ~PT_WRITABLE_MASK;
  -  new_spte = ~SPTE_HOST_WRITEABLE;
  -  new_spte = ~shadow_accessed_mask;
  +  new_spte = ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE |
  +shadow_accessed_mask | SPTE_ALLOW_WRITE);
  
  Each bit should have a distinct meaning. Here the host pte is being
  write-protected, which means only the SPTE_HOST_WRITEABLE bit
  should be cleared.
 
 
 Hmm, it is no problem if SPTE_ALLOW_WRITE is not cleared.
 
 But the meaning of SPTE_ALLOW_WRITE will become strange: we will see a
 spte with spte.SPTE_ALLOW_WRITE = 1 (means the spte is writable on host
 and guest) and spte.SPTE_HOST_WRITEABLE = 0 (means the spte is read-only
 on host).
 
You are combining gpte writable bit, and host pte writable bit (which
are separate and independent of each other) into one bit. 

SPTE_HOST_WRITEABLE already indicates whether the host pte is writable 
or not.



Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit

2012-04-20 Thread Marcelo Tosatti
On Sat, Apr 21, 2012 at 11:47:46AM +0800, Xiao Guangrong wrote:
 On 04/21/2012 05:52 AM, Marcelo Tosatti wrote:
 
  On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote:
  If this bit is set, it means the W bit of the spte is cleared due
  to shadow page table protection
 
  Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
  ---
   arch/x86/kvm/mmu.c |   56 
  ++-
   1 files changed, 37 insertions(+), 19 deletions(-)
 
  diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
  index dd984b6..eb02fc4 100644
  --- a/arch/x86/kvm/mmu.c
  +++ b/arch/x86/kvm/mmu.c
  @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644);
 
   #define SPTE_HOST_WRITEABLE   (1ULL  PT_FIRST_AVAIL_BITS_SHIFT)
   #define SPTE_ALLOW_WRITE  (1ULL  (PT_FIRST_AVAIL_BITS_SHIFT + 1))
  +#define SPTE_WRITE_PROTECT(1ULL  (PT_FIRST_AVAIL_BITS_SHIFT + 
  2))
 
   #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
  @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 rmap_remove(kvm, sptep);
   }
 
  +static bool spte_wp_by_dirty_log(u64 spte)
  +{
  +  WARN_ON(is_writable_pte(spte));
  +
  +  return (spte  SPTE_ALLOW_WRITE)  !(spte  SPTE_WRITE_PROTECT);
  +}
  
  Is the information accurate? Say:
  
  - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE.
  - shadow gfn, rmap_write_protect finds page not WRITABLE.
  - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set.
 
 
 It can not happen, rmap_write_protect will set SPTE_WRITE_PROTECT
 even if the spte is not WRITABLE, please see:
 
 + if (page_table_protect  spte_wp_by_dirty_log(spte))
 + goto reset_spte;
 +
 + return false;
 +
 +reset_spte:
   rmap_printk(rmap_write_protect: spte %p %llx\n, spte, *spte);
   spte = spte  ~PT_WRITABLE_MASK;
 + if (page_table_protect)
 + spte |= SPTE_WRITE_PROTECT;
   mmu_spte_update(sptep, spte);
 -
   return false;
  }

Right. What about the sync path, fault path, and prefault path: do they update
SPTE_WRITE_PROTECT properly?

  BTW,
  
  introduce SPTE_ALLOW_WRITE bit
  
  This bit indicates whether the spte is allow to be writable that
  means the gpte of this spte is writable and the pfn pointed by
  this spte is writable on host
  
  Other than the fact that each bit should have one meaning, how
  can this bit be accurate without write protection of the gpte?
  
 
 Above explanation can ensure the meaning of this bit is accurate?
 Or it has another case? :)

No, it is out of sync with guest pte.

  As soon as guest writes to gpte, information in bit is outdated.
  
 
 
 The bit will be updated when spte is updated.
 
 When the guest write gpte, the spte is not updated immediately,
 yes, the bit is outdated at that time, but it is ok since tlb is
 not flushed.

Page faults cause TLB flushes.

From Intel manual:

In addition to the instructions identified above, page faults
invalidate entries in the TLBs and paging-structure caches. In
particular, a page-fault exception resulting from an attempt to use
a linear address will invalidate any TLB entries that are for a page
number corresponding to that linear address and that are associated with
the current PCID.

 After tlb flush, the bit can be coincident with gpte.

You must read the gpte before updating the spte from RO to RW, unless you
write-protect the gpte. IIRC you were doing that in previous patches?



Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect

2012-04-20 Thread Marcelo Tosatti
On Sat, Apr 21, 2012 at 11:24:54AM +0800, Xiao Guangrong wrote:
 On 04/21/2012 05:33 AM, Marcelo Tosatti wrote:
 
 
   static bool
   __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
   {
  @@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned 
  long *rmapp, int level)
 
 for (sptep = rmap_get_first(*rmapp, iter); sptep;) {
 BUG_ON(!(*sptep  PT_PRESENT_MASK));
  -  rmap_printk(rmap_write_protect: spte %p %llx\n, sptep, 
  *sptep);
  -
  -  if (!is_writable_pte(*sptep)) {
  -  sptep = rmap_get_next(iter);
  -  continue;
  -  }
  -
  -  if (level == PT_PAGE_TABLE_LEVEL) {
  -  mmu_spte_update(sptep, *sptep  ~PT_WRITABLE_MASK);
  -  sptep = rmap_get_next(iter);
  -  } else {
  -  BUG_ON(!is_large_pte(*sptep));
  -  drop_spte(kvm, sptep);
  -  --kvm-stat.lpages;
  
  It is preferable to remove all large sptes including read-only ones, the
 
 
 It can cause page faults even if read memory on these large sptse.
 
 Actually, Avi suggested that make large writable spte to be readonly
 (not dropped) on this path.

See commits e49146dce8c3dc6f4485c1904b6587855f393e71,
38187c830cab84daecb41169948467f1f19317e3 for issues
with large read-only sptes.

  current behaviour, then to verify that no read-write transition can
  occur in fault paths (fault paths which are increasing in number).
 
 
 Yes, the small spte also has issue (find a write-protected spte in
 fault paths). Later, the second part of this patchset will introduce
 rmap.WRITE_PROTECTED bit, then we can do the fast check before calling
 fast page fault.


Re: [PATCH v4] KVM: PPC Use clockevent multiplier and shifter for decrementer

2012-04-20 Thread Alexander Graf

On 18.04.2012, at 18:01, Bharat Bhushan wrote:

 Time for which the hrtimer is started for decrementer emulation is calculated
 using tb_ticks_per_usec, while the hrtimer uses the clockevent for DEC
 reprogramming (if needed), which calculates timebase ticks using the
 multiplier and shifter mechanism implemented within the clockevent layer. It was
 observed that this conversion (timebase->time->timebase) is not correct
 because the mechanisms are not consistent. In our setup it adds 2% jitter.

 With this patch the clockevent multiplier and shifter mechanism are used when
 starting the hrtimer for decrementer emulation. Now the jitter is < 0.5%.
 
 Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com

Thanks, applied to kvm-ppc-next with fixed commit message and fixed trailing 
whitespace :).


Alex
