[PATCH v3 2/9] KVM: MMU: abstract spte write-protect
Introduce a common function to abstract spte write-protect to cleanup the code Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 60 ++- 1 files changed, 35 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4a3cc18..e70ff38 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1041,6 +1041,34 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } +/* Return true if the spte is dropped. */ +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large, + bool *flush) +{ + u64 spte = *sptep; + + if (!is_writable_pte(spte)) + return false; + + *flush |= true; + + if (large) { + pgprintk(rmap_write_protect(large): spte %p %llx\n, +spte, *spte); + BUG_ON(!is_large_pte(spte)); + + drop_spte(kvm, sptep); + --kvm-stat.lpages; + return true; + } + + rmap_printk(rmap_write_protect: spte %p %llx\n, spte, *spte); + spte = spte ~PT_WRITABLE_MASK; + mmu_spte_update(sptep, spte); + + return false; +} + static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { @@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) for (sptep = rmap_get_first(*rmapp, iter); sptep;) { BUG_ON(!(*sptep PT_PRESENT_MASK)); - rmap_printk(rmap_write_protect: spte %p %llx\n, sptep, *sptep); - - if (!is_writable_pte(*sptep)) { - sptep = rmap_get_next(iter); - continue; - } - - if (level == PT_PAGE_TABLE_LEVEL) { - mmu_spte_update(sptep, *sptep ~PT_WRITABLE_MASK); - sptep = rmap_get_next(iter); - } else { - BUG_ON(!is_large_pte(*sptep)); - drop_spte(kvm, sptep); - --kvm-stat.lpages; + if (spte_write_protect(kvm, sptep, level PT_PAGE_TABLE_LEVEL, + write_protected)) { sptep = rmap_get_first(*rmapp, iter); + continue; } - write_protected = true; + sptep = rmap_get_next(iter); } return write_protected; @@ -3902,6 +3919,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 
{ struct kvm_mmu_page *sp; + bool flush = false; list_for_each_entry(sp, kvm-arch.active_mmu_pages, link) { int i; @@ -3916,16 +3934,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) !is_last_spte(pt[i], sp-role.level)) continue; - if (is_large_pte(pt[i])) { - drop_spte(kvm, pt[i]); - --kvm-stat.lpages; - continue; - } - - /* avoid RMW */ - if (is_writable_pte(pt[i])) - mmu_spte_update(pt[i], - pt[i] ~PT_WRITABLE_MASK); + spte_write_protect(kvm, pt[i], + is_large_pte(pt[i]), flush); } } kvm_flush_remote_tlbs(kvm); -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit
This bit indicates whether the spte is allow to be writable that means the gpte of this spte is writable and the pfn pointed by this spte is writable on host Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 13 ++--- 1 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e70ff38..dd984b6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -145,7 +145,8 @@ module_param(dbg, bool, 0644); #define CREATE_TRACE_POINTS #include mmutrace.h -#define SPTE_HOST_WRITEABLE (1ULL PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_HOST_WRITEABLE(1ULL PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_ALLOW_WRITE (1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -1177,9 +1178,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte = *sptep ~PT64_BASE_ADDR_MASK; new_spte |= (u64)new_pfn PAGE_SHIFT; - new_spte = ~PT_WRITABLE_MASK; - new_spte = ~SPTE_HOST_WRITEABLE; - new_spte = ~shadow_accessed_mask; + new_spte = ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE | + shadow_accessed_mask | SPTE_ALLOW_WRITE); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -2316,7 +2316,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, goto done; } - spte |= PT_WRITABLE_MASK; + spte |= PT_WRITABLE_MASK | SPTE_ALLOW_WRITE; if (!vcpu-arch.mmu.direct_map !(pte_access ACC_WRITE_MASK)) { @@ -2345,8 +2345,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __func__, gfn); ret = 1; pte_access = ~ACC_WRITE_MASK; - if (is_writable_pte(spte)) - spte = ~PT_WRITABLE_MASK; + spte = ~PT_WRITABLE_MASK; } } -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 7/9] KVM: MMU: trace fast page fault
To see what happen on this path and help us to optimize it Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c |2 ++ arch/x86/kvm/mmutrace.h | 41 + 2 files changed, 43 insertions(+), 0 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7aea156..ac9c285 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2830,6 +2830,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, spte, gfn); exit: + trace_fast_page_fault(vcpu, gva, gfn, error_code, iterator.sptep, + spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 89fb0e8..84da94f 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -243,6 +243,47 @@ TRACE_EVENT( TP_printk(addr:%llx gfn %llx access %x, __entry-addr, __entry-gfn, __entry-access) ); + +#define __spte_satisfied(__spte) \ + (__entry-retry is_writable_pte(__entry-__spte)) + +TRACE_EVENT( + fast_page_fault, + TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, u32 error_code, +u64 *sptep, u64 old_spte, bool retry), + TP_ARGS(vcpu, gva, gfn, error_code, sptep, old_spte, retry), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(gva_t, gva) + __field(gfn_t, gfn) + __field(u32, error_code) + __field(u64 *, sptep) + __field(u64, old_spte) + __field(u64, new_spte) + __field(bool, retry) + ), + + TP_fast_assign( + __entry-vcpu_id = vcpu-vcpu_id; + __entry-gva = gva; + __entry-gfn = gfn; + __entry-error_code = error_code; + __entry-sptep = sptep; + __entry-old_spte = old_spte; + __entry-new_spte = *sptep; + __entry-retry = retry; + ), + + TP_printk(vcpu %d gva %lx gfn %llx error_code %s sptep %p + old %#llx new %llx spurious %d fixed %d, __entry-vcpu_id, + __entry-gva, __entry-gfn, + __print_flags(__entry-error_code, |, + kvm_mmu_trace_pferr_flags), + __entry-sptep, __entry-old_spte, __entry-new_spte, + __spte_satisfied(old_spte), __spte_satisfied(new_spte) + ) +); #endif /* 
_TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 0/9] KVM: MMU: fast page fault
Changelog: - split the long series for better review. This is the core implementation of fast page fault. - document fast page fault in locking.txt -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 1/9] KVM: MMU: return bool in __rmap_write_protect
The reture value of __rmap_write_protect is either 1 or 0, use true/false instead of these Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 13 +++-- 1 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 29ad6f9..4a3cc18 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1041,11 +1041,12 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) +static bool +__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { u64 *sptep; struct rmap_iterator iter; - int write_protected = 0; + bool write_protected = false; for (sptep = rmap_get_first(*rmapp, iter); sptep;) { BUG_ON(!(*sptep PT_PRESENT_MASK)); @@ -1066,7 +1067,7 @@ static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level sptep = rmap_get_first(*rmapp, iter); } - write_protected = 1; + write_protected = true; } return write_protected; @@ -1097,12 +1098,12 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } } -static int rmap_write_protect(struct kvm *kvm, u64 gfn) +static bool rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; unsigned long *rmapp; int i; - int write_protected = 0; + bool write_protected = false; slot = gfn_to_memslot(kvm, gfn); @@ -1691,7 +1692,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_mmu_pages_init(parent, parents, pages); while (mmu_unsync_walk(parent, pages)) { - int protected = 0; + bool protected = false; for_each_sp(pages, sp, parents, i) protected |= rmap_write_protect(vcpu-kvm, sp-gfn); -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 6/9] KVM: MMU: fast path of handling guest page fault
If the the present bit of page fault error code is set, it indicates the shadow page is populated on all levels, it means what we do is only modify the access bit which can be done out of mmu-lock Currently, in order to simplify the code, we only fix the page fault caused by write-protect on the fast path Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 197 arch/x86/kvm/paging_tmpl.h |3 + 2 files changed, 184 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index eb02fc4..7aea156 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -446,6 +446,13 @@ static bool __check_direct_spte_mmio_pf(u64 spte) } #endif +static bool spte_wp_by_dirty_log(u64 spte) +{ + WARN_ON(is_writable_pte(spte)); + + return (spte SPTE_ALLOW_WRITE) !(spte SPTE_WRITE_PROTECT); +} + static bool spte_has_volatile_bits(u64 spte) { if (!shadow_accessed_mask) @@ -454,9 +461,18 @@ static bool spte_has_volatile_bits(u64 spte) if (!is_shadow_present_pte(spte)) return false; - if ((spte shadow_accessed_mask) - (!is_writable_pte(spte) || (spte shadow_dirty_mask))) - return false; + if (spte shadow_accessed_mask) { + if (is_writable_pte(spte)) + return !(spte shadow_dirty_mask); + + /* +* If the spte is write-protected by dirty-log, it may +* be marked writable on fast page fault path, then CPU +* can modify the Dirty bit. +*/ + if (!spte_wp_by_dirty_log(spte)) + return false; + } return true; } @@ -1043,13 +1059,6 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -static bool spte_wp_by_dirty_log(u64 spte) -{ - WARN_ON(is_writable_pte(spte)); - - return (spte SPTE_ALLOW_WRITE) !(spte SPTE_WRITE_PROTECT); -} - /* Return true if the spte is dropped. 
*/ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large, bool *flush, bool page_table_protect) @@ -1057,13 +1066,12 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large, u64 spte = *sptep; if (is_writable_pte(spte)) { - *flush |= true; - if (large) { pgprintk(rmap_write_protect(large): spte %p %llx\n, spte, *spte); BUG_ON(!is_large_pte(spte)); + *flush |= true; drop_spte(kvm, sptep); --kvm-stat.lpages; return true; @@ -1072,6 +1080,9 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large, goto reset_spte; } + /* We need flush tlbs in this case: the fast page fault path +* can mark the spte writable after we read the sptep. +*/ if (page_table_protect spte_wp_by_dirty_log(spte)) goto reset_spte; @@ -1079,6 +1090,8 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large, reset_spte: rmap_printk(rmap_write_protect: spte %p %llx\n, spte, *spte); + + *flush |= true; spte = spte ~PT_WRITABLE_MASK; if (page_table_protect) spte |= SPTE_WRITE_PROTECT; @@ -2676,18 +2689,164 @@ exit: return ret; } +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, gfn_t gfn, + u32 error_code) +{ + /* +* #PF can be fast only if the shadow page table is present and it +* is caused by write-protect, that means we just need change the +* W bit of the spte which can be done out of mmu-lock. +*/ + if (!(error_code PFERR_PRESENT_MASK) || + !(error_code PFERR_WRITE_MASK)) + return false; + + return true; +} + +static bool +fast_pf_fix_indirect_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *sptep, u64 spte, gfn_t gfn) +{ + pfn_t pfn; + bool ret = false; + + /* +* For the indirect spte, it is hard to get a stable gfn since +* we just use a cmpxchg to avoid all the races which is not +* enough to avoid the ABA problem: the host can arbitrarily +* change spte and the mapping from gfn to pfn. 
+* +* What we do is call gfn_to_pfn_atomic to bind the gfn and the +* pfn because after the call: +* - we have held the refcount of pfn that means the pfn can not +* be freed and be reused for another gfn. +* - the pfn is writable that means it can not be shared by different +* gfn. +*/ + pfn = gfn_to_pfn_atomic(vcpu-kvm, gfn); + + /* The host page is
[PATCH v3 3/9] KVM: VMX: export PFEC.P bit on ept
Export the present bit of page fault error code, the later patch will use it Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/vmx.c |9 - 1 files changed, 8 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 52f6856..2c98057 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4735,6 +4735,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; gpa_t gpa; + u32 error_code; int gla_validity; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -4759,7 +4760,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - return kvm_mmu_page_fault(vcpu, gpa, exit_qualification 0x3, NULL, 0); + + /* It is a write fault? */ + error_code = exit_qualification (1U 1); + /* ept page table is present? */ + error_code |= (exit_qualification 3) 0x1; + + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } static u64 ept_rsvd_mask(u64 spte, int level) -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 9/9] KVM: MMU: document mmu-lock and fast page fault
Document fast page fault and mmu-lock in locking.txt Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- Documentation/virtual/kvm/locking.txt | 148 - 1 files changed, 147 insertions(+), 1 deletions(-) diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index 3b4cd3b..089d462 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -6,7 +6,147 @@ KVM Lock Overview (to be written) -2. Reference +3: Exception + + +Fast page fault: + +Fast page fault is the fast path which fixes the guest page fault out of +the mmu-lock on x86. Currently, the page fault can be fast only if the +shadow page table is present and it is caused by write-protect, that means +we just need change the W bit of the spte. + +What we use to avoid all the race is the SPTE_ALLOW_WRITE bit and +SPTE_WRITE_PROTECT bit on the spte: +- SPTE_ALLOW_WRITE means the gfn is writable on both guest and host. +- SPTE_WRITE_PROTECT means the gfn is not write-protected for shadow page + write protection. + +On fast page fault path, we will use cmpxchg to atomically set the spte W +bit if spte.SPTE_WRITE_PROTECT = 1 and spte.SPTE_WRITE_PROTECT = 0, this is +safe because whenever changing these bits can be detected by cmpxchg. + +But we need carefully check these cases: +1): The mapping from gfn to pfn + +The mapping from gfn to pfn may be changed since we can only ensure the pfn +is not changed during cmpxchg. This is a ABA problem, for example, below case +will happen: + +At the beginning: +gpte = gfn1 +gfn1 is mapped to pfn1 on host +spte is the shadow page table entry corresponding with gpte and +spte = pfn1 + + VCPU 0 VCPU0 +on fast page fault path: + + old_spte = *spte; + pfn1 is swapped out: +spte = 0; + + pfn1 is re-alloced for gfn2. + + gpte is changed to point to + gfn2 by the guest: +spte = pfn1; + + if (cmpxchg(spte, old_spte, old_spte+W) + mark_page_dirty(vcpu-kvm, gfn1) + OOPS!!! 
+ +We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap. + +For direct sp, we can easily avoid it since the spte of direct sp is fixed +to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic() +to pin gfn to pfn, because after gfn_to_pfn_atomic(): +- We have held the refcount of pfn that means the pfn can not be freed and + be reused for another gfn. +- The pfn is writable that means it can not be shared between different gfns + by KSM. + +Then, we can ensure the dirty bitmaps is correctly set for a gfn. + +2): flush tlbs due to shadow page table write-protected + +In rmap_write_protect(), we always need flush tlbs if spte.SPTE_ALLOW_WRITE = 1 +even if the current spte is read-only. The reason is fast page fault path +will mark the spte to writable and the writable spte will be cached into tlb. +Like below case: + +At the beginning: +spte.W = 0 +spte.SPTE_WRITE_PROTECT = 0 +spte.SPTE_ALLOW_WRITE = 1 + + VCPU 0 VCPU0 +In rmap_write_protect(): + + flush = false; + + if (spte.W == 1) + flush = true; + + On fast page fault path: + old_spte = *spte + cmpxchg(spte, old_spte, old_spte + W) + + the spte is fetched/prefetched into + tlb by CPU + + spte = (spte | SPTE_WRITE_PROTECT) + ~PT_WRITABLE_MASK; + + if (flush) + kvm_flush_remote_tlbs(vcpu-kvm) + OOPS!!! + +The tlbs are not flushed since the spte is read-only, but invalid writable +spte has been cached in the tlbs caused by fast page fault. + +3): Dirty bit tracking +In the origin code, the spte can be fast updated (non-atomically) if the +spte is read-only and the Accessed bit has already been set since the +Accessed bit and Dirty bit can not be lost. + +But it is not true after fast page fault since the spte can be marked +writable between reading spte and updating spte. Like below case: + +At the beginning: +spte.W = 0 +spte.Accessed = 1 + + VCPU 0 VCPU0 +In mmu_spte_clear_track_bits(): + + old_spte = *spte; + + /* 'if' condition is satisfied. 
*/ + if (old_spte.Accessed == 1 +old_spte.W == 0) + spte = 0ull; + on fast page fault path: + spte.W = 1 + memory write on the spte: + spte.Dirty = 1 + + + else + old_spte = xchg(spte, 0ull) + + + if (old_spte.Accessed == 1) + kvm_set_pfn_accessed(spte.pfn); + if (old_spte.Dirty ==
[PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
If this bit is set, it means the W bit of the spte is cleared due to shadow page table protection Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 56 ++- 1 files changed, 37 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd984b6..eb02fc4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644); #define SPTE_HOST_WRITEABLE(1ULL PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_ALLOW_WRITE (1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 1)) +#define SPTE_WRITE_PROTECT (1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 2)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } +static bool spte_wp_by_dirty_log(u64 spte) +{ + WARN_ON(is_writable_pte(spte)); + + return (spte SPTE_ALLOW_WRITE) !(spte SPTE_WRITE_PROTECT); +} + /* Return true if the spte is dropped. */ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large, - bool *flush) + bool *flush, bool page_table_protect) { u64 spte = *sptep; - if (!is_writable_pte(spte)) - return false; + if (is_writable_pte(spte)) { + *flush |= true; - *flush |= true; + if (large) { + pgprintk(rmap_write_protect(large): spte %p %llx\n, +spte, *spte); + BUG_ON(!is_large_pte(spte)); - if (large) { - pgprintk(rmap_write_protect(large): spte %p %llx\n, -spte, *spte); - BUG_ON(!is_large_pte(spte)); + drop_spte(kvm, sptep); + --kvm-stat.lpages; + return true; + } - drop_spte(kvm, sptep); - --kvm-stat.lpages; - return true; + goto reset_spte; } + if (page_table_protect spte_wp_by_dirty_log(spte)) + goto reset_spte; + + return false; + +reset_spte: rmap_printk(rmap_write_protect: spte %p %llx\n, spte, *spte); spte = spte ~PT_WRITABLE_MASK; + if (page_table_protect) + spte |= SPTE_WRITE_PROTECT; mmu_spte_update(sptep, spte); - return false; } -static bool -__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) 
+static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, +int level, bool page_table_protect) { u64 *sptep; struct rmap_iterator iter; @@ -1080,7 +1096,7 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) for (sptep = rmap_get_first(*rmapp, iter); sptep;) { BUG_ON(!(*sptep PT_PRESENT_MASK)); if (spte_write_protect(kvm, sptep, level PT_PAGE_TABLE_LEVEL, - write_protected)) { + write_protected, page_table_protect)) { sptep = rmap_get_first(*rmapp, iter); continue; } @@ -1109,7 +1125,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, while (mask) { rmapp = slot-rmap[gfn_offset + __ffs(mask)]; - __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); + __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); /* clear the first set bit */ mask = mask - 1; @@ -1128,7 +1144,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) for (i = PT_PAGE_TABLE_LEVEL; i PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, i); + write_protected |= __rmap_write_protect(kvm, rmapp, i, true); } return write_protected; @@ -1179,7 +1195,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte |= (u64)new_pfn PAGE_SHIFT; new_spte = ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE | - shadow_accessed_mask | SPTE_ALLOW_WRITE); + shadow_accessed_mask | SPTE_ALLOW_WRITE | + SPTE_WRITE_PROTECT); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -2346,6 +2363,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, ret = 1; pte_access = ~ACC_WRITE_MASK; spte = ~PT_WRITABLE_MASK; + spte |= SPTE_WRITE_PROTECT; } } @@
[PATCH v3 8/9] KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint
P bit of page fault error code is missed in this tracepoint, fix it by passing the full error code Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmutrace.h|7 +++ arch/x86/kvm/paging_tmpl.h |3 +-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 84da94f..e762d35 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -54,8 +54,8 @@ */ TRACE_EVENT( kvm_mmu_pagetable_walk, - TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), - TP_ARGS(addr, write_fault, user_fault, fetch_fault), + TP_PROTO(u64 addr, u32 pferr), + TP_ARGS(addr, pferr), TP_STRUCT__entry( __field(__u64, addr) @@ -64,8 +64,7 @@ TRACE_EVENT( TP_fast_assign( __entry-addr = addr; - __entry-pferr = (!!write_fault 1) | (!!user_fault 2) -| (!!fetch_fault 4); + __entry-pferr = pferr; ), TP_printk(addr %llx pferr %x %s, __entry-addr, __entry-pferr, diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 80493fb..8e6aac9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, const int fetch_fault = access PFERR_FETCH_MASK; u16 errcode = 0; - trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, -fetch_fault); + trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: eperm = false; walker-level = mmu-root_level; -- 1.7.7.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/4] Export offsets of VMCS fields as note information for kdump
From: Avi Kivity a...@redhat.com Subject: Re: [PATCH 0/4] Export offsets of VMCS fields as note information for kdump Date: Thu, 19 Apr 2012 15:08:00 +0300 On 04/19/2012 03:01 PM, HATAYAMA Daisuke wrote: It would be not helpful for the qemu crash case you are concerned about. We want to use the guest state data to look into guest machine's image in the crasshed qemu. Why? It seems natural to check the situation from guest machine's side when qemu crashs. Suppose a service is running on the guest machine, and then the qemu crash. Then, we may need to know the details of the progress of the service if it's important. What has been successfully done, and what has not yet. How can a service on the guest be related to a qemu crash? And how would guest registers help? I don't mean the service is related to qemu's crash. When qemu crashes, then the guest machine also crashes (although it doesn't notice anything). What I'm interested in here is guest machine side, not qemu side. I want to debug guest machine, not qemu. You can extract the list of running processes from a qemu crash dump without looking at guest registers. And most vcpus are running asynchronously to qemu, so their register contents is irrelevant (if a vcpu is running synchronously with qemu - it just exited to qemu and is waiting for a response - then you'd see the details in qemu's call stack). Just as you say, we can refer to guest machine's memory without guest registers. The definitely necessary data in vmcs are RSP and RIP, which are not saved in anywhare of kvm module. The two registers are needed for back tracing to determine what processsing is being done on the guest machine at qemu crash. There are other useful data in vmcs, but even if we don't have them, we can do what we want to do in exchange of usability. For example, we want IA32_EFER.LMA to determine whether guest machine is in 32-bit or 64-bit mode. But there are only two modes, we can perhaps try these in order in an ad-hoc way. 
Other control registers are also useful, but they are not as important as the ones definitely needed. Thanks. HATAYAMA, Daisuke -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH 2/2] Expose tsc deadline timer cpuid to guest
On 2012-04-19 22:03, Eduardo Habkost wrote: Jan/Avi: ping? I would like to get this ABI detail clarified so it can be implemented the right way on Qemu and KVM. My proposal is to simply add tsc-deadline to the data returned by GET_SUPPORTED_CPUID, making KVM_CAP_TSC_DEADLINE_TIMER unnecessary. IIRC, there were problems with this model to exclude that the feature gets reported this way without ensuring that in-kernel irqchip is actually activated. Please browse the discussions, it should be documented there. Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4] KVM: PPC Use clockevent multiplier and shifter for decrementer
On 18.04.2012, at 18:01, Bharat Bhushan wrote: Time for which the hrtimer is started for decrementer emulation is calculated using tb_ticks_per_usec. While hrtimer uses the clockevent for DEC reprogramming (if needed) and which calculate timebase ticks using the multiplier and shifter mechanism implemented within clockevent layer. It was observed that this conversion (timebase-time-timebase) are not correct because the mechanism are not consistent. In our setup it adds 2% jitter. With this patch clockevent multiplier and shifter mechanism are used when starting hrtimer for decrementer emulation. Now the jitter is 0.5%. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com Thanks, applied to kvm-ppc-next with fixed commit message and fixed trailing whitespace :). Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [RFC PATCH 6/9] pc: pass paravirt info for hotplug memory slots to BIOS
On 04/19/2012 04:08 PM, Vasilis Liaskovitis wrote: The numa_fw_cfg paravirt interface is extended to include SRAT information for all hotplug-able memslots. There are 3 words for each hotplug-able memory slot, denoting start address, size and node proximity. nb_numa_nodes is set to 1 by default (not 0), so that we always pass srat info to SeaBIOS. This information is used by Seabios to build hotplug memory device objects at runtime. Signed-off-by: Vasilis Liaskovitisvasilis.liaskovi...@profitbricks.com --- hw/pc.c | 59 +-- vl.c|4 +++- 2 files changed, 56 insertions(+), 7 deletions(-) diff --git a/hw/pc.c b/hw/pc.c index 67f0479..f1f550a 100644 --- a/hw/pc.c +++ b/hw/pc.c @@ -46,6 +46,7 @@ #include ui/qemu-spice.h #include memory.h #include exec-memory.h +#include memslot.h /* output Bochs bios info messages */ //#define DEBUG_BIOS @@ -592,12 +593,15 @@ int e820_add_entry(uint64_t address, uint64_t length, uint32_t type) return index; } +static void bochs_bios_setup_hp_memslots(uint64_t *fw_cfg_slots); + static void *bochs_bios_init(void) { void *fw_cfg; uint8_t *smbios_table; size_t smbios_len; uint64_t *numa_fw_cfg; +uint64_t *hp_memslots_fw_cfg; int i, j; register_ioport_write(0x400, 1, 2, bochs_bios_write, NULL); @@ -630,28 +634,71 @@ static void *bochs_bios_init(void) fw_cfg_add_bytes(fw_cfg, FW_CFG_HPET, (uint8_t *)hpet_cfg, sizeof(struct hpet_fw_config)); /* allocate memory for the NUMA channel: one (64bit) word for the number - * of nodes, one word for each VCPU-node and one word for each node to - * hold the amount of memory. + * of nodes, one word for the number of hotplug memory slots, one word + * for each VCPU-node, one word for each node to hold the amount of memory. + * Finally three words for each hotplug memory slot, denoting start address, + * size and node proximity. 
*/ -numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8); +numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_memslots) * 8); numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes); +numa_fw_cfg[1] = cpu_to_le64(nb_hp_memslots); this will brake compatibility if guest was migrated from old-new qemu than on reboot it will use old bios that expects numa_fw_cfg[1] to be something else. Could memslots info be moved to the end of an existing interface? + for (i = 0; i max_cpus; i++) { for (j = 0; j nb_numa_nodes; j++) { if (node_cpumask[j] (1 i)) { -numa_fw_cfg[i + 1] = cpu_to_le64(j); +numa_fw_cfg[i + 2] = cpu_to_le64(j); break; } } } for (i = 0; i nb_numa_nodes; i++) { -numa_fw_cfg[max_cpus + 1 + i] = cpu_to_le64(node_mem[i]); +numa_fw_cfg[max_cpus + 2 + i] = cpu_to_le64(node_mem[i]); } + +hp_memslots_fw_cfg = numa_fw_cfg + 2 + max_cpus + nb_numa_nodes; +if (nb_hp_memslots) +bochs_bios_setup_hp_memslots(hp_memslots_fw_cfg); + fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, (uint8_t *)numa_fw_cfg, - (1 + max_cpus + nb_numa_nodes) * 8); + (2 + max_cpus + nb_numa_nodes + 3 * nb_hp_memslots) * 8); return fw_cfg; } +static void bochs_bios_setup_hp_memslots(uint64_t *fw_cfg_slots) +{ +int i = 0; +Error *err = NULL; +DeviceState *dev; +MemSlotState *slot; +char *type; +BusState *bus = sysbus_get_default(); + +QTAILQ_FOREACH(dev,bus-children, sibling) { +type = object_property_get_str(OBJECT(dev), type,err); +if (err) { +error_free(err); +fprintf(stderr, error getting device type\n); +exit(1); +} + +if (!strcmp(type, memslot)) { +if (!dev-id) { +error_free(err); +fprintf(stderr, error getting memslot device id\n); +exit(1); +} +if (!strcmp(dev-id, initialslot)) continue; +slot = MEMSLOT(dev); +fw_cfg_slots[3 * slot-idx] = cpu_to_le64(slot-start); +fw_cfg_slots[3 * slot-idx + 1] = cpu_to_le64(slot-size); +fw_cfg_slots[3 * slot-idx + 2] = cpu_to_le64(slot-node); +i++; +} +} +assert(i == nb_hp_memslots); +} + static long get_file_size(FILE *f) { long where, size; diff --git 
a/vl.c b/vl.c index ae91a8a..50df453 100644 --- a/vl.c +++ b/vl.c @@ -3428,8 +3428,10 @@ int main(int argc, char **argv, char **envp) register_savevm_live(NULL, ram, 0, 4, NULL, ram_save_live, NULL, ram_load, NULL); +if (!nb_numa_nodes) +nb_numa_nodes = 1; -if (nb_numa_nodes 0) { +{ int i; if (nb_numa_nodes MAX_NODES) { -- - Igor -- To unsubscribe from this list: send
Re: [Qemu-devel] [RFC PATCH 2/9][SeaBIOS] Implement acpi-dsdt functions for memory hotplug.
On 04/19/2012 04:08 PM, Vasilis Liaskovitis wrote: Extend the DSDT to include methods for handling memory hot-add and hot-remove notifications and memory device status requests. These functions are called from the memory device SSDT methods. Eject has only been tested with level gpe event, but will be changed to edge gpe event soon, according to recent master patch for other ACPI hotplug events. Signed-off-by: Vasilis Liaskovitisvasilis.liaskovi...@profitbricks.com --- src/acpi-dsdt.dsl | 68 +++- 1 files changed, 66 insertions(+), 2 deletions(-) diff --git a/src/acpi-dsdt.dsl b/src/acpi-dsdt.dsl index 4bdc268..184daf0 100644 --- a/src/acpi-dsdt.dsl +++ b/src/acpi-dsdt.dsl @@ -709,9 +709,72 @@ DefinitionBlock ( } Return(One) } -} +/* Objects filled in by run-time generated SSDT */ +External(MTFY, MethodObj) +External(MEON, PkgObj) + +Method (CMST, 1, NotSerialized) { +// _STA method - return ON status of memdevice +// Local0 = MEON flag for this cpu +Store(DerefOf(Index(MEON, Arg0)), Local0) +If (Local0) { Return(0xF) } Else { Return(0x0) } +} +/* Memory eject notify method */ +OperationRegion(MEMJ, SystemIO, 0xaf40, 32) +Field (MEMJ, ByteAcc, NoLock, Preserve) +{ +MPE, 256 +} + +Method (MPEJ, 2, NotSerialized) { +// _EJ0 method - eject callback +Store(ShiftLeft(1,Arg0), MPE) +Sleep(200) +} MPE is write only and only one memslot is ejected at a time. Why 256 bit-field is here then? Could we use just 1 byte and write a slot number into it and save some io address space this way? + +/* Memory hotplug notify method */ +OperationRegion(MEST, SystemIO, 0xaf20, 32) It's more a suggestion: move it a bit farther to allow maybe 1024 cpus in the future. That will prevent compatibility a headache, if we decide to expand support to more then 256 cpus. Or event better to make this address configurable in run-time and build this var along with SSDT (converting along the way all other hard-coded io ports to the same generic run-time interface). 
This wish is out of scope of this patch-set, but what do you think about the idea? +Field (MEST, ByteAcc, NoLock, Preserve) +{ +MES, 256 +} + +Method(MESC, 0) { +// Local5 = active memdevice bitmap +Store (MES, Local5) +// Local2 = last read byte from bitmap +Store (Zero, Local2) +// Local0 = memory device iterator +Store (Zero, Local0) +While (LLess(Local0, SizeOf(MEON))) { +// Local1 = MEON flag for this memory device +Store(DerefOf(Index(MEON, Local0)), Local1) +If (And(Local0, 0x07)) { +// Shift down previously read bitmap byte +ShiftRight(Local2, 1, Local2) +} Else { +// Read next byte from memdevice bitmap +Store(DerefOf(Index(Local5, ShiftRight(Local0, 3))), Local2) +} +// Local3 = active state for this memory device +Store(And(Local2, 1), Local3) +If (LNotEqual(Local1, Local3)) { +// State change - update MEON with new state +Store(Local3, Index(MEON, Local0)) +// Do MEM notify +If (LEqual(Local3, 1)) { +MTFY(Local0, 1) +} Else { +MTFY(Local0, 3) +} +} +Increment(Local0) +} +Return(One) +} +} / * General purpose events / @@ -732,7 +795,8 @@ DefinitionBlock ( Return(\_SB.PRSC()) } Method(_L03) { -Return(0x01) +// Memory hotplug event +Return(\_SB.MESC()) } Method(_L04) { Return(0x01) -- - Igor -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [RFC PATCH 2/9][SeaBIOS] Implement acpi-dsdt functions for memory hotplug.
Hi, On Fri, Apr 20, 2012 at 12:55:24PM +0200, Igor Mammedov wrote: +/* Memory eject notify method */ +OperationRegion(MEMJ, SystemIO, 0xaf40, 32) +Field (MEMJ, ByteAcc, NoLock, Preserve) +{ +MPE, 256 +} + +Method (MPEJ, 2, NotSerialized) { +// _EJ0 method - eject callback +Store(ShiftLeft(1,Arg0), MPE) +Sleep(200) +} MPE is write only and only one memslot is ejected at a time. Why 256 bit-field is here then? Could we use just 1 byte and write a slot number into it and save some io address space this way? good point. This was implemented similarly to the hot-add/status register only for symmetry, but you are right, since only one slot is ejected at a time, this can be reduced to one byte and save space. I will update for the next version. + +/* Memory hotplug notify method */ +OperationRegion(MEST, SystemIO, 0xaf20, 32) It's more a suggestion: move it a bit farther to allow maybe 1024 cpus in the future. That will prevent compatibility a headache, if we decide to expand support to more then 256 cpus. ok, I will move it to 0xaf80 or higher (so cpu-hotplug could be extended to at least 1024 cpus) Or event better to make this address configurable in run-time and build this var along with SSDT (converting along the way all other hard-coded io ports to the same generic run-time interface). This wish is out of scope of this patch-set, but what do you think about the idea? yes, that would give more flexibility and avoid more compatibility headaches. As you say it's not a main issue for the series, but I can work on it as we start converting hardcoded i/o ports to configurable properties. thanks, - Vasilis -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Networking performance on a KVM Host (with no guests)
On 4/18/2012 10:43 PM, Gleb Natapov wrote: On Thu, Apr 19, 2012 at 03:53:39AM +0000, Chegu Vinod wrote: Hello, Perhaps this query was answered in the past. If yes kindly point me to the same. We noticed differences in networking performance (measured via netperf over a 10G NIC) on an X86_64 server between the following two configurations : 1) Server run as a KVM Host (but with no KVM guests created on it (or) no extra bridges created other than the default virbr0 bridge) vs. 2) The same server running the same version of Linux but without any of the virtualization software installed on it. Config #2 performed much better ! Is this expected ? What are the reasons behind this ? Is there any way to gain back the loss in performance on the KVM host without having to uninstall the virtualization software (i.e. in Config 1). Just an observation : Removing the intel_iommu=on boot time parameter in the Config 1 case seemed to help gain back the networking performance on the KVM host... i.e. performed nearly the same as in Config2. Didn't have to change any of the VT settings in the BIOS... Trying to understand why that's the case... Any pointers ? Vinod -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/9] ACPI memory hotplug
On Thu, Apr 19, 2012 at 04:08:38PM +0200, Vasilis Liaskovitis wrote: series is based on uq/master for qemu-kvm, and master for seabios. Can be found also at: forgot to paste the repo links in the original coverletter, here they are if someone wants them: https://github.com/vliaskov/qemu-kvm/commits/memory-hotplug https://github.com/vliaskov/seabios/commits/memory-hotplug thanks, - Vasilis -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH 2/2] Expose tsc deadline timer cpuid to guest
On Fri, Apr 20, 2012 at 12:12:38PM +0200, Jan Kiszka wrote: On 2012-04-19 22:03, Eduardo Habkost wrote: Jan/Avi: ping? I would like to get this ABI detail clarified so it can be implemented the right way on Qemu and KVM. My proposal is to simply add tsc-deadline to the data returned by GET_SUPPORTED_CPUID, making KVM_CAP_TSC_DEADLINE_TIMER unnecessary. IIRC, there were problems with this model to exclude that the feature gets reported this way without ensuring that in-kernel irqchip is actually activated. Please browse the discussions, it should be documented there. The flag was never added to GET_SUPPORTED_CPUID on any of the git repositories I have checked, and I don't see that being done anywhere on my KVM mailing list archives, either. So I don't see how we could have had issues with GET_SUPPORTED_CPUID, if it was never present on the code. What was present on the code before the fix, was coded that unconditinally enabled the flag even if Qemu never asked for it, and that really was wrong. GET_SUPPORTED_CPUID doesn't enable any flag: it just tells userspace that the hardware and KVM supports the feature (and, surprise, this is exactly what KVM_CAP_TSC_DEADLINE_TIMER means too, isn't it?). It's up to userspace to enable the CPUID bits according to user requirements and userspace capabilities (in other words: only when userspace knows it is safe because the in-kernel irqchip is enabled). If the above is not what GET_SUPPORTED_CPUID means, I would like to get that clarified, so I can submit an updated to KVM API documentation. -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Question about host CPU usage/allocation by KVM
On Thu, 19 Apr 2012 13:01:54 -0500, Alexander Lyakas alex.bols...@gmail.com wrote: Hi Stuart, I have been doing some experiments, and I noticed that there are additional QEMU threads, besides the ones reported by info cpus command. In particular, the main QEMU thread (the one whose LWP is the same as its PID), also consumes significant CPU time. Is this expected? The extra threads are for various things. It can be for the vnc server if you are using it. Threads are used to mimic aio in certain situations. Etc. The main thread also does a lot of the device emulation work (console, network, serial, block, etc.) Alex. On Wed, Apr 18, 2012 at 8:24 PM, Stuart Yoder b08...@gmail.com wrote: On Tue, Apr 17, 2012 at 4:54 PM, Alexander Lyakas alex.bols...@gmail.com wrote: Greetings everybody, Can anybody please point me to code/documentation regarding the following questions I have: - What does it actually mean using -smp N option, in terms of CPU sharing between the host and the guest? - How are guest CPUs mapped to host CPUs (if at all)? Each guest CPU (vcpu) corresponds to a QEMU thread. You can see the thread ids in QEMU with info cpus in the QEMU monitor. Since a vcpu is a thread you can apply standard Linux mechanisms to managing those threads-- CPU affinity, etc. Stuart -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH 2/2] Expose tsc deadline timer cpuid to guest
On 2012-04-20 17:00, Eduardo Habkost wrote: On Fri, Apr 20, 2012 at 12:12:38PM +0200, Jan Kiszka wrote: On 2012-04-19 22:03, Eduardo Habkost wrote: Jan/Avi: ping? I would like to get this ABI detail clarified so it can be implemented the right way on Qemu and KVM. My proposal is to simply add tsc-deadline to the data returned by GET_SUPPORTED_CPUID, making KVM_CAP_TSC_DEADLINE_TIMER unnecessary. IIRC, there were problems with this model to exclude that the feature gets reported this way without ensuring that in-kernel irqchip is actually activated. Please browse the discussions, it should be documented there. The flag was never added to GET_SUPPORTED_CPUID on any of the git repositories I have checked, and I don't see that being done anywhere on my KVM mailing list archives, either. So I don't see how we could have had issues with GET_SUPPORTED_CPUID, if it was never present on the code. What was present on the code before the fix, was coded that unconditinally enabled the flag even if Qemu never asked for it, and that really was wrong. GET_SUPPORTED_CPUID doesn't enable any flag: it just tells userspace that the hardware and KVM supports the feature (and, surprise, this is exactly what KVM_CAP_TSC_DEADLINE_TIMER means too, isn't it?). It's up to userspace to enable the CPUID bits according to user requirements and userspace capabilities (in other words: only when userspace knows it is safe because the in-kernel irqchip is enabled). If the above is not what GET_SUPPORTED_CPUID means, I would like to get that clarified, so I can submit an updated to KVM API documentation. Does old userspace filter out flags from GET_SUPPORTED_CPUID that it does not understand? Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH 2/2] Expose tsc deadline timer cpuid to guest
On Fri, Apr 20, 2012 at 05:19:17PM +0200, Jan Kiszka wrote: On 2012-04-20 17:00, Eduardo Habkost wrote: On Fri, Apr 20, 2012 at 12:12:38PM +0200, Jan Kiszka wrote: On 2012-04-19 22:03, Eduardo Habkost wrote: Jan/Avi: ping? I would like to get this ABI detail clarified so it can be implemented the right way on Qemu and KVM. My proposal is to simply add tsc-deadline to the data returned by GET_SUPPORTED_CPUID, making KVM_CAP_TSC_DEADLINE_TIMER unnecessary. IIRC, there were problems with this model to exclude that the feature gets reported this way without ensuring that in-kernel irqchip is actually activated. Please browse the discussions, it should be documented there. The flag was never added to GET_SUPPORTED_CPUID on any of the git repositories I have checked, and I don't see that being done anywhere on my KVM mailing list archives, either. So I don't see how we could have had issues with GET_SUPPORTED_CPUID, if it was never present on the code. What was present on the code before the fix, was coded that unconditinally enabled the flag even if Qemu never asked for it, and that really was wrong. GET_SUPPORTED_CPUID doesn't enable any flag: it just tells userspace that the hardware and KVM supports the feature (and, surprise, this is exactly what KVM_CAP_TSC_DEADLINE_TIMER means too, isn't it?). It's up to userspace to enable the CPUID bits according to user requirements and userspace capabilities (in other words: only when userspace knows it is safe because the in-kernel irqchip is enabled). If the above is not what GET_SUPPORTED_CPUID means, I would like to get that clarified, so I can submit an updated to KVM API documentation. Does old userspace filter out flags from GET_SUPPORTED_CPUID that it does not understand? It's even more strict than that: it only enables what was explicitly enabled on the CPU model asked by the user. 
But: The only exception is -cpu host, that tries to enable everything, even flags Qemu never heard of, and that is something that may require some changes on the API design (or at least documentation clarifications). Today -cpu host can't differentiate (A) a feature that KVM supports and emulate by itself and can be enabled without any support from userspace and (B) a feature that KVM supports but need support from userspace to be enabled. I am sure we will be able to find multiple examples of (B) inside the flags returned by GET_SUPPORTED_CPUID today. We could decide to never add any new flag to GET_SUPPORTED_CPUID if it requires additional userspace support to work, from now on, and create new KVM_CAP_* flags for them. But, we really want to do that for almost every new CPUID feature bit in the future? We also have the problem of defining what requires support from userspace to work means: I am not sure this has the same meaning for everybody. Many new features require userspace support only for migration, and nothing else, but some users don't need migration to work. Do those features qualify as (A), or as (B)? -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [RFC PATCH 6/9] pc: pass paravirt info for hotplug memory slots to BIOS
On Fri, Apr 20, 2012 at 12:33:57PM +0200, Igor Mammedov wrote: On 04/19/2012 04:08 PM, Vasilis Liaskovitis wrote: -numa_fw_cfg = g_malloc0((1 + max_cpus + nb_numa_nodes) * 8); +numa_fw_cfg = g_malloc0((2 + max_cpus + nb_numa_nodes + 3 * nb_hp_memslots) * 8); numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes); +numa_fw_cfg[1] = cpu_to_le64(nb_hp_memslots); this will brake compatibility if guest was migrated from old-new qemu than on reboot it will use old bios that expects numa_fw_cfg[1] to be something else. Could memslots info be moved to the end of an existing interface? right. The number of memslots can be placed at 1 + max_cpus + nb_numa_nodes, instead of right after the number of nodes. This way the old layout is preserved, and all memslot info comes at the end. I will rewrite. thanks, - Vasilis -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Networking performance on a KVM Host (with no guests)
On Fri, Apr 20, 2012, Chegu Vinod wrote about Re: Networking performance on a KVM Host (with no guests): Removing the intel_iommu=on boot time parameter in the Config 1 case seemed to help intel_iommu=on is essential when you're mostly running guests *and* using device assignment. However, unfortunately, it also has a serious performance penalty for I/O in *host* processes: When intel_iommu=on, Linux (completely unrelated to KVM) adds a new level of protection which didn't exist without an IOMMU - the network card, which without an IOMMU could write (via DMA) to any memory location, now is not allowed - the card can only write to memory locations which the OS wanted it to write. Theoretically, this can protect the OS against various kinds of attacks. But what happens now is that every time that Linux passes a new buffer to the card, it needs to change the IOMMU mappings. This noticeably slows down I/O, unfortunately. -- Nadav Har'El|Friday, Apr 20 2012, n...@math.technion.ac.il |- Phone +972-523-790466, ICQ 13349191 |A bird in the hand is safer than one http://nadav.harel.org.il |overhead. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: vhost-blk development
Ok, so is this it? https://lkml.org/lkml/2011/7/28/175 And what, once it's compiled, it intercepts the virtio-blk requests? If not, how is it enabled in the kvm instance? Best, Mike - Original Message - From: Liu Yuan namei.u...@gmail.com To: Michael Baysek mbay...@liquidweb.com Cc: kvm@vger.kernel.org Sent: Thursday, April 19, 2012 9:28:45 PM Subject: Re: vhost-blk development On 04/20/2012 04:26 AM, Michael Baysek wrote: Can you point me to the latest revision of the code and provide some guidance on how to test it? I really would love to see if it helps. There is no latest revision, I didn't continue the development when I saw the sign that it wouldn't be accepted. Thanks, Yuan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
KVM: add kvm_arch_para_features stub to asm-generic/kvm_para.h
Needed by kvm_para_has_feature(). Reported-by: Stephen Rothwell s...@canb.auug.org.au Signed-off-by: Marcelo Tosatti mtosa...@redhat.com diff --git a/include/asm-generic/kvm_para.h b/include/asm-generic/kvm_para.h index 05ef7e7..9a7bbad 100644 --- a/include/asm-generic/kvm_para.h +++ b/include/asm-generic/kvm_para.h @@ -11,4 +11,9 @@ static inline bool kvm_check_and_clear_guest_paused(void) return false; } +static inline unsigned int kvm_arch_para_features(void) +{ + return 0; +} + #endif -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages
On Fri, 13 Apr 2012 15:38:41 -0700 Ying Han ying...@google.com wrote: The mmu_shrink() is heavy by itself by iterating all kvms and holding the kvm_lock. spotted the code w/ Rik during LSF, and it turns out we don't need to call the shrinker if nothing to shrink. We should probably tell the kvm maintainers about this ;) --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -188,6 +188,11 @@ static u64 __read_mostly shadow_mmio_mask; static void mmu_spte_set(u64 *sptep, u64 spte); +static inline int get_kvm_total_used_mmu_pages() +{ + return percpu_counter_read_positive(kvm_total_used_mmu_pages); +} + void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { shadow_mmio_mask = mmio_mask; @@ -3900,6 +3905,9 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) if (nr_to_scan == 0) goto out; + if (!get_kvm_total_used_mmu_pages()) + return 0; + raw_spin_lock(kvm_lock); list_for_each_entry(kvm, vm_list, vm_list) { @@ -3926,7 +3934,7 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) raw_spin_unlock(kvm_lock); out: - return percpu_counter_read_positive(kvm_total_used_mmu_pages); + return get_kvm_total_used_mmu_pages(); } static struct shrinker mmu_shrinker = { There's a small functional change: percpu_counter_read_positive() is an approximate thing, so there will be cases where there will be some pages which are accounted for only in the percpu_counter's per-cpu accumulators. In that case mmu_shrink() will bale out when there are in fact some freeable pages available. This is hopefully unimportant. Do we actually know that this patch helps anything? Any measurements? Is kvm_total_used_mmu_pages==0 at all common? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages
On 04/20/2012 06:11 PM, Andrew Morton wrote: On Fri, 13 Apr 2012 15:38:41 -0700 Ying Hanying...@google.com wrote: The mmu_shrink() is heavy by itself by iterating all kvms and holding the kvm_lock. spotted the code w/ Rik during LSF, and it turns out we don't need to call the shrinker if nothing to shrink. @@ -3900,6 +3905,9 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) if (nr_to_scan == 0) goto out; + if (!get_kvm_total_used_mmu_pages()) + return 0; + Do we actually know that this patch helps anything? Any measurements? Is kvm_total_used_mmu_pages==0 at all common? On re-reading mmu.c, it looks like even with EPT or NPT, we end up creating mmu pages for the nested page tables. I have not had the time to look into it more, but it would be nice to know if the patch has any effect at all. -- All rights reversed -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages
On Fri, Apr 20, 2012 at 3:53 PM, Rik van Riel r...@redhat.com wrote: On 04/20/2012 06:11 PM, Andrew Morton wrote: On Fri, 13 Apr 2012 15:38:41 -0700 Ying Hanying...@google.com wrote: The mmu_shrink() is heavy by itself by iterating all kvms and holding the kvm_lock. spotted the code w/ Rik during LSF, and it turns out we don't need to call the shrinker if nothing to shrink. @@ -3900,6 +3905,9 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) if (nr_to_scan == 0) goto out; + if (!get_kvm_total_used_mmu_pages()) + return 0; + Do we actually know that this patch helps anything? Any measurements? Is kvm_total_used_mmu_pages==0 at all common? On re-reading mmu.c, it looks like even with EPT or NPT, we end up creating mmu pages for the nested page tables. I think you are right here. So the patch doesn't help the real pain. My understanding of the real pain is the poor implementation of the mmu_shrinker. It iterates all the registered mmu_shrink callbacks for each kvm and only does little work at a time while holding two big locks. I learned from mikew@ (also ++cc-ed) that is causing latency spikes and unfairness among kvm instance in some of the experiment we've seen. Mike might tell more on that. --Ying I have not had the time to look into it more, but it would be nice to know if the patch has any effect at all. -- All rights reversed -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: Question about host CPU usage/allocation by KVM
| -Original Message- | From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On | Behalf Of Brian Jackson | Sent: Friday, April 20, 2012 8:01 AM | To: Stuart Yoder; Alexander Lyakas | Cc: kvm@vger.kernel.org | Subject: Re: Question about host CPU usage/allocation by KVM | | On Thu, 19 Apr 2012 13:01:54 -0500, Alexander Lyakas | alex.bols...@gmail.com wrote: | | Hi Stuart, | I have been doing some experiments, and I noticed that there are | additional QEMU threads, besides the ones reported by info cpus | command. In particular, the main QEMU thread (the one whose LWP is the | same as its PID), also consumes significant CPU time. Is this | expected? | | The extra threads are for various things. It can be for the vnc server if | you are using it. Threads are used to mimic aio in certain situations. | Etc. The main thread also does a lot of the device emulation work | (console, network, serial, block, etc.) Is there some way to know what they are for ? or how many can get created ? I noticed that the number of threads go from ~4 to ~70 if I do some I/O inside the guest. Sunny | | | | Alex. | | | On Wed, Apr 18, 2012 at 8:24 PM, Stuart Yoder b08...@gmail.com wrote: | On Tue, Apr 17, 2012 at 4:54 PM, Alexander Lyakas | alex.bols...@gmail.com wrote: | Greetings everybody, | | Can anybody please point me to code/documentation regarding the | following questions I have: | | - What does it actually mean using -smp N option, in terms of CPU | sharing between the host and the guest? | - How are guest CPUs mapped to host CPUs (if at all)? | | Each guest CPU (vcpu) corresponds to a QEMU thread. | You can see the thread ids in QEMU with info cpus in the QEMU | monitor. | | Since a vcpu is a thread you can apply standard Linux mechanisms to | managing those threads-- CPU affinity, etc. 
| | Stuart | -- | To unsubscribe from this list: send the line unsubscribe kvm in the | body of a message to majord...@vger.kernel.org More majordomo info at | http://vger.kernel.org/majordomo-info.html | -- | To unsubscribe from this list: send the line unsubscribe kvm in the body | of a message to majord...@vger.kernel.org More majordomo info at | http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect
On Fri, Apr 20, 2012 at 04:17:47PM +0800, Xiao Guangrong wrote: Introduce a common function to abstract spte write-protect to cleanup the code Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 60 ++- 1 files changed, 35 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4a3cc18..e70ff38 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1041,6 +1041,34 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } +/* Return true if the spte is dropped. */ +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool large, +bool *flush) +{ + u64 spte = *sptep; + + if (!is_writable_pte(spte)) + return false; + + *flush |= true; + + if (large) { + pgprintk(rmap_write_protect(large): spte %p %llx\n, + spte, *spte); + BUG_ON(!is_large_pte(spte)); + + drop_spte(kvm, sptep); + --kvm-stat.lpages; + return true; + } + + rmap_printk(rmap_write_protect: spte %p %llx\n, spte, *spte); + spte = spte ~PT_WRITABLE_MASK; + mmu_spte_update(sptep, spte); + + return false; +} + static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { @@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) for (sptep = rmap_get_first(*rmapp, iter); sptep;) { BUG_ON(!(*sptep PT_PRESENT_MASK)); - rmap_printk(rmap_write_protect: spte %p %llx\n, sptep, *sptep); - - if (!is_writable_pte(*sptep)) { - sptep = rmap_get_next(iter); - continue; - } - - if (level == PT_PAGE_TABLE_LEVEL) { - mmu_spte_update(sptep, *sptep ~PT_WRITABLE_MASK); - sptep = rmap_get_next(iter); - } else { - BUG_ON(!is_large_pte(*sptep)); - drop_spte(kvm, sptep); - --kvm-stat.lpages; It is preferable to remove all large sptes including read-only ones, the current behaviour, then to verify that no read-write transition can occur in fault paths (fault paths which are increasing in number). 
-- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote: If this bit is set, it means the W bit of the spte is cleared due to shadow page table protection Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 56 ++- 1 files changed, 37 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd984b6..eb02fc4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644); #define SPTE_HOST_WRITEABLE (1ULL PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_ALLOW_WRITE (1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 1)) +#define SPTE_WRITE_PROTECT (1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 2)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } +static bool spte_wp_by_dirty_log(u64 spte) +{ + WARN_ON(is_writable_pte(spte)); + + return (spte SPTE_ALLOW_WRITE) !(spte SPTE_WRITE_PROTECT); +} Is the information accurate? Say: - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE. - shadow gfn, rmap_write_protect finds page not WRITABLE. - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set. BTW, introduce SPTE_ALLOW_WRITE bit This bit indicates whether the spte is allow to be writable that means the gpte of this spte is writable and the pfn pointed by this spte is writable on host Other than the fact that each bit should have one meaning, how can this bit be accurate without write protection of the gpte? As soon as guest writes to gpte, information in bit is outdated. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Networking performance on a KVM Host (with no guests)
Nadav Har'El nyh at math.technion.ac.il writes: On Fri, Apr 20, 2012, Chegu Vinod wrote about Re: Networking performance on a KVM Host (with no guests): Removing the intel_iommu=on boot time parameter in the Config 1 case seemed to help intel_iommu=on is essential with you're mostly running guests *and* using device assignment. Yes. I do understand that... However, unfortunately, it also has a serious performance penalty for I/O in *host* processes: When intel_iommu=on, Linux (completely unrelated to KVM) adds a new level of protection which didn't exist without an IOMMU - the network card, which without an IOMMU could write (via DMA) to any memory location, now is not allowed - the card can only write to memory locates which the OS wanted it to write. Theoretically, this can protect the OS against various kinds of attacks. But what happens now is that every time that Linux passes a new buffer to the card, it needs to change the IOMMU mappings. This noticably slows down I/O, unfortunately. Hmm... So if one were to have a private link setup between two hosts to drive some traffic through between the hosts... then we can't expect to get line rate on that private NIC. Thanks much for your response ! Vinod -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
On Fri, Apr 20, 2012 at 06:52:11PM -0300, Marcelo Tosatti wrote: On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote: If this bit is set, it means the W bit of the spte is cleared due to shadow page table protection Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 56 ++- 1 files changed, 37 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd984b6..eb02fc4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644); #define SPTE_HOST_WRITEABLE(1ULL PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_ALLOW_WRITE (1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 1)) +#define SPTE_WRITE_PROTECT (1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 2)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } +static bool spte_wp_by_dirty_log(u64 spte) +{ + WARN_ON(is_writable_pte(spte)); + + return (spte SPTE_ALLOW_WRITE) !(spte SPTE_WRITE_PROTECT); +} Is the information accurate? Say: - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE. - shadow gfn, rmap_write_protect finds page not WRITABLE. - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set. BTW, introduce SPTE_ALLOW_WRITE bit This bit indicates whether the spte is allow to be writable that means the gpte of this spte is writable and the pfn pointed by this spte is writable on host Other than the fact that each bit should have one meaning, how can this bit be accurate without write protection of the gpte? As soon as guest writes to gpte, information in bit is outdated. 
Ok, i found one example where mmu_lock was expecting sptes not to change: VCPU0 VCPU1 - read-only gpte - read-only spte - write fault - spte = *sptep guest write to gpte, set writable bit spte writable parent page unsync guest write to gpte writable bit clear guest invlpg updates spte to RO sync_page enter set_spte from sync_page - cmpxchg(spte) is now writable [window where another vcpu can cache spte with writable bit set] if (is_writable_pte(entry) !is_writable_pte(*sptep)) kvm_flush_remote_tlbs(vcpu-kvm); The flush is not executed because spte was read-only (which is a correct assumption as long as sptes updates are protected by mmu_lock). So this is an example of implicit assumptions which break if you update spte without mmu_lock. Certainly there are more cases. :( -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
On Fri, Apr 20, 2012 at 09:40:30PM -0300, Marcelo Tosatti wrote: On Fri, Apr 20, 2012 at 06:52:11PM -0300, Marcelo Tosatti wrote: On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote: If this bit is set, it means the W bit of the spte is cleared due to shadow page table protection Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 56 ++- 1 files changed, 37 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd984b6..eb02fc4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644); #define SPTE_HOST_WRITEABLE (1ULL PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_ALLOW_WRITE (1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 1)) +#define SPTE_WRITE_PROTECT (1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 2)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } +static bool spte_wp_by_dirty_log(u64 spte) +{ + WARN_ON(is_writable_pte(spte)); + + return (spte SPTE_ALLOW_WRITE) !(spte SPTE_WRITE_PROTECT); +} Is the information accurate? Say: - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE. - shadow gfn, rmap_write_protect finds page not WRITABLE. - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set. BTW, introduce SPTE_ALLOW_WRITE bit This bit indicates whether the spte is allow to be writable that means the gpte of this spte is writable and the pfn pointed by this spte is writable on host Other than the fact that each bit should have one meaning, how can this bit be accurate without write protection of the gpte? As soon as guest writes to gpte, information in bit is outdated. 
Ok, i found one example where mmu_lock was expecting sptes not to change: VCPU0 VCPU1 - read-only gpte - read-only spte - write fault - spte = *sptep guest write to gpte, set writable bit spte writable parent page unsync guest write to gpte writable bit clear guest invlpg updates spte to RO sync_page enter set_spte from sync_page - cmpxchg(spte) is now writable [window where another vcpu can cache spte with writable bit set] if (is_writable_pte(entry) !is_writable_pte(*sptep)) kvm_flush_remote_tlbs(vcpu-kvm); The flush is not executed because spte was read-only (which is a correct assumption as long as sptes updates are protected by mmu_lock). So this is an example of implicit assumptions which break if you update spte without mmu_lock. Certainly there are more cases. :( OK, i now see you mentioned a similar case in the document, for rmap_write_protect. More importantly than the particular flush TLB case, the point is every piece of code that reads and writes sptes must now be aware that mmu_lock alone does not guarantee stability. Everything must be audited. Where the bulk of the improvement comes from again? If there is little or no mmu_lock contention (which we have no consistent data to be honest in your testcase) is the bouncing off mmu_lock's cacheline that hurts? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 07/16] KVM: MMU: introduce for_each_pte_list_spte
Sorry for the delay. On Wed, 18 Apr 2012 12:01:03 +0800 Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com wrote: I have checked dirty-log-perf myself with this patch [01-07]. GET_DIRTY_LOG for 1GB dirty pages has become more than 15% slower. Thanks for your test! Unbelievable, i will do more test and check it more carefully. GET_DIRTY_LOG now traverses rmap lists intensively. So only a small change can affect the performance when there are many dirty pages. Could you please open your tool, then i can reproduction it and find the real reason? It's already in kvm unit tests! I will check whether your tool is better then kernbench/autotest after review your tool. Let's focus on lock-less now: so dirty-log-perf is not needed now. I think you failed to appeal the real advantage of your lock-less approach! I will write about this on v3-threads. Thanks, Takuya -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 07/16] KVM: MMU: introduce for_each_pte_list_spte
On Wed, 18 Apr 2012 18:03:10 +0800 Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com wrote: By the way, what is the case did you test? ept = 1 ? Yes! I am not worrying about without-EPT/NPT-migration. Takuya -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 0/9] KVM: MMU: fast page fault
On Fri, Apr 20, 2012 at 04:16:52PM +0800, Xiao Guangrong wrote: Changlog: - split the long series for better review. This is the core implementation of fast page fault. - document fast page fault in locking.txt Great, this is a big improvement, thanks. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect
On Fri, 20 Apr 2012 18:33:19 -0300 Marcelo Tosatti mtosa...@redhat.com wrote: It is preferable to remove all large sptes including read-only ones, the current behaviour, than to verify that no read-write transition can occur in fault paths (fault paths which are increasing in number). I think we should use a separate function from spte_write_protect() for the large case. Takuya -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
On Fri, 20 Apr 2012 21:55:55 -0300 Marcelo Tosatti mtosa...@redhat.com wrote: More importantly than the particular flush TLB case, the point is every piece of code that reads and writes sptes must now be aware that mmu_lock alone does not guarantee stability. Everything must be audited. In addition, please give me some stress-test cases to verify these in the real environments. Live migration with KSM, with notifier call, etc? Although the current logic is verified by the dirty-log api test, the new logic may need another api test program. Note: the problem is that live migration can fail silently. We cannot know whether the data loss is from a guest side problem or from the get_dirty side. Where the bulk of the improvement comes from again? If there is little or no mmu_lock contention (which we have no consistent data to be honest in your testcase) is the bouncing off mmu_lock's cacheline that hurts? This week, I was doing some simplified worst-latency-tests for my work. It was more difficult than I thought. But Xiao's lock-less should see the reduction of mmu_lock contention more easily, if there is really some. To make things simple, e.g., we can do the same kind of write-loop as XBZRLE people are doing in the guest - with more VCPUs if possible. Thanks, Takuya -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages
On Fri, 20 Apr 2012 16:07:41 -0700 Ying Han ying...@google.com wrote: My understanding of the real pain is the poor implementation of the mmu_shrinker. It iterates all the registered mmu_shrink callbacks for each kvm and only does little work at a time while holding two big locks. I learned from mikew@ (also ++cc-ed) that is causing latency spikes and unfairness among kvm instance in some of the experiment we've seen. Last year, I discussed the mmu_shrink issues on kvm ML: [PATCH 0/4] KVM: Make mmu_shrink() scan nr_to_scan shadow pages http://www.spinics.net/lists/kvm/msg65231.html Sadly, we could not find any good way at that time. Thanks, Takuya -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages
On Fri, Apr 20, 2012 at 6:56 PM, Takuya Yoshikawa takuya.yoshik...@gmail.com wrote: On Fri, 20 Apr 2012 16:07:41 -0700 Ying Han ying...@google.com wrote: My understanding of the real pain is the poor implementation of the mmu_shrinker. It iterates all the registered mmu_shrink callbacks for each kvm and only does little work at a time while holding two big locks. I learned from mikew@ (also ++cc-ed) that is causing latency spikes and unfairness among kvm instance in some of the experiment we've seen. The pains we have with mmu_shrink are twofold: - Memory pressure against the shinker applies globally. Any task can cause pressure within their own environment (using numa or memcg) and cause the global shrinker to shrink all shadowed tables on the system (regardless of how memory is isolated between tasks). - Massive lock contention when all these CPUs are hitting the global lock (which backs everybody on the system up). In our situation, we simply disable the shrinker altogether. My understanding is that with EPT or NPT, the amount of memory used by these tables is bounded by the size of guest physical memory, whereas with software shadowed tables, it is bounded by the address spaces in the guest. This bound makes it reasonable to not do any reclaim and charge it as a system overhead tax. As for data, the most impressive result was a massive improvement in round-trip latency to a webserver running in a guest while another process on the system was thrashing through page-cache (on a dozen or so spinning disks iirc). We were using fake-numa, and would otherwise not expect the antagonist to drastically affect the latency-sensitive task (as per a lot of effort into making that work). Unfortunately, we saw the 99th%ile latency riding at the 140ms timeout cut-off (they were likely tailing out much longer), with the 95%ile at over 40ms. With the mmu_shrinker disabled, the 99th%ile latency quickly dropped down to about 20ms. 
CPU profiles were showing 30% of cpu time wasted on spinlocks, all the mmu_list_lock iirc. In our case, I'm much happier just disabling the damned thing altogether. Last year, I discussed the mmu_shrink issues on kvm ML: [PATCH 0/4] KVM: Make mmu_shrink() scan nr_to_scan shadow pages http://www.spinics.net/lists/kvm/msg65231.html Sadly, we could not find any good way at that time. Thanks, Takuya -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages
On Fri, 20 Apr 2012 19:15:24 -0700 Mike Waychison mi...@google.com wrote: In our situation, we simple disable the shrinker altogether. My understanding is that we EPT or NPT, the amount of memory used by these tables is bounded by the size of guest physical memory, whereas with software shadowed tables, it is bounded by the addresses spaces in the guest. This bound makes it reasonable to not do any reclaim and charge it as a system overhead tax. IIRC, KVM's mmu_shrink is mainly for protecting the host from pathological guest without EPT or NPT. You can see Avi's summary: -- http://www.spinics.net/lists/kvm/msg65671.html === We should aim for the following: - normal operation causes very little shrinks (some are okay) - high pressure mostly due to kvm results in kvm being shrunk (this is a pathological case caused by a starting a guest with a huge amount of memory, and mapping it all to /dev/zero (or ksm), and getting the guest the create shadow mappings for all of it) - general high pressure is shared among other caches like dcache and icache The cost of reestablishing an mmu page can be as high as half a millisecond of cpu time, which is the reason I want to be conservative. === Thanks, Takuya -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] Simplify RCU freeing of shadow pages
On Thu, Apr 19, 2012 at 07:26:35PM +0300, Avi Kivity wrote: This patchset simplifies the freeing by RCU of mmu pages. Xiao, I'm sure you thought of always freeing by RCU. Why didn't you choose this way? I saves a couple of atomics in the fast path. Avi Kivity (2): KVM: MMU: Always free shadow pages using RCU KVM: MMU: Recover space used by rcu_head in struct kvm_mmu_page arch/x86/include/asm/kvm_host.h |9 +++--- arch/x86/kvm/mmu.c | 58 --- 2 files changed, 15 insertions(+), 52 deletions(-) Check Documentation/RCU/checklist.txt item 8. a. Keeping a count of the number of data-structure elements used by the RCU-protected data structure, including those waiting for a grace period to elapse. Enforce a limit on this number, stalling updates as needed to allow previously deferred frees to complete. Alternatively, limit only the number awaiting deferred free rather than the total number of elements. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm: don't call mmu_shrinker w/o used_mmu_pages
On Fri, Apr 20, 2012 at 7:29 PM, Takuya Yoshikawa takuya.yoshik...@gmail.com wrote: On Fri, 20 Apr 2012 19:15:24 -0700 Mike Waychison mi...@google.com wrote: In our situation, we simple disable the shrinker altogether. My understanding is that we EPT or NPT, the amount of memory used by these tables is bounded by the size of guest physical memory, whereas with software shadowed tables, it is bounded by the addresses spaces in the guest. This bound makes it reasonable to not do any reclaim and charge it as a system overhead tax. IIRC, KVM's mmu_shrink is mainly for protecting the host from pathological guest without EPT or NPT. You can see Avi's summary: -- http://www.spinics.net/lists/kvm/msg65671.html === We should aim for the following: - normal operation causes very little shrinks (some are okay) - high pressure mostly due to kvm results in kvm being shrunk (this is a pathological case caused by a starting a guest with a huge amount of memory, and mapping it all to /dev/zero (or ksm), and getting the guest the create shadow mappings for all of it) - general high pressure is shared among other caches like dcache and icache The cost of reestablishing an mmu page can be as high as half a millisecond of cpu time, which is the reason I want to be conservative. To add to that, on these systems (32-way), the fault itself isn't as heavy-handed as a global lock in everyone's reclaim path :) I'd be very happy if this stuff was memcg aware, but until that happens, this code is disabled in our production builds. 30% of CPU time lost to a spinlock when mixing VMs with IO is worth paying the 1% of system ram these pages cost if it means tighter/more-deterministic service latencies. === Thanks, Takuya -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect
On 04/21/2012 05:33 AM, Marcelo Tosatti wrote: static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { @@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) for (sptep = rmap_get_first(*rmapp, iter); sptep;) { BUG_ON(!(*sptep PT_PRESENT_MASK)); -rmap_printk(rmap_write_protect: spte %p %llx\n, sptep, *sptep); - -if (!is_writable_pte(*sptep)) { -sptep = rmap_get_next(iter); -continue; -} - -if (level == PT_PAGE_TABLE_LEVEL) { -mmu_spte_update(sptep, *sptep ~PT_WRITABLE_MASK); -sptep = rmap_get_next(iter); -} else { -BUG_ON(!is_large_pte(*sptep)); -drop_spte(kvm, sptep); ---kvm-stat.lpages; It is preferable to remove all large sptes including read-only ones, the It can cause page faults even if read memory on these large sptse. Actually, Avi suggested that make large writable spte to be readonly (not dropped) on this path. current behaviour, then to verify that no read-write transition can occur in fault paths (fault paths which are increasing in number). Yes, the small spte also has issue (find a write-protected spte in fault paths). Later, the second part of this patchset will introduce rmap.WRITE_PROTECTED bit, then we can do the fast check before calling fast page fault. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit
On 04/21/2012 05:39 AM, Marcelo Tosatti wrote: @@ -1177,9 +1178,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte = *sptep ~PT64_BASE_ADDR_MASK; new_spte |= (u64)new_pfn PAGE_SHIFT; -new_spte = ~PT_WRITABLE_MASK; -new_spte = ~SPTE_HOST_WRITEABLE; -new_spte = ~shadow_accessed_mask; +new_spte = ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE | + shadow_accessed_mask | SPTE_ALLOW_WRITE); Each bit should have a distinct meaning. Here the host pte is being write-protected, which means only the SPTE_HOST_WRITEABLE bit should be cleared. Hmm, it is no problem if SPTE_ALLOW_WRITE is not cleared. But the meaning of SPTE_ALLOW_WRITE will become strange: we will see a spte with spte.SPTE_ALLOW_WRITE = 1 (means the spte is writable on host and guest) and spte.SPTE_HOST_WRITEABLE = 0 (means the spte is read-only on host). -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
On 04/21/2012 05:52 AM, Marcelo Tosatti wrote: On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote: If this bit is set, it means the W bit of the spte is cleared due to shadow page table protection Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 56 ++- 1 files changed, 37 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd984b6..eb02fc4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644); #define SPTE_HOST_WRITEABLE (1ULL PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_ALLOW_WRITE(1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 1)) +#define SPTE_WRITE_PROTECT (1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 2)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } +static bool spte_wp_by_dirty_log(u64 spte) +{ +WARN_ON(is_writable_pte(spte)); + +return (spte SPTE_ALLOW_WRITE) !(spte SPTE_WRITE_PROTECT); +} Is the information accurate? Say: - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE. - shadow gfn, rmap_write_protect finds page not WRITABLE. - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set. It can not happen, rmap_write_protect will set SPTE_WRITE_PROTECT even if the spte is not WRITABLE, please see: + if (page_table_protect spte_wp_by_dirty_log(spte)) + goto reset_spte; + + return false; + +reset_spte: rmap_printk(rmap_write_protect: spte %p %llx\n, spte, *spte); spte = spte ~PT_WRITABLE_MASK; + if (page_table_protect) + spte |= SPTE_WRITE_PROTECT; mmu_spte_update(sptep, spte); - return false; } BTW, introduce SPTE_ALLOW_WRITE bit This bit indicates whether the spte is allow to be writable that means the gpte of this spte is writable and the pfn pointed by this spte is writable on host Other than the fact that each bit should have one meaning, how can this bit be accurate without write protection of the gpte? 
Above explanation can ensure the meaning of this bit is accurate? Or it has another case? :) As soon as guest writes to gpte, information in bit is outdated. The bit will be updated when spte is updated. When the guest write gpte, the spte is not updated immediately, yes, the bit is outdated at that time, but it is ok since tlb is not flushed. After tlb flush, the bit can be coincident with gpte. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
On 04/21/2012 08:40 AM, Marcelo Tosatti wrote: On Fri, Apr 20, 2012 at 06:52:11PM -0300, Marcelo Tosatti wrote: On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote: If this bit is set, it means the W bit of the spte is cleared due to shadow page table protection Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 56 ++- 1 files changed, 37 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd984b6..eb02fc4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644); #define SPTE_HOST_WRITEABLE(1ULL PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_ALLOW_WRITE (1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 1)) +#define SPTE_WRITE_PROTECT (1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 2)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } +static bool spte_wp_by_dirty_log(u64 spte) +{ + WARN_ON(is_writable_pte(spte)); + + return (spte SPTE_ALLOW_WRITE) !(spte SPTE_WRITE_PROTECT); +} Is the information accurate? Say: - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE. - shadow gfn, rmap_write_protect finds page not WRITABLE. - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set. BTW, introduce SPTE_ALLOW_WRITE bit This bit indicates whether the spte is allow to be writable that means the gpte of this spte is writable and the pfn pointed by this spte is writable on host Other than the fact that each bit should have one meaning, how can this bit be accurate without write protection of the gpte? As soon as guest writes to gpte, information in bit is outdated. Ok, i found one example where mmu_lock was expecting sptes not to change: VCPU0 VCPU1 - read-only gpte - read-only spte - write fault It is not true, gpte is read-only, and it is a write fault, then we should reject the page fault to guest, the fast page fault is not called. 
:) - spte = *sptep guest write to gpte, set writable bit spte writable parent page unsync guest write to gpte writable bit clear guest invlpg updates spte to RO sync_page enter set_spte from sync_page - cmpxchg(spte) is now writable [window where another vcpu can cache spte with writable bit set] if (is_writable_pte(entry) !is_writable_pte(*sptep)) kvm_flush_remote_tlbs(vcpu-kvm); The flush is not executed because spte was read-only (which is a correct assumption as long as sptes updates are protected by mmu_lock). It is also not true, flush tlbs in set_sptes is used to ensure rmap_write_protect work correctly, but rmap_write_protect will flush tlbs even if the spte can be changed by fast page fault. So this is an example of implicit assumptions which break if you update spte without mmu_lock. Certainly there are more cases. :( We only need care the path which is depends on spte.WRITEABLE == 0, since only these spte has chance to be changed out of mmu-lock. The most trouble is in rmap_write_protect that need flush tlb to protect shadow page table. I think it is not too hard to check. :) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
On 04/21/2012 08:55 AM, Marcelo Tosatti wrote: So this is an example of implicit assumptions which break if you update spte without mmu_lock. Certainly there are more cases. :( OK, i now see you mentioned a similar case in the document, for rmap_write_protect. More importantly than the particular flush TLB case, the point is every piece of code that reads and writes sptes must now be aware that mmu_lock alone does not guarantee stability. Everything must be audited. Yes, that is true, but it is not hard to audit the code since we only change the spte from read-only to writable, also all information that fast page fault depends on is from spte. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect
On 04/21/2012 09:10 AM, Takuya Yoshikawa wrote: On Fri, 20 Apr 2012 18:33:19 -0300 Marcelo Tosatti mtosa...@redhat.com wrote: It is preferable to remove all large sptes including read-only ones, the current behaviour, then to verify that no read-write transition can occur in fault paths (fault paths which are increasing in number). I think we should use separate function than spte_write_protect() for the large case. I will introduce a function to handle large sptes when i implement the idea of making writable spte to be read-only. But, keep it in this patchset first. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 07/16] KVM: MMU: introduce for_each_pte_list_spte
On 04/21/2012 09:01 AM, Takuya Yoshikawa wrote: Sorry for the delay. On Wed, 18 Apr 2012 12:01:03 +0800 Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com wrote: I have checked dirty-log-perf myself with this patch [01-07]. GET_DIRTY_LOG for 1GB dirty pages has become more than 15% slower. Thanks for your test! Unbelievable, i will do more test and check it more carefully. GET_DIRTY_LOG now traverses rmap lists intensively. So only a small change can affect the performance when there are many dirty pages. Could you please open your tool, then i can reproduction it and find the real reason? It's already in kvm unit tests! Great, i will check that. I will check whether your tool is better then kernbench/autotest after review your tool. Let's focus on lock-less now: so dirty-log-perf is not needed now. I think you failed to appeal the real advantage of your lock-less approach! I will write about this on v3-threads. Thank you very much, Takuya! -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 4/9] KVM: MMU: introduce SPTE_ALLOW_WRITE bit
On Sat, Apr 21, 2012 at 11:30:55AM +0800, Xiao Guangrong wrote: On 04/21/2012 05:39 AM, Marcelo Tosatti wrote: @@ -1177,9 +1178,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte = *sptep ~PT64_BASE_ADDR_MASK; new_spte |= (u64)new_pfn PAGE_SHIFT; - new_spte = ~PT_WRITABLE_MASK; - new_spte = ~SPTE_HOST_WRITEABLE; - new_spte = ~shadow_accessed_mask; + new_spte = ~(PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE | +shadow_accessed_mask | SPTE_ALLOW_WRITE); Each bit should have a distinct meaning. Here the host pte is being write-protected, which means only the SPTE_HOST_WRITEABLE bit should be cleared. Hmm, it is no problem if SPTE_ALLOW_WRITE is not cleared. But the meaning of SPTE_ALLOW_WRITE will become strange: we will see a spte with spte.SPTE_ALLOW_WRITE = 1 (means the spte is writable on host and guest) and spte.SPTE_HOST_WRITEABLE = 0 (means the spte is read-only on host). You are combining gpte writable bit, and host pte writable bit (which are separate and independent of each other) into one bit. SPTE_HOST_WRITEABLE already indicates whether the host pte is writable or not. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 5/9] KVM: MMU: introduce SPTE_WRITE_PROTECT bit
On Sat, Apr 21, 2012 at 11:47:46AM +0800, Xiao Guangrong wrote: On 04/21/2012 05:52 AM, Marcelo Tosatti wrote: On Fri, Apr 20, 2012 at 04:19:17PM +0800, Xiao Guangrong wrote: If this bit is set, it means the W bit of the spte is cleared due to shadow page table protection Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c | 56 ++- 1 files changed, 37 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd984b6..eb02fc4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -147,6 +147,7 @@ module_param(dbg, bool, 0644); #define SPTE_HOST_WRITEABLE (1ULL PT_FIRST_AVAIL_BITS_SHIFT) #define SPTE_ALLOW_WRITE (1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 1)) +#define SPTE_WRITE_PROTECT(1ULL (PT_FIRST_AVAIL_BITS_SHIFT + 2)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -1042,36 +1043,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } +static bool spte_wp_by_dirty_log(u64 spte) +{ + WARN_ON(is_writable_pte(spte)); + + return (spte SPTE_ALLOW_WRITE) !(spte SPTE_WRITE_PROTECT); +} Is the information accurate? Say: - dirty log write protect, set SPTE_ALLOW_WRITE, clear WRITABLE. - shadow gfn, rmap_write_protect finds page not WRITABLE. - spte points to shadow gfn, but SPTE_WRITE_PROTECT is not set. It can not happen, rmap_write_protect will set SPTE_WRITE_PROTECT even if the spte is not WRITABLE, please see: + if (page_table_protect spte_wp_by_dirty_log(spte)) + goto reset_spte; + + return false; + +reset_spte: rmap_printk(rmap_write_protect: spte %p %llx\n, spte, *spte); spte = spte ~PT_WRITABLE_MASK; + if (page_table_protect) + spte |= SPTE_WRITE_PROTECT; mmu_spte_update(sptep, spte); - return false; } Right. What about sync path, fault path, prefault path, do they update SPTE_WRITE_PROTECT properly? 
BTW, introduce SPTE_ALLOW_WRITE bit This bit indicates whether the spte is allowed to be writable, that means the gpte of this spte is writable and the pfn pointed to by this spte is writable on host Other than the fact that each bit should have one meaning, how can this bit be accurate without write protection of the gpte? Above explanation can ensure the meaning of this bit is accurate? Or it has another case? :) No, it is out of sync with the guest pte. As soon as the guest writes to the gpte, the information in the bit is outdated. The bit will be updated when the spte is updated. When the guest writes the gpte, the spte is not updated immediately, yes, the bit is outdated at that time, but it is ok since the tlb is not flushed. Page faults cause TLB flushes. From the Intel manual: In addition to the instructions identified above, page faults invalidate entries in the TLBs and paging-structure caches. In particular, a page-fault exception resulting from an attempt to use a linear address will invalidate any TLB entries that are for a page number corresponding to that linear address and that are associated with the current PCID. After the tlb flush, the bit can be coincident with the gpte. You must read the gpte before updating from ro->rw, unless you write protect the gpte. IIRC you were doing that in previous patches? -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 2/9] KVM: MMU: abstract spte write-protect
On Sat, Apr 21, 2012 at 11:24:54AM +0800, Xiao Guangrong wrote: On 04/21/2012 05:33 AM, Marcelo Tosatti wrote: static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { @@ -1050,24 +1078,13 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); - - if (!is_writable_pte(*sptep)) { - sptep = rmap_get_next(&iter); - continue; - } - - if (level == PT_PAGE_TABLE_LEVEL) { - mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK); - sptep = rmap_get_next(&iter); - } else { - BUG_ON(!is_large_pte(*sptep)); - drop_spte(kvm, sptep); - --kvm->stat.lpages; It is preferable to remove all large sptes including read-only ones, the It can cause page faults even if read memory on these large sptes. Actually, Avi suggested making large writable sptes readonly (not dropped) on this path. See commits e49146dce8c3dc6f4485c1904b6587855f393e71, 38187c830cab84daecb41169948467f1f19317e3 for issues with large read-only sptes. current behaviour, then to verify that no read-write transition can occur in fault paths (fault paths which are increasing in number). Yes, the small spte also has issue (find a write-protected spte in fault paths). Later, the second part of this patchset will introduce the rmap.WRITE_PROTECTED bit, then we can do the fast check before calling fast page fault. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4] KVM: PPC Use clockevent multiplier and shifter for decrementer
On 18.04.2012, at 18:01, Bharat Bhushan wrote: Time for which the hrtimer is started for decrementer emulation is calculated using tb_ticks_per_usec. While hrtimer uses the clockevent for DEC reprogramming (if needed), which calculates timebase ticks using the multiplier and shifter mechanism implemented within the clockevent layer. It was observed that this conversion (timebase->time->timebase) is not correct because the mechanisms are not consistent. In our setup it adds 2% jitter. With this patch the clockevent multiplier and shifter mechanism are used when starting the hrtimer for decrementer emulation. Now the jitter is 0.5%. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com Thanks, applied to kvm-ppc-next with fixed commit message and fixed trailing whitespace :). Alex -- To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html