Re: [PATCH] KVM: MMU: fix check the reserved bits on the gpte of L2

2013-08-05 Thread Gleb Natapov
On Mon, Aug 05, 2013 at 12:59:19PM +0800, Xiao Guangrong wrote:
 Current code always uses arch.mmu to check the reserved bits on guest gpte
 which is valid only for L1 guest, we should use arch.nested_mmu instead when
 we translate gva to gpa for the L2 guest
 
 Fix it by using @mmu instead since it is adapted to the current mmu mode
 automatically
 
 The bug can be triggered when nested npt is used and L1 guest and L2 guest
 use different mmu mode
 
 Reported-by: Jan Kiszka jan.kis...@siemens.com
 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Reviewed-by: Gleb Natapov g...@redhat.com

 ---
  arch/x86/kvm/paging_tmpl.h | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)
 
 diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
 index 7769699..3a75828 100644
 --- a/arch/x86/kvm/paging_tmpl.h
 +++ b/arch/x86/kvm/paging_tmpl.h
 @@ -218,8 +218,7 @@ retry_walk:
   if (unlikely(!is_present_gpte(pte)))
   goto error;
 
 - if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
 -   walker->level))) {
 + if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) {
   errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
   goto error;
   }
 -- 
 1.8.1.4

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] KVM: nVMX: correctly set tr base on nested vmexit emulation

2013-08-05 Thread Ren, Yongjie
 -Original Message-
 From: Gleb Natapov [mailto:g...@redhat.com]
 Sent: Sunday, August 04, 2013 8:08 PM
 To: kvm@vger.kernel.org
 Cc: Ren, Yongjie; pbonz...@redhat.com; yzt...@gmail.com
 Subject: [PATCH] KVM: nVMX: correctly set tr base on nested vmexit
 emulation
 
 After commit 21feb4eb64e21f8dc91136b91ee886b978ce6421 tr base is
 zeroed
 during vmexit. Set it to L1's HOST_TR_BASE. This should fix
 https://bugzilla.kernel.org/show_bug.cgi?id=60679
 
Yes, your patch can fix this bug I reported.

Tested-by: Yongjie Ren yongjie@intel.com


 Signed-off-by: Gleb Natapov g...@redhat.com
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index b533cf8..7d31e7b 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -8261,7 +8261,7 @@ static void load_vmcs12_host_state(struct
 kvm_vcpu *vcpu,
  seg.base = vmcs12->host_gs_base;
  vmx_set_segment(vcpu, seg, VCPU_SREG_GS);
  seg = (struct kvm_segment) {
 - .base = 0,
 + .base = vmcs12->host_tr_base,
  .limit = 0x67,
  .selector = vmcs12->host_tr_selector,
   .type = 11,
 --
   Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] nVMX: Keep arch.pat in sync on L1-L2 switches

2013-08-05 Thread Jan Kiszka
On 2013-08-05 04:19, Arthur Chunqi Li wrote:
 On Sun, Aug 4, 2013 at 11:17 PM, Jan Kiszka jan.kis...@web.de wrote:
 From: Jan Kiszka jan.kis...@siemens.com

 When asking vmx to load the PAT MSR for us while switching from L1 to L2
 or vice versa, we have to update arch.pat as well as it may later be
 used again to load or read out the MSR content.

 Signed-off-by: Jan Kiszka jan.kis...@siemens.com
 ---

 Arthur, please add your tested-by also officially.
 This patch is generally good except that we need to add
 VM_EXIT_LOAD_IA32_PAT/VM_EXIT_SAVE_IA32_PAT to
 nested_vmx_exit_ctls_high in function nested_vmx_setup_ctls_msrs() to
 enable these two features. Currently only VM_ENTRY_LOAD_IA32_PAT is
 enabled while all three are implemented.

Yes, feel free to follow up with a corresponding patch.

Jan




signature.asc
Description: OpenPGP digital signature


Re: [PATCH 6/7] KVM: nVMX: Implement support for EFER saving on VM-exit

2013-08-05 Thread Arthur Chunqi Li
On Mon, Aug 5, 2013 at 1:49 AM, Jan Kiszka jan.kis...@web.de wrote:
 From: Jan Kiszka jan.kis...@siemens.com

 Implement and advertise VM_EXIT_SAVE_IA32_EFER. L0 traps EFER writes
 unconditionally, so we always find the current L2 value in the
 architectural state.

 Signed-off-by: Jan Kiszka jan.kis...@siemens.com
 ---
  arch/x86/kvm/vmx.c |4 +++-
  1 files changed, 3 insertions(+), 1 deletions(-)

 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 84a05b4..7208d0b 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -2203,7 +2203,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 nested_vmx_exit_ctls_high = 0;
  #endif
 nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 - VM_EXIT_LOAD_IA32_EFER);
 +   VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
Here you may need to access MSR_IA32_VMX_EXIT_CTLS first to check if
host support these two features. The codes for entry controls
following it can achieve goals like this.

Arthur

 /* entry controls */
 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
 @@ -8113,6 +8113,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, 
 struct vmcs12 *vmcs12)
 vmcs12-guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
 	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
 	vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
 +   if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
 +   vmcs12->guest_ia32_efer = vcpu->arch.efer;
 vmcs12-guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
 vmcs12-guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
 vmcs12-guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
 --
 1.7.3.4

 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 6/7] KVM: nVMX: Implement support for EFER saving on VM-exit

2013-08-05 Thread Jan Kiszka
On 2013-08-05 09:21, Arthur Chunqi Li wrote:
 On Mon, Aug 5, 2013 at 1:49 AM, Jan Kiszka jan.kis...@web.de wrote:
 From: Jan Kiszka jan.kis...@siemens.com

 Implement and advertise VM_EXIT_SAVE_IA32_EFER. L0 traps EFER writes
 unconditionally, so we always find the current L2 value in the
 architectural state.

 Signed-off-by: Jan Kiszka jan.kis...@siemens.com
 ---
  arch/x86/kvm/vmx.c |4 +++-
  1 files changed, 3 insertions(+), 1 deletions(-)

 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 84a05b4..7208d0b 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -2203,7 +2203,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 nested_vmx_exit_ctls_high = 0;
  #endif
 nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 - VM_EXIT_LOAD_IA32_EFER);
 +   VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
 Here you may need to access MSR_IA32_VMX_EXIT_CTLS first to check if
 host support these two features. The codes for entry controls
 following it can achieve goals like this.

We are providing this feature independently of the host's support. Every
write to EFER is trapped by L0, so we can emulate saving by reading what
L0 recorded.

Jan




signature.asc
Description: OpenPGP digital signature


[PATCH v7 01/15] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-08-05 Thread Gleb Natapov
From: Nadav Har'El n...@il.ibm.com

Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577
switch the EFER MSR when EPT is used and the host and guest have different
NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2)
and want to be able to run recent KVM as L1, we need to allow L1 to use this
EFER switching feature.

To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available,
and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds
support for the former (the latter is still unsupported).

Nested entry and exit emulation (prepare_vmcs_02 and load_vmcs12_host_state,
respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all
that's left to do in this patch is to properly advertise this feature to L1.

Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using
vmx_set_efer (which itself sets one of several vmcs02 fields), so we always
support this feature, regardless of whether the host supports it.

Reviewed-by: Orit Wasserman owass...@redhat.com
Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/vmx.c |   23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e999dc7..27efa6a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2198,7 +2198,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #else
nested_vmx_exit_ctls_high = 0;
 #endif
-   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+   nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+ VM_EXIT_LOAD_IA32_EFER);
 
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2207,8 +2208,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_entry_ctls_high =
VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
-   nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
-
+   nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
+  VM_ENTRY_LOAD_IA32_EFER);
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
@@ -7529,10 +7530,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12)
	vcpu->arch.cr0_guest_owned_bits = ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-   /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
-   vmcs_write32(VM_EXIT_CONTROLS,
-   vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
-   vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+   /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+* we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+* bits are further modified by vmx_set_efer() below.
+*/
+   vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+   /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+* emulated by vmx_set_efer(), below.
+*/
+   vmcs_write32(VM_ENTRY_CONTROLS,
+   (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+   ~VM_ENTRY_IA32E_MODE) |
(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
 
if (vmcs12-vm_entry_controls  VM_ENTRY_LOAD_IA32_PAT)
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 04/15] nEPT: Move common code to paging_tmpl.h

2013-08-05 Thread Gleb Natapov
From: Nadav Har'El n...@il.ibm.com

For preparation, we just move gpte_access(), prefetch_invalid_gpte(),
is_rsvd_bits_set(), protect_clean_gpte() and is_dirty_gpte() from mmu.c
to paging_tmpl.h.

Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/mmu.c |   55 --
 arch/x86/kvm/paging_tmpl.h |   80 +---
 2 files changed, 68 insertions(+), 67 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3a9493a..4c4274d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -331,11 +331,6 @@ static int is_large_pte(u64 pte)
return pte  PT_PAGE_SIZE_MASK;
 }
 
-static int is_dirty_gpte(unsigned long pte)
-{
-   return pte  PT_DIRTY_MASK;
-}
-
 static int is_rmap_spte(u64 pte)
 {
return is_shadow_present_pte(pte);
@@ -2574,14 +2569,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-   int bit7;
-
-   bit7 = (gpte >> 7) & 1;
-   return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 bool no_dirty_log)
 {
@@ -2594,26 +2581,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu 
*vcpu, gfn_t gfn,
return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- u64 gpte)
-{
-   if (is_rsvd_bits_set(vcpu-arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-   goto no_present;
-
-   if (!is_present_gpte(gpte))
-   goto no_present;
-
-   if (!(gpte  PT_ACCESSED_MASK))
-   goto no_present;
-
-   return false;
-
-no_present:
-   drop_spte(vcpu-kvm, spte);
-   return true;
-}
-
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -3501,18 +3468,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
nonpaging_free(vcpu);
 }
 
-static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
-{
-   unsigned mask;
-
-   BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
-
-   mask = (unsigned)~ACC_WRITE_MASK;
-   /* Allow write access to dirty gptes */
-   mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
PT_WRITABLE_MASK;
-   *access &= mask;
-}
-
 static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
   unsigned access, int *nr_present)
 {
@@ -3530,16 +3485,6 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, 
gfn_t gfn,
return false;
 }
 
-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
-   unsigned access;
-
-   access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-   access &= ~(gpte >> PT64_NX_SHIFT);
-
-   return access;
-}
-
 static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned 
gpte)
 {
unsigned index;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7769699..fb26ca9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -80,6 +80,31 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
return (gpte  PT_LVL_ADDR_MASK(lvl))  PAGE_SHIFT;
 }
 
+static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
+{
+   unsigned mask;
+
+   BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+   mask = (unsigned)~ACC_WRITE_MASK;
+   /* Allow write access to dirty gptes */
+   mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
PT_WRITABLE_MASK;
+   *access &= mask;
+}
+
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+   int bit7;
+
+   bit7 = (gpte >> 7) & 1;
+   return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+   return is_present_gpte(pte);
+}
+
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
   pt_element_t __user *ptep_user, unsigned index,
   pt_element_t orig_pte, pt_element_t new_pte)
@@ -103,6 +128,36 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
return (ret != orig_pte);
 }
 
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+

[PATCH v7 12/15] nEPT: Nested INVEPT

2013-08-05 Thread Gleb Natapov
From: Nadav Har'El n...@il.ibm.com

If we let L1 use EPT, we should probably also support the INVEPT instruction.

In our current nested EPT implementation, when L1 changes its EPT table
for L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in
the course of this modification already calls INVEPT. But if last level
of a shadow page is unsync, not all of L1's changes to EPT12 are intercepted,
which means roots need to be synced when L1 calls INVEPT. Global INVEPT
should not be different since roots are synced by kvm_mmu_load() each
time EPTP02 changes.

Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/include/asm/vmx.h  |2 ++
 arch/x86/include/uapi/asm/vmx.h |1 +
 arch/x86/kvm/mmu.c  |2 ++
 arch/x86/kvm/vmx.c  |   72 +++
 4 files changed, 77 insertions(+)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index f3e01a2..966502d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -387,6 +387,7 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
 #define VMX_EPT_EXTENT_CONTEXT 1
 #define VMX_EPT_EXTENT_GLOBAL  2
+#define VMX_EPT_EXTENT_SHIFT   24
 
 #define VMX_EPT_EXECUTE_ONLY_BIT   (1ull)
 #define VMX_EPT_PAGE_WALK_4_BIT(1ull << 6)
 @@ -394,6 +395,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT   (1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT   (1ull << 17)
+#define VMX_EPT_INVEPT_BIT (1ull << 20)
 #define VMX_EPT_AD_BIT (1ull << 21)
 #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT  (1ull << 26)
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d651082..7a34e8f 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED 45
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
+#define EXIT_REASON_INVEPT  50
 #define EXIT_REASON_PREEMPTION_TIMER52
 #define EXIT_REASON_WBINVD  54
 #define EXIT_REASON_XSETBV  55
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e3bfdde..c0b4e0f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3182,6 +3182,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
mmu_sync_roots(vcpu);
spin_unlock(vcpu-kvm-mmu_lock);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
 
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
  u32 access, struct x86_exception *exception)
@@ -3451,6 +3452,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
++vcpu-stat.tlb_flush;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fbfabbe..51c06c2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -712,6 +712,7 @@ static void nested_release_page_clean(struct page *page)
kvm_release_page_clean(page);
 }
 
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
@@ -2161,6 +2162,7 @@ static u32 nested_vmx_pinbased_ctls_low, 
nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
 static u32 nested_vmx_misc_low, nested_vmx_misc_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
/*
@@ -6279,6 +6281,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+   u32 vmx_instruction_info, types;
+   unsigned long type;
+   gva_t gva;
+   struct x86_exception e;
+   struct {
+   u64 eptp, gpa;
+   } operand;
+   u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
+
+   if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+   !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   if (!nested_vmx_check_permission(vcpu))
+   return 1;
+
+   if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   vmx_instruction_info = 

[PATCH v7 03/15] nEPT: Fix wrong test in kvm_set_cr3

2013-08-05 Thread Gleb Natapov
From: Nadav Har'El n...@il.ibm.com

kvm_set_cr3() attempts to check if the new cr3 is a valid guest physical
address. The problem is that with nested EPT, cr3 is an *L2* physical
address, not an L1 physical address as this test expects.

As the comment above this test explains, it isn't necessary, and doesn't
correspond to anything a real processor would do. So this patch removes it.

Note that this wrong test could have also theoretically caused problems
in nested NPT, not just in nested EPT. However, in practice, the problem
was avoided: nested_svm_vmexit()/vmrun() do not call kvm_set_cr3 in the
nested NPT case, and instead set the vmcb (and arch.cr3) directly, thus
circumventing the problem. Additional potential calls to the buggy function
are avoided in that we don't trap cr3 modifications when nested NPT is
enabled. However, because in nested VMX we did want to use kvm_set_cr3()
(as requested in Avi Kivity's review of the original nested VMX patches),
we can't avoid this problem and need to fix it.

Reviewed-by: Orit Wasserman owass...@redhat.com
Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/x86.c |   11 ---
 1 file changed, 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d2caeb9..e2fef8b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -682,17 +682,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 */
}
 
-   /*
-* Does the new cr3 value map to physical memory? (Note, we
-* catch an invalid cr3 even in real-mode, because it would
-* cause trouble later on when we turn on paging anyway.)
-*
-* A real CPU would silently accept an invalid cr3 and would
-* attempt to use it - with largely undefined (and often hard
-* to debug) behavior on the guest side.
-*/
-   if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-   return 1;
	vcpu->arch.cr3 = cr3;
	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
	vcpu->arch.mmu.new_cr3(vcpu);
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 09/15] nEPT: correctly check if remote tlb flush is needed for shadowed EPT tables

2013-08-05 Thread Gleb Natapov
need_remote_flush() assumes that shadow page is in PT64 format, but
with addition of nested EPT this is no longer always true. Fix it by using
bit definitions that depend on the host shadow page type.

Reported-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/mmu.c |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9e0f467..a512ecf 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -132,8 +132,8 @@ module_param(dbg, bool, 0644);
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
* PT32_LEVEL_BITS))) - 1))
 
-#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
-   | PT64_NX_MASK)
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
+   | shadow_x_mask | shadow_nx_mask)
 
 #define ACC_EXEC_MASK1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
@@ -3879,8 +3879,8 @@ static bool need_remote_flush(u64 old, u64 new)
return true;
if ((old ^ new)  PT64_BASE_ADDR_MASK)
return true;
-   old ^= PT64_NX_MASK;
-   new ^= PT64_NX_MASK;
+   old ^= shadow_nx_mask;
+   new ^= shadow_nx_mask;
return (old  ~new  PT64_PERM_MASK) != 0;
 }
 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 11/15] nEPT: MMU context for nested EPT

2013-08-05 Thread Gleb Natapov
From: Nadav Har'El n...@il.ibm.com

KVM's existing shadow MMU code already supports nested TDP. To use it, we
need to set up a new MMU context for nested EPT, and create a few callbacks
for it (nested_ept_*()). This context should also use the EPT versions of
the page table access functions (defined in the previous patch).
Then, we need to switch back and forth between this nested context and the
regular MMU context when switching between L1 and L2 (when L1 runs this L2
with EPT).

Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/mmu.c |   27 +++
 arch/x86/kvm/mmu.h |2 ++
 arch/x86/kvm/vmx.c |   41 -
 3 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f2d982d..e3bfdde 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3795,6 +3795,33 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct 
kvm_mmu *context)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
+int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+   bool execonly)
+{
+   ASSERT(vcpu);
+   ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+   context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+   context->nx = true;
+   context->new_cr3 = paging_new_cr3;
+   context->page_fault = ept_page_fault;
+   context->gva_to_gpa = ept_gva_to_gpa;
+   context->sync_page = ept_sync_page;
+   context->invlpg = ept_invlpg;
+   context->update_pte = ept_update_pte;
+   context->free = paging_free;
+   context->root_level = context->shadow_root_level;
+   context->root_hpa = INVALID_PAGE;
+   context->direct_map = false;
+
+   update_permission_bitmask(vcpu, context, true);
+   reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
+
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
int r = kvm_init_shadow_mmu(vcpu, vcpu-arch.walk_mmu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 5b59c57..77e044a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -71,6 +71,8 @@ enum {
 
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool 
direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+   bool execonly);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 984f8d7..fbfabbe 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1046,6 +1046,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct 
vmcs12 *vmcs12,
return vmcs12-pin_based_vm_exec_control  PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
return (intr_info  (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -7367,6 +7372,33 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu 
*vcpu,
vmcs12-guest_physical_address = fault-address;
 }
 
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+   /* return the page table to be shadowed - in our case, EPT12 */
+   return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+   int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
+   nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
+
+   vcpu->arch.mmu.set_cr3   = vmx_set_cr3;
+   vcpu->arch.mmu.get_cr3   = nested_ept_get_cr3;
+   vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+
+   vcpu->arch.walk_mmu  = &vcpu->arch.nested_mmu;
+
+   return r;
+}
+
+static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+   vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function merges it
@@ -7587,6 +7619,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct 
vmcs12 *vmcs12)
vmx_flush_tlb(vcpu);
}
 
+   if (nested_cpu_has_ept(vmcs12)) {
+   kvm_mmu_unload(vcpu);
+   nested_ept_init_mmu_context(vcpu);
+   }
+
if (vmcs12-vm_entry_controls  VM_ENTRY_LOAD_IA32_EFER)
vcpu-arch.efer = vmcs12-guest_ia32_efer;
else if (vmcs12-vm_entry_controls  VM_ENTRY_IA32E_MODE)
@@ -8059,7 +8096,9 @@ static void 

[PATCH v7 02/15] nEPT: Fix cr3 handling in nested exit and entry

2013-08-05 Thread Gleb Natapov
From: Nadav Har'El n...@il.ibm.com

The existing code for handling cr3 and related VMCS fields during nested
exit and entry wasn't correct in all cases:

If L2 is allowed to control cr3 (and this is indeed the case in nested EPT),
during nested exit we must copy the modified cr3 from vmcs02 to vmcs12, and
we forgot to do so. This patch adds this copy.

If L0 isn't controlling cr3 when running L2 (i.e., L0 is using EPT), and
whoever does control cr3 (L1 or L2) is using PAE, the processor might have
saved PDPTEs and we should also save them in vmcs12 (and restore later).

Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Reviewed-by: Orit Wasserman owass...@redhat.com
Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/vmx.c |   26 ++
 1 file changed, 26 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 27efa6a..4e98764 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7595,6 +7595,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct 
vmcs12 *vmcs12)
kvm_set_cr3(vcpu, vmcs12-guest_cr3);
kvm_mmu_reset_context(vcpu);
 
+   /*
+* L1 may access the L2's PDPTR, so save them to construct vmcs12
+*/
+   if (enable_ept) {
+   vmcs_write64(GUEST_PDPTR0, vmcs12-guest_pdptr0);
+   vmcs_write64(GUEST_PDPTR1, vmcs12-guest_pdptr1);
+   vmcs_write64(GUEST_PDPTR2, vmcs12-guest_pdptr2);
+   vmcs_write64(GUEST_PDPTR3, vmcs12-guest_pdptr3);
+   }
+
kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12-guest_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12-guest_rip);
 }
@@ -7917,6 +7927,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct 
vmcs12 *vmcs12)
vmcs12-guest_pending_dbg_exceptions =
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+   /*
+* In some cases (usually, nested EPT), L2 is allowed to change its
+* own CR3 without exiting. If it has changed it, we must keep it.
+* Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+* by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+*
+* Additionally, restore L2's PDPTR to vmcs12.
+*/
+   if (enable_ept) {
+   vmcs12-guest_cr3 = vmcs_read64(GUEST_CR3);
+   vmcs12-guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+   vmcs12-guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+   vmcs12-guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+   vmcs12-guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+   }
+
vmcs12-vm_entry_controls =
(vmcs12-vm_entry_controls  ~VM_ENTRY_IA32E_MODE) |
(vmcs_read32(VM_ENTRY_CONTROLS)  VM_ENTRY_IA32E_MODE);
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 10/15] nEPT: Add nEPT violation/misconfigration support

2013-08-05 Thread Gleb Natapov
From: Yang Zhang yang.z.zh...@intel.com

Inject nEPT fault to L1 guest. This patch is originally from Xinhao.

Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/include/asm/kvm_host.h |4 +++
 arch/x86/kvm/mmu.c  |   61 ---
 arch/x86/kvm/paging_tmpl.h  |   25 ++--
 arch/x86/kvm/vmx.c  |   19 
 4 files changed, 95 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 531f47c..58a17c0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,6 +286,7 @@ struct kvm_mmu {
u64 *pae_root;
u64 *lm_root;
u64 rsvd_bits_mask[2][4];
+   u64 bad_mt_xwr;
 
/*
 * Bitmap: bit set = last pte in walk
@@ -512,6 +513,9 @@ struct kvm_vcpu_arch {
 * instruction.
 */
bool write_fault_to_shadow_pgtable;
+
+   /* set at EPT violation at this point */
+   unsigned long exit_qualification;
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a512ecf..f2d982d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3519,6 +3519,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
 
+   context-bad_mt_xwr = 0;
+
if (!context-nx)
exb_bit_rsvd = rsvd_bits(63, 63);
switch (context-root_level) {
@@ -3574,7 +3576,40 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
}
 }
 
-static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu 
*mmu)
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+   struct kvm_mmu *context, bool execonly)
+{
+   int maxphyaddr = cpuid_maxphyaddr(vcpu);
+   int pte;
+
+   context-rsvd_bits_mask[0][3] =
+   rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
+   context-rsvd_bits_mask[0][2] =
+   rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+   context-rsvd_bits_mask[0][1] =
+   rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+   context-rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+
+   /* large page */
+   context-rsvd_bits_mask[1][3] = context-rsvd_bits_mask[0][3];
+   context-rsvd_bits_mask[1][2] =
+   rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
+   context-rsvd_bits_mask[1][1] =
+   rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+   context-rsvd_bits_mask[1][0] = context-rsvd_bits_mask[0][0];
+   
+   for (pte = 0; pte  64; pte++) {
+   int rwx_bits = pte  7;
+   int mt = pte  3;
+   if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
+   rwx_bits == 0x2 || rwx_bits == 0x6 ||
+   (rwx_bits == 0x4  !execonly))
+   context-bad_mt_xwr |= (1ull  pte);
+   }
+}
+
+static void update_permission_bitmask(struct kvm_vcpu *vcpu,
+   struct kvm_mmu *mmu, bool ept)
 {
unsigned bit, byte, pfec;
u8 map;
@@ -3592,12 +3627,16 @@ static void update_permission_bitmask(struct kvm_vcpu 
*vcpu, struct kvm_mmu *mmu
w = bit  ACC_WRITE_MASK;
u = bit  ACC_USER_MASK;
 
-   /* Not really needed: !nx will cause pte.nx to fault */
-   x |= !mmu-nx;
-   /* Allow supervisor writes if !cr0.wp */
-   w |= !is_write_protection(vcpu)  !uf;
-   /* Disallow supervisor fetches of user code if cr4.smep 
*/
-   x = !(smep  u  !uf);
+   if (!ept) {
+   /* Not really needed: !nx will cause pte.nx to 
fault */
+   x |= !mmu-nx;
+   /* Allow supervisor writes if !cr0.wp */
+   w |= !is_write_protection(vcpu)  !uf;
+   /* Disallow supervisor fetches of user code if 
cr4.smep */
+   x = !(smep  u  !uf);
+   } else
+   /* Not really needed: no U/S accesses on ept  */
+   u = 1;
 
fault = (ff  !x) || (uf  !u) || (wf  !w);
map |= fault  bit;
@@ -3632,7 +3671,7 @@ static int paging64_init_context_common(struct kvm_vcpu 
*vcpu,
context-root_level = level;
 
reset_rsvds_bits_mask(vcpu, context);
-   update_permission_bitmask(vcpu, context);
+   update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
 
 

[PATCH v7 15/15] nEPT: Miscelleneous cleanups

2013-08-05 Thread Gleb Natapov
From: Nadav Har'El n...@il.ibm.com

Some trivial code cleanups not really related to nested EPT.

Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
Reviewed-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/vmx.c |6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed224bd..01f2091 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -716,7 +716,6 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu 
*vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -1041,8 +1040,7 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, 
u32 bit)
(vmcs12-secondary_vm_exec_control  bit);
 }
 
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-   struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 {
return vmcs12-pin_based_vm_exec_control  PIN_BASED_VIRTUAL_NMIS;
 }
@@ -6770,7 +6768,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
if (unlikely(!cpu_has_virtual_nmis()  vmx-soft_vnmi_blocked 
!(is_guest_mode(vcpu)  nested_cpu_has_virtual_nmis(
-   get_vmcs12(vcpu), vcpu {
+   get_vmcs12(vcpu) {
if (vmx_interrupt_allowed(vcpu)) {
vmx-soft_vnmi_blocked = 0;
} else if (vmx-vnmi_blocked_time  10LL 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 06/15] nEPT: Support shadow paging for guest paging without A/D bits

2013-08-05 Thread Gleb Natapov
Some guest paging modes do not support A/D bits. Add support for such
modes in shadow page code. For such modes PT_GUEST_DIRTY_MASK,
PT_GUEST_ACCESSED_MASK, PT_GUEST_DIRTY_SHIFT and PT_GUEST_ACCESSED_SHIFT
should be set to zero.

Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/paging_tmpl.h |   16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7581395..2c2f635 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -92,6 +92,10 @@ static inline void FNAME(protect_clean_gpte)(unsigned 
*access, unsigned gpte)
 {
unsigned mask;
 
+   /* dirty bit is not supported, so no need to track it */
+   if (!PT_GUEST_DIRTY_MASK)
+   return;
+
BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
 
mask = (unsigned)~ACC_WRITE_MASK;
@@ -147,7 +151,8 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu 
*vcpu,
if (!FNAME(is_present_gpte)(gpte))
goto no_present;
 
-   if (!(gpte  PT_GUEST_ACCESSED_MASK))
+   /* if accessed bit is not supported prefetch non accessed gpte */
+   if (PT_GUEST_ACCESSED_MASK  !(gpte  PT_GUEST_ACCESSED_MASK))
goto no_present;
 
return false;
@@ -178,6 +183,10 @@ static int FNAME(update_accessed_dirty_bits)(struct 
kvm_vcpu *vcpu,
gfn_t table_gfn;
int ret;
 
+   /* dirty/accessed bits are not supported, so no need to update them */
+   if (!PT_GUEST_DIRTY_MASK)
+   return 0;
+
for (level = walker-max_level; level = walker-level; --level) {
pte = orig_pte = walker-ptes[level - 1];
table_gfn = walker-table_gfn[level - 1];
@@ -316,8 +325,9 @@ retry_walk:
FNAME(protect_clean_gpte)(pte_access, pte);
else
/*
-* On a write fault, fold the dirty bit into accessed_dirty by
-* shifting it one place right.
+* On a write fault, fold the dirty bit into accessed_dirty.
+* For modes without A/D bits support accessed_dirty will be
+* always clear.
 */
accessed_dirty = pte 
(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 13/15] nEPT: Advertise EPT to L1

2013-08-05 Thread Gleb Natapov
From: Nadav Har'El n...@il.ibm.com

Advertise the support of EPT to the L1 guest, through the appropriate MSR.

This is the last patch of the basic Nested EPT feature, so as to allow
bisection through this patch series: The guest will not see EPT support until
this last patch, and will not attempt to use the half-applied feature.

Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/vmx.c |   20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 51c06c2..c65a08a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2250,6 +2250,22 @@ static __init void nested_vmx_setup_ctls_msrs(void)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_WBINVD_EXITING;
 
+   if (enable_ept) {
+   /* nested EPT: emulate EPT also to L1 */
+   nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+   nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT;
+   nested_vmx_ept_caps |= VMX_EPT_INVEPT_BIT;
+   nested_vmx_ept_caps = vmx_capability.ept;
+   /*
+* Since invept is completely emulated we support both global
+* and context invalidation independent of what host cpu
+* supports
+*/
+   nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
+   VMX_EPT_EXTENT_CONTEXT_BIT;
+   } else
+   nested_vmx_ept_caps = 0;
+
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
nested_vmx_misc_low = VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
@@ -2358,8 +2374,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 
msr_index, u64 *pdata)
nested_vmx_secondary_ctls_high);
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
-   /* Currently, no nested ept or nested vpid */
-   *pdata = 0;
+   /* Currently, no nested vpid support */
+   *pdata = nested_vmx_ept_caps;
break;
default:
return 0;
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 00/15] Nested EPT

2013-08-05 Thread Gleb Natapov
Xiao's comment about checking the EPT pointer before flushing an individual EPT context
is addressed here.

Gleb Natapov (3):
  nEPT: make guest's A/D bits depends on guest's paging mode
  nEPT: Support shadow paging for guest paging without A/D bits
  nEPT: correctly check if remote tlb flush is needed for shadowed EPT
tables

Nadav Har'El (10):
  nEPT: Support LOAD_IA32_EFER entry/exit controls for L1
  nEPT: Fix cr3 handling in nested exit and entry
  nEPT: Fix wrong test in kvm_set_cr3
  nEPT: Move common code to paging_tmpl.h
  nEPT: Add EPT tables support to paging_tmpl.h
  nEPT: MMU context for nested EPT
  nEPT: Nested INVEPT
  nEPT: Advertise EPT to L1
  nEPT: Some additional comments
  nEPT: Miscelleneous cleanups

Yang Zhang (2):
  nEPT: Redefine EPT-specific link_shadow_page()
  nEPT: Add nEPT violation/misconfigration support

 arch/x86/include/asm/kvm_host.h |4 +
 arch/x86/include/asm/vmx.h  |2 +
 arch/x86/include/uapi/asm/vmx.h |1 +
 arch/x86/kvm/mmu.c  |  170 +-
 arch/x86/kvm/mmu.h  |2 +
 arch/x86/kvm/paging_tmpl.h  |  176 +++
 arch/x86/kvm/vmx.c  |  220 ---
 arch/x86/kvm/x86.c  |   11 --
 8 files changed, 467 insertions(+), 119 deletions(-)

-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 05/15] nEPT: make guest's A/D bits depends on guest's paging mode

2013-08-05 Thread Gleb Natapov
This patch makes the guest A/D bits definition depend on the paging
mode, so that when EPT support is added it will be able to define them
differently.

Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/paging_tmpl.h |   30 ++
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index fb26ca9..7581395 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -32,6 +32,10 @@
#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
+   #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+   #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+   #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+   #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS 4
#define CMPXCHG cmpxchg
@@ -49,6 +53,10 @@
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
+   #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
+   #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
+   #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+   #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define CMPXCHG cmpxchg
 #else
#error Invalid PTTYPE value
@@ -88,7 +96,8 @@ static inline void FNAME(protect_clean_gpte)(unsigned 
*access, unsigned gpte)
 
mask = (unsigned)~ACC_WRITE_MASK;
/* Allow write access to dirty gptes */
-   mask |= (gpte  (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT))  
PT_WRITABLE_MASK;
+   mask |= (gpte  (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) 
+   PT_WRITABLE_MASK;
*access = mask;
 }
 
@@ -138,7 +147,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu 
*vcpu,
if (!FNAME(is_present_gpte)(gpte))
goto no_present;
 
-   if (!(gpte  PT_ACCESSED_MASK))
+   if (!(gpte  PT_GUEST_ACCESSED_MASK))
goto no_present;
 
return false;
@@ -174,14 +183,14 @@ static int FNAME(update_accessed_dirty_bits)(struct 
kvm_vcpu *vcpu,
table_gfn = walker-table_gfn[level - 1];
ptep_user = walker-ptep_user[level - 1];
index = offset_in_page(ptep_user) / sizeof(pt_element_t);
-   if (!(pte  PT_ACCESSED_MASK)) {
+   if (!(pte  PT_GUEST_ACCESSED_MASK)) {
trace_kvm_mmu_set_accessed_bit(table_gfn, index, 
sizeof(pte));
-   pte |= PT_ACCESSED_MASK;
+   pte |= PT_GUEST_ACCESSED_MASK;
}
if (level == walker-level  write_fault 
-   !(pte  PT_DIRTY_MASK)) {
+   !(pte  PT_GUEST_DIRTY_MASK)) {
trace_kvm_mmu_set_dirty_bit(table_gfn, index, 
sizeof(pte));
-   pte |= PT_DIRTY_MASK;
+   pte |= PT_GUEST_DIRTY_MASK;
}
if (pte == orig_pte)
continue;
@@ -235,7 +244,7 @@ retry_walk:
ASSERT((!is_long_mode(vcpu)  is_pae(vcpu)) ||
   (mmu-get_cr3(vcpu)  CR3_NONPAE_RESERVED_BITS) == 0);
 
-   accessed_dirty = PT_ACCESSED_MASK;
+   accessed_dirty = PT_GUEST_ACCESSED_MASK;
pt_access = pte_access = ACC_ALL;
++walker-level;
 
@@ -310,7 +319,8 @@ retry_walk:
 * On a write fault, fold the dirty bit into accessed_dirty by
 * shifting it one place right.
 */
-   accessed_dirty = pte  (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT);
+   accessed_dirty = pte 
+   (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
 
if (unlikely(!accessed_dirty)) {
ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, 
write_fault);
@@ -886,3 +896,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp)
 #undef gpte_to_gfn
 #undef gpte_to_gfn_lvl
 #undef CMPXCHG
+#undef PT_GUEST_ACCESSED_MASK
+#undef PT_GUEST_DIRTY_MASK
+#undef PT_GUEST_DIRTY_SHIFT
+#undef PT_GUEST_ACCESSED_SHIFT
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 08/15] nEPT: Redefine EPT-specific link_shadow_page()

2013-08-05 Thread Gleb Natapov
From: Yang Zhang yang.z.zh...@intel.com

Since nEPT doesn't support A/D bits, we should not set those bits
when building the shadow page table.

Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/mmu.c |   12 +---
 arch/x86/kvm/paging_tmpl.h |4 ++--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b5273c3..9e0f467 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2047,12 +2047,18 @@ static void shadow_walk_next(struct 
kvm_shadow_walk_iterator *iterator)
return __shadow_walk_next(iterator, *iterator-sptep);
 }
 
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool 
accessed)
 {
u64 spte;
 
+   BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
+   VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+
spte = __pa(sp-spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
-  shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+  shadow_user_mask | shadow_x_mask;
+
+   if (accessed)
+   spte |= shadow_accessed_mask;
 
mmu_spte_set(sptep, spte);
 }
@@ -2677,7 +2683,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, 
int write,
  iterator.level - 1,
  1, ACC_ALL, iterator.sptep);
 
-   link_shadow_page(iterator.sptep, sp);
+   link_shadow_page(iterator.sptep, sp, true);
}
}
return emulate;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 762c904..f8e5680 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -555,7 +555,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
goto out_gpte_changed;
 
if (sp)
-   link_shadow_page(it.sptep, sp);
+   link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
}
 
for (;
@@ -575,7 +575,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
  true, direct_access, it.sptep);
-   link_shadow_page(it.sptep, sp);
+   link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
}
 
clear_sp_write_flooding_count(it.sptep);
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 07/15] nEPT: Add EPT tables support to paging_tmpl.h

2013-08-05 Thread Gleb Natapov
From: Nadav Har'El n...@il.ibm.com

This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significantly improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with EPT) which correctly read and write EPT tables.

Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/mmu.c |5 +
 arch/x86/kvm/paging_tmpl.h |   37 -
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4c4274d..b5273c3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3494,6 +3494,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
unsigned level, unsigned gp
return mmu-last_pte_bitmap  (1  index);
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include paging_tmpl.h
+#undef PTTYPE
+
 #define PTTYPE 64
 #include paging_tmpl.h
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2c2f635..762c904 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -23,6 +23,13 @@
  * so the code in this file is compiled twice, once per pte size.
  */
 
+/*
+ * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
+ * uses for EPT without A/D paging type.
+ */
+extern u64 __pure __using_nonexistent_pte_bit(void)
+  __compiletime_error(wrong use of 
PT_GUEST_(DIRTY|ACCESS)_SHIFT);
+
 #if PTTYPE == 64
#define pt_element_t u64
#define guest_walker guest_walker64
@@ -58,6 +65,21 @@
#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+   #define pt_element_t u64
+   #define guest_walker guest_walkerEPT
+   #define FNAME(name) ept_##name
+   #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+   #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+   #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+   #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+   #define PT_LEVEL_BITS PT64_LEVEL_BITS
+   #define PT_GUEST_ACCESSED_MASK 0
+   #define PT_GUEST_DIRTY_MASK 0
+   #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
+   #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
+   #define CMPXCHG cmpxchg64
+   #define PT_MAX_FULL_LEVELS 4
 #else
#error Invalid PTTYPE value
 #endif
@@ -115,7 +137,11 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, 
u64 gpte, int level)
 
 static inline int FNAME(is_present_gpte)(unsigned long pte)
 {
+#if PTTYPE != PTTYPE_EPT
return is_present_gpte(pte);
+#else
+   return pte  7;
+#endif
 }
 
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
@@ -165,9 +191,14 @@ no_present:
 static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 {
unsigned access;
-
+#if PTTYPE == PTTYPE_EPT
+   access = ((gpte  VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
+   ((gpte  VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+   ACC_USER_MASK;
+#else
access = (gpte  (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
access = ~(gpte  PT64_NX_SHIFT);
+#endif
 
return access;
 }
@@ -369,6 +400,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
access);
 }
 
+#if PTTYPE != PTTYPE_EPT
 static int FNAME(walk_addr_nested)(struct guest_walker *walker,
   struct kvm_vcpu *vcpu, gva_t addr,
   u32 access)
@@ -376,6 +408,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker 
*walker,
return 

[PATCH v7 14/15] nEPT: Some additional comments

2013-08-05 Thread Gleb Natapov
From: Nadav Har'El n...@il.ibm.com

Some additional comments to preexisting code:
Explain who (L0 or L1) handles EPT violation and misconfiguration exits.
Don't mention shadow on either EPT or shadow as the only two options.

Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
Signed-off-by: Yang Zhang yang.z.zh...@intel.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/vmx.c |   13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c65a08a..ed224bd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6669,7 +6669,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu 
*vcpu)
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
case EXIT_REASON_EPT_VIOLATION:
+   /*
+* L0 always deals with the EPT violation. If nested EPT is
+* used, and the nested mmu code discovers that the address is
+* missing in the guest EPT table (EPT12), the EPT violation
+* will be injected with nested_ept_inject_page_fault()
+*/
+   return 0;
case EXIT_REASON_EPT_MISCONFIG:
+   /*
+* L2 never uses directly L1's EPT, but rather L0's own EPT
+* table (shadow on EPT) or a merged EPT table that L0 built
+* (EPT on EPT). So any problems with the structure of the
+* table is L0's fault.
+*/
return 0;
case EXIT_REASON_PREEMPTION_TIMER:
return vmcs12-pin_based_vm_exec_control 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm-unit-tests: VMX: Fix confused definition of rflags

2013-08-05 Thread Gleb Natapov
On Tue, Jul 30, 2013 at 11:41:00PM +0800, Arthur Chunqi Li wrote:
 Change rflags in struct regs to host_rflags. Remove settings
 to GUEST_RFLAGS since GUEST_RFLAGS can be set by vmwrite. Treat
 host_rflags as host rflags before and after vmenter.
 
I am not sure the change is for the better. Before the change one could
set up rflags for guest environment by setting regs.rflags, no special
init function had to be written. I do not see any problem with correct
code, except that rflags is not correct on a guest entry, but this
should be easy to fix. 

 Besides, add checks to flags after vmenter.
 
 Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
 ---
  x86/vmx.c |   11 ++-
  x86/vmx.h |2 +-
  2 files changed, 7 insertions(+), 6 deletions(-)
 
 diff --git a/x86/vmx.c b/x86/vmx.c
 index 7467927..082c3bb 100644
 --- a/x86/vmx.c
 +++ b/x86/vmx.c
 @@ -481,6 +481,8 @@ static int vmx_run()
   vmresume\n\t
   2: 
   setbe %0\n\t
 + jbe vmx_return\n\t
 + ud2\n\t
   vmx_return:\n\t
   SAVE_GPR_C
   SAVE_RFLAGS
 @@ -505,15 +507,15 @@ static int vmx_run()
   return 0;
   case VMX_TEST_LAUNCH_ERR:
   printf(%s : vmlaunch failed.\n, __func__);
 - if ((!(regs.rflags  X86_EFLAGS_CF)  !(regs.rflags  
 X86_EFLAGS_ZF))
 - || ((regs.rflags  X86_EFLAGS_CF)  (regs.rflags  
 X86_EFLAGS_ZF)))
 + if ((!(regs.host_rflags  X86_EFLAGS_CF)  !(regs.host_rflags 
  X86_EFLAGS_ZF))
 + || ((regs.host_rflags  X86_EFLAGS_CF)  
 (regs.host_rflags  X86_EFLAGS_ZF)))
   printf(\tvmlaunch set wrong flags\n);
   report(test vmlaunch, 0);
   break;
   case VMX_TEST_RESUME_ERR:
   printf(%s : vmresume failed.\n, __func__);
 - if ((!(regs.rflags  X86_EFLAGS_CF)  !(regs.rflags  
 X86_EFLAGS_ZF))
 - || ((regs.rflags  X86_EFLAGS_CF)  (regs.rflags  
 X86_EFLAGS_ZF)))
 + if ((!(regs.host_rflags  X86_EFLAGS_CF)  !(regs.host_rflags 
  X86_EFLAGS_ZF))
 + || ((regs.host_rflags  X86_EFLAGS_CF)  
 (regs.host_rflags  X86_EFLAGS_ZF)))
   printf(\tvmresume set wrong flags\n);
   report(test vmresume, 0);
   break;
 @@ -540,7 +542,6 @@ static int test_run(struct vmx_test *test)
   test-exits = 0;
   current = test;
   regs = test-guest_regs;
 - vmcs_write(GUEST_RFLAGS, regs.rflags | 0x2);
   launched = 0;
   printf(\nTest suite : %s\n, test-name);
   vmx_run();
 diff --git a/x86/vmx.h b/x86/vmx.h
 index 1fb9738..d80e000 100644
 --- a/x86/vmx.h
 +++ b/x86/vmx.h
 @@ -27,7 +27,7 @@ struct regs {
   u64 r13;
   u64 r14;
   u64 r15;
 - u64 rflags;
 + u64 host_rflags;
  };
  
  struct vmx_test {
 -- 
 1.7.9.5

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [Qemu-devel] vm performance degradation after kvm live migration or save-restore with EPT enabled

2013-08-05 Thread Zhanghaoyu (A)
   hi all,
   
   I met similar problem to these, while performing live migration or 
   save-restore test on the kvm platform (qemu:1.4.0, host:suse11sp2, 
   guest:suse11sp2), running tele-communication software suite in 
   guest, 
   https://lists.gnu.org/archive/html/qemu-devel/2013-05/msg00098.html
   http://comments.gmane.org/gmane.comp.emulators.kvm.devel/102506
   http://thread.gmane.org/gmane.comp.emulators.kvm.devel/100592
   https://bugzilla.kernel.org/show_bug.cgi?id=58771
   
   After live migration or virsh restore [savefile], one process's CPU 
   utilization went up by about 30%, resulted in throughput 
   degradation of this process.
   
   If EPT disabled, this problem gone.
   
   I suspect that kvm hypervisor has business with this problem.
   Based on above suspect, I want to find the two adjacent versions of 
   kvm-kmod which triggers this problem or not (e.g. 2.6.39, 3.0-rc1), 
   and analyze the differences between this two versions, or apply the 
   patches between this two versions by bisection method, finally find 
   the key patches.
   
   Any better ideas?
   
   Thanks,
   Zhang Haoyu
  
  I've attempted to duplicate this on a number of machines that are as 
  similar to yours as I am able to get my hands on, and so far have not 
  been able to see any performance degradation. And from what I've read in 
  the above links, huge pages do not seem to be part of the problem.
  
  So, if you are in a position to bisect the kernel changes, that would 
  probably be the best avenue to pursue in my opinion.
  
  Bruce
  
  I found the first bad 
  commit([612819c3c6e67bac8fceaa7cc402f13b1b63f7e4] KVM: propagate fault 
  r/w information to gup(), allow read-only memory) which triggers this 
  problem by git bisecting the kvm kernel (download from 
  https://git.kernel.org/pub/scm/virt/kvm/kvm.git) changes.
  
  And,
  git log 612819c3c6e67bac8fceaa7cc402f13b1b63f7e4 -n 1 -p  
  612819c3c6e67bac8fceaa7cc402f13b1b63f7e4.log
  git diff 
  612819c3c6e67bac8fceaa7cc402f13b1b63f7e4~1..612819c3c6e67bac8fceaa7cc4
  02f13b1b63f7e4  612819c3c6e67bac8fceaa7cc402f13b1b63f7e4.diff
  
  Then, I diffed 612819c3c6e67bac8fceaa7cc402f13b1b63f7e4.log and 
  612819c3c6e67bac8fceaa7cc402f13b1b63f7e4.diff,
  came to a conclusion that all of the differences between 
  612819c3c6e67bac8fceaa7cc402f13b1b63f7e4~1 and 
  612819c3c6e67bac8fceaa7cc402f13b1b63f7e4
  are contributed by no other than 
  612819c3c6e67bac8fceaa7cc402f13b1b63f7e4, so this commit is the 
  peace-breaker which directly or indirectly causes the degradation.
  
  Does the map_writable flag passed to mmu_set_spte() function have effect 
  on PTE's PAT flag or increase the VMEXITs induced by that guest tried to 
  write read-only memory?
  
  Thanks,
  Zhang Haoyu
  
 
 There should be no read-only memory maps backing guest RAM.
 
 Can you confirm map_writable = false is being passed to __direct_map? (this 
 should not happen, for guest RAM).
 And if it is false, please capture the associated GFN.
 
 I added below check and printk at the start of __direct_map() at the fist 
 bad commit version,
 --- kvm-612819c3c6e67bac8fceaa7cc402f13b1b63f7e4/arch/x86/kvm/mmu.c 
 2013-07-26 18:44:05.0 +0800
 +++ kvm-612819/arch/x86/kvm/mmu.c   2013-07-31 00:05:48.0 +0800
 @@ -2223,6 +2223,9 @@ static int __direct_map(struct kvm_vcpu
 int pt_write = 0;
 gfn_t pseudo_gfn;
 
 +if (!map_writable)
 +printk(KERN_ERR %s: %s: gfn = %llu \n, __FILE__, 
 __func__, gfn);
 +
 for_each_shadow_entry(vcpu, (u64)gfn  PAGE_SHIFT, iterator) {
 if (iterator.level == level) {
 unsigned pte_access = ACC_ALL;
 
 I virsh-save the VM, and then virsh-restore it, so many GFNs were printed, 
 you can absolutely describe it as flooding.
 
The flooding you see happens during migrate to file stage because of dirty
page tracking. If you clear dmesg after virsh-save you should not see any
flooding after virsh-restore. I just checked with latest tree, I do not.

I made a verification again.
I virsh-save the VM, during the saving stage, I run 'dmesg', no GFN printed, 
maybe the switching from running stage to pause stage takes so short time, 
no guest-write happens during this switching period.
After the completion of saving operation, I run 'demsg -c' to clear the buffer 
all the same, then I virsh-restore the VM, so many GFNs are printed by running 
'dmesg',
and I also run 'tail -f /var/log/messages' during the restoring stage, so many 
GFNs are flooded dynamically too.
I'm sure that the flooding happens during the virsh-restore stage, not the 
migration stage.

On VM's normal starting stage, only very few GFNs are printed, shown as below
gfn = 16
gfn = 604
gfn = 605
gfn = 606
gfn = 607
gfn = 608
gfn = 609

but on the VM's restoring stage, so many GFNs are printed, taking some examples 
shown as below,
2042600
279
2797778
2797779
2797780
2797781
2797782
2797783
2797784

Re: [PATCH] nVMX: Keep arch.pat in sync on L1-L2 switches

2013-08-05 Thread Gleb Natapov
On Sun, Aug 04, 2013 at 05:17:27PM +0200, Jan Kiszka wrote:
 From: Jan Kiszka jan.kis...@siemens.com
 
 When asking vmx to load the PAT MSR for us while switching from L1 to L2
 or vice versa, we have to update arch.pat as well as it may later be
 used again to load or read out the MSR content.
 
 Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Reviewed-by: Gleb Natapov g...@redhat.com

 ---
 
 Arthur, please add your tested-by also officially.
 
  arch/x86/kvm/vmx.c |9 ++---
  1 files changed, 6 insertions(+), 3 deletions(-)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 45fd70c..396572d 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -7535,9 +7535,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
 struct vmcs12 *vmcs12)
   vmcs_write32(VM_ENTRY_CONTROLS, vmcs12-vm_entry_controls |
   (vmcs_config.vmentry_ctrl  ~VM_ENTRY_IA32E_MODE));
  
 - if (vmcs12-vm_entry_controls  VM_ENTRY_LOAD_IA32_PAT)
 + if (vmcs12-vm_entry_controls  VM_ENTRY_LOAD_IA32_PAT) {
   vmcs_write64(GUEST_IA32_PAT, vmcs12-guest_ia32_pat);
 - else if (vmcs_config.vmentry_ctrl  VM_ENTRY_LOAD_IA32_PAT)
 + vcpu-arch.pat = vmcs12-guest_ia32_pat;
 + } else if (vmcs_config.vmentry_ctrl  VM_ENTRY_LOAD_IA32_PAT)
   vmcs_write64(GUEST_IA32_PAT, vmx-vcpu.arch.pat);
  
  
 @@ -8025,8 +8026,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu 
 *vcpu,
   vmcs_writel(GUEST_IDTR_BASE, vmcs12-host_idtr_base);
   vmcs_writel(GUEST_GDTR_BASE, vmcs12-host_gdtr_base);
  
 - if (vmcs12-vm_exit_controls  VM_EXIT_LOAD_IA32_PAT)
 + if (vmcs12-vm_exit_controls  VM_EXIT_LOAD_IA32_PAT) {
   vmcs_write64(GUEST_IA32_PAT, vmcs12-host_ia32_pat);
 + vcpu-arch.pat = vmcs12-host_ia32_pat;
 + }
   if (vmcs12-vm_exit_controls  VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
   vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
   vmcs12-host_ia32_perf_global_ctrl);
 -- 
 1.7.3.4

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] vm performance degradation after kvm live migration or save-restore with EPT enabled

2013-08-05 Thread Gleb Natapov
On Mon, Aug 05, 2013 at 08:35:09AM +, Zhanghaoyu (A) wrote:
hi all,

I met similar problem to these, while performing live migration or 
save-restore test on the kvm platform (qemu:1.4.0, host:suse11sp2, 
guest:suse11sp2), running tele-communication software suite in 
guest, 
https://lists.gnu.org/archive/html/qemu-devel/2013-05/msg00098.html
http://comments.gmane.org/gmane.comp.emulators.kvm.devel/102506
http://thread.gmane.org/gmane.comp.emulators.kvm.devel/100592
https://bugzilla.kernel.org/show_bug.cgi?id=58771

After live migration or virsh restore [savefile], one process's CPU 
utilization went up by about 30%, resulted in throughput 
degradation of this process.

If EPT disabled, this problem gone.

I suspect that kvm hypervisor has business with this problem.
Based on above suspect, I want to find the two adjacent versions of 
kvm-kmod which triggers this problem or not (e.g. 2.6.39, 3.0-rc1), 
and analyze the differences between this two versions, or apply the 
patches between this two versions by bisection method, finally find 
the key patches.

Any better ideas?

Thanks,
Zhang Haoyu
   
   I've attempted to duplicate this on a number of machines that are as 
   similar to yours as I am able to get my hands on, and so far have not 
   been able to see any performance degradation. And from what I've read 
   in the above links, huge pages do not seem to be part of the problem.
   
   So, if you are in a position to bisect the kernel changes, that would 
   probably be the best avenue to pursue in my opinion.
   
   Bruce
   
   I found the first bad 
   commit([612819c3c6e67bac8fceaa7cc402f13b1b63f7e4] KVM: propagate fault 
   r/w information to gup(), allow read-only memory) which triggers this 
   problem by git bisecting the kvm kernel (download from 
   https://git.kernel.org/pub/scm/virt/kvm/kvm.git) changes.
   
   And,
   git log 612819c3c6e67bac8fceaa7cc402f13b1b63f7e4 -n 1 -p  
   612819c3c6e67bac8fceaa7cc402f13b1b63f7e4.log
   git diff 
   612819c3c6e67bac8fceaa7cc402f13b1b63f7e4~1..612819c3c6e67bac8fceaa7cc4
   02f13b1b63f7e4  612819c3c6e67bac8fceaa7cc402f13b1b63f7e4.diff
   
   Then, I diffed 612819c3c6e67bac8fceaa7cc402f13b1b63f7e4.log and 
   612819c3c6e67bac8fceaa7cc402f13b1b63f7e4.diff,
   came to a conclusion that all of the differences between 
   612819c3c6e67bac8fceaa7cc402f13b1b63f7e4~1 and 
   612819c3c6e67bac8fceaa7cc402f13b1b63f7e4
   are contributed by no other than 
   612819c3c6e67bac8fceaa7cc402f13b1b63f7e4, so this commit is the 
   peace-breaker which directly or indirectly causes the degradation.
   
   Does the map_writable flag passed to mmu_set_spte() function have 
   effect on PTE's PAT flag or increase the VMEXITs induced by that guest 
   tried to write read-only memory?
   
   Thanks,
   Zhang Haoyu
   
  
  There should be no read-only memory maps backing guest RAM.
  
  Can you confirm map_writable = false is being passed to __direct_map? 
  (this should not happen, for guest RAM).
  And if it is false, please capture the associated GFN.
  
  I added below check and printk at the start of __direct_map() at the first 
  bad commit version,
  --- kvm-612819c3c6e67bac8fceaa7cc402f13b1b63f7e4/arch/x86/kvm/mmu.c 
  2013-07-26 18:44:05.0 +0800
  +++ kvm-612819/arch/x86/kvm/mmu.c   2013-07-31 00:05:48.0 +0800
  @@ -2223,6 +2223,9 @@ static int __direct_map(struct kvm_vcpu
  int pt_write = 0;
  gfn_t pseudo_gfn;
  
  +if (!map_writable)
  +printk(KERN_ERR %s: %s: gfn = %llu \n, __FILE__, 
  __func__, gfn);
  +
  for_each_shadow_entry(vcpu, (u64)gfn  PAGE_SHIFT, iterator) {
  if (iterator.level == level) {
  unsigned pte_access = ACC_ALL;
  
  I virsh-save the VM, and then virsh-restore it, so many GFNs were printed, 
  you can absolutely describe it as flooding.
  
 The flooding you see happens during migrate to file stage because of dirty
 page tracking. If you clear dmesg after virsh-save you should not see any
 flooding after virsh-restore. I just checked with latest tree, I do not.
 
 I made a verification again.
 I virsh-save the VM, during the saving stage, I run 'dmesg', no GFN printed, 
 maybe the switching from running stage to pause stage takes so short time, 
 no guest-write happens during this switching period.
 After the completion of the saving operation, I run 'dmesg -c' to clear the 
 buffer all the same, then I virsh-restore the VM, so many GFNs are printed by 
 running 'dmesg',
 and I also run 'tail -f /var/log/messages' during the restoring stage, so 
 many GFNs are flooded dynamically too.
 I'm sure that the flooding happens during the virsh-restore stage, not the 
 migration stage.
 
Interesting, is this with upstream kernel? For me the situation is
exactly the opposite. What is your command line?
 
--
   

Re: [PATCH] kvm-unit-tests: VMX: Fix confused definition of rflags

2013-08-05 Thread Arthur Chunqi Li
On Mon, Aug 5, 2013 at 4:29 PM, Gleb Natapov g...@redhat.com wrote:
 On Tue, Jul 30, 2013 at 11:41:00PM +0800, Arthur Chunqi Li wrote:
 Change rflags in struct regs to host_rflags. Remove settings
 to GUEST_RFLAGS since GUEST_RFLAGS can be set by vmwrite. Treat
 host_rflags as host rflags before and after vmenter.

 I am not sure the change is for the better. Before the change one could
 set up rflags for guest environment by setting regs.rflags, no special
 init function had to be written. I do not see any problem with correct
 code, except that rflags is not correct on a guest entry, but this
 should be easy to fix.
regs.rflags are designed to set guest rflags, but the current
implementation just use it as host_rflags. For every VM entry, it will
load value set by vmcs_write(GUEST_RFLAGS). Set regs.flags as host
rflags and then enter VM cannot affect VM's rflags, which is the
current implementation.

Besides, if host want to set/get guest's rflags, it just use
vmcs_write/read(GUEST_RFLAGS).

Arthur

 Besides, add checks to flags after vmenter.

 Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
 ---
  x86/vmx.c |   11 ++-
  x86/vmx.h |2 +-
  2 files changed, 7 insertions(+), 6 deletions(-)

 diff --git a/x86/vmx.c b/x86/vmx.c
 index 7467927..082c3bb 100644
 --- a/x86/vmx.c
 +++ b/x86/vmx.c
 @@ -481,6 +481,8 @@ static int vmx_run()
   vmresume\n\t
   2: 
   setbe %0\n\t
 + jbe vmx_return\n\t
 + ud2\n\t
   vmx_return:\n\t
   SAVE_GPR_C
   SAVE_RFLAGS
 @@ -505,15 +507,15 @@ static int vmx_run()
   return 0;
   case VMX_TEST_LAUNCH_ERR:
   printf(%s : vmlaunch failed.\n, __func__);
 - if ((!(regs.rflags  X86_EFLAGS_CF)  !(regs.rflags  
 X86_EFLAGS_ZF))
 - || ((regs.rflags  X86_EFLAGS_CF)  (regs.rflags  
 X86_EFLAGS_ZF)))
 + if ((!(regs.host_rflags  X86_EFLAGS_CF)  !(regs.host_rflags 
  X86_EFLAGS_ZF))
 + || ((regs.host_rflags  X86_EFLAGS_CF)  
 (regs.host_rflags  X86_EFLAGS_ZF)))
   printf(\tvmlaunch set wrong flags\n);
   report(test vmlaunch, 0);
   break;
   case VMX_TEST_RESUME_ERR:
   printf(%s : vmresume failed.\n, __func__);
 - if ((!(regs.rflags  X86_EFLAGS_CF)  !(regs.rflags  
 X86_EFLAGS_ZF))
 - || ((regs.rflags  X86_EFLAGS_CF)  (regs.rflags  
 X86_EFLAGS_ZF)))
 + if ((!(regs.host_rflags  X86_EFLAGS_CF)  !(regs.host_rflags 
  X86_EFLAGS_ZF))
 + || ((regs.host_rflags  X86_EFLAGS_CF)  
 (regs.host_rflags  X86_EFLAGS_ZF)))
   printf(\tvmresume set wrong flags\n);
   report(test vmresume, 0);
   break;
 @@ -540,7 +542,6 @@ static int test_run(struct vmx_test *test)
   test-exits = 0;
   current = test;
   regs = test-guest_regs;
 - vmcs_write(GUEST_RFLAGS, regs.rflags | 0x2);
   launched = 0;
   printf(\nTest suite : %s\n, test-name);
   vmx_run();
 diff --git a/x86/vmx.h b/x86/vmx.h
 index 1fb9738..d80e000 100644
 --- a/x86/vmx.h
 +++ b/x86/vmx.h
 @@ -27,7 +27,7 @@ struct regs {
   u64 r13;
   u64 r14;
   u64 r15;
 - u64 rflags;
 + u64 host_rflags;
  };

  struct vmx_test {
 --
 1.7.9.5

 --
 Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 12/15] nEPT: Nested INVEPT

2013-08-05 Thread Xiao Guangrong
On 08/05/2013 04:07 PM, Gleb Natapov wrote:
 From: Nadav Har'El n...@il.ibm.com
 
 If we let L1 use EPT, we should probably also support the INVEPT instruction.
 
 In our current nested EPT implementation, when L1 changes its EPT table
 for L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in
 the course of this modification already calls INVEPT. But if last level
 of shadow page is unsync not all L1's changes to EPT12 are intercepted,
 which means roots need to be synced when L1 calls INVEPT. Global INVEPT
 should not be different since roots are synced by kvm_mmu_load() each
 time EPTP02 changes.

Reviewed-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM: nVMX: Advertise IA32_PAT in VM exit control

2013-08-05 Thread Arthur Chunqi Li
Advertise VM_EXIT_SAVE_IA32_PAT and VM_EXIT_LOAD_IA32_PAT.

Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
---
 arch/x86/kvm/vmx.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 45fd70c..240f0db 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2198,7 +2198,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #else
nested_vmx_exit_ctls_high = 0;
 #endif
-   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+   VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
 
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [Qemu-devel] vm performance degradation after kvm live migration or save-restore with EPT enabled

2013-08-05 Thread Zhanghaoyu (A)
hi all,

I met similar problem to these, while performing live migration or 
save-restore test on the kvm platform (qemu:1.4.0, host:suse11sp2, 
guest:suse11sp2), running tele-communication software suite in 
guest, 
https://lists.gnu.org/archive/html/qemu-devel/2013-05/msg00098.html
http://comments.gmane.org/gmane.comp.emulators.kvm.devel/102506
http://thread.gmane.org/gmane.comp.emulators.kvm.devel/100592
https://bugzilla.kernel.org/show_bug.cgi?id=58771

After live migration or virsh restore [savefile], one process's CPU 
utilization went up by about 30%, resulted in throughput 
degradation of this process.

If EPT disabled, this problem gone.

I suspect that kvm hypervisor has business with this problem.
Based on above suspect, I want to find the two adjacent versions of 
kvm-kmod which triggers this problem or not (e.g. 2.6.39, 3.0-rc1), 
and analyze the differences between this two versions, or apply the 
patches between this two versions by bisection method, finally find 
the key patches.

Any better ideas?

Thanks,
Zhang Haoyu
   
   I've attempted to duplicate this on a number of machines that are as 
   similar to yours as I am able to get my hands on, and so far have not 
   been able to see any performance degradation. And from what I've read 
   in the above links, huge pages do not seem to be part of the problem.
   
   So, if you are in a position to bisect the kernel changes, that would 
   probably be the best avenue to pursue in my opinion.
   
   Bruce
   
   I found the first bad 
   commit([612819c3c6e67bac8fceaa7cc402f13b1b63f7e4] KVM: propagate fault 
   r/w information to gup(), allow read-only memory) which triggers this 
   problem by git bisecting the kvm kernel (download from 
   https://git.kernel.org/pub/scm/virt/kvm/kvm.git) changes.
   
   And,
   git log 612819c3c6e67bac8fceaa7cc402f13b1b63f7e4 -n 1 -p  
   612819c3c6e67bac8fceaa7cc402f13b1b63f7e4.log
   git diff 
   612819c3c6e67bac8fceaa7cc402f13b1b63f7e4~1..612819c3c6e67bac8fceaa7cc4
   02f13b1b63f7e4  612819c3c6e67bac8fceaa7cc402f13b1b63f7e4.diff
   
   Then, I diffed 612819c3c6e67bac8fceaa7cc402f13b1b63f7e4.log and 
   612819c3c6e67bac8fceaa7cc402f13b1b63f7e4.diff,
   came to a conclusion that all of the differences between 
   612819c3c6e67bac8fceaa7cc402f13b1b63f7e4~1 and 
   612819c3c6e67bac8fceaa7cc402f13b1b63f7e4
   are contributed by no other than 
   612819c3c6e67bac8fceaa7cc402f13b1b63f7e4, so this commit is the 
   peace-breaker which directly or indirectly causes the degradation.
   
   Does the map_writable flag passed to mmu_set_spte() function have 
   effect on PTE's PAT flag or increase the VMEXITs induced by that guest 
   tried to write read-only memory?
   
   Thanks,
   Zhang Haoyu
   
  
  There should be no read-only memory maps backing guest RAM.
  
  Can you confirm map_writable = false is being passed to __direct_map? 
  (this should not happen, for guest RAM).
  And if it is false, please capture the associated GFN.
  
  I added below check and printk at the start of __direct_map() at the first 
  bad commit version,
  --- kvm-612819c3c6e67bac8fceaa7cc402f13b1b63f7e4/arch/x86/kvm/mmu.c 
  2013-07-26 18:44:05.0 +0800
  +++ kvm-612819/arch/x86/kvm/mmu.c   2013-07-31 00:05:48.0 
  +0800
  @@ -2223,6 +2223,9 @@ static int __direct_map(struct kvm_vcpu
  int pt_write = 0;
  gfn_t pseudo_gfn;
  
  +if (!map_writable)
  +printk(KERN_ERR %s: %s: gfn = %llu \n, __FILE__, 
  __func__, gfn);
  +
  for_each_shadow_entry(vcpu, (u64)gfn  PAGE_SHIFT, iterator) {
  if (iterator.level == level) {
  unsigned pte_access = ACC_ALL;
  
  I virsh-save the VM, and then virsh-restore it, so many GFNs were 
  printed, you can absolutely describe it as flooding.
  
 The flooding you see happens during migrate to file stage because of dirty
 page tracking. If you clear dmesg after virsh-save you should not see any
 flooding after virsh-restore. I just checked with latest tree, I do not.
 
 I made a verification again.
 I virsh-save the VM, during the saving stage, I run 'dmesg', no GFN printed, 
 maybe the switching from running stage to pause stage takes so short time, 
 no guest-write happens during this switching period.
 After the completion of the saving operation, I run 'dmesg -c' to clear the 
 buffer all the same, then I virsh-restore the VM, so many GFNs are printed 
 by running 'dmesg',
 and I also run 'tail -f /var/log/messages' during the restoring stage, so 
 many GFNs are flooded dynamically too.
 I'm sure that the flooding happens during the virsh-restore stage, not the 
 migration stage.
 
Interesting, is this with upstream kernel? For me the situation is
exactly the opposite. What is your command line?
 
I made the verification on the first bad commit 

Re: [PATCH] kvm-unit-tests: VMX: Fix confused definition of rflags

2013-08-05 Thread Gleb Natapov
On Mon, Aug 05, 2013 at 04:48:47PM +0800, Arthur Chunqi Li wrote:
 On Mon, Aug 5, 2013 at 4:29 PM, Gleb Natapov g...@redhat.com wrote:
  On Tue, Jul 30, 2013 at 11:41:00PM +0800, Arthur Chunqi Li wrote:
  Change rflags in struct regs to host_rflags. Remove settings
  to GUEST_RFLAGS since GUEST_RFLAGS can be set by vmwrite. Treat
  host_rflags as host rflags before and after vmenter.
 
  I am not sure the change is for the better. Before the change one could
  set up rflags for guest environment by setting regs.rflags, no special
  init function had to be written. I do not see any problem with correct
  code, except that rflags is not correct on a guest entry, but this
  should be easy to fix.
 regs.rflags are designed to set guest rflags, but the current
 implementation just use it as host_rflags.
Current implementation uses it to set guest flags at the beginning of
a test.

For every VM entry, it will
 load value set by vmcs_write(GUEST_RFLAGS).
Not for every vmentry, only for the first one. Doing it for every
vmentry without saving it first with vmcs_read would been incorrect.

  Set regs.flags as host
 rflags and then enter VM cannot affect VM's rflags, which is the
 current implementation.
 
It can, on the first launch. It is easy to add code to exit_handler() to
set current-guest_regs.rflags correctly before calling test's exit handler
and write GUEST_RFLAGS according to current-guest_regs.rflags after
exit handler returns, but as you say below test can do it by itself in
exit handler if it wishes so.

 Besides, if host want to set/get guest's rflags, it just use
 vmcs_write/read(GUEST_RFLAGS).
 
True, but to do it at the beginning of the test it will require to write
special init function even if non are needed otherwise. So you are
removing this functionality without clear benefit.

 Arthur
 
  Besides, add checks to flags after vmenter.
 
  Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
  ---
   x86/vmx.c |   11 ++-
   x86/vmx.h |2 +-
   2 files changed, 7 insertions(+), 6 deletions(-)
 
  diff --git a/x86/vmx.c b/x86/vmx.c
  index 7467927..082c3bb 100644
  --- a/x86/vmx.c
  +++ b/x86/vmx.c
  @@ -481,6 +481,8 @@ static int vmx_run()
vmresume\n\t
2: 
setbe %0\n\t
  + jbe vmx_return\n\t
  + ud2\n\t
vmx_return:\n\t
SAVE_GPR_C
SAVE_RFLAGS
  @@ -505,15 +507,15 @@ static int vmx_run()
return 0;
case VMX_TEST_LAUNCH_ERR:
printf(%s : vmlaunch failed.\n, __func__);
  - if ((!(regs.rflags  X86_EFLAGS_CF)  !(regs.rflags  
  X86_EFLAGS_ZF))
  - || ((regs.rflags  X86_EFLAGS_CF)  (regs.rflags  
  X86_EFLAGS_ZF)))
  + if ((!(regs.host_rflags  X86_EFLAGS_CF)  
  !(regs.host_rflags  X86_EFLAGS_ZF))
  + || ((regs.host_rflags  X86_EFLAGS_CF)  
  (regs.host_rflags  X86_EFLAGS_ZF)))
printf(\tvmlaunch set wrong flags\n);
report(test vmlaunch, 0);
break;
case VMX_TEST_RESUME_ERR:
printf(%s : vmresume failed.\n, __func__);
  - if ((!(regs.rflags  X86_EFLAGS_CF)  !(regs.rflags  
  X86_EFLAGS_ZF))
  - || ((regs.rflags  X86_EFLAGS_CF)  (regs.rflags  
  X86_EFLAGS_ZF)))
  + if ((!(regs.host_rflags  X86_EFLAGS_CF)  
  !(regs.host_rflags  X86_EFLAGS_ZF))
  + || ((regs.host_rflags  X86_EFLAGS_CF)  
  (regs.host_rflags  X86_EFLAGS_ZF)))
printf(\tvmresume set wrong flags\n);
report(test vmresume, 0);
break;
  @@ -540,7 +542,6 @@ static int test_run(struct vmx_test *test)
test-exits = 0;
current = test;
regs = test-guest_regs;
  - vmcs_write(GUEST_RFLAGS, regs.rflags | 0x2);
launched = 0;
printf(\nTest suite : %s\n, test-name);
vmx_run();
  diff --git a/x86/vmx.h b/x86/vmx.h
  index 1fb9738..d80e000 100644
  --- a/x86/vmx.h
  +++ b/x86/vmx.h
  @@ -27,7 +27,7 @@ struct regs {
u64 r13;
u64 r14;
u64 r15;
  - u64 rflags;
  + u64 host_rflags;
   };
 
   struct vmx_test {
  --
  1.7.9.5
 
  --
  Gleb.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] vm performance degradation after kvm live migration or save-restore with EPT enabled

2013-08-05 Thread Andreas Färber
Hi,

Am 05.08.2013 11:09, schrieb Zhanghaoyu (A):
 When I build the upstream, encounter a problem that I compile and install the 
 upstream(commit: e769ece3b129698d2b09811a6f6d304e4eaa8c29) on sles11sp2 
 environment via below command
 cp /boot/config-3.0.13-0.27-default ./.config
 yes  | make oldconfig
 make  make modules_install  make install
 then, I reboot the host, and select the upstream kernel, but during the 
 starting stage, below problem happened,
 Could not find /dev/disk/by-id/scsi-3600508e0864407c5b8f7ad01-part3 
 
 I'm trying to resolve it.

Possibly you need to enable loading unsupported kernel modules?
At least that's needed when testing a kmod with a SUSE kernel.

Regards,
Andreas

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [Qemu-devel] vm performance degradation after kvm live migration or save-restore with EPT enabled

2013-08-05 Thread Zhanghaoyu (A)
Hi,

Am 05.08.2013 11:09, schrieb Zhanghaoyu (A):
 When I build the upstream, encounter a problem that I compile and 
 install the upstream(commit: e769ece3b129698d2b09811a6f6d304e4eaa8c29) 
 on sles11sp2 environment via below command cp 
 /boot/config-3.0.13-0.27-default ./.config yes  | make oldconfig 
 make  make modules_install  make install then, I reboot the host, 
 and select the upstream kernel, but during the starting stage, below 
 problem happened, Could not find 
 /dev/disk/by-id/scsi-3600508e0864407c5b8f7ad01-part3
 
 I'm trying to resolve it.

Possibly you need to enable loading unsupported kernel modules?
At least that's needed when testing a kmod with a SUSE kernel.

I have tried to set  allow_unsupported_modules 1 in 
/etc/modprobe.d/unsupported-modules, but the problem still happened.
I replace the whole kernel with the kvm kernel, not only the kvm modules.

Regards,
Andreas
[Unreadable mis-encoded MIME signature/footer omitted.]

Re: [PATCH] kvm-unit-tests: VMX: Fix confused definition of rflags

2013-08-05 Thread Gmail

在 2013-8-5,17:10,Gleb Natapov g...@redhat.com 写道:

 On Mon, Aug 05, 2013 at 04:48:47PM +0800, Arthur Chunqi Li wrote:
 On Mon, Aug 5, 2013 at 4:29 PM, Gleb Natapov g...@redhat.com wrote:
 On Tue, Jul 30, 2013 at 11:41:00PM +0800, Arthur Chunqi Li wrote:
 Change rflags in struct regs to host_rflags. Remove settings
 to GUEST_RFLAGS since GUEST_RFLAGS can be set by vmwrite. Treat
 host_rflags as host rflags before and after vmenter.
 I am not sure the change is for the better. Before the change one could
 set up rflags for guest environment by setting regs.rflags, no special
 init function had to be written. I do not see any problem with correct
 code, except that rflags is not correct on a guest entry, but this
 should be easy to fix.
 regs.rflags are designed to set guest rflags, but the current
 implementation just use it as host_rflags.
 Current implementation uses it to set guest flags at the beginning of
 a test.
 
   For every VM entry, it will
 load value set by vmcs_write(GUEST_RFLAGS).
 Not for every vmentry, only for the first one. Doing it for every
 vmentry without saving it first with vmcs_read would been incorrect.
 
 Set regs.flags as host
 rflags and then enter VM cannot affect VM's rflags, which is the
 current implementation.
 It can, on the first launch. It is easy to add code to exit_handler() to
 set current-guest_regs.rflags correctly before calling test's exit handler
 and write GUEST_RFLAGS according to current-guest_regs.rflags after
 exit handler returns, but as you say below test can do it by itself in
 exit handler if it wishes so.
 
 Besides, if host want to set/get guest's rflags, it just use
 vmcs_write/read(GUEST_RFLAGS).
 True, but to do it at the beginning of the test it will require to write
 special init function even if non are needed otherwise. So you are
 removing this functionality without clear benefit.
True, so it seems better to keep it. However, now it is confused with 
host_rflags and not correctly set/get when vmentry/vmexit. I will commit 
another patch to fix both bugs.

Arthur
 
 Arthur
 
 Besides, add checks to flags after vmenter.
 
 Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
 ---
 x86/vmx.c |   11 ++-
 x86/vmx.h |2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)
 
 diff --git a/x86/vmx.c b/x86/vmx.c
 index 7467927..082c3bb 100644
 --- a/x86/vmx.c
 +++ b/x86/vmx.c
 @@ -481,6 +481,8 @@ static int vmx_run()
  vmresume\n\t
  2: 
  setbe %0\n\t
 + jbe vmx_return\n\t
 + ud2\n\t
  vmx_return:\n\t
  SAVE_GPR_C
  SAVE_RFLAGS
 @@ -505,15 +507,15 @@ static int vmx_run()
  return 0;
  case VMX_TEST_LAUNCH_ERR:
  printf(%s : vmlaunch failed.\n, __func__);
 - if ((!(regs.rflags  X86_EFLAGS_CF)  !(regs.rflags  
 X86_EFLAGS_ZF))
 - || ((regs.rflags  X86_EFLAGS_CF)  (regs.rflags  
 X86_EFLAGS_ZF)))
 + if ((!(regs.host_rflags  X86_EFLAGS_CF)  
 !(regs.host_rflags  X86_EFLAGS_ZF))
 + || ((regs.host_rflags  X86_EFLAGS_CF)  
 (regs.host_rflags  X86_EFLAGS_ZF)))
  printf(\tvmlaunch set wrong flags\n);
  report(test vmlaunch, 0);
  break;
  case VMX_TEST_RESUME_ERR:
  printf(%s : vmresume failed.\n, __func__);
 - if ((!(regs.rflags  X86_EFLAGS_CF)  !(regs.rflags  
 X86_EFLAGS_ZF))
 - || ((regs.rflags  X86_EFLAGS_CF)  (regs.rflags  
 X86_EFLAGS_ZF)))
 + if ((!(regs.host_rflags  X86_EFLAGS_CF)  
 !(regs.host_rflags  X86_EFLAGS_ZF))
 + || ((regs.host_rflags  X86_EFLAGS_CF)  
 (regs.host_rflags  X86_EFLAGS_ZF)))
  printf(\tvmresume set wrong flags\n);
  report(test vmresume, 0);
  break;
 @@ -540,7 +542,6 @@ static int test_run(struct vmx_test *test)
  test-exits = 0;
  current = test;
  regs = test-guest_regs;
 - vmcs_write(GUEST_RFLAGS, regs.rflags | 0x2);
  launched = 0;
  printf(\nTest suite : %s\n, test-name);
  vmx_run();
 diff --git a/x86/vmx.h b/x86/vmx.h
 index 1fb9738..d80e000 100644
 --- a/x86/vmx.h
 +++ b/x86/vmx.h
 @@ -27,7 +27,7 @@ struct regs {
  u64 r13;
  u64 r14;
  u64 r15;
 - u64 rflags;
 + u64 host_rflags;
 };
 
 struct vmx_test {
 --
 1.7.9.5
 
 --
Gleb.
 
 --
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: nVMX: Advertise IA32_PAT in VM exit control

2013-08-05 Thread Gleb Natapov
On Mon, Aug 05, 2013 at 05:10:35PM +0800, Arthur Chunqi Li wrote:
 Advertise VM_EXIT_SAVE_IA32_PAT and VM_EXIT_LOAD_IA32_PAT.
 
 Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
 ---
  arch/x86/kvm/vmx.c |3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 45fd70c..240f0db 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -2198,7 +2198,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
  #else
   nested_vmx_exit_ctls_high = 0;
  #endif
 - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 + nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
  
You should not set those if host does not support them, otherwise
GUEST_IA32_PAT may not be available.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: nVMX: Advertise IA32_PAT in VM exit control

2013-08-05 Thread Gmail


 On Mon, Aug 05, 2013 at 05:10:35PM +0800, Arthur Chunqi Li wrote:
 Advertise VM_EXIT_SAVE_IA32_PAT and VM_EXIT_LOAD_IA32_PAT.
 
 Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
 ---
 arch/x86/kvm/vmx.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 45fd70c..240f0db 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -2198,7 +2198,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #else
nested_vmx_exit_ctls_high = 0;
 #endif
 -nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 +nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 +VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
 
 You should not set those if host does not support them, otherwise
 GUEST_IA32_PAT may not be available
To Jan,
Is this different from IA32_EFER?

Arthur
 
 --
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] vm performance degradation after kvm live migration or save-restore with EPT enabled

2013-08-05 Thread Gleb Natapov
On Mon, Aug 05, 2013 at 09:09:56AM +, Zhanghaoyu (A) wrote:
 The QEMU command line (/var/log/libvirt/qemu/[domain name].log),
 LC_ALL=C PATH=/bin:/sbin:/usr/bin:/usr/sbin HOME=/ QEMU_AUDIO_DRV=none 
 /usr/local/bin/qemu-system-x86_64 -name ATS1 -S -M pc-0.12 -cpu qemu32 
 -enable-kvm -m 12288 -smp 4,sockets=4,cores=1,threads=1 -uuid 
 0505ec91-382d-800e-2c79-e5b286eb60b5 -no-user-config -nodefaults -chardev 
 socket,id=charmonitor,path=/var/lib/libvirt/qemu/ATS1.monitor,server,nowait 
 -mon chardev=charmonitor,id=monitor,mode=control -rtc base=localtime 
 -no-shutdown -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive 
 file=/opt/ne/vm/ATS1.img,if=none,id=drive-virtio-disk0,format=raw,cache=none 
 -device 
 virtio-blk-pci,scsi=off,bus=pci.0,addr=0x8,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1
  -netdev tap,fd=20,id=hostnet0,vhost=on,vhostfd=21 -device 
 virtio-net-pci,netdev=hostnet0,id=net0,mac=00:e0:fc:00:0f:00,bus=pci.0,addr=0x3,bootindex=2
  -netdev tap,fd=22,id=hostnet1,vhost=on,vhostfd=23 -device 
 virtio-net-pci,netdev=hostnet1,id=net1,mac=00:e0:fc:01:0f:00,bus=pci.0,addr=0x4
  -netdev tap,fd=24,id=hostnet2,vhost=on,vhostfd=25 -device 
 virtio-net-pci,netdev=hostnet2,id=net2,mac=00:e0:fc:02:0f:00,bus=pci.0,addr=0x5
  -netdev tap,fd=26,id=hostnet3,vhost=on,vhostfd=27 -device 
 virtio-net-pci,netdev=hostnet3,id=net3,mac=00:e0:fc:03:0f:00,bus=pci.0,addr=0x6
  -netdev tap,fd=28,id=hostnet4,vhost=on,vhostfd=29 -device 
 virtio-net-pci,netdev=hostnet4,id=net4,mac=00:e0:fc:0a:0f:00,bus=pci.0,addr=0x7
  -netdev tap,fd=30,id=hostnet5,vhost=on,vhostfd=31 -device 
 virtio-net-pci,netdev=hostnet5,id=net5,mac=00:e0:fc:0b:0f:00,bus=pci.0,addr=0x9
  -chardev pty,id=charserial0 -device 
 isa-serial,chardev=charserial0,id=serial0 -vnc *:0 -k en-us -vga cirrus 
 -device i6300esb,id=watchdog0,bus=pci.0,addr=0xb -watchdog-action poweroff 
 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0xa
 
Which QEMU version is this? Can you try with e1000 NICs instead of
virtio?

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC V11 15/18] kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor

2013-08-05 Thread Ingo Molnar

* Gleb Natapov g...@redhat.com wrote:

 On Fri, Aug 02, 2013 at 11:25:39AM +0200, Ingo Molnar wrote:
   Ingo,
   
   Do you have any concerns reg this series? please let me know if this 
   looks good now to you.
  
  I'm inclined to NAK it for excessive quotation - who knows how many 
  people left the discussion in disgust? Was it done to drive away as 
  many reviewers as possible?
  
  Anyway, see my other reply, the measurement results seem hard to 
  interpret and inconclusive at the moment.

 That result was only for patch 18 of the series, not pvspinlock in 
 general.

Okay - I've re-read the performance numbers and they are impressive, so no 
objections from me.

The x86 impact seems to be a straightforward API change, with most of the 
changes on the virtualization side. So:

Acked-by: Ingo Molnar mi...@kernel.org

I guess you'd want to carry this in the KVM tree or so - maybe in a 
separate branch because it changes Xen as well?

Thanks,

Ingo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC V11 15/18] kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor

2013-08-05 Thread Raghavendra K T



That result was only for patch 18 of the series, not pvspinlock in
general.


Okay - I've re-read the performance numbers and they are impressive, so no
objections from me.

The x86 impact seems to be a straightforward API change, with most of the
changes on the virtualization side. So:

Acked-by: Ingo Molnar mi...@kernel.org

I guess you'd want to carry this in the KVM tree or so - maybe in a
separate branch because it changes Xen as well?



Thank you Ingo for taking a relook.

Gleb, Please let me know if you want me to resend the first 17 patches
with acked-bys. i.e excluding the 18th patch.




--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: nVMX: Advertise IA32_PAT in VM exit control

2013-08-05 Thread Jan Kiszka
On 2013-08-05 11:36, Gmail wrote:
 
 
 On Mon, Aug 05, 2013 at 05:10:35PM +0800, Arthur Chunqi Li wrote:
 Advertise VM_EXIT_SAVE_IA32_PAT and VM_EXIT_LOAD_IA32_PAT.

 Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
 ---
 arch/x86/kvm/vmx.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 45fd70c..240f0db 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -2198,7 +2198,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #else
nested_vmx_exit_ctls_high = 0;
 #endif
 -nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 +nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 +VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;

 You should not set those if host does not support them, otherwise
 GUEST_IA32_PAT may not be available
 To Jan,
 Is this different from IA32_EFER?

As we use a real VMCS to keep GUEST/HOST_IA32_EFER, we actually do
depend on the existence of the host feature. So my patch requires an
update as you noted.

Jan




signature.asc
Description: OpenPGP digital signature


Re: [PATCH] KVM: nVMX: Advertise IA32_PAT in VM exit control

2013-08-05 Thread Gleb Natapov
On Mon, Aug 05, 2013 at 12:38:32PM +0200, Jan Kiszka wrote:
 On 2013-08-05 11:36, Gmail wrote:
  
  
  On Mon, Aug 05, 2013 at 05:10:35PM +0800, Arthur Chunqi Li wrote:
  Advertise VM_EXIT_SAVE_IA32_PAT and VM_EXIT_LOAD_IA32_PAT.
 
  Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
  ---
  arch/x86/kvm/vmx.c |3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)
 
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index 45fd70c..240f0db 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -2198,7 +2198,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
  #else
 nested_vmx_exit_ctls_high = 0;
  #endif
  -nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  +nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
  +VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
 
  You should not set those if host does not support them, otherwise
  GUEST_IA32_PAT may not be available
  To Jan,
  Is this different from IA32_EFER?
 
 As we use a real VMCS to keep GUEST/HOST_IA32_EFER, we actually do
 depend on the existence of the host feature. So my patch requires an
 update as you noted.
 
Your patch didn't add new users of GUEST_IA32_PAT and
VM_ENTRY_LOAD_IA32_PAT is enabled by nested only if host has it, so I am
not sure why you are saying that patch should be updated.


--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: nVMX: Advertise IA32_PAT in VM exit control

2013-08-05 Thread Arthur Chunqi Li
On Mon, Aug 5, 2013 at 6:55 PM, Gleb Natapov g...@redhat.com wrote:
 On Mon, Aug 05, 2013 at 12:38:32PM +0200, Jan Kiszka wrote:
 On 2013-08-05 11:36, Gmail wrote:
 
 
  On Mon, Aug 05, 2013 at 05:10:35PM +0800, Arthur Chunqi Li wrote:
  Advertise VM_EXIT_SAVE_IA32_PAT and VM_EXIT_LOAD_IA32_PAT.
 
  Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
  ---
  arch/x86/kvm/vmx.c |3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)
 
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index 45fd70c..240f0db 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -2198,7 +2198,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
  #else
 nested_vmx_exit_ctls_high = 0;
  #endif
  -nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  +nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
  +VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
 
  You should not set those if host does not support them, otherwise
  GUEST_IA32_PAT may not be available
  To Jan,
  Is this different from IA32_EFER?

 As we use a real VMCS to keep GUEST/HOST_IA32_EFER, we actually do
 depend on the existence of the host feature. So my patch requires an
 update as you noted.

 Your patch didn't add new users of GUEST_IA32_PAT and
 VM_ENTRY_LOAD_IA32_PAT is enabled by nested only if host has it, so I am
 not sure why you are saying that patch should be updated.
I think Jan is pointing to [1], while you are referring to a different patch.

[1] http://www.mail-archive.com/kvm@vger.kernel.org/msg94188.html

Arthur


 --
 Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: nVMX: Advertise IA32_PAT in VM exit control

2013-08-05 Thread Jan Kiszka
On 2013-08-05 13:00, Arthur Chunqi Li wrote:
 On Mon, Aug 5, 2013 at 6:55 PM, Gleb Natapov g...@redhat.com wrote:
 On Mon, Aug 05, 2013 at 12:38:32PM +0200, Jan Kiszka wrote:
 On 2013-08-05 11:36, Gmail wrote:


 On Mon, Aug 05, 2013 at 05:10:35PM +0800, Arthur Chunqi Li wrote:
 Advertise VM_EXIT_SAVE_IA32_PAT and VM_EXIT_LOAD_IA32_PAT.

 Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
 ---
 arch/x86/kvm/vmx.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 45fd70c..240f0db 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -2198,7 +2198,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #else
nested_vmx_exit_ctls_high = 0;
 #endif
 -nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 +nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 +VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;

 You should not set those if host does not support them, otherwise
 GUEST_IA32_PAT may not be available
 To Jan,
 Is this different from IA32_EFER?

 As we use a real VMCS to keep GUEST/HOST_IA32_EFER, we actually do
 depend on the existence of the host feature. So my patch requires an
 update as you noted.

 Your patch didn't add new users of GUEST_IA32_PAT and
 VM_ENTRY_LOAD_IA32_PAT is enabled by nested only if host has it, so I am
 not sure why you are saying that patch should be updated.
 I think Jan points [1] and you refer to another.
 
 [1] http://www.mail-archive.com/kvm@vger.kernel.org/msg94188.html

Yep, that's what I was referring to.

Jan




signature.asc
Description: OpenPGP digital signature


Re: [PATCH] KVM: nVMX: Advertise IA32_PAT in VM exit control

2013-08-05 Thread Gleb Natapov
On Mon, Aug 05, 2013 at 01:04:25PM +0200, Jan Kiszka wrote:
 On 2013-08-05 13:00, Arthur Chunqi Li wrote:
  On Mon, Aug 5, 2013 at 6:55 PM, Gleb Natapov g...@redhat.com wrote:
  On Mon, Aug 05, 2013 at 12:38:32PM +0200, Jan Kiszka wrote:
  On 2013-08-05 11:36, Gmail wrote:
 
 
  On Mon, Aug 05, 2013 at 05:10:35PM +0800, Arthur Chunqi Li wrote:
  Advertise VM_EXIT_SAVE_IA32_PAT and VM_EXIT_LOAD_IA32_PAT.
 
  Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
  ---
  arch/x86/kvm/vmx.c |3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)
 
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index 45fd70c..240f0db 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -2198,7 +2198,8 @@ static __init void 
  nested_vmx_setup_ctls_msrs(void)
  #else
 nested_vmx_exit_ctls_high = 0;
  #endif
  -nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  +nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
  +VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
 
  You should not set those if host does not support them, otherwise
  GUEST_IA32_PAT may not be available
  To Jan,
  Is this different from IA32_EFER?
 
  As we use a real VMCS to keep GUEST/HOST_IA32_EFER, we actually do
  depend on the existence of the host feature. So my patch requires an
  update as you noted.
 
  Your patch didn't add new users of GUEST_IA32_PAT and
  VM_ENTRY_LOAD_IA32_PAT is enabled by nested only if host has it, so I am
  not sure why you are saying that patch should be updated.
  I think Jan points [1] and you refer to another.
  
  [1] http://www.mail-archive.com/kvm@vger.kernel.org/msg94188.html
 
 Yep, that's what I was referring to.
 
Ah, too many patches :)

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] KVM: nVMX: Advertise IA32_PAT in VM exit control

2013-08-05 Thread Arthur Chunqi Li
Advertise VM_EXIT_SAVE_IA32_PAT and VM_EXIT_LOAD_IA32_PAT.

Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
---

ChangeLog to v1:
Add host features check. Format is the same as entry controls.

 arch/x86/kvm/vmx.c |   10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 396572d..fd451c1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2191,14 +2191,16 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
 * 17 must be 1.
 */
+   rdmsr(MSR_IA32_VMX_EXIT_CTLS,
+   nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+   nested_vmx_exit_ctls_high =
+   VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
/* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
 #ifdef CONFIG_X86_64
-   nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#else
-   nested_vmx_exit_ctls_high = 0;
+   nested_vmx_exit_ctls_high |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
-   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 01/15] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-08-05 Thread Arthur Chunqi Li
On Mon, Aug 5, 2013 at 4:07 PM, Gleb Natapov g...@redhat.com wrote:
 From: Nadav Har'El n...@il.ibm.com

 Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577
 switch the EFER MSR when EPT is used and the host and guest have different
 NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2)
 and want to be able to run recent KVM as L1, we need to allow L1 to use this
 EFER switching feature.

 To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available,
 and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds
 support for the former (the latter is still unsupported).

 Nested entry and exit emulation (prepare_vmcs_02 and load_vmcs12_host_state,
 respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all
 that's left to do in this patch is to properly advertise this feature to L1.

 Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using
 vmx_set_efer (which itself sets one of several vmcs02 fields), so we always
 support this feature, regardless of whether the host supports it.

 Reviewed-by: Orit Wasserman owass...@redhat.com
 Signed-off-by: Nadav Har'El n...@il.ibm.com
 Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
 Signed-off-by: Xinhao Xu xinhao...@intel.com
 Signed-off-by: Yang Zhang yang.z.zh...@intel.com
 Signed-off-by: Gleb Natapov g...@redhat.com
 ---
  arch/x86/kvm/vmx.c |   23 ---
  1 file changed, 16 insertions(+), 7 deletions(-)

 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index e999dc7..27efa6a 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -2198,7 +2198,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
  #else
 nested_vmx_exit_ctls_high = 0;
  #endif
 -   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 +   nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
 + VM_EXIT_LOAD_IA32_EFER);
Gleb, why we don't need to check whether host supports
VM_EXIT_LOAD_IA32_EFER here, as what you noted in my
VM_EXIT_LOAD_IA32_PAT patch?

Arthur

 /* entry controls */
 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
 @@ -2207,8 +2208,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 nested_vmx_entry_ctls_high =
 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
 -   nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 -
 +   nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
 +  VM_ENTRY_LOAD_IA32_EFER);
 /* cpu-based controls */
 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
 nested_vmx_procbased_ctls_low, 
 nested_vmx_procbased_ctls_high);
 @@ -7529,10 +7530,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
 struct vmcs12 *vmcs12)
 vcpu-arch.cr0_guest_owned_bits = ~vmcs12-cr0_guest_host_mask;
 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu-arch.cr0_guest_owned_bits);

 -   /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below 
 */
 -   vmcs_write32(VM_EXIT_CONTROLS,
 -   vmcs12-vm_exit_controls | vmcs_config.vmexit_ctrl);
 -   vmcs_write32(VM_ENTRY_CONTROLS, vmcs12-vm_entry_controls |
 +   /* L2-L1 exit controls are emulated - the hardware exit is to L0 so
 +* we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
 +* bits are further modified by vmx_set_efer() below.
 +*/
 +   vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
 +
 +   /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
 +* emulated by vmx_set_efer(), below.
 +*/
 +   vmcs_write32(VM_ENTRY_CONTROLS,
 +   (vmcs12-vm_entry_controls  ~VM_ENTRY_LOAD_IA32_EFER 
 +   ~VM_ENTRY_IA32E_MODE) |
 (vmcs_config.vmentry_ctrl  ~VM_ENTRY_IA32E_MODE));

 if (vmcs12-vm_entry_controls  VM_ENTRY_LOAD_IA32_PAT)
 --
 1.7.10.4

 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: FAQ on linux-kvm.org has broken link

2013-08-05 Thread Stefan Hajnoczi
On Fri, Aug 02, 2013 at 08:06:58PM +0200, folkert wrote:
  A couple of questions:
  Please post the QEMU command-line from the host (ps aux | grep qemu).
 
 I'll post them all:
 - UMTS-clone: this one works fine since it was created a week ago
 - belle: this one was fine but suddenly also showed the problem
 - mauer: the problem one
 
 112   4819 1  4 Jul30 ?03:29:39 /usr/bin/kvm -S -M pc-1.1 
 -enable-kvm -m 1024 -smp 1,sockets=1,cores=1,threads=1 -name UMTS-clone -uuid 
 e49502f1-0c74-2a60-99dc-7602da5ee640 -no-user-config -nodefaults -chardev 
 socket,id=charmonitor,path=/var/lib/libvirt/qemu/UMTS-clone.monitor,server,nowait
  -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc -no-shutdown 
 -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive 
 file=/dev/VGNEO/LV_V_UMTS-clone,if=none,id=drive-virtio-disk0,format=raw,cache=writeback
  -device 
 virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1
  -drive 
 file=/home/folkert/ISOs/wheezy.iso,if=none,id=drive-ide0-1-0,readonly=on,format=raw
  -device ide-cd,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0 -netdev 
 tap,fd=20,id=hostnet0,vhost=on,vhostfd=21 -device 
 virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:09:3b:b6,bus=pci.0,addr=0x3
  -chardev pty,id=charserial0 -device 
 isa-serial,chardev=charserial0,id=serial0 -vnc 127.0.0.1:0,password -vga 
 cirrus -device usb-host,hostbus=6,hostaddr=5,id=hostdev0 -device 
 virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x6
 112  10065 1 11 Jul30 ?07:46:16 /usr/bin/kvm -S -M pc-1.1 
 -enable-kvm -m 8192 -smp 12,sockets=12,cores=1,threads=1 -name belle -uuid 
 16b704d7-5fbd-d67b-71e6-0d6b43f1bc0a -no-user-config -nodefaults -chardev 
 socket,id=charmonitor,path=/var/lib/libvirt/qemu/belle.monitor,server,nowait 
 -mon chardev=charmonitor,id=monitor,mode=control -rtc base=localtime 
 -no-shutdown -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive 
 file=/dev/VGNEO/LV_V_BELLE,if=none,id=drive-virtio-disk0,format=raw -device 
 virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1
  -drive 
 file=/dev/VGNEO/LV_V_BELLE_OS,if=none,id=drive-virtio-disk1,format=raw,cache=writeback
  -device 
 virtio-blk-pci,scsi=off,bus=pci.0,addr=0x8,drive=drive-virtio-disk1,id=virtio-disk1
  -drive 
 file=/dev/VGJOURNAL/LV_J_BELLE,if=none,id=drive-ide0-0-0,format=raw,cache=writeback
  -device ide-hd,bus=ide.0,unit=0,drive=drive-ide0-0-0,id=ide0-0-0 -drive 
 if=none,id=drive-ide0-1-0,readonly=on,format=raw,cache=none -device 
 ide-cd,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0 -netdev 
 tap,fd=26,id=hostnet0,vhost=on,vhostfd=27 -device 
 virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:75:4a:6f,bus=pci.0,addr=0x3
  -netdev tap,fd=28,id=hostnet1,vhost=on,vhostfd=29 -device 
 virtio-net-pci,netdev=hostnet1,id=net1,mac=52:54:00:0a:6e:de,bus=pci.0,addr=0x7
  -chardev pty,id=charserial0 -device 
 isa-serial,chardev=charserial0,id=serial0 -device usb-tablet,id=input0 -vnc 
 127.0.0.1:1,password -vga cirrus -device 
 intel-hda,id=sound0,bus=pci.0,addr=0x4 -device 
 hda-duplex,id=sound0-codec0,bus=sound0.0,cad=0 -device 
 virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x6
 root 13116 12830  0 19:54 pts/800:00:00 grep qemu
 112  23453 1 57 13:16 ?03:46:51 /usr/bin/kvm -S -M pc-1.1 
 -enable-kvm -m 8192 -smp 8,maxcpus=12,sockets=12,cores=1,threads=1 -name 
 mauer -uuid 3a8452e6-81af-b185-63b6-2b32be17ed87 -no-user-config -nodefaults 
 -chardev 
 socket,id=charmonitor,path=/var/lib/libvirt/qemu/mauer.monitor,server,nowait 
 -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc -no-shutdown 
 -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive 
 file=/dev/VGNEO/LV_V_MAUER,if=none,id=drive-virtio-disk0,format=raw,cache=writeback
  -device 
 virtio-blk-pci,scsi=off,bus=pci.0,addr=0x8,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1
  -drive 
 file=/dev/VGJOURNAL/LV_J_MAUER,if=none,id=drive-virtio-disk1,format=raw,cache=writethrough
  -device 
 virtio-blk-pci,scsi=off,bus=pci.0,addr=0xa,drive=drive-virtio-disk1,id=virtio-disk1
  -drive if=none,id=drive-ide0-1-0,readonly=on,format=raw -device 
 ide-cd,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0 -netdev 
 tap,fd=26,id=hostnet0,vhost=on,vhostfd=27 -device 
 virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:86:d9:1f,bus=pci.0,addr=0x3
  -netdev tap,fd=28,id=hostnet1,vhost=on,vhostfd=29 -device 
 virtio-net-pci,netdev=hostnet1,id=net1,mac=52:54:00:a3:12:8a,bus=pci.0,addr=0x4
  -netdev tap,fd=30,id=hostnet2,vhost=on,vhostfd=31 -device 
 virtio-net-pci,netdev=hostnet2,id=net2,mac=52:54:00:0f:54:c2,bus=pci.0,addr=0x5
  -chardev pty,id=charserial0 -device 
 isa-serial,chardev=charserial0,id=serial0 -device usb-tablet,id=input0 -vnc 
 127.0.0.1:2,password -vga cirrus -device 
 intel-hda,id=sound0,bus=pci.0,addr=0x7 -device 
 hda-duplex,id=sound0-codec0,bus=sound0.0,cad=0 -device 
 

Re: [PATCH v7 01/15] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-08-05 Thread Gleb Natapov
On Mon, Aug 05, 2013 at 07:27:33PM +0800, Arthur Chunqi Li wrote:
 On Mon, Aug 5, 2013 at 4:07 PM, Gleb Natapov g...@redhat.com wrote:
  From: Nadav Har'El n...@il.ibm.com
 
  Recent KVM, since 
  http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577
  switch the EFER MSR when EPT is used and the host and guest have different
  NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2)
  and want to be able to run recent KVM as L1, we need to allow L1 to use this
  EFER switching feature.
 
  To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if 
  available,
  and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds
  support for the former (the latter is still unsupported).
 
  Nested entry and exit emulation (prepare_vmcs_02 and load_vmcs12_host_state,
  respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all
  that's left to do in this patch is to properly advertise this feature to L1.
 
  Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using
  vmx_set_efer (which itself sets one of several vmcs02 fields), so we always
  support this feature, regardless of whether the host supports it.
 
  Reviewed-by: Orit Wasserman owass...@redhat.com
  Signed-off-by: Nadav Har'El n...@il.ibm.com
  Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
  Signed-off-by: Xinhao Xu xinhao...@intel.com
  Signed-off-by: Yang Zhang yang.z.zh...@intel.com
  Signed-off-by: Gleb Natapov g...@redhat.com
  ---
   arch/x86/kvm/vmx.c |   23 ---
   1 file changed, 16 insertions(+), 7 deletions(-)
 
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index e999dc7..27efa6a 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -2198,7 +2198,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
   #else
  nested_vmx_exit_ctls_high = 0;
   #endif
  -   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  +   nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
  + VM_EXIT_LOAD_IA32_EFER);
 Gleb, why we don't need to check whether host supports
 VM_EXIT_LOAD_IA32_EFER here, as what you noted in my
 VM_EXIT_LOAD_IA32_PAT patch?
Nested VMX completely emulates the capability. It calls vmx_set_efer(),
which is supposed to handle hosts without VM_EXIT_LOAD_IA32_EFER support.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 01/15] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-08-05 Thread Paolo Bonzini
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index e999dc7..27efa6a 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -2198,7 +2198,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
   #else
  nested_vmx_exit_ctls_high = 0;
   #endif
  -   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  +   nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
  + VM_EXIT_LOAD_IA32_EFER);
 
 Gleb, why we don't need to check whether host supports
 VM_EXIT_LOAD_IA32_EFER here, as what you noted in my
 VM_EXIT_LOAD_IA32_PAT patch?

The host can also emulate VM_EXIT_LOAD_IA32_EFER using the
VM-exit MSR-load feature.  If neither EFER save/load nor
MSR save/load are available, I believe you are right and
the feature should not be available in nested VMX.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] KVM: nVMX: Advertise IA32_PAT in VM exit control

2013-08-05 Thread Paolo Bonzini


- Original Message -
 From: Arthur Chunqi Li yzt...@gmail.com
 To: kvm@vger.kernel.org
 Cc: jan kiszka jan.kis...@web.de, g...@redhat.com, pbonz...@redhat.com, 
 Arthur Chunqi Li yzt...@gmail.com
 Sent: Monday, August 5, 2013 1:18:13 PM
 Subject: [PATCH v2] KVM: nVMX: Advertise IA32_PAT in VM exit control
 
 Advertise VM_EXIT_SAVE_IA32_PAT and VM_EXIT_LOAD_IA32_PAT.
 
 Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
 ---
 
 ChangeLog to v1:
   Add host features check. Format is the same as entry controls.
 
  arch/x86/kvm/vmx.c |   10 ++
  1 file changed, 6 insertions(+), 4 deletions(-)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 396572d..fd451c1 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -2191,14 +2191,16 @@ static __init void nested_vmx_setup_ctls_msrs(void)
* If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
* 17 must be 1.
*/
 + rdmsr(MSR_IA32_VMX_EXIT_CTLS,
 + nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
   nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 + nested_vmx_exit_ctls_high =
 + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
 + nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
   /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
  #ifdef CONFIG_X86_64
 - nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
 -#else
 - nested_vmx_exit_ctls_high = 0;
 + nested_vmx_exit_ctls_high |= VM_EXIT_HOST_ADDR_SPACE_SIZE;

I think you should clear the bit if !X86_64 (and not do anything if
X86_64, the bit should be already set in MSR_IA32_VMX_EXIT_CTLS).

Paolo

  #endif
 - nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  
   /* entry controls */
   rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
 --
 1.7.9.5
 
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 01/15] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-08-05 Thread Gleb Natapov
On Mon, Aug 05, 2013 at 07:48:54AM -0400, Paolo Bonzini wrote:
   diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
   index e999dc7..27efa6a 100644
   --- a/arch/x86/kvm/vmx.c
   +++ b/arch/x86/kvm/vmx.c
   @@ -2198,7 +2198,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
#else
   nested_vmx_exit_ctls_high = 0;
#endif
   -   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
   +   nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
   + VM_EXIT_LOAD_IA32_EFER);
  
  Gleb, why we don't need to check whether host supports
  VM_EXIT_LOAD_IA32_EFER here, as what you noted in my
  VM_EXIT_LOAD_IA32_PAT patch?
 
 The host can also emulate VM_EXIT_LOAD_IA32_EFER using the
 VM-exit MSR-load feature.  If neither EFER save/load nor
 MSR save/load are available, I believe you are right and
 the feature should not be available in nested VMX.
As far as I can tell MSR save/load is not optional.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] KVM: nVMX: Advertise IA32_PAT in VM exit control

2013-08-05 Thread Gleb Natapov
On Mon, Aug 05, 2013 at 08:03:31AM -0400, Paolo Bonzini wrote:
 
 
 - Original Message -
  From: Arthur Chunqi Li yzt...@gmail.com
  To: kvm@vger.kernel.org
  Cc: jan kiszka jan.kis...@web.de, g...@redhat.com, pbonz...@redhat.com, 
  Arthur Chunqi Li yzt...@gmail.com
  Sent: Monday, August 5, 2013 1:18:13 PM
  Subject: [PATCH v2] KVM: nVMX: Advertise IA32_PAT in VM exit control
  
  Advertise VM_EXIT_SAVE_IA32_PAT and VM_EXIT_LOAD_IA32_PAT.
  
  Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
  ---
  
  ChangeLog to v1:
  Add host features check. Format is the same as entry controls.
  
   arch/x86/kvm/vmx.c |   10 ++
   1 file changed, 6 insertions(+), 4 deletions(-)
  
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index 396572d..fd451c1 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -2191,14 +2191,16 @@ static __init void nested_vmx_setup_ctls_msrs(void)
   * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
   * 17 must be 1.
   */
  +   rdmsr(MSR_IA32_VMX_EXIT_CTLS,
  +   nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
  nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  +   nested_vmx_exit_ctls_high =
  +   VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
  +   nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
  /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
   #ifdef CONFIG_X86_64
  -   nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
  -#else
  -   nested_vmx_exit_ctls_high = 0;
  +   nested_vmx_exit_ctls_high |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 
 I think you should clear the bit if !X86_64 (and not do anything if
 X86_64, the bit should be already set in MSR_IA32_VMX_EXIT_CTLS).
 
Yes, we do not support 64 bit guests on 32bit host. It means that 32 bit
host is essentially processors that do not support Intel 64 architecture,
so the control should be zero.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm-unit-tests: VMX: Separate host and guest rflags

2013-08-05 Thread Arthur Chunqi Li
Separate host_rflags and guest_rflags (regs.rflags used for guest).
Fix bug of set/get guest rflags when vmenter/vmexit.

Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
---
 x86/vmx.c |   11 +++
 x86/vmx.h |4 ++--
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/x86/vmx.c b/x86/vmx.c
index 7467927..7b28aca 100644
--- a/x86/vmx.c
+++ b/x86/vmx.c
@@ -19,6 +19,7 @@ struct regs regs;
 struct vmx_test *current;
 u64 hypercall_field = 0;
 bool launched;
+u64 host_rflags;
 
 extern u64 gdt64_desc[];
 extern u64 idt_descr[];
@@ -440,12 +441,14 @@ static int exit_handler()
int ret;
 
current-exits++;
+   regs.rflags = vmcs_read(GUEST_RFLAGS);
current-guest_regs = regs;
if (is_hypercall())
ret = handle_hypercall();
else
ret = current-exit_handler();
regs = current-guest_regs;
+   vmcs_write(GUEST_RFLAGS, regs.rflags);
switch (ret) {
case VMX_TEST_VMEXIT:
case VMX_TEST_RESUME:
@@ -505,15 +508,15 @@ static int vmx_run()
return 0;
case VMX_TEST_LAUNCH_ERR:
printf(%s : vmlaunch failed.\n, __func__);
-   if ((!(regs.rflags & X86_EFLAGS_CF) && !(regs.rflags & 
X86_EFLAGS_ZF))
-   || ((regs.rflags & X86_EFLAGS_CF) && (regs.rflags & 
X86_EFLAGS_ZF)))
+   if ((!(host_rflags & X86_EFLAGS_CF) && !(host_rflags & 
X86_EFLAGS_ZF))
+   || ((host_rflags & X86_EFLAGS_CF) && (host_rflags & 
X86_EFLAGS_ZF)))
printf(\tvmlaunch set wrong flags\n);
report(test vmlaunch, 0);
break;
case VMX_TEST_RESUME_ERR:
printf(%s : vmresume failed.\n, __func__);
-   if ((!(regs.rflags & X86_EFLAGS_CF) && !(regs.rflags & 
X86_EFLAGS_ZF))
-   || ((regs.rflags & X86_EFLAGS_CF) && (regs.rflags & 
X86_EFLAGS_ZF)))
+   if ((!(host_rflags & X86_EFLAGS_CF) && !(host_rflags & 
X86_EFLAGS_ZF))
+   || ((host_rflags & X86_EFLAGS_CF) && (host_rflags & 
X86_EFLAGS_ZF)))
printf(\tvmresume set wrong flags\n);
report(test vmresume, 0);
break;
diff --git a/x86/vmx.h b/x86/vmx.h
index 1fb9738..d4f979c 100644
--- a/x86/vmx.h
+++ b/x86/vmx.h
@@ -403,10 +403,10 @@ enum Ctrl1 {
 
 #define SAVE_RFLAGS\
	"pushf\n\t" \
-	"pop regs+0x80\n\t"
+	"pop host_rflags\n\t"

 #define LOAD_RFLAGS\
-	"push regs+0x80\n\t"	\
+	"push host_rflags\n\t"	\
	"popf\n\t"
 
 #define VMX_IO_SIZE_MASK   0x7
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/7] KVM: nEPT: Advertise WB type EPTP

2013-08-05 Thread Gleb Natapov
On Sun, Aug 04, 2013 at 07:49:00PM +0200, Jan Kiszka wrote:
 From: Jan Kiszka jan.kis...@siemens.com
 
 At least WB must be possible.
 
 Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Reviewed-by: Gleb Natapov g...@redhat.com

 ---
  arch/x86/kvm/vmx.c |4 ++--
  1 files changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index e51bf4a..53050a0 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -2250,8 +2250,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
   if (enable_ept) {
   /* nested EPT: emulate EPT also to L1 */
   nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
 - nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT;
 - nested_vmx_ept_caps |= VMX_EPT_INVEPT_BIT;
 + nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
 +  VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
   nested_vmx_ept_caps = vmx_capability.ept;
   /*
* Since invept is completely emulated we support both global
 -- 
 1.7.3.4

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] Initial skeleton of VFIO support for Device Tree based devices

2013-08-05 Thread Antonios Motakis
Platform devices in the Linux kernel are usually managed by the DT
interface. This patch forms the base to support these kind of devices
with VFIO.

Signed-off-by: Antonios Motakis a.mota...@virtualopensystems.com
---
 drivers/vfio/Kconfig  |  10 +++
 drivers/vfio/Makefile |   1 +
 drivers/vfio/vfio_dt.c| 187 ++
 include/uapi/linux/vfio.h |   1 +
 4 files changed, 199 insertions(+)
 create mode 100644 drivers/vfio/vfio_dt.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 1f84eda..a77a2e4 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -13,4 +13,14 @@ menuconfig VFIO
 
  If you don't know what to do here, say N.
 
+config VFIO_DT
+   tristate "VFIO support for Device Tree devices"
+   depends on VFIO && EVENTFD
+   help
+ Support for the VFIO Device Tree driver.  This is required to make
+ use of platform devices present on Device Tree nodes using the VFIO
+ framework.
+
+ If you don't know what to do here, say N.
+
 source drivers/vfio/pci/Kconfig
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..d599a67 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
 obj-$(CONFIG_VFIO_PCI) += pci/
+obj-$(CONFIG_VFIO_DT) += vfio_dt.o
diff --git a/drivers/vfio/vfio_dt.c b/drivers/vfio/vfio_dt.c
new file mode 100644
index 000..ad4d31d
--- /dev/null
+++ b/drivers/vfio/vfio_dt.c
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis a.mota...@virtualopensystems.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include linux/device.h
+#include linux/eventfd.h
+#include linux/interrupt.h
+#include linux/iommu.h
+#include linux/module.h
+#include linux/mutex.h
+#include linux/notifier.h
+#include linux/pm_runtime.h
+#include linux/slab.h
+#include linux/types.h
+#include linux/uaccess.h
+#include linux/vfio.h
+
+#define DRIVER_VERSION  0.1
+#define DRIVER_AUTHOR   Antonios Motakis a.mota...@virtualopensystems.com
+#define DRIVER_DESC VFIO Device Tree devices - User Level meta-driver
+
+struct vfio_dt_device {
+   struct platform_device  *pdev;
+};
+
+static void vfio_dt_release(void *device_data)
+{
+   module_put(THIS_MODULE);
+}
+
+static int vfio_dt_open(void *device_data)
+{
+   if (!try_module_get(THIS_MODULE))
+   return -ENODEV;
+
+   return 0;
+}
+
+static long vfio_dt_ioctl(void *device_data,
+  unsigned int cmd, unsigned long arg)
+{
+   struct vfio_dt_device *vdev = device_data;
+   unsigned long minsz;
+
+   if (cmd == VFIO_DEVICE_GET_INFO) {
+   struct vfio_device_info info;
+
+   minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+   if (copy_from_user(info, (void __user *)arg, minsz))
+   return -EFAULT;
+
   if (info.argsz < minsz)
+   return -EINVAL;
+
+   info.flags = VFIO_DEVICE_FLAGS_DT;
+   info.num_regions = 0;
+   info.num_irqs = 0;
+
+   return copy_to_user((void __user *)arg, info, minsz);
+
+   } else if (cmd == VFIO_DEVICE_GET_REGION_INFO)
+   return -EINVAL;
+
+   else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
+   return -EINVAL;
+
+   else if (cmd == VFIO_DEVICE_SET_IRQS)
+   return -EINVAL;
+
+   else if (cmd == VFIO_DEVICE_RESET)
+   return -EINVAL;
+
+   return -ENOTTY;
+}
+
+static ssize_t vfio_dt_read(void *device_data, char __user *buf,
+size_t count, loff_t *ppos)
+{
+   return 0;
+}
+
+static ssize_t vfio_dt_write(void *device_data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+   return 0;
+}
+
+static int vfio_dt_mmap(void *device_data, struct vm_area_struct *vma)
+{
+   return -EINVAL;
+}
+
+static const struct vfio_device_ops vfio_dt_ops = {
+   .name   = vfio-dts,
+   .open   = vfio_dt_open,
+   .release= vfio_dt_release,
+   .ioctl  = vfio_dt_ioctl,
+   .read   = vfio_dt_read,
+   .write  = vfio_dt_write,
+   .mmap   = 

[PATCH 3/3] Return info for device and its memory regions and interrupts

2013-08-05 Thread Antonios Motakis
A VFIO userspace driver will start by opening the VFIO device
that corresponds to an IOMMU group, and will use the ioctl interface
to get the basic device info, such as number of memory regions and
interrupts, and their properties.

This patch implements the IOCTLs:
 - VFIO_DEVICE_GET_INFO
 - VFIO_DEVICE_GET_REGION_INFO
 - VFIO_DEVICE_GET_IRQ_INFO

Signed-off-by: Antonios Motakis a.mota...@virtualopensystems.com
---
 drivers/vfio/vfio_dt.c | 60 --
 1 file changed, 53 insertions(+), 7 deletions(-)

diff --git a/drivers/vfio/vfio_dt.c b/drivers/vfio/vfio_dt.c
index ad4d31d..817c552 100644
--- a/drivers/vfio/vfio_dt.c
+++ b/drivers/vfio/vfio_dt.c
@@ -28,6 +28,10 @@
 #include linux/types.h
 #include linux/uaccess.h
 #include linux/vfio.h
+#include linux/of.h
+#include linux/of_address.h
+#include linux/of_irq.h
+#include linux/of_platform.h
 
 #define DRIVER_VERSION  0.1
 #define DRIVER_AUTHOR   Antonios Motakis a.mota...@virtualopensystems.com
@@ -54,10 +58,13 @@ static long vfio_dt_ioctl(void *device_data,
   unsigned int cmd, unsigned long arg)
 {
struct vfio_dt_device *vdev = device_data;
+   struct device_node *of_node = vdev-pdev-dev.of_node;
unsigned long minsz;
 
if (cmd == VFIO_DEVICE_GET_INFO) {
struct vfio_device_info info;
+   struct resource res;
+   int cnt = 0;
 
minsz = offsetofend(struct vfio_device_info, num_irqs);
 
@@ -68,18 +75,57 @@ static long vfio_dt_ioctl(void *device_data,
return -EINVAL;
 
info.flags = VFIO_DEVICE_FLAGS_DT;
-   info.num_regions = 0;
-   info.num_irqs = 0;
+
+   while (!of_address_to_resource(of_node, cnt, &res))
+   cnt++;
+
+   info.num_regions = cnt;
+
+   info.num_irqs = of_irq_count(of_node);
 
return copy_to_user((void __user *)arg, info, minsz);
 
-   } else if (cmd == VFIO_DEVICE_GET_REGION_INFO)
-   return -EINVAL;
+   } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
+   struct vfio_region_info info;
+   struct resource res;
 
-   else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
-   return -EINVAL;
+   minsz = offsetofend(struct vfio_region_info, offset);
+
+   if (copy_from_user(info, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (info.argsz < minsz)
+   return -EINVAL;
+
+   if(of_address_to_resource(of_node, info.index, &res))
+   return -EINVAL;
+
+   info.offset = res.start;/* map phys addr with offset */
+   info.size = resource_size(res);
+   info.flags = 0;
+
+   return copy_to_user((void __user *)arg, info, minsz);  
+
+   } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
+   struct vfio_irq_info info;
+   struct resource res;
+
+   minsz = offsetofend(struct vfio_irq_info, count);
+
+   if (copy_from_user(info, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (info.argsz < minsz)
+   return -EINVAL;
+
+   of_irq_to_resource(of_node, info.index, &res);
+
+   info.flags = 0;
+   info.count = 1;
+
+   return copy_to_user((void __user *)arg, info, minsz);
 
-   else if (cmd == VFIO_DEVICE_SET_IRQS)
+   } else if (cmd == VFIO_DEVICE_SET_IRQS)
return -EINVAL;
 
else if (cmd == VFIO_DEVICE_RESET)
-- 
1.8.1.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC 0/3] WIP VFIO for device tree devices on Arndale

2013-08-05 Thread Antonios Motakis
This is a very early base work, towards VFIO support on ARM platforms
with an IOMMU. It forms a base on to which to implement the functionality
necessary to enable using device tree devices on ARM (and other platforms
based on device trees) with VFIO.

This very early work of progress is only published for the sake of openness,
and is very far from usable yet. However the driver can bind to devices,
and return to userspace the info about the memory regions and IRQs.

This patch series has been tested on the Arndale board (with the Exynos 5250
System MMU).

It depends on Cho KyongHo's patch series iommu/exynos: Fixes and Enhancements
of System MMU driver with DT, applied on a Linux 3.10.1 kernel, and also my
own iommu/exynos: add devices attached to the System MMU to an IOMMU group.
Those patches are required at least in order to test the proposed module on
Arndale.

This should not be treated as anything more than a work in progress. Numerous
functions still need to be implemented properly, e.g.
 - Proper binding of the VFIO_DT driver to devices; currently to test the
   driver, one has to edit the device tree and add the vfio-dt to the
   compatible property. However Linux does not support OF drivers that
   can be dynamically bound to any device.
 - Most IOCTLs are not implemented yet. Memory region mapping, DMA mapping,
   IRQFD still need to be added.
 - The VFIO_IOMMU_TYPE1 is patched to work instead of PCI IOMMU, with platform
   IOMMUs such as the one that is found on Arndale. This is a proof of concept
   hack, and a more permanent fix will be proposed as the code matures.

The API used is identical to the existing VFIO API that is also used with
PCI devices. Only devices that include a basic set of IRQs and memory regions
are targeted; devices with complicated relationships with other devices on the
device tree are not taken into account at this stage.

The API is not extended with device tree specific information; this would
complicate the driver unnecessarily as it is not needed for the base use cases.

Antonios Motakis (3):
  VFIO_IOMMU_TYPE1 workaround to build for platform devices
  Initial skeleton of VFIO support for Device Tree based devices
  Return info for device and its memory regions and interrupts

 drivers/vfio/Kconfig|  12 ++-
 drivers/vfio/Makefile   |   1 +
 drivers/vfio/vfio_dt.c  | 233 
 drivers/vfio/vfio_iommu_type1.c |  15 ++-
 include/uapi/linux/vfio.h   |   1 +
 5 files changed, 258 insertions(+), 4 deletions(-)
 create mode 100644 drivers/vfio/vfio_dt.c

-- 
1.8.1.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] VFIO_IOMMU_TYPE1 workaround to build for platform devices

2013-08-05 Thread Antonios Motakis
This is a workaround to make the VFIO_IOMMU_TYPE1 driver usable with
platform devices instead of PCI. A future permanent fix should support
both. This is required in order to use the Exynos SMMU, or the ARM SMMU
driver with VFIO.

Signed-off-by: Antonios Motakis a.mota...@virtualopensystems.com
---
 drivers/vfio/Kconfig|  2 +-
 drivers/vfio/vfio_iommu_type1.c | 15 ---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..1f84eda 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -6,7 +6,7 @@ config VFIO_IOMMU_TYPE1
 menuconfig VFIO
tristate VFIO Non-Privileged userspace driver framework
depends on IOMMU_API
-   select VFIO_IOMMU_TYPE1 if X86
+   select VFIO_IOMMU_TYPE1 if X86 || ARM
help
  VFIO provides a framework for secure userspace device drivers.
  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 6f3fbc4..4339603 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -30,7 +30,8 @@
 #include linux/iommu.h
 #include linux/module.h
 #include linux/mm.h
-#include linux/pci.h /* pci_bus_type */
+#include linux/pci.h /* pci_bus_type */
+#include linux/platform_device.h /* platform_bus_type */
 #include linux/sched.h
 #include linux/slab.h
 #include linux/uaccess.h
@@ -610,9 +611,17 @@ static void *vfio_iommu_type1_open(unsigned long arg)
mutex_init(iommu-lock);
 
/*
+* ARM SMMU compatibility workaround
+*/
+   iommu-domain = iommu_domain_alloc(platform_bus_type);
+   if (iommu-domain)
+   return iommu;
+
+   /*
 * Wish we didn't have to know about bus_type here.
 */
-   iommu-domain = iommu_domain_alloc(pci_bus_type);
+   //iommu-domain = iommu_domain_alloc(pci_bus_type);
+
if (!iommu-domain) {
kfree(iommu);
return ERR_PTR(-EIO);
@@ -733,7 +742,7 @@ static const struct vfio_iommu_driver_ops 
vfio_iommu_driver_ops_type1 = {
 
 static int __init vfio_iommu_type1_init(void)
 {
-   if (!iommu_present(pci_bus_type))
+   if (!iommu_present(platform_bus_type))
return -ENODEV;
 
return vfio_register_iommu_driver(vfio_iommu_driver_ops_type1);
-- 
1.8.1.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] mips/kvm: Improve code formatting in arch/mips/kvm/kvm_locore.S

2013-08-05 Thread James Hogan
Hi David,

On 01/08/13 21:22, David Daney wrote:
 From: David Daney david.da...@cavium.com
 
 No code changes, just reflowing some comments and consistently using
 tabs and spaces.  Object code is verified to be unchanged.
 
 Signed-off-by: David Daney david.da...@cavium.com
 Acked-by: Ralf Baechle r...@linux-mips.org


 +  /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register 
 */

git am detects a whitespace error here (space before tab in indent).
It's got spaces before and after the tab actually.

  /* load the guest context from VCPU and return */

this comment could have it's indentation fixed too

Otherwise, for all 3 patches:

Reviewed-by: James Hogan james.ho...@imgtec.com

Thanks
James

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm-unit-tests: VMX: Separate host and guest rflags

2013-08-05 Thread Gleb Natapov
On Mon, Aug 05, 2013 at 08:43:25PM +0800, Arthur Chunqi Li wrote:
 Separate host_rflags and guest_rflags (regs.rflags used for guest).
 Fix bug of set/get guest rflags when vmenter/vmexit.
 
 Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
 ---
  x86/vmx.c |   11 +++
  x86/vmx.h |4 ++--
  2 files changed, 9 insertions(+), 6 deletions(-)
 
 diff --git a/x86/vmx.c b/x86/vmx.c
 index 7467927..7b28aca 100644
 --- a/x86/vmx.c
 +++ b/x86/vmx.c
 @@ -19,6 +19,7 @@ struct regs regs;
  struct vmx_test *current;
  u64 hypercall_field = 0;
  bool launched;
 +u64 host_rflags;
  
Can't you define in on stack?

  extern u64 gdt64_desc[];
  extern u64 idt_descr[];
 @@ -440,12 +441,14 @@ static int exit_handler()
   int ret;
  
   current-exits++;
 + regs.rflags = vmcs_read(GUEST_RFLAGS);
   current-guest_regs = regs;
   if (is_hypercall())
   ret = handle_hypercall();
   else
   ret = current-exit_handler();
   regs = current-guest_regs;
 + vmcs_write(GUEST_RFLAGS, regs.rflags);
   switch (ret) {
   case VMX_TEST_VMEXIT:
   case VMX_TEST_RESUME:
 @@ -505,15 +508,15 @@ static int vmx_run()
   return 0;
   case VMX_TEST_LAUNCH_ERR:
   printf(%s : vmlaunch failed.\n, __func__);
 - if ((!(regs.rflags  X86_EFLAGS_CF)  !(regs.rflags  
 X86_EFLAGS_ZF))
 - || ((regs.rflags  X86_EFLAGS_CF)  (regs.rflags  
 X86_EFLAGS_ZF)))
 + if ((!(host_rflags  X86_EFLAGS_CF)  !(host_rflags  
 X86_EFLAGS_ZF))
 + || ((host_rflags  X86_EFLAGS_CF)  (host_rflags  
 X86_EFLAGS_ZF)))
   printf(\tvmlaunch set wrong flags\n);
   report(test vmlaunch, 0);
   break;
   case VMX_TEST_RESUME_ERR:
   printf(%s : vmresume failed.\n, __func__);
 - if ((!(regs.rflags  X86_EFLAGS_CF)  !(regs.rflags  
 X86_EFLAGS_ZF))
 - || ((regs.rflags  X86_EFLAGS_CF)  (regs.rflags  
 X86_EFLAGS_ZF)))
 + if ((!(host_rflags  X86_EFLAGS_CF)  !(host_rflags  
 X86_EFLAGS_ZF))
 + || ((host_rflags  X86_EFLAGS_CF)  (host_rflags  
 X86_EFLAGS_ZF)))
   printf(\tvmresume set wrong flags\n);
   report(test vmresume, 0);
   break;
 diff --git a/x86/vmx.h b/x86/vmx.h
 index 1fb9738..d4f979c 100644
 --- a/x86/vmx.h
 +++ b/x86/vmx.h
 @@ -403,10 +403,10 @@ enum Ctrl1 {
  
  #define SAVE_RFLAGS  \
   pushf\n\t \
 - pop regs+0x80\n\t
 + pop host_rflags\n\t
  
  #define LOAD_RFLAGS  \
 - push regs+0x80\n\t\
 + push host_rflags\n\t  \
   popf\n\t
  
  #define VMX_IO_SIZE_MASK 0x7
 -- 
 1.7.9.5

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] mips/kvm: Improve code formatting in arch/mips/kvm/kvm_locore.S

2013-08-05 Thread Ralf Baechle
On Mon, Aug 05, 2013 at 02:17:01PM +0100, James Hogan wrote:

 
 On 01/08/13 21:22, David Daney wrote:
  From: David Daney david.da...@cavium.com
  
  No code changes, just reflowing some comments and consistently using
  tabs and spaces.  Object code is verified to be unchanged.
  
  Signed-off-by: David Daney david.da...@cavium.com
  Acked-by: Ralf Baechle r...@linux-mips.org
 
 
  +/* Put the saved pointer to vcpu (s1) back into the DDATA_LO 
  Register */
 
 git am detects a whitespace error here (space before tab in indent).
 It's got spaces before and after the tab actually.
 
   /* load the guest context from VCPU and return */
 
 this comment could have it's indentation fixed too
 
 Otherwise, for all 3 patches:
 
 Reviewed-by: James Hogan james.ho...@imgtec.com

I'm happy with the patch series as well and will fix this issue when
applying the patch.

  Ralf
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] Initial skeleton of VFIO support for Device Tree based devices

2013-08-05 Thread Mark Rutland
[adding DT maintainers to Cc]

On Mon, Aug 05, 2013 at 02:17:11PM +0100, Antonios Motakis wrote:
 Platform devices in the Linux kernel are usually managed by the DT
 interface. This patch forms the base to support these kind of devices
 with VFIO.
 
 Signed-off-by: Antonios Motakis a.mota...@virtualopensystems.com
 ---
  drivers/vfio/Kconfig  |  10 +++
  drivers/vfio/Makefile |   1 +
  drivers/vfio/vfio_dt.c| 187 
 ++
  include/uapi/linux/vfio.h |   1 +
  4 files changed, 199 insertions(+)
  create mode 100644 drivers/vfio/vfio_dt.c
 
 diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
 index 1f84eda..a77a2e4 100644
 --- a/drivers/vfio/Kconfig
 +++ b/drivers/vfio/Kconfig
 @@ -13,4 +13,14 @@ menuconfig VFIO
  
 If you don't know what to do here, say N.
  
 +config VFIO_DT
 + tristate VFIO support for Device Tree devices
 + depends on VFIO  EVENTFD
 + help
 +   Support for the VFIO Device Tree driver.  This is required to make
 +   use of platform devices present on Device Tree nodes using the VFIO
 +   framework.
 +
 +   If you don't know what to do here, say N.
 +
  source drivers/vfio/pci/Kconfig
 diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
 index 2398d4a..d599a67 100644
 --- a/drivers/vfio/Makefile
 +++ b/drivers/vfio/Makefile
 @@ -1,3 +1,4 @@
  obj-$(CONFIG_VFIO) += vfio.o
  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
  obj-$(CONFIG_VFIO_PCI) += pci/
 +obj-$(CONFIG_VFIO_DT) += vfio_dt.o
 diff --git a/drivers/vfio/vfio_dt.c b/drivers/vfio/vfio_dt.c
 new file mode 100644
 index 000..ad4d31d
 --- /dev/null
 +++ b/drivers/vfio/vfio_dt.c
 @@ -0,0 +1,187 @@
 +/*
 + * Copyright (C) 2013 - Virtual Open Systems
 + * Author: Antonios Motakis a.mota...@virtualopensystems.com
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License, version 2, as
 + * published by the Free Software Foundation.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License
 + * along with this program; if not, write to the Free Software
 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 + */
 +
 +#include linux/device.h
 +#include linux/eventfd.h
 +#include linux/interrupt.h
 +#include linux/iommu.h
 +#include linux/module.h
 +#include linux/mutex.h
 +#include linux/notifier.h
 +#include linux/pm_runtime.h
 +#include linux/slab.h
 +#include linux/types.h
 +#include linux/uaccess.h
 +#include linux/vfio.h
 +
 +#define DRIVER_VERSION  0.1
 +#define DRIVER_AUTHOR   Antonios Motakis a.mota...@virtualopensystems.com
 +#define DRIVER_DESC VFIO Device Tree devices - User Level meta-driver
 +
 +struct vfio_dt_device {
 + struct platform_device  *pdev;
 +};
 +
 +static void vfio_dt_release(void *device_data)
 +{
 + module_put(THIS_MODULE);
 +}
 +
 +static int vfio_dt_open(void *device_data)
 +{
 + if (!try_module_get(THIS_MODULE))
 + return -ENODEV;
 +
 + return 0;
 +}
 +
 +static long vfio_dt_ioctl(void *device_data,
 +unsigned int cmd, unsigned long arg)
 +{
 + struct vfio_dt_device *vdev = device_data;
 + unsigned long minsz;
 +
 + if (cmd == VFIO_DEVICE_GET_INFO) {
 + struct vfio_device_info info;
 +
 + minsz = offsetofend(struct vfio_device_info, num_irqs);
 +
 + if (copy_from_user(info, (void __user *)arg, minsz))
 + return -EFAULT;
 +
 + if (info.argsz  minsz)
 + return -EINVAL;
 +
 + info.flags = VFIO_DEVICE_FLAGS_DT;
 + info.num_regions = 0;
 + info.num_irqs = 0;
 +
 + return copy_to_user((void __user *)arg, info, minsz);
 +
 + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO)
 + return -EINVAL;
 +
 + else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
 + return -EINVAL;
 +
 + else if (cmd == VFIO_DEVICE_SET_IRQS)
 + return -EINVAL;
 +
 + else if (cmd == VFIO_DEVICE_RESET)
 + return -EINVAL;
 +
 + return -ENOTTY;
 +}
 +
 +static ssize_t vfio_dt_read(void *device_data, char __user *buf,
 +  size_t count, loff_t *ppos)
 +{
 + return 0;
 +}
 +
 +static ssize_t vfio_dt_write(void *device_data, const char __user *buf,
 +   size_t count, loff_t *ppos)
 +{
 + return 0;
 +}
 +
 +static int vfio_dt_mmap(void *device_data, struct vm_area_struct *vma)
 +{
 + return -EINVAL;
 +}
 +
 +static const struct vfio_device_ops vfio_dt_ops = {
 + .name   = vfio-dts,
 + .open   = vfio_dt_open,
 + .release= 

Re: [PATCH 1/3] mips/kvm: Improve code formatting in arch/mips/kvm/kvm_locore.S

2013-08-05 Thread Gleb Natapov
On Mon, Aug 05, 2013 at 03:21:57PM +0200, Ralf Baechle wrote:
 On Mon, Aug 05, 2013 at 02:17:01PM +0100, James Hogan wrote:
 
  
  On 01/08/13 21:22, David Daney wrote:
   From: David Daney david.da...@cavium.com
   
   No code changes, just reflowing some comments and consistently using
   tabs and spaces.  Object code is verified to be unchanged.
   
   Signed-off-by: David Daney david.da...@cavium.com
   Acked-by: Ralf Baechle r...@linux-mips.org
  
  
   +  /* Put the saved pointer to vcpu (s1) back into the DDATA_LO 
   Register */
  
  git am detects a whitespace error here (space before tab in indent).
  It's got spaces before and after the tab actually.
  
/* load the guest context from VCPU and return */
  
  this comment could have it's indentation fixed too
  
  Otherwise, for all 3 patches:
  
  Reviewed-by: James Hogan james.ho...@imgtec.com
 
 I'm happy with the patch series as well and will fix this issue when
 applying the patch.
 
kvm fixes usually go through kvm.git tree for all arches. Any special
reasons you want to get those through mips tree?

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm-unit-tests: VMX: Separate host and guest rflags

2013-08-05 Thread Arthur Chunqi Li
On Mon, Aug 5, 2013 at 9:18 PM, Gleb Natapov g...@redhat.com wrote:
 On Mon, Aug 05, 2013 at 08:43:25PM +0800, Arthur Chunqi Li wrote:
 Separate host_rflags and guest_rflags (regs.rflags used for guest).
 Fix bug of set/get guest rflags when vmenter/vmexit.

 Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
 ---
  x86/vmx.c |   11 +++
  x86/vmx.h |4 ++--
  2 files changed, 9 insertions(+), 6 deletions(-)

 diff --git a/x86/vmx.c b/x86/vmx.c
 index 7467927..7b28aca 100644
 --- a/x86/vmx.c
 +++ b/x86/vmx.c
 @@ -19,6 +19,7 @@ struct regs regs;
  struct vmx_test *current;
  u64 hypercall_field = 0;
  bool launched;
 +u64 host_rflags;

 Can't you define in on stack?
Currently I don't use it outside vmx_run(), but it may be used in user
defined exit_handler in the future, so I put it globally.

Arthur

  extern u64 gdt64_desc[];
  extern u64 idt_descr[];
 @@ -440,12 +441,14 @@ static int exit_handler()
   int ret;

   current-exits++;
 + regs.rflags = vmcs_read(GUEST_RFLAGS);
   current-guest_regs = regs;
   if (is_hypercall())
   ret = handle_hypercall();
   else
   ret = current-exit_handler();
   regs = current-guest_regs;
 + vmcs_write(GUEST_RFLAGS, regs.rflags);
   switch (ret) {
   case VMX_TEST_VMEXIT:
   case VMX_TEST_RESUME:
 @@ -505,15 +508,15 @@ static int vmx_run()
   return 0;
   case VMX_TEST_LAUNCH_ERR:
   printf(%s : vmlaunch failed.\n, __func__);
 - if ((!(regs.rflags  X86_EFLAGS_CF)  !(regs.rflags  
 X86_EFLAGS_ZF))
 - || ((regs.rflags  X86_EFLAGS_CF)  (regs.rflags  
 X86_EFLAGS_ZF)))
 + if ((!(host_rflags  X86_EFLAGS_CF)  !(host_rflags  
 X86_EFLAGS_ZF))
 + || ((host_rflags  X86_EFLAGS_CF)  (host_rflags  
 X86_EFLAGS_ZF)))
   printf(\tvmlaunch set wrong flags\n);
   report(test vmlaunch, 0);
   break;
   case VMX_TEST_RESUME_ERR:
   printf(%s : vmresume failed.\n, __func__);
 - if ((!(regs.rflags  X86_EFLAGS_CF)  !(regs.rflags  
 X86_EFLAGS_ZF))
 - || ((regs.rflags  X86_EFLAGS_CF)  (regs.rflags  
 X86_EFLAGS_ZF)))
 + if ((!(host_rflags  X86_EFLAGS_CF)  !(host_rflags  
 X86_EFLAGS_ZF))
 + || ((host_rflags  X86_EFLAGS_CF)  (host_rflags  
 X86_EFLAGS_ZF)))
   printf(\tvmresume set wrong flags\n);
   report(test vmresume, 0);
   break;
 diff --git a/x86/vmx.h b/x86/vmx.h
 index 1fb9738..d4f979c 100644
 --- a/x86/vmx.h
 +++ b/x86/vmx.h
 @@ -403,10 +403,10 @@ enum Ctrl1 {

  #define SAVE_RFLAGS  \
   pushf\n\t \
 - pop regs+0x80\n\t
 + pop host_rflags\n\t

  #define LOAD_RFLAGS  \
 - push regs+0x80\n\t\
 + push host_rflags\n\t  \
   popf\n\t

  #define VMX_IO_SIZE_MASK 0x7
 --
 1.7.9.5

 --
 Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] Initial skeleton of VFIO support for Device Tree based devices

2013-08-05 Thread Mark Rutland
   +static const struct of_device_id vfio_dt_match[] = {
   + � � /* In the future, we can implement a better mechanism to bind
  the
   + � � �* module to any device. For now add the compatible property to
  the
   + � � �* dtb of the devices we want to use. � */
   + � � {
   + � � � � � � .compatible = vfio-dt,
   + � � },
   + � � {},
   +};
 
  This definitely doesn't belong in the dt. It's purely a Linux
  abstraction and does not represent a piece of hardware or common
  interface.
 
  We need to think of a better mechanism for binding the module to these
  devices now.
 
I already make this remark in the cover letter; thanks for confirming it.
�
Antonios

Sorry, I found the cover letter a little unclear in that regard.

Thanks for the clarification :)

Mark.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC V11 15/18] kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor

2013-08-05 Thread Ingo Molnar

* Gleb Natapov g...@redhat.com wrote:

 On Mon, Aug 05, 2013 at 11:46:03AM +0200, Ingo Molnar wrote:
  Acked-by: Ingo Molnar mi...@kernel.org
  
  I guess you'd want to carry this in the KVM tree or so - maybe in a 
  separate branch because it changes Xen as well?
 
 It changes KVM host and guest side, XEN and common x86 spinlock code. I 
 think it would be best to merge common x86 spinlock bits and guest side 
 KVM/XEN bits through tip tree and host KVM part will go through KVM 
 tree. If this is OK with you, Ingo, and XEN folks Raghavendra can send 
 two separate patch series one for the tip and one for KVM host side.

Sure, that's fine - if the initial series works fine in isolation as well 
(i.e. won't break anything).

Thanks,

Ingo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC V11 15/18] kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor

2013-08-05 Thread Paolo Bonzini
  On Mon, Aug 05, 2013 at 11:46:03AM +0200, Ingo Molnar wrote:
   Acked-by: Ingo Molnar mi...@kernel.org
   
   I guess you'd want to carry this in the KVM tree or so - maybe in a
   separate branch because it changes Xen as well?
  
  It changes KVM host and guest side, XEN and common x86 spinlock code. I
  think it would be best to merge common x86 spinlock bits and guest side
  KVM/XEN bits through tip tree and host KVM part will go through KVM
  tree. If this is OK with you, Ingo, and XEN folks Raghavendra can send
  two separate patch series one for the tip and one for KVM host side.
 
 Sure, that's fine - if the initial series works fine in isolation as well
 (i.e. won't break anything).

It would be a big problem if it didn't!  Raghavendra, please send the
two separate series as Gleb explained above.

Thanks,

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like booke3s

2013-08-05 Thread Bhushan Bharat-R65777


 -Original Message-
 From: Benjamin Herrenschmidt [mailto:b...@kernel.crashing.org]
 Sent: Saturday, August 03, 2013 9:54 AM
 To: Bhushan Bharat-R65777
 Cc: Wood Scott-B07421; ag...@suse.de; kvm-...@vger.kernel.org;
 kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org
 Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like
 booke3s
 
 On Sat, 2013-08-03 at 02:58 +, Bhushan Bharat-R65777 wrote:
  One of the problem I saw was that if I put this code in
  asm/pgtable-32.h and asm/pgtable-64.h then pte_persent() and other
  friend function (on which this code depends) are defined in pgtable.h.
  And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h before it
  defines pte_present() and friends functions.
 
  Ok I move wove this in asm/pgtable*.h, initially I fought with myself
  to take this code in pgtable* but finally end up doing here (got
  biased by book3s :)).
 
 Is there a reason why these routines can not be completely generic in 
 pgtable.h
 ?

How about the generic function:

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index d257d98..21daf28 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -221,6 +221,27 @@ static inline unsigned long pte_update(struct mm_struct 
*mm,
return old;
 }

+static inline unsigned long pte_read(pte_t *p)
+{
+#ifdef PTE_ATOMIC_UPDATES
+   pte_t pte;
+   pte_t tmp;
+   __asm__ __volatile__ (
+   "1: ldarx   %0,0,%3\n"
+   "   andi.   %1,%0,%4\n"
+   "   bne-    1b\n"
+   "   ori     %1,%0,%4\n"
+   "   stdcx.  %1,0,%3\n"
+   "   bne-    1b"
+   : "=&r" (pte), "=&r" (tmp), "=m" (*p)
+   : "r" (p), "i" (_PAGE_BUSY)
+   : "cc");
+
+   return pte;
+#else  
+   return pte_val(*p);
+#endif
+}
 static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
  unsigned long addr, pte_t *ptep)
 {
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 690c8c2..dad712c 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -254,6 +254,45 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t 
*pgdir, unsigned long ea,
 }
 #endif /* !CONFIG_HUGETLB_PAGE */

+static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
+int writing, unsigned long *pte_sizep)
+{
+   pte_t *ptep;
+   pte_t pte;
+   unsigned long ps = *pte_sizep;
+   unsigned int shift;
+
+   ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
+   if (!ptep)
+   return __pte(0);
+   if (shift)
+   *pte_sizep = 1ul << shift;
+   else
+   *pte_sizep = PAGE_SIZE;
+
+   if (ps > *pte_sizep)
+   return __pte(0);
+
+   if (!pte_present(*ptep))
+   return __pte(0);
+
+#ifdef CONFIG_PPC64
+   /* Lock PTE (set _PAGE_BUSY) and read */
+   pte = pte_read(ptep);
+#else
+   pte = pte_val(*ptep);
+#endif
+   if (pte_present(pte)) {
+   pte = pte_mkyoung(pte);
+   if (writing && pte_write(pte))
+   pte = pte_mkdirty(pte);
+   }
+
+   *ptep = __pte(pte); /* 64bit: Also unlock pte (clear _PAGE_BUSY) */
+
+   return pte;
+}
+
 #endif /* __ASSEMBLY__ */

 #endif /* __KERNEL__ */


Re: [PATCH RFC V11 15/18] kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor

2013-08-05 Thread Raghavendra K T

On 08/05/2013 07:35 PM, Paolo Bonzini wrote:

I guess you'd want to carry this in the KVM tree or so - maybe in a
separate branch because it changes Xen as well?


It changes KVM host and guest side, XEN and common x86 spinlock code. I
think it would be best to merge common x86 spinlock bits and guest side
KVM/XEN bits through tip tree and host KVM part will go through KVM
tree. If this is OK with you, Ingo, and XEN folks Raghavendra can send
two separate patch series one for the tip and one for KVM host side.


Sure, that's fine - if the initial series works fine in isolation as well
(i.e. won't break anything).


It would be a big problem if it didn't!  Raghavendra, please send the
two separate series as Gleb explained above.



Yes. Sure.  The patches have been split in that way.

Only thing I am thinking is about KVM_FEATURE_PV_UNHALT, and 
KVM_HC_KICK_CPU definition in the below hunk, that is needed by guest

as well. may be this header file change can be a separate patch so that
duplicate can be handled easily during merge?

I do testing of all combination after splitting and post.

diff --git a/arch/x86/include/uapi/asm/kvm_para.h 
b/arch/x86/include/uapi/asm/kvm_para.h

index 06fdbd9..94dc8ca 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -23,6 +23,7 @@
 #define KVM_FEATURE_ASYNC_PF   4
 #define KVM_FEATURE_STEAL_TIME 5
 #define KVM_FEATURE_PV_EOI 6
+#define KVM_FEATURE_PV_UNHALT  7

diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index cea2c5c..2841f86 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -19,6 +19,7 @@
 #define KVM_HC_MMU_OP  2
 #define KVM_HC_FEATURES3
 #define KVM_HC_PPC_MAP_MAGIC_PAGE  4
+#define KVM_HC_KICK_CPU5


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V2 4/4] x86: correctly detect hypervisor

2013-08-05 Thread Konrad Rzeszutek Wilk
On Mon, Aug 05, 2013 at 11:38:14AM +0800, Jason Wang wrote:
 On 07/25/2013 04:54 PM, Jason Wang wrote:
  We try to handle the hypervisor compatibility mode by detecting hypervisor
  through a specific order. This is not robust, since hypervisors may 
  implement
  each others features.
 
  This patch tries to handle this situation by always choosing the last one 
  in the
  CPUID leaves. This is done by letting .detect() returns a priority instead 
  of
  true/false and just re-using the CPUID leaf where the signature were found 
  as
  the priority (or 1 if it was found by DMI). Then we can just pick 
  hypervisor who
  has the highest priority. Other sophisticated detection method could also be
  implemented on top.
 
  Suggested by H. Peter Anvin and Paolo Bonzini.
 
  Cc: Thomas Gleixner t...@linutronix.de
  Cc: Ingo Molnar mi...@redhat.com
  Cc: H. Peter Anvin h...@zytor.com
  Cc: x...@kernel.org
  Cc: K. Y. Srinivasan k...@microsoft.com
  Cc: Haiyang Zhang haiya...@microsoft.com
  Cc: Konrad Rzeszutek Wilk konrad.w...@oracle.com
  Cc: Jeremy Fitzhardinge jer...@goop.org
  Cc: Doug Covelli dcove...@vmware.com
  Cc: Borislav Petkov b...@suse.de
  Cc: Dan Hecht dhe...@vmware.com
  Cc: Paul Gortmaker paul.gortma...@windriver.com
  Cc: Marcelo Tosatti mtosa...@redhat.com
  Cc: Gleb Natapov g...@redhat.com
  Cc: Paolo Bonzini pbonz...@redhat.com
  Cc: Frederic Weisbecker fweis...@gmail.com
  Cc: linux-ker...@vger.kernel.org
  Cc: de...@linuxdriverproject.org
  Cc: kvm@vger.kernel.org
  Cc: xen-de...@lists.xensource.com
  Cc: virtualizat...@lists.linux-foundation.org
  Signed-off-by: Jason Wang jasow...@redhat.com
  ---
 
 Ping, any comments and acks for this series?

Could you provide me with a git branch so I can test it overnight please?

 
 Thanks
   arch/x86/include/asm/hypervisor.h |2 +-
   arch/x86/kernel/cpu/hypervisor.c  |   15 +++
   arch/x86/kernel/cpu/mshyperv.c|   13 -
   arch/x86/kernel/cpu/vmware.c  |8 
   arch/x86/kernel/kvm.c |6 ++
   arch/x86/xen/enlighten.c  |9 +++--
   6 files changed, 25 insertions(+), 28 deletions(-)
 
  diff --git a/arch/x86/include/asm/hypervisor.h 
  b/arch/x86/include/asm/hypervisor.h
  index 2d4b5e6..e42f758 100644
  --- a/arch/x86/include/asm/hypervisor.h
  +++ b/arch/x86/include/asm/hypervisor.h
  @@ -33,7 +33,7 @@ struct hypervisor_x86 {
  const char  *name;
   
  /* Detection routine */
  -   bool(*detect)(void);
  +   uint32_t(*detect)(void);
   
  /* Adjust CPU feature bits (run once per CPU) */
  void(*set_cpu_features)(struct cpuinfo_x86 *);
  diff --git a/arch/x86/kernel/cpu/hypervisor.c 
  b/arch/x86/kernel/cpu/hypervisor.c
  index 8727921..36ce402 100644
  --- a/arch/x86/kernel/cpu/hypervisor.c
  +++ b/arch/x86/kernel/cpu/hypervisor.c
  @@ -25,11 +25,6 @@
   #include asm/processor.h
   #include asm/hypervisor.h
   
  -/*
  - * Hypervisor detect order.  This is specified explicitly here because
  - * some hypervisors might implement compatibility modes for other
  - * hypervisors and therefore need to be detected in specific sequence.
  - */
   static const __initconst struct hypervisor_x86 * const hypervisors[] =
   {
   #ifdef CONFIG_XEN_PVHVM
  @@ -49,15 +44,19 @@ static inline void __init
   detect_hypervisor_vendor(void)
   {
  const struct hypervisor_x86 *h, * const *p;
  +   uint32_t pri, max_pri = 0;
   
   for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
   h = *p;
   -   if (h->detect()) {
   +   pri = h->detect();
   +   if (pri != 0 && pri > max_pri) {
   +   max_pri = pri;
   x86_hyper = h;
   -   printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
   -   break;
   }
   }
   +
   +   if (max_pri)
   +   printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name);
   }
   
   void init_hypervisor(struct cpuinfo_x86 *c)
  diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
  index 8f4be53..71a39f3 100644
  --- a/arch/x86/kernel/cpu/mshyperv.c
  +++ b/arch/x86/kernel/cpu/mshyperv.c
  @@ -27,20 +27,23 @@
   struct ms_hyperv_info ms_hyperv;
   EXPORT_SYMBOL_GPL(ms_hyperv);
   
  -static bool __init ms_hyperv_platform(void)
  +static uint32_t  __init ms_hyperv_platform(void)
   {
  u32 eax;
  u32 hyp_signature[3];
   
  if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
  -   return false;
  +   return 0;
   
   cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
 &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
    
   -   return eax >= HYPERV_CPUID_MIN &&
   -   eax <= HYPERV_CPUID_MAX &&
   -   !memcmp("Microsoft Hv", hyp_signature, 12);
   +   if (eax >= HYPERV_CPUID_MIN &&
   +   eax <= HYPERV_CPUID_MAX &&
   +   !memcmp("Microsoft Hv", hyp_signature, 12))
  +   return 

Re: [PATCH RFC V11 15/18] kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor

2013-08-05 Thread Paolo Bonzini
 Only thing I am thinking is about KVM_FEATURE_PV_UNHALT, and
 KVM_HC_KICK_CPU definition in the below hunk, that is needed by guest
 as well. may be this header file change can be a separate patch so that
 duplicate can be handled easily during merge?

Sure, good idea.

Paolo

 I do testing of all combination after splitting and post.

 diff --git a/arch/x86/include/uapi/asm/kvm_para.h
 b/arch/x86/include/uapi/asm/kvm_para.h
 index 06fdbd9..94dc8ca 100644
 --- a/arch/x86/include/uapi/asm/kvm_para.h
 +++ b/arch/x86/include/uapi/asm/kvm_para.h
 @@ -23,6 +23,7 @@
   #define KVM_FEATURE_ASYNC_PF   4
   #define KVM_FEATURE_STEAL_TIME 5
   #define KVM_FEATURE_PV_EOI 6
 +#define KVM_FEATURE_PV_UNHALT  7
 
 diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
 index cea2c5c..2841f86 100644
 --- a/include/uapi/linux/kvm_para.h
 +++ b/include/uapi/linux/kvm_para.h
 @@ -19,6 +19,7 @@
   #define KVM_HC_MMU_OP  2
   #define KVM_HC_FEATURES3
   #define KVM_HC_PPC_MAP_MAGIC_PAGE  4
 +#define KVM_HC_KICK_CPU5
 
 
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [RFC 0/3] WIP VFIO for device tree devices on Arndale

2013-08-05 Thread Yoder Stuart-B08248
Thanks for starting work on this.

 -Original Message-
 From: Antonios Motakis [mailto:a.mota...@virtualopensystems.com]
 Sent: Monday, August 05, 2013 8:17 AM
 To: linux-arm-ker...@lists.infradead.org; alex.william...@redhat.com
 Cc: kvm...@lists.cs.columbia.edu; io...@lists.linux-foundation.org;
 linux-samsung-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de;
 Yoder Stuart-B08248; Antonios Motakis
 Subject: [RFC 0/3] WIP VFIO for device tree devices on Arndale

I think we should call this infrastructure vfio for platform
devices.  These devices are on a platform bus, as can be seen 
in /sys/bus/platform.  It is conceivable that there are platform
devices not in a device tree.

Also, the term device tree devices just seems awkward.

 This is a very early base work, towards VFIO support on ARM platforms
 with an IOMMU. It forms a base on to which to implement the functionality
 necessary to enable using device tree devices on ARM (and other platforms
 based on device trees) with VFIO.
 
 This very early work of progress is only published for the sake of
 openness,
 and is very far from usable yet. However the driver can bind to devices,
 and return to userspace the info about the memory regions and IRQs.
 
 This patch series has been tested on the Arndale board (with the Exynos
 5250
 System MMU).
 
 It depends on Cho KyongHo's patch series iommu/exynos: Fixes and
 Enhancements
 of System MMU driver with DT, applied on a Linux 3.10.1 kernel, and also
 my
 own iommu/exynos: add devices attached to the System MMU to an IOMMU
 group.
 Those patches are required at least in order to test the proposed module
 on
 Arndale.
 
 This should not be treated as anything more than a work in progress.
 Numerous
 functions still need to be implemented properly, e.g.
  - Proper binding of the VFIO_DT driver to devices; currently to test the
driver, one has to edit the device tree and add the vfio-dt to the
compatible property. However Linux does not support OF drivers that
can be dynamically bound to any device.

Yes, we really need to solve this.  I need to get more familiar with
the platform device infrastructure in Linux, but it seems at a high level
that it shouldn't be that hard to support dynamic binding.  We have
device drivers that register to handle certain compatible
strings.  And we have device tree parsing code that does platform_device_add()
calls to associate/bind a device to a driver.

What we need is something like the new_id mechanism in PCI where we
can have a platform driver dynamically register to support a new
device type.

Also, note that any Linux driver needs to properly support 'unbinding'
as well.   We can't have the normal driver and vfio competing to
bind to the same device.

  - Most IOCTLs are not implemented yet. Memory region mapping, DMA
 mapping,
IRQFD still need to be added.
  - The VFIO_IOMMU_TYPE1 is patched to work instead of PCI IOMMU, with
 platform
IOMMUs such as the one that is found on Arndale. This is a proof of
 concept
hack, and a more permanent fix will be proposed as the code matures.
 
 The API used is identical to the existing VFIO API that is also used with
 PCI devices. Only devices that include a basic set of IRQs and memory
 regions
 are targeted; devices with complicated relationships with other devices
 on the
 device tree are not taken into account at this stage.
 
 The API is not extended with device tree specific information; this would
 complicate the driver unnecessarily as it is not needed for the base use
 cases.

I would suggest adding a patch to this series that updates 
Documentation/vfio.txt
with any platform device specifics.

For example, it should be stated explicitly that if a device has multiple
regions and multiple IRQs that the index exposed by VFIO is the same
as the index within the associated reg and interrupt properties
in the device tree representation.

.i.e. if a device is represented like this in the device tree:

reg = <0x101e2000 0x1000 0x101e4000 0x1000>;
interrupts = <24 25 26 27>;

region #0 is 0x101e2000, region #1 is 0x101e4000
interrupt #0 is 24, and so on.

Perhaps that seems obvious, but think it is good to state it
explicitly in Documentation/vfio.txt.

 Antonios Motakis (3):
   VFIO_IOMMU_TYPE1 workaround to build for platform devices
   Initial skeleton of VFIO support for Device Tree based devices
   Return info for device and its memory regions and interrupts
 
  drivers/vfio/Kconfig|  12 ++-
  drivers/vfio/Makefile   |   1 +
  drivers/vfio/vfio_dt.c  | 233

would prefer:
drivers/vfio/vfio_platform.c

Thanks,
Stuart

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] Initial skeleton of VFIO support for Device Tree based devices

2013-08-05 Thread Alex Williamson
On Mon, 2013-08-05 at 15:17 +0200, Antonios Motakis wrote:
 Platform devices in the Linux kernel are usually managed by the DT
 interface. This patch forms the base to support these kind of devices
 with VFIO.
 
 Signed-off-by: Antonios Motakis a.mota...@virtualopensystems.com
 ---
  drivers/vfio/Kconfig  |  10 +++
  drivers/vfio/Makefile |   1 +
  drivers/vfio/vfio_dt.c| 187 
 ++
  include/uapi/linux/vfio.h |   1 +
  4 files changed, 199 insertions(+)
  create mode 100644 drivers/vfio/vfio_dt.c
 
 diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
 index 1f84eda..a77a2e4 100644
 --- a/drivers/vfio/Kconfig
 +++ b/drivers/vfio/Kconfig
 @@ -13,4 +13,14 @@ menuconfig VFIO
  
 If you don't know what to do here, say N.
  
 +config VFIO_DT
 + tristate VFIO support for Device Tree devices
  + depends on VFIO && EVENTFD

I think there needs to be another depends item here, this would allow
configuration even on architectures that have no concept of device tree.
Also, do we want to put this in a subdirectory like we've done for pci?
The rest looks like a reasonable start.  Thanks,

Alex

 + help
 +   Support for the VFIO Device Tree driver.  This is required to make
 +   use of platform devices present on Device Tree nodes using the VFIO
 +   framework.
 +
 +   If you don't know what to do here, say N.
 +
  source drivers/vfio/pci/Kconfig
 diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
 index 2398d4a..d599a67 100644
 --- a/drivers/vfio/Makefile
 +++ b/drivers/vfio/Makefile
 @@ -1,3 +1,4 @@
  obj-$(CONFIG_VFIO) += vfio.o
  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
  obj-$(CONFIG_VFIO_PCI) += pci/
 +obj-$(CONFIG_VFIO_DT) += vfio_dt.o
 diff --git a/drivers/vfio/vfio_dt.c b/drivers/vfio/vfio_dt.c
 new file mode 100644
 index 000..ad4d31d
 --- /dev/null
 +++ b/drivers/vfio/vfio_dt.c
 @@ -0,0 +1,187 @@
 +/*
 + * Copyright (C) 2013 - Virtual Open Systems
 + * Author: Antonios Motakis a.mota...@virtualopensystems.com
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License, version 2, as
 + * published by the Free Software Foundation.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License
 + * along with this program; if not, write to the Free Software
 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 + */
 +
 +#include linux/device.h
 +#include linux/eventfd.h
 +#include linux/interrupt.h
 +#include linux/iommu.h
 +#include linux/module.h
 +#include linux/mutex.h
 +#include linux/notifier.h
 +#include linux/pm_runtime.h
 +#include linux/slab.h
 +#include linux/types.h
 +#include linux/uaccess.h
 +#include linux/vfio.h
 +
 +#define DRIVER_VERSION  0.1
 +#define DRIVER_AUTHOR   Antonios Motakis a.mota...@virtualopensystems.com
 +#define DRIVER_DESC VFIO Device Tree devices - User Level meta-driver
 +
 +struct vfio_dt_device {
 + struct platform_device  *pdev;
 +};
 +
 +static void vfio_dt_release(void *device_data)
 +{
 + module_put(THIS_MODULE);
 +}
 +
 +static int vfio_dt_open(void *device_data)
 +{
 + if (!try_module_get(THIS_MODULE))
 + return -ENODEV;
 +
 + return 0;
 +}
 +
 +static long vfio_dt_ioctl(void *device_data,
 +unsigned int cmd, unsigned long arg)
 +{
 + struct vfio_dt_device *vdev = device_data;
 + unsigned long minsz;
 +
 + if (cmd == VFIO_DEVICE_GET_INFO) {
 + struct vfio_device_info info;
 +
 + minsz = offsetofend(struct vfio_device_info, num_irqs);
 +
 + if (copy_from_user(info, (void __user *)arg, minsz))
 + return -EFAULT;
 +
 + if (info.argsz  minsz)
 + return -EINVAL;
 +
 + info.flags = VFIO_DEVICE_FLAGS_DT;
 + info.num_regions = 0;
 + info.num_irqs = 0;
 +
 + return copy_to_user((void __user *)arg, info, minsz);
 +
 + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO)
 + return -EINVAL;
 +
 + else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
 + return -EINVAL;
 +
 + else if (cmd == VFIO_DEVICE_SET_IRQS)
 + return -EINVAL;
 +
 + else if (cmd == VFIO_DEVICE_RESET)
 + return -EINVAL;
 +
 + return -ENOTTY;
 +}
 +
 +static ssize_t vfio_dt_read(void *device_data, char __user *buf,
 +  size_t count, loff_t *ppos)
 +{
 + return 0;
 +}
 +
 +static ssize_t vfio_dt_write(void *device_data, const char __user *buf,
 +   size_t count, loff_t *ppos)
 +{
 + return 0;
 +}
 +
 +static int 

RE: [PATCH 2/3] Initial skeleton of VFIO support for Device Tree based devices

2013-08-05 Thread Yoder Stuart-B08248
 +MODULE_VERSION(DRIVER_VERSION);
 +MODULE_LICENSE(GPL v2);
 +MODULE_AUTHOR(DRIVER_AUTHOR);
 +MODULE_DESCRIPTION(DRIVER_DESC);
 diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
 index 284ff24..1e4bef2 100644
 --- a/include/uapi/linux/vfio.h
 +++ b/include/uapi/linux/vfio.h
 @@ -147,6 +147,7 @@ struct vfio_device_info {
   __u32   flags;
   #define VFIO_DEVICE_FLAGS_RESET  (1 << 0)/* Device supports reset
  */
   #define VFIO_DEVICE_FLAGS_PCI(1 << 1)/* vfio-pci device */
  +#define VFIO_DEVICE_FLAGS_DT (1 << 2)/* vfio-dt device */
   __u32   num_regions;/* Max region index + 1 */
   __u32   num_irqs;   /* Max IRQ index + 1 */
  };

In the RFC I sent out the proposed flags were:

   #define VFIO_DEVICE_FLAGS_PLATFORM (1 << ?) /* A platform bus device */
   #define VFIO_DEVICE_FLAGS_DEVTREE  (1 << ?) /* device tree info available */

Since you are only implementing the first part, the platform
device support, just call it:  VFIO_DEVICE_FLAGS_PLATFORM

I think the 'platform' term is more accurate as we are providing
a mechanism to expose devices on the /sys/bus/platform bus to
user space.

Stuart

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V2 4/4] x86: correctly detect hypervisor

2013-08-05 Thread H. Peter Anvin
On 08/05/2013 07:34 AM, Konrad Rzeszutek Wilk wrote:
 
 Could you provide me with a git branch so I can test it overnight please?
 

Pull tip:x86/paravirt.

-hpa


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC V11 15/18] kvm : Paravirtual ticketlocks support for linux guests running on KVM hypervisor

2013-08-05 Thread Konrad Rzeszutek Wilk
On Mon, Aug 05, 2013 at 11:46:03AM +0200, Ingo Molnar wrote:
 
 * Gleb Natapov g...@redhat.com wrote:
 
  On Fri, Aug 02, 2013 at 11:25:39AM +0200, Ingo Molnar wrote:
Ingo,

Do you have any concerns reg this series? please let me know if this 
looks good now to you.
   
   I'm inclined to NAK it for excessive quotation - who knows how many 
   people left the discussion in disgust? Was it done to drive away as 
   many reviewers as possible?
   
   Anyway, see my other reply, the measurement results seem hard to 
   interpret and inconclusive at the moment.
 
  That result was only for patch 18 of the series, not pvspinlock in 
  general.
 
 Okay - I've re-read the performance numbers and they are impressive, so no 
 objections from me.
 
 The x86 impact seems to be a straightforward API change, with most of the 
 changes on the virtualization side. So:
 
 Acked-by: Ingo Molnar mi...@kernel.org
 
 I guess you'd want to carry this in the KVM tree or so - maybe in a 
 separate branch because it changes Xen as well?

May I suggest an alternate way - perhaps you can put them in a tip/spinlock
tree for v3.12 - since both KVM and Xen maintainers have acked and carefully
reviewed them?
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 6/6 v2] kvm: powerpc: use caching attributes as per linux pte

2013-08-05 Thread Scott Wood
On Sat, 2013-08-03 at 14:25 +1000, Benjamin Herrenschmidt wrote:
 On Sat, 2013-08-03 at 03:11 +, Bhushan Bharat-R65777 wrote:
  
   
   Could you explain why we need to set dirty/referenced on the PTE, when we 
   didn't
   need to do that before? All we're getting from the PTE is wimg.
   We have MMU notifiers to take care of the page being unmapped, and we've 
   already
   marked the page itself as dirty if the TLB entry is writeable.
  
  I pulled this code from book3s.
  
  Ben, can you describe why we need this on book3s ?
 
 If you let the guest write to the page you must set the dirty bit on the PTE
 (or the struct page, at least one of them), similar with accessed on any 
 access.
 
 If you don't, the VM might swap the page out without writing it back to disk
 for example, assuming it contains no modified data.

We've already marked the page itself as dirty using kvm_set_pfn_dirty(),
and if the VM swaps it out we'll get an MMU notifier callback.  If we
marked the PTE dirty/accessed instead, is there any guarantee it will
stay marked dirty/accessed until the next MMU notifier?

-Scott



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 6/6 v2] kvm: powerpc: use caching attributes as per linux pte

2013-08-05 Thread Scott Wood
On Fri, 2013-08-02 at 22:11 -0500, Bhushan Bharat-R65777 wrote:
  How does wimg get set in the pfnmap case?
 
 Pfnmap is not kernel managed pages, right? So should we set I+G there ?

It could depend on ppc_md.phys_mem_access_prot().  Can't you pull it
from the PTE regardless of pfnmap?

-Scott



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V2 4/4] x86: correctly detect hypervisor

2013-08-05 Thread Konrad Rzeszutek Wilk
On Mon, Aug 05, 2013 at 08:20:53AM -0700, H. Peter Anvin wrote:
 On 08/05/2013 07:34 AM, Konrad Rzeszutek Wilk wrote:
  
  Could you provide me with a git branch so I can test it overnight please?
  
 
 Pull tip:x86/paravirt.

It works for me. Thanks.
 
   -hpa
 
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] mips/kvm: Improve code formatting in arch/mips/kvm/kvm_locore.S

2013-08-05 Thread Ralf Baechle
On Mon, Aug 05, 2013 at 04:43:27PM +0300, Gleb Natapov wrote:
 Date:   Mon, 5 Aug 2013 16:43:27 +0300
 From: Gleb Natapov g...@redhat.com
 To: Ralf Baechle r...@linux-mips.org
 Cc: James Hogan james.ho...@imgtec.com, David Daney
  ddaney.c...@gmail.com, linux-m...@linux-mips.org, kvm@vger.kernel.org,
  Sanjay Lal sanj...@kymasys.com, linux-ker...@vger.kernel.org, David
  Daney david.da...@cavium.com
 Subject: Re: [PATCH 1/3] mips/kvm: Improve code formatting in
  arch/mips/kvm/kvm_locore.S
 Content-Type: text/plain; charset=us-ascii
 
 On Mon, Aug 05, 2013 at 03:21:57PM +0200, Ralf Baechle wrote:
  On Mon, Aug 05, 2013 at 02:17:01PM +0100, James Hogan wrote:
  
   
   On 01/08/13 21:22, David Daney wrote:
From: David Daney david.da...@cavium.com

No code changes, just reflowing some comments and consistently using
tabs and spaces.  Object code is verified to be unchanged.

Signed-off-by: David Daney david.da...@cavium.com
Acked-by: Ralf Baechle r...@linux-mips.org
   
   
+/* Put the saved pointer to vcpu (s1) back into the DDATA_LO 
Register */
   
   git am detects a whitespace error here (space before tab in indent).
   It's got spaces before and after the tab actually.
   
 /* load the guest context from VCPU and return */
   
   this comment could have it's indentation fixed too
   
   Otherwise, for all 3 patches:
   
   Reviewed-by: James Hogan james.ho...@imgtec.com
  
  I'm happy with the patch series as well and will fix this issue when
  applying the patch.
  
 kvm fixes usually go through kvm.git tree for all arches. Any special
 reasons you want to get those through mips tree?

MIPS fixes usually go through the MIPS tree ;-)

I don't care which tree this stuff goes through - but a general experience
is that things that affect MIPS systems receive most testing if going
through the MIPS tree.

  Ralf
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] mips/kvm: Improve code formatting in arch/mips/kvm/kvm_locore.S

2013-08-05 Thread David Daney

On 08/05/2013 06:43 AM, Gleb Natapov wrote:

On Mon, Aug 05, 2013 at 03:21:57PM +0200, Ralf Baechle wrote:

On Mon, Aug 05, 2013 at 02:17:01PM +0100, James Hogan wrote:



On 01/08/13 21:22, David Daney wrote:

From: David Daney david.da...@cavium.com

No code changes, just reflowing some comments and consistently using
tabs and spaces.  Object code is verified to be unchanged.

Signed-off-by: David Daney david.da...@cavium.com
Acked-by: Ralf Baechle r...@linux-mips.org




+/* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register 
*/


git am detects a whitespace error here (space before tab in indent).
It's got spaces before and after the tab actually.


  /* load the guest context from VCPU and return */


this comment could have it's indentation fixed too

Otherwise, for all 3 patches:

Reviewed-by: James Hogan james.ho...@imgtec.com


I'm happy with the patch series as well and will fix this issue when
applying the patch.


kvm fixes usually go through kvm.git tree for all arches. Any special
reasons you want to get those through mips tree?



I don't really care which tree takes this particular patch set.

However, in the near future, I will be sending revised versions of 
patches needed by MIPS/KVM that are in files outside of the 
arch/mips/kvm directory and it is possible that those may suffer patch 
ordering problems if merged through a 'foreign tree'.


In any event, there is the problem with the whitespace error in the 
comment.  I blame checkpatch.pl for not flagging it, but that is not 
really a good excuse.  If it goes by the KVM tree, do you want me to 
send a corrected patch?  Or can you fix it when you merge it.


David Daney

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 03/10] vfio: add external user support

2013-08-05 Thread Alex Williamson
On Tue, 2013-07-23 at 19:07 +1000, Alexey Kardashevskiy wrote:
 On 07/23/2013 12:23 PM, Alex Williamson wrote:
  On Tue, 2013-07-16 at 10:53 +1000, Alexey Kardashevskiy wrote:
  VFIO is designed to be used via ioctls on file descriptors
  returned by VFIO.
 
  However in some situations support for an external user is required.
  The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to
  use the existing VFIO groups for exclusive access in real/virtual mode
  on a host to avoid passing map/unmap requests to the user space which
  would made things pretty slow.
 
  The protocol includes:
 
  1. do normal VFIO init operation:
 - opening a new container;
 - attaching group(s) to it;
 - setting an IOMMU driver for a container.
  When IOMMU is set for a container, all groups in it are
  considered ready to use by an external user.
 
  2. User space passes a group fd to an external user.
  The external user calls vfio_group_get_external_user()
  to verify that:
 - the group is initialized;
 - IOMMU is set for it.
  If both checks passed, vfio_group_get_external_user()
  increments the container user counter to prevent
  the VFIO group from disposal before KVM exits.
 
  3. The external user calls vfio_external_user_iommu_id()
  to know an IOMMU ID. PPC64 KVM uses it to link logical bus
  number (LIOBN) with IOMMU ID.
 
  4. When the external KVM finishes, it calls
  vfio_group_put_external_user() to release the VFIO group.
  This call decrements the container user counter.
  Everything gets released.
 
  The vfio: Limit group opens patch is also required for the consistency.
 
  Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
  
  This looks fine to me.  Is the plan to add this through the ppc tree
  again?  Thanks,
 
 
 Nope, better to add this through your tree. And faster for sure :) Thanks!

Applied to my next branch for v3.12.  Thanks,

Alex


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] mips/kvm: Improve code formatting in arch/mips/kvm/kvm_locore.S

2013-08-05 Thread Gleb Natapov
On Mon, Aug 05, 2013 at 07:06:10PM +0200, Ralf Baechle wrote:
 On Mon, Aug 05, 2013 at 04:43:27PM +0300, Gleb Natapov wrote:
  Date:   Mon, 5 Aug 2013 16:43:27 +0300
  From: Gleb Natapov g...@redhat.com
  To: Ralf Baechle r...@linux-mips.org
  Cc: James Hogan james.ho...@imgtec.com, David Daney
   ddaney.c...@gmail.com, linux-m...@linux-mips.org, kvm@vger.kernel.org,
   Sanjay Lal sanj...@kymasys.com, linux-ker...@vger.kernel.org, David
   Daney david.da...@cavium.com
  Subject: Re: [PATCH 1/3] mips/kvm: Improve code formatting in
   arch/mips/kvm/kvm_locore.S
  Content-Type: text/plain; charset=us-ascii
  
  On Mon, Aug 05, 2013 at 03:21:57PM +0200, Ralf Baechle wrote:
   On Mon, Aug 05, 2013 at 02:17:01PM +0100, James Hogan wrote:
   

On 01/08/13 21:22, David Daney wrote:
 From: David Daney david.da...@cavium.com
 
 No code changes, just reflowing some comments and consistently using
 tabs and spaces.  Object code is verified to be unchanged.
 
 Signed-off-by: David Daney david.da...@cavium.com
 Acked-by: Ralf Baechle r...@linux-mips.org


 +  /* Put the saved pointer to vcpu (s1) back into the DDATA_LO 
 Register */

git am detects a whitespace error here (space before tab in indent).
It's got spaces before and after the tab actually.

  /* load the guest context from VCPU and return */

this comment could have it's indentation fixed too

Otherwise, for all 3 patches:

Reviewed-by: James Hogan james.ho...@imgtec.com
   
   I'm happy with the patch series as well and will fix this issue when
   applying the patch.
   
  kvm fixes usually go through kvm.git tree for all arches. Any special
  reasons you want to get those through mips tree?
 
 MIPS fixes usually go through the MIPS tree ;-)
 
arch/*/kvm/ fixes usually go through the kvm.git though :) KVM arch
code, after it is reasonably stable, usually depends more on kvm common
code then arch code and kvm development suppose to happen against
kvm.git otherwise APIs can go out of sync. I need to get acks of MIPS
people before taking patches of course.

When patch series touches code outside of arch/*/kvm, like David says
the next one will, it make sense to merge it through MIPS tree, just
please take KVM maintainers ACK for kvm part.

 I don't care which tree this stuff goes through - but a general experience
 is that things that affect MIPS systems receive most testing if going
 through the MIPS tree.
 
   Ralf

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like booke3s

2013-08-05 Thread Scott Wood
On Mon, 2013-08-05 at 09:27 -0500, Bhushan Bharat-R65777 wrote:
 
  -Original Message-
  From: Benjamin Herrenschmidt [mailto:b...@kernel.crashing.org]
  Sent: Saturday, August 03, 2013 9:54 AM
  To: Bhushan Bharat-R65777
  Cc: Wood Scott-B07421; ag...@suse.de; kvm-...@vger.kernel.org;
  kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org
  Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like
  booke3s
  
  On Sat, 2013-08-03 at 02:58 +, Bhushan Bharat-R65777 wrote:
   One of the problem I saw was that if I put this code in
asm/pgtable-32.h and asm/pgtable-64.h then pte_present() and other
   friend function (on which this code depends) are defined in pgtable.h.
   And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h before it
   defines pte_present() and friends functions.
  
Ok I will move this into asm/pgtable*.h, initially I fought with myself
   to take this code in pgtable* but finally end up doing here (got
   biased by book3s :)).
  
  Is there a reason why these routines can not be completely generic in 
  pgtable.h
  ?
 
 How about the generic function:
 
 diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
 b/arch/powerpc/include/asm/pgtable-ppc64.h
 index d257d98..21daf28 100644
 --- a/arch/powerpc/include/asm/pgtable-ppc64.h
 +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
 @@ -221,6 +221,27 @@ static inline unsigned long pte_update(struct mm_struct 
 *mm,
 return old;
  }
 
 +static inline unsigned long pte_read(pte_t *p)
 +{
 +#ifdef PTE_ATOMIC_UPDATES
 +   pte_t pte;
 +   pte_t tmp;
 +   __asm__ __volatile__ (
 +   1: ldarx   %0,0,%3\n
 +  andi.   %1,%0,%4\n
 +  bne-1b\n
 +  ori %1,%0,%4\n
 +  stdcx.  %1,0,%3\n
 +  bne-1b
 +   : =r (pte), =r (tmp), =m (*p)
 +   : r (p), i (_PAGE_BUSY)
 +   : cc);
 +
 +   return pte;
 +#else  
 +   return pte_val(*p);
 +#endif
 +#endif
 +}
  static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
   unsigned long addr, pte_t *ptep)

Please leave a blank line between functions.

  {
 diff --git a/arch/powerpc/include/asm/pgtable.h 
 b/arch/powerpc/include/asm/pgtable.h
 index 690c8c2..dad712c 100644
 --- a/arch/powerpc/include/asm/pgtable.h
 +++ b/arch/powerpc/include/asm/pgtable.h
 @@ -254,6 +254,45 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t 
 *pgdir, unsigned long ea,
  }
  #endif /* !CONFIG_HUGETLB_PAGE */
 
 +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
 +int writing, unsigned long *pte_sizep)

The name implies that it just reads the PTE.  Setting accessed/dirty
shouldn't be an undocumented side-effect.  Why can't the caller do that
(or a different function that the caller calls afterward if desired)?  

Though even then you have the undocumented side effect of locking the
PTE on certain targets.

 +{
 +   pte_t *ptep;
 +   pte_t pte;
 +   unsigned long ps = *pte_sizep;
 +   unsigned int shift;
 +
 +   ptep = find_linux_pte_or_hugepte(pgdir, hva, shift);
 +   if (!ptep)
 +   return __pte(0);
 +   if (shift)
 +   *pte_sizep = 1ul  shift;
 +   else
 +   *pte_sizep = PAGE_SIZE;
 +
 +   if (ps  *pte_sizep)
 +   return __pte(0);
 +
 +   if (!pte_present(*ptep))
 +   return __pte(0);
 +
 +#ifdef CONFIG_PPC64
 +   /* Lock PTE (set _PAGE_BUSY) and read */
 +   pte = pte_read(ptep);
 +#else
 +   pte = pte_val(*ptep);
 +#endif

What about 32-bit platforms that need atomic PTEs?

-Scott



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: FAQ on linux-kvm.org has broken link

2013-08-05 Thread folkert
 If you do find something in dmesg that could be very helpful.

Ok it is down again: nothing in dmesg, neither on the guest or in the
host.

 Based on this information it seems like a bug in the virtio_net guest
 driver or vhost_net on the host.  Actually there is one contradictory
 piece of evidence: in the original bug report you said using e1000
 instead of virtio: did not help.  Can you confirm that e1000 also does
 not work?

When I tested it, it also did not work yes.

 In your original bug report you said If I then ping any host connected
 to that interface, no ping comes back: only a message about buffer space
 not being enough.  Can you post the exact error message and whether it
 is printed by ping inside the guest, dmesg inside the guest, or dmesg on
 the host?

It was inside the guest going to the outside world (eg outside of the
host).

 There is still the possibility that there is a networking configuration
 issue or bug inside the guest itself.  That would explain why this has
 happened across different configurations (tap, mactvap, vhost_net,
 e1000).

I don't think it is as:
- bringing down the interfaces _AND_ doing rmmod/modprobe of virtio_net
  solves it
- it also happened twice on an other guest

 Two approaches to get closer to the source of the problem:
 1. Try the latest vanilla kernel on the host (Linux 3.10.5).  This way
you can rule out fixed bugs in vhost_net or tap.
 2. Get the system into the bad state and then do some deeper.  Start
with outgoing ping, instrument guest driver and host vhost_net
functions to see what the drivers are doing, inspect the transmit
vring, etc.
 
 #1 is probably the best next step.  If it fails and you still have time
 to work on a solution we can start digging deeper with #2.

I can upgrade now to 3.10.3 as that is the current version in debian.


Folkert van Heusden

-- 
--
Phone: +31-6-41278122, PGP-key: 1F28D8AE, www.vanheusden.com
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm: Make kvm_rebooting visible

2013-08-05 Thread Andi Kleen
From: Andi Kleen a...@linux.intel.com

kvm_rebooting is referenced from assembler code, thus
needs to be visible.

Signed-off-by: Andi Kleen a...@linux.intel.com
---
 virt/kvm/kvm_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1580dd4..eff6abd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -95,7 +95,7 @@ static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
-bool kvm_rebooting;
+__visible bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
 
 static bool largepages_enabled = true;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: Make kvm_rebooting visible

2013-08-05 Thread David Daney

On 08/05/2013 03:12 PM, Andi Kleen wrote:

From: Andi Kleen a...@linux.intel.com

kvm_rebooting is referenced from assembler code, thus
needs to be visible.

Signed-off-by: Andi Kleen a...@linux.intel.com
---
  virt/kvm/kvm_main.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1580dd4..eff6abd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -95,7 +95,7 @@ static void hardware_disable_all(void);

  static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

-bool kvm_rebooting;
+__visible bool kvm_rebooting;
  EXPORT_SYMBOL_GPL(kvm_rebooting);


How many of these are there kernel wide?

Could you do something like this instead:

DEFINE_AND_EXPORT_GPL(bool, kvm_rebooting);

The definition of DEFINE_AND_EXPORT_GPL(_type, _name) is left as an 
exercise for the reader.


David Daney


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC V11 0/18] Paravirtualized ticket spinlocks

2013-08-05 Thread H. Peter Anvin
So, having read through the entire thread I *think* this is what the
status of this patchset is:

1. Patches 1-17 are noncontroversial, Raghavendra is going to send an
   update split into two patchsets;
2. There are at least two versions of patch 15; I think the PATCH
   RESEND RFC is the right one.
3. Patch 18 is controversial but there are performance numbers; these
   should be integrated in the patch description.
4. People are in general OK with us putting this patchset into -tip for
   testing, once the updated (V12) patchset is posted.

If I'm misunderstanding something, it is because of excessive thread
length as mentioned by Ingo.

Either way, I'm going to hold off on putting it into -tip until tomorrow
unless Ingo beats me to it.

-hpa

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: Make kvm_rebooting visible

2013-08-05 Thread Andi Kleen
   static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
 -bool kvm_rebooting;
 +__visible bool kvm_rebooting;
   EXPORT_SYMBOL_GPL(kvm_rebooting);
 
 How many of these are there kernel wide?

Not very many (at least on x86 allyes) ~10.
Also most users are not exported.

Probably not worth an own macro.

 
 Could you do something like this instead:
 
 DEFINE_AND_EXPORT_GPL(bool, kvm_rebooting);
 
 The definition of DEFINE_AND_EXPORT_GPL(_type, _name) is left as an
 exercise for the reader.

I actually had EXPORT_SYMBOL make things always visible for a long time,
but it prevents optimizing away unused code in very small
non modular configurations. So I switched to separate annotations.

-Andi

-- 
a...@linux.intel.com -- Speaking for myself only.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like booke3s

2013-08-05 Thread Bhushan Bharat-R65777


 -Original Message-
 From: Wood Scott-B07421
 Sent: Tuesday, August 06, 2013 12:49 AM
 To: Bhushan Bharat-R65777
 Cc: Benjamin Herrenschmidt; Wood Scott-B07421; ag...@suse.de; kvm-
 p...@vger.kernel.org; kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org
 Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte lookup like
 booke3s
 
 On Mon, 2013-08-05 at 09:27 -0500, Bhushan Bharat-R65777 wrote:
 
   -Original Message-
   From: Benjamin Herrenschmidt [mailto:b...@kernel.crashing.org]
   Sent: Saturday, August 03, 2013 9:54 AM
   To: Bhushan Bharat-R65777
   Cc: Wood Scott-B07421; ag...@suse.de; kvm-...@vger.kernel.org;
   kvm@vger.kernel.org; linuxppc-...@lists.ozlabs.org
   Subject: Re: [PATCH 5/6 v2] kvm: powerpc: booke: Add linux pte
   lookup like booke3s
  
   On Sat, 2013-08-03 at 02:58 +, Bhushan Bharat-R65777 wrote:
One of the problem I saw was that if I put this code in
    asm/pgtable-32.h and asm/pgtable-64.h then pte_present() and other
friend function (on which this code depends) are defined in pgtable.h.
And pgtable.h includes asm/pgtable-32.h and asm/pgtable-64.h
before it defines pte_present() and friends functions.
   
 Ok I will move this into asm/pgtable*.h, initially I fought with
myself to take this code in pgtable* but finally end up doing here
(got biased by book3s :)).
  
   Is there a reason why these routines can not be completely generic
   in pgtable.h ?
 
  How about the generic function:
 
  diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h
  b/arch/powerpc/include/asm/pgtable-ppc64.h
  index d257d98..21daf28 100644
  --- a/arch/powerpc/include/asm/pgtable-ppc64.h
  +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
  @@ -221,6 +221,27 @@ static inline unsigned long pte_update(struct mm_struct
 *mm,
  return old;
   }
 
  +static inline unsigned long pte_read(pte_t *p) { #ifdef
  +PTE_ATOMIC_UPDATES
  +   pte_t pte;
  +   pte_t tmp;
  +   __asm__ __volatile__ (
  +   1: ldarx   %0,0,%3\n
  +  andi.   %1,%0,%4\n
  +  bne-1b\n
  +  ori %1,%0,%4\n
  +  stdcx.  %1,0,%3\n
  +  bne-1b
  +   : =r (pte), =r (tmp), =m (*p)
  +   : r (p), i (_PAGE_BUSY)
  +   : cc);
  +
  +   return pte;
  +#else
  +   return pte_val(*p);
  +#endif
  +#endif
  +}
   static inline int __ptep_test_and_clear_young(struct mm_struct *mm,
unsigned long addr,
  pte_t *ptep)
 
 Please leave a blank line between functions.
 
   {
  diff --git a/arch/powerpc/include/asm/pgtable.h
  b/arch/powerpc/include/asm/pgtable.h
  index 690c8c2..dad712c 100644
  --- a/arch/powerpc/include/asm/pgtable.h
  +++ b/arch/powerpc/include/asm/pgtable.h
  @@ -254,6 +254,45 @@ static inline pte_t
  *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,  }  #endif
  /* !CONFIG_HUGETLB_PAGE */
 
  +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
  +int writing, unsigned long
  +*pte_sizep)
 
 The name implies that it just reads the PTE.  Setting accessed/dirty shouldn't
 be an undocumented side-effect.

Ok, will rename and document.

 Why can't the caller do that (or a different
 function that the caller calls afterward if desired)?

The current implementation in book3s is;
 1) find a pte/hugepte
 2) return null if pte not present
 3) take _PAGE_BUSY lock
 4) set accessed/dirty
 5) clear _PAGE_BUSY.

What I tried was 
1) find a pte/hugepte
2) return null if pte not present
3) return pte (not take lock by not setting _PAGE_BUSY)

4) then user calls  __ptep_set_access_flags() to atomic update the 
dirty/accessed flags in pte.

- but the benchmark results were not good
- Also, can there be a race, as we do not take the lock in step 3 and update in step 4?
  
 
 Though even then you have the undocumented side effect of locking the PTE on
 certain targets.
 
  +{
  +   pte_t *ptep;
  +   pte_t pte;
  +   unsigned long ps = *pte_sizep;
  +   unsigned int shift;
  +
  +   ptep = find_linux_pte_or_hugepte(pgdir, hva, shift);
  +   if (!ptep)
  +   return __pte(0);
  +   if (shift)
  +   *pte_sizep = 1ul  shift;
  +   else
  +   *pte_sizep = PAGE_SIZE;
  +
  +   if (ps  *pte_sizep)
  +   return __pte(0);
  +
  +   if (!pte_present(*ptep))
  +   return __pte(0);
  +
  +#ifdef CONFIG_PPC64
  +   /* Lock PTE (set _PAGE_BUSY) and read */
  +   pte = pte_read(ptep);
  +#else
  +   pte = pte_val(*ptep);
  +#endif
 
 What about 32-bit platforms that need atomic PTEs?

I called __ptep_set_access_flags() for both 32/64bit (for 64bit I was not 
calling pte_read()), which handles atomic updates. Somehow the benchmark result 
were not good, will try again.

Thanks
-Bharat
 
 -Scott
 



Re: [PATCH RFC V11 0/18] Paravirtualized ticket spinlocks

2013-08-05 Thread Raghavendra K T

On 08/06/2013 04:20 AM, H. Peter Anvin wrote:

So, having read through the entire thread I *think* this is what the
status of this patchset is:

1. Patches 1-17 are noncontroversial, Raghavendra is going to send an
update split into two patchsets;


Yes.  Only one patch would be common to both host and guest which will
be sent as a separate patch.
I 'll rebase first patchset to -next and second patchset to kvm tree as
needed.


2. There are at least two versions of patch 15; I think the PATCH
RESEND RFC is the right one.


True.


3. Patch 18 is controversial but there are performance numbers; these
should be integrated in the patch description.


Current plan is to drop for patch 18 for now.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 20/23] KVM: PPC: Book3S PR: Better handling of host-side read-only pages

2013-08-05 Thread Paul Mackerras
Currently we request write access to all pages that get mapped into the
guest, even if the guest is only loading from the page.  This reduces
the effectiveness of KSM because it means that we unshare every page we
access.  Also, we always set the changed (C) bit in the guest HPTE if
it allows writing, even for a guest load.

This fixes both these problems.  We pass an 'iswrite' flag to the
mmu.xlate() functions and to kvmppc_mmu_map_page() to indicate whether
the access is a load or a store.  The mmu.xlate() functions now only
set C for stores.  kvmppc_gfn_to_pfn() now calls gfn_to_pfn_prot()
instead of gfn_to_pfn() so that it can indicate whether we need write
access to the page, and get back a 'writable' flag to indicate whether
the page is writable or not.  If that 'writable' flag is clear, we then
make the host HPTE read-only even if the guest HPTE allowed writing.

This means that we can get a protection fault when the guest writes to a
page that it has mapped read-write but which is read-only on the host
side (perhaps due to KSM having merged the page).  Thus we now call
kvmppc_handle_pagefault() for protection faults as well as HPTE not found
faults.  In kvmppc_handle_pagefault(), if the access was allowed by the
guest HPTE and we thus need to install a new host HPTE, we then need to
remove the old host HPTE if there is one.  This is done with a new
function, kvmppc_mmu_unmap_page(), which uses kvmppc_mmu_pte_vflush() to
find and remove the old host HPTE.

Since the memslot-related functions require the KVM SRCU read lock to
be held, this adds srcu_read_lock/unlock pairs around the calls to
kvmppc_handle_pagefault().

Finally, this changes kvmppc_mmu_book3s_32_xlate_pte() to not ignore
guest HPTEs that don't permit access, and to return -EPERM for accesses
that are not permitted by the page protections.

Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/include/asm/kvm_book3s.h |  7 +--
 arch/powerpc/include/asm/kvm_host.h   |  3 ++-
 arch/powerpc/kvm/book3s.c | 15 +--
 arch/powerpc/kvm/book3s_32_mmu.c  | 32 +---
 arch/powerpc/kvm/book3s_32_mmu_host.c | 14 +++---
 arch/powerpc/kvm/book3s_64_mmu.c  |  9 +
 arch/powerpc/kvm/book3s_64_mmu_host.c | 20 +---
 arch/powerpc/kvm/book3s_64_mmu_hv.c   |  2 +-
 arch/powerpc/kvm/book3s_pr.c  | 29 -
 9 files changed, 91 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index e0bc83b..4fe6864 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -129,7 +129,9 @@ extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 
new_msr);
 extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu);
-extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
+extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte,
+  bool iswrite);
+extern void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte 
*pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong eaddr, ulong 
seg_size);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
@@ -158,7 +160,8 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct 
kvmppc_bat *bat,
   bool upper, u32 val);
 extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
 extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu 
*vcpu);
-extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool writing,
+   bool *writable);
 extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
unsigned long *rmap, long pte_index, int realmode);
 extern void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 138e781..52c7b80 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -356,7 +356,8 @@ struct kvmppc_mmu {
/* book3s */
void (*mtsrin)(struct kvm_vcpu *vcpu, u32 srnum, ulong value);
u32  (*mfsrin)(struct kvm_vcpu *vcpu, u32 srnum);
-   int  (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte 
*pte, bool data);
+   int  (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr,
+ struct kvmppc_pte *pte, bool data, bool iswrite);
void (*reset_msr)(struct kvm_vcpu *vcpu);
void (*tlbie)(struct kvm_vcpu *vcpu, ulong addr, bool large);
int  (*esid_to_vsid)(struct kvm_vcpu *vcpu, ulong 

[PATCH 09/23] KVM: PPC: Book3S PR: Correct errors in H_ENTER implementation

2013-08-05 Thread Paul Mackerras
The implementation of H_ENTER in PR KVM has some errors:

* With H_EXACT not set, if the HPTEG is full, we return H_PTEG_FULL
  as the return value of kvmppc_h_pr_enter, but the caller is expecting
  one of the EMULATE_* values.  The H_PTEG_FULL needs to go in the
  guest's R3 instead.

* With H_EXACT set, if the selected HPTE is already valid, the H_ENTER
  call should return a H_PTEG_FULL error.

This fixes these errors and also makes it write only the selected HPTE,
not the whole group, since only the selected HPTE has been modified.
This also micro-optimizes the calculations involving pte_index and i.

Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/kvm/book3s_pr_papr.c | 19 ++-
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_pr_papr.c 
b/arch/powerpc/kvm/book3s_pr_papr.c
index da0e0bc..38f1899 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -21,6 +21,8 @@
 #include asm/kvm_ppc.h
 #include asm/kvm_book3s.h
 
+#define HPTE_SIZE  16  /* bytes per HPT entry */
+
 static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index)
 {
struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
@@ -40,32 +42,39 @@ static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu)
long pte_index = kvmppc_get_gpr(vcpu, 5);
unsigned long pteg[2 * 8];
unsigned long pteg_addr, i, *hpte;
+   long int ret;
 
+   i = pte_index  7;
pte_index = ~7UL;
pteg_addr = get_pteg_addr(vcpu, pte_index);
 
copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg));
hpte = pteg;
 
+   ret = H_PTEG_FULL;
if (likely((flags  H_EXACT) == 0)) {
-   pte_index = ~7UL;
for (i = 0; ; ++i) {
if (i == 8)
-   return H_PTEG_FULL;
+   goto done;
if ((*hpte  HPTE_V_VALID) == 0)
break;
hpte += 2;
}
} else {
-   i = kvmppc_get_gpr(vcpu, 5)  7UL;
hpte += i * 2;
+   if (*hpte  HPTE_V_VALID)
+   goto done;
}
 
hpte[0] = kvmppc_get_gpr(vcpu, 6);
hpte[1] = kvmppc_get_gpr(vcpu, 7);
-   copy_to_user((void __user *)pteg_addr, pteg, sizeof(pteg));
-   kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+   pteg_addr += i * HPTE_SIZE;
+   copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE);
kvmppc_set_gpr(vcpu, 4, pte_index | i);
+   ret = H_SUCCESS;
+
+ done:
+   kvmppc_set_gpr(vcpu, 3, ret);
 
return EMULATE_DONE;
 }
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/23] KVM: PPC: Book3S PR: Handle PP0 page-protection bit in guest HPTEs

2013-08-05 Thread Paul Mackerras
64-bit POWER processors have a three-bit field for page protection in
the hashed page table entry (HPTE).  Currently we only interpret the two
bits that were present in older versions of the architecture.  The only
defined combination that has the new bit set is 110, meaning read-only
for supervisor and no access for user mode.

This adds code to kvmppc_mmu_book3s_64_xlate() to interpret the extra
bit appropriately.

Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/kvm/book3s_64_mmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 658ccd7..563fbf7 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -298,6 +298,8 @@ do_second:
v = pteg[i];
r = pteg[i+1];
pp = (r  HPTE_R_PP) | key;
+   if (r  HPTE_R_PP0)
+   pp |= 8;
 
gpte-eaddr = eaddr;
gpte-vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
@@ -319,6 +321,7 @@ do_second:
case 3:
case 5:
case 7:
+   case 10:
gpte-may_read = true;
break;
}
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 16/23] KVM: PPC: Book3S: Merge implementations of KVM_PPC_GET_SMMU_INFO ioctl

2013-08-05 Thread Paul Mackerras
This merges the PR and HV implementations of kvm_vm_ioctl_get_smmu_info()
into a single implementation in book3s.c.  Since userspace tends to
call this ioctl very early in the life of a VM, before (for instance)
enabling PAPR mode, we will need this to return results that are
compatible with both PR and HV guests, once we are able to compile both
PR and HV into one kernel image.  For HV guests, the capabilities and
encodings need to be consistent with what the real hardware we are
running on can do, whereas for PR guests, the MMU is completely
virtual and so the set of capabilities and encodings is arbitrary.

To achieve this, we report a set of segment and page sizes and
encodings that are consistent with what real POWER processors do.
If the guest could potentially use HV mode then we filter that set
to remove anything that is not implemented by the CPU that we are
running on.  The helper function, kvm_book3s_hv_possible(), that is used
to trigger this filtering is currently just defined based on the
kernel configuration.

Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/include/asm/kvm_ppc.h |  4 +++
 arch/powerpc/kvm/book3s.c  | 53 ++
 arch/powerpc/kvm/book3s_hv.c   | 38 ---
 arch/powerpc/kvm/book3s_pr.c   | 30 -
 4 files changed, 57 insertions(+), 68 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index b15554a..af7fe62 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -283,6 +283,8 @@ static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
 
 extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu);
 
+static inline int kvm_book3s_hv_possible(void) { return 1; }
+
 #else
 static inline void __init kvm_cma_reserve(void)
 {}
@@ -302,6 +304,8 @@ static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu 
*vcpu)
 {
kvm_vcpu_kick(vcpu);
 }
+
+static inline int kvm_book3s_hv_possible(void) { return 0; }
 #endif
 
 #ifdef CONFIG_KVM_XICS
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 4b136be..06abd84 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -728,6 +728,59 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct 
kvm_dirty_log *log)
return -ENOTTY;
 }
 
+#ifdef CONFIG_PPC64
+static void add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
+ int linux_psize, int shift, int sllp, int lp)
+{
+   struct mmu_psize_def *def = mmu_psize_defs[linux_psize];
+
+   if (kvm_book3s_hv_possible()) {
+   /* Check this matches what the hardware does */
+   if (shift != def-shift || sllp != def-sllp ||
+   lp != def-penc[linux_psize])
+   return;
+   }
+
+   (*sps)-page_shift = shift;
+   (*sps)-slb_enc = sllp;
+   (*sps)-enc[0].page_shift = shift;
+   (*sps)-enc[0].pte_enc = lp;
+   (*sps)++;
+}
+
+int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
+  struct kvm_ppc_smmu_info *info)
+{
+   struct kvm_ppc_one_seg_page_size *sps;
+
+   /*
+* At this stage we don't know whether this VM will be
+* HV or PR, so if it could be HV, restrict what we report
+* to what the hardware can do.
+*/
+   if (kvm_book3s_hv_possible()) {
+   info-slb_size = mmu_slb_size;
+   info-flags = KVM_PPC_PAGE_SIZES_REAL;
+   if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
+   info-flags |= KVM_PPC_1T_SEGMENTS;
+   } else {
+   /* emulated SLB is always 64 entries */
+   info-slb_size = 64;
+   info-flags = KVM_PPC_1T_SEGMENTS;
+   }
+
+   /* No multi-page size segments (MPSS) support yet */
+   sps = info-sps[0];
+   add_seg_page_size(sps, MMU_PAGE_4K, 12, 0, 0);
+   add_seg_page_size(sps, MMU_PAGE_64K, 16,
+ SLB_VSID_L | SLB_VSID_LP_01, 1);
+   add_seg_page_size(sps, MMU_PAGE_16M, 24,
+ SLB_VSID_L | SLB_VSID_LP_00, 0);
+
+   return 0;
+}
+#endif /* CONFIG_PPC64 */
+
 void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
  struct kvm_memory_slot *dont)
 {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index fcf0564..13f79dd 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1568,44 +1568,6 @@ long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct 
kvm_allocate_rma *ret)
return fd;
 }
 
-static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
-int linux_psize)
-{
-   struct mmu_psize_def *def = mmu_psize_defs[linux_psize];
-
-   if (!def-shift)
-   return;
-   (*sps)-page_shift = def-shift;
-   (*sps)-slb_enc = def-sllp;
-   

[PATCH 22/23] KVM: PPC: Book3S PR: Mark pages accessed, and dirty if being written

2013-08-05 Thread Paul Mackerras
The mark_page_dirty() function, despite what its name might suggest,
doesn't actually mark the page as dirty as far as the MM subsystem is
concerned.  It merely sets a bit in KVM's map of dirty pages, if
userspace has requested dirty tracking for the relevant memslot.
To tell the MM subsystem that the page is dirty, we have to call
kvm_set_pfn_dirty() (or an equivalent such as SetPageDirty()).

This adds a call to kvm_set_pfn_dirty(), and while we are here, also
adds a call to kvm_set_pfn_accessed() to tell the MM subsystem that
the page has been accessed.  Since we are now using the pfn in
several places, this adds a 'pfn' variable to store it and changes
the places that used hpaddr  PAGE_SHIFT to use pfn instead, which
is the same thing.

This also changes a use of HPTE_R_PP to PP_RXRX.  Both are 3, but
PP_RXRX is more informative as being the read-only page permission
bit setting.

Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/kvm/book3s_64_mmu_host.c | 26 +++---
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c 
b/arch/powerpc/kvm/book3s_64_mmu_host.c
index b7e9504..8e8aff9 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -96,20 +96,21 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct 
kvmppc_pte *orig_pte,
unsigned long mmu_seq;
struct kvm *kvm = vcpu-kvm;
struct hpte_cache *cpte;
+   unsigned long gfn = orig_pte-raddr  PAGE_SHIFT;
+   unsigned long pfn;
 
/* used to check for invalidations in progress */
mmu_seq = kvm-mmu_notifier_seq;
smp_rmb();
 
/* Get host physical address for gpa */
-   hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte-raddr  PAGE_SHIFT,
-  iswrite, writable);
-   if (is_error_noslot_pfn(hpaddr)) {
-   printk(KERN_INFO Couldn't get guest page for gfn %lx!\n, 
orig_pte-eaddr);
+   pfn = kvmppc_gfn_to_pfn(vcpu, gfn, iswrite, writable);
+   if (is_error_noslot_pfn(pfn)) {
+   printk(KERN_INFO Couldn't get guest page for gfn %lx!\n, gfn);
r = -EINVAL;
goto out;
}
-   hpaddr = PAGE_SHIFT;
+   hpaddr = pfn  PAGE_SHIFT;
 
/* and write the mapping ea - hpa into the pt */
vcpu-arch.mmu.esid_to_vsid(vcpu, orig_pte-eaddr  SID_SHIFT, vsid);
@@ -129,15 +130,18 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct 
kvmppc_pte *orig_pte,
 
vpn = hpt_vpn(orig_pte-eaddr, map-host_vsid, MMU_SEGSIZE_256M);
 
+   kvm_set_pfn_accessed(pfn);
if (!orig_pte-may_write || !writable)
-   rflags |= HPTE_R_PP;
-   else
-   mark_page_dirty(vcpu-kvm, orig_pte-raddr  PAGE_SHIFT);
+   rflags |= PP_RXRX;
+   else {
+   mark_page_dirty(vcpu-kvm, gfn);
+   kvm_set_pfn_dirty(pfn);
+   }
 
if (!orig_pte-may_execute)
rflags |= HPTE_R_N;
else
-   kvmppc_mmu_flush_icache(hpaddr  PAGE_SHIFT);
+   kvmppc_mmu_flush_icache(pfn);
 
/*
 * Use 64K pages if possible; otherwise, on 64K page kernels,
@@ -191,7 +195,7 @@ map_again:
cpte-slot = hpteg + (ret  7);
cpte-host_vpn = vpn;
cpte-pte = *orig_pte;
-   cpte-pfn = hpaddr  PAGE_SHIFT;
+   cpte-pfn = pfn;
cpte-pagesize = hpsize;
 
kvmppc_mmu_hpte_cache_map(vcpu, cpte);
@@ -200,7 +204,7 @@ map_again:
 
 out_unlock:
spin_unlock(kvm-mmu_lock);
-   kvm_release_pfn_clean(hpaddr  PAGE_SHIFT);
+   kvm_release_pfn_clean(pfn);
if (cpte)
kvmppc_mmu_hpte_cache_free(cpte);
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/23] KVM: PPC: Book3S PR: Don't corrupt guest state when kernel uses VMX

2013-08-05 Thread Paul Mackerras
Currently the code assumes that once we load up guest FP/VSX or VMX
state into the CPU, it stays valid in the CPU registers until we
explicitly flush it to the thread_struct.  However, on POWER7,
copy_page() and memcpy() can use VMX.  These functions do flush the
VMX state to the thread_struct before using VMX instructions, but if
this happens while we have guest state in the VMX registers, and we
then re-enter the guest, we don't reload the VMX state from the
thread_struct, leading to guest corruption.  This has been observed
to cause guest processes to segfault.

To fix this, we check before re-entering the guest that all of the
bits corresponding to facilities owned by the guest, as expressed
in vcpu->arch.guest_owned_ext, are set in current->thread.regs->msr.
Any bits that have been cleared correspond to facilities that have
been used by kernel code and thus flushed to the thread_struct, so
for them we reload the state from the thread_struct.

We also need to check current->thread.regs->msr before calling
giveup_fpu() or giveup_altivec(), since if the relevant bit is
clear, the state has already been flushed to the thread_struct and
to flush it again would corrupt it.

Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/kvm/book3s_pr.c | 29 +
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index ddfaf56..adeab19 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -468,7 +468,8 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
 * both the traditional FP registers and the added VSX
 * registers into thread.fpr[].
 */
-   giveup_fpu(current);
+   if (current-thread.regs-msr  MSR_FP)
+   giveup_fpu(current);
for (i = 0; i  ARRAY_SIZE(vcpu-arch.fpr); i++)
vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
 
@@ -483,7 +484,8 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
 
 #ifdef CONFIG_ALTIVEC
if (msr  MSR_VEC) {
-   giveup_altivec(current);
+   if (current-thread.regs-msr  MSR_VEC)
+   giveup_altivec(current);
memcpy(vcpu-arch.vr, t-vr, sizeof(vcpu-arch.vr));
vcpu-arch.vscr = t-vscr;
}
@@ -575,8 +577,6 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, 
unsigned int exit_nr,
printk(KERN_INFO Loading up ext 0x%lx\n, msr);
 #endif
 
-   current-thread.regs-msr |= msr;
-
if (msr  MSR_FP) {
for (i = 0; i  ARRAY_SIZE(vcpu-arch.fpr); i++)
thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
@@ -598,12 +598,32 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, 
unsigned int exit_nr,
 #endif
}
 
+   current-thread.regs-msr |= msr;
vcpu-arch.guest_owned_ext |= msr;
kvmppc_recalc_shadow_msr(vcpu);
 
return RESUME_GUEST;
 }
 
+/*
+ * Kernel code using FP or VMX could have flushed guest state to
+ * the thread_struct; if so, get it back now.
+ */
+static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
+{
+   unsigned long lost_ext;
+
+   lost_ext = vcpu-arch.guest_owned_ext  ~current-thread.regs-msr;
+   if (!lost_ext)
+   return;
+
+   if (lost_ext  MSR_FP)
+   kvmppc_load_up_fpu();
+   if (lost_ext  MSR_VEC)
+   kvmppc_load_up_altivec();
+   current-thread.regs-msr |= lost_ext;
+}
+
 int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int exit_nr)
 {
@@ -892,6 +912,7 @@ program_interrupt:
} else {
kvmppc_fix_ee_before_entry();
}
+   kvmppc_handle_lost_ext(vcpu);
}
 
trace_kvm_book3s_reenter(r, vcpu);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/23] KVM: PPC: Book3S PR: Make instruction fetch fallback work for system calls

2013-08-05 Thread Paul Mackerras
It turns out that if we exit the guest due to a hcall instruction (sc 1),
and the loading of the instruction in the guest exit path fails for any
reason, the call to kvmppc_ld() in kvmppc_get_last_inst() fetches the
instruction after the hcall instruction rather than the hcall itself.
This in turn means that the instruction doesn't get recognized as an
hcall in kvmppc_handle_exit_pr() but gets passed to the guest kernel
as a sc instruction.  That usually results in the guest kernel getting
a return code of 38 (ENOSYS) from an hcall, which often triggers a
BUG_ON() or other failure.

This fixes the problem by adding a new variant of kvmppc_get_last_inst()
called kvmppc_get_last_sc(), which fetches the instruction if necessary
from pc - 4 rather than pc.

Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/include/asm/kvm_book3s.h | 38 +++
 arch/powerpc/kvm/book3s_pr.c  |  2 +-
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 08891d0..fa19e2f 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -334,6 +334,27 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu 
*vcpu)
return r;
 }
 
+/*
+ * Like kvmppc_get_last_inst(), but for fetching a sc instruction.
+ * Because the sc instruction sets SRR0 to point to the following
+ * instruction, we have to fetch from pc - 4.
+ */
+static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu)
+{
+   ulong pc = kvmppc_get_pc(vcpu) - 4;
+   struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+   u32 r;
+
+   /* Load the instruction manually if it failed to do so in the
+* exit path */
+   if (svcpu-last_inst == KVM_INST_FETCH_FAILED)
+   kvmppc_ld(vcpu, pc, sizeof(u32), svcpu-last_inst, false);
+
+   r = svcpu-last_inst;
+   svcpu_put(svcpu);
+   return r;
+}
+
 static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 {
struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
@@ -446,6 +467,23 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu 
*vcpu)
return vcpu-arch.last_inst;
 }
 
+/*
+ * Like kvmppc_get_last_inst(), but for fetching a sc instruction.
+ * Because the sc instruction sets SRR0 to point to the following
+ * instruction, we have to fetch from pc - 4.
+ */
+static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu)
+{
+   ulong pc = kvmppc_get_pc(vcpu) - 4;
+
+   /* Load the instruction manually if it failed to do so in the
+* exit path */
+   if (vcpu-arch.last_inst == KVM_INST_FETCH_FAILED)
+   kvmppc_ld(vcpu, pc, sizeof(u32), vcpu-arch.last_inst, false);
+
+   return vcpu-arch.last_inst;
+}
+
 static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
 {
return vcpu-arch.fault_dar;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index adeab19..6cb29ef 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -792,7 +792,7 @@ program_interrupt:
}
case BOOK3S_INTERRUPT_SYSCALL:
if (vcpu-arch.papr_enabled 
-   (kvmppc_get_last_inst(vcpu) == 0x4422) 
+   (kvmppc_get_last_sc(vcpu) == 0x4422) 
!(vcpu-arch.shared-msr  MSR_PR)) {
/* SC 1 papr hypercalls */
ulong cmd = kvmppc_get_gpr(vcpu, 3);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/23] KVM: PPC: Book3S PR: Make HPT accesses and updates SMP-safe

2013-08-05 Thread Paul Mackerras
This adds a per-VM mutex to provide mutual exclusion between vcpus
for accesses to and updates of the guest hashed page table (HPT).
This also makes the code use single-byte writes to the HPT entry
when updating of the reference (R) and change (C) bits.  The reason
for doing this, rather than writing back the whole HPTE, is that on
non-PAPR virtual machines, the guest OS might be writing to the HPTE
concurrently, and writing back the whole HPTE might conflict with
that.  Also, real hardware does single-byte writes to update R and C.

The new mutex is taken in kvmppc_mmu_book3s_64_xlate() when reading
the HPT and updating R and/or C, and in the PAPR HPT update hcalls
(H_ENTER, H_REMOVE, etc.).  Having the mutex means that we don't need
to use a hypervisor lock bit in the HPT update hcalls, and we don't
need to be careful about the order in which the bytes of the HPTE are
updated by those hcalls.

The other change here is to make emulated TLB invalidations (tlbie)
effective across all vcpus.  To do this we call kvmppc_mmu_pte_vflush
for all vcpus in kvmppc_ppc_book3s_64_tlbie().

For 32-bit, this makes the setting of the accessed and dirty bits use
single-byte writes, and makes tlbie invalidate shadow HPTEs for all
vcpus.

With this, PR KVM can successfully run SMP guests.

Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/include/asm/kvm_host.h |  3 +++
 arch/powerpc/kvm/book3s_32_mmu.c| 36 ++--
 arch/powerpc/kvm/book3s_64_mmu.c| 33 +++--
 arch/powerpc/kvm/book3s_pr.c|  1 +
 arch/powerpc/kvm/book3s_pr_papr.c   | 33 +++--
 5 files changed, 72 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 2d3c770..c37207f 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -259,6 +259,9 @@ struct kvm_arch {
struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
int hpt_cma_alloc;
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
+#ifdef CONFIG_KVM_BOOK3S_PR
+   struct mutex hpt_mutex;
+#endif
 #ifdef CONFIG_PPC_BOOK3S_64
struct list_head spapr_tce_tables;
struct list_head rtas_tokens;
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index af04553..856af98 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -271,19 +271,22 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu 
*vcpu, gva_t eaddr,
/* Update PTE C and A bits, so the guest's swapper knows we used the
   page */
if (found) {
-   u32 oldpte = pteg[i+1];
-
-   if (pte-may_read)
-   pteg[i+1] |= PTEG_FLAG_ACCESSED;
-   if (pte-may_write)
-   pteg[i+1] |= PTEG_FLAG_DIRTY;
-   else
-   dprintk_pte(KVM: Mapping read-only page!\n);
-
-   /* Write back into the PTEG */
-   if (pteg[i+1] != oldpte)
-   copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
-
+   u32 pte_r = pteg[i+1];
+   char __user *addr = (char __user *) pteg[i+1];
+
+   /*
+* Use single-byte writes to update the HPTE, to
+* conform to what real hardware does.
+*/
+   if (pte-may_read  !(pte_r  PTEG_FLAG_ACCESSED)) {
+   pte_r |= PTEG_FLAG_ACCESSED;
+   put_user(pte_r  8, addr + 2);
+   }
+   if (pte-may_write  !(pte_r  PTEG_FLAG_DIRTY)) {
+   /* XXX should only set this for stores */
+   pte_r |= PTEG_FLAG_DIRTY;
+   put_user(pte_r, addr + 3);
+   }
return 0;
}
 
@@ -348,7 +351,12 @@ static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu 
*vcpu, u32 srnum,
 
 static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool 
large)
 {
-   kvmppc_mmu_pte_flush(vcpu, ea, 0x0000);
+   int i;
+   struct kvm_vcpu *v;
+
+   /* flush this VA on all cpus */
+   kvm_for_each_vcpu(i, v, vcpu-kvm)
+   kvmppc_mmu_pte_flush(v, ea, 0x0000);
 }
 
 static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 563fbf7..26a57ca 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -257,6 +257,8 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu 
*vcpu, gva_t eaddr,
 
pgsize = slbe-large ? MMU_PAGE_16M : MMU_PAGE_4K;
 
+   mutex_lock(vcpu-kvm-arch.hpt_mutex);
+
 do_second:
ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second);
if (kvm_is_error_hva(ptegp))
@@ -332,30 +334,37 @@ do_second:
 
/* Update PTE R and C bits, so the 

[PATCH 17/23] KVM: PPC: Book3S HV: Factorize kvmppc_core_vcpu_create_hv()

2013-08-05 Thread Paul Mackerras
This splits kvmppc_core_vcpu_create_hv() into three functions and
adds a new kvmppc_free_vcores() to free the kvmppc_vcore structures
that we allocate for a guest, which are currently being leaked.
The reason for the split is to make the split-out code available
for later use in converting PR kvm_vcpu structs to HV use.

Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/kvm/book3s_hv.c | 95 +++-
 1 file changed, 59 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 13f79dd..c524d6b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -891,32 +891,51 @@ int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
return r;
 }
 
-struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, unsigned int id)
+static int kvmppc_alloc_vcore(struct kvm_vcpu *vcpu, unsigned int id)
 {
-   struct kvm_vcpu *vcpu;
-   int err = -EINVAL;
-   int core;
+   struct kvm *kvm = vcpu-kvm;
struct kvmppc_vcore *vcore;
+   int core;
 
core = id / threads_per_core;
if (core = KVM_MAX_VCORES)
-   goto out;
+   return -EINVAL;
 
-   err = -ENOMEM;
-   vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
-   if (!vcpu)
-   goto out;
+   vcore = kvm-arch.vcores[core];
+   if (!vcore) {
+   vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
+   if (!vcore)
+   return -ENOMEM;
+   INIT_LIST_HEAD(vcore-runnable_threads);
+   spin_lock_init(vcore-lock);
+   init_waitqueue_head(vcore-wq);
+   vcore-preempt_tb = TB_NIL;
+   kvm-arch.vcores[core] = vcore;
+   kvm-arch.online_vcores++;
+   }
 
-   err = kvm_vcpu_init(vcpu, kvm, id);
-   if (err)
-   goto free_vcpu;
+   spin_lock(vcore-lock);
+   ++vcore-num_threads;
+   spin_unlock(vcore-lock);
+   vcpu-arch.vcore = vcore;
+
+   return 0;
+}
 
+static void kvmppc_free_vcores(struct kvm *kvm)
+{
+   long int i;
+
+   for (i = 0; i  KVM_MAX_VCORES; ++i)
+   kfree(kvm-arch.vcores[i]);
+   kvm-arch.online_vcores = 0;
+}
+
+static void kvmppc_setup_hv_vcpu(struct kvm_vcpu *vcpu)
+{
vcpu-arch.shared = vcpu-arch.shregs;
vcpu-arch.mmcr[0] = MMCR0_FC;
vcpu-arch.ctrl = CTRL_RUNLATCH;
-   /* default to host PVR, since we can't spoof it */
-   vcpu-arch.pvr = mfspr(SPRN_PVR);
-   kvmppc_set_pvr_hv(vcpu, vcpu-arch.pvr);
spin_lock_init(vcpu-arch.vpa_update_lock);
spin_lock_init(vcpu-arch.tbacct_lock);
vcpu-arch.busy_preempt = TB_NIL;
@@ -927,31 +946,34 @@ struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm 
*kvm, unsigned int id)
 
init_waitqueue_head(vcpu-arch.cpu_run);
 
-   mutex_lock(kvm-lock);
-   vcore = kvm-arch.vcores[core];
-   if (!vcore) {
-   vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
-   if (vcore) {
-   INIT_LIST_HEAD(vcore-runnable_threads);
-   spin_lock_init(vcore-lock);
-   init_waitqueue_head(vcore-wq);
-   vcore-preempt_tb = TB_NIL;
-   }
-   kvm-arch.vcores[core] = vcore;
-   kvm-arch.online_vcores++;
-   }
-   mutex_unlock(kvm-lock);
+   vcpu-arch.cpu_type = KVM_CPU_3S_64;
+   kvmppc_sanity_check(vcpu);
+}
 
-   if (!vcore)
+struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, unsigned int id)
+{
+   struct kvm_vcpu *vcpu;
+   int err = -EINVAL;
+
+   err = -ENOMEM;
+   vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+   if (!vcpu)
+   goto out;
+
+   err = kvm_vcpu_init(vcpu, kvm, id);
+   if (err)
goto free_vcpu;
 
-   spin_lock(vcore-lock);
-   ++vcore-num_threads;
-   spin_unlock(vcore-lock);
-   vcpu-arch.vcore = vcore;
+   /* default to host PVR, since we can't spoof it */
+   vcpu-arch.pvr = mfspr(SPRN_PVR);
 
-   vcpu-arch.cpu_type = KVM_CPU_3S_64;
-   kvmppc_sanity_check(vcpu);
+   mutex_lock(kvm-lock);
+   err = kvmppc_alloc_vcore(vcpu, id);
+   mutex_unlock(kvm-lock);
+   if (err)
+   goto free_vcpu;
+
+   kvmppc_setup_hv_vcpu(vcpu);
 
return vcpu;
 
@@ -1890,6 +1912,7 @@ void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 {
uninhibit_secondary_onlining();
 
+   kvmppc_free_vcores(kvm);
if (kvm-arch.rma) {
kvm_release_rma(kvm-arch.rma);
kvm-arch.rma = NULL;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 14/23] KVM: PPC: Book3S PR: Delay disabling relocation-on interrupts

2013-08-05 Thread Paul Mackerras
When we are running a PR KVM guest on POWER8, we have to disable the
new POWER8 feature of taking interrupts with relocation on, that is,
of taking interrupts without disabling the MMU, because the SLB does
not contain the normal kernel SLB entries while in the guest.
Currently we disable relocation-on interrupts when a PR guest is
created, and leave it disabled until there are no more PR guests in
existence.

This defers the disabling of relocation-on interrupts until the first
time a PR KVM guest vcpu is run.  The reason is that in future we will
support both PR and HV guests in the same kernel, and this will avoid
disabling relocation-on interrupts unnecessarily for guests which turn
out to be HV guests, as we will not know at VM creation time whether
it will be a PR or a HV guest.

Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/include/asm/kvm_host.h |  1 +
 arch/powerpc/kvm/book3s_pr.c| 71 ++---
 2 files changed, 52 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 4d83972..c012db2 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -264,6 +264,7 @@ struct kvm_arch {
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 #ifdef CONFIG_KVM_BOOK3S_PR
struct mutex hpt_mutex;
+   bool relon_disabled;
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
struct list_head spapr_tce_tables;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 5b06a70..2759ddc 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1197,6 +1197,47 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
+/*
+ * On POWER8, we have to disable relocation-on interrupts while
+ * we are in the guest, since the guest doesn't have the normal
+ * kernel SLB contents.  Since disabling relocation-on interrupts
+ * is a fairly heavy-weight operation, we do it once when starting
+ * the first guest vcpu and leave it disabled until the last guest
+ * has been destroyed.
+ */
+static unsigned int kvm_global_user_count = 0;
+static DEFINE_SPINLOCK(kvm_global_user_count_lock);
+
+static void disable_relon_interrupts(struct kvm *kvm)
+{
+   mutex_lock(kvm-lock);
+   if (!kvm-arch.relon_disabled) {
+   if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+   spin_lock(kvm_global_user_count_lock);
+   if (++kvm_global_user_count == 1)
+   pSeries_disable_reloc_on_exc();
+   spin_unlock(kvm_global_user_count_lock);
+   }
+   /* order disabling above with setting relon_disabled */
+   smp_mb();
+   kvm-arch.relon_disabled = true;
+   }
+   mutex_unlock(kvm-lock);
+}
+
+static void enable_relon_interrupts(struct kvm *kvm)
+{
+   if (kvm-arch.relon_disabled 
+   firmware_has_feature(FW_FEATURE_SET_MODE)) {
+   spin_lock(kvm_global_user_count_lock);
+   BUG_ON(kvm_global_user_count == 0);
+   if (--kvm_global_user_count == 0)
+   pSeries_enable_reloc_on_exc();
+   spin_unlock(kvm_global_user_count_lock);
+   }
+   kvm-arch.relon_disabled = false;
+}
+
 int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
int ret;
@@ -1234,6 +1275,9 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct 
kvm_vcpu *vcpu)
goto out;
}
 
+   if (!vcpu-kvm-arch.relon_disabled)
+   disable_relon_interrupts(vcpu-kvm);
+
/* Save FPU state in stack */
if (current-thread.regs-msr  MSR_FP)
giveup_fpu(current);
@@ -1400,9 +1444,6 @@ void kvmppc_core_flush_memslot(struct kvm *kvm, struct 
kvm_memory_slot *memslot)
 {
 }
 
-static unsigned int kvm_global_user_count = 0;
-static DEFINE_SPINLOCK(kvm_global_user_count_lock);
-
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
 #ifdef CONFIG_PPC64
@@ -1411,28 +1452,18 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 #endif
mutex_init(kvm-arch.hpt_mutex);
 
-   if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
-   spin_lock(kvm_global_user_count_lock);
-   if (++kvm_global_user_count == 1)
-   pSeries_disable_reloc_on_exc();
-   spin_unlock(kvm_global_user_count_lock);
-   }
+   /*
+* If we don't have relocation-on interrupts at all,
+* then we can consider them to be already disabled.
+*/
+   kvm-arch.relon_disabled = !firmware_has_feature(FW_FEATURE_SET_MODE);
+
return 0;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
-#ifdef CONFIG_PPC64
-   WARN_ON(!list_empty(kvm-arch.spapr_tce_tables));
-#endif
-
-   if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
-   spin_lock(kvm_global_user_count_lock);
-   

  1   2   >