Re: [uq/master PATCH] kvmvapic: add ioport read accessor

2013-05-06 Thread Jan Kiszka
On 2013-05-05 22:51, Marcelo Tosatti wrote:
 
 Necessary since the memory region accessor assumes both read and write
 methods are registered. Otherwise, reading I/O port 0x7e segfaults.
 
 https://bugzilla.redhat.com/show_bug.cgi?id=954306
 
 Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
 
 diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c
 index 5b558aa..655483b 100644
 --- a/hw/i386/kvmvapic.c
 +++ b/hw/i386/kvmvapic.c
 @@ -687,8 +687,14 @@ static void vapic_write(void *opaque, hwaddr addr, 
 uint64_t data,
  }
  }
  
 +static uint64_t vapic_read(void *opaque, hwaddr addr, unsigned size)
 +{
 +    return 0xffffffff;
 +}
 +
  static const MemoryRegionOps vapic_ops = {
  .write = vapic_write,
 +.read = vapic_read,
  .endianness = DEVICE_NATIVE_ENDIAN,
  };
  
 

Right. I'm just wondering why the guest reads from that port.

Reviewed-by: Jan Kiszka jan.kis...@siemens.com

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux


[PATCH v2 02/13] nEPT: Move gpte_access() and prefetch_invalid_gpte() to paging_tmpl.h

2013-05-06 Thread Jun Nakajima
For preparation, we just move gpte_access() and prefetch_invalid_gpte() from 
mmu.c to paging_tmpl.h.

Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
---
 arch/x86/kvm/mmu.c | 30 --
 arch/x86/kvm/paging_tmpl.h | 40 +++-
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 956ca35..a431495 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2480,26 +2480,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu 
*vcpu, gfn_t gfn,
return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
-static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- u64 gpte)
-{
-   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-   goto no_present;
-
-   if (!is_present_gpte(gpte))
-   goto no_present;
-
-   if (!(gpte & PT_ACCESSED_MASK))
-   goto no_present;
-
-   return false;
-
-no_present:
-   drop_spte(vcpu->kvm, spte);
-   return true;
-}
-
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -3399,16 +3379,6 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, 
unsigned access,
return false;
 }
 
-static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
-{
-   unsigned access;
-
-   access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-   access &= ~(gpte >> PT64_NX_SHIFT);
-
-   return access;
-}
-
 static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned 
gpte)
 {
unsigned index;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 105dd5b..13ceca6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -103,6 +103,36 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
return (ret != orig_pte);
 }
 
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ u64 gpte)
+{
+   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+   goto no_present;
+
+   if (!is_present_gpte(gpte))
+   goto no_present;
+
+   if (!(gpte & PT_ACCESSED_MASK))
+   goto no_present;
+
+   return false;
+
+no_present:
+   drop_spte(vcpu->kvm, spte);
+   return true;
+}
+
+static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+   unsigned access;
+
+   access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+   access &= ~(gpte >> PT64_NX_SHIFT);
+
+   return access;
+}
+
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 struct kvm_mmu *mmu,
 struct guest_walker *walker,
@@ -225,7 +255,7 @@ retry_walk:
}
 
accessed_dirty = pte;
-   pte_access = pt_access & gpte_access(vcpu, pte);
+   pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
 
	walker->ptes[walker->level - 1] = pte;
	} while (!is_last_gpte(mmu, walker->level, pte));
@@ -309,13 +339,13 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp,
gfn_t gfn;
pfn_t pfn;
 
-   if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+   if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
 
	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 
gfn = gpte_to_gfn(gpte);
-   pte_access = sp->role.access & gpte_access(vcpu, gpte);
+   pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
	protect_clean_gpte(&pte_access, gpte);
	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
	no_dirty_log && (pte_access & ACC_WRITE_MASK));
@@ -782,14 +812,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct 
kvm_mmu_page *sp)
  sizeof(pt_element_t)))
return -EINVAL;
 
-   if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
+   if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
	vcpu->kvm->tlbs_dirty++;
continue;
}
 
gfn = gpte_to_gfn(gpte);
	pte_access = sp->role.access;
-   pte_access &= gpte_access(vcpu, gpte);
+   pte_access &= FNAME(gpte_access)(vcpu, gpte);
	protect_clean_gpte(&pte_access, gpte);

	if (sync_mmio_spte(&sp->spt[i], gfn, 

[PATCH v2 01/13] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-05-06 Thread Jun Nakajima
Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577,
switches the EFER MSR when EPT is used and the host and guest have different
NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2)
and want to be able to run recent KVM as L1, we need to allow L1 to use this
EFER-switching feature.

To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available,
and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds
support for the former (the latter is still unsupported).

Nested entry and exit emulation (prepare_vmcs_02 and load_vmcs12_host_state,
respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all
that's left to do in this patch is to properly advertise this feature to L1.

Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using
vmx_set_efer (which itself sets one of several vmcs02 fields), so we always
support this feature, regardless of whether the host supports it.

Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
---
 arch/x86/kvm/vmx.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 867b810..485ded6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2057,6 +2057,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #else
nested_vmx_exit_ctls_high = 0;
 #endif
+   nested_vmx_exit_ctls_high |= VM_EXIT_LOAD_IA32_EFER;
 
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2064,6 +2065,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_entry_ctls_low = 0;
nested_vmx_entry_ctls_high =
VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
+   nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_IA32_EFER;
 
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -7050,10 +7052,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12)
	vcpu->arch.cr0_guest_owned_bits = ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-   /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
-   vmcs_write32(VM_EXIT_CONTROLS,
-   vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
-   vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+   /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+* we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER
+* bits are further modified by vmx_set_efer() below.
+*/
+   vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+   /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+* emulated by vmx_set_efer(), below.
+*/
+   vmcs_write32(VM_ENTRY_CONTROLS,
+   (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+   ~VM_ENTRY_IA32E_MODE) |
	(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
 
	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
-- 
1.8.1.2



[PATCH v2 03/13] nEPT: Add EPT tables support to paging_tmpl.h

2013-05-06 Thread Jun Nakajima
This is the first patch in a series which adds nested EPT support to KVM's
nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use
EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest
to set its own cr3 and take its own page faults without either of L0 or L1
getting involved. This often significantly improves L2's performance over the
previous two alternatives (shadow page tables over EPT, and shadow page
tables over shadow page tables).

This patch adds EPT support to paging_tmpl.h.

paging_tmpl.h contains the code for reading and writing page tables. The code
for 32-bit and 64-bit tables is very similar, but not identical, so
paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once
with PTTYPE=64, and this generates the two sets of similar functions.

There are subtle but important differences between the format of EPT tables
and that of ordinary x86 64-bit page tables, so for nested EPT we need a
third set of functions to read the guest EPT table and to write the shadow
EPT table.

So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed
with EPT) which correctly read and write EPT tables.
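
For illustration, here is a minimal, self-contained model of this
include-as-template technique; the file and function names below are made up
for the example and do not appear in mmu.c or paging_tmpl.h:

    /* toy_tmpl.h - a toy model of how paging_tmpl.h is parameterized */
    #if PTTYPE == 64
        #define FNAME(name) paging64_##name
    #elif PTTYPE == 32
        #define FNAME(name) paging32_##name
    #elif PTTYPE == PTTYPE_EPT
        #define FNAME(name) EPT_##name
    #else
        #error Invalid PTTYPE value
    #endif

    /* Every inclusion stamps out an independent copy of this function. */
    static int FNAME(walk)(void)
    {
        return PTTYPE;
    }

    #undef FNAME

    /* toy_mmu.c - includes the template once per page-table flavor */
    #define PTTYPE_EPT 18   /* arbitrary; must not collide with 32 or 64 */

    #define PTTYPE 64
    #include "toy_tmpl.h"
    #undef PTTYPE

    #define PTTYPE PTTYPE_EPT
    #include "toy_tmpl.h"
    #undef PTTYPE

    /* paging64_walk() and EPT_walk() now exist as distinct functions. */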

Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
---
 arch/x86/kvm/mmu.c |  5 +
 arch/x86/kvm/paging_tmpl.h | 43 +--
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a431495..cb9c6fd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3388,6 +3388,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
unsigned level, unsigned gp
	return mmu->last_pte_bitmap & (1 << index);
 }
 
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 13ceca6..5644f61 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -50,6 +50,22 @@
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
#define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+   #define pt_element_t u64
+   #define guest_walker guest_walkerEPT
+   #define FNAME(name) EPT_##name
+   #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+   #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+   #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+   #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+   #define PT_LEVEL_BITS PT64_LEVEL_BITS
+   #ifdef CONFIG_X86_64
+   #define PT_MAX_FULL_LEVELS 4
+   #define CMPXCHG cmpxchg
+   #else
+   #define CMPXCHG cmpxchg64
+   #define PT_MAX_FULL_LEVELS 2
+   #endif
 #else
#error Invalid PTTYPE value
 #endif
@@ -80,6 +96,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ *  This is compiled out for EPT because update_accessed_dirty_bits() is not used.
+ */
 static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
   pt_element_t __user *ptep_user, unsigned index,
   pt_element_t orig_pte, pt_element_t new_pte)
@@ -102,6 +122,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 
return (ret != orig_pte);
 }
+#endif
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
  struct kvm_mmu_page *sp, u64 *spte,
@@ -126,13 +147,21 @@ no_present:
 static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 {
unsigned access;
-
+#if PTTYPE == PTTYPE_EPT
+   access = (gpte & (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+ VMX_EPT_EXECUTABLE_MASK));
+#else
	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
	access &= ~(gpte >> PT64_NX_SHIFT);
+#endif
 
return access;
 }
 
+#if PTTYPE != PTTYPE_EPT
+/*
+ * EPT A/D bit support is not implemented.
+ */
 static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 struct kvm_mmu *mmu,
 struct guest_walker *walker,
@@ -169,6 +198,7 @@ static int FNAME(update_accessed_dirty_bits)(struct 
kvm_vcpu *vcpu,
}
return 0;
 }
+#endif
 
 /*
  * Fetch a guest pte for a guest virtual address
@@ -177,7 +207,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker 
*walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gva_t addr, u32 access)
 {
-   int ret;
pt_element_t pte;
pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
@@ 

[PATCH v2 04/13] nEPT: Define EPT-specific link_shadow_page()

2013-05-06 Thread Jun Nakajima
Since link_shadow_page() is used by a routine in mmu.c, add an
EPT-specific link_shadow_page() in paging_tmpl.h, rather than moving
it.

Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
---
 arch/x86/kvm/paging_tmpl.h | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 5644f61..51dca23 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -461,6 +461,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, 
struct guest_walker *gw,
}
 }
 
+#if PTTYPE == PTTYPE_EPT
+static void FNAME(link_shadow_page)(u64 *sptep, struct kvm_mmu_page *sp)
+{
+   u64 spte;
+
+   spte = __pa(sp->spt) | VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK |
+   VMX_EPT_EXECUTABLE_MASK;
+
+   mmu_spte_set(sptep, spte);
+}
+#endif
+
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  * If the guest tries to write a write-protected page, we need to
@@ -513,7 +525,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
goto out_gpte_changed;
 
if (sp)
+#if PTTYPE == PTTYPE_EPT
+   FNAME(link_shadow_page)(it.sptep, sp);
+#else
link_shadow_page(it.sptep, sp);
+#endif
}
 
for (;
@@ -533,7 +549,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
  true, direct_access, it.sptep);
+#if PTTYPE == PTTYPE_EPT
+   FNAME(link_shadow_page)(it.sptep, sp);
+#else
link_shadow_page(it.sptep, sp);
+#endif
}
 
clear_sp_write_flooding_count(it.sptep);
-- 
1.8.1.2



[PATCH v2 05/13] nEPT: MMU context for nested EPT

2013-05-06 Thread Jun Nakajima
KVM's existing shadow MMU code already supports nested TDP. To use it, we
need to set up a new MMU context for nested EPT, and create a few callbacks
for it (nested_ept_*()). This context should also use the EPT versions of
the page table access functions (defined in the previous patch).
Then, we need to switch back and forth between this nested context and the
regular MMU context when switching between L1 and L2 (when L1 runs this L2
with EPT).

Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
---
 arch/x86/kvm/mmu.c | 38 ++
 arch/x86/kvm/mmu.h |  1 +
 arch/x86/kvm/vmx.c | 53 -
 3 files changed, 91 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb9c6fd..99bfc5e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3644,6 +3644,44 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct 
kvm_mmu *context)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+   ASSERT(vcpu);
+   ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+   context->shadow_root_level = kvm_x86_ops->get_tdp_level();
+
+   context->nx = is_nx(vcpu); /* TODO: ? */
+   context->new_cr3 = paging_new_cr3;
+   context->page_fault = EPT_page_fault;
+   context->gva_to_gpa = EPT_gva_to_gpa;
+   context->sync_page = EPT_sync_page;
+   context->invlpg = EPT_invlpg;
+   context->update_pte = EPT_update_pte;
+   context->free = paging_free;
+   context->root_level = context->shadow_root_level;
+   context->root_hpa = INVALID_PAGE;
+   context->direct_map = false;
+
+   /* TODO: reset_rsvds_bits_mask() is not built for EPT, we need
+  something different.
+*/
+   reset_rsvds_bits_mask(vcpu, context);
+
+
+   /* TODO: I copied these from kvm_init_shadow_mmu, I don't know why
+  they are done, or why they write to vcpu->arch.mmu and not context
+*/
+   vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+   vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+   vcpu->arch.mmu.base_role.smep_andnot_wp =
+   kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) &&
+   !is_write_protection(vcpu);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu);
+
 static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
	int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 6987108..19dd5ab 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -54,6 +54,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 
addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool 
direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 485ded6..8fdcacf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -918,6 +918,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct 
vmcs12 *vmcs12,
	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+   return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -6873,6 +6878,46 @@ static void vmx_set_supported_cpuid(u32 func, struct 
kvm_cpuid_entry2 *entry)
	entry->ecx |= bit(X86_FEATURE_VMX);
 }
 
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+   /* return the page table to be shadowed - in our case, EPT12 */
+   return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+   struct x86_exception *fault)
+{
+   struct vmcs12 *vmcs12;
+   nested_vmx_vmexit(vcpu);
+   vmcs12 = get_vmcs12(vcpu);
+   /*
+* Note no need to set vmcs12->vm_exit_reason as it is already copied
+* from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION.
+*/
+   vmcs12->exit_qualification = fault->error_code;
+   vmcs12->guest_physical_address = fault->address;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+   int r = kvm_init_shadow_EPT_mmu(vcpu, &vcpu->arch.mmu);
+
+   vcpu->arch.mmu.set_cr3   = vmx_set_cr3;
+   vcpu->arch.mmu.get_cr3   = nested_ept_get_cr3;
+   vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+
+   vcpu->arch.walk_mmu  = 

[PATCH v2 06/13] nEPT: Fix cr3 handling in nested exit and entry

2013-05-06 Thread Jun Nakajima
The existing code for handling cr3 and related VMCS fields during nested
exit and entry wasn't correct in all cases:

If L2 is allowed to control cr3 (and this is indeed the case in nested EPT),
during nested exit we must copy the modified cr3 from vmcs02 to vmcs12, and
we forgot to do so. This patch adds this copy.

If L0 isn't controlling cr3 when running L2 (i.e., L0 is using EPT), and
whoever does control cr3 (L1 or L2) is using PAE, the processor might have
saved PDPTEs and we should also save them in vmcs12 (and restore later).

Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
---
 arch/x86/kvm/vmx.c | 37 -
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8fdcacf..d797d3e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7163,10 +7163,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, 
struct vmcs12 *vmcs12)
	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
 
-   /* shadow page tables on either EPT or shadow page tables */
+   /*
+* Note that kvm_set_cr3() and kvm_mmu_reset_context() will do the
+* right thing, and set GUEST_CR3 and/or EPT_POINTER in all supported
+* settings: 1. shadow page tables on shadow page tables, 2. shadow
+* page tables on EPT, 3. EPT on EPT.
+*/
	kvm_set_cr3(vcpu, vmcs12->guest_cr3);
kvm_mmu_reset_context(vcpu);
 
+   /*
+* Additionally, except when L0 is using shadow page tables, L1 or
+* L2 control guest_cr3 for L2, so they may also have saved PDPTEs
+*/
+   if (enable_ept) {
+   vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+   vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+   vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+   vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+   }
+
	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
 }
@@ -7398,6 +7414,25 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 
*vmcs12)
	vmcs12->guest_pending_dbg_exceptions =
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+   /*
+* In some cases (usually, nested EPT), L2 is allowed to change its
+* own CR3 without exiting. If it has changed it, we must keep it.
+* Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+* by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+*/
+   if (enable_ept)
+   vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+   /*
+* Additionally, except when L0 is using shadow page tables, L1 or
+* L2 control guest_cr3 for L2, so save their PDPTEs
+*/
+   if (enable_ept) {
+   vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+   vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+   vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+   vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+   }
+
/* TODO: These cannot have changed unless we have MSR bitmaps and
 * the relevant bit asks not to trap the change */
	vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
-- 
1.8.1.2



[PATCH v2 07/13] nEPT: Fix wrong test in kvm_set_cr3

2013-05-06 Thread Jun Nakajima
kvm_set_cr3() attempts to check if the new cr3 is a valid guest physical
address. The problem is that with nested EPT, cr3 is an *L2* physical
address, not an L1 physical address as this test expects.

As the comment above this test explains, it isn't necessary, and doesn't
correspond to anything a real processor would do. So this patch removes it.

Note that this wrong test could have also theoretically caused problems
in nested NPT, not just in nested EPT. However, in practice, the problem
was avoided: nested_svm_vmexit()/vmrun() do not call kvm_set_cr3 in the
nested NPT case, and instead set the vmcb (and arch.cr3) directly, thus
circumventing the problem. Additional potential calls to the buggy function
are avoided in that we don't trap cr3 modifications when nested NPT is
enabled. However, because in nested VMX we did want to use kvm_set_cr3()
(as requested in Avi Kivity's review of the original nested VMX patches),
we can't avoid this problem and need to fix it.

Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
---
 arch/x86/kvm/x86.c | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e172132..c34590d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -659,17 +659,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 */
}
 
-   /*
-* Does the new cr3 value map to physical memory? (Note, we
-* catch an invalid cr3 even in real-mode, because it would
-* cause trouble later on when we turn on paging anyway.)
-*
-* A real CPU would silently accept an invalid cr3 and would
-* attempt to use it - with largely undefined (and often hard
-* to debug) behavior on the guest side.
-*/
-   if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-   return 1;
	vcpu->arch.cr3 = cr3;
	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
	vcpu->arch.mmu.new_cr3(vcpu);
-- 
1.8.1.2



[PATCH v2 08/13] nEPT: Some additional comments

2013-05-06 Thread Jun Nakajima
Some additional comments to preexisting code:
Explain who (L0 or L1) handles EPT violation and misconfiguration exits.
Don't mention "shadow on either EPT or shadow" as the only two options.

Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
---
 arch/x86/kvm/vmx.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d797d3e..419b9e3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6127,7 +6127,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu 
*vcpu)
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
case EXIT_REASON_EPT_VIOLATION:
+   /*
+* L0 always deals with the EPT violation. If nested EPT is
+* used, and the nested mmu code discovers that the address is
+* missing in the guest EPT table (EPT12), the EPT violation
+* will be injected with nested_ept_inject_page_fault()
+*/
+   return 0;
case EXIT_REASON_EPT_MISCONFIG:
+   /*
+* L2 never directly uses L1's EPT, but rather L0's own EPT
+* table (shadow on EPT) or a merged EPT table that L0 built
+* (EPT on EPT). So any problems with the structure of the
+* table are L0's fault.
+*/
return 0;
case EXIT_REASON_WBINVD:
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
-- 
1.8.1.2



[PATCH v2 09/13] nEPT: Advertise EPT to L1

2013-05-06 Thread Jun Nakajima
Advertise the support of EPT to the L1 guest, through the appropriate MSR.

This is the last patch of the basic Nested EPT feature, so as to allow
bisection through this patch series: The guest will not see EPT support until
this last patch, and will not attempt to use the half-applied feature.

Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
---
 arch/x86/include/asm/vmx.h |  2 ++
 arch/x86/kvm/vmx.c | 17 +++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index b6fbf86..79a5beb 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -376,7 +376,9 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT                        (1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT                   (1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT                   (1ull << 17)
+#define VMX_EPT_INVEPT_BIT                     (1ull << 20)
 #define VMX_EPT_AD_BIT                         (1ull << 21)
+#define VMX_EPT_EXTENT_INDIVIDUAL_BIT          (1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT             (1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT              (1ull << 26)
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 419b9e3..de6cfb4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2027,6 +2027,7 @@ static u32 nested_vmx_secondary_ctls_low, 
nested_vmx_secondary_ctls_high;
 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
/*
@@ -2102,6 +2103,18 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_secondary_ctls_low = 0;
nested_vmx_secondary_ctls_high =
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+   if (enable_ept) {
+   /* nested EPT: emulate EPT also to L1 */
+   nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+   nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT;
+   nested_vmx_ept_caps |=
+   VMX_EPT_INVEPT_BIT | VMX_EPT_EXTENT_GLOBAL_BIT |
+   VMX_EPT_EXTENT_CONTEXT_BIT |
+   VMX_EPT_EXTENT_INDIVIDUAL_BIT;
+   nested_vmx_ept_caps &= vmx_capability.ept;
+   } else
+   nested_vmx_ept_caps = 0;
+
 }
 
 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2201,8 +2214,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 
msr_index, u64 *pdata)
nested_vmx_secondary_ctls_high);
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
-   /* Currently, no nested ept or nested vpid */
-   *pdata = 0;
+   /* Currently, no nested vpid support */
+   *pdata = nested_vmx_ept_caps;
break;
default:
return 0;
-- 
1.8.1.2



[PATCH v2 10/13] nEPT: Nested INVEPT

2013-05-06 Thread Jun Nakajima
If we let L1 use EPT, we should probably also support the INVEPT instruction.

In our current nested EPT implementation, when L1 changes its EPT table for
L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in the course
of this modification already calls INVEPT. Therefore, when L1 calls INVEPT,
we don't really need to do anything. In particular we *don't* need to call
the real INVEPT again. All we do in our INVEPT is verify the validity of the
call, and its parameters, and then do nothing.

In KVM Forum 2010, Dong et al. presented "Nested Virtualization Friendly KVM"
and classified our current nested EPT implementation as "shadow-like virtual
EPT". They recommended instead a different approach, which they called "VTLB-like
virtual EPT". If we had taken that alternative approach, INVEPT would have had
a bigger role: L0 would only rebuild the shadow EPT table when L1 calls INVEPT.

Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
---
 arch/x86/include/uapi/asm/vmx.h |  1 +
 arch/x86/kvm/vmx.c  | 83 +
 2 files changed, 84 insertions(+)

diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 2871fcc..ec51012 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED 45
 #define EXIT_REASON_EPT_VIOLATION   48
 #define EXIT_REASON_EPT_MISCONFIG   49
+#define EXIT_REASON_INVEPT  50
 #define EXIT_REASON_WBINVD  54
 #define EXIT_REASON_XSETBV  55
 #define EXIT_REASON_APIC_WRITE  56
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index de6cfb4..86e4022 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5879,6 +5879,87 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+   u32 vmx_instruction_info;
+   unsigned long type;
+   gva_t gva;
+   struct x86_exception e;
+   struct {
+   u64 eptp, gpa;
+   } operand;
+
+   if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+   !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   if (!nested_vmx_check_permission(vcpu))
+   return 1;
+
+   if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   /* According to the Intel VMX instruction reference, the memory
+* operand is read even if it isn't needed (e.g., for type==global)
+*/
+   vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+   if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+   vmx_instruction_info, &gva))
+   return 1;
+   if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+   sizeof(operand), &e)) {
+   kvm_inject_page_fault(vcpu, &e);
+   return 1;
+   }
+
+   type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+   switch (type) {
+   case VMX_EPT_EXTENT_GLOBAL:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_GLOBAL_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /*
+* Do nothing: when L1 changes EPT12, we already
+* update EPT02 (the shadow EPT table) and call INVEPT.
+* So when L1 calls INVEPT, there's nothing left to do.
+*/
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   case VMX_EPT_EXTENT_CONTEXT:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_CONTEXT_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /* Do nothing */
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   case VMX_EPT_EXTENT_INDIVIDUAL_ADDR:
+   if (!(nested_vmx_ept_caps & VMX_EPT_EXTENT_INDIVIDUAL_BIT))
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   else {
+   /* Do nothing */
+   nested_vmx_succeed(vcpu);
+   }
+   break;
+   default:
+   nested_vmx_failValid(vcpu,
+   VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+   }
+
+   skip_emulated_instruction(vcpu);
+   return 1;
+}
+
 /*
  * The exit handlers return 

[PATCH v2 11/13] nEPT: Miscellaneous cleanups

2013-05-06 Thread Jun Nakajima
Some trivial code cleanups not really related to nested EPT.

Signed-off-by: Nadav Har'El n...@il.ibm.com
Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
Reviewed-by: Paolo Bonzini pbonz...@redhat.com
---
 arch/x86/kvm/vmx.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 86e4022..914cdda 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -616,7 +616,6 @@ static void nested_release_page_clean(struct page *page)
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -912,8 +911,7 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, 
u32 bit)
	(vmcs12->secondary_vm_exec_control & bit);
 }
 
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-   struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 {
	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
@@ -6321,7 +6319,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
	!(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-   get_vmcs12(vcpu), vcpu)))) {
+   get_vmcs12(vcpu))))) {
	if (vmx_interrupt_allowed(vcpu)) {
	vmx->soft_vnmi_blocked = 0;
	} else if (vmx->vnmi_blocked_time > 1000000000LL &&
-- 
1.8.1.2



[PATCH v2 12/13] nEPT: Move is_rsvd_bits_set() to paging_tmpl.h

2013-05-06 Thread Jun Nakajima
Move is_rsvd_bits_set() to paging_tmpl.h so that it can be used to check
reserved bits in EPT page table entries as well.

Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
---
 arch/x86/kvm/mmu.c |  8 
 arch/x86/kvm/paging_tmpl.h | 12 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 99bfc5e..054c68b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2460,14 +2460,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-   int bit7;
-
-   bit7 = (gpte >> 7) & 1;
-   return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 bool no_dirty_log)
 {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 51dca23..777d5d7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -124,11 +124,19 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 }
 #endif
 
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+   int bit7;
+
+   bit7 = (gpte >> 7) & 1;
+   return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
  struct kvm_mmu_page *sp, u64 *spte,
  u64 gpte)
 {
-   if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+   if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
goto no_present;
 
if (!is_present_gpte(gpte))
@@ -279,7 +287,7 @@ retry_walk:
if (unlikely(!is_present_gpte(pte)))
goto error;
 
-   if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
+   if (unlikely(FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, pte,
  walker->level))) {
errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
goto error;
-- 
1.8.1.2



[PATCH v2 13/13] nEPT: Inject EPT violation/misconfiguration

2013-05-06 Thread Jun Nakajima
Add code to detect EPT misconfiguration and inject it into the L1 VMM. Also
inject a more correct exit qualification upon EPT violation to the L1
VMM.  Now L1 can correctly go to the ept_misconfig handler (instead of
wrongly going to fast_page_fault); it will try to handle the MMIO page
fault and, if that fails, it is a real EPT misconfiguration.

Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
Signed-off-by: Xinhao Xu xinhao...@intel.com
---
 arch/x86/include/asm/kvm_host.h |  4 +++
 arch/x86/kvm/mmu.c  |  5 ---
 arch/x86/kvm/mmu.h  |  5 +++
 arch/x86/kvm/paging_tmpl.h  | 26 ++
 arch/x86/kvm/vmx.c  | 79 +++--
 5 files changed, 111 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4979778..a32bda6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -262,6 +262,8 @@ struct kvm_mmu {
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
   u64 *spte, const void *pte);
+   bool (*check_tdp_pte)(u64 pte, int level);
+
hpa_t root_hpa;
int root_level;
int shadow_root_level;
@@ -504,6 +506,8 @@ struct kvm_vcpu_arch {
 * instruction.
 */
bool write_fault_to_shadow_pgtable;
+
+   unsigned long exit_qualification; /* set at EPT violation at this point */
 };
 
 struct kvm_lpage_info {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 054c68b..613fbd2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -230,11 +230,6 @@ static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t 
pfn, unsigned access)
return false;
 }
 
-static inline u64 rsvd_bits(int s, int e)
-{
-   return ((1ULL << (e - s + 1)) - 1) << s;
-}
-
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
 {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 19dd5ab..8aebd5a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -91,6 +91,11 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
 
+static inline u64 rsvd_bits(int s, int e)
+{
+   return ((1ULL << (e - s + 1)) - 1) << s;
+}
+
 /*
  * Will a fault with a given page-fault error code (pfec) cause a permission
  * fault with the given access (in ACC_* format)?
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 777d5d7..e4a0d72 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -126,10 +126,14 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 
 static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
 {
+#if PTTYPE == PTTYPE_EPT
+   return (mmu->check_tdp_pte(gpte, level));
+#else
int bit7;
 
	bit7 = (gpte >> 7) & 1;
	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+#endif
 }
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
@@ -352,6 +356,28 @@ error:
	walker->fault.vector = PF_VECTOR;
	walker->fault.error_code_valid = true;
	walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+   /*
+* Use PFERR_RSVD_MASK in error_code to tell if EPT
+* misconfiguration requires to be injected. The detection is
+* done by is_rsvd_bits_set() above.
+* 
+* We set up the value of exit_qualification to inject:
+* [2:0] -- Derive from [2:0] of real exit_qualification at EPT 
violation
+* [5:3] -- Calculated by the page walk of the guest EPT page tables
+* [7:8] -- Clear to 0.
+* 
+* The other bits are set to 0.
+*/
+   if (!(errcode & PFERR_RSVD_MASK)) {
+   unsigned long exit_qualification = vcpu->arch.exit_qualification;
+
+   pte_access = pt_access & pte;
+   vcpu->arch.exit_qualification = ((pte_access & 0x7) << 3) |
+   (exit_qualification & 0x7);
+   }
+#endif
	walker->fault.address = addr;
	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 914cdda..4edf1fe 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5074,6 +5074,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
/* ept page table is present? */
	error_code |= (exit_qualification >> 3) & 0x1;
 
+   vcpu->arch.exit_qualification = exit_qualification;
+
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
@@ -6994,7 +6996,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu 
*vcpu)
 }
 
 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
-   struct x86_exception *fault)
+struct x86_exception *fault)
 {
struct vmcs12 *vmcs12;

Re: [PATCH v2 01/13] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1

2013-05-06 Thread Jan Kiszka
On 2013-05-06 09:04, Jun Nakajima wrote:
 Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577,
 switches the EFER MSR when EPT is used and the host and guest have different
 NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2)
 and want to be able to run recent KVM as L1, we need to allow L1 to use this
 EFER-switching feature.
 
 To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available,
 and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds
 support for the former (the latter is still unsupported).
 
 Nested entry and exit emulation (prepare_vmcs_02 and load_vmcs12_host_state,
 respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all
 that's left to do in this patch is to properly advertise this feature to L1.
 
 Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using
 vmx_set_efer (which itself sets one of several vmcs02 fields), so we always
 support this feature, regardless of whether the host supports it.
 
 Signed-off-by: Nadav Har'El n...@il.ibm.com
 Signed-off-by: Jun Nakajima jun.nakaj...@intel.com
 Signed-off-by: Xinhao Xu xinhao...@intel.com
 ---
  arch/x86/kvm/vmx.c | 18 ++
  1 file changed, 14 insertions(+), 4 deletions(-)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 867b810..485ded6 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -2057,6 +2057,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
  #else
   nested_vmx_exit_ctls_high = 0;
  #endif
 + nested_vmx_exit_ctls_high |= VM_EXIT_LOAD_IA32_EFER;

You are using the wrong baseline. This does not apply against next.
Please fix (you can find the properly rebased version in my tree).

Thanks,
Jan






[PATCH 4/5] powerpc/vfio: Implement IOMMU driver for VFIO

2013-05-06 Thread aik
From: Alexey Kardashevskiy a...@ozlabs.ru

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling.  This implements an IOMMU driver for VFIO
which does mapping/unmapping pages for the guest IO and
provides information about DMA window (required by a POWER
guest).

Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Signed-off-by: Paul Mackerras pau...@samba.org
---

Change log:
* no more PPC versions for vfio_iommu_spapr_tce_dma_(un)map (type1 structs 
reused)
* documentation updated
* containter enable/disable ioctls added
* request_module(spapr_iommu) added
* various locks fixed
* multiple TCE mapping support (no clients for that for now as SPAPR
does it in a different way)


---
 Documentation/vfio.txt  |   63 ++
 drivers/vfio/Kconfig|6 +
 drivers/vfio/Makefile   |1 +
 drivers/vfio/vfio.c |1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  377 +++
 include/uapi/linux/vfio.h   |   34 
 6 files changed, 482 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 8eda363..c55533c 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls.  The 
read/write/mmap
 interfaces implement the device region access defined by the device's
 own VFIO_DEVICE_GET_REGION_INFO ioctl.
 
+
+PPC64 sPAPR implementation note
+---
+
+This implementation has some specifics:
+
+1) Only one IOMMU group per container is supported, as an IOMMU group
+represents the minimal entity for which isolation can be guaranteed and
+groups are allocated statically, one per Partitionable Endpoint (PE)
+(a PE is often a PCI domain, but not always).
+
+2) The hardware supports so-called DMA windows - the PCI address range
+within which DMA transfers are allowed; any attempt to access address space
+outside the window leads to isolation of the whole PE.
+
+3) PPC64 guests are paravirtualized but not fully emulated. There is an API
+to map/unmap pages for DMA, and it normally maps 1..32 pages per call and
+currently there is no way to reduce the number of calls. In order to make
+things faster, the map/unmap handling has been implemented in real mode,
+which provides excellent performance but has limitations such as the
+inability to do locked-page accounting in real time.
+
+So 3 additional ioctls have been added:
+
+   VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
+   of the DMA window on the PCI bus.
+
+   VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting
+   is done at this point. This lets the user first find out what
+   the DMA window is and adjust rlimit before doing any real work.
+
+   VFIO_IOMMU_DISABLE - disables the container.
+
+
+The code flow from the example above should be slightly changed:
+
+   .
+   /* Add the group to the container */
+   ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
+
+   /* Enable the IOMMU model we want */
+   ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
+
+   /* Get additional sPAPR IOMMU info */
+   struct vfio_iommu_spapr_tce_info spapr_iommu_info;
+   ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &spapr_iommu_info);
+
+   if (ioctl(container, VFIO_IOMMU_ENABLE))
+   /* Cannot enable container, may be low rlimit */
+
+   /* Allocate some space and setup a DMA mapping */
+   dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE,
+MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+
+   dma_map.size = 1024 * 1024;
+   dma_map.iova = 0; /* 1MB starting at 0x0 from device view */
+   dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+   /* Check here if .iova/.size are within the DMA window from spapr_iommu_info */
+
+   ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
+   .
+
 ---
 
 [1] VFIO was originally an acronym for Virtual Function I/O in its
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
depends on VFIO
default n
 
+config VFIO_IOMMU_SPAPR_TCE
+   tristate
+   depends on VFIO && SPAPR_TCE_IOMMU
+   default n
+
 menuconfig VFIO
	tristate "VFIO Non-Privileged userspace driver framework"
depends on IOMMU_API
select VFIO_IOMMU_TYPE1 if X86
+   select VFIO_IOMMU_SPAPR_TCE if 

[PATCH 5/5] powerpc/vfio: Enable on pSeries platform

2013-05-06 Thread aik
From: Alexey Kardashevskiy a...@ozlabs.ru

This enables VFIO on the pSeries platform, allowing user space
programs to access PCI devices directly.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/platforms/pseries/iommu.c |4 
 drivers/iommu/Kconfig  |2 +-
 drivers/vfio/Kconfig   |2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index e2685ba..e178acc 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -613,6 +613,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 
	iommu_table_setparms(pci->phb, dn, tbl);
	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
+   iommu_register_group(tbl, pci_domain_nr(bus), 0);
 
/* Divide the rest (1.75GB) among the children */
	pci->phb->dma_window_size = 0x80000000ul;
@@ -657,6 +658,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
	   ppci->phb->node);
	iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
	ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
+   iommu_register_group(tbl, pci_domain_nr(bus), 0);
	pr_debug("  created table: %p\n", ppci->iommu_table);
}
 }
@@ -683,6 +685,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
	   phb->node);
	iommu_table_setparms(phb, dn, tbl);
	PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
+   iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
	set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
return;
}
@@ -1145,6 +1148,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev 
*dev)
	   pci->phb->node);
	iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
+   iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
	pr_debug("  created table: %p\n", pci->iommu_table);
	} else {
	pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 175e0f4..2d75ea0 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -189,7 +189,7 @@ config EXYNOS_IOMMU_DEBUG
 
 config SPAPR_TCE_IOMMU
	bool "sPAPR TCE IOMMU Support"
-   depends on PPC_POWERNV
+   depends on PPC_POWERNV || PPC_PSERIES
select IOMMU_API
help
  Enables bits of IOMMU API required by VFIO. The iommu_ops
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index b464687..26b3d9d 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -12,7 +12,7 @@ menuconfig VFIO
	tristate "VFIO Non-Privileged userspace driver framework"
depends on IOMMU_API
select VFIO_IOMMU_TYPE1 if X86
-   select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
+   select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
help
  VFIO provides a framework for secure userspace device drivers.
  See Documentation/vfio.txt for more details.
-- 
1.7.10.4



[PATCH 0/5] VFIO PPC64: add VFIO support on POWERPC64

2013-05-06 Thread aik
From: Alexey Kardashevskiy a...@ozlabs.ru

The series adds support for VFIO on POWERPC in user space (such as QEMU).
The in-kernel real mode IOMMU support is added by another series posted
separately.

As the first and main aim of this series is the POWERNV platform support,
the "Enable on POWERNV platform" patch goes first and introduces an API
to be used by the VFIO IOMMU driver. The "Enable on pSeries platform" patch
simply registers PHBs in the IOMMU subsystem and expects the API to be present;
it enables VFIO support in fully emulated QEMU guests.

These patches were tested against 3.8, and the "iommu: Move initialization
earlier" patch is already in 3.9, so I am including it here only for reference.

Change log:
* cleanups and minor fixes
* added support for pSeries
* separated from in-kernel IOMMU handling series (should make it easier to get 
sob'ed)
* signed-off-by Paul Mackerras

Alexey Kardashevskiy (5):
  iommu: Move initialization earlier
  KVM: PPC: iommu: Add missing
kvm_iommu_map_pages/kvm_iommu_unmap_pages
  powerpc/vfio: Enable on POWERNV platform
  powerpc/vfio: Implement IOMMU driver for VFIO
  powerpc/vfio: Enable on pSeries platform

 Documentation/vfio.txt  |   63 +
 arch/powerpc/include/asm/iommu.h|   26 ++
 arch/powerpc/include/asm/kvm_host.h |   14 +
 arch/powerpc/kernel/iommu.c |  319 +++
 arch/powerpc/platforms/powernv/pci-ioda.c   |1 +
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |5 +-
 arch/powerpc/platforms/powernv/pci.c|2 +
 arch/powerpc/platforms/pseries/iommu.c  |4 +
 drivers/iommu/Kconfig   |8 +
 drivers/iommu/iommu.c   |2 +-
 drivers/vfio/Kconfig|6 +
 drivers/vfio/Makefile   |1 +
 drivers/vfio/vfio.c |1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  377 +++
 include/uapi/linux/vfio.h   |   34 +++
 15 files changed, 861 insertions(+), 2 deletions(-)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

-- 
1.7.10.4



[PATCH 1/5] iommu: Move initialization earlier

2013-05-06 Thread aik
From: Alexey Kardashevskiy a...@ozlabs.ru

The iommu_init() call initializes IOMMU internal structures and data
required for the API to function such as iommu_group_alloc().
It is registered as a subsys_initcall.

One of the IOMMU users is the PCI subsystem on POWER, which discovers new
IOMMU tables during the PCI scan, so the most logical place to call
iommu_group_alloc() is when a new group has just been discovered. However,
the PCI scan is done from a subsys_initcall hook as well, which makes
use of the IOMMU API impossible.

This moves IOMMU subsystem initialization one step earlier.
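
For reference, a minimal sketch of the ordering this relies on; the two hook
functions below are illustrative placeholders, while the levels come from
include/linux/init.h, where arch_initcall() registers at level 3 and
subsys_initcall() at level 4:

    #include <linux/init.h>

    /* Level 3: runs first, so iommu_group_alloc() is usable afterwards. */
    static int __init iommu_init_sketch(void)
    {
            return 0;
    }
    arch_initcall(iommu_init_sketch);

    /* Level 4: a PCI-scan-style user can now rely on the IOMMU API
     * having been initialized. */
    static int __init pci_scan_sketch(void)
    {
            return 0;
    }
    subsys_initcall(pci_scan_sketch);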

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Paul Mackerras pau...@samba.org
---
 drivers/iommu/iommu.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 5514dfa..0de83eb 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -890,7 +890,7 @@ static int __init iommu_init(void)
 
return 0;
 }
-subsys_initcall(iommu_init);
+arch_initcall(iommu_init);
 
 int iommu_domain_get_attr(struct iommu_domain *domain,
  enum iommu_attr attr, void *data)
-- 
1.7.10.4



[PATCH 2/5] KVM: PPC: iommu: Add missing kvm_iommu_map_pages/kvm_iommu_unmap_pages

2013-05-06 Thread aik
From: Alexey Kardashevskiy a...@ozlabs.ru

The IOMMU API implements groups creating/deletion, device binding
and IOMMU map/unmap operations.

The PowerPC implementation uses most of the API except map/unmap
operations, which are implemented on POWER using hypercalls.

However, in order to link a kernel with the CONFIG_IOMMU_API enabled,
the empty kvm_iommu_map_pages/kvm_iommu_unmap_pages have to be
defined, so this defines them.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/include/asm/kvm_host.h |   14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index b6a047e..c025d91 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -603,4 +603,18 @@ struct kvm_vcpu_arch {
 
 #define __KVM_HAVE_ARCH_WQP
 
+#ifdef CONFIG_IOMMU_API
+/* POWERPC does not use IOMMU API for mapping/unmapping */
+static inline int kvm_iommu_map_pages(struct kvm *kvm,
+   struct kvm_memory_slot *slot)
+{
+   return 0;
+}
+
+static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
+   struct kvm_memory_slot *slot)
+{
+}
+#endif /* CONFIG_IOMMU_API */
+
 #endif /* __POWERPC_KVM_HOST_H__ */
-- 
1.7.10.4



[PATCH 3/5] powerpc/vfio: Enable on POWERNV platform

2013-05-06 Thread aik
From: Alexey Kardashevskiy a...@ozlabs.ru

This initializes IOMMU groups based on the IOMMU configuration
discovered during the PCI scan on the POWERNV (POWER non-virtualized)
platform.  The IOMMU groups are to be used later by the VFIO driver,
which is used for PCI pass through.

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

iommu_put_tce_user_mode() does only a single page mapping;
an API for adding many mappings at once is going to be
added later.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.  As the h_put_tce hypercall is received by the host
kernel and processed by QEMU (which involves calling
the host kernel again), performance is not the best -
circa 220MB/s on a 10Gb ethernet network.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.
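
To illustrate how the new calls compose, here is a hedged sketch of a
host-side loop mapping a range one page at a time (the entry == ioba >>
IOMMU_PAGE_SHIFT relationship and the helper itself are assumptions for
illustration, not something this patch defines):

static long map_range_sketch(struct iommu_table *tbl, unsigned long ioba,
			     unsigned long tce, unsigned long npages)
{
	unsigned long i;
	long ret;

	for (i = 0; i < npages; ++i) {
		/* validate, then install one TCE per IOMMU page */
		ret = iommu_tce_put_param_check(tbl, ioba, tce);
		if (ret)
			return ret;
		ret = iommu_put_tce_user_mode(tbl,
				ioba >> IOMMU_PAGE_SHIFT, tce);
		if (ret)
			return ret;
		ioba += IOMMU_PAGE_SIZE;
		tce += IOMMU_PAGE_SIZE;
	}
	iommu_flush_tce(tbl);
	return 0;
}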

Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/include/asm/iommu.h|   26 +++
 arch/powerpc/kernel/iommu.c |  319 +++
 arch/powerpc/platforms/powernv/pci-ioda.c   |1 +
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |5 +-
 arch/powerpc/platforms/powernv/pci.c|2 +
 drivers/iommu/Kconfig   |8 +
 6 files changed, 360 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..98d1422 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
struct iommu_pool large_pool;
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map;   /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+   struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -98,6 +101,8 @@ extern void iommu_free_table(struct iommu_table *tbl, const 
char *node_name);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
int nid);
+extern void iommu_register_group(struct iommu_table *tbl,
+int pci_domain_number, unsigned long pe_num);
 
 extern int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
struct scatterlist *sglist, int nelems,
@@ -147,5 +152,26 @@ static inline void iommu_restore(void)
 }
 #endif
 
+/* The API to support IOMMU operations for VFIO */
+extern int iommu_tce_clear_param_check(struct iommu_table *tbl,
+   unsigned long ioba, unsigned long tce_value,
+   unsigned long npages);
+extern int iommu_tce_put_param_check(struct iommu_table *tbl,
+   unsigned long ioba, unsigned long tce);
+extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
+   unsigned long hwaddr, enum dma_data_direction direction);
+extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
+   unsigned long entry);
+extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
+   unsigned long entry, unsigned long pages);
+extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
+   unsigned long entry, unsigned long tce);
+
+extern void iommu_flush_tce(struct iommu_table *tbl);
+extern int iommu_take_ownership(struct iommu_table *tbl);
+extern void iommu_release_ownership(struct iommu_table *tbl);
+
+extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index c862fd7..debedd2 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -36,6 +36,8 @@
 #include <linux/hash.h>
 #include <linux/fault-inject.h>
 #include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/sched.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
@@ -44,6 +46,7 @@
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -717,6 +720,12 @@ void iommu_free_table(struct iommu_table *tbl, const char 
*node_name)
return;
}
 
+#ifdef CONFIG_IOMMU_API
+	if (tbl->it_group) {
+		iommu_group_put(tbl->it_group);
+		BUG_ON(tbl->it_group);
+	}
+#endif
 	/* verify that table contains no entries */
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
 		pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
@@ -853,3 +862,313 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t 
size,
free_pages((unsigned long)vaddr, get_order(size));
}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static void 

[PATCH 0/6] KVM: PPC: IOMMU in-kernel handling

2013-05-06 Thread Alexey Kardashevskiy
This series is supposed to accelerate IOMMU operations in real and virtual
mode in the host kernel for the KVM guest.

The first user is VFIO; however, this series does not contain any VFIO
related code, as the connection between VFIO and the new handlers is to
be made in QEMU via an ioctl on the KVM fd.

Although the series compiles, it does not make sense without the VFIO
patches, which are posted separately.

The "iommu: Add a function to find an iommu group by id" patch has already
gone to linux-next (from the iommu tree) but it is not in upstream yet so
I am including it here for reference.


Alexey Kardashevskiy (6):
  KVM: PPC: Make lookup_linux_pte public
  KVM: PPC: Add support for multiple-TCE hcalls
  powerpc: Prepare to support kernel handling of IOMMU map/unmap
  iommu: Add a function to find an iommu group by id
  KVM: PPC: Add support for IOMMU in-kernel handling
  KVM: PPC: Add hugepage support for IOMMU in-kernel handling

 Documentation/virtual/kvm/api.txt|   43 +++
 arch/powerpc/include/asm/kvm_host.h  |4 +
 arch/powerpc/include/asm/kvm_ppc.h   |   44 ++-
 arch/powerpc/include/asm/pgtable-ppc64.h |4 +
 arch/powerpc/include/uapi/asm/kvm.h  |7 +
 arch/powerpc/kvm/book3s_64_vio.c |  433 +++-
 arch/powerpc/kvm/book3s_64_vio_hv.c  |  464 --
 arch/powerpc/kvm/book3s_hv.c |   23 ++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  |5 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |6 +
 arch/powerpc/kvm/book3s_pr_papr.c|   37 ++-
 arch/powerpc/kvm/powerpc.c   |   15 +
 arch/powerpc/mm/init_64.c|   77 -
 drivers/iommu/iommu.c|   29 ++
 include/linux/iommu.h|1 +
 include/uapi/linux/kvm.h |3 +
 16 files changed, 1159 insertions(+), 36 deletions(-)

-- 
1.7.10.4



[PATCH 1/6] KVM: PPC: Make lookup_linux_pte public

2013-05-06 Thread Alexey Kardashevskiy
The lookup_linux_pte() function returns a linux PTE which is needed in
the process of converting KVM guest physical address into host real
address in real mode.

This conversion will be used by upcoming support of H_PUT_TCE_INDIRECT,
as the TCE list address comes from the guest and is a guest physical
address.  This makes lookup_linux_pte() public so that the upcoming
TCE code can call it.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/include/asm/kvm_ppc.h  |3 +++
 arch/powerpc/kvm/book3s_hv_rm_mmu.c |5 +++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 41426c9..99da298 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -379,4 +379,7 @@ static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu 
*vcpu, int ra, int rb)
return ea;
 }
 
+pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
+   int writing, unsigned long *pte_sizep);
+
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 6dcbb49..18fc382 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -134,8 +134,8 @@ static void remove_revmap_chain(struct kvm *kvm, long 
pte_index,
unlock_rmap(rmap);
 }
 
-static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
- int writing, unsigned long *pte_sizep)
+pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
+  int writing, unsigned long *pte_sizep)
 {
pte_t *ptep;
unsigned long ps = *pte_sizep;
@@ -154,6 +154,7 @@ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long 
hva,
return __pte(0);
return kvmppc_read_update_linux_pte(ptep, writing);
 }
+EXPORT_SYMBOL_GPL(lookup_linux_pte);
 
 static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
 {
-- 
1.7.10.4



[PATCH 3/6] powerpc: Prepare to support kernel handling of IOMMU map/unmap

2013-05-06 Thread Alexey Kardashevskiy
The current VFIO-on-POWER implementation supports only user mode
driven mapping, i.e. QEMU is sending requests to map/unmap pages.
However this approach is really slow, so we want to move that to KVM.
Since H_PUT_TCE can be extremely performance sensitive (especially with
network adapters where each packet needs to be mapped/unmapped) we chose
to implement that as a fast hypercall directly in real
mode (processor still in the guest context but MMU off).

To be able to do that, we need to provide some facilities to
access the struct page count within that real mode environment as things
like the sparsemem vmemmap mappings aren't accessible.

This adds an API to increment/decrement the page counter, as the
get_user_pages API used for user-mode mapping does not work
in real mode.

CONFIG_SPARSEMEM_VMEMMAP and CONFIG_FLATMEM are supported.
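
A hypothetical real-mode caller would use the pair like this (a sketch
only; the helper name is invented, and H_TOO_HARD is simply how the
later KVM patches signal "retry this hcall in virtual mode"):

static long realmode_pin_sketch(unsigned long pfn)
{
	struct page *page = realmode_pfn_to_page(pfn);

	if (!page)
		return H_TOO_HARD;	/* page struct not reachable in real mode */
	if (realmode_get_page(page))
		return H_TOO_HARD;	/* e.g. tail of a compound page */
	return H_SUCCESS;
}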

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Reviewed-by: Paul Mackerras pau...@samba.org
Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/include/asm/pgtable-ppc64.h |4 ++
 arch/powerpc/mm/init_64.c|   77 +-
 2 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index 0182c20..4c56ede 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -377,6 +377,10 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t 
*pgdir, unsigned long ea,
 }
 #endif /* !CONFIG_HUGETLB_PAGE */
 
+struct page *realmode_pfn_to_page(unsigned long pfn);
+int realmode_get_page(struct page *page);
+int realmode_put_page(struct page *page);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 95a4529..838b8ae 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -297,5 +297,80 @@ int __meminit vmemmap_populate(struct page *start_page,
 
return 0;
 }
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
+/*
+ * We do not have access to the sparsemem vmemmap, so we fallback to
+ * walking the list of sparsemem blocks which we already maintain for
+ * the sake of crashdump. In the long run, we might want to maintain
+ * a tree if performance of that linear walk becomes a problem.
+ *
+ * Any of the realmode_* functions can fail due to:
+ * 1) As real sparsemem blocks do not lie in RAM contiguously (they
+ * are in virtual address space which is not available in real mode),
+ * the requested page struct can be split between blocks so get_page/put_page
+ * may fail.
+ * 2) When huge pages are used, the get_page/put_page API will fail
+ * in real mode as the linked addresses in the page struct are virtual
+ * too.
+ * When 1) or 2) takes place, the API returns an error code to cause
+ * an exit to kernel virtual mode where the operation will be completed.
+ */
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+	struct vmemmap_backing *vmem_back;
+	struct page *page;
+	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
+	unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
+
+	for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
+		if (pg_va < vmem_back->virt_addr)
+			continue;
+
+		/* Check that page struct is not split between real pages */
+		if ((pg_va + sizeof(struct page)) >
+				(vmem_back->virt_addr + page_size))
+			return NULL;
+
+		page = (struct page *) (vmem_back->phys + pg_va -
+				vmem_back->virt_addr);
+		return page;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#elif defined(CONFIG_FLATMEM)
+
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+   struct page *page = pfn_to_page(pfn);
+   return page;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */
+
+#if defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_FLATMEM)
+int realmode_get_page(struct page *page)
+{
+   if (PageTail(page))
+   return -EAGAIN;
+
+   get_page(page);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(realmode_get_page);
+
+int realmode_put_page(struct page *page)
+{
+   if (PageCompound(page))
+   return -EAGAIN;
+
+   put_page(page);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(realmode_put_page);
+#endif
-- 
1.7.10.4



[PATCH 4/6] iommu: Add a function to find an iommu group by id

2013-05-06 Thread Alexey Kardashevskiy
As IOMMU groups are exposed to the user space by their numbers,
the user space can use them in various kernel APIs so the kernel
might need an API to find a group by its ID.

As an example, QEMU VFIO on PPC64 platform needs it to associate
a logical bus number (LIOBN) with a specific IOMMU group in order
to support in-kernel handling of DMA map/unmap requests.

This adds the iommu_group_get_by_id(id) function which performs
this search.
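
Usage is symmetric with the rest of the group API; a minimal sketch of
a caller (the surrounding code is illustrative only):

	struct iommu_group *grp = iommu_group_get_by_id(id);

	if (!grp)
		return -ENODEV;		/* no group with this number */
	/* ... associate grp with the LIOBN here ... */
	iommu_group_put(grp);	/* balance the reference taken above */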

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Signed-off-by: Paul Mackerras pau...@samba.org
---
 drivers/iommu/iommu.c |   29 +
 include/linux/iommu.h |1 +
 2 files changed, 30 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index ddbdaca..5514dfa 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -204,6 +204,35 @@ again:
 }
 EXPORT_SYMBOL_GPL(iommu_group_alloc);
 
+struct iommu_group *iommu_group_get_by_id(int id)
+{
+   struct kobject *group_kobj;
+   struct iommu_group *group;
+   const char *name;
+
+   if (!iommu_group_kset)
+   return NULL;
+
+	name = kasprintf(GFP_KERNEL, "%d", id);
+   if (!name)
+   return NULL;
+
+   group_kobj = kset_find_obj(iommu_group_kset, name);
+   kfree(name);
+
+   if (!group_kobj)
+   return NULL;
+
+   group = container_of(group_kobj, struct iommu_group, kobj);
+	BUG_ON(group->id != id);
+
+	kobject_get(group->devices_kobj);
+	kobject_put(&group->kobj);
+
+   return group;
+}
+EXPORT_SYMBOL_GPL(iommu_group_get_by_id);
+
 /**
  * iommu_group_get_iommudata - retrieve iommu_data registered for a group
  * @group: the group
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index f3b99e1..00e5d7d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -113,6 +113,7 @@ struct iommu_ops {
 extern int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops);
 extern bool iommu_present(struct bus_type *bus);
 extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus);
+extern struct iommu_group *iommu_group_get_by_id(int id);
 extern void iommu_domain_free(struct iommu_domain *domain);
 extern int iommu_attach_device(struct iommu_domain *domain,
   struct device *dev);
-- 
1.7.10.4



[PATCH 5/6] KVM: PPC: Add support for IOMMU in-kernel handling

2013-05-06 Thread Alexey Kardashevskiy
This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT
and H_STUFF_TCE requests without passing them to QEMU, which should
save time on switching to QEMU and back.

Both real and virtual modes are supported - whenever the kernel
fails to handle a TCE request in real mode, it passes it to the
virtual mode handlers. If the virtual mode handlers fail as well,
the request is passed to user space, for example, to QEMU.

This adds a new KVM_CAP_SPAPR_TCE_IOMMU ioctl to associate
a virtual PCI bus ID (LIOBN) with an IOMMU group, which enables
in-kernel handling of IOMMU map/unmap.

This adds a special case for huge pages (16MB).  The reference
counting cannot be easily done for such pages in real mode (when
MMU is off) so we added a list of huge pages.  It is populated in
virtual mode and get_page is called just once per huge page.
Real mode handlers check if the requested page is huge and in the list,
then no reference counting is done, otherwise an exit to virtual mode
happens.  The list is released at KVM exit.  At the moment the fastest
card available for tests uses up to 9 huge pages so walking through this
list is not very expensive.  However this can change and we may want
to optimize this.

This also adds the virt_only parameter to the KVM module
for debug and performance check purposes.

Tests show that this patch increases transmission speed from 220MB/s
to 750..1020MB/s on a 10Gb network (Chelsio CXGB3 10Gb ethernet card).
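
From userspace the association is a single ioctl on the VM fd; a
hypothetical invocation (a sketch: the fd, LIOBN and group number are
made up):

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int link_liobn_sketch(int vm_fd)
{
	struct kvm_create_spapr_tce_iommu args = {
		.liobn    = 0x80000000,
		.iommu_id = 3,	/* IOMMU group number from sysfs */
		.flags    = 0,	/* no flags defined yet */
	};

	return ioctl(vm_fd, KVM_CREATE_SPAPR_TCE_IOMMU, &args);
}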

Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Signed-off-by: Paul Mackerras pau...@samba.org
---
 Documentation/virtual/kvm/api.txt   |   28 
 arch/powerpc/include/asm/kvm_host.h |2 +
 arch/powerpc/include/asm/kvm_ppc.h  |2 +
 arch/powerpc/include/uapi/asm/kvm.h |7 +
 arch/powerpc/kvm/book3s_64_vio.c|  242 ++-
 arch/powerpc/kvm/book3s_64_vio_hv.c |  192 +++
 arch/powerpc/kvm/powerpc.c  |   12 ++
 include/uapi/linux/kvm.h|2 +
 8 files changed, 485 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index f621cd6..2039767 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2127,6 +2127,34 @@ written, then `n_invalid' invalid entries, invalidating 
any previously
 valid entries found.
 
 
+4.79 KVM_CREATE_SPAPR_TCE_IOMMU
+
+Capability: KVM_CAP_SPAPR_TCE_IOMMU
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce_iommu (in)
+Returns: 0 on success, -1 on error
+
+This creates a link between an IOMMU group and a hardware TCE (translation
+control entry) table. This link lets the host kernel know which IOMMU
+group (i.e. TCE table) to use for the LIOBN number passed with the
+H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls.
+
+/* for KVM_CAP_SPAPR_TCE_IOMMU */
+struct kvm_create_spapr_tce_iommu {
+   __u64 liobn;
+   __u32 iommu_id;
+   __u32 flags;
+};
+
+No flag is supported at the moment.
+
+When the guest issues a TCE call on a liobn for which a TCE table has been
+registered, the kernel will handle it in real mode, updating the hardware
+TCE table. TCE table calls for other liobns will cause a vm exit and must
+be handled by userspace.
+
+
 5. The kvm_run structure
 
 
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 36ceb0d..2b70cbc 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -178,6 +178,8 @@ struct kvmppc_spapr_tce_table {
struct kvm *kvm;
u64 liobn;
u32 window_size;
+   bool virtmode_only;
+   struct iommu_group *grp;/* used for IOMMU groups */
struct page *pages[0];
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index d501246..bdfa140 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -139,6 +139,8 @@ extern void kvmppc_xics_free(struct kvm *kvm);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
+extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
+   struct kvm_create_spapr_tce_iommu *args);
 extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
struct kvm_vcpu *vcpu, unsigned long liobn);
 extern long kvmppc_emulated_h_put_tce(struct kvmppc_spapr_tce_table *stt,
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index 681b314..b67d44b 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -291,6 +291,13 @@ struct kvm_create_spapr_tce {
__u32 window_size;
 };
 
+/* for KVM_CAP_SPAPR_TCE_IOMMU */
+struct kvm_create_spapr_tce_iommu {
+   __u64 liobn;
+   __u32 iommu_id;
+   __u32 flags;
+};
+
 /* for KVM_ALLOCATE_RMA */
 

[PATCH 2/6] KVM: PPC: Add support for multiple-TCE hcalls

2013-05-06 Thread Alexey Kardashevskiy
This adds real mode handlers for the H_PUT_TCE_INDIRECT and
H_STUFF_TCE hypercalls for QEMU emulated devices such as virtio
devices or emulated PCI.  These calls allow adding multiple entries
(up to 512) into the TCE table in one call, which saves time on
transitions to/from real mode.

This adds a guest physical to host real address converter
and calls the existing H_PUT_TCE handler. The conversion function
is going to be fully utilized by upcoming VFIO support patches.
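
The virtual-mode half of that conversion is essentially the standard
gfn-to-hva step; a sketch of the idea (mirroring the get_virt_address()
helper that appears in the diff below, with details elided):

static unsigned long gpa_to_hva_sketch(struct kvm_vcpu *vcpu,
		unsigned long gpa)
{
	unsigned long gfn = gpa >> PAGE_SHIFT;
	unsigned long hva = gfn_to_hva(vcpu->kvm, gfn);

	if (kvm_is_error_hva(hva))
		return ERROR_ADDR;	/* defined in book3s_64_vio.c below */

	return hva + (gpa & ~PAGE_MASK);
}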

This also implements the KVM_CAP_PPC_MULTITCE capability,
so in order to support the functionality of this patch, QEMU
needs to query for this capability and set the hcall-multi-tce
hypertas property only if the capability is present; otherwise
there will be serious performance degradation.
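
A hypothetical QEMU-side probe (sketch; kvm_check_extension() is the
real QEMU helper, spapr_add_hypertas() is an invented one):

/* Only advertise hcall-multi-tce when the kernel can back it;
 * otherwise the guest keeps using single-entry H_PUT_TCE. */
if (kvm_check_extension(kvm_state, KVM_CAP_PPC_MULTITCE) > 0)
	spapr_add_hypertas(hypertas, "hcall-multi-tce");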

Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Signed-off-by: Paul Mackerras pau...@samba.org
---
 Documentation/virtual/kvm/api.txt   |   15 ++
 arch/powerpc/include/asm/kvm_ppc.h  |   15 +-
 arch/powerpc/kvm/book3s_64_vio.c|  114 +++
 arch/powerpc/kvm/book3s_64_vio_hv.c |  231 +++
 arch/powerpc/kvm/book3s_hv.c|   23 +++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |6 +
 arch/powerpc/kvm/book3s_pr_papr.c   |   37 -
 arch/powerpc/kvm/powerpc.c  |3 +
 include/uapi/linux/kvm.h|1 +
 9 files changed, 413 insertions(+), 32 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index a4df553..f621cd6 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2463,3 +2463,18 @@ For mmu types KVM_MMU_FSL_BOOKE_NOHV and 
KVM_MMU_FSL_BOOKE_HV:
where num_sets is the tlb_sizes[] value divided by the tlb_ways[] value.
  - The tsize field of mas1 shall be set to 4K on TLB0, even though the
hardware ignores this value for TLB0.
+
+
+6.4 KVM_CAP_PPC_MULTITCE
+
+Architectures: ppc
+Parameters: none
+Returns: 0 on success; -1 on error
+
+This capability enables the guest to put/remove multiple TCE entries
+per hypercall, which significantly accelerates DMA operations for PPC KVM
+guests.
+
+When this capability is enabled, H_PUT_TCE_INDIRECT and H_STUFF_TCE are
+expected to occur rather than H_PUT_TCE which supports only one TCE entry
+per call.
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 99da298..d501246 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -139,8 +139,19 @@ extern void kvmppc_xics_free(struct kvm *kvm);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce *args);
-extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-unsigned long ioba, unsigned long tce);
+extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
+   struct kvm_vcpu *vcpu, unsigned long liobn);
+extern long kvmppc_emulated_h_put_tce(struct kvmppc_spapr_tce_table *stt,
+   unsigned long ioba, unsigned long tce);
+extern long kvmppc_virtmode_h_put_tce(struct kvm_vcpu *vcpu,
+   unsigned long liobn, unsigned long ioba,
+   unsigned long tce);
+extern long kvmppc_virtmode_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+   unsigned long liobn, unsigned long ioba,
+   unsigned long tce_list, unsigned long npages);
+extern long kvmppc_virtmode_h_stuff_tce(struct kvm_vcpu *vcpu,
+   unsigned long liobn, unsigned long ioba,
+   unsigned long tce_value, unsigned long npages);
 extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
struct kvm_allocate_rma *rma);
 extern struct kvmppc_linear_info *kvm_alloc_rma(void);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 72ffc89..643ac1e 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -14,6 +14,7 @@
  *
  * Copyright 2010 Paul Mackerras, IBM Corp. pau...@au1.ibm.com
  * Copyright 2011 David Gibson, IBM Corporation d...@au1.ibm.com
+ * Copyright 2013 Alexey Kardashevskiy, IBM Corporation a...@au1.ibm.com
  */
 
 #include <linux/types.h>
@@ -36,9 +37,14 @@
 #include <asm/ppc-opcode.h>
 #include <asm/kvm_host.h>
 #include <asm/udbg.h>
+#include <asm/iommu.h>
 
 #define TCES_PER_PAGE  (PAGE_SIZE / sizeof(u64))
+#define ERROR_ADDR  (~(unsigned long)0x0)
 
+/*
+ * TCE tables handlers.
+ */
 static long kvmppc_stt_npages(unsigned long window_size)
 {
 	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
@@ -148,3 +154,111 @@ fail:
}
return ret;
 }
+
+/*
+ * Virtual mode handling of IOMMU map/unmap.
+ */
+/* Converts guest physical address into host virtual */
+static unsigned long get_virt_address(struct kvm_vcpu *vcpu,
+   unsigned long gpa)
+{
+	unsigned long hva, gfn = gpa >> PAGE_SHIFT;

[PATCH 6/6] KVM: PPC: Add hugepage support for IOMMU in-kernel handling

2013-05-06 Thread Alexey Kardashevskiy
This adds special support for huge pages (16MB).  The reference
counting cannot be easily done for such pages in real mode (when
MMU is off) so we added a list of huge pages.  It is populated in
virtual mode and get_page is called just once per huge page.
Real mode handlers check if the requested page is huge and in the list,
then no reference counting is done, otherwise an exit to virtual mode
happens.  The list is released at KVM exit.  At the moment the fastest
card available for tests uses up to 9 huge pages so walking through this
list is not very expensive.  However this can change and we may want
to optimize this.
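
The real-mode lookup described above boils down to a linear list walk;
a sketch of what kvmppc_iommu_hugepage_find() amounts to (illustrative,
not the actual body):

static struct iommu_kvmppc_hugepage *hugepage_find_sketch(
		struct kvmppc_spapr_tce_table *tt, pte_t pte)
{
	struct iommu_kvmppc_hugepage *hp;

	/* Entries were created and linked in virtual mode; walking
	 * them from real mode works because they live in the kernel
	 * linear mapping. */
	list_for_each_entry(hp, &tt->hugepages, list)
		if (pte_val(hp->pte) == pte_val(pte))
			return hp;
	return NULL;
}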

Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/include/asm/kvm_host.h |2 +
 arch/powerpc/include/asm/kvm_ppc.h  |   24 +++
 arch/powerpc/kvm/book3s_64_vio.c|   79 ++-
 arch/powerpc/kvm/book3s_64_vio_hv.c |   47 -
 4 files changed, 149 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 2b70cbc..b6a047e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -180,6 +180,8 @@ struct kvmppc_spapr_tce_table {
u32 window_size;
bool virtmode_only;
struct iommu_group *grp;/* used for IOMMU groups */
+   struct list_head hugepages; /* used for IOMMU groups */
+   spinlock_t hugepages_lock;  /* used for IOMMU groups */
struct page *pages[0];
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index bdfa140..3c95464 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -154,6 +154,30 @@ extern long kvmppc_virtmode_h_put_tce_indirect(struct 
kvm_vcpu *vcpu,
 extern long kvmppc_virtmode_h_stuff_tce(struct kvm_vcpu *vcpu,
unsigned long liobn, unsigned long ioba,
unsigned long tce_value, unsigned long npages);
+
+/*
+ * The KVM guest can be backed with 16MB pages (qemu switch
+ * -mem-path /var/lib/hugetlbfs/global/pagesize-16MB/).
+ * In this case, we cannot do page counting from the real mode
+ * as the compound pages are used - they are linked in a list
+ * with pointers as virtual addresses which are inaccessible
+ * in real mode.
+ *
+ * The code below keeps a list of 16MB pages and uses the page struct
+ * in real mode if the page is already locked in RAM and inserted into
+ * the list; otherwise it switches to virtual mode where the request
+ * can be handled in the usual manner.
+ */
+struct iommu_kvmppc_hugepage {
+   struct list_head list;
+   pte_t pte;  /* Huge page PTE */
+   unsigned long pa;   /* Base phys address used as a real TCE */
+   struct page *page;  /* page struct of the very first subpage */
+   unsigned long size; /* Huge page size (always 16MB at the moment) */
+};
+extern struct iommu_kvmppc_hugepage *kvmppc_iommu_hugepage_find(
+   struct kvmppc_spapr_tce_table *tt, pte_t pte);
+
 extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
struct kvm_allocate_rma *rma);
 extern struct kvmppc_linear_info *kvm_alloc_rma(void);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 98cf949..274458d 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -54,6 +54,59 @@ static bool kvmppc_tce_virt_only = false;
 module_param_named(virt_only, kvmppc_tce_virt_only, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(virt_only, "Disable realmode handling of IOMMU map/unmap");
 
+#ifdef CONFIG_IOMMU_API
+/*
+ * Adds a new huge page descriptor to the list.
+ */
+static struct iommu_kvmppc_hugepage *kvmppc_iommu_hugepage_add(
+   struct kvmppc_spapr_tce_table *tt,
+   pte_t pte, unsigned long va, unsigned long pg_size)
+{
+   int ret;
+   struct iommu_kvmppc_hugepage *hp;
+   struct page *p;
+
+	va = va & ~(pg_size - 1);
+	ret = get_user_pages_fast(va, 1, true/*write*/, &p);
+	if ((ret != 1) || !p)
+		return NULL;
+
+	hp = kzalloc(sizeof(*hp), GFP_KERNEL);
+	if (!hp)
+		return NULL;
+
+	hp->page = p;
+	hp->pte = pte;
+	hp->pa = __pa((unsigned long) page_address(hp->page));
+	hp->size = pg_size;
+
+	spin_lock(&tt->hugepages_lock);
+	list_add(&hp->list, &tt->hugepages);
+	spin_unlock(&tt->hugepages_lock);
+
+   return hp;
+}
+
+static void kvmppc_iommu_hugepages_init(struct kvmppc_spapr_tce_table *tt)
+{
+	INIT_LIST_HEAD(&tt->hugepages);
+	spin_lock_init(&tt->hugepages_lock);
+}
+
+static void kvmppc_iommu_hugepages_cleanup(struct kvmppc_spapr_tce_table *tt)
+{
+	struct iommu_kvmppc_hugepage *hp, *tmp;
+
+	spin_lock(&tt->hugepages_lock);
+	list_for_each_entry_safe(hp, tmp, &tt->hugepages, list) {

Re: [PATCH 0/3] vhost cleanups and separate module

2013-05-06 Thread Rusty Russell
Asias He as...@redhat.com writes:
 Asias He (3):
   vhost: Remove vhost_enable_zcopy in vhost.h
   vhost: Move VHOST_NET_FEATURES to net.c
   vhost: Make vhost a separate module

I like these cleanups, MST please apply.

I have some other cleanups which are on hold for the moment pending
MST's vhost_net simplification.  MST, how's that going?

Thanks,
Rusty.


Re: [PATCH 0/3] vhost cleanups and separate module

2013-05-06 Thread Asias He
Hello Rusty, 

On Mon, May 06, 2013 at 03:41:36PM +0930, Rusty Russell wrote:
 Asias He as...@redhat.com writes:
  Asias He (3):
vhost: Remove vhost_enable_zcopy in vhost.h
vhost: Move VHOST_NET_FEATURES to net.c
vhost: Make vhost a separate module
 
 I like these cleanups, MST please apply.
 
 I have some other cleanups which are on hold for the moment pending
 MST's vhost_net simplification.  MST, how's that going?

Do you mean patches in your rusty/vringh branch? I want to do the frame
assumption conversion for vhost-scsi on top of the vringh series.

 Thanks,
 Rusty.

-- 
Asias


Re: [PATCH 4/4] vhost-net: Cleanup vhost_ubuf adn vhost_zcopy

2013-05-06 Thread Michael S. Tsirkin
Typo a/adn/and/


Re: [PATCH 4/4] vhost-net: Cleanup vhost_ubuf adn vhost_zcopy

2013-05-06 Thread Asias He
On Mon, May 6, 2013 at 4:17 PM, Michael S. Tsirkin m...@redhat.com wrote:
 Typo a/adn/and/

Yes. Caught this and fixed it already.



[PATCH v2 00/11] vhost cleanups

2013-05-06 Thread Asias He
MST, this is on top of [PATCH 0/2] vhost-net fix ubuf.

Asias He (11):
  vhost: Remove vhost_enable_zcopy in vhost.h
  vhost: Move VHOST_NET_FEATURES to net.c
  vhost: Make vhost a separate module
  vhost: Remove comments for hdr in vhost.h
  vhost: Simplify dev-vqs[i] access
  vhost-net: Cleanup vhost_ubuf and vhost_zcopy
  vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration
  vhost-scsi: Rename struct vhost_scsi *s to *vs
  vhost-scsi: Make func indention more consistent
  vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg
  vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd

 drivers/vhost/Kconfig  |   8 +
 drivers/vhost/Makefile |   3 +-
 drivers/vhost/net.c|  64 ---
 drivers/vhost/scsi.c   | 470 ++---
 drivers/vhost/vhost.c  |  86 +++--
 drivers/vhost/vhost.h  |  11 +-
 6 files changed, 361 insertions(+), 281 deletions(-)

-- 
1.8.1.4



[PATCH v2 01/11] vhost: Remove vhost_enable_zcopy in vhost.h

2013-05-06 Thread Asias He
It is net.c specific.

Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/vhost.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index cc23bc4..076c9ac 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -192,7 +192,4 @@ static inline int vhost_has_feature(struct vhost_dev *dev, 
int bit)
 	acked_features = rcu_dereference_index_check(dev->acked_features, 1);
 	return acked_features & (1 << bit);
 }
-
-void vhost_enable_zcopy(int vq);
-
 #endif
-- 
1.8.1.4



[PATCH v2 02/11] vhost: Move VHOST_NET_FEATURES to net.c

2013-05-06 Thread Asias He
vhost.h should not depend on device specific macros like
VHOST_NET_F_VIRTIO_NET_HDR and VIRTIO_NET_F_MRG_RXBUF.
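
The check itself stays generic; device code keeps testing bits through
vhost_has_feature(), e.g. (a sketch mirroring what net.c already does):

static size_t sketch_vhost_hlen(struct vhost_net *net)
{
	/* Device-private feature bits stay in net.c; the core only
	 * supplies the vhost_has_feature() test. */
	if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
		return sizeof(struct virtio_net_hdr_mrg_rxbuf);
	return sizeof(struct virtio_net_hdr);
}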

Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/net.c   | 6 ++
 drivers/vhost/vhost.h | 3 ---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 354665a..06b2447 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -59,6 +59,12 @@ MODULE_PARM_DESC(experimental_zcopytx, Enable Zero Copy TX;
 #define VHOST_DMA_IS_DONE(len) ((len) >= VHOST_DMA_DONE_LEN)
 
 enum {
+   VHOST_NET_FEATURES = VHOST_FEATURES |
+			     (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
+			     (1ULL << VIRTIO_NET_F_MRG_RXBUF),
+};
+
+enum {
VHOST_NET_VQ_RX = 0,
VHOST_NET_VQ_TX = 1,
VHOST_NET_VQ_MAX = 2,
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 076c9ac..6bf81a9 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -178,9 +178,6 @@ enum {
 			 (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
 			 (1ULL << VIRTIO_RING_F_EVENT_IDX) |
 			 (1ULL << VHOST_F_LOG_ALL),
-	VHOST_NET_FEATURES = VHOST_FEATURES |
-			 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
-			 (1ULL << VIRTIO_NET_F_MRG_RXBUF),
 };
 
 static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
-- 
1.8.1.4



[PATCH v2 03/11] vhost: Make vhost a separate module

2013-05-06 Thread Asias He
Currently, vhost-net and vhost-scsi are sharing the vhost core code.
However, vhost-scsi shares the code by including the vhost.c file
directly.

Making vhost a separate module makes it easier to share code with
other vhost devices.
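
With the core exported, a new vhost device links against vhost.ko
instead of textually including vhost.c; a minimal sketch of a
hypothetical consumer module (assuming vhost_dev_init() is exported
like the helpers in the diff below):

#include <linux/module.h>
#include "vhost.h"

static struct vhost_dev sketch_dev;
static struct vhost_virtqueue sketch_vq;
static struct vhost_virtqueue *sketch_vqs[] = { &sketch_vq };

static int __init sketch_init(void)
{
	/* Core symbols now come from vhost.ko via EXPORT_SYMBOL_GPL. */
	return vhost_dev_init(&sketch_dev, sketch_vqs, 1);
}
module_init(sketch_init);
MODULE_LICENSE("GPL");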

Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/Kconfig  |  8 
 drivers/vhost/Makefile |  3 ++-
 drivers/vhost/scsi.c   |  1 -
 drivers/vhost/vhost.c  | 51 +-
 drivers/vhost/vhost.h  |  2 ++
 5 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 8b9226d..017a1e8 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -1,6 +1,7 @@
 config VHOST_NET
 	tristate "Host kernel accelerator for virtio net"
 	depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
+   select VHOST
select VHOST_RING
---help---
  This kernel module can be loaded in host kernel to accelerate
@@ -13,6 +14,7 @@ config VHOST_NET
 config VHOST_SCSI
 	tristate "VHOST_SCSI TCM fabric driver"
 	depends on TARGET_CORE && EVENTFD && m
+   select VHOST
select VHOST_RING
default n
---help---
@@ -24,3 +26,9 @@ config VHOST_RING
---help---
  This option is selected by any driver which needs to access
  the host side of a virtio ring.
+
+config VHOST
+   tristate
+   ---help---
+ This option is selected by any driver which needs to access
+ the core of vhost.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 654e9afb..e0441c3 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,7 +1,8 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
-vhost_net-y := vhost.o net.o
+vhost_net-y := net.o
 
 obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
 vhost_scsi-y := scsi.o
 
 obj-$(CONFIG_VHOST_RING) += vringh.o
+obj-$(CONFIG_VHOST)+= vhost.o
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 5179f7a..2dcb94a 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -49,7 +49,6 @@
 #include <linux/llist.h>
 #include <linux/bitmap.h>
 
-#include "vhost.c"
 #include "vhost.h"
 
 #define TCM_VHOST_VERSION  "v0.1"
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index de9441a..e406d5f 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/kthread.h>
 #include <linux/cgroup.h>
+#include <linux/module.h>
 
 #include "vhost.h"
 
@@ -66,6 +67,7 @@ void vhost_work_init(struct vhost_work *work, vhost_work_fn_t 
fn)
 	work->flushing = 0;
 	work->queue_seq = work->done_seq = 0;
 }
+EXPORT_SYMBOL_GPL(vhost_work_init);
 
 /* Init poll structure */
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
@@ -79,6 +81,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t 
fn,
 
 	vhost_work_init(&poll->work, fn);
 }
+EXPORT_SYMBOL_GPL(vhost_poll_init);
 
 /* Start polling a file. We add ourselves to file's wait queue. The caller must
  * keep a reference to a file until after vhost_poll_stop is called. */
@@ -101,6 +104,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file 
*file)
 
return ret;
 }
+EXPORT_SYMBOL_GPL(vhost_poll_start);
 
 /* Stop polling a file. After this function returns, it becomes safe to drop 
the
  * file reference. You must also flush afterwards. */
@@ -111,6 +115,7 @@ void vhost_poll_stop(struct vhost_poll *poll)
 		poll->wqh = NULL;
}
 }
+EXPORT_SYMBOL_GPL(vhost_poll_stop);
 
 static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
unsigned seq)
@@ -123,7 +128,7 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, 
struct vhost_work *work,
 	return left <= 0;
 }
 
-static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
+void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
 {
unsigned seq;
int flushing;
@@ -138,6 +143,7 @@ static void vhost_work_flush(struct vhost_dev *dev, struct 
vhost_work *work)
 	spin_unlock_irq(&dev->work_lock);
 	BUG_ON(flushing < 0);
 }
+EXPORT_SYMBOL_GPL(vhost_work_flush);
 
 /* Flush any work that has been scheduled. When calling this, don't hold any
  * locks that are also used by the callback. */
@@ -145,6 +151,7 @@ void vhost_poll_flush(struct vhost_poll *poll)
 {
 	vhost_work_flush(poll->dev, &poll->work);
 }
+EXPORT_SYMBOL_GPL(vhost_poll_flush);
 
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
 {
@@ -158,11 +165,13 @@ void vhost_work_queue(struct vhost_dev *dev, struct 
vhost_work *work)
}
 	spin_unlock_irqrestore(&dev->work_lock, flags);
 }
+EXPORT_SYMBOL_GPL(vhost_work_queue);
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
 	vhost_work_queue(poll->dev, &poll->work);
 }
+EXPORT_SYMBOL_GPL(vhost_poll_queue);
 
 static void vhost_vq_reset(struct vhost_dev *dev,
 

[PATCH v2 04/11] vhost: Remove comments for hdr in vhost.h

2013-05-06 Thread Asias He
This comment should have been removed when hdr was moved into vhost_net_virtqueue.

Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/vhost.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 94a80eb..51aeb5f 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -101,9 +101,6 @@ struct vhost_virtqueue {
u64 log_addr;
 
struct iovec iov[UIO_MAXIOV];
-   /* hdr is used to store the virtio header.
-* Since each iovec has = 1 byte length, we never need more than
-* header length entries to store the header. */
struct iovec *indirect;
struct vring_used_elem *heads;
/* We use a kind of RCU to access private pointer.
-- 
1.8.1.4



[PATCH v2 05/11] vhost: Simplify dev-vqs[i] access

2013-05-06 Thread Asias He
Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/vhost.c | 35 ++-
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index e406d5f..74bc779 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -260,17 +260,16 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue 
*vq)
 /* Helper to allocate iovec buffers for all vqs. */
 static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 {
+   struct vhost_virtqueue *vq;
int i;
 
 	for (i = 0; i < dev->nvqs; ++i) {
-		dev->vqs[i]->indirect = kmalloc(sizeof *dev->vqs[i]->indirect *
-					       UIO_MAXIOV, GFP_KERNEL);
-		dev->vqs[i]->log = kmalloc(sizeof *dev->vqs[i]->log * UIO_MAXIOV,
-					  GFP_KERNEL);
-		dev->vqs[i]->heads = kmalloc(sizeof *dev->vqs[i]->heads *
-					    UIO_MAXIOV, GFP_KERNEL);
-		if (!dev->vqs[i]->indirect || !dev->vqs[i]->log ||
-		    !dev->vqs[i]->heads)
+		vq = dev->vqs[i];
+		vq->indirect = kmalloc(sizeof *vq->indirect * UIO_MAXIOV,
+				       GFP_KERNEL);
+		vq->log = kmalloc(sizeof *vq->log * UIO_MAXIOV, GFP_KERNEL);
+		vq->heads = kmalloc(sizeof *vq->heads * UIO_MAXIOV, GFP_KERNEL);
+		if (!vq->indirect || !vq->log || !vq->heads)
goto err_nomem;
}
return 0;
@@ -292,6 +291,7 @@ static void vhost_dev_free_iovecs(struct vhost_dev *dev)
 long vhost_dev_init(struct vhost_dev *dev,
struct vhost_virtqueue **vqs, int nvqs)
 {
+   struct vhost_virtqueue *vq;
int i;
 
 	dev->vqs = vqs;
@@ -306,15 +306,16 @@ long vhost_dev_init(struct vhost_dev *dev,
dev-worker = NULL;
 
 	for (i = 0; i < dev->nvqs; ++i) {
-		dev->vqs[i]->log = NULL;
-		dev->vqs[i]->indirect = NULL;
-		dev->vqs[i]->heads = NULL;
-		dev->vqs[i]->dev = dev;
-		mutex_init(&dev->vqs[i]->mutex);
-		vhost_vq_reset(dev, dev->vqs[i]);
-		if (dev->vqs[i]->handle_kick)
-			vhost_poll_init(&dev->vqs[i]->poll,
-					dev->vqs[i]->handle_kick, POLLIN, dev);
+		vq = dev->vqs[i];
+		vq->log = NULL;
+		vq->indirect = NULL;
+		vq->heads = NULL;
+		vq->dev = dev;
+		mutex_init(&vq->mutex);
+		vhost_vq_reset(dev, vq);
+		if (vq->handle_kick)
+			vhost_poll_init(&vq->poll, vq->handle_kick,
+					POLLIN, dev);
}
 
return 0;
-- 
1.8.1.4



[PATCH v2 06/11] vhost-net: Cleanup vhost_ubuf and vhost_zcopy

2013-05-06 Thread Asias He
- Rename vhost_ubuf to vhost_net_ubuf
- Rename vhost_zcopy_mask to vhost_net_zcopy_mask
- Make funcs static

Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/net.c | 58 +++--
 1 file changed, 30 insertions(+), 28 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 06b2447..2b51e23 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -70,7 +70,7 @@ enum {
VHOST_NET_VQ_MAX = 2,
 };
 
-struct vhost_ubuf_ref {
+struct vhost_net_ubuf_ref {
struct kref kref;
wait_queue_head_t wait;
struct vhost_virtqueue *vq;
@@ -93,7 +93,7 @@ struct vhost_net_virtqueue {
struct ubuf_info *ubuf_info;
/* Reference counting for outstanding ubufs.
 * Protected by vq mutex. Writers must also take device mutex. */
-   struct vhost_ubuf_ref *ubufs;
+   struct vhost_net_ubuf_ref *ubufs;
 };
 
 struct vhost_net {
@@ -110,24 +110,25 @@ struct vhost_net {
bool tx_flush;
 };
 
-static unsigned vhost_zcopy_mask __read_mostly;
+static unsigned vhost_net_zcopy_mask __read_mostly;
 
-void vhost_enable_zcopy(int vq)
+static void vhost_net_enable_zcopy(int vq)
 {
-	vhost_zcopy_mask |= 0x1 << vq;
+	vhost_net_zcopy_mask |= 0x1 << vq;
 }
 
-static void vhost_zerocopy_done_signal(struct kref *kref)
+static void vhost_net_zerocopy_done_signal(struct kref *kref)
 {
-   struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref,
-   kref);
+   struct vhost_net_ubuf_ref *ubufs;
+
+   ubufs = container_of(kref, struct vhost_net_ubuf_ref, kref);
 	wake_up(&ubufs->wait);
 }
 
-struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
-   bool zcopy)
+static struct vhost_net_ubuf_ref *
+vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
 {
-   struct vhost_ubuf_ref *ubufs;
+   struct vhost_net_ubuf_ref *ubufs;
/* No zero copy backend? Nothing to count. */
if (!zcopy)
return NULL;
@@ -140,14 +141,14 @@ struct vhost_ubuf_ref *vhost_ubuf_alloc(struct 
vhost_virtqueue *vq,
return ubufs;
 }
 
-void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs)
+static void vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
 {
-	kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+	kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
 }
 
-void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
+static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
 {
-	kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+	kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
 	wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
kfree(ubufs);
 }
@@ -159,7 +160,7 @@ static void vhost_net_clear_ubuf_info(struct vhost_net *n)
int i;
 
 	for (i = 0; i < n->dev.nvqs; ++i) {
-		zcopy = vhost_zcopy_mask & (0x1 << i);
+		zcopy = vhost_net_zcopy_mask & (0x1 << i);
 		if (zcopy)
 			kfree(n->vqs[i].ubuf_info);
}
@@ -171,7 +172,7 @@ int vhost_net_set_ubuf_info(struct vhost_net *n)
int i;
 
 	for (i = 0; i < n->dev.nvqs; ++i) {
-		zcopy = vhost_zcopy_mask & (0x1 << i);
+		zcopy = vhost_net_zcopy_mask & (0x1 << i);
 		if (!zcopy)
 			continue;
 		n->vqs[i].ubuf_info = kmalloc(sizeof(*n->vqs[i].ubuf_info) *
@@ -183,7 +184,7 @@ int vhost_net_set_ubuf_info(struct vhost_net *n)
 
 err:
 	while (i--) {
-		zcopy = vhost_zcopy_mask & (0x1 << i);
+		zcopy = vhost_net_zcopy_mask & (0x1 << i);
 		if (!zcopy)
 			continue;
 		kfree(n->vqs[i].ubuf_info);
@@ -305,7 +306,7 @@ static int vhost_zerocopy_signal_used(struct vhost_net *net,
 
 static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
 {
-	struct vhost_ubuf_ref *ubufs = ubuf->ctx;
+	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
 	struct vhost_virtqueue *vq = ubufs->vq;
 	int cnt = atomic_read(&ubufs->kref.refcount);
 
@@ -322,7 +323,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, 
bool success)
/* set len to mark this desc buffers done DMA */
 	vq->heads[ubuf->desc].len = success ?
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
-   vhost_ubuf_put(ubufs);
+   vhost_net_ubuf_put(ubufs);
 }
 
 /* Expects to be always run from workqueue - which acts as
@@ -345,7 +346,7 @@ static void handle_tx(struct vhost_net *net)
int err;
size_t hdr_size;
struct socket *sock;
-   struct vhost_ubuf_ref *uninitialized_var(ubufs);
+   struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
bool zcopy, zcopy_used;
 
/* TODO: check that we are running from vhost_worker? */
@@ -441,7 +442,7 @@ static void handle_tx(struct vhost_net 

[PATCH v2 07/11] vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration

2013-05-06 Thread Asias He
It was needed when struct tcm_vhost_tpg was in tcm_vhost.h.

Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/scsi.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 2dcb94a..02ddedd 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -115,7 +115,6 @@ struct tcm_vhost_nacl {
struct se_node_acl se_node_acl;
 };
 
-struct vhost_scsi;
 struct tcm_vhost_tpg {
/* Vhost port target portal group tag for TCM */
u16 tport_tpgt;
-- 
1.8.1.4



[PATCH v2 08/11] vhost-scsi: Rename struct vhost_scsi *s to *vs

2013-05-06 Thread Asias He
vs is used everywhere, make the naming more consistent.

Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/scsi.c | 56 ++--
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 02ddedd..d4798e1 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1342,63 +1342,63 @@ static int vhost_scsi_set_features(struct vhost_scsi 
*vs, u64 features)
 
 static int vhost_scsi_open(struct inode *inode, struct file *f)
 {
-   struct vhost_scsi *s;
+   struct vhost_scsi *vs;
struct vhost_virtqueue **vqs;
int r, i;
 
-   s = kzalloc(sizeof(*s), GFP_KERNEL);
-   if (!s)
+   vs = kzalloc(sizeof(*vs), GFP_KERNEL);
+   if (!vs)
return -ENOMEM;
 
vqs = kmalloc(VHOST_SCSI_MAX_VQ * sizeof(*vqs), GFP_KERNEL);
if (!vqs) {
-   kfree(s);
+   kfree(vs);
return -ENOMEM;
}
 
-   vhost_work_init(&s->vs_completion_work, vhost_scsi_complete_cmd_work);
-   vhost_work_init(&s->vs_event_work, tcm_vhost_evt_work);
+   vhost_work_init(&vs->vs_completion_work, vhost_scsi_complete_cmd_work);
+   vhost_work_init(&vs->vs_event_work, tcm_vhost_evt_work);
 
-   s->vs_events_nr = 0;
-   s->vs_events_missed = false;
+   vs->vs_events_nr = 0;
+   vs->vs_events_missed = false;
 
-   vqs[VHOST_SCSI_VQ_CTL] = &s->vqs[VHOST_SCSI_VQ_CTL].vq;
-   vqs[VHOST_SCSI_VQ_EVT] = &s->vqs[VHOST_SCSI_VQ_EVT].vq;
-   s->vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick;
-   s->vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick;
+   vqs[VHOST_SCSI_VQ_CTL] = &vs->vqs[VHOST_SCSI_VQ_CTL].vq;
+   vqs[VHOST_SCSI_VQ_EVT] = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
+   vs->vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick;
+   vs->vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick;
	for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++) {
-   vqs[i] = &s->vqs[i].vq;
-   s->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
+   vqs[i] = &vs->vqs[i].vq;
+   vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
	}
-   r = vhost_dev_init(&s->dev, vqs, VHOST_SCSI_MAX_VQ);
+   r = vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ);
 
-   tcm_vhost_init_inflight(s, NULL);
+   tcm_vhost_init_inflight(vs, NULL);
 
	if (r < 0) {
		kfree(vqs);
-   kfree(s);
+   kfree(vs);
		return r;
	}
 
-   f->private_data = s;
+   f->private_data = vs;
return 0;
 }
 
 static int vhost_scsi_release(struct inode *inode, struct file *f)
 {
-   struct vhost_scsi *s = f->private_data;
+   struct vhost_scsi *vs = f->private_data;
	struct vhost_scsi_target t;
 
-   mutex_lock(&s->dev.mutex);
-   memcpy(t.vhost_wwpn, s->vs_vhost_wwpn, sizeof(t.vhost_wwpn));
-   mutex_unlock(&s->dev.mutex);
-   vhost_scsi_clear_endpoint(s, &t);
-   vhost_dev_stop(&s->dev);
-   vhost_dev_cleanup(&s->dev, false);
+   mutex_lock(&vs->dev.mutex);
+   memcpy(t.vhost_wwpn, vs->vs_vhost_wwpn, sizeof(t.vhost_wwpn));
+   mutex_unlock(&vs->dev.mutex);
+   vhost_scsi_clear_endpoint(vs, &t);
+   vhost_dev_stop(&vs->dev);
+   vhost_dev_cleanup(&vs->dev, false);
	/* Jobs can re-queue themselves in evt kick handler. Do extra flush. */
-   vhost_scsi_flush(s);
-   kfree(s->dev.vqs);
-   kfree(s);
+   vhost_scsi_flush(vs);
+   kfree(vs->dev.vqs);
+   kfree(vs);
return 0;
 }
 
-- 
1.8.1.4



[PATCH v2 09/11] vhost-scsi: Make func indention more consistent

2013-05-06 Thread Asias He
Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/scsi.c | 154 +--
 1 file changed, 88 insertions(+), 66 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index d4798e1..d9781ed 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -333,11 +333,12 @@ static u32 tcm_vhost_get_default_depth(struct 
se_portal_group *se_tpg)
return 1;
 }
 
-static u32 tcm_vhost_get_pr_transport_id(struct se_portal_group *se_tpg,
-   struct se_node_acl *se_nacl,
-   struct t10_pr_registration *pr_reg,
-   int *format_code,
-   unsigned char *buf)
+static u32
+tcm_vhost_get_pr_transport_id(struct se_portal_group *se_tpg,
+ struct se_node_acl *se_nacl,
+ struct t10_pr_registration *pr_reg,
+ int *format_code,
+ unsigned char *buf)
 {
struct tcm_vhost_tpg *tpg = container_of(se_tpg,
struct tcm_vhost_tpg, se_tpg);
@@ -363,10 +364,11 @@ static u32 tcm_vhost_get_pr_transport_id(struct 
se_portal_group *se_tpg,
format_code, buf);
 }
 
-static u32 tcm_vhost_get_pr_transport_id_len(struct se_portal_group *se_tpg,
-   struct se_node_acl *se_nacl,
-   struct t10_pr_registration *pr_reg,
-   int *format_code)
+static u32
+tcm_vhost_get_pr_transport_id_len(struct se_portal_group *se_tpg,
+ struct se_node_acl *se_nacl,
+ struct t10_pr_registration *pr_reg,
+ int *format_code)
 {
struct tcm_vhost_tpg *tpg = container_of(se_tpg,
struct tcm_vhost_tpg, se_tpg);
@@ -392,10 +394,11 @@ static u32 tcm_vhost_get_pr_transport_id_len(struct 
se_portal_group *se_tpg,
format_code);
 }
 
-static char *tcm_vhost_parse_pr_out_transport_id(struct se_portal_group 
*se_tpg,
-   const char *buf,
-   u32 *out_tid_len,
-   char **port_nexus_ptr)
+static char *
+tcm_vhost_parse_pr_out_transport_id(struct se_portal_group *se_tpg,
+   const char *buf,
+   u32 *out_tid_len,
+   char **port_nexus_ptr)
 {
struct tcm_vhost_tpg *tpg = container_of(se_tpg,
struct tcm_vhost_tpg, se_tpg);
@@ -421,8 +424,8 @@ static char *tcm_vhost_parse_pr_out_transport_id(struct 
se_portal_group *se_tpg,
port_nexus_ptr);
 }
 
-static struct se_node_acl *tcm_vhost_alloc_fabric_acl(
-   struct se_portal_group *se_tpg)
+static struct se_node_acl *
+tcm_vhost_alloc_fabric_acl(struct se_portal_group *se_tpg)
 {
struct tcm_vhost_nacl *nacl;
 
@@ -435,8 +438,9 @@ static struct se_node_acl *tcm_vhost_alloc_fabric_acl(
	return &nacl->se_node_acl;
 }
 
-static void tcm_vhost_release_fabric_acl(struct se_portal_group *se_tpg,
-   struct se_node_acl *se_nacl)
+static void
+tcm_vhost_release_fabric_acl(struct se_portal_group *se_tpg,
+struct se_node_acl *se_nacl)
 {
struct tcm_vhost_nacl *nacl = container_of(se_nacl,
struct tcm_vhost_nacl, se_node_acl);
@@ -531,8 +535,9 @@ static void tcm_vhost_free_evt(struct vhost_scsi *vs, 
struct tcm_vhost_evt *evt)
kfree(evt);
 }
 
-static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
-   u32 event, u32 reason)
+static struct tcm_vhost_evt *
+tcm_vhost_allocate_evt(struct vhost_scsi *vs,
+  u32 event, u32 reason)
 {
	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
struct tcm_vhost_evt *evt;
@@ -576,8 +581,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd 
*tv_cmd)
kfree(tv_cmd);
 }
 
-static void tcm_vhost_do_evt_work(struct vhost_scsi *vs,
-   struct tcm_vhost_evt *evt)
+static void
+tcm_vhost_do_evt_work(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
 {
	struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
	struct virtio_scsi_event *event = &evt->event;
@@ -698,12 +703,12 @@ static void vhost_scsi_complete_cmd_work(struct 
vhost_work *work)
	vhost_signal(&vs->dev, &vs->vqs[vq].vq);
 }
 
-static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
-   struct vhost_virtqueue *vq,
-   struct tcm_vhost_tpg *tv_tpg,
-   struct virtio_scsi_cmd_req *v_req,
-   u32 exp_data_len,
-   int data_direction)
+static struct tcm_vhost_cmd *
+vhost_scsi_allocate_cmd(struct vhost_virtqueue *vq,
+   struct tcm_vhost_tpg *tv_tpg,
+   struct virtio_scsi_cmd_req *v_req,
+   u32 exp_data_len,
+   int data_direction)
 {
struct tcm_vhost_cmd *tv_cmd;
struct tcm_vhost_nexus *tv_nexus;
@@ -734,8 +739,11 @@ static struct tcm_vhost_cmd 

[PATCH v2 10/11] vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg

2013-05-06 Thread Asias He
Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/scsi.c | 122 +--
 1 file changed, 61 insertions(+), 61 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index d9781ed..353145f 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -705,7 +705,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work 
*work)
 
 static struct tcm_vhost_cmd *
 vhost_scsi_allocate_cmd(struct vhost_virtqueue *vq,
-   struct tcm_vhost_tpg *tv_tpg,
+   struct tcm_vhost_tpg *tpg,
struct virtio_scsi_cmd_req *v_req,
u32 exp_data_len,
int data_direction)
@@ -713,7 +713,7 @@ vhost_scsi_allocate_cmd(struct vhost_virtqueue *vq,
struct tcm_vhost_cmd *tv_cmd;
struct tcm_vhost_nexus *tv_nexus;
 
-   tv_nexus = tv_tpg->tpg_nexus;
+   tv_nexus = tpg->tpg_nexus;
	if (!tv_nexus) {
		pr_err("Unable to locate active struct tcm_vhost_nexus\n");
return ERR_PTR(-EIO);
@@ -895,7 +895,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct 
vhost_virtqueue *vq)
 {
struct tcm_vhost_tpg **vs_tpg;
struct virtio_scsi_cmd_req v_req;
-   struct tcm_vhost_tpg *tv_tpg;
+   struct tcm_vhost_tpg *tpg;
struct tcm_vhost_cmd *tv_cmd;
u32 exp_data_len, data_first, data_num, data_direction;
unsigned out, in, i;
@@ -981,10 +981,10 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct 
vhost_virtqueue *vq)
 
/* Extract the tpgt */
target = v_req.lun[1];
-   tv_tpg = ACCESS_ONCE(vs_tpg[target]);
+   tpg = ACCESS_ONCE(vs_tpg[target]);
 
/* Target does not exist, fail the request */
-   if (unlikely(!tv_tpg)) {
+   if (unlikely(!tpg)) {
vhost_scsi_send_bad_target(vs, vq, head, out);
continue;
}
@@ -993,7 +993,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct 
vhost_virtqueue *vq)
		for (i = 0; i < data_num; i++)
			exp_data_len += vq->iov[data_first + i].iov_len;
 
-   tv_cmd = vhost_scsi_allocate_cmd(vq, tv_tpg, v_req,
+   tv_cmd = vhost_scsi_allocate_cmd(vq, tpg, v_req,
			exp_data_len, data_direction);
		if (IS_ERR(tv_cmd)) {
			vq_err(vq, "vhost_scsi_allocate_cmd failed %ld\n",
@@ -1172,7 +1172,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
struct vhost_scsi_target *t)
 {
struct tcm_vhost_tport *tv_tport;
-   struct tcm_vhost_tpg *tv_tpg;
+   struct tcm_vhost_tpg *tpg;
struct tcm_vhost_tpg **vs_tpg;
struct vhost_virtqueue *vq;
int index, ret, i, len;
@@ -1199,32 +1199,32 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
	if (vs->vs_tpg)
		memcpy(vs_tpg, vs->vs_tpg, len);
 
-   list_for_each_entry(tv_tpg, &tcm_vhost_list, tv_tpg_list) {
-   mutex_lock(&tv_tpg->tv_tpg_mutex);
-   if (!tv_tpg->tpg_nexus) {
-   mutex_unlock(&tv_tpg->tv_tpg_mutex);
+   list_for_each_entry(tpg, &tcm_vhost_list, tv_tpg_list) {
+   mutex_lock(&tpg->tv_tpg_mutex);
+   if (!tpg->tpg_nexus) {
+   mutex_unlock(&tpg->tv_tpg_mutex);
			continue;
		}
-   if (tv_tpg->tv_tpg_vhost_count != 0) {
-   mutex_unlock(&tv_tpg->tv_tpg_mutex);
+   if (tpg->tv_tpg_vhost_count != 0) {
+   mutex_unlock(&tpg->tv_tpg_mutex);
			continue;
		}
-   tv_tport = tv_tpg->tport;
+   tv_tport = tpg->tport;
 
		if (!strcmp(tv_tport->tport_name, t->vhost_wwpn)) {
-   if (vs->vs_tpg && vs->vs_tpg[tv_tpg->tport_tpgt]) {
+   if (vs->vs_tpg && vs->vs_tpg[tpg->tport_tpgt]) {
				kfree(vs_tpg);
-   mutex_unlock(&tv_tpg->tv_tpg_mutex);
+   mutex_unlock(&tpg->tv_tpg_mutex);
				ret = -EEXIST;
				goto out;
			}
-   tv_tpg->tv_tpg_vhost_count++;
-   tv_tpg->vhost_scsi = vs;
-   vs_tpg[tv_tpg->tport_tpgt] = tv_tpg;
+   tpg->tv_tpg_vhost_count++;
+   tpg->vhost_scsi = vs;
+   vs_tpg[tpg->tport_tpgt] = tpg;
			smp_mb__after_atomic_inc();
			match = true;
		}
-   mutex_unlock(&tv_tpg->tv_tpg_mutex);
+   mutex_unlock(&tpg->tv_tpg_mutex);
}
 
if (match) {
@@ -1262,7 +1262,7 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs,
  

[PATCH v2 11/11] vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd

2013-05-06 Thread Asias He
This way, we use cmd for struct tcm_vhost_cmd and evt for struct
tcm_vhost_evt.

Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/scsi.c | 142 +--
 1 file changed, 71 insertions(+), 71 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 353145f..d860b58 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -499,28 +499,28 @@ static int tcm_vhost_get_cmd_state(struct se_cmd *se_cmd)
return 0;
 }
 
-static void vhost_scsi_complete_cmd(struct tcm_vhost_cmd *tv_cmd)
+static void vhost_scsi_complete_cmd(struct tcm_vhost_cmd *cmd)
 {
-   struct vhost_scsi *vs = tv_cmd->tvc_vhost;
+   struct vhost_scsi *vs = cmd->tvc_vhost;
 
-   llist_add(&tv_cmd->tvc_completion_list, &vs->vs_completion_list);
+   llist_add(&cmd->tvc_completion_list, &vs->vs_completion_list);
 
	vhost_work_queue(&vs->dev, &vs->vs_completion_work);
 }
 
 static int tcm_vhost_queue_data_in(struct se_cmd *se_cmd)
 {
-   struct tcm_vhost_cmd *tv_cmd = container_of(se_cmd,
+   struct tcm_vhost_cmd *cmd = container_of(se_cmd,
struct tcm_vhost_cmd, tvc_se_cmd);
-   vhost_scsi_complete_cmd(tv_cmd);
+   vhost_scsi_complete_cmd(cmd);
return 0;
 }
 
 static int tcm_vhost_queue_status(struct se_cmd *se_cmd)
 {
-   struct tcm_vhost_cmd *tv_cmd = container_of(se_cmd,
+   struct tcm_vhost_cmd *cmd = container_of(se_cmd,
struct tcm_vhost_cmd, tvc_se_cmd);
-   vhost_scsi_complete_cmd(tv_cmd);
+   vhost_scsi_complete_cmd(cmd);
return 0;
 }
 
@@ -561,24 +561,24 @@ tcm_vhost_allocate_evt(struct vhost_scsi *vs,
return evt;
 }
 
-static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
+static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *cmd)
 {
-   struct se_cmd *se_cmd = &tv_cmd->tvc_se_cmd;
+   struct se_cmd *se_cmd = &cmd->tvc_se_cmd;
 
/* TODO locking against target/backend threads? */
transport_generic_free_cmd(se_cmd, 1);
 
-   if (tv_cmd->tvc_sgl_count) {
+   if (cmd->tvc_sgl_count) {
u32 i;
-   for (i = 0; i < tv_cmd->tvc_sgl_count; i++)
-   put_page(sg_page(&tv_cmd->tvc_sgl[i]));
+   for (i = 0; i < cmd->tvc_sgl_count; i++)
+   put_page(sg_page(&cmd->tvc_sgl[i]));
 
-   kfree(tv_cmd->tvc_sgl);
+   kfree(cmd->tvc_sgl);
}
 
-   tcm_vhost_put_inflight(tv_cmd->inflight);
+   tcm_vhost_put_inflight(cmd-inflight);
 
-   kfree(tv_cmd);
+   kfree(cmd);
 }
 
 static void
@@ -661,7 +661,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work 
*work)
vs_completion_work);
DECLARE_BITMAP(signal, VHOST_SCSI_MAX_VQ);
struct virtio_scsi_cmd_resp v_rsp;
-   struct tcm_vhost_cmd *tv_cmd;
+   struct tcm_vhost_cmd *cmd;
struct llist_node *llnode;
struct se_cmd *se_cmd;
int ret, vq;
@@ -669,32 +669,32 @@ static void vhost_scsi_complete_cmd_work(struct 
vhost_work *work)
	bitmap_zero(signal, VHOST_SCSI_MAX_VQ);
	llnode = llist_del_all(&vs->vs_completion_list);
	while (llnode) {
-   tv_cmd = llist_entry(llnode, struct tcm_vhost_cmd,
+   cmd = llist_entry(llnode, struct tcm_vhost_cmd,
			     tvc_completion_list);
		llnode = llist_next(llnode);
-   se_cmd = &tv_cmd->tvc_se_cmd;
+   se_cmd = &cmd->tvc_se_cmd;
 
		pr_debug("%s tv_cmd %p resid %u status %#02x\n", __func__,
-   tv_cmd, se_cmd->residual_count, se_cmd->scsi_status);
+   cmd, se_cmd->residual_count, se_cmd->scsi_status);
 
		memset(&v_rsp, 0, sizeof(v_rsp));
		v_rsp.resid = se_cmd->residual_count;
		/* TODO is status_qualifier field needed? */
		v_rsp.status = se_cmd->scsi_status;
		v_rsp.sense_len = se_cmd->scsi_sense_length;
-   memcpy(v_rsp.sense, tv_cmd->tvc_sense_buf,
+   memcpy(v_rsp.sense, cmd->tvc_sense_buf,
		       v_rsp.sense_len);
-   ret = copy_to_user(tv_cmd->tvc_resp, &v_rsp, sizeof(v_rsp));
+   ret = copy_to_user(cmd->tvc_resp, &v_rsp, sizeof(v_rsp));
		if (likely(ret == 0)) {
			struct vhost_scsi_virtqueue *q;
-   vhost_add_used(tv_cmd->tvc_vq, tv_cmd->tvc_vq_desc, 0);
-   q = container_of(tv_cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
+   vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
+   q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
			vq = q - vs->vqs;
			__set_bit(vq, signal);
		} else
			pr_err("Faulted on virtio_scsi_cmd_resp\n");
 
-   

Re: [PATCH v2 00/11] vhost cleanups

2013-05-06 Thread Michael S. Tsirkin
On Mon, May 06, 2013 at 04:38:18PM +0800, Asias He wrote:
 MST, This is on top of [PATCH 0/2] vhost-net fix ubuf.

Acked-by: Michael S. Tsirkin m...@redhat.com

Once -rc1 is out I'll fork -next and apply them.
Thanks a lot!

Nicholas, recently attempts to push patches through both net and target
trees resulted in a bit of a mess, so let's stick to the common tree
(unless there's a dependency that forces us otherwise) until the rate of changes
in the common code calms down a bit.  OK?

 Asias He (11):
   vhost: Remove vhost_enable_zcopy in vhost.h
   vhost: Move VHOST_NET_FEATURES to net.c
   vhost: Make vhost a separate module
   vhost: Remove comments for hdr in vhost.h
   vhost: Simplify dev->vqs[i] access
   vhost-net: Cleanup vhost_ubuf and vhost_zcopy
   vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration
   vhost-scsi: Rename struct vhost_scsi *s to *vs
   vhost-scsi: Make func indention more consistent
   vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg
   vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd
 
  drivers/vhost/Kconfig  |   8 +
  drivers/vhost/Makefile |   3 +-
  drivers/vhost/net.c|  64 ---
  drivers/vhost/scsi.c   | 470 
 ++---
  drivers/vhost/vhost.c  |  86 +++--
  drivers/vhost/vhost.h  |  11 +-
  6 files changed, 361 insertions(+), 281 deletions(-)
 
 -- 
 1.8.1.4


Re: [PATCH 0/3] vhost cleanups and separate module

2013-05-06 Thread Michael S. Tsirkin
On Mon, May 06, 2013 at 03:41:36PM +0930, Rusty Russell wrote:
 Asias He as...@redhat.com writes:
  Asias He (3):
vhost: Remove vhost_enable_zcopy in vhost.h
vhost: Move VHOST_NET_FEATURES to net.c
vhost: Make vhost a separate module
 
 I like these cleanups, MST pleasee apply.

Absolutely. Except it's 3.11 material and I can only
usefully create a -next branch once -rc1 is out.

 I have some other cleanups which are on hold for the moment pending
 MST's vhost_net simplification.  MST, how's that going?

Not too well. The array of status bytes which was designed to complete
packets in order turns out to be a very efficient data structure:

It gives us a way to signal completions that is completely lockless for
multiple completers, and using the producer/consumer model saves extra
scans for the common case.
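
For readers following the thread, here is a minimal, self-contained sketch of the upend/done (producer/consumer) index pair Michael describes. The ring struct and names below are illustrative only; the real vhost-net code keeps the per-packet status in vq->heads[].len and the indices inside the virtqueue.

#include <stdint.h>
#include <stdbool.h>

#define RING 256

/* Completers (the zerocopy DMA-done callbacks) each mark only their
 * own slot, so completion signalling is lockless even with multiple
 * completers; the single consumer advances done_idx over the
 * contiguous finished prefix, one forward scan in the common case. */
struct zcopy_ring {
	bool     done[RING];	/* written by completers */
	uint16_t upend_idx;	/* producer: next slot handed to a packet */
	uint16_t done_idx;	/* consumer: first slot not yet reaped */
};

static int reap_completions(struct zcopy_ring *r)
{
	int n = 0;

	while (r->done_idx != r->upend_idx && r->done[r->done_idx % RING]) {
		r->done[r->done_idx % RING] = false;
		r->done_idx++;
		n++;
	}
	return n;
}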

Overall I can save some memory and clean up some code but can't get rid
of the producer/consumer indices (currently named upend/done indices),
which is what you asked me to do.
Your cleanups basically don't work with zcopy because they
ignore the upend/done indices?
Would you like to post them, noting they only work with zcopy off, and
we'll look for a way to apply them, together?

 Thanks,
 Rusty.


Re: [PATCH v2 03/11] vhost: Make vhost a separate module

2013-05-06 Thread Michael S. Tsirkin
On Mon, May 06, 2013 at 04:38:21PM +0800, Asias He wrote:
 Currently, vhost-net and vhost-scsi are sharing the vhost core code.
 However, vhost-scsi shares the code by including the vhost.c file
 directly.
 
 Making vhost a separate module makes it easier to share code with
 other vhost devices.
 
 Signed-off-by: Asias He as...@redhat.com
 ---
  drivers/vhost/Kconfig  |  8 
  drivers/vhost/Makefile |  3 ++-
  drivers/vhost/scsi.c   |  1 -
  drivers/vhost/vhost.c  | 51 
 +-
  drivers/vhost/vhost.h  |  2 ++
  5 files changed, 62 insertions(+), 3 deletions(-)
 
 diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
 index 8b9226d..017a1e8 100644
 --- a/drivers/vhost/Kconfig
 +++ b/drivers/vhost/Kconfig
 @@ -1,6 +1,7 @@
  config VHOST_NET
  tristate "Host kernel accelerator for virtio net"
  depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
 + select VHOST
   select VHOST_RING
   ---help---
 This kernel module can be loaded in host kernel to accelerate
 @@ -13,6 +14,7 @@ config VHOST_NET
  config VHOST_SCSI
  tristate "VHOST_SCSI TCM fabric driver"
  depends on TARGET_CORE && EVENTFD && m
 + select VHOST
   select VHOST_RING
   default n
   ---help---
 @@ -24,3 +26,9 @@ config VHOST_RING
   ---help---
 This option is selected by any driver which needs to access
 the host side of a virtio ring.
 +
 +config VHOST
 + tristate
 + ---help---
 +   This option is selected by any driver which needs to access
 +   the core of vhost.
 diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
 index 654e9afb..e0441c3 100644
 --- a/drivers/vhost/Makefile
 +++ b/drivers/vhost/Makefile
 @@ -1,7 +1,8 @@
  obj-$(CONFIG_VHOST_NET) += vhost_net.o
 -vhost_net-y := vhost.o net.o
 +vhost_net-y := net.o
  
  obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
  vhost_scsi-y := scsi.o
  
  obj-$(CONFIG_VHOST_RING) += vringh.o
 +obj-$(CONFIG_VHOST)  += vhost.o
 diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
 index 5179f7a..2dcb94a 100644
 --- a/drivers/vhost/scsi.c
 +++ b/drivers/vhost/scsi.c
 @@ -49,7 +49,6 @@
  #include <linux/llist.h>
  #include <linux/bitmap.h>
  
 -#include "vhost.c"
  #include "vhost.h"
  
  #define TCM_VHOST_VERSION  "v0.1"
 diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
 index de9441a..e406d5f 100644
 --- a/drivers/vhost/vhost.c
 +++ b/drivers/vhost/vhost.c
 @@ -25,6 +25,7 @@
  #include <linux/slab.h>
  #include <linux/kthread.h>
  #include <linux/cgroup.h>
 +#include <linux/module.h>
  
  #include "vhost.h"
  
 @@ -66,6 +67,7 @@ void vhost_work_init(struct vhost_work *work, 
 vhost_work_fn_t fn)
   work->flushing = 0;
   work->queue_seq = work->done_seq = 0;
  }
 +EXPORT_SYMBOL_GPL(vhost_work_init);
  
  /* Init poll structure */
  void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 @@ -79,6 +81,7 @@ void vhost_poll_init(struct vhost_poll *poll, 
 vhost_work_fn_t fn,
  
   vhost_work_init(&poll->work, fn);
  }
 +EXPORT_SYMBOL_GPL(vhost_poll_init);
  
  /* Start polling a file. We add ourselves to file's wait queue. The caller 
 must
   * keep a reference to a file until after vhost_poll_stop is called. */
 @@ -101,6 +104,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file 
 *file)
  
   return ret;
  }
 +EXPORT_SYMBOL_GPL(vhost_poll_start);
  
  /* Stop polling a file. After this function returns, it becomes safe to drop 
 the
   * file reference. You must also flush afterwards. */
 @@ -111,6 +115,7 @@ void vhost_poll_stop(struct vhost_poll *poll)
   poll->wqh = NULL;
   }
  }
 +EXPORT_SYMBOL_GPL(vhost_poll_stop);
  
  static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work 
 *work,
   unsigned seq)
 @@ -123,7 +128,7 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, 
 struct vhost_work *work,
   return left <= 0;
  }
  
 -static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
 +void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
  {
   unsigned seq;
   int flushing;
 @@ -138,6 +143,7 @@ static void vhost_work_flush(struct vhost_dev *dev, 
 struct vhost_work *work)
   spin_unlock_irq(&dev->work_lock);
   BUG_ON(flushing < 0);
  }
 +EXPORT_SYMBOL_GPL(vhost_work_flush);
  
  /* Flush any work that has been scheduled. When calling this, don't hold any
   * locks that are also used by the callback. */
 @@ -145,6 +151,7 @@ void vhost_poll_flush(struct vhost_poll *poll)
  {
   vhost_work_flush(poll->dev, &poll->work);
  }
 +EXPORT_SYMBOL_GPL(vhost_poll_flush);
  
  void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
  {
 @@ -158,11 +165,13 @@ void vhost_work_queue(struct vhost_dev *dev, struct 
 vhost_work *work)
   }
   spin_unlock_irqrestore(&dev->work_lock, flags);
  }
 +EXPORT_SYMBOL_GPL(vhost_work_queue);
  
  void vhost_poll_queue(struct vhost_poll *poll)
  {
   

Re: [PATCH v2 03/11] vhost: Make vhost a separate module

2013-05-06 Thread Michael S. Tsirkin
On Mon, May 06, 2013 at 04:38:21PM +0800, Asias He wrote:
 Currently, vhost-net and vhost-scsi are sharing the vhost core code.
 However, vhost-scsi shares the code by including the vhost.c file
 directly.
 
 Making vhost a separate module makes it easier to share code with
 other vhost devices.
 
 Signed-off-by: Asias He as...@redhat.com

Also this will break test.c, right? Let's fix it in the same
commit too.

 ---
  drivers/vhost/Kconfig  |  8 
  drivers/vhost/Makefile |  3 ++-
  drivers/vhost/scsi.c   |  1 -
  drivers/vhost/vhost.c  | 51 
 +-
  drivers/vhost/vhost.h  |  2 ++
  5 files changed, 62 insertions(+), 3 deletions(-)
 
 diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
 index 8b9226d..017a1e8 100644
 --- a/drivers/vhost/Kconfig
 +++ b/drivers/vhost/Kconfig
 @@ -1,6 +1,7 @@
  config VHOST_NET
  tristate "Host kernel accelerator for virtio net"
  depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
 + select VHOST
   select VHOST_RING
   ---help---
 This kernel module can be loaded in host kernel to accelerate
 @@ -13,6 +14,7 @@ config VHOST_NET
  config VHOST_SCSI
  tristate "VHOST_SCSI TCM fabric driver"
  depends on TARGET_CORE && EVENTFD && m
 + select VHOST
   select VHOST_RING
   default n
   ---help---
 @@ -24,3 +26,9 @@ config VHOST_RING
   ---help---
 This option is selected by any driver which needs to access
 the host side of a virtio ring.
 +
 +config VHOST
 + tristate
 + ---help---
 +   This option is selected by any driver which needs to access
 +   the core of vhost.
 diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
 index 654e9afb..e0441c3 100644
 --- a/drivers/vhost/Makefile
 +++ b/drivers/vhost/Makefile
 @@ -1,7 +1,8 @@
  obj-$(CONFIG_VHOST_NET) += vhost_net.o
 -vhost_net-y := vhost.o net.o
 +vhost_net-y := net.o
  
  obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
  vhost_scsi-y := scsi.o
  
  obj-$(CONFIG_VHOST_RING) += vringh.o
 +obj-$(CONFIG_VHOST)  += vhost.o
 diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
 index 5179f7a..2dcb94a 100644
 --- a/drivers/vhost/scsi.c
 +++ b/drivers/vhost/scsi.c
 @@ -49,7 +49,6 @@
  #include <linux/llist.h>
  #include <linux/bitmap.h>
  
 -#include "vhost.c"
  #include "vhost.h"
  
  #define TCM_VHOST_VERSION  "v0.1"
 diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
 index de9441a..e406d5f 100644
 --- a/drivers/vhost/vhost.c
 +++ b/drivers/vhost/vhost.c
 @@ -25,6 +25,7 @@
  #include <linux/slab.h>
  #include <linux/kthread.h>
  #include <linux/cgroup.h>
 +#include <linux/module.h>
  
  #include "vhost.h"
  
 @@ -66,6 +67,7 @@ void vhost_work_init(struct vhost_work *work, 
 vhost_work_fn_t fn)
   work->flushing = 0;
   work->queue_seq = work->done_seq = 0;
  }
 +EXPORT_SYMBOL_GPL(vhost_work_init);
  
  /* Init poll structure */
  void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 @@ -79,6 +81,7 @@ void vhost_poll_init(struct vhost_poll *poll, 
 vhost_work_fn_t fn,
  
   vhost_work_init(&poll->work, fn);
  }
 +EXPORT_SYMBOL_GPL(vhost_poll_init);
  
  /* Start polling a file. We add ourselves to file's wait queue. The caller 
 must
   * keep a reference to a file until after vhost_poll_stop is called. */
 @@ -101,6 +104,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file 
 *file)
  
   return ret;
  }
 +EXPORT_SYMBOL_GPL(vhost_poll_start);
  
  /* Stop polling a file. After this function returns, it becomes safe to drop 
 the
   * file reference. You must also flush afterwards. */
 @@ -111,6 +115,7 @@ void vhost_poll_stop(struct vhost_poll *poll)
   poll->wqh = NULL;
   }
  }
 +EXPORT_SYMBOL_GPL(vhost_poll_stop);
  
  static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work 
 *work,
   unsigned seq)
 @@ -123,7 +128,7 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, 
 struct vhost_work *work,
   return left <= 0;
  }
  
 -static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
 +void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
  {
   unsigned seq;
   int flushing;
 @@ -138,6 +143,7 @@ static void vhost_work_flush(struct vhost_dev *dev, 
 struct vhost_work *work)
   spin_unlock_irq(&dev->work_lock);
   BUG_ON(flushing < 0);
  }
 +EXPORT_SYMBOL_GPL(vhost_work_flush);
  
  /* Flush any work that has been scheduled. When calling this, don't hold any
   * locks that are also used by the callback. */
 @@ -145,6 +151,7 @@ void vhost_poll_flush(struct vhost_poll *poll)
  {
   vhost_work_flush(poll->dev, &poll->work);
  }
 +EXPORT_SYMBOL_GPL(vhost_poll_flush);
  
  void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
  {
 @@ -158,11 +165,13 @@ void vhost_work_queue(struct vhost_dev *dev, struct 
 vhost_work *work)
   }
   spin_unlock_irqrestore(&dev->work_lock, flags);
  }
 

Re: [PATCH v2 00/11] vhost cleanups

2013-05-06 Thread Michael S. Tsirkin
On Mon, May 06, 2013 at 04:38:18PM +0800, Asias He wrote:
 MST, This is on top of [PATCH 0/2] vhost-net fix ubuf.

Okay, how about making EVENT_IDX work for virtio-scsi?
I'm guessing it's some mess-up with feature negotiation;
that's what all event-idx bugs have come down to so far.
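
As background, the check such bugs usually hide behind is a one-liner. A sketch, assuming the 2013-era vhost helper shape (vhost_has_feature() and VIRTIO_RING_F_EVENT_IDX are real identifiers; the surrounding code and the callee name are illustrative placeholders):

	/* Event-idx only takes effect if the guest actually negotiated it;
	 * a feature-negotiation mess-up means this test never becomes true
	 * (bit not offered, not acked, or checked on the wrong device). */
	if (vhost_has_feature(&vs->dev, VIRTIO_RING_F_EVENT_IDX))
		use_event_idx_signalling(vq);	/* placeholder name */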

 Asias He (11):
   vhost: Remove vhost_enable_zcopy in vhost.h
   vhost: Move VHOST_NET_FEATURES to net.c
   vhost: Make vhost a separate module
   vhost: Remove comments for hdr in vhost.h
   vhost: Simplify dev->vqs[i] access
   vhost-net: Cleanup vhost_ubuf and vhost_zcopy
   vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration
   vhost-scsi: Rename struct vhost_scsi *s to *vs
   vhost-scsi: Make func indention more consistent
   vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg
   vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd
 
  drivers/vhost/Kconfig  |   8 +
  drivers/vhost/Makefile |   3 +-
  drivers/vhost/net.c|  64 ---
  drivers/vhost/scsi.c   | 470 
 ++---
  drivers/vhost/vhost.c  |  86 +++--
  drivers/vhost/vhost.h  |  11 +-
  6 files changed, 361 insertions(+), 281 deletions(-)
 
 -- 
 1.8.1.4


Re: virtio performance analysis

2013-05-06 Thread Michael S. Tsirkin
On Thu, May 02, 2013 at 09:13:29AM +0530, nitesh narayan lal wrote:
 Hi,
 I am currently working on virtio performance analysis on PowerPC.

That's nice.

 I was looking at the virtio front-end code and had done tracing by
 adding a WARN_ON condition in skb_recv_done(), xmit_skb_done()
 and virtqueue_kick().

That's a bit heavy-handed. Why not just use ftrace?

 What I have seen is that virtqueue_kick() internally calls iowrite16(),
 which will cause an exit to QEMU,

You probably should be looking at vhost-net and not
userspace virtio-net if you are interested in performance.

 now whether I send packets from the guest or receive packets in the
 guest, start_xmit() will be called, and inside start_xmit() there is a
 call to virtqueue_kick() causing a guest exit. Also, for every packet
 or ack sent from the guest there is an exception received while
 sending or receiving the next packet/ack.

Not exactly, we can buffer many events in
case guest and host are running in parallel.
See vring_need_event and its uses.
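
For reference, that suppression test is tiny; this is vring_need_event() as found in the virtio_ring headers of this era (quoted from memory, so treat the exact location and formatting as assumptions):

static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
{
	/* The other side publishes the ring index at which it wants the
	 * next notification (event_idx); a kick is needed only when
	 * new_idx has just moved past it. The unsigned arithmetic
	 * handles index wraparound. */
	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
}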

 Due to all of the above factors there will be an increase in
 signals, EXTINTs and guest exits

Increase as compared to what?

  and hence it will affect CPU
 performance.
 This is my analysis so far; it would be great if I could get some
 help on whether it seems appropriate or not.
 Regards
 Nitesh Narayan Lal


Re: [PATCH v2 06/11] vhost-net: Cleanup vhost_ubuf and vhost_zcopy

2013-05-06 Thread Michael S. Tsirkin
On Mon, May 06, 2013 at 04:38:24PM +0800, Asias He wrote:
 - Rename vhost_ubuf to vhost_net_ubuf
 - Rename vhost_zcopy_mask to vhost_net_zcopy_mask
 - Make funcs static
 
 Signed-off-by: Asias He as...@redhat.com

OK this actually fixes a warning introduced by patch 1,
so I'll pull this in too (I don't like builds with warnings).
Then your patch 1 can go in as is (the warnings this might trigger
during bisect builds don't worry me).

 ---
  drivers/vhost/net.c | 58 
 +++--
  1 file changed, 30 insertions(+), 28 deletions(-)
 
 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index 06b2447..2b51e23 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -70,7 +70,7 @@ enum {
   VHOST_NET_VQ_MAX = 2,
  };
  
 -struct vhost_ubuf_ref {
 +struct vhost_net_ubuf_ref {
   struct kref kref;
   wait_queue_head_t wait;
   struct vhost_virtqueue *vq;
 @@ -93,7 +93,7 @@ struct vhost_net_virtqueue {
   struct ubuf_info *ubuf_info;
   /* Reference counting for outstanding ubufs.
* Protected by vq mutex. Writers must also take device mutex. */
 - struct vhost_ubuf_ref *ubufs;
 + struct vhost_net_ubuf_ref *ubufs;
  };
  
  struct vhost_net {
 @@ -110,24 +110,25 @@ struct vhost_net {
   bool tx_flush;
  };
  
 -static unsigned vhost_zcopy_mask __read_mostly;
 +static unsigned vhost_net_zcopy_mask __read_mostly;
  
 -void vhost_enable_zcopy(int vq)
 +static void vhost_net_enable_zcopy(int vq)
  {
 - vhost_zcopy_mask |= 0x1 << vq;
 + vhost_net_zcopy_mask |= 0x1 << vq;
  }
  
 -static void vhost_zerocopy_done_signal(struct kref *kref)
 +static void vhost_net_zerocopy_done_signal(struct kref *kref)
  {
 - struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref,
 - kref);
 + struct vhost_net_ubuf_ref *ubufs;
 +
 + ubufs = container_of(kref, struct vhost_net_ubuf_ref, kref);
   wake_up(&ubufs->wait);
  }
  
 -struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
 - bool zcopy)
 +static struct vhost_net_ubuf_ref *
 +vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
  {
 - struct vhost_ubuf_ref *ubufs;
 + struct vhost_net_ubuf_ref *ubufs;
   /* No zero copy backend? Nothing to count. */
   if (!zcopy)
   return NULL;
 @@ -140,14 +141,14 @@ struct vhost_ubuf_ref *vhost_ubuf_alloc(struct 
 vhost_virtqueue *vq,
   return ubufs;
  }
  
 -void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs)
 +static void vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
  {
 - kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
 + kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
  }
  
 -void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
 +static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
  {
 - kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
 + kref_put(&ubufs->kref, vhost_net_zerocopy_done_signal);
   wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
   kfree(ubufs);
  }
 @@ -159,7 +160,7 @@ static void vhost_net_clear_ubuf_info(struct vhost_net *n)
   int i;
  
   for (i = 0; i < n->dev.nvqs; ++i) {
 - zcopy = vhost_zcopy_mask & (0x1 << i);
 + zcopy = vhost_net_zcopy_mask & (0x1 << i);
   if (zcopy)
   kfree(n->vqs[i].ubuf_info);
   }
 @@ -171,7 +172,7 @@ int vhost_net_set_ubuf_info(struct vhost_net *n)
   int i;
  
   for (i = 0; i < n->dev.nvqs; ++i) {
 - zcopy = vhost_zcopy_mask & (0x1 << i);
 + zcopy = vhost_net_zcopy_mask & (0x1 << i);
   if (!zcopy)
   continue;
   n->vqs[i].ubuf_info = kmalloc(sizeof(*n->vqs[i].ubuf_info) *
 @@ -183,7 +184,7 @@ int vhost_net_set_ubuf_info(struct vhost_net *n)
  
  err:
   while (i--) {
 - zcopy = vhost_zcopy_mask & (0x1 << i);
 + zcopy = vhost_net_zcopy_mask & (0x1 << i);
   if (!zcopy)
   continue;
   kfree(n->vqs[i].ubuf_info);
 @@ -305,7 +306,7 @@ static int vhost_zerocopy_signal_used(struct vhost_net 
 *net,
  
  static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
  {
 - struct vhost_ubuf_ref *ubufs = ubuf->ctx;
 + struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
   struct vhost_virtqueue *vq = ubufs->vq;
   int cnt = atomic_read(&ubufs->kref.refcount);
  
 @@ -322,7 +323,7 @@ static void vhost_zerocopy_callback(struct ubuf_info 
 *ubuf, bool success)
   /* set len to mark this desc buffers done DMA */
   vq->heads[ubuf->desc].len = success ?
   VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
 - vhost_ubuf_put(ubufs);
 + vhost_net_ubuf_put(ubufs);
  }
  
  /* Expects to be always run from workqueue - which acts as
 @@ -345,7 +346,7 @@ static void handle_tx(struct vhost_net *net)
   int err;
   size_t hdr_size;
   

[PATCH] vhost: drop virtio_net.h dependency

2013-05-06 Thread Michael S. Tsirkin
There's no net specific code in vhost.c anymore,
don't include the virtio_net.h header.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---

This is on top of Asias' patches, already queued so
just FYI.

 drivers/vhost/vhost.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index de9441a..dcde269 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -13,7 +13,6 @@
 
 #include <linux/eventfd.h>
 #include <linux/vhost.h>
-#include <linux/virtio_net.h>
 #include <linux/mm.h>
 #include <linux/mmu_context.h>
 #include <linux/miscdevice.h>
-- 
MST


Re: [PATCH v17 RESEND] pvpanic: pvpanic device driver

2013-05-06 Thread Marcelo Tosatti
On Mon, May 06, 2013 at 11:39:35AM +0800, Hu Tao wrote:
 On Fri, May 03, 2013 at 06:59:18PM -0300, Marcelo Tosatti wrote:
  On Fri, May 03, 2013 at 10:47:10AM +0800, Hu Tao wrote:
    The pvpanic device is a qemu-simulated device through which a guest panic
    event is sent to the host.
   
   Signed-off-by: Hu Tao hu...@cn.fujitsu.com
   ---
drivers/platform/x86/Kconfig   |   7 +++
drivers/platform/x86/Makefile  |   2 +
drivers/platform/x86/pvpanic.c | 115 
   +
3 files changed, 124 insertions(+)
create mode 100644 drivers/platform/x86/pvpanic.c
   
   diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
   index 3338437..527ed04 100644
   --- a/drivers/platform/x86/Kconfig
   +++ b/drivers/platform/x86/Kconfig
   @@ -781,4 +781,11 @@ config APPLE_GMUX
   graphics as well as the backlight. Currently only backlight
   control is supported by the driver.

   +config PVPANIC
    + tristate "pvpanic device support"
   + depends on ACPI
   + ---help---
    +   This driver provides support for the pvpanic device, which is a qemu
    +   simulated device through which a guest panic event is sent to the host.
   +
endif # X86_PLATFORM_DEVICES
   diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
   index ace2b38..ef0ec74 100644
   --- a/drivers/platform/x86/Makefile
   +++ b/drivers/platform/x86/Makefile
   @@ -51,3 +51,5 @@ obj-$(CONFIG_INTEL_OAKTRAIL)+= intel_oaktrail.o
obj-$(CONFIG_SAMSUNG_Q10)+= samsung-q10.o
obj-$(CONFIG_APPLE_GMUX) += apple-gmux.o
obj-$(CONFIG_CHROMEOS_LAPTOP)+= chromeos_laptop.o
   +
   +obj-$(CONFIG_PVPANIC)   += pvpanic.o
   diff --git a/drivers/platform/x86/pvpanic.c 
   b/drivers/platform/x86/pvpanic.c
   new file mode 100644
   index 000..81c95ec
   --- /dev/null
   +++ b/drivers/platform/x86/pvpanic.c
   @@ -0,0 +1,115 @@
   +/*
   + *  pvpanic.c - pvpanic Device Support
   + *
   + *  Copyright (C) 2013 Fujitsu.
   + *
   + *  This program is free software; you can redistribute it and/or modify
   + *  it under the terms of the GNU General Public License as published by
   + *  the Free Software Foundation; either version 2 of the License, or
   + *  (at your option) any later version.
   + *
   + *  This program is distributed in the hope that it will be useful,
   + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
   + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   + *  GNU General Public License for more details.
   + *
   + *  You should have received a copy of the GNU General Public License
   + *  along with this program; if not, write to the Free Software
   + *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  
   02110-1301  USA
   + */
   +
    +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   +
    +#include <linux/kernel.h>
    +#include <linux/module.h>
    +#include <linux/init.h>
    +#include <linux/types.h>
    +#include <acpi/acpi_bus.h>
    +#include <acpi/acpi_drivers.h>
   +
    +MODULE_AUTHOR("Hu Tao <hu...@cn.fujitsu.com>");
    +MODULE_DESCRIPTION("pvpanic device driver");
    +MODULE_LICENSE("GPL");
   +
   +static int pvpanic_add(struct acpi_device *device);
   +static int pvpanic_remove(struct acpi_device *device);
   +
   +static const struct acpi_device_id pvpanic_device_ids[] = {
    + { "QEMU0001", 0},
    + { "", 0},
   +};
   +MODULE_DEVICE_TABLE(acpi, pvpanic_device_ids);
   +
    +#define PVPANIC_PANICKED (1 << 0)
   +
   +static acpi_handle handle;
   +
   +static struct acpi_driver pvpanic_driver = {
    + .name = "pvpanic",
    + .class = "QEMU",
   + .ids =  pvpanic_device_ids,
   + .ops =  {
   + .add =  pvpanic_add,
   + .remove =   pvpanic_remove,
   + },
   + .owner =THIS_MODULE,
   +};
   +
   +static void
   +pvpanic_send_event(unsigned int event)
   +{
   + union acpi_object arg;
   + struct acpi_object_list arg_list;
   +
   + if (!handle)
   + return;
   +
   + arg.type = ACPI_TYPE_INTEGER;
   + arg.integer.value = event;
   +
   + arg_list.count = 1;
    + arg_list.pointer = &arg;
    +
    + acpi_evaluate_object(handle, "WRPT", &arg_list, NULL);
   +}
  
  Is it safe to call acpi_evaluate_object from a panic notifier? For
  example:
  
  - Has it been confirmed that no code invoked via acpi_evaluate_object can 
  panic() ?
 
 Confirmed.
 
  - acpi_ex_enter_interpreter grabs a mutex. Is that path ever used?
 
 Unfortunately yes. As far as I can tell, there are two places in the path that grab
 a mutex: when searching the namespace for the method, and when executing
 the method. I didn't find a non-blocking version of acpi_evaluate_object.

Then that's a bug, because the pvpanic notifier is called with interrupts
disabled.

I suppose it's safer to use an OUT instruction?
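
For illustration, a minimal sketch of that direction; the port number below matches the one QEMU's pvpanic device uses, but treat it and the function shape as assumptions rather than this thread's final code:

#include <linux/io.h>

#define PVPANIC_PORT	0x505	/* assumption: QEMU pvpanic I/O port */

/* A single outb() takes no mutexes and never enters the ACPI
 * interpreter, so it is safe from a panic notifier that runs with
 * interrupts disabled. */
static void pvpanic_send_event(unsigned int event)
{
	outb(event, PVPANIC_PORT);
}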


Re: [uq/master PATCH] kvmvapic: add ioport read accessor

2013-05-06 Thread Gleb Natapov
On Sun, May 05, 2013 at 05:51:49PM -0300, Marcelo Tosatti wrote:
 
 Necessary since memory region accessor assumes read and write
 methods are registered. Otherwise reading I/O port 0x7e segfaults.
 
 https://bugzilla.redhat.com/show_bug.cgi?id=954306
 
 Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
 
Applied, thanks.

 diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c
 index 5b558aa..655483b 100644
 --- a/hw/i386/kvmvapic.c
 +++ b/hw/i386/kvmvapic.c
 @@ -687,8 +687,14 @@ static void vapic_write(void *opaque, hwaddr addr, 
 uint64_t data,
  }
  }
  
 +static uint64_t vapic_read(void *opaque, hwaddr addr, unsigned size)
 +{
 +    return 0xffffffff;
 +}
 +
  static const MemoryRegionOps vapic_ops = {
  .write = vapic_write,
 +.read = vapic_read,
  .endianness = DEVICE_NATIVE_ENDIAN,
  };
  
 

--
Gleb.


Re: [PATCH v2 00/11] vhost cleanups

2013-05-06 Thread Asias He
On Mon, May 06, 2013 at 01:07:46PM +0300, Michael S. Tsirkin wrote:
 On Mon, May 06, 2013 at 04:38:18PM +0800, Asias He wrote:
  MST, This is on top of [PATCH 0/2] vhost-net fix ubuf.
 
 Okay, how about making EVENT_IDX work for virtio-scsi?
 I'm guessing it's some messup with feature negotiation,
 that's what all event-idx bugs came down to so far.

Yes, IIRC, EVENT_IDX works for vhost-scsi now. Will cook a patch to
enable it. It should go into 3.10, right?

  Asias He (11):
vhost: Remove vhost_enable_zcopy in vhost.h
vhost: Move VHOST_NET_FEATURES to net.c
vhost: Make vhost a separate module
vhost: Remove comments for hdr in vhost.h
    vhost: Simplify dev->vqs[i] access
vhost-net: Cleanup vhost_ubuf and vhost_zcopy
vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration
vhost-scsi: Rename struct vhost_scsi *s to *vs
vhost-scsi: Make func indention more consistent
vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg
vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd
  
   drivers/vhost/Kconfig  |   8 +
   drivers/vhost/Makefile |   3 +-
   drivers/vhost/net.c|  64 ---
   drivers/vhost/scsi.c   | 470 
  ++---
   drivers/vhost/vhost.c  |  86 +++--
   drivers/vhost/vhost.h  |  11 +-
   6 files changed, 361 insertions(+), 281 deletions(-)
  
  -- 
  1.8.1.4

-- 
Asias


Re: [PATCH] vhost: drop virtio_net.h dependency

2013-05-06 Thread Asias He
On Mon, May 06, 2013 at 01:37:34PM +0300, Michael S. Tsirkin wrote:
 There's no net specific code in vhost.c anymore,
 don't include the virtio_net.h header.
 
 Signed-off-by: Michael S. Tsirkin m...@redhat.com

Reviewed-by: Asias He as...@redhat.com

 ---
 
 This is on top of Asias' patches, already queued so
 just FYI.
 
  drivers/vhost/vhost.c | 1 -
  1 file changed, 1 deletion(-)
 
 diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
 index de9441a..dcde269 100644
 --- a/drivers/vhost/vhost.c
 +++ b/drivers/vhost/vhost.c
 @@ -13,7 +13,6 @@
  
   #include <linux/eventfd.h>
   #include <linux/vhost.h>
  -#include <linux/virtio_net.h>
   #include <linux/mm.h>
   #include <linux/mmu_context.h>
   #include <linux/miscdevice.h>
 -- 
 MST

-- 
Asias


Re: [PATCH v2 03/11] vhost: Make vhost a separate module

2013-05-06 Thread Asias He
On Mon, May 06, 2013 at 01:03:42PM +0300, Michael S. Tsirkin wrote:
 On Mon, May 06, 2013 at 04:38:21PM +0800, Asias He wrote:
  Currently, vhost-net and vhost-scsi are sharing the vhost core code.
  However, vhost-scsi shares the code by including the vhost.c file
  directly.
  
  Making vhost a separate module makes it easier to share code with
  other vhost devices.
  
  Signed-off-by: Asias He as...@redhat.com
 
 Also this will break test.c, right? Let's fix it in the same
 commit too.

I will fix it up and remove the useless 'return'.

  ---
   drivers/vhost/Kconfig  |  8 
   drivers/vhost/Makefile |  3 ++-
   drivers/vhost/scsi.c   |  1 -
   drivers/vhost/vhost.c  | 51 
  +-
   drivers/vhost/vhost.h  |  2 ++
   5 files changed, 62 insertions(+), 3 deletions(-)
  
  diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
  index 8b9226d..017a1e8 100644
  --- a/drivers/vhost/Kconfig
  +++ b/drivers/vhost/Kconfig
  @@ -1,6 +1,7 @@
   config VHOST_NET
   tristate "Host kernel accelerator for virtio net"
   depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
  +   select VHOST
  select VHOST_RING
  ---help---
This kernel module can be loaded in host kernel to accelerate
  @@ -13,6 +14,7 @@ config VHOST_NET
   config VHOST_SCSI
   tristate "VHOST_SCSI TCM fabric driver"
   depends on TARGET_CORE && EVENTFD && m
  +   select VHOST
  select VHOST_RING
  default n
  ---help---
  @@ -24,3 +26,9 @@ config VHOST_RING
  ---help---
This option is selected by any driver which needs to access
the host side of a virtio ring.
  +
  +config VHOST
  +   tristate
  +   ---help---
  + This option is selected by any driver which needs to access
  + the core of vhost.
  diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
  index 654e9afb..e0441c3 100644
  --- a/drivers/vhost/Makefile
  +++ b/drivers/vhost/Makefile
  @@ -1,7 +1,8 @@
   obj-$(CONFIG_VHOST_NET) += vhost_net.o
  -vhost_net-y := vhost.o net.o
  +vhost_net-y := net.o
   
   obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
   vhost_scsi-y := scsi.o
   
   obj-$(CONFIG_VHOST_RING) += vringh.o
  +obj-$(CONFIG_VHOST)+= vhost.o
  diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
  index 5179f7a..2dcb94a 100644
  --- a/drivers/vhost/scsi.c
  +++ b/drivers/vhost/scsi.c
  @@ -49,7 +49,6 @@
    #include <linux/llist.h>
    #include <linux/bitmap.h>
    
   -#include "vhost.c"
    #include "vhost.h"
    
    #define TCM_VHOST_VERSION  "v0.1"
  diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
  index de9441a..e406d5f 100644
  --- a/drivers/vhost/vhost.c
  +++ b/drivers/vhost/vhost.c
  @@ -25,6 +25,7 @@
    #include <linux/slab.h>
    #include <linux/kthread.h>
    #include <linux/cgroup.h>
   +#include <linux/module.h>
    
    #include "vhost.h"
   
  @@ -66,6 +67,7 @@ void vhost_work_init(struct vhost_work *work, 
  vhost_work_fn_t fn)
   work->flushing = 0;
   work->queue_seq = work->done_seq = 0;
   }
  +EXPORT_SYMBOL_GPL(vhost_work_init);
   
   /* Init poll structure */
   void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
  @@ -79,6 +81,7 @@ void vhost_poll_init(struct vhost_poll *poll, 
  vhost_work_fn_t fn,
   
   vhost_work_init(&poll->work, fn);
   }
  +EXPORT_SYMBOL_GPL(vhost_poll_init);
   
   /* Start polling a file. We add ourselves to file's wait queue. The caller 
  must
* keep a reference to a file until after vhost_poll_stop is called. */
  @@ -101,6 +104,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct 
  file *file)
   
  return ret;
   }
  +EXPORT_SYMBOL_GPL(vhost_poll_start);
   
   /* Stop polling a file. After this function returns, it becomes safe to 
  drop the
* file reference. You must also flush afterwards. */
  @@ -111,6 +115,7 @@ void vhost_poll_stop(struct vhost_poll *poll)
   poll->wqh = NULL;
  }
   }
  +EXPORT_SYMBOL_GPL(vhost_poll_stop);
   
   static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work 
  *work,
  unsigned seq)
  @@ -123,7 +128,7 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, 
  struct vhost_work *work,
   return left <= 0;
   }
   
  -static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work 
  *work)
  +void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
   {
  unsigned seq;
  int flushing;
  @@ -138,6 +143,7 @@ static void vhost_work_flush(struct vhost_dev *dev, 
  struct vhost_work *work)
   spin_unlock_irq(&dev->work_lock);
   BUG_ON(flushing < 0);
   }
  +EXPORT_SYMBOL_GPL(vhost_work_flush);
   
   /* Flush any work that has been scheduled. When calling this, don't hold 
  any
* locks that are also used by the callback. */
  @@ -145,6 +151,7 @@ void vhost_poll_flush(struct vhost_poll *poll)
   {
   vhost_work_flush(poll->dev, &poll->work);
   }
  +EXPORT_SYMBOL_GPL(vhost_poll_flush);
   
   void vhost_work_queue(struct vhost_dev *dev, 

Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-06 Thread Gleb Natapov
On Mon, May 06, 2013 at 11:39:11AM +0800, Xiao Guangrong wrote:
 On 05/04/2013 08:52 AM, Marcelo Tosatti wrote:
  On Sat, May 04, 2013 at 12:51:06AM +0800, Xiao Guangrong wrote:
  On 05/03/2013 11:53 PM, Marcelo Tosatti wrote:
  On Fri, May 03, 2013 at 01:52:07PM +0800, Xiao Guangrong wrote:
  On 05/03/2013 09:05 AM, Marcelo Tosatti wrote:
 
  +
  +/*
   + * Fast invalidate all shadow pages that belong to @slot.
   + *
   + * @slot != NULL means the invalidation is caused by the memslot specified
   + * by @slot being deleted; in this case, we should ensure that rmap
   + * and lpage-info of the @slot can not be used after calling the function.
   + *
   + * @slot == NULL means the invalidation is due to other reasons; we need
   + * not care about rmap and lpage-info since they are still valid after
   + * calling the function.
  + */
  +void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
  + struct kvm_memory_slot *slot)
  +{
   +  spin_lock(&kvm->mmu_lock);
   +  kvm->arch.mmu_valid_gen++;
   +
   +  /*
   +   * All shadow pages are invalid, reset the large page info,
   +   * then we can safely destroy the memslot; it is also good
   +   * for large page usage.
   +   */
   +  kvm_clear_all_lpage_info(kvm);
 
  Xiao,
 
  I understood it was agreed that simple mmu_lock lockbreak while
  avoiding zapping of newly instantiated pages upon a
 
  if(spin_needbreak)
  cond_resched_lock()
 
  cycle was enough as a first step? And then later introduce root zapping
  along with measurements.
 
  https://lkml.org/lkml/2013/4/22/544
 
  Yes, it is.
 
  See the changelog in 0/0:
 
   we use lock-break technique to zap all sptes linked on the
  invalid rmap, it is not very effective but good for the first step.
 
  Thanks!
 
  Sure, but what is up with zeroing kvm_clear_all_lpage_info(kvm) and
  zapping the root? Only lock-break technique along with generation number 
  was what was agreed.
 
  Marcelo,
 
  Please Wait... I am completely confused. :(
 
  Let's clarify zeroing kvm_clear_all_lpage_info(kvm) and zapping the root 
  first.
  Are these changes you wanted?
 
  void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
struct kvm_memory_slot *slot)
  {
  spin_lock(&kvm->mmu_lock);
  kvm->arch.mmu_valid_gen++;
  
  /* Zero all root pages.*/
   restart:
  list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
  if (!sp->root_count)
  continue;
  
  if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
  goto restart;
  }
  
  /*
   * All shadow pages are invalid, reset the large page info,
   * then we can safely destroy the memslot; it is also good
   * for large page usage.
   */
  kvm_clear_all_lpage_info(kvm);
  
  kvm_mmu_commit_zap_page(kvm, &invalid_list);
  spin_unlock(&kvm->mmu_lock);
  }
 
  static void rmap_remove(struct kvm *kvm, u64 *spte)
  {
 struct kvm_mmu_page *sp;
 gfn_t gfn;
 unsigned long *rmapp;
 
 sp = page_header(__pa(spte));
  +
  +   /* Let invalid sp do not access its rmap. */
  +  if (!sp_is_valid(sp))
  +  return;
  +
  gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
  rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
 pte_list_remove(spte, rmapp);
  }
 
  If yes, there is the reason why we can not do this that i mentioned before:
 
  after calling kvm_mmu_invalid_memslot_pages(), the memslot->rmap will be
  destroyed.
  Later, if the host reclaims a page, the mmu-notifier handlers ->invalidate_page and
  ->invalidate_range_start can not find any spte using the host page, so
  Accessed/Dirty tracking for the host page is missed
  (kvm_set_pfn_accessed and kvm_set_pfn_dirty are not called properly).
 
  What's your idea?
  
  
  Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via
  spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all 
  releases mmu_lock and reacquires it again, only shadow pages 
  from the generation with which kvm_mmu_zap_all started are zapped (this
  guarantees forward progress and eventual termination).
  
  kvm_mmu_zap_generation()
  spin_lock(mmu_lock)
   int generation = kvm->arch.mmu_generation;
   
   for_each_shadow_page(sp) {
   if (sp->generation == kvm->arch.mmu_generation)
   zap_page(sp)
   if (spin_needbreak(mmu_lock)) {
   kvm->arch.mmu_generation++;
  cond_resched_lock(mmu_lock);
  }
  }
  
  kvm_mmu_zap_all()
  spin_lock(mmu_lock)
  for_each_shadow_page(sp) {
  if (spin_needbreak(mmu_lock)) {
  cond_resched_lock(mmu_lock);
  }
  }
  
  Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot.
  Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm.
  
  This addresses the main problem: excessively long hold times 
  of kvm_mmu_zap_all with very large 

[PATCH 0/1] [PULL] qemu-kvm.git uq/master queue

2013-05-06 Thread Gleb Natapov
Anthony please pull if it is not too later for 1.5.

The following changes since commit 467b34689d277fa56c09ad07ca0f08d7d7539f6d:

  Update OpenBIOS images (2013-05-05 09:53:22 +)

are available in the git repository at:

  git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git uq/master

for you to fetch changes up to 0c1cd0ae2a4faabeb948b9a07ea1696e853de174:

  kvmvapic: add ioport read accessor (2013-05-06 14:52:26 +0300)


Marcelo Tosatti (1):
  kvmvapic: add ioport read accessor

 hw/i386/kvmvapic.c |6 ++
 1 file changed, 6 insertions(+)


[PATCH 1/1] kvmvapic: add ioport read accessor

2013-05-06 Thread Gleb Natapov
From: Marcelo Tosatti mtosa...@redhat.com

Necessary since memory region accessor assumes read and write
methods are registered. Otherwise reading I/O port 0x7e segfaults.

https://bugzilla.redhat.com/show_bug.cgi?id=954306

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
Reviewed-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Gleb Natapov g...@redhat.com
---
 hw/i386/kvmvapic.c |6 ++
 1 file changed, 6 insertions(+)

diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c
index 5b558aa..655483b 100644
--- a/hw/i386/kvmvapic.c
+++ b/hw/i386/kvmvapic.c
@@ -687,8 +687,14 @@ static void vapic_write(void *opaque, hwaddr addr, 
uint64_t data,
 }
 }
 
+static uint64_t vapic_read(void *opaque, hwaddr addr, unsigned size)
+{
+    return 0xffffffff;
+}
+
 static const MemoryRegionOps vapic_ops = {
 .write = vapic_write,
+.read = vapic_read,
 .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-06 Thread Xiao Guangrong
On 05/06/2013 08:36 PM, Gleb Natapov wrote:

 Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via
 spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all 
 releases mmu_lock and reacquires it again, only shadow pages 
 from the generation with which kvm_mmu_zap_all started are zapped (this
 guarantees forward progress and eventual termination).

 kvm_mmu_zap_generation()
 	spin_lock(mmu_lock)
 	int generation = kvm->arch.mmu_generation;
 
 	for_each_shadow_page(sp) {
 		if (sp->generation == kvm->arch.mmu_generation)
 			zap_page(sp)
 		if (spin_needbreak(mmu_lock)) {
 			kvm->arch.mmu_generation++;
 			cond_resched_lock(mmu_lock);
 		}
 	}
 
 kvm_mmu_zap_all()
 	spin_lock(mmu_lock)
 	for_each_shadow_page(sp) {
 		if (spin_needbreak(mmu_lock)) {
 			cond_resched_lock(mmu_lock);
 		}
 	}

 Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot.
 Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm.

 This addresses the main problem: excessively long hold times 
 of kvm_mmu_zap_all with very large guests.

 Do you see any problem with this logic? This was what I was thinking 
 we agreed.

 No. I understand it and it can work.

 Actually, it is similar to Gleb's idea of zapping stale shadow pages
 (using the lock-break technique); after some discussion, we thought zapping
 only the shadow pages that are reachable from the slot's rmap is better,
 which is what this patchset does.
 (https://lkml.org/lkml/2013/4/23/73)

 But this is not what the patch is doing. Close, but not the same :)

Okay. :)

 Instead of zapping shadow pages reachable from the slot's rmap, the patch
 does kvm_unmap_rmapp(), which drops all sptes without zapping shadow pages.
 That is why you need special code to re-init lpage_info. What I proposed
 was to call zap_page() on all shadow pages reachable from the rmap. This
 will take care of the lpage_info counters. Does this make sense?

Unfortunately, no! We still need to take care of lpage_info. lpage_info is
used to count the number of guest page tables in the memslot.

For example, there is a memslot:
memslot[0].base_gfn = 0, memslot[0].npages = 100,

and there is a shadow page:
sp->role.direct = 0, sp->role.level = 4, sp->gfn = 10.

this sp is counted in memslot[0], but it can not be found by walking
memslot[0]->rmap since there is no last-level mapping in this shadow page.
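
For context, the counters in question are bumped when a shadow page for a
guest page table is created, roughly as in this simplified sketch modeled
on mmu.c's account_shadowed() (illustrative, not the exact code; a matching
unaccount path decrements the counters when the sp is zapped):

static void account_shadowed_sketch(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
	int i;

	/* A guest page table at @gfn blocks huge pages at every level. */
	for (i = PT_DIRECTORY_LEVEL;
	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i)
		lpage_info_slot(gfn, slot, i)->write_count += 1;
}

Dropping sptes through the rmap alone never runs the matching decrement for
an sp like the one above, which is why lpage_info must either be reset by
hand or the sp be properly zapped before the slot goes away.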


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 00/11] vhost cleanups

2013-05-06 Thread Michael S. Tsirkin
On Mon, May 06, 2013 at 08:05:26PM +0800, Asias He wrote:
 On Mon, May 06, 2013 at 01:07:46PM +0300, Michael S. Tsirkin wrote:
  On Mon, May 06, 2013 at 04:38:18PM +0800, Asias He wrote:
   MST, This is on top of [PATCH 0/2] vhost-net fix ubuf.
  
  Okay, how about making EVENT_IDX work for virtio-scsi?
  I'm guessing it's some messup with feature negotiation,
  that's what all event-idx bugs came down to so far.
 
 Yes, IIRC, EVENT_IDX works for vhost-scsi now. Will cook a patch to
 enable it. It should go 3.10, right?

If it's early in the cycle, I think it can.

   Asias He (11):
 vhost: Remove vhost_enable_zcopy in vhost.h
 vhost: Move VHOST_NET_FEATURES to net.c
 vhost: Make vhost a separate module
 vhost: Remove comments for hdr in vhost.h
  vhost: Simplify dev->vqs[i] access
 vhost-net: Cleanup vhost_ubuf and vhost_zcopy
 vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration
 vhost-scsi: Rename struct vhost_scsi *s to *vs
 vhost-scsi: Make func indention more consistent
 vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg
 vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd
   
drivers/vhost/Kconfig  |   8 +
drivers/vhost/Makefile |   3 +-
drivers/vhost/net.c|  64 ---
    drivers/vhost/scsi.c   | 470 ++---
drivers/vhost/vhost.c  |  86 +++--
drivers/vhost/vhost.h  |  11 +-
6 files changed, 361 insertions(+), 281 deletions(-)
   
   -- 
   1.8.1.4
 
 -- 
 Asias
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
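
As an illustration of the feature negotiation being discussed: a vhost
device advertises ring features through a feature mask, so enabling
EVENT_IDX for vhost-scsi would amount to something like the enum below.
This is purely illustrative of the mechanism, not the queued patch:

enum {
	VHOST_SCSI_FEATURES = VHOST_FEATURES |
			      (1ULL << VIRTIO_SCSI_F_HOTPLUG) |
			      (1ULL << VIRTIO_RING_F_EVENT_IDX),
};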


Re: [PATCH v17 RESEND] pvpanic: pvpanic device driver

2013-05-06 Thread Paolo Bonzini
Il 06/05/2013 13:38, Marcelo Tosatti ha scritto:
 On Mon, May 06, 2013 at 11:39:35AM +0800, Hu Tao wrote:
 On Fri, May 03, 2013 at 06:59:18PM -0300, Marcelo Tosatti wrote:
 On Fri, May 03, 2013 at 10:47:10AM +0800, Hu Tao wrote:
 pvpanic device is a qemu simulated device through which guest panic
 event is sent to host.

 Signed-off-by: Hu Tao hu...@cn.fujitsu.com
 ---
  drivers/platform/x86/Kconfig   |   7 +++
  drivers/platform/x86/Makefile  |   2 +
  drivers/platform/x86/pvpanic.c | 115 
 +
  3 files changed, 124 insertions(+)
  create mode 100644 drivers/platform/x86/pvpanic.c

 diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
 index 3338437..527ed04 100644
 --- a/drivers/platform/x86/Kconfig
 +++ b/drivers/platform/x86/Kconfig
 @@ -781,4 +781,11 @@ config APPLE_GMUX
  graphics as well as the backlight. Currently only backlight
  control is supported by the driver.
  
 +config PVPANIC
  +	tristate "pvpanic device support"
 +  depends on ACPI
 +  ---help---
 +This driver provides support for pvpanic device, which is a qemu
 +simulated device through which guest panic event is sent to host.
 +
  endif # X86_PLATFORM_DEVICES
 diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
 index ace2b38..ef0ec74 100644
 --- a/drivers/platform/x86/Makefile
 +++ b/drivers/platform/x86/Makefile
 @@ -51,3 +51,5 @@ obj-$(CONFIG_INTEL_OAKTRAIL) += intel_oaktrail.o
  obj-$(CONFIG_SAMSUNG_Q10) += samsung-q10.o
  obj-$(CONFIG_APPLE_GMUX)  += apple-gmux.o
  obj-$(CONFIG_CHROMEOS_LAPTOP) += chromeos_laptop.o
 +
 +obj-$(CONFIG_PVPANIC)   += pvpanic.o
 diff --git a/drivers/platform/x86/pvpanic.c 
 b/drivers/platform/x86/pvpanic.c
 new file mode 100644
 index 000..81c95ec
 --- /dev/null
 +++ b/drivers/platform/x86/pvpanic.c
 @@ -0,0 +1,115 @@
 +/*
 + *  pvpanic.c - pvpanic Device Support
 + *
 + *  Copyright (C) 2013 Fujitsu.
 + *
 + *  This program is free software; you can redistribute it and/or modify
 + *  it under the terms of the GNU General Public License as published by
 + *  the Free Software Foundation; either version 2 of the License, or
 + *  (at your option) any later version.
 + *
 + *  This program is distributed in the hope that it will be useful,
 + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + *  GNU General Public License for more details.
 + *
 + *  You should have received a copy of the GNU General Public License
 + *  along with this program; if not, write to the Free Software
 + *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  
 02110-1301  USA
 + */
 +
 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 +
 +#include <linux/kernel.h>
 +#include <linux/module.h>
 +#include <linux/init.h>
 +#include <linux/types.h>
 +#include <acpi/acpi_bus.h>
 +#include <acpi/acpi_drivers.h>
 +
 +MODULE_AUTHOR("Hu Tao <hu...@cn.fujitsu.com>");
 +MODULE_DESCRIPTION("pvpanic device driver");
 +MODULE_LICENSE("GPL");
 +
 +static int pvpanic_add(struct acpi_device *device);
 +static int pvpanic_remove(struct acpi_device *device);
 +
 +static const struct acpi_device_id pvpanic_device_ids[] = {
 +	{ "QEMU0001", 0},
 +	{ "", 0},
 +};
 +MODULE_DEVICE_TABLE(acpi, pvpanic_device_ids);
 +
 +#define PVPANIC_PANICKED	(1 << 0)
 +
 +static acpi_handle handle;
 +
 +static struct acpi_driver pvpanic_driver = {
 +	.name =		"pvpanic",
 +	.class =	"QEMU",
 +	.ids =		pvpanic_device_ids,
 +	.ops =		{
 +			.add =		pvpanic_add,
 +			.remove =	pvpanic_remove,
 +		},
 +	.owner =	THIS_MODULE,
 +};
 +
 +static void
 +pvpanic_send_event(unsigned int event)
 +{
 +	union acpi_object arg;
 +	struct acpi_object_list arg_list;
 +
 +	if (!handle)
 +		return;
 +
 +	arg.type = ACPI_TYPE_INTEGER;
 +	arg.integer.value = event;
 +
 +	arg_list.count = 1;
 +	arg_list.pointer = &arg;
 +
 +	acpi_evaluate_object(handle, "WRPT", &arg_list, NULL);
 +}

 Is it safe to call acpi_evaluate_object from a panic notifier? For
 example:

 - Has it been confirmed that no code invoked via acpi_evaluate_object can 
 panic() ?

 Confirmed.

 - acpi_ex_enter_interpreter grabs a mutex. Is that path ever used?

  Unfortunately yes. As far as I can tell, there are two places in the path
  that grab a mutex: when searching the namespace for the method, and when
  executing the method. I didn't find a non-blocking version of
  acpi_evaluate_object.
 
 Then that's a bug, because the pvpanic notifier is called with interrupts
 disabled.
 
 Suppose it's safer to use an OUT instruction?

Yeah, it probably is...  You can still use acpi_walk_resources to find
the port number instead of hard-coding 0x505...

Paolo
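
For reference, a minimal sketch of the OUT-instruction approach discussed
above; the 0x505 port and PVPANIC_PANICKED value come from the patch, but
the notifier wiring here is only illustrative, not the final driver:

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/notifier.h>

#define PVPANIC_PORT		0x505	/* better: discover via acpi_walk_resources */
#define PVPANIC_PANICKED	(1 << 0)

static int pvpanic_panic_notify(struct notifier_block *nb,
				unsigned long code, void *unused)
{
	/* A single port write: no mutexes, safe with interrupts disabled. */
	outb(PVPANIC_PANICKED, PVPANIC_PORT);
	return NOTIFY_DONE;
}

static struct notifier_block pvpanic_panic_nb = {
	.notifier_call = pvpanic_panic_notify,
};

The block would be registered at probe time with
atomic_notifier_chain_register(&panic_notifier_list, &pvpanic_panic_nb).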

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v2 00/11] vhost cleanups

2013-05-06 Thread Asias He
On Mon, May 06, 2013 at 04:15:35PM +0300, Michael S. Tsirkin wrote:
 On Mon, May 06, 2013 at 08:05:26PM +0800, Asias He wrote:
  On Mon, May 06, 2013 at 01:07:46PM +0300, Michael S. Tsirkin wrote:
   On Mon, May 06, 2013 at 04:38:18PM +0800, Asias He wrote:
MST, This is on top of [PATCH 0/2] vhost-net fix ubuf.
   
   Okay, how about making EVENT_IDX work for virtio-scsi?
   I'm guessing it's some messup with feature negotiation,
   that's what all event-idx bugs came down to so far.
  
  Yes, IIRC, EVENT_IDX works for vhost-scsi now. Will cook a patch to
  enable it. It should go 3.10, right?
 
 If it's early in the cycle, I think it can.

Well, let's queue it for 3.11.

Asias He (11):
  vhost: Remove vhost_enable_zcopy in vhost.h
  vhost: Move VHOST_NET_FEATURES to net.c
  vhost: Make vhost a separate module
  vhost: Remove comments for hdr in vhost.h
  vhost: Simplify dev->vqs[i] access
  vhost-net: Cleanup vhost_ubuf and vhost_zcopy
  vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration
  vhost-scsi: Rename struct vhost_scsi *s to *vs
  vhost-scsi: Make func indention more consistent
  vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg
  vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd

 drivers/vhost/Kconfig  |   8 +
 drivers/vhost/Makefile |   3 +-
 drivers/vhost/net.c|  64 ---
 drivers/vhost/scsi.c   | 470 ++---
 drivers/vhost/vhost.c  |  86 +++--
 drivers/vhost/vhost.h  |  11 +-
 6 files changed, 361 insertions(+), 281 deletions(-)

-- 
1.8.1.4
  
  -- 
  Asias

-- 
Asias
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] vhost: drop virtio_net.h dependency

2013-05-06 Thread Asias He
On Mon, May 06, 2013 at 01:37:34PM +0300, Michael S. Tsirkin wrote:
 There's no net specific code in vhost.c anymore,
 don't include the virtio_net.h header.

Did you push it to your tree? I am not seeing it.

 Signed-off-by: Michael S. Tsirkin m...@redhat.com
 ---
 
 This is on top of Asias' patches, already queued so
 just FYI.
 
  drivers/vhost/vhost.c | 1 -
  1 file changed, 1 deletion(-)
 
 diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
 index de9441a..dcde269 100644
 --- a/drivers/vhost/vhost.c
 +++ b/drivers/vhost/vhost.c
 @@ -13,7 +13,6 @@
  
   #include <linux/eventfd.h>
   #include <linux/vhost.h>
  -#include <linux/virtio_net.h>
   #include <linux/mm.h>
   #include <linux/mmu_context.h>
   #include <linux/miscdevice.h>
 -- 
 MST

-- 
Asias
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [v1][KVM][PATCH 1/1] kvm:ppc: enable doorbell exception with E500MC

2013-05-06 Thread Alexander Graf

On 05/06/2013 04:53 AM, Tiejun Chen wrote:

Actually, E500MC also supports the doorbell exception, and CONFIG_PPC_E500MC
can cover BOOK3E/BOOK3E_64 as well.

Signed-off-by: Tiejun Chen tiejun.c...@windriver.com
---
  arch/powerpc/kvm/booke.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 1020119..dc1f590 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -795,7 +795,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
kvmppc_fill_pt_regs(regs);
timer_interrupt(regs);
break;
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64)
+#if defined(CONFIG_PPC_E500MC)


I suppose you mean CONFIG_KVM_E500MC here? Why didn't this work for you 
before? The ifdef above should cover the same range of CPUs.



Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


KVM call agenda for 2013-05-07

2013-05-06 Thread Juan Quintela

Hi

Please send in any agenda topics you are interested in.

Later, Juan.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-06 Thread Gleb Natapov
On Mon, May 06, 2013 at 09:10:11PM +0800, Xiao Guangrong wrote:
 On 05/06/2013 08:36 PM, Gleb Natapov wrote:
 
  Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via
  spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all 
  releases mmu_lock and reacquires it again, only shadow pages 
  from the generation with which kvm_mmu_zap_all started are zapped (this
  guarantees forward progress and eventual termination).
 
  kvm_mmu_zap_generation()
	spin_lock(mmu_lock)
	int generation = kvm->arch.mmu_generation;
 
	for_each_shadow_page(sp) {
		if (sp->generation == kvm->arch.mmu_generation)
			zap_page(sp)
		if (spin_needbreak(mmu_lock)) {
			kvm->arch.mmu_generation++;
			cond_resched_lock(mmu_lock);
		}
	}
 
  kvm_mmu_zap_all()
	spin_lock(mmu_lock)
	for_each_shadow_page(sp) {
		if (spin_needbreak(mmu_lock)) {
			cond_resched_lock(mmu_lock);
		}
	}
 
  Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot.
  Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm.
 
  This addresses the main problem: excessively long hold times 
  of kvm_mmu_zap_all with very large guests.
 
  Do you see any problem with this logic? This was what I was thinking 
  we agreed.
 
  No. I understand it and it can work.
 
  Actually, it is similar to Gleb's idea of zapping stale shadow pages
  (using the lock-break technique); after some discussion, we thought
  zapping only the shadow pages that are reachable from the slot's rmap is
  better, which is what this patchset does.
  (https://lkml.org/lkml/2013/4/23/73)
 
  But this is not what the patch is doing. Close, but not the same :)
 
 Okay. :)
 
  Instead of zapping shadow pages reachable from the slot's rmap, the patch
  does kvm_unmap_rmapp(), which drops all sptes without zapping shadow pages.
  That is why you need special code to re-init lpage_info. What I proposed
  was to call zap_page() on all shadow pages reachable from the rmap. This
  will take care of the lpage_info counters. Does this make sense?
 
 Unfortunately, no! We still need to take care of lpage_info. lpage_info is
 used to count the number of guest page tables in the memslot.
 
 For example, there is a memslot:
 memslot[0].base_gfn = 0, memslot[0].npages = 100,
 
 and there is a shadow page:
 sp->role.direct = 0, sp->role.level = 4, sp->gfn = 10.
 
 this sp is counted in memslot[0], but it can not be found by walking
 memslot[0]->rmap since there is no last-level mapping in this shadow page.
 
Right, so what about walking mmu_page_hash for each gfn belonging to the
slot that is in the process of being removed, to find those?

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
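
The hash walk Gleb suggests could look roughly like the sketch below;
for_each_gfn_sp() and the zap helpers are assumed to behave like the mmu.c
helpers of this period, and the restart-after-zap detail is elided, so
treat it as a sketch rather than a patch:

	struct kvm_mmu_page *sp;
	LIST_HEAD(invalid_list);
	gfn_t gfn;

	spin_lock(&kvm->mmu_lock);
	for (gfn = slot->base_gfn; gfn < slot->base_gfn + slot->npages; gfn++)
		for_each_gfn_sp(kvm, sp, gfn)	/* mmu_page_hash lookup */
			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	spin_unlock(&kvm->mmu_lock);

Xiao's reply below explains why this is costly: the walk is per-gfn, so for
a typical slot it touches at least as many entries as walking
active_mmu_pages.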


Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-06 Thread Xiao Guangrong
On 05/07/2013 01:24 AM, Gleb Natapov wrote:
 On Mon, May 06, 2013 at 09:10:11PM +0800, Xiao Guangrong wrote:
 On 05/06/2013 08:36 PM, Gleb Natapov wrote:

 Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via
 spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all 
 releases mmu_lock and reacquires it again, only shadow pages 
 from the generation with which kvm_mmu_zap_all started are zapped (this
 guarantees forward progress and eventual termination).

 kvm_mmu_zap_generation()
 	spin_lock(mmu_lock)
 	int generation = kvm->arch.mmu_generation;
 
 	for_each_shadow_page(sp) {
 		if (sp->generation == kvm->arch.mmu_generation)
 			zap_page(sp)
 		if (spin_needbreak(mmu_lock)) {
 			kvm->arch.mmu_generation++;
 			cond_resched_lock(mmu_lock);
 		}
 	}
 
 kvm_mmu_zap_all()
 	spin_lock(mmu_lock)
 	for_each_shadow_page(sp) {
 		if (spin_needbreak(mmu_lock)) {
 			cond_resched_lock(mmu_lock);
 		}
 	}

 Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot.
 Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm.

 This addresses the main problem: excessively long hold times 
 of kvm_mmu_zap_all with very large guests.

 Do you see any problem with this logic? This was what I was thinking 
 we agreed.

 No. I understand it and it can work.

 Actually, it is similar to Gleb's idea of zapping stale shadow pages
 (using the lock-break technique); after some discussion, we thought
 zapping only the shadow pages that are reachable from the slot's rmap is
 better, which is what this patchset does.
 (https://lkml.org/lkml/2013/4/23/73)

 But this is not what the patch is doing. Close, but not the same :)

 Okay. :)

 Instead of zapping shadow pages reachable from the slot's rmap, the patch
 does kvm_unmap_rmapp(), which drops all sptes without zapping shadow pages.
 That is why you need special code to re-init lpage_info. What I proposed
 was to call zap_page() on all shadow pages reachable from the rmap. This
 will take care of the lpage_info counters. Does this make sense?

 Unfortunately, no! We still need to take care of lpage_info. lpage_info is
 used to count the number of guest page tables in the memslot.

 For example, there is a memslot:
 memslot[0].base_gfn = 0, memslot[0].npages = 100,

 and there is a shadow page:
 sp->role.direct = 0, sp->role.level = 4, sp->gfn = 10.

 this sp is counted in memslot[0], but it can not be found by walking
 memslot[0]->rmap since there is no last-level mapping in this shadow page.

 Right, so what about walking mmu_page_hash for each gfn belonging to the
 slot that is in the process of being removed, to find those?

That will cost lots of time. The size of the hashtable is 1 << 10. If the
memslot has 4M of memory, it will walk all the entries; the cost is the same
as walking active_list (maybe a little more). And a memslot with 4M of memory
is the normal case, I think.

Another point is that lpage_info stops the mmu from using large pages. If we
do not reset lpage_info, the mmu keeps using 4K pages until the invalid sp is
zapped.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-06 Thread Marcelo Tosatti
On Mon, May 06, 2013 at 11:39:11AM +0800, Xiao Guangrong wrote:
 On 05/04/2013 08:52 AM, Marcelo Tosatti wrote:
  On Sat, May 04, 2013 at 12:51:06AM +0800, Xiao Guangrong wrote:
  On 05/03/2013 11:53 PM, Marcelo Tosatti wrote:
  On Fri, May 03, 2013 at 01:52:07PM +0800, Xiao Guangrong wrote:
  On 05/03/2013 09:05 AM, Marcelo Tosatti wrote:
 
  +
  +/*
  + * Fast invalidate all shadow pages belonging to @slot.
  + *
  + * @slot != NULL means the invalidation is caused by the memslot
  + * specified by @slot being deleted; in this case, we should ensure
  + * that the rmap and lpage-info of the @slot can not be used after
  + * calling the function.
  + *
  + * @slot == NULL means the invalidation is due to other reasons; we
  + * need not care about rmap and lpage-info since they are still valid
  + * after calling the function.
  + */
  +void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
  +				   struct kvm_memory_slot *slot)
  +{
  +	spin_lock(&kvm->mmu_lock);
  +	kvm->arch.mmu_valid_gen++;
  +
  +	/*
  +	 * All shadow pages are invalid, reset the large page info,
  +	 * then we can safely destroy the memslot; it is also good
  +	 * for large page usage.
  +	 */
  +	kvm_clear_all_lpage_info(kvm);
 
  Xiao,
 
  I understood it was agreed that simple mmu_lock lockbreak while
  avoiding zapping of newly instantiated pages upon a
 
  if(spin_needbreak)
  cond_resched_lock()
 
  cycle was enough as a first step? And then later introduce root zapping
  along with measurements.
 
  https://lkml.org/lkml/2013/4/22/544
 
  Yes, it is.
 
  See the changelog in 0/0:
 
   we use lock-break technique to zap all sptes linked on the
  invalid rmap, it is not very effective but good for the first step.
 
  Thanks!
 
  Sure, but what is up with zeroing kvm_clear_all_lpage_info(kvm) and
  zapping the root? Only lock-break technique along with generation number 
  was what was agreed.
 
  Marcelo,
 
  Please Wait... I am completely confused. :(
 
  Let's clarify zeroing kvm_clear_all_lpage_info(kvm) and zapping the root 
  first.
  Are these changes you wanted?
 
  void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
				     struct kvm_memory_slot *slot)
  {
	spin_lock(&kvm->mmu_lock);
	kvm->arch.mmu_valid_gen++;

	/* Zero all root pages. */
restart:
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
		if (!sp->root_count)
			continue;

		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
			goto restart;
	}

	/*
	 * All shadow pages are invalid, reset the large page info,
	 * then we can safely destroy the memslot; it is also good
	 * for large page usage.
	 */
	kvm_clear_all_lpage_info(kvm);

	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	spin_unlock(&kvm->mmu_lock);
  }
 
  static void rmap_remove(struct kvm *kvm, u64 *spte)
  {
	struct kvm_mmu_page *sp;
	gfn_t gfn;
	unsigned long *rmapp;

	sp = page_header(__pa(spte));
  +
  +	/* Do not let an invalid sp access its rmap. */
  +	if (!sp_is_valid(sp))
  +		return;
  +
	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
	pte_list_remove(spte, rmapp);
  }
 
  If yes, here is the reason, mentioned before, why we can not do this:
 
  after calling kvm_mmu_invalid_memslot_pages(), the memslot->rmap will be
  destroyed. Later, if the host reclaims a page, the mmu-notifier handlers
  ->invalidate_page and ->invalidate_range_start can not find any spte using
  the host page, so the Accessed/Dirty state of the host page is no longer
  tracked (kvm_set_pfn_accessed and kvm_set_pfn_dirty are not called
  properly).
 
  What's your idea?
  
  
  Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via
  spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all 
  releases mmu_lock and reacquires it again, only shadow pages 
  from the generation with which kvm_mmu_zap_all started are zapped (this
  guarantees forward progress and eventual termination).
  
  kvm_mmu_zap_generation()
  	spin_lock(mmu_lock)
  	int generation = kvm->arch.mmu_generation;
  
  	for_each_shadow_page(sp) {
  		if (sp->generation == kvm->arch.mmu_generation)
  			zap_page(sp)
  		if (spin_needbreak(mmu_lock)) {
  			kvm->arch.mmu_generation++;
  			cond_resched_lock(mmu_lock);
  		}
  	}
  
  kvm_mmu_zap_all()
  	spin_lock(mmu_lock)
  	for_each_shadow_page(sp) {
  		if (spin_needbreak(mmu_lock)) {
  			cond_resched_lock(mmu_lock);
  		}
  	}
  
  Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot.
  Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm.
  
  This addresses the main problem: excessively long hold times 
  of kvm_mmu_zap_all with very large guests.
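
Spelled out as C, the lock-break loop described above might look like the
following; mmu_generation, sp->generation and the zap helpers are fields
and functions assumed from the discussion, not final upstream code:

static void kvm_mmu_zap_generation(struct kvm *kvm)
{
	struct kvm_mmu_page *sp, *node;
	LIST_HEAD(invalid_list);
	int generation;

	spin_lock(&kvm->mmu_lock);
	generation = kvm->arch.mmu_generation;

restart:
	list_for_each_entry_safe(sp, node,
				 &kvm->arch.active_mmu_pages, link) {
		/* Pages created after a lock break carry a newer generation. */
		if (sp->generation != generation)
			continue;

		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
			goto restart;

		if (spin_needbreak(&kvm->mmu_lock)) {
			kvm->arch.mmu_generation++;
			kvm_mmu_commit_zap_page(kvm, &invalid_list);
			cond_resched_lock(&kvm->mmu_lock);
			goto restart;
		}
	}

	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	spin_unlock(&kvm->mmu_lock);
}

Committing the pending zaps before cond_resched_lock() ensures no
half-zapped pages are visible while mmu_lock is dropped.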

Re: [RFC PATCH 04/11] kvm tools: console: unconditionally output to any console

2013-05-06 Thread Sasha Levin
On 05/03/2013 12:09 PM, Will Deacon wrote:
 On Fri, May 03, 2013 at 05:02:14PM +0100, Sasha Levin wrote:
 On 05/03/2013 05:19 AM, Pekka Enberg wrote:
 On Wed, May 1, 2013 at 6:50 PM, Will Deacon will.dea...@arm.com wrote:
 From: Marc Zyngier marc.zyng...@arm.com

 Kvmtool suppresses any output to a console that has not been elected
 as *the* console.

 While this makes sense on the input side (we want the input to be sent
 to one console driver only), it seems to be the wrong thing to do on
 the output side, as it effectively prevents the guest from switching
 from one console to another (think earlyprintk using 8250 to virtio
 console).

 After all, the guest *does* poke this device and outputs something
 there.

 Just remove the kvm->cfg.active_console test from the output paths.

 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 Signed-off-by: Will Deacon will.dea...@arm.com

 Seems reasonable. Asias, Sasha?


 I remember trying it some time ago, but dropped it for a reason I can't
 recall at the moment.

 Can I have the weekend to play with it to try and figure out why?
 
 There's no rush from my point of view (hence the RFC) so take as long as you
 need!

Looks good to me!


Thanks,
Sasha

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/5] KVM: PPC: iommu: Add missing kvm_iommu_map_pages/kvm_iommu_unmap_pages

2013-05-06 Thread Alex Williamson
On Mon, 2013-05-06 at 17:21 +1000, a...@ozlabs.ru wrote:
 From: Alexey Kardashevskiy a...@ozlabs.ru
 
 The IOMMU API implements groups creating/deletion, device binding
 and IOMMU map/unmap operations.
 
 The PowerPC implementation uses most of the API except map/unmap
 operations, which are implemented on POWER using hypercalls.
 
 However, in order to link a kernel with the CONFIG_IOMMU_API enabled,
 the empty kvm_iommu_map_pages/kvm_iommu_unmap_pages have to be
 defined, so this defines them.
 
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 Cc: David Gibson da...@gibson.dropbear.id.au
 Signed-off-by: Paul Mackerras pau...@samba.org
 ---
  arch/powerpc/include/asm/kvm_host.h |   14 ++
  1 file changed, 14 insertions(+)
 
 diff --git a/arch/powerpc/include/asm/kvm_host.h 
 b/arch/powerpc/include/asm/kvm_host.h
 index b6a047e..c025d91 100644
 --- a/arch/powerpc/include/asm/kvm_host.h
 +++ b/arch/powerpc/include/asm/kvm_host.h
 @@ -603,4 +603,18 @@ struct kvm_vcpu_arch {
  
  #define __KVM_HAVE_ARCH_WQP
  
 +#ifdef CONFIG_IOMMU_API
 +/* POWERPC does not use IOMMU API for mapping/unmapping */
 +static inline int kvm_iommu_map_pages(struct kvm *kvm,
 + struct kvm_memory_slot *slot)
 +{
 + return 0;
 +}
 +
 +static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
 + struct kvm_memory_slot *slot)
 +{
 +}
 +#endif /* CONFIG_IOMMU_API */
 +
  #endif /* __POWERPC_KVM_HOST_H__ */

This is no longer needed, Gleb applied my patch for 3.10 that make all
of KVM device assignment dependent on a build config option and the top
level kvm_host.h now includes this when that is not set.  Thanks,

Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] mips/kvm: Fix ABI for compatibility with 64-bit guests.

2013-05-06 Thread David Daney
From: David Daney david.da...@cavium.com

There are several parts to this:

o All registers are 64-bits wide, 32-bit guests use the least
  significant portion of the register storage fields.

o FPU register formats are defined.

o CP0 Registers are manipulated via the KVM_GET_MSRS/KVM_SET_MSRS
  mechanism.

The vcpu_ioctl_get_regs and vcpu_ioctl_set_regs function pointers
become unused so they were removed.

Some IOCTL functions were moved to kvm_trap_emul because the
implementations are only for that flavor of KVM host.  In the future, if
hardware based virtualization is added, they can be hidden behind
function pointers as appropriate.

Signed-off-by: David Daney david.da...@cavium.com
---
 arch/mips/include/asm/kvm.h  | 106 ++---
 arch/mips/include/asm/kvm_host.h |   6 +-
 arch/mips/kernel/asm-offsets.c   |  64 
 arch/mips/kvm/kvm_mips.c | 124 +++
 arch/mips/kvm/kvm_mips_emul.c| 108 ++---
 arch/mips/kvm/kvm_trap_emul.c| 330 ++-
 6 files changed, 480 insertions(+), 258 deletions(-)

diff --git a/arch/mips/include/asm/kvm.h b/arch/mips/include/asm/kvm.h
index 85789ea..83c44d8 100644
--- a/arch/mips/include/asm/kvm.h
+++ b/arch/mips/include/asm/kvm.h
@@ -1,55 +1,113 @@
 /*
-* This file is subject to the terms and conditions of the GNU General Public
-* License.  See the file COPYING in the main directory of this archive
-* for more details.
-*
-* Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
-* Authors: Sanjay Lal sanj...@kymasys.com
-*/
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+ * Copyright (C) 2013 Cavium, Inc.
+ * Authors: Sanjay Lal sanj...@kymasys.com
+ */
 
 #ifndef __LINUX_KVM_MIPS_H
 #define __LINUX_KVM_MIPS_H
 
 #include <linux/types.h>
 
-#define __KVM_MIPS
-
-#define N_MIPS_COPROC_REGS  32
-#define N_MIPS_COPROC_SEL  8
+/*
+ * KVM MIPS specific structures and definitions.
+ *
+ * Some parts derived from the x86 version of this file.
+ */
 
 /* for KVM_GET_REGS and KVM_SET_REGS */
+/*
+ * If Config[AT] is zero (32-bit CPU), the register contents are
+ * stored in the lower 32-bits of the struct kvm_regs fields and sign
+ * extended to 64-bits.
+ */
 struct kvm_regs {
-   __u32 gprs[32];
-   __u32 hi;
-   __u32 lo;
-   __u32 pc;
+   /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+   __u64 gpr[32];
+   __u64 hi, lo;
+   __u64 pc;
+};
 
-   __u32 cp0reg[N_MIPS_COPROC_REGS][N_MIPS_COPROC_SEL];
+/* for KVM_GET_FPU and KVM_SET_FPU */
+/*
+ * If Status[FR] is zero (32-bit FPU), the upper 32-bits of the FPRs
+ * are zero filled.
+ */
+struct kvm_fpu {
+   __u64 fpr[32];
+   __u32 fir;
+   __u32 fccr;
+   __u32 fexr;
+   __u32 fenr;
+   __u32 fcsr;
+   __u32 pad;
 };
 
-/* for KVM_GET_SREGS and KVM_SET_SREGS */
-struct kvm_sregs {
+
+/*
+ * For MIPS, we use the same APIs as x86, where 'msr' corresponds to a
+ * CP0 register.  The index field is broken down as follows:
+ *
+ *  bits[2..0]   - Register 'sel' index.
+ *  bits[7..3]   - Register 'rd'  index.
+ *  bits[15..8]  - Must be zero.
+ *  bits[31..16] - 0 -> CP0 registers.
+ *
+ * Other sets registers may be added in the future.  Each set would
+ * have its own identifier in bits[31..16].
+ *
+ * For MSRs that are narrower than 64-bits, the value is stored in the
+ * low order bits of the data field, and sign extended to 64-bits.
+ */
+#define KVM_MIPS_MSR_CP0 0
+struct kvm_msr_entry {
+   __u32 index;
+   __u32 reserved;
+   __u64 data;
 };
 
-/* for KVM_GET_FPU and KVM_SET_FPU */
-struct kvm_fpu {
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+struct kvm_msrs {
+   __u32 nmsrs; /* number of msrs in entries */
+   __u32 pad;
+
+   struct kvm_msr_entry entries[0];
 };
 
+/* for KVM_GET_MSR_INDEX_LIST */
+struct kvm_msr_list {
+   __u32 nmsrs; /* number of msrs in entries */
+   __u32 indices[0];
+};
+
+/*
+ * KVM MIPS specific structures and definitions
+ *
+ */
 struct kvm_debug_exit_arch {
+   __u64 epc;
 };
 
 /* for KVM_SET_GUEST_DEBUG */
 struct kvm_guest_debug_arch {
 };
 
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
+/* dummy definition */
+struct kvm_sregs {
+};
+
 struct kvm_mips_interrupt {
/* in */
__u32 cpu;
__u32 irq;
 };
 
-/* definition of registers in kvm_run */
-struct kvm_sync_regs {
-};
-
 #endif /* __LINUX_KVM_MIPS_H */
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index e68781e..3a5b2c8 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -360,7 +360,7 @@ struct kvm_vcpu_arch {
uint32_t guest_inst;
 
/* GPRS */
-   unsigned long gprs[32];
+   unsigned long gpr[32];
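
As an aside on the index encoding defined above: a set/rd/sel triple packs
into kvm_msr_entry.index as in the helper below; the function name is
hypothetical, only the bit layout comes from the patch comment:

static inline __u32 kvm_mips_cp0_index(unsigned int rd, unsigned int sel)
{
	/* bits[31..16] = set (0 for CP0), bits[7..3] = rd, bits[2..0] = sel */
	return (KVM_MIPS_MSR_CP0 << 16) | ((rd & 0x1f) << 3) | (sel & 0x7);
}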

Re: [PATCH] mips/kvm: Fix ABI for compatibility with 64-bit guests.

2013-05-06 Thread Sanjay Lal

On May 6, 2013, at 3:39 PM, David Daney wrote:

 
 /* for KVM_GET_REGS and KVM_SET_REGS */
 +/*
 + * If Config[AT] is zero (32-bit CPU), the register contents are
 + * stored in the lower 32-bits of the struct kvm_regs fields and sign
 + * extended to 64-bits.
 + */
 struct kvm_regs {
 - __u32 gprs[32];
 - __u32 hi;
 - __u32 lo;
 - __u32 pc;
 + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
 + __u64 gpr[32];
 + __u64 hi, lo;
 + __u64 pc;
 +};
 
 - __u32 cp0reg[N_MIPS_COPROC_REGS][N_MIPS_COPROC_SEL];

Hi David, I'll try out the diff with QEMU and confirm that it works as
expected. Could you just leave the GPR field in kvm_regs as 'gprs'? It's a
minor change, but it avoids diffs that just replace 'gprs' with 'gpr'.

Regards
Sanjay

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] mips/kvm: Fix ABI for compatibility with 64-bit guests.

2013-05-06 Thread David Daney

On 05/06/2013 04:11 PM, Sanjay Lal wrote:


On May 6, 2013, at 3:39 PM, David Daney wrote:



/* for KVM_GET_REGS and KVM_SET_REGS */
+/*
+ * If Config[AT] is zero (32-bit CPU), the register contents are
+ * stored in the lower 32-bits of the struct kvm_regs fields and sign
+ * extended to 64-bits.
+ */
struct kvm_regs {
-   __u32 gprs[32];
-   __u32 hi;
-   __u32 lo;
-   __u32 pc;
+   /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+   __u64 gpr[32];
+   __u64 hi, lo;
+   __u64 pc;
+};

-   __u32 cp0reg[N_MIPS_COPROC_REGS][N_MIPS_COPROC_SEL];


Hi David, I'll try out the diff with QEMU and confirm that it works as
expected. Could you just leave the GPR field in kvm_regs as 'gprs'? It's a
minor change, but it avoids diffs that just replace 'gprs' with 'gpr'.



Well, there were two changes with respect to 'gprs' vs. 'gpr'.

The change you show above only results in a small handful of diff lines.

My argument for the change is that it will be part of a public ABI, and 
should be short and concise, so I changed it to 'gpr'.


I also changed the field with the same name in struct kvm_vcpu_arch to 
match, which causes the changes in asm-offsets.c and quite a few other 
places as well.  One could argue that this one was gratuitous, but I 
thought it would be nice for them to match.  Since it is an internal 
implementation detail, it is not that important, so I could revert this 
part if there are strong objections.


David Daney

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm/ppc/booke64: Hard disable interrupts when entering the guest

2013-05-06 Thread Scott Wood

On 05/05/2013 04:03:08 PM, Benjamin Herrenschmidt wrote:

On Fri, 2013-05-03 at 18:45 -0500, Scott Wood wrote:
 kvmppc_lazy_ee_enable() was causing interrupts to be soft-enabled
 (albeit hard-disabled) in kvmppc_restart_interrupt().  This led to
 warnings, and possibly breakage if the interrupt state was later saved
 and then restored (leading to interrupts being hard-and-soft enabled
 when they should be at least soft-disabled).

 Simply removing kvmppc_lazy_ee_enable() leaves interrupts only
 soft-disabled when we enter the guest, but they will be hard-disabled
 when we exit the guest -- without PACA_IRQ_HARD_DIS ever being set, so
 the local_irq_enable() fails to hard-enable.

 While we could just set PACA_IRQ_HARD_DIS after an exit to compensate,
 instead hard-disable interrupts before entering the guest.  This way,
 we won't have to worry about interactions if we take an interrupt
 during the guest entry code.  While I don't see any obvious
 interactions, it could change in the future (e.g. it would be bad if
 the non-hv code were used on 64-bit or if 32-bit gained lazy interrupt
 disabling, since the non-hv code changes IVPR among other things).

Shouldn't the interrupts be marked soft-enabled (even if hard disabled)
when entering the guest ?

Ie. The last stage of entry will hard enable, so they should be
soft-enabled too... if not, latency trackers will consider the whole
guest periods as interrupt disabled...


OK... I guess we already have that problem on 32-bit as well?


Now, kvmppc_lazy_ee_enable() seems to be clearly bogus to me. It will
unconditionally set soft_enabled and clear irq_happened from a
soft-disabled state, thus potentially losing a pending event.

Book3S HV seems to be keeping interrupts fully enabled all the way
until the asm hard disables, which would be fine except that I'm
worried we are racy vs. need_resched & signals.

One thing you may be able to do is call prep_irq_for_idle(). This will
tell you if something happened, giving you a chance to abort/re-enable
before you go the guest.


As long as we go straight from IRQs fully enabled to hard-disabled,  
before we check for signals and such, I don't think we need that (and  
using it would raise the question of what to do on 32-bit).


What if we just take this patch, and add trace_hardirqs_on() just  
before entering the guest?  This would be similar to what the 32-bit  
non-KVM exception return code does (except it would be in C code).   
Perhaps we could set soft_enabled as well, but then we'd have to clear  
it again before calling kvmppc_restart_interrupt() -- since the KVM  
exception handlers don't actually care about soft_enabled (it would  
just be for consistency), I'd rather just leave soft_enabled off.


We also don't want PACA_IRQ_HARD_DIS to be cleared the way  
prep_irq_for_idle() does, because that's what lets the  
local_irq_enable() do the hard-enabling after we exit the guest.


-Scott
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm/ppc/booke64: Hard disable interrupts when entering the guest

2013-05-06 Thread Benjamin Herrenschmidt
On Mon, 2013-05-06 at 18:53 -0500, Scott Wood wrote:
 
  Ie. The last stage of entry will hard enable, so they should be
  soft-enabled too... if not, latency trackers will consider the whole
  guest periods as interrupt disabled...
 
 OK... I guess we already have that problem on 32-bit as well?

32-bit doesn't do lazy disable, so the situation is a lot easier there.

  Now, kvmppc_lazy_ee_enable() seems to be clearly bogus to me. It will
  unconditionally set soft_enabled and clear irq_happened from a
  soft-disabled state, thus potentially losing a pending event.
  
  Book3S HV seems to be keeping interrupts fully enabled all the way
  until the asm hard disables, which would be fine except that I'm  
  worried
  we are racy vs. need_resched  signals.
  
  One thing you may be able to do is call prep_irq_for_idle(). This will
  tell you if something happened, giving you a chance to abort/re-enable
  before you go the guest.
 
 As long as we go straight from IRQs fully enabled to hard-disabled,  
 before we check for signals and such, I don't think we need that (and  
 using it would raise the question of what to do on 32-bit).

Except that you have to mark them as soft enabled before you enter the
guest with interrupts on...

But yes, I see your point. If interrupts are fully enabled and you call
hard_irq_disable(), there should be no chance for anything to mess
around with irq_happened.

However if you set soft-enabled later on before the rfid that returns to
the guest and sets EE, you *must* also clear PACA_IRQ_HARD_DIS in
irq_happened. If you get that out of sync bad things will happen later
on...

To be sure all is well, you might want to
WARN_ON(get_paca()->irq_happened == PACA_IRQ_HARD_DIS); (with a comment
explaining why so).

Another problem is that hard_irq_disable() doesn't call
trace_hardirqs_off()... We might want to fix that:

static inline void hard_irq_disable(void)
{
	__hard_irq_disable();
	if (get_paca()->soft_enabled)
		trace_hardirqs_off();
	get_paca()->soft_enabled = 0;
	get_paca()->irq_happened |= PACA_IRQ_HARD_DIS;
}

 What if we just take this patch, and add trace_hardirqs_on() just  
 before entering the guest?

You still want to set soft_enabled I'd say ... though I can see how you
may get away without it as long as you call trace_hardirqs_off() right
on the way back from the guest, but beware some lockdep bits will choke
if they ever spot the discrepancy between the traced irq state and
soft_enabled. I'd recommend you just keep it in sync.

   This would be similar to what the 32-bit  
 non-KVM exception return code does (except it would be in C code).   
 Perhaps we could set soft_enabled as well, but then we'd have to clear  
 it again before calling kvmppc_restart_interrupt() -- since the KVM  
 exception handlers don't actually care about soft_enabled (it would  
 just be for consistency), I'd rather just leave soft_enabled off.
 
 We also don't want PACA_IRQ_HARD_DIS to be cleared the way  
 prep_irq_for_idle() does, because that's what lets the  
 local_irq_enable() do the hard-enabling after we exit the guest.

Then set it again. Don't leave the kernel in a state where soft_enabled
is 1 and irq_happened is non-zero. It might work in the specific KVM
case we are looking at now because we know we are coming back via KVM
exit and putting things right again but it's fragile, somebody will come
back and break it, etc...

If necessary, create (or improve existing) helpers that do the right
state adjustement. The cost of a couple of byte stores is negligible,
I'd rather you make sure everything remains in sync at all times.

Cheers,
Ben.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/5] KVM: PPC: iommu: Add missing kvm_iommu_map_pages/kvm_iommu_unmap_pages

2013-05-06 Thread Alexey Kardashevskiy
On 05/07/2013 07:07 AM, Alex Williamson wrote:
 On Mon, 2013-05-06 at 17:21 +1000, a...@ozlabs.ru wrote:
 From: Alexey Kardashevskiy a...@ozlabs.ru

 The IOMMU API implements groups creating/deletion, device binding
 and IOMMU map/unmap operations.

 The PowerPC implementation uses most of the API except map/unmap
 operations, which are implemented on POWER using hypercalls.

 However, in order to link a kernel with the CONFIG_IOMMU_API enabled,
 the empty kvm_iommu_map_pages/kvm_iommu_unmap_pages have to be
 defined, so this defines them.

 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 Cc: David Gibson da...@gibson.dropbear.id.au
 Signed-off-by: Paul Mackerras pau...@samba.org
 ---
  arch/powerpc/include/asm/kvm_host.h |   14 ++
  1 file changed, 14 insertions(+)

 diff --git a/arch/powerpc/include/asm/kvm_host.h 
 b/arch/powerpc/include/asm/kvm_host.h
 index b6a047e..c025d91 100644
 --- a/arch/powerpc/include/asm/kvm_host.h
 +++ b/arch/powerpc/include/asm/kvm_host.h
 @@ -603,4 +603,18 @@ struct kvm_vcpu_arch {
  
  #define __KVM_HAVE_ARCH_WQP
  
 +#ifdef CONFIG_IOMMU_API
 +/* POWERPC does not use IOMMU API for mapping/unmapping */
 +static inline int kvm_iommu_map_pages(struct kvm *kvm,
 +struct kvm_memory_slot *slot)
 +{
 +return 0;
 +}
 +
 +static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
 +struct kvm_memory_slot *slot)
 +{
 +}
 +#endif /* CONFIG_IOMMU_API */
 +
  #endif /* __POWERPC_KVM_HOST_H__ */
 
 This is no longer needed, Gleb applied my patch for 3.10 that make all
 of KVM device assignment dependent on a build config option and the top
 level kvm_host.h now includes this when that is not set.  Thanks,

I cannot find it; could you please point me to where it is on github or
git.kernel.org? Thanks.


-- 
Alexey
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [v1][KVM][PATCH 1/1] kvm:ppc: enable doorbell exception with E500MC

2013-05-06 Thread tiejun.chen

On 05/06/2013 10:58 PM, Alexander Graf wrote:

On 05/06/2013 04:53 AM, Tiejun Chen wrote:

Actually, E500MC also supports the doorbell exception, and CONFIG_PPC_E500MC
can cover BOOK3E/BOOK3E_64 as well.

Signed-off-by: Tiejun Chen tiejun.c...@windriver.com
---
  arch/powerpc/kvm/booke.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 1020119..dc1f590 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -795,7 +795,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
  kvmppc_fill_pt_regs(regs);
  timer_interrupt(regs);
  break;
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64)
+#if defined(CONFIG_PPC_E500MC)


I suppose you mean CONFIG_KVM_E500MC here? Why didn't this work for you before?


This works for me.

Here I just mean that currently CONFIG_PPC_E500MC is always selected no matter
whether CONFIG_PPC_FSL_BOOK3E or CONFIG_PPC_BOOK3E_64 is enabled. And since
this is already in the arch/powerpc/kvm/booke.c file, I think one #ifdef
(CONFIG_PPC_E500MC) is enough and also makes sense.



The ifdef above should cover the same range of CPUs.


Or, going further, an #ifdef CONFIG_PPC_DOORBELL would be a reasonable way to cover this.

Tiejun

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


irqfd issue

2013-05-06 Thread Asias He
Hi,

I am seeing this with linus/master. Any ideas?

[   34.168356] IPv6: ADDRCONF(NETDEV_UP): virbr0: link is not ready
[   36.743758] BUG: unable to handle kernel paging request at 00030029
[   36.745177] IP: [81c08584] __mutex_lock_slowpath+0x34/0x240
[   36.746576] PGD 0 
[   36.747962] Oops:  [#1] SMP 
[   36.749343] Modules linked in: ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat 
vhost_net vhost
[   36.750753] CPU: 0 PID: 4260 Comm: qemu-kvm Not tainted 3.9.0+ #752
[   36.752130] Hardware name: Dell Inc. OptiPlex 790/0V5HMK, BIOS A11 12/30/2011
[   36.753495] task: 88021fba8000 ti: 88021f0c4000 task.ti: 
88021f0c4000
[   36.754847] RIP: 0010:[81c08584]  [81c08584] 
__mutex_lock_slowpath+0x34/0x240
[   36.756228] RSP: 0018:88021f0c5c88  EFLAGS: 00010202
[   36.757584] RAX: 0001 RBX: 880223ffb420 RCX: 
[   36.758926] RDX: 00030001 RSI: 88021f0c5d60 RDI: 880223ffb420
[   36.760268] RBP: 88021f0c5cf8 R08: 88021f0c R09: 
[   36.761602] R10: 8802209c3f10 R11:  R12: 880223ffb420
[   36.762924] R13: 88022236c000 R14: 8802236213b0 R15: 880223ffb420
[   36.764232] FS:  () GS:88022dc0() 
knlGS:
[   36.765543] CS:  0010 DS:  ES:  CR0: 80050033
[   36.766869] CR2: 00030029 CR3: 0240b000 CR4: 000427f0
[   36.768220] DR0:  DR1:  DR2: 
[   36.769565] DR3:  DR6: 0ff0 DR7: 0400
[   36.770889] Stack:
[   36.772200]  0092 88021fba8000 88021f0c5cb8 
81c0b2c7
[   36.773551]  8802210a4fc0 8802210a4fc0 88021f0c5d08 
810c0f2c
[   36.774884]  000e 880223ffb420 88021f0c5d38 
88022236c000
[   36.776194] Call Trace:
[   36.777482]  [81c0b2c7] ? _raw_spin_unlock_irqrestore+0x37/0x40
[   36.778789]  [810c0f2c] ? try_to_wake_up+0x1ec/0x290
[   36.780107]  [81c0852b] mutex_lock+0x2b/0x50
[   36.781420]  [810a9b6d] flush_workqueue+0x9d/0x560
[   36.782729]  [8100933f] kvm_irqfd_release+0x8f/0xa0
[   36.784046]  [8100456d] kvm_vm_release+0x1d/0x30
[   36.785367]  [811a732a] __fput+0xba/0x240
[   36.786693]  [811a751e] fput+0xe/0x10
[   36.788007]  [810af685] task_work_run+0xa5/0xe0
[   36.789317]  [81092cd7] do_exit+0x2d7/0xac0
[   36.790622]  [811a4a04] ? fsnotify_modify+0x64/0x80
[   36.791896]  [8140327a] ? trace_hardirqs_off_thunk+0x3a/0x6c
[   36.793141]  [81093511] do_group_exit+0x51/0xc0
[   36.794358]  [81093597] SyS_exit_group+0x17/0x20
[   36.795547]  [81c13882] system_call_fastpath+0x16/0x1b
[   36.796731] Code: 55 41 54 53 48 83 ec 48 66 66 66 66 90 65 48 8b 04 25 00 
b8 00 00 49 89 fc 48 89 45 98 48 8b 57 18 b8 01 00 00 00 48 85 d2 74 03 8b 42 
28 85 c0 0f 84 e6 00 00 00 65 48 8b 04 25 08 b8 00 00 48 
[   36.798194] RIP  [81c08584] __mutex_lock_slowpath+0x34/0x240
[   36.799567]  RSP 88021f0c5c88
[   36.800943] CR2: 00030029
[   36.813185] ---[ end trace 4877613defb9fc19 ]---
[   36.813188] Fixing recursive fault but reboot is needed!
[   37.011566] usb 2-1.1: link qh8-0601/880223a9c600 start 3 [1/2 us]
[   70.539341] usb usb1: usb port1's DeviceRemovable is changed to 1 according 
to platform information.
[   70.539546] usb usb2: usb port1's DeviceRemovable is changed to 1 according 
to platform information.
[   70.862490] nr_pdflush_threads exported in /proc is scheduled for removal
[   70.862558] sysctl: The scan_unevictable_pages sysctl/node-interface has 
been disabled for lack of a legitimate use case.  If you have one, please send 
an email to linux...@kvack.org.

-- 
Asias
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/5] KVM: PPC: iommu: Add missing kvm_iommu_map_pages/kvm_iommu_unmap_pages

2013-05-06 Thread Alex Williamson
On Tue, 2013-05-07 at 10:49 +1000, Alexey Kardashevskiy wrote:
 On 05/07/2013 07:07 AM, Alex Williamson wrote:
  On Mon, 2013-05-06 at 17:21 +1000, a...@ozlabs.ru wrote:
  From: Alexey Kardashevskiy a...@ozlabs.ru
 
  The IOMMU API implements groups creating/deletion, device binding
  and IOMMU map/unmap operations.
 
  The PowerPC implementation uses most of the API except map/unmap
  operations, which are implemented on POWER using hypercalls.
 
  However, in order to link a kernel with the CONFIG_IOMMU_API enabled,
  the empty kvm_iommu_map_pages/kvm_iommu_unmap_pages have to be
  defined, so this defines them.
 
  Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
  Cc: David Gibson da...@gibson.dropbear.id.au
  Signed-off-by: Paul Mackerras pau...@samba.org
  ---
   arch/powerpc/include/asm/kvm_host.h |   14 ++
   1 file changed, 14 insertions(+)
 
  diff --git a/arch/powerpc/include/asm/kvm_host.h 
  b/arch/powerpc/include/asm/kvm_host.h
  index b6a047e..c025d91 100644
  --- a/arch/powerpc/include/asm/kvm_host.h
  +++ b/arch/powerpc/include/asm/kvm_host.h
  @@ -603,4 +603,18 @@ struct kvm_vcpu_arch {
   
   #define __KVM_HAVE_ARCH_WQP
   
  +#ifdef CONFIG_IOMMU_API
  +/* POWERPC does not use IOMMU API for mapping/unmapping */
  +static inline int kvm_iommu_map_pages(struct kvm *kvm,
  +  struct kvm_memory_slot *slot)
  +{
  +  return 0;
  +}
  +
  +static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
  +  struct kvm_memory_slot *slot)
  +{
  +}
  +#endif /* CONFIG_IOMMU_API */
  +
   #endif /* __POWERPC_KVM_HOST_H__ */
  
  This is no longer needed, Gleb applied my patch for 3.10 that make all
  of KVM device assignment dependent on a build config option and the top
  level kvm_host.h now includes this when that is not set.  Thanks,
 
 Cannot find it, could you point me please where it is on github or
 git.kernel.org? Thanks.

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2a5bab1004729f3302c776e53ee7c895b98bb1ce

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v10 7/7] KVM: VMX: Use posted interrupt to deliver virtual interrupt

2013-05-06 Thread Zhang, Yang Z
Yangminqiang wrote on 2013-05-03:
 Nakajima, Jun wrote on 2013-04-26:
 Subject: Re: [PATCH v10 7/7] KVM: VMX: Use posted interrupt to deliver 
 virtual
 interrupt
 
 On Fri, Apr 26, 2013 at 2:29 AM, Yangminqiang yangminqi...@huawei.com
 wrote:
 
 Ivytown or newer platform supported it.
 
 Ivytown? Do you mean Ivy Bridge?
 
 
 Ivy Town is the codename of Ivy Bridge-based servers.
 
 One more question, what is the relationship between x2APIC and APIC
 virtualization? APIC-v requires x2APIC or APIC-v includes x2APIC?
If you are using the x2APIC mode (MSR-based access) inside the guest and want
to benefit from the APIC virtualization technology, then you should set the
virtualize-x2APIC-mode bit in the Secondary Processor-Based VM-Execution
Controls.

Best regards,
Yang
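
In KVM's VMX code this corresponds roughly to the following; the field and
bit names are from arch/x86/include/asm/vmx.h, but the snippet is a
simplified sketch, not the actual vmx.c update logic:

	u32 exec = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);

	/* x2APIC (MSR) accesses are virtualized instead of APIC MMIO. */
	exec &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
	exec |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec);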



Re: [RFC][KVM][PATCH 1/1] kvm:ppc:booke-64: soft-disable interrupts

2013-05-06 Thread tiejun.chen

On 05/07/2013 07:50 AM, Scott Wood wrote:

On 05/05/2013 10:13:17 PM, tiejun.chen wrote:

On 05/06/2013 11:10 AM, Tiejun Chen wrote:

For the external interrupt, the decrementer exception and the doorbell
exception, we also need to soft-disable interrupts as the host interrupt
handlers do, since the DO_KVM hook is always taken and skips
EXCEPTION_COMMON, missing the original chance to apply 'ints' (INTS_DISABLE).


http://patchwork.ozlabs.org/patch/241344/
http://patchwork.ozlabs.org/patch/241412/

:-)


I'm observing the same behaviour as well:

WARN_ON_ONCE(!irqs_disabled());




Signed-off-by: Tiejun Chen tiejun.c...@windriver.com
---
  arch/powerpc/kvm/bookehv_interrupts.S |9 +
  1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/kvm/bookehv_interrupts.S
b/arch/powerpc/kvm/bookehv_interrupts.S
index e8ed7d6..2fd62bf 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -33,6 +33,8 @@

  #ifdef CONFIG_64BIT
  #include <asm/exception-64e.h>
 +#include <asm/hw_irq.h>
 +#include <asm/irqflags.h>
  #else
  #include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */
  #endif
@@ -469,6 +471,13 @@ _GLOBAL(kvmppc_resume_host)
 	PPC_LL	r3, HOST_RUN(r1)
 	mr	r5, r14 /* intno */
 	mr	r14, r4 /* Save vcpu pointer. */
+#ifdef CONFIG_64BIT
+	/* Should we soft-disable interrupts? */
+	andi.	r6, r5, BOOKE_INTERRUPT_EXTERNAL | BOOKE_INTERRUPT_DECREMENTER | BOOKE_INTERRUPT_DOORBELL
+	beq	skip_soft_dis
+	SOFT_DISABLE_INTS(r7,r8)
+skip_soft_dis:
+#endif


Why wouldn't we always disable them?  kvmppc_handle_exit() will enable
interrupts when it's ready.


This only soft-disables interrupts for kvmppc_restart_interrupt(), which
restarts interrupts if they were meant for the host:


a. SOFT_DISABLE_INTS() only for BOOKE_INTERRUPT_EXTERNAL | 
BOOKE_INTERRUPT_DECREMENTER | BOOKE_INTERRUPT_DOORBELL


b. bl  kvmppc_handle_exit

c. kvmppc_handle_exit()
{
int r = RESUME_HOST;
int s;

/* update before a new last_exit_type is rewritten */
kvmppc_update_timing_stats(vcpu);

/* restart interrupts if they were meant for the host */
kvmppc_restart_interrupt(vcpu, exit_nr);

local_irq_enable(); <== Enable again.


And shouldn't we handle kvmppc_restart_interrupt() like the original HOST flow?

#define MASKABLE_EXCEPTION(trapnum, intnum, label, hdlr, ack)   \
START_EXCEPTION(label); \
NORMAL_EXCEPTION_PROLOG(trapnum, intnum, PROLOG_ADDITION_MASKABLE)\
EXCEPTION_COMMON(trapnum, PACA_EXGEN, *INTS_DISABLE*) \
...

So I think this should be reasonable :)

Tiejun

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][KVM][PATCH 1/1] kvm:ppc:booke-64: soft-disable interrupts

2013-05-06 Thread tiejun.chen

On 05/07/2013 10:06 AM, Scott Wood wrote:

On 05/06/2013 08:56:25 PM, tiejun.chen wrote:

On 05/07/2013 07:50 AM, Scott Wood wrote:

On 05/05/2013 10:13:17 PM, tiejun.chen wrote:

On 05/06/2013 11:10 AM, Tiejun Chen wrote:

For the external interrupt, the decrementer exception and the doorbell
exception, we also need to soft-disable interrupts as the host interrupt
handlers do, since the DO_KVM hook always skips EXCEPTION_COMMON and thus
misses the original chance to apply 'ints' (INTS_DISABLE).


http://patchwork.ozlabs.org/patch/241344/
http://patchwork.ozlabs.org/patch/241412/

:-)


I'm observing the same behaviour as well:

WARN_ON_ONCE(!irqs_disabled());


So, could you explain the benefits of your approach over what's being discussed
in those threads?


Those are long threads, so I think I need to take time to read through them :)




Why wouldn't we always disable them?  kvmppc_handle_exit() will enable
interrupts when it's ready.


This only disables soft interrupts for kvmppc_restart_interrupt(), which restarts
interrupts if they were meant for the host:

a. SOFT_DISABLE_INTS() only for BOOKE_INTERRUPT_EXTERNAL |
BOOKE_INTERRUPT_DECREMENTER | BOOKE_INTERRUPT_DOORBELL


Those aren't the only exceptions that can end up going to the host.  We could
get a TLB miss that results in a heavyweight MMIO exit, etc.


This mirrors the host handlers, so I'm just disabling soft interrupts during
kvmppc_restart_interrupt() for the Doorbell Interrupt/Decrementer Interrupt/External
Input Interrupt.


I don't see anything that should be disabled for any TLB exception in the host handler.



And I'd rather see any fix for this problem stay out of the asm code.


We already have an appropriate SOFT_DISABLE_INTS, so I think we can handle this
easily :)





b. bl  kvmppc_handle_exit

c. kvmppc_handle_exit()
{
int r = RESUME_HOST;
int s;

/* update before a new last_exit_type is rewritten */
kvmppc_update_timing_stats(vcpu);

/* restart interrupts if they were meant for the host */
kvmppc_restart_interrupt(vcpu, exit_nr);

local_irq_enable();  <== Enable again.


And shouldn't we handle kvmppc_restart_interrupt() like the original HOST flow?

#define MASKABLE_EXCEPTION(trapnum, intnum, label, hdlr, ack)   \
START_EXCEPTION(label); \
NORMAL_EXCEPTION_PROLOG(trapnum, intnum, PROLOG_ADDITION_MASKABLE)\
EXCEPTION_COMMON(trapnum, PACA_EXGEN, *INTS_DISABLE*) \
...


Could you elaborate on what you mean?


In the host handlers, we always use MASKABLE_EXCEPTION() to define the handlers
for some exceptions: Doorbell Interrupt/Decrementer Interrupt/External Input Interrupt:


#define MASKABLE_EXCEPTION(trapnum, intnum, label, hdlr, ack)   \
START_EXCEPTION(label); \
NORMAL_EXCEPTION_PROLOG(trapnum, intnum, PROLOG_ADDITION_MASKABLE)\
EXCEPTION_COMMON(trapnum, PACA_EXGEN, *INTS_DISABLE*) \

This calls INTS_DISABLE, which is equivalent to SOFT_DISABLE_INTS(), to disable
soft interrupts before calling the associated handlers:
do_IRQ()/timer_interrupt()/doorbell_exception().


But DO_KVM hook always skips INTS_DISABLE.

So I think we also need to do INTS_DISABLE for kvmppc_restart_interrupt(), since
it restarts interrupts for the host in a similar way to how they are invoked
by the host itself.
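For reference, what INTS_DISABLE/SOFT_DISABLE_INTS boils down to, expressed in
C (a sketch only; the real macro is asm in exception-64e.h and also handles the
hardirq tracing hook when that is configured):

	static inline void soft_disable_ints_sketch(void)
	{
		/* mark interrupts soft-disabled for the lazy-EE machinery */
		get_paca()->soft_enabled = 0;
		/* record that EE is already hard-disabled at this point */
		get_paca()->irq_happened |= PACA_IRQ_HARD_DIS;
	}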


Tiejun





Re: [RFC PATCH 04/11] kvm tools: console: unconditionally output to any console

2013-05-06 Thread Anup Patel
On Tue, May 7, 2013 at 2:34 AM, Sasha Levin sasha.le...@oracle.com wrote:
 On 05/03/2013 12:09 PM, Will Deacon wrote:
 On Fri, May 03, 2013 at 05:02:14PM +0100, Sasha Levin wrote:
 On 05/03/2013 05:19 AM, Pekka Enberg wrote:
 On Wed, May 1, 2013 at 6:50 PM, Will Deacon will.dea...@arm.com wrote:
 From: Marc Zyngier marc.zyng...@arm.com

 Kvmtool suppresses any output to a console that has not been elected
 as *the* console.

 While this makes sense on the input side (we want the input to be sent
 to one console driver only), it seems to be the wrong thing to do on
 the output side, as it effectively prevents the guest from switching
 from one console to another (think earlyprintk using 8250 to virtio
 console).

 After all, the guest *does* poke this device and outputs something
 there.

 Just remove the kvm-cfg.active_console test from the output paths.

 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 Signed-off-by: Will Deacon will.dea...@arm.com

 Seems reasonable. Asias, Sasha?


 I remember trying it some time ago but dropping it for a reason I can't
 recall at the moment.

 Can I have the weekend to play with it to try and figure out why?

 There's no rush from my point of view (hence the RFC) so take as long as you
 need!

 Looks good to me!


 Thanks,
 Sasha


I am fine with having 8250 emulated by KVMTOOL, but I am more inclined towards
having a full para-virtualized (PV) machine emulated by KVMTOOL.

Best Regards,
Anup


KVM: x86: perform kvmclock updates in lockstep

2013-05-06 Thread Marcelo Tosatti

It is necessary for each vcpu's system_timestamp memory copy to be
updated from a single sample of the nanosecond kernel clock.

If this is not the case, and NTP changes frequency adjustment, different
vcpus will make use of different time bases.
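A hypothetical helper showing the intent -- every vcpu gets the request, so all
system_timestamp copies are refreshed from the same clock sample (sketch only,
not part of this patch):

	static void request_masterclock_update_all(struct kvm *kvm)
	{
		int i;
		struct kvm_vcpu *vcpu;

		/* queue the request on every vcpu; each one updates its
		 * kvmclock copy from the single master sample on next entry */
		kvm_for_each_vcpu(i, vcpu, kvm)
			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
	}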

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com


diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 94f35d2..1ef4287 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1965,7 +1965,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		kvmclock_reset(vcpu);
 
 		vcpu->arch.time = data;
-		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 
 		/* we verify if the enable bit is set... */
 		if (!(data & 1))
@@ -2665,7 +2665,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
 		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
 		vcpu->arch.tsc_offset_adjustment = 0;
-		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+		set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
 	}
 
 	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
@@ -2684,7 +2684,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		 * kvmclock on vcpu->cpu migration
 		 */
 		if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
-			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 		if (vcpu->cpu != cpu)
 			kvm_migrate_timers(vcpu);
 		vcpu->cpu = cpu;
@@ -5092,7 +5092,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		if (vcpu->cpu != freq->cpu)
 			continue;
-		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 		if (vcpu->cpu != smp_processor_id())
 			send_ipi = 1;
 	}


Re: [PATCH] kvm/ppc/booke64: Hard disable interrupts when entering the guest

2013-05-06 Thread Scott Wood

On 05/06/2013 07:03:14 PM, Benjamin Herrenschmidt wrote:

On Mon, 2013-05-06 at 18:53 -0500, Scott Wood wrote:

 Ie. The last stage of entry will hard enable, so they should be
 soft-enabled too... if not, latency trackers will consider the whole
 guest periods as interrupt disabled...

 OK... I guess we already have that problem on 32-bit as well?

32-bit doesn't do lazy disable, so the situation is a lot easier there.


Right, but it still currently enters the guest with interrupts marked  
as disabled, so we'd have the same latency tracker issue.



Another problem is that hard_irq_disable() doesn't call
trace_hardirqs_off()... We might want to fix that:

static inline void hard_irq_disable(void)
{
__hard_irq_disable();
	if (get_paca()->soft_enabled)
		trace_hardirqs_off();
	get_paca()->soft_enabled = 0;
	get_paca()->irq_happened |= PACA_IRQ_HARD_DIS;
}


Is it possible there are places that assume the current behavior?


 We also don't want PACA_IRQ_HARD_DIS to be cleared the way
 prep_irq_for_idle() does, because that's what lets the
 local_irq_enable() do the hard-enabling after we exit the guest.

Then set it again. Don't leave the kernel in a state where soft_enabled
is 1 and irq_happened is non-zero. It might work in the specific KVM
case we are looking at now because we know we are coming back via KVM
exit and putting things right again but it's fragile, somebody will come
back and break it, etc...


KVM is a pretty special case -- at least on booke, it's required that  
all exits from guest state go through the KVM exception code.  I think  
it's less likely that that changes, than something breaks in the code  
to fix up lazy ee state (especially since we've already seen the latter  
happen).


I'll give it a shot, though.


If necessary, create (or improve existing) helpers that do the right
state adjustement. The cost of a couple of byte stores is negligible,
I'd rather you make sure everything remains in sync at all times.


My concern was mainly about complexity -- it seemed simpler to just say
that during guest execution, the CPU is in a special state that is not
visible to anything that cares about lazy EE.  The fact that EE can
actually be *off* and we still take the interrupt supports its
specialness. :-)


-Scott


Re: [PATCH 2/5] KVM: PPC: iommu: Add missing kvm_iommu_map_pages/kvm_iommu_unmap_pages

2013-05-06 Thread Alexey Kardashevskiy
On 05/07/2013 11:42 AM, Alex Williamson wrote:
 On Tue, 2013-05-07 at 10:49 +1000, Alexey Kardashevskiy wrote:
 On 05/07/2013 07:07 AM, Alex Williamson wrote:
 On Mon, 2013-05-06 at 17:21 +1000, a...@ozlabs.ru wrote:
 From: Alexey Kardashevskiy a...@ozlabs.ru

 The IOMMU API implements groups creating/deletion, device binding
 and IOMMU map/unmap operations.

 The PowerPC implementation uses most of the API except map/unmap
 operations, which are implemented on POWER using hypercalls.

 However, in order to link a kernel with the CONFIG_IOMMU_API enabled,
 the empty kvm_iommu_map_pages/kvm_iommu_unmap_pages have to be
 defined, so this defines them.

 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 Cc: David Gibson da...@gibson.dropbear.id.au
 Signed-off-by: Paul Mackerras pau...@samba.org
 ---
  arch/powerpc/include/asm/kvm_host.h |   14 ++
  1 file changed, 14 insertions(+)

 diff --git a/arch/powerpc/include/asm/kvm_host.h 
 b/arch/powerpc/include/asm/kvm_host.h
 index b6a047e..c025d91 100644
 --- a/arch/powerpc/include/asm/kvm_host.h
 +++ b/arch/powerpc/include/asm/kvm_host.h
 @@ -603,4 +603,18 @@ struct kvm_vcpu_arch {
  
  #define __KVM_HAVE_ARCH_WQP
  
 +#ifdef CONFIG_IOMMU_API
 +/* POWERPC does not use IOMMU API for mapping/unmapping */
 +static inline int kvm_iommu_map_pages(struct kvm *kvm,
 +  struct kvm_memory_slot *slot)
 +{
 +  return 0;
 +}
 +
 +static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
 +  struct kvm_memory_slot *slot)
 +{
 +}
 +#endif /* CONFIG_IOMMU_API */
 +
  #endif /* __POWERPC_KVM_HOST_H__ */

 This is no longer needed, Gleb applied my patch for 3.10 that makes all
 of KVM device assignment dependent on a build config option, and the top
 level kvm_host.h now includes this when that is not set.  Thanks,

 Cannot find it, could you point me please where it is on github or
 git.kernel.org? Thanks.
 
 http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2a5bab1004729f3302c776e53ee7c895b98bb1ce


Yes, I confirm, this patch is not needed any more. Thanks!



-- 
Alexey


[PATCH] kvm/ppc: interrupt disabling fixes

2013-05-06 Thread Scott Wood
booke64 was not maintaining consistent lazy ee state when exiting the
guest, leading to warnings and worse.

booke32 was less affected due to the absence of lazy ee, but it was
still feeding bad information into trace_hardirqs_off/on -- we don't
want guest execution to be seen as an IRQs off interval.  book3s_pr
also has this problem.

book3s_pr and booke both used kvmppc_lazy_ee_enable() without
hard-disabling EE first, which could lead to races when irq_happened is
cleared, or if an interrupt happens after kvmppc_lazy_ee_enable(), and
possibly other issues.

Now, on book3s_pr and booke, always hard-disable interrupts before
kvmppc_prepare_to_enter(), but leave them soft-enabled.  On book3s,
this should result in the right lazy EE state when the asm code
hard-enables on an exit.  On booke, we call hard_irq_disable() rather
than hard-enable immediately.
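In outline, the entry path then looks like this (a condensed sketch of what
the diff below does, not a literal excerpt):

	hard_irq_disable();		/* EE off; still soft-enabled */
	trace_hardirqs_off();		/* keep the irq tracers honest */
	s = kvmppc_prepare_to_enter(vcpu);
	if (s <= 0)
		local_irq_enable();	/* back out to the host */
	/* otherwise enter the guest; the exit path hard-disables again
	 * and fixes up the lazy EE state */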

Signed-off-by: Scott Wood scottw...@freescale.com
Cc: Mihai Caraman mihai.cara...@freescale.com
Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
Cc: Tiejun Chen tiejun.c...@windriver.com
---
Only tested on booke (32 and 64 bit).  Testers of book3s_pr would be
appreciated (particularly with lockdep enabled).
---
 arch/powerpc/include/asm/kvm_ppc.h |7 +++
 arch/powerpc/kvm/book3s_pr.c   |6 --
 arch/powerpc/kvm/booke.c   |   12 ++--
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index a5287fe..e55d7e5 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -399,6 +399,13 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
 static inline void kvmppc_lazy_ee_enable(void)
 {
 #ifdef CONFIG_PPC64
+	/*
+	 * To avoid races, the caller must have gone directly from having
+	 * interrupts fully-enabled to hard-disabled.
+	 */
+	WARN_ON(local_paca->irq_happened != PACA_IRQ_HARD_DIS);
+	trace_hardirqs_on();
+
 	/* Only need to enable IRQs by hard enabling them after this */
 	local_paca->irq_happened = 0;
 	local_paca->soft_enabled = 1;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index d09baf1..a1e70113 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -884,7 +884,8 @@ program_interrupt:
 * and if we really did time things so badly, then we just exit
 * again due to a host external interrupt.
 */
-   local_irq_disable();
+   hard_irq_disable();
+   trace_hardirqs_off();
s = kvmppc_prepare_to_enter(vcpu);
	if (s <= 0) {
local_irq_enable();
@@ -1121,7 +1122,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 * really did time things so badly, then we just exit again due to
 * a host external interrupt.
 */
-   local_irq_disable();
+   hard_irq_disable();
+   trace_hardirqs_off();
ret = kvmppc_prepare_to_enter(vcpu);
	if (ret <= 0) {
local_irq_enable();
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index ecbe908..5dc1f53 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -666,7 +666,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
return -EINVAL;
}
 
-   local_irq_disable();
+   hard_irq_disable();
+   trace_hardirqs_off();
s = kvmppc_prepare_to_enter(vcpu);
	if (s <= 0) {
local_irq_enable();
@@ -834,6 +835,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
int s;
int idx;
 
+#ifdef CONFIG_PPC64
+   WARN_ON(local_paca->irq_happened != 0);
+#endif
+   hard_irq_disable();
+   trace_hardirqs_off();
+
/* update before a new last_exit_type is rewritten */
kvmppc_update_timing_stats(vcpu);
 
@@ -1150,7 +1157,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 * aren't already exiting to userspace for some other reason.
 */
	if (!(r & RESUME_HOST)) {
-   local_irq_disable();
+   hard_irq_disable();
+   trace_hardirqs_off();
s = kvmppc_prepare_to_enter(vcpu);
	if (s <= 0) {
local_irq_enable();
-- 
1.7.10.4




[PATCH v18] pvpanic: pvpanic device driver

2013-05-06 Thread Hu Tao
pvpanic device is a qemu simulated device through which guest panic
event is sent to host.

Signed-off-by: Hu Tao hu...@cn.fujitsu.com
---

v17 -> v18:  1. call acpi_walk_resources to get the port, and use outb instead
of acpi_evaluate_object in the panic notifier callback
 2. reword help message
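For host-side testing: the matching QEMU device is the pvpanic ISA device
(I/O port 0x505). In QEMU builds that expose it as a pluggable device it can
be added on the command line -- whether the option is needed or even available
depends on the QEMU version, so treat this as an assumption:

	qemu-system-x86_64 -device pvpanic ...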

 drivers/platform/x86/Kconfig   |   8 +++
 drivers/platform/x86/Makefile  |   2 +
 drivers/platform/x86/pvpanic.c | 122 +
 3 files changed, 132 insertions(+)
 create mode 100644 drivers/platform/x86/pvpanic.c

diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 3338437..8577261 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -781,4 +781,12 @@ config APPLE_GMUX
  graphics as well as the backlight. Currently only backlight
  control is supported by the driver.
 
+config PVPANIC
+   tristate pvpanic device support
+   depends on ACPI
+   ---help---
+ This driver provides support for the pvpanic device.  pvpanic is
+ a paravirtualized device provided by QEMU; it lets a virtual machine
+ (guest) communicate panic events to the host.
+
 endif # X86_PLATFORM_DEVICES
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index ace2b38..ef0ec74 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -51,3 +51,5 @@ obj-$(CONFIG_INTEL_OAKTRAIL)  += intel_oaktrail.o
 obj-$(CONFIG_SAMSUNG_Q10)  += samsung-q10.o
 obj-$(CONFIG_APPLE_GMUX)   += apple-gmux.o
 obj-$(CONFIG_CHROMEOS_LAPTOP)  += chromeos_laptop.o
+
+obj-$(CONFIG_PVPANIC)   += pvpanic.o
diff --git a/drivers/platform/x86/pvpanic.c b/drivers/platform/x86/pvpanic.c
new file mode 100644
index 000..ddec5cb
--- /dev/null
+++ b/drivers/platform/x86/pvpanic.c
@@ -0,0 +1,122 @@
+/*
+ *  pvpanic.c - pvpanic Device Support
+ *
+ *  Copyright (C) 2013 Fujitsu.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  
USA
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <acpi/acpi_bus.h>
+#include <acpi/acpi_drivers.h>
+
+MODULE_AUTHOR("Hu Tao <hu...@cn.fujitsu.com>");
+MODULE_DESCRIPTION("pvpanic device driver");
+MODULE_LICENSE("GPL");
+
+static int pvpanic_add(struct acpi_device *device);
+static int pvpanic_remove(struct acpi_device *device);
+
+static const struct acpi_device_id pvpanic_device_ids[] = {
+   { "QEMU0001", 0},
+   { "", 0},
+};
+MODULE_DEVICE_TABLE(acpi, pvpanic_device_ids);
+
+#define PVPANIC_PANICKED   (1 << 0)
+
+static u16 port;
+
+static struct acpi_driver pvpanic_driver = {
+   .name = "pvpanic",
+   .class =    "QEMU",
+   .ids =  pvpanic_device_ids,
+   .ops =  {
+   .add =  pvpanic_add,
+   .remove =   pvpanic_remove,
+   },
+   .owner =THIS_MODULE,
+};
+
+static void
+pvpanic_send_event(unsigned int event)
+{
+   if (port)
+   outb(event, port);
+}
+
+static int
+pvpanic_panic_notify(struct notifier_block *nb, unsigned long code,
+void *unused)
+{
+   pvpanic_send_event(PVPANIC_PANICKED);
+   return NOTIFY_DONE;
+}
+
+static struct notifier_block pvpanic_panic_nb = {
+   .notifier_call = pvpanic_panic_notify,
+};
+
+
+static acpi_status
+pvpanic_walk_resources(struct acpi_resource *res, void *context)
+{
+   switch (res->type) {
+   case ACPI_RESOURCE_TYPE_END_TAG:
+   return AE_OK;
+
+   case ACPI_RESOURCE_TYPE_IO:
+   port = res->data.io.minimum;
+   return AE_OK;
+
+   default:
+   return AE_ERROR;
+   }
+}
+
+static int pvpanic_add(struct acpi_device *device)
+{
+   acpi_status status;
+   u64 ret;
+
+   status = acpi_evaluate_integer(device->handle, "_STA", NULL,
+  &ret);
+
+   if (ACPI_FAILURE(status) || (ret & 0x0B) != 0x0B)
+   return -ENODEV;
+
+   acpi_walk_resources(device->handle, METHOD_NAME__CRS,
+   pvpanic_walk_resources, NULL);
+
+   atomic_notifier_chain_register(&panic_notifier_list,
+ 

Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages

2013-05-06 Thread Xiao Guangrong
On 05/07/2013 03:50 AM, Marcelo Tosatti wrote:
 On Mon, May 06, 2013 at 11:39:11AM +0800, Xiao Guangrong wrote:
 On 05/04/2013 08:52 AM, Marcelo Tosatti wrote:
 On Sat, May 04, 2013 at 12:51:06AM +0800, Xiao Guangrong wrote:
 On 05/03/2013 11:53 PM, Marcelo Tosatti wrote:
 On Fri, May 03, 2013 at 01:52:07PM +0800, Xiao Guangrong wrote:
 On 05/03/2013 09:05 AM, Marcelo Tosatti wrote:

 +
 +/*
+ * Fast invalidate all shadow pages belonging to @slot.
+ *
+ * @slot != NULL means the invalidation is caused by the memslot specified
+ * by @slot being deleted; in this case, we should ensure that the rmap
+ * and lpage-info of the @slot cannot be used after calling the function.
+ *
+ * @slot == NULL means the invalidation is due to other reasons; we need
+ * not care about rmap and lpage-info since they are still valid after
+ * calling the function.
 + */
 +void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
 + struct kvm_memory_slot *slot)
 +{
+  spin_lock(&kvm->mmu_lock);
+  kvm->arch.mmu_valid_gen++;
+
+  /*
+   * All shadow pages are invalid; reset the large page info,
+   * then we can safely destroy the memslot. It is also good
+   * for large page usage.
 +   */
 +  kvm_clear_all_lpage_info(kvm);

 Xiao,

 I understood it was agreed that simple mmu_lock lockbreak while
 avoiding zapping of newly instantiated pages upon a

 if(spin_needbreak)
 cond_resched_lock()

 cycle was enough as a first step? And then later introduce root zapping
 along with measurements.

 https://lkml.org/lkml/2013/4/22/544

 Yes, it is.

 See the changelog in 0/0:

  we use lock-break technique to zap all sptes linked on the
 invalid rmap, it is not very effective but good for the first step.

 Thanks!

 Sure, but what is up with zeroing kvm_clear_all_lpage_info(kvm) and
 zapping the root? Only lock-break technique along with generation number 
 was what was agreed.

 Marcelo,

 Please Wait... I am completely confused. :(

 Let's clarify zeroing kvm_clear_all_lpage_info(kvm) and zapping the root 
 first.
 Are these changes you wanted?

 void kvm_mmu_invalid_memslot_pages(struct kvm *kvm,
   struct kvm_memory_slot *slot)
 {
	spin_lock(&kvm->mmu_lock);
	kvm->arch.mmu_valid_gen++;

	/* Zap all root pages. */
restart:
	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
		if (!sp->root_count)
			continue;

		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
			goto restart;
	}

	/*
	 * All shadow pages are invalid; reset the large page info,
	 * then we can safely destroy the memslot. It is also good
	 * for large page usage.
	 */
	kvm_clear_all_lpage_info(kvm);

	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	spin_unlock(&kvm->mmu_lock);
 }

 static void rmap_remove(struct kvm *kvm, u64 *spte)
 {
struct kvm_mmu_page *sp;
gfn_t gfn;
unsigned long *rmapp;

sp = page_header(__pa(spte));
 +
+	/* Do not let an invalid sp access its rmap. */
+	if (!sp_is_valid(sp))
+		return;
+
	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
pte_list_remove(spte, rmapp);
 }

 If yes, there is the reason why we can not do this that i mentioned before:

after calling kvm_mmu_invalid_memslot_pages(), the memslot->rmap will be
destroyed.
Later, if the host reclaims a page, the mmu-notifier handlers, ->invalidate_page
and ->invalidate_range_start, cannot find any spte using the host page, so the
Accessed/Dirty state of the host page is no longer tracked
(kvm_set_pfn_accessed and kvm_set_pfn_dirty are not called properly).
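For context, the path that would break, sketched (simplified; the exact helper
names vary by kernel version):

	mmu_notifier ->invalidate_page
	  -> kvm_unmap_hva()
	       -> walk memslot->rmap for the hva
	            -> drop_spte()
	                 -> kvm_set_pfn_accessed()/kvm_set_pfn_dirty()

With memslot->rmap already torn down, the walk finds no sptes and the host
page's Accessed/Dirty state is never propagated back.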

 What's your idea?


 Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via
 spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all 
 releases mmu_lock and reacquires it again, only shadow pages 
 from the generation with which kvm_mmu_zap_all started are zapped (this
 guarantees forward progress and eventual termination).

 kvm_mmu_zap_generation()
 spin_lock(mmu_lock)
 int generation = kvm->arch.mmu_generation;

 for_each_shadow_page(sp) {
         if (sp->generation == kvm->arch.mmu_generation)
                 zap_page(sp)
         if (spin_needbreak(mmu_lock)) {
                 kvm->arch.mmu_generation++;
                 cond_resched_lock(mmu_lock);
         }
 }

 kvm_mmu_zap_all()
 spin_lock(mmu_lock)
 for_each_shadow_page(sp) {
 if (spin_needbreak(mmu_lock)) {
 cond_resched_lock(mmu_lock);
 }
 }

 Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot.
 Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm.

 This addresses the main problem: excessively long hold times 
 of kvm_mmu_zap_all with very large guests.

 Do you see any problem with this logic? This was what i was thinking 
 we agreed.

 No. I understand it 

Re: [PATCH] kvm/ppc/booke64: Hard disable interrupts when entering the guest

2013-05-06 Thread Benjamin Herrenschmidt
On Mon, 2013-05-06 at 22:05 -0500, Scott Wood wrote:
 On 05/06/2013 07:03:14 PM, Benjamin Herrenschmidt wrote:
  On Mon, 2013-05-06 at 18:53 -0500, Scott Wood wrote:
  
  Ie. The last stage of entry will hard enable, so they should be
  soft-enabled too... if not, latency trackers will consider the whole
  guest periods as interrupt disabled...
  
   OK... I guess we already have that problem on 32-bit as well?
  
  32-bit doesn't do lazy disable, so the situation is a lot easier there.
 
 Right, but it still currently enters the guest with interrupts marked  
 as disabled, so we'd have the same latency tracker issue.
 
  Another problem is that hard_irq_disable() doesn't call
  trace_hardirqs_off()... We might want to fix that:
  
  static inline void hard_irq_disable(void)
  {
  __hard_irq_disable();
  	if (get_paca()->soft_enabled)
  		trace_hardirqs_off();
  	get_paca()->soft_enabled = 0;
  	get_paca()->irq_happened |= PACA_IRQ_HARD_DIS;
  }
 
 Is it possible there are places that assume the current behavior?

There aren't many callers, I think this should be safe. Most
callers call it with interrupts already soft disabled, so that
should be a nop in these cases (idle for example).

But I can give it a quick spin today on a machine or two.

   We also don't want PACA_IRQ_HARD_DIS to be cleared the way
   prep_irq_for_idle() does, because that's what lets the
   local_irq_enable() do the hard-enabling after we exit the guest.
  
 Then set it again. Don't leave the kernel in a state where soft_enabled
 is 1 and irq_happened is non-zero. It might work in the specific KVM
 case we are looking at now because we know we are coming back via KVM
 exit and putting things right again but it's fragile, somebody will come
 back and break it, etc...
 
 KVM is a pretty special case -- at least on booke, it's required that  
 all exits from guest state go through the KVM exception code.  I think  
 it's less likely that that changes, than something breaks in the code  
 to fix up lazy ee state (especially since we've already seen the latter  
 happen).
 
 I'll give it a shot, though.
 
  If necessary, create (or improve existing) helpers that do the right
  state adjustement. The cost of a couple of byte stores is negligible,
  I'd rather you make sure everything remains in sync at all times.
 
 My concern was mainly about complexity -- it seemed simpler to just say
 that during guest execution, the CPU is in a special state that is not
 visible to anything that cares about lazy EE.  The fact that EE can
 actually be *off* and we still take the interrupt supports its
 specialness. :-)

Yeah ... sort of :-)

Cheers,
Ben.




Re: [PATCH] kvm/ppc: interrupt disabling fixes

2013-05-06 Thread Benjamin Herrenschmidt
On Mon, 2013-05-06 at 22:32 -0500, Scott Wood wrote:
 +   hard_irq_disable();
 +   trace_hardirqs_off();

I still think hard_irq_disable() should be fixed to do the right thing
here :-)

I'll do that standalone patch here and give it a spin.

Cheers,
Ben.




Re: Fwd: Booting physically installed Windows while in Arch (AHCI support in OVMF?)

2013-05-06 Thread Michael Tokarev
06.05.2013 00:42, Evert Heylen wrote:
 Please, any help?

I think the easiest way is to convert your existing system from UEFI
back to regular bios + MBR.  For that, you need to disable UEFI boot
in bios and convert GPT to MBR on the HDD.

This is because, as you figured, ahci support in OVMF isn't exactly
working, and because generally, UEFI emulation and UEFI support as
whole is a bit too young still.
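If you do convert, note that gdisk has a built-in GPT-to-MBR conversion in its
recovery & transformation menu; roughly (back up the disk first -- this is a
sketch of the interactive sequence, not a guaranteed recipe, and it only works
when the layout fits in an MBR):

	# gdisk /dev/sda
	r	# recovery and transformation options
	g	# convert GPT into MBR and exit
	w	# write the new MBR and quit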

/mjt

 I'm currently in such a state that I won't be able to sleep well before I
 make some progress on this.
 I've already described my situation quite precisely; if anyone needs even
 more information, just ask.
 
 I've now also tried with a separate img containing DUET, so I can use
 the default seabios to boot DUET, which can boot Windows in UEFI
 mode. However, DUET just doesn't see my disk at all, be it in IDE or
 AHCI mode. If I boot the same img *physically* (from a usb), I can
 enter DUET and I can see my physical disk (which is running in AHCI
 mode). So I guess this is an issue with KVM/QEMU.
 
 Any ideas would be greatly appreciated.
 
 On Sun, Apr 28, 2013 at 6:29 PM, Evert Heylen everthey...@gmail.com wrote:
 Hi all, My situation is the following:
 My PC (x64) has an UEFI capable motherboard (ASRock Z77). On my hard
 drive (which is GPT formatted ofc), I have Windows 7 installed on
 /dev/sda3 and Arch Linux on /dev/sda2. I can boot both OSes. However,
 I would like to boot Windows while in Arch, using KVM. I'm using the
 OVMF images. I tried it right away with this command:

 qemu-system-x86_64 -enable-kvm -smp 4 -cpu host -m 4096 -hda /dev/sda
 -L /path/to/ovmf/

 It doesn't work. When booting in safe mode in Windows, I can see that
 Windows fails when trying to load CLASSPNP.sys. After some googling I
 found out that it might be because qemu 'mounts' the drive in IDE
 mode, while windows expects it to be in AHCI mode (because it was
 installed in AHCI mode). Then, after some more googling, I tried this
 command, which should (correct me if I'm wrong) mount the drive in
 AHCI mode.

 qemu-system-x86_64 -enable-kvm -smp 4 -cpu host -m 4096 -L
 /path/to/ovmf -device ahci,id=ahci0 -drive
 if=none,file=/dev/sda,format=raw,id=drive-sata0-0-0 -device
 driver=ide-drive,bus=ahci0.0,drive=drive-sata0-0-0,id=sata0-0-0

 However, with this command OVMF doesn't seem to recognise any drive at
 all, the 'Boot from file' screen is empty.

 So, I would like to know if OVMF supports AHCI, and if it doesn't, do
 you have any other ideas?
 I know it's generally not a good idea to boot a physically installed
 OS in a vm, but I want to try it anyway.

 Thanks,
 Evert
 



Re: [PATCH 2/6] KVM: PPC: Add support for multiple-TCE hcalls

2013-05-06 Thread David Gibson
On Mon, May 06, 2013 at 05:25:53PM +1000, Alexey Kardashevskiy wrote:
 This adds real mode handlers for the H_PUT_TCE_INDIRECT and
 H_STUFF_TCE hypercalls for QEMU emulated devices such as virtio
 devices or emulated PCI.  These calls allow adding multiple entries
 (up to 512) into the TCE table in one call which saves time on
 transition to/from real mode.
 
 This adds a guest physical to host real address converter
 and calls the existing H_PUT_TCE handler. The converting function
 is going to be fully utilized by upcoming VFIO supporting patches.
 
 This also implements the KVM_CAP_PPC_MULTITCE capability,
 so in order to support the functionality of this patch, QEMU
 needs to query for this capability and set the hcall-multi-tce
 hypertas property only if the capability is present, otherwise
 there will be serious performance degradation.
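A rough sketch of what the indirect form does per call (hypothetical helper
names -- gpa_to_host_va() stands in for the guest-physical-to-host-real
converter the changelog mentions, and put_tce() for the existing H_PUT_TCE
handler):

	long h_put_tce_indirect_sketch(unsigned long liobn, unsigned long ioba,
				       unsigned long tce_list, unsigned long npages)
	{
		u64 *tces = gpa_to_host_va(tce_list);	/* page of up to 512 TCEs */
		unsigned long i;
		long ret;

		for (i = 0; i < npages; i++) {
			ret = put_tce(liobn, ioba + (i << IOMMU_PAGE_SHIFT), tces[i]);
			if (ret != H_SUCCESS)
				return ret;	/* stop on the first failure */
		}
		return H_SUCCESS;
	}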
 
 Cc: David Gibson da...@gibson.dropbear.id.au
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 Signed-off-by: Paul Mackerras pau...@samba.org

Fwiw, it would be nice to get this patch merged, regardless of the
rest of the VFIO/powerpc patches.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson




Re: [PATCH 5/6] KVM: PPC: Add support for IOMMU in-kernel handling

2013-05-06 Thread David Gibson
On Mon, May 06, 2013 at 05:25:56PM +1000, Alexey Kardashevskiy wrote:
 This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT
 and H_STUFF_TCE requests without passing them to QEMU, which should
 save time on switching to QEMU and back.
 
 Both real and virtual modes are supported - whenever the kernel
 fails to handle TCE request, it passes it to the virtual mode.
 If the virtual mode handlers fail, then the request is passed
 to the user mode, for example, to QEMU.
 
 This adds a new KVM_CAP_SPAPR_TCE_IOMMU ioctl to associate
 a virtual PCI bus ID (LIOBN) with an IOMMU group, which enables
 in-kernel handling of IOMMU map/unmap.
 
 This adds a special case for huge pages (16MB).  The reference
 counting cannot be easily done for such pages in real mode (when
 MMU is off) so we added a list of huge pages.  It is populated in
 virtual mode and get_page is called just once per a huge page.
 Real mode handlers check if the requested page is huge and in the list,
 then no reference counting is done, otherwise an exit to virtual mode
 happens.  The list is released at KVM exit.  At the moment the fastest
 card available for tests uses up to 9 huge pages so walking through this
 list is not very expensive.  However this can change and we may want
 to optimize this.
 
 This also adds the virt_only parameter to the KVM module
 for debug and performance check purposes.
 
 Tests show that this patch increases transmission speed from 220MB/s
 to 750..1020MB/s on 10Gb network (Chelsio CXGB3 10Gb ethernet card).
 
 Cc: David Gibson da...@gibson.dropbear.id.au
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 Signed-off-by: Paul Mackerras pau...@samba.org
 ---
  Documentation/virtual/kvm/api.txt   |   28 
  arch/powerpc/include/asm/kvm_host.h |2 +
  arch/powerpc/include/asm/kvm_ppc.h  |2 +
  arch/powerpc/include/uapi/asm/kvm.h |7 +
  arch/powerpc/kvm/book3s_64_vio.c|  242 
 ++-
  arch/powerpc/kvm/book3s_64_vio_hv.c |  192 +++
  arch/powerpc/kvm/powerpc.c  |   12 ++
  include/uapi/linux/kvm.h|2 +
  8 files changed, 485 insertions(+), 2 deletions(-)
 
 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index f621cd6..2039767 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -2127,6 +2127,34 @@ written, then `n_invalid' invalid entries, 
 invalidating any previously
  valid entries found.
  
  
 +4.79 KVM_CREATE_SPAPR_TCE_IOMMU
 +
 +Capability: KVM_CAP_SPAPR_TCE_IOMMU
 +Architectures: powerpc
 +Type: vm ioctl
 +Parameters: struct kvm_create_spapr_tce_iommu (in)
 +Returns: 0 on success, -1 on error
 +
 +This creates a link between IOMMU group and a hardware TCE (translation
 +control entry) table. This link lets the host kernel know what IOMMU
 +group (i.e. TCE table) to use for the LIOBN number passed with
 +H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls.
 +
 +/* for KVM_CAP_SPAPR_TCE_IOMMU */
 +struct kvm_create_spapr_tce_iommu {
 + __u64 liobn;
 + __u32 iommu_id;

Wouldn't it be more in keeping 

 + __u32 flags;
 +};
 +
 +No flag is supported at the moment.
 +
 +When the guest issues TCE call on a liobn for which a TCE table has been
 +registered, the kernel will handle it in real mode, updating the hardware
 +TCE table. TCE table calls for other liobns will cause a vm exit and must
 +be handled by userspace.
 +
 +
  5. The kvm_run structure
  
  
 diff --git a/arch/powerpc/include/asm/kvm_host.h 
 b/arch/powerpc/include/asm/kvm_host.h
 index 36ceb0d..2b70cbc 100644
 --- a/arch/powerpc/include/asm/kvm_host.h
 +++ b/arch/powerpc/include/asm/kvm_host.h
 @@ -178,6 +178,8 @@ struct kvmppc_spapr_tce_table {
   struct kvm *kvm;
   u64 liobn;
   u32 window_size;
 + bool virtmode_only;

I see this is now initialized from the global parameter, but I think
it would be better to just check the global (debug) parameter
directly, rather than duplicating it here.

 + struct iommu_group *grp;/* used for IOMMU groups */
   struct page *pages[0];
  };
  
 diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
 b/arch/powerpc/include/asm/kvm_ppc.h
 index d501246..bdfa140 100644
 --- a/arch/powerpc/include/asm/kvm_ppc.h
 +++ b/arch/powerpc/include/asm/kvm_ppc.h
 @@ -139,6 +139,8 @@ extern void kvmppc_xics_free(struct kvm *kvm);
  
  extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
   struct kvm_create_spapr_tce *args);
 +extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
 + struct kvm_create_spapr_tce_iommu *args);
  extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
   struct kvm_vcpu *vcpu, unsigned long liobn);
  extern long kvmppc_emulated_h_put_tce(struct kvmppc_spapr_tce_table *stt,
 diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
 b/arch/powerpc/include/uapi/asm/kvm.h
 

Re: [PATCH 5/6] KVM: PPC: Add support for IOMMU in-kernel handling

2013-05-06 Thread Alexey Kardashevskiy
On 05/07/2013 03:29 PM, David Gibson wrote:
 On Mon, May 06, 2013 at 05:25:56PM +1000, Alexey Kardashevskiy wrote:
 This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT
 and H_STUFF_TCE requests without passing them to QEMU, which should
 save time on switching to QEMU and back.

 Both real and virtual modes are supported - whenever the kernel
 fails to handle TCE request, it passes it to the virtual mode.
 If the virtual mode handlers fail, then the request is passed
 to the user mode, for example, to QEMU.

 This adds a new KVM_CAP_SPAPR_TCE_IOMMU ioctl to associate
 a virtual PCI bus ID (LIOBN) with an IOMMU group, which enables
 in-kernel handling of IOMMU map/unmap.

 This adds a special case for huge pages (16MB).  The reference
 counting cannot be easily done for such pages in real mode (when
 MMU is off) so we added a list of huge pages.  It is populated in
 virtual mode and get_page is called just once per a huge page.
 Real mode handlers check if the requested page is huge and in the list,
 then no reference counting is done, otherwise an exit to virtual mode
 happens.  The list is released at KVM exit.  At the moment the fastest
 card available for tests uses up to 9 huge pages so walking through this
 list is not very expensive.  However this can change and we may want
 to optimize this.

 This also adds the virt_only parameter to the KVM module
 for debug and performance check purposes.

 Tests show that this patch increases transmission speed from 220MB/s
 to 750..1020MB/s on 10Gb network (Chelsio CXGB3 10Gb ethernet card).

 Cc: David Gibson da...@gibson.dropbear.id.au
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 Signed-off-by: Paul Mackerras pau...@samba.org
 ---
  Documentation/virtual/kvm/api.txt   |   28 
  arch/powerpc/include/asm/kvm_host.h |2 +
  arch/powerpc/include/asm/kvm_ppc.h  |2 +
  arch/powerpc/include/uapi/asm/kvm.h |7 +
  arch/powerpc/kvm/book3s_64_vio.c|  242 
 ++-
  arch/powerpc/kvm/book3s_64_vio_hv.c |  192 +++
  arch/powerpc/kvm/powerpc.c  |   12 ++
  include/uapi/linux/kvm.h|2 +
  8 files changed, 485 insertions(+), 2 deletions(-)

 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index f621cd6..2039767 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -2127,6 +2127,34 @@ written, then `n_invalid' invalid entries, 
 invalidating any previously
  valid entries found.
  
  
 +4.79 KVM_CREATE_SPAPR_TCE_IOMMU
 +
 +Capability: KVM_CAP_SPAPR_TCE_IOMMU
 +Architectures: powerpc
 +Type: vm ioctl
 +Parameters: struct kvm_create_spapr_tce_iommu (in)
 +Returns: 0 on success, -1 on error
 +
 +This creates a link between IOMMU group and a hardware TCE (translation
 +control entry) table. This link lets the host kernel know what IOMMU
 +group (i.e. TCE table) to use for the LIOBN number passed with
 +H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls.
 +
 +/* for KVM_CAP_SPAPR_TCE_IOMMU */
 +struct kvm_create_spapr_tce_iommu {
 +__u64 liobn;
 +__u32 iommu_id;
 
 Wouldn't it be more in keeping 


pardon?



 +__u32 flags;
 +};
 +
 +No flag is supported at the moment.
 +
 +When the guest issues TCE call on a liobn for which a TCE table has been
 +registered, the kernel will handle it in real mode, updating the hardware
 +TCE table. TCE table calls for other liobns will cause a vm exit and must
 +be handled by userspace.
 +
 +
  5. The kvm_run structure
  
  
 diff --git a/arch/powerpc/include/asm/kvm_host.h 
 b/arch/powerpc/include/asm/kvm_host.h
 index 36ceb0d..2b70cbc 100644
 --- a/arch/powerpc/include/asm/kvm_host.h
 +++ b/arch/powerpc/include/asm/kvm_host.h
 @@ -178,6 +178,8 @@ struct kvmppc_spapr_tce_table {
  struct kvm *kvm;
  u64 liobn;
  u32 window_size;
 +bool virtmode_only;
 
 I see this is now initialized from the global parameter, but I think
 it would be better to just check the global (debug) parameter
 directly, rather than duplicating it here.


The global parameter is in kvm.ko and the struct above is in the real mode
part which cannot go to the module.



 +struct iommu_group *grp;/* used for IOMMU groups */
  struct page *pages[0];
  };
  
 diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
 b/arch/powerpc/include/asm/kvm_ppc.h
 index d501246..bdfa140 100644
 --- a/arch/powerpc/include/asm/kvm_ppc.h
 +++ b/arch/powerpc/include/asm/kvm_ppc.h
 @@ -139,6 +139,8 @@ extern void kvmppc_xics_free(struct kvm *kvm);
  
  extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
  struct kvm_create_spapr_tce *args);
 +extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
 +struct kvm_create_spapr_tce_iommu *args);
  extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
  struct kvm_vcpu *vcpu, unsigned long liobn);
  extern 

[PATCH 4/5] powerpc/vfio: Implement IOMMU driver for VFIO

2013-05-06 Thread aik
From: Alexey Kardashevskiy a...@ozlabs.ru

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling.  This implements an IOMMU driver for VFIO
which does mapping/unmapping pages for the guest IO and
provides information about DMA window (required by a POWER
guest).

Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Signed-off-by: Paul Mackerras pau...@samba.org
---

Change log:
* no more PPC versions for vfio_iommu_spapr_tce_dma_(un)map (type1 structs 
reused)
* documentation updated
* container enable/disable ioctls added
* request_module(spapr_iommu) added
* various locks fixed
* multiple TCE mapping support (no clients for that for now as SPAPR
does it in a different way)


---
 Documentation/vfio.txt  |   63 ++
 drivers/vfio/Kconfig|6 +
 drivers/vfio/Makefile   |1 +
 drivers/vfio/vfio.c |1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  377 +++
 include/uapi/linux/vfio.h   |   34 
 6 files changed, 482 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 8eda363..c55533c 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls.  The 
read/write/mmap
 interfaces implement the device region access defined by the device's
 own VFIO_DEVICE_GET_REGION_INFO ioctl.
 
+
+PPC64 sPAPR implementation note
+---
+
+This implementation has some specifics:
+
+1) Only one IOMMU group per container is supported as an IOMMU group
+represents the minimal entity which isolation can be guaranteed for and
+groups are allocated statically, one per a Partitionable Endpoint (PE)
+(PE is often a PCI domain but not always).
+
+2) The hardware supports so called DMA windows - the PCI address range
+within which DMA transfer is allowed, any attempt to access address space
+out of the window leads to the whole PE isolation.
+
+3) PPC64 guests are paravirtualized but not fully emulated. There is an API
+to map/unmap pages for DMA, and it normally maps 1..32 pages per call and
+currently there is no way to reduce the number of calls. In order to make things
+faster, the map/unmap handling has been implemented in real mode, which provides
+excellent performance but has limitations such as the inability to do
+locked pages accounting in real time.
+
+So 3 additional ioctls have been added:
+
+   VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
+   of the DMA window on the PCI bus.
+
+   VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting
+   is done at this point. This lets user first to know what
+   the DMA window is and adjust rlimit before doing any real job.
+
+   VFIO_IOMMU_DISABLE - disables the container.
+
+
+The code flow from the example above should be slightly changed:
+
+   .
+   /* Add the group to the container */
+   ioctl(group, VFIO_GROUP_SET_CONTAINER, container);
+
+   /* Enable the IOMMU model we want */
+   ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU)
+
+   /* Get additional sPAPR IOMMU info */
+   struct vfio_iommu_spapr_tce_info spapr_iommu_info;
+   ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &spapr_iommu_info);
+
+   if (ioctl(container, VFIO_IOMMU_ENABLE))
+   /* Cannot enable container, may be low rlimit */
+
+   /* Allocate some space and setup a DMA mapping */
+   dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE,
+MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+
+   dma_map.size = 1024 * 1024;
+   dma_map.iova = 0; /* 1MB starting at 0x0 from device view */
+   dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+   /* Check here if .iova/.size are within the DMA window from spapr_iommu_info */
+
+   ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
+   .
+
 ---
 
 [1] VFIO was originally an acronym for Virtual Function I/O in its
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
depends on VFIO
default n
 
+config VFIO_IOMMU_SPAPR_TCE
+   tristate
	depends on VFIO && SPAPR_TCE_IOMMU
+   default n
+
 menuconfig VFIO
tristate VFIO Non-Privileged userspace driver framework
depends on IOMMU_API
select VFIO_IOMMU_TYPE1 if X86
+   select VFIO_IOMMU_SPAPR_TCE if 

[PATCH 5/5] powerpc/vfio: Enable on pSeries platform

2013-05-06 Thread aik
From: Alexey Kardashevskiy a...@ozlabs.ru

This enables VFIO on the pSeries platform, allowing user space
programs to access PCI devices directly.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Paul Mackerras pau...@samba.org
---
 arch/powerpc/platforms/pseries/iommu.c |4 
 drivers/iommu/Kconfig  |2 +-
 drivers/vfio/Kconfig   |2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index e2685ba..e178acc 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -613,6 +613,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 
 	iommu_table_setparms(pci->phb, dn, tbl);
 	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
+   iommu_register_group(tbl, pci_domain_nr(bus), 0);
 
 	/* Divide the rest (1.75GB) among the children */
 	pci->phb->dma_window_size = 0x80000000ul;
@@ -657,6 +658,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 				   ppci->phb->node);
 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
 		ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
+		iommu_register_group(tbl, pci_domain_nr(bus), 0);
 		pr_debug("  created table: %p\n", ppci->iommu_table);
}
 }
@@ -683,6 +685,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 				   phb->node);
 		iommu_table_setparms(phb, dn, tbl);
 		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
+		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
 		set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
return;
}
@@ -1145,6 +1148,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev 
*dev)
 				   pci->phb->node);
 		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
 		pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
+		iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
 		pr_debug("  created table: %p\n", pci->iommu_table);
 	} else {
 		pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 175e0f4..2d75ea0 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -189,7 +189,7 @@ config EXYNOS_IOMMU_DEBUG
 
 config SPAPR_TCE_IOMMU
bool sPAPR TCE IOMMU Support
-   depends on PPC_POWERNV
+   depends on PPC_POWERNV || PPC_PSERIES
select IOMMU_API
help
  Enables bits of IOMMU API required by VFIO. The iommu_ops
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index b464687..26b3d9d 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -12,7 +12,7 @@ menuconfig VFIO
tristate VFIO Non-Privileged userspace driver framework
depends on IOMMU_API
select VFIO_IOMMU_TYPE1 if X86
-   select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
+   select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
help
  VFIO provides a framework for secure userspace device drivers.
  See Documentation/vfio.txt for more details.
-- 
1.7.10.4



[PATCH 1/5] iommu: Move initialization earlier

2013-05-06 Thread aik
From: Alexey Kardashevskiy a...@ozlabs.ru

The iommu_init() call initializes IOMMU internal structures and data
required for the API to function such as iommu_group_alloc().
It is registered as a subsys_initcall.

One of the IOMMU users is the PCI subsystem on POWER, which discovers new
IOMMU tables during the PCI scan, so the most logical place to call
iommu_group_alloc() is when a new group is discovered. However, the PCI
scan is done from a subsys_initcall hook as well, which makes use of the
IOMMU API impossible at that point.

This moves IOMMU subsystem initialization one step earlier.
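For reference, built-in initcalls run level by level in this order (from
include/linux/init.h), so arch_initcall puts iommu_init() ahead of the
subsys_initcall-driven PCI scan:

	core_initcall -> postcore_initcall -> arch_initcall -> subsys_initcall
	              -> fs_initcall -> device_initcall -> late_initcall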

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Cc: David Gibson da...@gibson.dropbear.id.au
Signed-off-by: Paul Mackerras pau...@samba.org
---
 drivers/iommu/iommu.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 5514dfa..0de83eb 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -890,7 +890,7 @@ static int __init iommu_init(void)
 
return 0;
 }
-subsys_initcall(iommu_init);
+arch_initcall(iommu_init);
 
 int iommu_domain_get_attr(struct iommu_domain *domain,
  enum iommu_attr attr, void *data)
-- 
1.7.10.4


