Re: [uq/master PATCH] kvmvapic: add ioport read accessor
On 2013-05-05 22:51, Marcelo Tosatti wrote: Necessary since memory region accessor assumes read and write methods are registered. Otherwise reading I/O port 0x7e segfaults. https://bugzilla.redhat.com/show_bug.cgi?id=954306 Signed-off-by: Marcelo Tosatti mtosa...@redhat.com diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c index 5b558aa..655483b 100644 --- a/hw/i386/kvmvapic.c +++ b/hw/i386/kvmvapic.c @@ -687,8 +687,14 @@ static void vapic_write(void *opaque, hwaddr addr, uint64_t data, } } +static uint64_t vapic_read(void *opaque, hwaddr addr, unsigned size) +{ +return 0x; +} + static const MemoryRegionOps vapic_ops = { .write = vapic_write, +.read = vapic_read, .endianness = DEVICE_NATIVE_ENDIAN, }; Right. I'm just wondering why the guest reads from that port. Reviewed-by: Jan Kiszka jan.kis...@siemens.com -- Siemens AG, Corporate Technology, CT RTC ITP SDP-DE Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 02/13] nEPT: Move gpte_access() and prefetch_invalid_gpte() to paging_tmpl.h
For preparation, we just move gpte_access() and prefetch_invalid_gpte() from mmu.c to paging_tmpl.h. Signed-off-by: Nadav Har'El n...@il.ibm.com Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com --- arch/x86/kvm/mmu.c | 30 -- arch/x86/kvm/paging_tmpl.h | 40 +++- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 956ca35..a431495 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2480,26 +2480,6 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, return gfn_to_pfn_memslot_atomic(slot, gfn); } -static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - u64 gpte) -{ - if (is_rsvd_bits_set(vcpu-arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - - if (!is_present_gpte(gpte)) - goto no_present; - - if (!(gpte PT_ACCESSED_MASK)) - goto no_present; - - return false; - -no_present: - drop_spte(vcpu-kvm, spte); - return true; -} - static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -3399,16 +3379,6 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, return false; } -static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) -{ - unsigned access; - - access = (gpte (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - access = ~(gpte PT64_NX_SHIFT); - - return access; -} - static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) { unsigned index; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 105dd5b..13ceca6 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -103,6 +103,36 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } +static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) +{ + if (is_rsvd_bits_set(vcpu-arch.mmu, 
gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!is_present_gpte(gpte)) + goto no_present; + + if (!(gpte PT_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu-kvm, spte); + return true; +} + +static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned access; + + access = (gpte (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; + access = ~(gpte PT64_NX_SHIFT); + + return access; +} + static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, @@ -225,7 +255,7 @@ retry_walk: } accessed_dirty = pte; - pte_access = pt_access gpte_access(vcpu, pte); + pte_access = pt_access FNAME(gpte_access)(vcpu, pte); walker-ptes[walker-level - 1] = pte; } while (!is_last_gpte(mmu, walker-level, pte)); @@ -309,13 +339,13 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn_t gfn; pfn_t pfn; - if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; pgprintk(%s: gpte %llx spte %p\n, __func__, (u64)gpte, spte); gfn = gpte_to_gfn(gpte); - pte_access = sp-role.access gpte_access(vcpu, gpte); + pte_access = sp-role.access FNAME(gpte_access)(vcpu, gpte); protect_clean_gpte(pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log (pte_access ACC_WRITE_MASK)); @@ -782,14 +812,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (prefetch_invalid_gpte(vcpu, sp, sp-spt[i], gpte)) { + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, sp-spt[i], gpte)) { vcpu-kvm-tlbs_dirty++; continue; } gfn = gpte_to_gfn(gpte); pte_access = sp-role.access; - pte_access = gpte_access(vcpu, gpte); + pte_access = FNAME(gpte_access)(vcpu, gpte); protect_clean_gpte(pte_access, gpte); if (sync_mmio_spte(sp-spt[i], gfn,
[PATCH v2 01/13] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1
Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577 switch the EFER MSR when EPT is used and the host and guest have different NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2) and want to be able to run recent KVM as L1, we need to allow L1 to use this EFER switching feature. To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available, and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds support for the former (the latter is still unsupported). Nested entry and exit emulation (prepare_vmcs_02 and load_vmcs12_host_state, respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all that's left to do in this patch is to properly advertise this feature to L1. Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using vmx_set_efer (which itself sets one of several vmcs02 fields), so we always support this feature, regardless of whether the host supports it. Signed-off-by: Nadav Har'El n...@il.ibm.com Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com --- arch/x86/kvm/vmx.c | 18 ++ 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 867b810..485ded6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2057,6 +2057,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) #else nested_vmx_exit_ctls_high = 0; #endif + nested_vmx_exit_ctls_high |= VM_EXIT_LOAD_IA32_EFER; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2064,6 +2065,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_entry_ctls_low = 0; nested_vmx_entry_ctls_high = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; + nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_IA32_EFER; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -7050,10 +7052,18 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vcpu-arch.cr0_guest_owned_bits = 
~vmcs12-cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu-arch.cr0_guest_owned_bits); - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ - vmcs_write32(VM_EXIT_CONTROLS, - vmcs12-vm_exit_controls | vmcs_config.vmexit_ctrl); - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12-vm_entry_controls | + /* L2-L1 exit controls are emulated - the hardware exit is to L0 so +* we should use its exit controls. Note that IA32_MODE, LOAD_IA32_EFER +* bits are further modified by vmx_set_efer() below. +*/ + vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); + + /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are +* emulated by vmx_set_efer(), below. +*/ + vmcs_write32(VM_ENTRY_CONTROLS, + (vmcs12-vm_entry_controls ~VM_ENTRY_LOAD_IA32_EFER + ~VM_ENTRY_IA32E_MODE) | (vmcs_config.vmentry_ctrl ~VM_ENTRY_IA32E_MODE)); if (vmcs12-vm_entry_controls VM_ENTRY_LOAD_IA32_PAT) -- 1.8.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 03/13] nEPT: Add EPT tables support to paging_tmpl.h
This is the first patch in a series which adds nested EPT support to KVM's nested VMX. Nested EPT means emulating EPT for an L1 guest so that L1 can use EPT when running a nested guest L2. When L1 uses EPT, it allows the L2 guest to set its own cr3 and take its own page faults without either of L0 or L1 getting involved. This often significantly improves L2's performance over the previous two alternatives (shadow page tables over EPT, and shadow page tables over shadow page tables). This patch adds EPT support to paging_tmpl.h. paging_tmpl.h contains the code for reading and writing page tables. The code for 32-bit and 64-bit tables is very similar, but not identical, so paging_tmpl.h is #include'd twice in mmu.c, once with PTTYPE=32 and once with PTTYPE=64, and this generates the two sets of similar functions. There are subtle but important differences between the format of EPT tables and that of ordinary x86 64-bit page tables, so for nested EPT we need a third set of functions to read the guest EPT table and to write the shadow EPT table. So this patch adds a third PTTYPE, PTTYPE_EPT, which creates functions (prefixed with EPT) which correctly read and write EPT tables. 
Signed-off-by: Nadav Har'El n...@il.ibm.com Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com --- arch/x86/kvm/mmu.c | 5 + arch/x86/kvm/paging_tmpl.h | 43 +-- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a431495..cb9c6fd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3388,6 +3388,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp return mmu-last_pte_bitmap (1 index); } +#define PTTYPE_EPT 18 /* arbitrary */ +#define PTTYPE PTTYPE_EPT +#include paging_tmpl.h +#undef PTTYPE + #define PTTYPE 64 #include paging_tmpl.h #undef PTTYPE diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 13ceca6..5644f61 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -50,6 +50,22 @@ #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 #define CMPXCHG cmpxchg +#elif PTTYPE == PTTYPE_EPT + #define pt_element_t u64 + #define guest_walker guest_walkerEPT + #define FNAME(name) EPT_##name + #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) + #define PT_INDEX(addr, level) PT64_INDEX(addr, level) + #define PT_LEVEL_BITS PT64_LEVEL_BITS + #ifdef CONFIG_X86_64 + #define PT_MAX_FULL_LEVELS 4 + #define CMPXCHG cmpxchg + #else + #define CMPXCHG cmpxchg64 + #define PT_MAX_FULL_LEVELS 2 + #endif #else #error Invalid PTTYPE value #endif @@ -80,6 +96,10 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte PT_LVL_ADDR_MASK(lvl)) PAGE_SHIFT; } +#if PTTYPE != PTTYPE_EPT +/* + * Comment out this for EPT because update_accessed_dirty_bits() is not used. 
+ */ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) @@ -102,6 +122,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } +#endif static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, @@ -126,13 +147,21 @@ no_present: static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) { unsigned access; - +#if PTTYPE == PTTYPE_EPT + access = (gpte (VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | + VMX_EPT_EXECUTABLE_MASK)); +#else access = (gpte (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; access = ~(gpte PT64_NX_SHIFT); +#endif return access; } +#if PTTYPE != PTTYPE_EPT +/* + * EPT A/D bit support is not implemented. + */ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, @@ -169,6 +198,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, } return 0; } +#endif /* * Fetch a guest pte for a guest virtual address @@ -177,7 +207,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gva_t addr, u32 access) { - int ret; pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; @@
[PATCH v2 04/13] nEPT: Define EPT-specific link_shadow_page()
Since link_shadow_page() is used by a routine in mmu.c, add an EPT-specific link_shadow_page() in paging_tmp.h, rather than moving it. Signed-off-by: Nadav Har'El n...@il.ibm.com Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com --- arch/x86/kvm/paging_tmpl.h | 20 1 file changed, 20 insertions(+) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 5644f61..51dca23 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -461,6 +461,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, } } +#if PTTYPE == PTTYPE_EPT +static void FNAME(link_shadow_page)(u64 *sptep, struct kvm_mmu_page *sp) +{ + u64 spte; + + spte = __pa(sp-spt) | VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | + VMX_EPT_EXECUTABLE_MASK; + + mmu_spte_set(sptep, spte); +} +#endif + /* * Fetch a shadow pte for a specific level in the paging hierarchy. * If the guest tries to write a write-protected page, we need to @@ -513,7 +525,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, goto out_gpte_changed; if (sp) +#if PTTYPE == PTTYPE_EPT + FNAME(link_shadow_page)(it.sptep, sp); +#else link_shadow_page(it.sptep, sp); +#endif } for (; @@ -533,7 +549,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, true, direct_access, it.sptep); +#if PTTYPE == PTTYPE_EPT + FNAME(link_shadow_page)(it.sptep, sp); +#else link_shadow_page(it.sptep, sp); +#endif } clear_sp_write_flooding_count(it.sptep); -- 1.8.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 05/13] nEPT: MMU context for nested EPT
KVM's existing shadow MMU code already supports nested TDP. To use it, we need to set up a new MMU context for nested EPT, and create a few callbacks for it (nested_ept_*()). This context should also use the EPT versions of the page table access functions (defined in the previous patch). Then, we need to switch back and forth between this nested context and the regular MMU context when switching between L1 and L2 (when L1 runs this L2 with EPT). Signed-off-by: Nadav Har'El n...@il.ibm.com Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com --- arch/x86/kvm/mmu.c | 38 ++ arch/x86/kvm/mmu.h | 1 + arch/x86/kvm/vmx.c | 53 - 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cb9c6fd..99bfc5e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3644,6 +3644,44 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); +int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +{ + ASSERT(vcpu); + ASSERT(!VALID_PAGE(vcpu-arch.mmu.root_hpa)); + + context-shadow_root_level = kvm_x86_ops-get_tdp_level(); + + context-nx = is_nx(vcpu); /* TODO: ? */ + context-new_cr3 = paging_new_cr3; + context-page_fault = EPT_page_fault; + context-gva_to_gpa = EPT_gva_to_gpa; + context-sync_page = EPT_sync_page; + context-invlpg = EPT_invlpg; + context-update_pte = EPT_update_pte; + context-free = paging_free; + context-root_level = context-shadow_root_level; + context-root_hpa = INVALID_PAGE; + context-direct_map = false; + + /* TODO: reset_rsvds_bits_mask() is not built for EPT, we need + something different. 
+*/ + reset_rsvds_bits_mask(vcpu, context); + + + /* TODO: I copied these from kvm_init_shadow_mmu, I don't know why + they are done, or why they write to vcpu-arch.mmu and not context +*/ + vcpu-arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); + vcpu-arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); + vcpu-arch.mmu.base_role.smep_andnot_wp = + kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) + !is_write_protection(vcpu); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_EPT_mmu); + static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { int r = kvm_init_shadow_mmu(vcpu, vcpu-arch.walk_mmu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 6987108..19dd5ab 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -54,6 +54,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +int kvm_init_shadow_EPT_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 485ded6..8fdcacf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -918,6 +918,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, return vmcs12-pin_based_vm_exec_control PIN_BASED_VIRTUAL_NMIS; } +static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); +} + static inline bool is_exception(u32 intr_info) { return (intr_info (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -6873,6 +6878,46 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) entry-ecx |= bit(X86_FEATURE_VMX); } +/* Callbacks for nested_ept_init_mmu_context: */ + +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) +{ + /* return the page table to be 
shadowed - in our case, EPT12 */ + return get_vmcs12(vcpu)-ept_pointer; +} + +static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) +{ + struct vmcs12 *vmcs12; + nested_vmx_vmexit(vcpu); + vmcs12 = get_vmcs12(vcpu); + /* +* Note no need to set vmcs12-vm_exit_reason as it is already copied +* from vmcs02 in nested_vmx_vmexit() above, i.e., EPT_VIOLATION. +*/ + vmcs12-exit_qualification = fault-error_code; + vmcs12-guest_physical_address = fault-address; +} + +static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +{ + int r = kvm_init_shadow_EPT_mmu(vcpu, vcpu-arch.mmu); + + vcpu-arch.mmu.set_cr3 = vmx_set_cr3; + vcpu-arch.mmu.get_cr3 = nested_ept_get_cr3; + vcpu-arch.mmu.inject_page_fault = nested_ept_inject_page_fault; + + vcpu-arch.walk_mmu =
[PATCH v2 06/13] nEPT: Fix cr3 handling in nested exit and entry
The existing code for handling cr3 and related VMCS fields during nested exit and entry wasn't correct in all cases: If L2 is allowed to control cr3 (and this is indeed the case in nested EPT), during nested exit we must copy the modified cr3 from vmcs02 to vmcs12, and we forgot to do so. This patch adds this copy. If L0 isn't controlling cr3 when running L2 (i.e., L0 is using EPT), and whoever does control cr3 (L1 or L2) is using PAE, the processor might have saved PDPTEs and we should also save them in vmcs12 (and restore later). Signed-off-by: Nadav Har'El n...@il.ibm.com Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com --- arch/x86/kvm/vmx.c | 37 - 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8fdcacf..d797d3e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7163,10 +7163,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_set_cr4(vcpu, vmcs12-guest_cr4); vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); - /* shadow page tables on either EPT or shadow page tables */ + /* +* Note that kvm_set_cr3() and kvm_mmu_reset_context() will do the +* right thing, and set GUEST_CR3 and/or EPT_POINTER in all supported +* settings: 1. shadow page tables on shadow page tables, 2. shadow +* page tables on EPT, 3. EPT on EPT. 
+*/ kvm_set_cr3(vcpu, vmcs12-guest_cr3); kvm_mmu_reset_context(vcpu); + /* +* Additionally, except when L0 is using shadow page tables, L1 or +* L2 control guest_cr3 for L2, so they may also have saved PDPTEs +*/ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12-guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12-guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12-guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12-guest_pdptr3); + } + kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12-guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12-guest_rip); } @@ -7398,6 +7414,25 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12-guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); + /* +* In some cases (usually, nested EPT), L2 is allowed to change its +* own CR3 without exiting. If it has changed it, we must keep it. +* Of course, if L0 is using shadow page tables, GUEST_CR3 was defined +* by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. +*/ + if (enable_ept) + vmcs12-guest_cr3 = vmcs_read64(GUEST_CR3); + /* +* Additionally, except when L0 is using shadow page tables, L1 or +* L2 control guest_cr3 for L2, so save their PDPTEs +*/ + if (enable_ept) { + vmcs12-guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); + vmcs12-guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); + vmcs12-guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); + vmcs12-guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); + } + /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ vmcs12-guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); -- 1.8.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 07/13] nEPT: Fix wrong test in kvm_set_cr3
kvm_set_cr3() attempts to check if the new cr3 is a valid guest physical address. The problem is that with nested EPT, cr3 is an *L2* physical address, not an L1 physical address as this test expects. As the comment above this test explains, it isn't necessary, and doesn't correspond to anything a real processor would do. So this patch removes it. Note that this wrong test could have also theoretically caused problems in nested NPT, not just in nested EPT. However, in practice, the problem was avoided: nested_svm_vmexit()/vmrun() do not call kvm_set_cr3 in the nested NPT case, and instead set the vmcb (and arch.cr3) directly, thus circumventing the problem. Additional potential calls to the buggy function are avoided in that we don't trap cr3 modifications when nested NPT is enabled. However, because in nested VMX we did want to use kvm_set_cr3() (as requested in Avi Kivity's review of the original nested VMX patches), we can't avoid this problem and need to fix it. Signed-off-by: Nadav Har'El n...@il.ibm.com Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com --- arch/x86/kvm/x86.c | 11 --- 1 file changed, 11 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e172132..c34590d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -659,17 +659,6 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) */ } - /* -* Does the new cr3 value map to physical memory? (Note, we -* catch an invalid cr3 even in real-mode, because it would -* cause trouble later on when we turn on paging anyway.) -* -* A real CPU would silently accept an invalid cr3 and would -* attempt to use it - with largely undefined (and often hard -* to debug) behavior on the guest side. 
-*/ - if (unlikely(!gfn_to_memslot(vcpu-kvm, cr3 PAGE_SHIFT))) - return 1; vcpu-arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)vcpu-arch.regs_avail); vcpu-arch.mmu.new_cr3(vcpu); -- 1.8.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 08/13] nEPT: Some additional comments
Some additional comments to preexisting code: Explain who (L0 or L1) handles EPT violation and misconfiguration exits. Don't mention shadow on either EPT or shadow as the only two options. Signed-off-by: Nadav Har'El n...@il.ibm.com Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com --- arch/x86/kvm/vmx.c | 13 + 1 file changed, 13 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d797d3e..419b9e3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6127,7 +6127,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); case EXIT_REASON_EPT_VIOLATION: + /* +* L0 always deals with the EPT violation. If nested EPT is +* used, and the nested mmu code discovers that the address is +* missing in the guest EPT table (EPT12), the EPT violation +* will be injected with nested_ept_inject_page_fault() +*/ + return 0; case EXIT_REASON_EPT_MISCONFIG: + /* +* L2 never uses directly L1's EPT, but rather L0's own EPT +* table (shadow on EPT) or a merged EPT table that L0 built +* (EPT on EPT). So any problems with the structure of the +* table is L0's fault. +*/ return 0; case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); -- 1.8.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 09/13] nEPT: Advertise EPT to L1
Advertise the support of EPT to the L1 guest, through the appropriate MSR. This is the last patch of the basic Nested EPT feature, so as to allow bisection through this patch series: The guest will not see EPT support until this last patch, and will not attempt to use the half-applied feature. Signed-off-by: Nadav Har'El n...@il.ibm.com Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com --- arch/x86/include/asm/vmx.h | 2 ++ arch/x86/kvm/vmx.c | 17 +++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index b6fbf86..79a5beb 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -376,7 +376,9 @@ enum vmcs_field { #define VMX_EPTP_WB_BIT(1ull 14) #define VMX_EPT_2MB_PAGE_BIT (1ull 16) #define VMX_EPT_1GB_PAGE_BIT (1ull 17) +#define VMX_EPT_INVEPT_BIT (1ull 20) #define VMX_EPT_AD_BIT (1ull 21) +#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull 26) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 419b9e3..de6cfb4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2027,6 +2027,7 @@ static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; +static u32 nested_vmx_ept_caps; static __init void nested_vmx_setup_ctls_msrs(void) { /* @@ -2102,6 +2103,18 @@ static __init void nested_vmx_setup_ctls_msrs(void) nested_vmx_secondary_ctls_low = 0; nested_vmx_secondary_ctls_high = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (enable_ept) { + /* nested EPT: emulate EPT also to L1 */ + nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; + nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT; + nested_vmx_ept_caps |= + 
VMX_EPT_INVEPT_BIT | VMX_EPT_EXTENT_GLOBAL_BIT | + VMX_EPT_EXTENT_CONTEXT_BIT | + VMX_EPT_EXTENT_INDIVIDUAL_BIT; + nested_vmx_ept_caps = vmx_capability.ept; + } else + nested_vmx_ept_caps = 0; + } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) @@ -2201,8 +2214,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) nested_vmx_secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: - /* Currently, no nested ept or nested vpid */ - *pdata = 0; + /* Currently, no nested vpid support */ + *pdata = nested_vmx_ept_caps; break; default: return 0; -- 1.8.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 10/13] nEPT: Nested INVEPT
If we let L1 use EPT, we should probably also support the INVEPT instruction. In our current nested EPT implementation, when L1 changes its EPT table for L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in the course of this modification already calls INVEPT. Therefore, when L1 calls INVEPT, we don't really need to do anything. In particular we *don't* need to call the real INVEPT again. All we do in our INVEPT is verify the validity of the call, and its parameters, and then do nothing. In KVM Forum 2010, Dong et al. presented Nested Virtualization Friendly KVM and classified our current nested EPT implementation as shadow-like virtual EPT. He recommended instead a different approach, which he called VTLB-like virtual EPT. If we had taken that alternative approach, INVEPT would have had a bigger role: L0 would only rebuild the shadow EPT table when L1 calls INVEPT. Signed-off-by: Nadav Har'El n...@il.ibm.com Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com --- arch/x86/include/uapi/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 83 + 2 files changed, 84 insertions(+) diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 2871fcc..ec51012 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -65,6 +65,7 @@ #define EXIT_REASON_EOI_INDUCED 45 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index de6cfb4..86e4022 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5879,6 +5879,87 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; } +/* Emulate the INVEPT instruction */ +static int handle_invept(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info; + unsigned long type; + gva_t gva; + struct x86_exception e; + struct { + u64 eptp, gpa; + } 
operand; + + if (!(nested_vmx_secondary_ctls_high SECONDARY_EXEC_ENABLE_EPT) || + !(nested_vmx_ept_caps VMX_EPT_INVEPT_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* According to the Intel VMX instruction reference, the memory +* operand is read even if it isn't needed (e.g., for type==global) +*/ + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, gva)) + return 1; + if (kvm_read_guest_virt(vcpu-arch.emulate_ctxt, gva, operand, + sizeof(operand), e)) { + kvm_inject_page_fault(vcpu, e); + return 1; + } + + type = kvm_register_read(vcpu, (vmx_instruction_info 28) 0xf); + + switch (type) { + case VMX_EPT_EXTENT_GLOBAL: + if (!(nested_vmx_ept_caps VMX_EPT_EXTENT_GLOBAL_BIT)) + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + else { + /* +* Do nothing: when L1 changes EPT12, we already +* update EPT02 (the shadow EPT table) and call INVEPT. +* So when L1 calls INVEPT, there's nothing left to do. +*/ + nested_vmx_succeed(vcpu); + } + break; + case VMX_EPT_EXTENT_CONTEXT: + if (!(nested_vmx_ept_caps VMX_EPT_EXTENT_CONTEXT_BIT)) + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + else { + /* Do nothing */ + nested_vmx_succeed(vcpu); + } + break; + case VMX_EPT_EXTENT_INDIVIDUAL_ADDR: + if (!(nested_vmx_ept_caps VMX_EPT_EXTENT_INDIVIDUAL_BIT)) + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + else { + /* Do nothing */ + nested_vmx_succeed(vcpu); + } + break; + default: + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + } + + skip_emulated_instruction(vcpu); + return 1; +} + /* * The exit handlers return
[PATCH v2 11/13] nEPT: Miscellaneous cleanups
Some trivial code cleanups not really related to nested EPT. Signed-off-by: Nadav Har'El n...@il.ibm.com Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com Reviewed-by: Paolo Bonzini pbonz...@redhat.com --- arch/x86/kvm/vmx.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 86e4022..914cdda 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -616,7 +616,6 @@ static void nested_release_page_clean(struct page *page) static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -912,8 +911,7 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) (vmcs12-secondary_vm_exec_control bit); } -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, - struct kvm_vcpu *vcpu) +static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) { return vmcs12-pin_based_vm_exec_control PIN_BASED_VIRTUAL_NMIS; } @@ -6321,7 +6319,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (unlikely(!cpu_has_virtual_nmis() vmx-soft_vnmi_blocked !(is_guest_mode(vcpu) nested_cpu_has_virtual_nmis( - get_vmcs12(vcpu), vcpu { + get_vmcs12(vcpu) { if (vmx_interrupt_allowed(vcpu)) { vmx-soft_vnmi_blocked = 0; } else if (vmx-vnmi_blocked_time 10LL -- 1.8.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 12/13] nEPT: Move is_rsvd_bits_set() to paging_tmpl.h
Move is_rsvd_bits_set() to paging_tmpl.h so that it can be used to check reserved bits in EPT page table entries as well. Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com --- arch/x86/kvm/mmu.c | 8 arch/x86/kvm/paging_tmpl.h | 12 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 99bfc5e..054c68b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2460,14 +2460,6 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte 7) 1; - return (gpte mmu-rsvd_bits_mask[bit7][level-1]) != 0; -} - static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 51dca23..777d5d7 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -124,11 +124,19 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } #endif +static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7; + + bit7 = (gpte 7) 1; + return (gpte mmu-rsvd_bits_mask[bit7][level-1]) != 0; +} + static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) { - if (is_rsvd_bits_set(vcpu-arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + if (FNAME(is_rsvd_bits_set)(vcpu-arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) goto no_present; if (!is_present_gpte(gpte)) @@ -279,7 +287,7 @@ retry_walk: if (unlikely(!is_present_gpte(pte))) goto error; - if (unlikely(is_rsvd_bits_set(vcpu-arch.mmu, pte, + if (unlikely(FNAME(is_rsvd_bits_set)(vcpu-arch.mmu, pte, walker-level))) { errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; -- 1.8.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html
[PATCH v2 13/13] nEPT: Inject EPT violation/misconfiguration
Add code to detect EPT misconfiguration and inject it to L1 VMM. Also, it injects more correct exit qualification upon EPT violation to L1 VMM. Now L1 can correctly go to ept_misconfig handler (instead of wrongly going to fast_page_fault), it will try to handle mmio page fault, if failed, it is a real EPT misconfiguration. Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com --- arch/x86/include/asm/kvm_host.h | 4 +++ arch/x86/kvm/mmu.c | 5 --- arch/x86/kvm/mmu.h | 5 +++ arch/x86/kvm/paging_tmpl.h | 26 ++ arch/x86/kvm/vmx.c | 79 +++-- 5 files changed, 111 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4979778..a32bda6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -262,6 +262,8 @@ struct kvm_mmu { void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); + bool (*check_tdp_pte)(u64 pte, int level); + hpa_t root_hpa; int root_level; int shadow_root_level; @@ -504,6 +506,8 @@ struct kvm_vcpu_arch { * instruction. 
*/ bool write_fault_to_shadow_pgtable; + + unsigned long exit_qualification; /* set at EPT violation at this point */ }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 054c68b..613fbd2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -230,11 +230,6 @@ static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) return false; } -static inline u64 rsvd_bits(int s, int e) -{ - return ((1ULL (e - s + 1)) - 1) s; -} - void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 19dd5ab..8aebd5a 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -91,6 +91,11 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } +static inline u64 rsvd_bits(int s, int e) +{ + return ((1ULL (e - s + 1)) - 1) s; +} + /* * Will a fault with a given page-fault error code (pfec) cause a permission * fault with the given access (in ACC_* format)? diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 777d5d7..e4a0d72 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -126,10 +126,14 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) { +#if PTTYPE == PTTYPE_EPT + return (mmu-check_tdp_pte(gpte, level)); +#else int bit7; bit7 = (gpte 7) 1; return (gpte mmu-rsvd_bits_mask[bit7][level-1]) != 0; +#endif } static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, @@ -352,6 +356,28 @@ error: walker-fault.vector = PF_VECTOR; walker-fault.error_code_valid = true; walker-fault.error_code = errcode; + +#if PTTYPE == PTTYPE_EPT + /* +* Use PFERR_RSVD_MASK in erorr_code to to tell if EPT +* misconfiguration requires to be injected. The detection is +* done by is_rsvd_bits_set() above. 
+* +* We set up the value of exit_qualification to inject: +* [2:0] -- Derive from [2:0] of real exit_qualification at EPT violation +* [5:3] -- Calculated by the page walk of the guest EPT page tables +* [7:8] -- Clear to 0. +* +* The other bits are set to 0. +*/ + if (!(errcode PFERR_RSVD_MASK)) { + unsigned long exit_qualification = vcpu-arch.exit_qualification; + + pte_access = pt_access pte; + vcpu-arch.exit_qualification = ((pte_access 0x7) 3) | + (exit_qualification 0x7); + } +#endif walker-fault.address = addr; walker-fault.nested_page_fault = mmu != vcpu-arch.walk_mmu; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 914cdda..4edf1fe 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5074,6 +5074,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) /* ept page table is present? */ error_code |= (exit_qualification 3) 0x1; + vcpu-arch.exit_qualification = exit_qualification; + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } @@ -6994,7 +6996,7 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) } static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, - struct x86_exception *fault) +struct x86_exception *fault) { struct vmcs12 *vmcs12;
Re: [PATCH v2 01/13] nEPT: Support LOAD_IA32_EFER entry/exit controls for L1
On 2013-05-06 09:04, Jun Nakajima wrote: Recent KVM, since http://kerneltrap.org/mailarchive/linux-kvm/2010/5/2/6261577 switch the EFER MSR when EPT is used and the host and guest have different NX bits. So if we add support for nested EPT (L1 guest using EPT to run L2) and want to be able to run recent KVM as L1, we need to allow L1 to use this EFER switching feature. To do this EFER switching, KVM uses VM_ENTRY/EXIT_LOAD_IA32_EFER if available, and if it isn't, it uses the generic VM_ENTRY/EXIT_MSR_LOAD. This patch adds support for the former (the latter is still unsupported). Nested entry and exit emulation (prepare_vmcs_02 and load_vmcs12_host_state, respectively) already handled VM_ENTRY/EXIT_LOAD_IA32_EFER correctly. So all that's left to do in this patch is to properly advertise this feature to L1. Note that vmcs12's VM_ENTRY/EXIT_LOAD_IA32_EFER are emulated by L0, by using vmx_set_efer (which itself sets one of several vmcs02 fields), so we always support this feature, regardless of whether the host supports it. Signed-off-by: Nadav Har'El n...@il.ibm.com Signed-off-by: Jun Nakajima jun.nakaj...@intel.com Signed-off-by: Xinhao Xu xinhao...@intel.com --- arch/x86/kvm/vmx.c | 18 ++ 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 867b810..485ded6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2057,6 +2057,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) #else nested_vmx_exit_ctls_high = 0; #endif + nested_vmx_exit_ctls_high |= VM_EXIT_LOAD_IA32_EFER; You are using the wrong baseline. This does not apply against next. Please fix (you can find the properly rebased version in my tree). Thanks, Jan signature.asc Description: OpenPGP digital signature
[PATCH 4/5] powerpc/vfio: Implement IOMMU driver for VFIO
From: Alexey Kardashevskiy a...@ozlabs.ru VFIO implements platform independent stuff such as a PCI driver, BAR access (via read/write on a file descriptor or direct mapping when possible) and IRQ signaling. The platform dependent part includes IOMMU initialization and handling. This implements an IOMMU driver for VFIO which does mapping/unmapping pages for the guest IO and provides information about DMA window (required by a POWER guest). Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Signed-off-by: Paul Mackerras pau...@samba.org --- Change log: * no more PPC versions for vfio_iommu_spapr_tce_dma_(un)map (type1 structs reused) * documentation updated * containter enable/disable ioctls added * request_module(spapr_iommu) added * various locks fixed * multiple TCE mapping support (no clients for that for now as SPAPR does it in a different way) --- Documentation/vfio.txt | 63 ++ drivers/vfio/Kconfig|6 + drivers/vfio/Makefile |1 + drivers/vfio/vfio.c |1 + drivers/vfio/vfio_iommu_spapr_tce.c | 377 +++ include/uapi/linux/vfio.h | 34 6 files changed, 482 insertions(+) create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt index 8eda363..c55533c 100644 --- a/Documentation/vfio.txt +++ b/Documentation/vfio.txt @@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap interfaces implement the device region access defined by the device's own VFIO_DEVICE_GET_REGION_INFO ioctl. + +PPC64 sPAPR implementation note +--- + +This implementation has some specifics: + +1) Only one IOMMU group per container is supported as an IOMMU group +represents the minimal entity which isolation can be guaranteed for and +groups are allocated statically, one per a Partitionable Endpoint (PE) +(PE is often a PCI domain but not always). 
+ +2) The hardware supports so called DMA windows - the PCI address range +within which DMA transfer is allowed, any attempt to access address space +out of the window leads to the whole PE isolation. + +3) PPC64 guests are paravirtualized but not fully emulated. There is an API +to map/unmap pages for DMA, and it normally maps 1..32 pages per call and +currently there is no way to reduce the number of calls. In order to make things +faster, the map/unmap handling has been implemented in real mode which provides +an excellent performance which has limitations such as inability to do +locked pages accounting in real time. + +So 3 additional ioctls have been added: + + VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start + of the DMA window on the PCI bus. + + VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting + is done at this point. This lets user first to know what + the DMA window is and adjust rlimit before doing any real job. + + VFIO_IOMMU_DISABLE - disables the container. + + +The code flow from the example above should be slightly changed: + + . + /* Add the group to the container */ + ioctl(group, VFIO_GROUP_SET_CONTAINER, container); + + /* Enable the IOMMU model we want */ + ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU) + + /* Get addition sPAPR IOMMU info */ + vfio_iommu_spapr_tce_info spapr_iommu_info; + ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, spapr_iommu_info); + + if (ioctl(container, VFIO_IOMMU_ENABLE)) + /* Cannot enable container, may be low rlimit */ + + /* Allocate some space and setup a DMA mapping */ + dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE, +MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + + dma_map.size = 1024 * 1024; + dma_map.iova = 0; /* 1MB starting at 0x0 from device view */ + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + /* Check here is .iova/.size are within DMA window from spapr_iommu_info */ + + ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map); + . 
+ --- [1] VFIO was originally an acronym for Virtual Function I/O in its diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 7cd5dec..b464687 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1 depends on VFIO default n +config VFIO_IOMMU_SPAPR_TCE + tristate + depends on VFIO SPAPR_TCE_IOMMU + default n + menuconfig VFIO tristate VFIO Non-Privileged userspace driver framework depends on IOMMU_API select VFIO_IOMMU_TYPE1 if X86 + select VFIO_IOMMU_SPAPR_TCE if
[PATCH 5/5] powerpc/vfio: Enable on pSeries platform
From: Alexey Kardashevskiy a...@ozlabs.ru The enables VFIO on the pSeries platform, enabling user space programs to access PCI devices directly. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Paul Mackerras pau...@samba.org --- arch/powerpc/platforms/pseries/iommu.c |4 drivers/iommu/Kconfig |2 +- drivers/vfio/Kconfig |2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index e2685ba..e178acc 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -613,6 +613,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) iommu_table_setparms(pci-phb, dn, tbl); pci-iommu_table = iommu_init_table(tbl, pci-phb-node); + iommu_register_group(tbl, pci_domain_nr(bus), 0); /* Divide the rest (1.75GB) among the children */ pci-phb-dma_window_size = 0x8000ul; @@ -657,6 +658,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) ppci-phb-node); iommu_table_setparms_lpar(ppci-phb, pdn, tbl, dma_window); ppci-iommu_table = iommu_init_table(tbl, ppci-phb-node); + iommu_register_group(tbl, pci_domain_nr(bus), 0); pr_debug( created table: %p\n, ppci-iommu_table); } } @@ -683,6 +685,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) phb-node); iommu_table_setparms(phb, dn, tbl); PCI_DN(dn)-iommu_table = iommu_init_table(tbl, phb-node); + iommu_register_group(tbl, pci_domain_nr(phb-bus), 0); set_iommu_table_base(dev-dev, PCI_DN(dn)-iommu_table); return; } @@ -1145,6 +1148,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) pci-phb-node); iommu_table_setparms_lpar(pci-phb, pdn, tbl, dma_window); pci-iommu_table = iommu_init_table(tbl, pci-phb-node); + iommu_register_group(tbl, pci_domain_nr(pci-phb-bus), 0); pr_debug( created table: %p\n, pci-iommu_table); } else { pr_debug( found DMA window, table: %p\n, pci-iommu_table); diff --git 
a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 175e0f4..2d75ea0 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -189,7 +189,7 @@ config EXYNOS_IOMMU_DEBUG config SPAPR_TCE_IOMMU bool sPAPR TCE IOMMU Support - depends on PPC_POWERNV + depends on PPC_POWERNV || PPC_PSERIES select IOMMU_API help Enables bits of IOMMU API required by VFIO. The iommu_ops diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index b464687..26b3d9d 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -12,7 +12,7 @@ menuconfig VFIO tristate VFIO Non-Privileged userspace driver framework depends on IOMMU_API select VFIO_IOMMU_TYPE1 if X86 - select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV + select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES) help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/5] VFIO PPC64: add VFIO support on POWERPC64
From: Alexey Kardashevskiy a...@ozlabs.ru The series adds support for VFIO on POWERPC in user space (such as QEMU). The in-kernel real mode IOMMU support is added by another series posted separately. As the first and main aim of this series is the POWERNV platform support, the Enable on POWERNV platform patch goes first and introduces an API to be used by the VFIO IOMMU driver. The Enable on pSeries platform patch simply registers PHBs in the IOMMU subsystem and expects the API to be present, it enables VFIO support in fully emulated QEMU guests. These patches were tested against 3.8 and the iommu: Move initialization earlier patch is already in 3.9 so I am including it here only for the reference. Change log: * cleanups and minor fixes * added support for pSeries * separated from in-kernel IOMMU handling series (should make it easier to get sob'ed) * signed-off-by Paul Mackerras Alexey Kardashevskiy (5): iommu: Move initialization earlier KVM: PPC: iommu: Add missing kvm_iommu_map_pages/kvm_iommu_unmap_pages powerpc/vfio: Enable on POWERNV platform powerpc/vfio: Implement IOMMU driver for VFIO powerpc/vfio: Enable on pSeries platform Documentation/vfio.txt | 63 + arch/powerpc/include/asm/iommu.h| 26 ++ arch/powerpc/include/asm/kvm_host.h | 14 + arch/powerpc/kernel/iommu.c | 319 +++ arch/powerpc/platforms/powernv/pci-ioda.c |1 + arch/powerpc/platforms/powernv/pci-p5ioc2.c |5 +- arch/powerpc/platforms/powernv/pci.c|2 + arch/powerpc/platforms/pseries/iommu.c |4 + drivers/iommu/Kconfig |8 + drivers/iommu/iommu.c |2 +- drivers/vfio/Kconfig|6 + drivers/vfio/Makefile |1 + drivers/vfio/vfio.c |1 + drivers/vfio/vfio_iommu_spapr_tce.c | 377 +++ include/uapi/linux/vfio.h | 34 +++ 15 files changed, 861 insertions(+), 2 deletions(-) create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html
[PATCH 1/5] iommu: Move initialization earlier
From: Alexey Kardashevskiy a...@ozlabs.ru The iommu_init() call initializes IOMMU internal structures and data required for the API to function such as iommu_group_alloc(). It is registered as a subsys_initcall. One of the IOMMU users is a PCI subsystem on POWER which discovers new IOMMU tables during the PCI scan so the most logical place to call iommu_group_alloc() is when a new group is just discovered. However PCI scan is done from subsys_initcall hook as well, which makes use of the IOMMU API impossible. This moves IOMMU subsystem initialization one step earlier. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Paul Mackerras pau...@samba.org --- drivers/iommu/iommu.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5514dfa..0de83eb 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -890,7 +890,7 @@ static int __init iommu_init(void) return 0; } -subsys_initcall(iommu_init); +arch_initcall(iommu_init); int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr attr, void *data) -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/5] KVM: PPC: iommu: Add missing kvm_iommu_map_pages/kvm_iommu_unmap_pages
From: Alexey Kardashevskiy a...@ozlabs.ru The IOMMU API implements groups creating/deletion, device binding and IOMMU map/unmap operations. The PowerPC implementation uses most of the API except map/unmap operations, which are implemented on POWER using hypercalls. However, in order to link a kernel with the CONFIG_IOMMU_API enabled, the empty kvm_iommu_map_pages/kvm_iommu_unmap_pages have to be defined, so this defines them. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Paul Mackerras pau...@samba.org --- arch/powerpc/include/asm/kvm_host.h | 14 ++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index b6a047e..c025d91 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -603,4 +603,18 @@ struct kvm_vcpu_arch { #define __KVM_HAVE_ARCH_WQP +#ifdef CONFIG_IOMMU_API +/* POWERPC does not use IOMMU API for mapping/unmapping */ +static inline int kvm_iommu_map_pages(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + return 0; +} + +static inline void kvm_iommu_unmap_pages(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ +} +#endif /* CONFIG_IOMMU_API */ + #endif /* __POWERPC_KVM_HOST_H__ */ -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/5] powerpc/vfio: Enable on POWERNV platform
From: Alexey Kardashevskiy a...@ozlabs.ru This initializes IOMMU groups based on the IOMMU configuration discovered during the PCI scan on POWERNV (POWER non virtualized) platform. The IOMMU groups are to be used later by the VFIO driver, which is used for PCI pass through. It also implements an API for mapping/unmapping pages for guest PCI drivers and providing DMA window properties. This API is going to be used later by QEMU-VFIO to handle h_put_tce hypercalls from the KVM guest. The iommu_put_tce_user_mode() does only a single page mapping as an API for adding many mappings at once is going to be added later. Although this driver has been tested only on the POWERNV platform, it should work on any platform which supports TCE tables. As h_put_tce hypercall is received by the host kernel and processed by the QEMU (what involves calling the host kernel again), performance is not the best - circa 220MB/s on 10Gb ethernet network. To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config option and configure VFIO as required. 
Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Signed-off-by: Paul Mackerras pau...@samba.org --- arch/powerpc/include/asm/iommu.h| 26 +++ arch/powerpc/kernel/iommu.c | 319 +++ arch/powerpc/platforms/powernv/pci-ioda.c |1 + arch/powerpc/platforms/powernv/pci-p5ioc2.c |5 +- arch/powerpc/platforms/powernv/pci.c|2 + drivers/iommu/Kconfig |8 + 6 files changed, 360 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index cbfe678..98d1422 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -76,6 +76,9 @@ struct iommu_table { struct iommu_pool large_pool; struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ +#ifdef CONFIG_IOMMU_API + struct iommu_group *it_group; +#endif }; struct scatterlist; @@ -98,6 +101,8 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); +extern void iommu_register_group(struct iommu_table *tbl, +int pci_domain_number, unsigned long pe_num); extern int iommu_map_sg(struct device *dev, struct iommu_table *tbl, struct scatterlist *sglist, int nelems, @@ -147,5 +152,26 @@ static inline void iommu_restore(void) } #endif +/* The API to support IOMMU operations for VFIO */ +extern int iommu_tce_clear_param_check(struct iommu_table *tbl, + unsigned long ioba, unsigned long tce_value, + unsigned long npages); +extern int iommu_tce_put_param_check(struct iommu_table *tbl, + unsigned long ioba, unsigned long tce); +extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, + unsigned long hwaddr, enum dma_data_direction direction); +extern unsigned long iommu_clear_tce(struct iommu_table *tbl, + unsigned long entry); +extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, + unsigned long entry, unsigned long pages); +extern int 
iommu_put_tce_user_mode(struct iommu_table *tbl, + unsigned long entry, unsigned long tce); + +extern void iommu_flush_tce(struct iommu_table *tbl); +extern int iommu_take_ownership(struct iommu_table *tbl); +extern void iommu_release_ownership(struct iommu_table *tbl); + +extern enum dma_data_direction iommu_tce_direction(unsigned long tce); + #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index c862fd7..debedd2 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -36,6 +36,8 @@ #include linux/hash.h #include linux/fault-inject.h #include linux/pci.h +#include linux/iommu.h +#include linux/sched.h #include asm/io.h #include asm/prom.h #include asm/iommu.h @@ -44,6 +46,7 @@ #include asm/kdump.h #include asm/fadump.h #include asm/vio.h +#include asm/tce.h #define DBG(...) @@ -717,6 +720,12 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) return; } +#ifdef CONFIG_IOMMU_API + if (tbl-it_group) { + iommu_group_put(tbl-it_group); + BUG_ON(tbl-it_group); + } +#endif /* verify that table contains no entries */ if (!bitmap_empty(tbl-it_map, tbl-it_size)) pr_warn(%s: Unexpected TCEs for %s\n, __func__, node_name); @@ -853,3 +862,313 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, free_pages((unsigned long)vaddr, get_order(size)); } } + +#ifdef CONFIG_IOMMU_API +/* + * SPAPR TCE API + */ +static void
[PATCH 0/6] KVM: PPC: IOMMU in-kernel handling
This series is supposed to accelerate IOMMU operations in real and virtual mode in the host kernel for the KVM guest. The first user is VFIO however this series does not contain any VFIO related code as the connection between VFIO and the new handlers is to be made in QEMU via ioctl to the KVM fd. Although the series compiles, it does not make sense without VFIO patches which are posted separately. The iommu: Add a function to find an iommu group by id patch has already gone to linux-next (from iommu tree) but it is not in upstream yet so I am including it here for the reference. Alexey Kardashevskiy (6): KVM: PPC: Make lookup_linux_pte public KVM: PPC: Add support for multiple-TCE hcalls powerpc: Prepare to support kernel handling of IOMMU map/unmap iommu: Add a function to find an iommu group by id KVM: PPC: Add support for IOMMU in-kernel handling KVM: PPC: Add hugepage support for IOMMU in-kernel handling Documentation/virtual/kvm/api.txt| 43 +++ arch/powerpc/include/asm/kvm_host.h |4 + arch/powerpc/include/asm/kvm_ppc.h | 44 ++- arch/powerpc/include/asm/pgtable-ppc64.h |4 + arch/powerpc/include/uapi/asm/kvm.h |7 + arch/powerpc/kvm/book3s_64_vio.c | 433 +++- arch/powerpc/kvm/book3s_64_vio_hv.c | 464 -- arch/powerpc/kvm/book3s_hv.c | 23 ++ arch/powerpc/kvm/book3s_hv_rm_mmu.c |5 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S |6 + arch/powerpc/kvm/book3s_pr_papr.c| 37 ++- arch/powerpc/kvm/powerpc.c | 15 + arch/powerpc/mm/init_64.c| 77 - drivers/iommu/iommu.c| 29 ++ include/linux/iommu.h|1 + include/uapi/linux/kvm.h |3 + 16 files changed, 1159 insertions(+), 36 deletions(-) -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/6] KVM: PPC: Make lookup_linux_pte public
The lookup_linux_pte() function returns a linux PTE which is needed in the process of converting KVM guest physical address into host real address in real mode. This conversion will be used by upcoming support of H_PUT_TCE_INDIRECT, as the TCE list address comes from the guest and is a guest physical address. This makes lookup_linux_pte() public so that code can call it. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Paul Mackerras pau...@samba.org --- arch/powerpc/include/asm/kvm_ppc.h |3 +++ arch/powerpc/kvm/book3s_hv_rm_mmu.c |5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 41426c9..99da298 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -379,4 +379,7 @@ static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb) return ea; } +pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, + int writing, unsigned long *pte_sizep); + #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 6dcbb49..18fc382 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -134,8 +134,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unlock_rmap(rmap); } -static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, - int writing, unsigned long *pte_sizep) +pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, + int writing, unsigned long *pte_sizep) { pte_t *ptep; unsigned long ps = *pte_sizep; @@ -154,6 +154,7 @@ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, return __pte(0); return kvmppc_read_update_linux_pte(ptep, writing); } +EXPORT_SYMBOL_GPL(lookup_linux_pte); static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v) { -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe 
kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/6] powerpc: Prepare to support kernel handling of IOMMU map/unmap
The current VFIO-on-POWER implementation supports only user mode driven mapping, i.e. QEMU is sending requests to map/unmap pages. However this approach is really slow, so we want to move that to KVM. Since H_PUT_TCE can be extremely performance sensitive (especially with network adapters where each packet needs to be mapped/unmapped) we chose to implement that as a fast hypercall directly in real mode (processor still in the guest context but MMU off). To be able to do that, we need to provide some facilities to access the struct page count within that real mode environment as things like the sparsemem vmemmap mappings aren't accessible. This adds an API to increment/decrement page counter as get_user_pages API used for user mode mapping does not work in the real mode. CONFIG_SPARSEMEM_VMEMMAP and CONFIG_FLATMEM are supported. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Reviewed-by: Paul Mackerras pau...@samba.org Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Paul Mackerras pau...@samba.org --- arch/powerpc/include/asm/pgtable-ppc64.h |4 ++ arch/powerpc/mm/init_64.c| 77 +- 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 0182c20..4c56ede 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -377,6 +377,10 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, } #endif /* !CONFIG_HUGETLB_PAGE */ +struct page *realmode_pfn_to_page(unsigned long pfn); +int realmode_get_page(struct page *page); +int realmode_put_page(struct page *page); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 95a4529..838b8ae 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -297,5 +297,80 @@ int __meminit vmemmap_populate(struct page *start_page, return 0; } -#endif /* 
CONFIG_SPARSEMEM_VMEMMAP */ +/* + * We do not have access to the sparsemem vmemmap, so we fallback to + * walking the list of sparsemem blocks which we already maintain for + * the sake of crashdump. In the long run, we might want to maintain + * a tree if performance of that linear walk becomes a problem. + * + * Any of realmode_ functions can fail due to: + * 1) As real sparsemem blocks do not lay in RAM continously (they + * are in virtual address space which is not available in the real mode), + * the requested page struct can be split between blocks so get_page/put_page + * may fail. + * 2) When huge pages are used, the get_page/put_page API will fail + * in real mode as the linked addresses in the page struct are virtual + * too. + * When 1) or 2) takes place, the API returns an error code to cause + * an exit to kernel virtual mode where the operation will be completed. + */ +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct vmemmap_backing *vmem_back; + struct page *page; + unsigned long page_size = 1 mmu_psize_defs[mmu_vmemmap_psize].shift; + unsigned long pg_va = (unsigned long) pfn_to_page(pfn); + + for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back-list) { + if (pg_va vmem_back-virt_addr) + continue; + + /* Check that page struct is not split between real pages */ + if ((pg_va + sizeof(struct page)) + (vmem_back-virt_addr + page_size)) + return NULL; + + page = (struct page *) (vmem_back-phys + pg_va - + vmem_back-virt_addr); + return page; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#elif defined(CONFIG_FLATMEM) + +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct page *page = pfn_to_page(pfn); + return page; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */ + +#if defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_FLATMEM) +int realmode_get_page(struct page *page) +{ + if (PageTail(page)) + return -EAGAIN; + + get_page(page); + + 
return 0; +} +EXPORT_SYMBOL_GPL(realmode_get_page); + +int realmode_put_page(struct page *page) +{ + if (PageCompound(page)) + return -EAGAIN; + + put_page(page); + + return 0; +} +EXPORT_SYMBOL_GPL(realmode_put_page); +#endif -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/6] iommu: Add a function to find an iommu group by id
As IOMMU groups are exposed to the user space by their numbers, the user space can use them in various kernel APIs so the kernel might need an API to find a group by its ID. As an example, QEMU VFIO on PPC64 platform needs it to associate a logical bus number (LIOBN) with a specific IOMMU group in order to support in-kernel handling of DMA map/unmap requests. This adds the iommu_group_get_by_id(id) function which performs this search. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Signed-off-by: Paul Mackerras pau...@samba.org --- drivers/iommu/iommu.c | 29 + include/linux/iommu.h |1 + 2 files changed, 30 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ddbdaca..5514dfa 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -204,6 +204,35 @@ again: } EXPORT_SYMBOL_GPL(iommu_group_alloc); +struct iommu_group *iommu_group_get_by_id(int id) +{ + struct kobject *group_kobj; + struct iommu_group *group; + const char *name; + + if (!iommu_group_kset) + return NULL; + + name = kasprintf(GFP_KERNEL, %d, id); + if (!name) + return NULL; + + group_kobj = kset_find_obj(iommu_group_kset, name); + kfree(name); + + if (!group_kobj) + return NULL; + + group = container_of(group_kobj, struct iommu_group, kobj); + BUG_ON(group-id != id); + + kobject_get(group-devices_kobj); + kobject_put(group-kobj); + + return group; +} +EXPORT_SYMBOL_GPL(iommu_group_get_by_id); + /** * iommu_group_get_iommudata - retrieve iommu_data registered for a group * @group: the group diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f3b99e1..00e5d7d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -113,6 +113,7 @@ struct iommu_ops { extern int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops); extern bool iommu_present(struct bus_type *bus); extern struct iommu_domain *iommu_domain_alloc(struct bus_type *bus); +extern struct iommu_group *iommu_group_get_by_id(int id); extern void iommu_domain_free(struct iommu_domain 
*domain); extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/6] KVM: PPC: Add support for IOMMU in-kernel handling
This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT and H_STUFF_TCE requests without passing them to QEMU, which should save time on switching to QEMU and back. Both real and virtual modes are supported - whenever the kernel fails to handle TCE request, it passes it to the virtual mode. If it the virtual mode handlers fail, then the request is passed to the user mode, for example, to QEMU. This adds a new KVM_CAP_SPAPR_TCE_IOMMU ioctl to asssociate a virtual PCI bus ID (LIOBN) with an IOMMU group, which enables in-kernel handling of IOMMU map/unmap. This adds a special case for huge pages (16MB). The reference counting cannot be easily done for such pages in real mode (when MMU is off) so we added a list of huge pages. It is populated in virtual mode and get_page is called just once per a huge page. Real mode handlers check if the requested page is huge and in the list, then no reference counting is done, otherwise an exit to virtual mode happens. The list is released at KVM exit. At the moment the fastest card available for tests uses up to 9 huge pages so walking through this list is not very expensive. However this can change and we may want to optimize this. This also adds the virt_only parameter to the KVM module for debug and performance check purposes. Tests show that this patch increases transmission speed from 220MB/s to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card). 
Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Signed-off-by: Paul Mackerras pau...@samba.org --- Documentation/virtual/kvm/api.txt | 28 arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/include/asm/kvm_ppc.h |2 + arch/powerpc/include/uapi/asm/kvm.h |7 + arch/powerpc/kvm/book3s_64_vio.c| 242 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 192 +++ arch/powerpc/kvm/powerpc.c | 12 ++ include/uapi/linux/kvm.h|2 + 8 files changed, 485 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f621cd6..2039767 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2127,6 +2127,34 @@ written, then `n_invalid' invalid entries, invalidating any previously valid entries found. +4.79 KVM_CREATE_SPAPR_TCE_IOMMU + +Capability: KVM_CAP_SPAPR_TCE_IOMMU +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce_iommu (in) +Returns: 0 on success, -1 on error + +This creates a link between IOMMU group and a hardware TCE (translation +control entry) table. This link lets the host kernel know what IOMMU +group (i.e. TCE table) to use for the LIOBN number passed with +H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls. + +/* for KVM_CAP_SPAPR_TCE_IOMMU */ +struct kvm_create_spapr_tce_iommu { + __u64 liobn; + __u32 iommu_id; + __u32 flags; +}; + +No flag is supported at the moment. + +When the guest issues TCE call on a liobn for which a TCE table has been +registered, the kernel will handle it in real mode, updating the hardware +TCE table. TCE table calls for other liobns will cause a vm exit and must +be handled by userspace. + + 5. 
The kvm_run structure diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 36ceb0d..2b70cbc 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -178,6 +178,8 @@ struct kvmppc_spapr_tce_table { struct kvm *kvm; u64 liobn; u32 window_size; + bool virtmode_only; + struct iommu_group *grp;/* used for IOMMU groups */ struct page *pages[0]; }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index d501246..bdfa140 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -139,6 +139,8 @@ extern void kvmppc_xics_free(struct kvm *kvm); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); +extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm, + struct kvm_create_spapr_tce_iommu *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table( struct kvm_vcpu *vcpu, unsigned long liobn); extern long kvmppc_emulated_h_put_tce(struct kvmppc_spapr_tce_table *stt, diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 681b314..b67d44b 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -291,6 +291,13 @@ struct kvm_create_spapr_tce { __u32 window_size; }; +/* for KVM_CAP_SPAPR_TCE_IOMMU */ +struct kvm_create_spapr_tce_iommu { + __u64 liobn; + __u32 iommu_id; + __u32 flags; +}; + /* for KVM_ALLOCATE_RMA */
[PATCH 2/6] KVM: PPC: Add support for multiple-TCE hcalls
This adds real mode handlers for the H_PUT_TCE_INDIRECT and H_STUFF_TCE hypercalls for QEMU emulated devices such as virtio devices or emulated PCI. These calls allow adding multiple entries (up to 512) into the TCE table in one call which saves time on transition to/from real mode. This adds a guest physical to host real address converter and calls the existing H_PUT_TCE handler. The converting function is going to be fully utilized by upcoming VFIO supporting patches. This also implements the KVM_CAP_PPC_MULTITCE capability, so in order to support the functionality of this patch, QEMU needs to query for this capability and set the hcall-multi-tce hypertas property only if the capability is present, otherwise there will be serious performance degradation. Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Signed-off-by: Paul Mackerras pau...@samba.org --- Documentation/virtual/kvm/api.txt | 15 ++ arch/powerpc/include/asm/kvm_ppc.h | 15 +- arch/powerpc/kvm/book3s_64_vio.c| 114 +++ arch/powerpc/kvm/book3s_64_vio_hv.c | 231 +++ arch/powerpc/kvm/book3s_hv.c| 23 +++ arch/powerpc/kvm/book3s_hv_rmhandlers.S |6 + arch/powerpc/kvm/book3s_pr_papr.c | 37 - arch/powerpc/kvm/powerpc.c |3 + include/uapi/linux/kvm.h|1 + 9 files changed, 413 insertions(+), 32 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a4df553..f621cd6 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2463,3 +2463,18 @@ For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV: where num_sets is the tlb_sizes[] value divided by the tlb_ways[] value. - The tsize field of mas1 shall be set to 4K on TLB0, even though the hardware ignores this value for TLB0. 
+ + +6.4 KVM_CAP_PPC_MULTITCE + +Architectures: ppc +Parameters: none +Returns: 0 on success; -1 on error + +This capability enables the guest to put/remove multiple TCE entries +per hypercall which significanly accelerates DMA operations for PPC KVM +guests. + +When this capability is enabled, H_PUT_TCE_INDIRECT and H_STUFF_TCE are +expected to occur rather than H_PUT_TCE which supports only one TCE entry +per call. diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 99da298..d501246 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -139,8 +139,19 @@ extern void kvmppc_xics_free(struct kvm *kvm); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); -extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, -unsigned long ioba, unsigned long tce); +extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table( + struct kvm_vcpu *vcpu, unsigned long liobn); +extern long kvmppc_emulated_h_put_tce(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long tce); +extern long kvmppc_virtmode_h_put_tce(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce); +extern long kvmppc_virtmode_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages); +extern long kvmppc_virtmode_h_stuff_tce(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_value, unsigned long npages); extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *rma); extern struct kvmppc_linear_info *kvm_alloc_rma(void); diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 72ffc89..643ac1e 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -14,6 +14,7 @@ * * Copyright 2010 Paul Mackerras, IBM Corp. 
pau...@au1.ibm.com * Copyright 2011 David Gibson, IBM Corporation d...@au1.ibm.com + * Copyright 2013 Alexey Kardashevskiy, IBM Corporation a...@au1.ibm.com */ #include linux/types.h @@ -36,9 +37,14 @@ #include asm/ppc-opcode.h #include asm/kvm_host.h #include asm/udbg.h +#include asm/iommu.h #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) +#define ERROR_ADDR (~(unsigned long)0x0) +/* + * TCE tables handlers. + */ static long kvmppc_stt_npages(unsigned long window_size) { return ALIGN((window_size SPAPR_TCE_SHIFT) @@ -148,3 +154,111 @@ fail: } return ret; } + +/* + * Virtual mode handling of IOMMU map/unmap. + */ +/* Converts guest physical address into host virtual */ +static unsigned long get_virt_address(struct kvm_vcpu *vcpu, + unsigned long gpa) +{ + unsigned long hva, gfn = gpa PAGE_SHIFT;
[PATCH 6/6] KVM: PPC: Add hugepage support for IOMMU in-kernel handling
This adds special support for huge pages (16MB). The reference counting cannot be easily done for such pages in real mode (when MMU is off) so we added a list of huge pages. It is populated in virtual mode and get_page is called just once per a huge page. Real mode handlers check if the requested page is huge and in the list, then no reference counting is done, otherwise an exit to virtual mode happens. The list is released at KVM exit. At the moment the fastest card available for tests uses up to 9 huge pages so walking through this list is not very expensive. However this can change and we may want to optimize this. Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Signed-off-by: Paul Mackerras pau...@samba.org --- arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/include/asm/kvm_ppc.h | 24 +++ arch/powerpc/kvm/book3s_64_vio.c| 79 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 47 - 4 files changed, 149 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 2b70cbc..b6a047e 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -180,6 +180,8 @@ struct kvmppc_spapr_tce_table { u32 window_size; bool virtmode_only; struct iommu_group *grp;/* used for IOMMU groups */ + struct list_head hugepages; /* used for IOMMU groups */ + spinlock_t hugepages_lock; /* used for IOMMU groups */ struct page *pages[0]; }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index bdfa140..3c95464 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -154,6 +154,30 @@ extern long kvmppc_virtmode_h_put_tce_indirect(struct kvm_vcpu *vcpu, extern long kvmppc_virtmode_h_stuff_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce_value, unsigned long npages); + +/* + * The KVM guest can be backed with 16MB pages (qemu switch + * -mem-path 
/var/lib/hugetlbfs/global/pagesize-16MB/). + * In this case, we cannot do page counting from the real mode + * as the compound pages are used - they are linked in a list + * with pointers as virtual addresses which are inaccessible + * in real mode. + * + * The code below keeps a 16MB pages list and uses page struct + * in real mode if it is already locked in RAM and inserted into + * the list or switches to the virtual mode where it can be + * handled in a usual manner. + */ +struct iommu_kvmppc_hugepage { + struct list_head list; + pte_t pte; /* Huge page PTE */ + unsigned long pa; /* Base phys address used as a real TCE */ + struct page *page; /* page struct of the very first subpage */ + unsigned long size; /* Huge page size (always 16MB at the moment) */ +}; +extern struct iommu_kvmppc_hugepage *kvmppc_iommu_hugepage_find( + struct kvmppc_spapr_tce_table *tt, pte_t pte); + extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *rma); extern struct kvmppc_linear_info *kvm_alloc_rma(void); diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 98cf949..274458d 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -54,6 +54,59 @@ static bool kvmppc_tce_virt_only = false; module_param_named(virt_only, kvmppc_tce_virt_only, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(virt_only, Disable realmode handling of IOMMU map/unmap); +#ifdef CONFIG_IOMMU_API +/* + * Adds a new huge page descriptor to the list. 
+ */ +static struct iommu_kvmppc_hugepage *kvmppc_iommu_hugepage_add( + struct kvmppc_spapr_tce_table *tt, + pte_t pte, unsigned long va, unsigned long pg_size) +{ + int ret; + struct iommu_kvmppc_hugepage *hp; + struct page *p; + + va = va ~(pg_size - 1); + ret = get_user_pages_fast(va, 1, true/*write*/, p); + if ((ret != 1) || !p) + return NULL; + + hp = kzalloc(sizeof(*hp), GFP_KERNEL); + if (!hp) + return NULL; + + hp-page = p; + hp-pte = pte; + hp-pa = __pa((unsigned long) page_address(hp-page)); + hp-size = pg_size; + + spin_lock(tt-hugepages_lock); + list_add(hp-list, tt-hugepages); + spin_unlock(tt-hugepages_lock); + + return hp; +} + +static void kvmppc_iommu_hugepages_init(struct kvmppc_spapr_tce_table *tt) +{ + INIT_LIST_HEAD(tt-hugepages); + spin_lock_init(tt-hugepages_lock); +} + +static void kvmppc_iommu_hugepages_cleanup(struct kvmppc_spapr_tce_table *tt) +{ + struct iommu_kvmppc_hugepage *hp, *tmp; + + spin_lock(tt-hugepages_lock); + list_for_each_entry_safe(hp, tmp, tt-hugepages, list) {
Re: [PATCH 0/3] vhost cleanups and separate module
Asias He as...@redhat.com writes: Asias He (3): vhost: Remove vhost_enable_zcopy in vhost.h vhost: Move VHOST_NET_FEATURES to net.c vhost: Make vhost a separate module I like these cleanups, MST pleasee apply. I have some other cleanups which are on hold for the moment pending MST's vhost_net simplification. MST, how's that going? Thanks, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/3] vhost cleanups and separate module
Hello Rusty, On Mon, May 06, 2013 at 03:41:36PM +0930, Rusty Russell wrote: Asias He as...@redhat.com writes: Asias He (3): vhost: Remove vhost_enable_zcopy in vhost.h vhost: Move VHOST_NET_FEATURES to net.c vhost: Make vhost a separate module I like these cleanups, MST pleasee apply. I have some other cleanups which are on hold for the moment pending MST's vhost_net simplification. MST, how's that going? Do you mean patches in your rusty/vringh branch? I want to do the frame assumption conversion for vhost-scsi on top of the vringh series. Thanks, Rusty. -- Asias -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/4] vhost-net: Cleanup vhost_ubuf adn vhost_zcopy
Typo a/adn/and/ On Fri, May 03, 2013 at 02:25:18PM +0800, Asias He wrote: - Rename vhost_ubuf to vhost_net_ubuf - Rename vhost_zcopy_mask to vhost_net_zcopy_mask - Make funcs static Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/net.c | 58 +++-- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index eb73217..4548c0b 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -70,7 +70,7 @@ enum { VHOST_NET_VQ_MAX = 2, }; -struct vhost_ubuf_ref { +struct vhost_net_ubuf_ref { struct kref kref; wait_queue_head_t wait; struct vhost_virtqueue *vq; @@ -93,7 +93,7 @@ struct vhost_net_virtqueue { struct ubuf_info *ubuf_info; /* Reference counting for outstanding ubufs. * Protected by vq mutex. Writers must also take device mutex. */ - struct vhost_ubuf_ref *ubufs; + struct vhost_net_ubuf_ref *ubufs; }; struct vhost_net { @@ -110,24 +110,25 @@ struct vhost_net { bool tx_flush; }; -static unsigned vhost_zcopy_mask __read_mostly; +static unsigned vhost_net_zcopy_mask __read_mostly; -void vhost_enable_zcopy(int vq) +static void vhost_net_enable_zcopy(int vq) { - vhost_zcopy_mask |= 0x1 vq; + vhost_net_zcopy_mask |= 0x1 vq; } -static void vhost_zerocopy_done_signal(struct kref *kref) +static void vhost_net_zerocopy_done_signal(struct kref *kref) { - struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref, - kref); + struct vhost_net_ubuf_ref *ubufs; + + ubufs = container_of(kref, struct vhost_net_ubuf_ref, kref); wake_up(ubufs-wait); } -struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq, - bool zcopy) +static struct vhost_net_ubuf_ref * +vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) { - struct vhost_ubuf_ref *ubufs; + struct vhost_net_ubuf_ref *ubufs; /* No zero copy backend? Nothing to count. 
*/ if (!zcopy) return NULL; @@ -140,14 +141,14 @@ struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq, return ubufs; } -void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs) +static void vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) { - kref_put(ubufs-kref, vhost_zerocopy_done_signal); + kref_put(ubufs-kref, vhost_net_zerocopy_done_signal); } -void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs) +static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) { - kref_put(ubufs-kref, vhost_zerocopy_done_signal); + kref_put(ubufs-kref, vhost_net_zerocopy_done_signal); wait_event(ubufs-wait, !atomic_read(ubufs-kref.refcount)); kfree(ubufs); } @@ -159,7 +160,7 @@ static void vhost_net_clear_ubuf_info(struct vhost_net *n) int i; for (i = 0; i n-dev.nvqs; ++i) { - zcopy = vhost_zcopy_mask (0x1 i); + zcopy = vhost_net_zcopy_mask (0x1 i); if (zcopy) kfree(n-vqs[i].ubuf_info); } @@ -171,7 +172,7 @@ int vhost_net_set_ubuf_info(struct vhost_net *n) int i; for (i = 0; i n-dev.nvqs; ++i) { - zcopy = vhost_zcopy_mask (0x1 i); + zcopy = vhost_net_zcopy_mask (0x1 i); if (!zcopy) continue; n-vqs[i].ubuf_info = kmalloc(sizeof(*n-vqs[i].ubuf_info) * @@ -183,7 +184,7 @@ int vhost_net_set_ubuf_info(struct vhost_net *n) err: while (i--) { - zcopy = vhost_zcopy_mask (0x1 i); + zcopy = vhost_net_zcopy_mask (0x1 i); if (!zcopy) continue; kfree(n-vqs[i].ubuf_info); @@ -305,7 +306,7 @@ static int vhost_zerocopy_signal_used(struct vhost_net *net, static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) { - struct vhost_ubuf_ref *ubufs = ubuf-ctx; + struct vhost_net_ubuf_ref *ubufs = ubuf-ctx; struct vhost_virtqueue *vq = ubufs-vq; int cnt = atomic_read(ubufs-kref.refcount); @@ -322,7 +323,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) /* set len to mark this desc buffers done DMA */ vq-heads[ubuf-desc].len = success ? 
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; - vhost_ubuf_put(ubufs); + vhost_net_ubuf_put(ubufs); } /* Expects to be always run from workqueue - which acts as @@ -345,7 +346,7 @@ static void handle_tx(struct vhost_net *net) int err; size_t hdr_size; struct socket *sock; - struct vhost_ubuf_ref *uninitialized_var(ubufs); + struct vhost_net_ubuf_ref *uninitialized_var(ubufs); bool zcopy, zcopy_used; /* TODO: check that we are
Re: [PATCH 4/4] vhost-net: Cleanup vhost_ubuf adn vhost_zcopy
On Mon, May 6, 2013 at 4:17 PM, Michael S. Tsirkin m...@redhat.com wrote: Typo a/adn/and/ Yes. Catched this up and and fixed already. On Fri, May 03, 2013 at 02:25:18PM +0800, Asias He wrote: - Rename vhost_ubuf to vhost_net_ubuf - Rename vhost_zcopy_mask to vhost_net_zcopy_mask - Make funcs static Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/net.c | 58 +++-- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index eb73217..4548c0b 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -70,7 +70,7 @@ enum { VHOST_NET_VQ_MAX = 2, }; -struct vhost_ubuf_ref { +struct vhost_net_ubuf_ref { struct kref kref; wait_queue_head_t wait; struct vhost_virtqueue *vq; @@ -93,7 +93,7 @@ struct vhost_net_virtqueue { struct ubuf_info *ubuf_info; /* Reference counting for outstanding ubufs. * Protected by vq mutex. Writers must also take device mutex. */ - struct vhost_ubuf_ref *ubufs; + struct vhost_net_ubuf_ref *ubufs; }; struct vhost_net { @@ -110,24 +110,25 @@ struct vhost_net { bool tx_flush; }; -static unsigned vhost_zcopy_mask __read_mostly; +static unsigned vhost_net_zcopy_mask __read_mostly; -void vhost_enable_zcopy(int vq) +static void vhost_net_enable_zcopy(int vq) { - vhost_zcopy_mask |= 0x1 vq; + vhost_net_zcopy_mask |= 0x1 vq; } -static void vhost_zerocopy_done_signal(struct kref *kref) +static void vhost_net_zerocopy_done_signal(struct kref *kref) { - struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref, - kref); + struct vhost_net_ubuf_ref *ubufs; + + ubufs = container_of(kref, struct vhost_net_ubuf_ref, kref); wake_up(ubufs-wait); } -struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq, - bool zcopy) +static struct vhost_net_ubuf_ref * +vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) { - struct vhost_ubuf_ref *ubufs; + struct vhost_net_ubuf_ref *ubufs; /* No zero copy backend? Nothing to count. 
*/ if (!zcopy) return NULL; @@ -140,14 +141,14 @@ struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq, return ubufs; } -void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs) +static void vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) { - kref_put(ubufs-kref, vhost_zerocopy_done_signal); + kref_put(ubufs-kref, vhost_net_zerocopy_done_signal); } -void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs) +static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) { - kref_put(ubufs-kref, vhost_zerocopy_done_signal); + kref_put(ubufs-kref, vhost_net_zerocopy_done_signal); wait_event(ubufs-wait, !atomic_read(ubufs-kref.refcount)); kfree(ubufs); } @@ -159,7 +160,7 @@ static void vhost_net_clear_ubuf_info(struct vhost_net *n) int i; for (i = 0; i n-dev.nvqs; ++i) { - zcopy = vhost_zcopy_mask (0x1 i); + zcopy = vhost_net_zcopy_mask (0x1 i); if (zcopy) kfree(n-vqs[i].ubuf_info); } @@ -171,7 +172,7 @@ int vhost_net_set_ubuf_info(struct vhost_net *n) int i; for (i = 0; i n-dev.nvqs; ++i) { - zcopy = vhost_zcopy_mask (0x1 i); + zcopy = vhost_net_zcopy_mask (0x1 i); if (!zcopy) continue; n-vqs[i].ubuf_info = kmalloc(sizeof(*n-vqs[i].ubuf_info) * @@ -183,7 +184,7 @@ int vhost_net_set_ubuf_info(struct vhost_net *n) err: while (i--) { - zcopy = vhost_zcopy_mask (0x1 i); + zcopy = vhost_net_zcopy_mask (0x1 i); if (!zcopy) continue; kfree(n-vqs[i].ubuf_info); @@ -305,7 +306,7 @@ static int vhost_zerocopy_signal_used(struct vhost_net *net, static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) { - struct vhost_ubuf_ref *ubufs = ubuf-ctx; + struct vhost_net_ubuf_ref *ubufs = ubuf-ctx; struct vhost_virtqueue *vq = ubufs-vq; int cnt = atomic_read(ubufs-kref.refcount); @@ -322,7 +323,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) /* set len to mark this desc buffers done DMA */ vq-heads[ubuf-desc].len = success ? 
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; - vhost_ubuf_put(ubufs); + vhost_net_ubuf_put(ubufs); } /* Expects to be always run from workqueue - which acts as @@ -345,7 +346,7 @@ static void handle_tx(struct vhost_net *net) int err; size_t hdr_size; struct socket *sock; - struct vhost_ubuf_ref *uninitialized_var(ubufs); + struct vhost_net_ubuf_ref
[PATCH v2 00/11] vhost cleanups
MST, This is on top of [PATCH 0/2] vhost-net fix ubuf. Asias He (11): vhost: Remove vhost_enable_zcopy in vhost.h vhost: Move VHOST_NET_FEATURES to net.c vhost: Make vhost a separate module vhost: Remove comments for hdr in vhost.h vhost: Simplify dev-vqs[i] access vhost-net: Cleanup vhost_ubuf and vhost_zcopy vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration vhost-scsi: Rename struct vhost_scsi *s to *vs vhost-scsi: Make func indention more consistent vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd drivers/vhost/Kconfig | 8 + drivers/vhost/Makefile | 3 +- drivers/vhost/net.c| 64 --- drivers/vhost/scsi.c | 470 ++--- drivers/vhost/vhost.c | 86 +++-- drivers/vhost/vhost.h | 11 +- 6 files changed, 361 insertions(+), 281 deletions(-) -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 01/11] vhost: Remove vhost_enable_zcopy in vhost.h
It is net.c specific. Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/vhost.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index cc23bc4..076c9ac 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -192,7 +192,4 @@ static inline int vhost_has_feature(struct vhost_dev *dev, int bit) acked_features = rcu_dereference_index_check(dev-acked_features, 1); return acked_features (1 bit); } - -void vhost_enable_zcopy(int vq); - #endif -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 02/11] vhost: Move VHOST_NET_FEATURES to net.c
vhost.h should not depend on device specific marcos like VHOST_NET_F_VIRTIO_NET_HDR and VIRTIO_NET_F_MRG_RXBUF. Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/net.c | 6 ++ drivers/vhost/vhost.h | 3 --- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 354665a..06b2447 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -59,6 +59,12 @@ MODULE_PARM_DESC(experimental_zcopytx, Enable Zero Copy TX; #define VHOST_DMA_IS_DONE(len) ((len) = VHOST_DMA_DONE_LEN) enum { + VHOST_NET_FEATURES = VHOST_FEATURES | +(1ULL VHOST_NET_F_VIRTIO_NET_HDR) | +(1ULL VIRTIO_NET_F_MRG_RXBUF), +}; + +enum { VHOST_NET_VQ_RX = 0, VHOST_NET_VQ_TX = 1, VHOST_NET_VQ_MAX = 2, diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 076c9ac..6bf81a9 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -178,9 +178,6 @@ enum { (1ULL VIRTIO_RING_F_INDIRECT_DESC) | (1ULL VIRTIO_RING_F_EVENT_IDX) | (1ULL VHOST_F_LOG_ALL), - VHOST_NET_FEATURES = VHOST_FEATURES | -(1ULL VHOST_NET_F_VIRTIO_NET_HDR) | -(1ULL VIRTIO_NET_F_MRG_RXBUF), }; static inline int vhost_has_feature(struct vhost_dev *dev, int bit) -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 03/11] vhost: Make vhost a separate module
Currently, vhost-net and vhost-scsi are sharing the vhost core code. However, vhost-scsi shares the code by including the vhost.c file directly. Making vhost a separate module makes it is easier to share code with other vhost devices. Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/Kconfig | 8 drivers/vhost/Makefile | 3 ++- drivers/vhost/scsi.c | 1 - drivers/vhost/vhost.c | 51 +- drivers/vhost/vhost.h | 2 ++ 5 files changed, 62 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 8b9226d..017a1e8 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -1,6 +1,7 @@ config VHOST_NET tristate Host kernel accelerator for virtio net depends on NET EVENTFD (TUN || !TUN) (MACVTAP || !MACVTAP) + select VHOST select VHOST_RING ---help--- This kernel module can be loaded in host kernel to accelerate @@ -13,6 +14,7 @@ config VHOST_NET config VHOST_SCSI tristate VHOST_SCSI TCM fabric driver depends on TARGET_CORE EVENTFD m + select VHOST select VHOST_RING default n ---help--- @@ -24,3 +26,9 @@ config VHOST_RING ---help--- This option is selected by any driver which needs to access the host side of a virtio ring. + +config VHOST + tristate + ---help--- + This option is selected by any driver which needs to access + the core of vhost. 
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 654e9afb..e0441c3 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -1,7 +1,8 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o -vhost_net-y := vhost.o net.o +vhost_net-y := net.o obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o vhost_scsi-y := scsi.o obj-$(CONFIG_VHOST_RING) += vringh.o +obj-$(CONFIG_VHOST)+= vhost.o diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 5179f7a..2dcb94a 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -49,7 +49,6 @@ #include linux/llist.h #include linux/bitmap.h -#include vhost.c #include vhost.h #define TCM_VHOST_VERSION v0.1 diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index de9441a..e406d5f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -25,6 +25,7 @@ #include linux/slab.h #include linux/kthread.h #include linux/cgroup.h +#include linux/module.h #include vhost.h @@ -66,6 +67,7 @@ void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) work-flushing = 0; work-queue_seq = work-done_seq = 0; } +EXPORT_SYMBOL_GPL(vhost_work_init); /* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, @@ -79,6 +81,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, vhost_work_init(poll-work, fn); } +EXPORT_SYMBOL_GPL(vhost_poll_init); /* Start polling a file. We add ourselves to file's wait queue. The caller must * keep a reference to a file until after vhost_poll_stop is called. */ @@ -101,6 +104,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file) return ret; } +EXPORT_SYMBOL_GPL(vhost_poll_start); /* Stop polling a file. After this function returns, it becomes safe to drop the * file reference. You must also flush afterwards. 
*/ @@ -111,6 +115,7 @@ void vhost_poll_stop(struct vhost_poll *poll) poll-wqh = NULL; } } +EXPORT_SYMBOL_GPL(vhost_poll_stop); static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, unsigned seq) @@ -123,7 +128,7 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, return left = 0; } -static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) +void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) { unsigned seq; int flushing; @@ -138,6 +143,7 @@ static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) spin_unlock_irq(dev-work_lock); BUG_ON(flushing 0); } +EXPORT_SYMBOL_GPL(vhost_work_flush); /* Flush any work that has been scheduled. When calling this, don't hold any * locks that are also used by the callback. */ @@ -145,6 +151,7 @@ void vhost_poll_flush(struct vhost_poll *poll) { vhost_work_flush(poll-dev, poll-work); } +EXPORT_SYMBOL_GPL(vhost_poll_flush); void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { @@ -158,11 +165,13 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) } spin_unlock_irqrestore(dev-work_lock, flags); } +EXPORT_SYMBOL_GPL(vhost_work_queue); void vhost_poll_queue(struct vhost_poll *poll) { vhost_work_queue(poll-dev, poll-work); } +EXPORT_SYMBOL_GPL(vhost_poll_queue); static void vhost_vq_reset(struct vhost_dev *dev,
[PATCH v2 04/11] vhost: Remove comments for hdr in vhost.h
It is supposed to be removed when hdr is moved into vhost_net_virtqueue. Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/vhost.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 94a80eb..51aeb5f 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -101,9 +101,6 @@ struct vhost_virtqueue { u64 log_addr; struct iovec iov[UIO_MAXIOV]; - /* hdr is used to store the virtio header. -* Since each iovec has = 1 byte length, we never need more than -* header length entries to store the header. */ struct iovec *indirect; struct vring_used_elem *heads; /* We use a kind of RCU to access private pointer. -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 05/11] vhost: Simplify dev-vqs[i] access
Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/vhost.c | 35 ++- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index e406d5f..74bc779 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -260,17 +260,16 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq) /* Helper to allocate iovec buffers for all vqs. */ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev) { + struct vhost_virtqueue *vq; int i; for (i = 0; i dev-nvqs; ++i) { - dev-vqs[i]-indirect = kmalloc(sizeof *dev-vqs[i]-indirect * - UIO_MAXIOV, GFP_KERNEL); - dev-vqs[i]-log = kmalloc(sizeof *dev-vqs[i]-log * UIO_MAXIOV, - GFP_KERNEL); - dev-vqs[i]-heads = kmalloc(sizeof *dev-vqs[i]-heads * - UIO_MAXIOV, GFP_KERNEL); - if (!dev-vqs[i]-indirect || !dev-vqs[i]-log || - !dev-vqs[i]-heads) + vq = dev-vqs[i]; + vq-indirect = kmalloc(sizeof *vq-indirect * UIO_MAXIOV, + GFP_KERNEL); + vq-log = kmalloc(sizeof *vq-log * UIO_MAXIOV, GFP_KERNEL); + vq-heads = kmalloc(sizeof *vq-heads * UIO_MAXIOV, GFP_KERNEL); + if (!vq-indirect || !vq-log || !vq-heads) goto err_nomem; } return 0; @@ -292,6 +291,7 @@ static void vhost_dev_free_iovecs(struct vhost_dev *dev) long vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue **vqs, int nvqs) { + struct vhost_virtqueue *vq; int i; dev-vqs = vqs; @@ -306,15 +306,16 @@ long vhost_dev_init(struct vhost_dev *dev, dev-worker = NULL; for (i = 0; i dev-nvqs; ++i) { - dev-vqs[i]-log = NULL; - dev-vqs[i]-indirect = NULL; - dev-vqs[i]-heads = NULL; - dev-vqs[i]-dev = dev; - mutex_init(dev-vqs[i]-mutex); - vhost_vq_reset(dev, dev-vqs[i]); - if (dev-vqs[i]-handle_kick) - vhost_poll_init(dev-vqs[i]-poll, - dev-vqs[i]-handle_kick, POLLIN, dev); + vq = dev-vqs[i]; + vq-log = NULL; + vq-indirect = NULL; + vq-heads = NULL; + vq-dev = dev; + mutex_init(vq-mutex); + vhost_vq_reset(dev, vq); + if (vq-handle_kick) + vhost_poll_init(vq-poll, vq-handle_kick, + POLLIN, dev); } return 0; -- 1.8.1.4 
-- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 06/11] vhost-net: Cleanup vhost_ubuf and vhost_zcopy
- Rename vhost_ubuf to vhost_net_ubuf - Rename vhost_zcopy_mask to vhost_net_zcopy_mask - Make funcs static Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/net.c | 58 +++-- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 06b2447..2b51e23 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -70,7 +70,7 @@ enum { VHOST_NET_VQ_MAX = 2, }; -struct vhost_ubuf_ref { +struct vhost_net_ubuf_ref { struct kref kref; wait_queue_head_t wait; struct vhost_virtqueue *vq; @@ -93,7 +93,7 @@ struct vhost_net_virtqueue { struct ubuf_info *ubuf_info; /* Reference counting for outstanding ubufs. * Protected by vq mutex. Writers must also take device mutex. */ - struct vhost_ubuf_ref *ubufs; + struct vhost_net_ubuf_ref *ubufs; }; struct vhost_net { @@ -110,24 +110,25 @@ struct vhost_net { bool tx_flush; }; -static unsigned vhost_zcopy_mask __read_mostly; +static unsigned vhost_net_zcopy_mask __read_mostly; -void vhost_enable_zcopy(int vq) +static void vhost_net_enable_zcopy(int vq) { - vhost_zcopy_mask |= 0x1 vq; + vhost_net_zcopy_mask |= 0x1 vq; } -static void vhost_zerocopy_done_signal(struct kref *kref) +static void vhost_net_zerocopy_done_signal(struct kref *kref) { - struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref, - kref); + struct vhost_net_ubuf_ref *ubufs; + + ubufs = container_of(kref, struct vhost_net_ubuf_ref, kref); wake_up(ubufs-wait); } -struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq, - bool zcopy) +static struct vhost_net_ubuf_ref * +vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) { - struct vhost_ubuf_ref *ubufs; + struct vhost_net_ubuf_ref *ubufs; /* No zero copy backend? Nothing to count. 
*/ if (!zcopy) return NULL; @@ -140,14 +141,14 @@ struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq, return ubufs; } -void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs) +static void vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) { - kref_put(ubufs-kref, vhost_zerocopy_done_signal); + kref_put(ubufs-kref, vhost_net_zerocopy_done_signal); } -void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs) +static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) { - kref_put(ubufs-kref, vhost_zerocopy_done_signal); + kref_put(ubufs-kref, vhost_net_zerocopy_done_signal); wait_event(ubufs-wait, !atomic_read(ubufs-kref.refcount)); kfree(ubufs); } @@ -159,7 +160,7 @@ static void vhost_net_clear_ubuf_info(struct vhost_net *n) int i; for (i = 0; i n-dev.nvqs; ++i) { - zcopy = vhost_zcopy_mask (0x1 i); + zcopy = vhost_net_zcopy_mask (0x1 i); if (zcopy) kfree(n-vqs[i].ubuf_info); } @@ -171,7 +172,7 @@ int vhost_net_set_ubuf_info(struct vhost_net *n) int i; for (i = 0; i n-dev.nvqs; ++i) { - zcopy = vhost_zcopy_mask (0x1 i); + zcopy = vhost_net_zcopy_mask (0x1 i); if (!zcopy) continue; n-vqs[i].ubuf_info = kmalloc(sizeof(*n-vqs[i].ubuf_info) * @@ -183,7 +184,7 @@ int vhost_net_set_ubuf_info(struct vhost_net *n) err: while (i--) { - zcopy = vhost_zcopy_mask (0x1 i); + zcopy = vhost_net_zcopy_mask (0x1 i); if (!zcopy) continue; kfree(n-vqs[i].ubuf_info); @@ -305,7 +306,7 @@ static int vhost_zerocopy_signal_used(struct vhost_net *net, static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) { - struct vhost_ubuf_ref *ubufs = ubuf-ctx; + struct vhost_net_ubuf_ref *ubufs = ubuf-ctx; struct vhost_virtqueue *vq = ubufs-vq; int cnt = atomic_read(ubufs-kref.refcount); @@ -322,7 +323,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) /* set len to mark this desc buffers done DMA */ vq-heads[ubuf-desc].len = success ? 
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; - vhost_ubuf_put(ubufs); + vhost_net_ubuf_put(ubufs); } /* Expects to be always run from workqueue - which acts as @@ -345,7 +346,7 @@ static void handle_tx(struct vhost_net *net) int err; size_t hdr_size; struct socket *sock; - struct vhost_ubuf_ref *uninitialized_var(ubufs); + struct vhost_net_ubuf_ref *uninitialized_var(ubufs); bool zcopy, zcopy_used; /* TODO: check that we are running from vhost_worker? */ @@ -441,7 +442,7 @@ static void handle_tx(struct vhost_net
[PATCH v2 07/11] vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration
It was needed when struct tcm_vhost_tpg is in tcm_vhost.h Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/scsi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 2dcb94a..02ddedd 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -115,7 +115,6 @@ struct tcm_vhost_nacl { struct se_node_acl se_node_acl; }; -struct vhost_scsi; struct tcm_vhost_tpg { /* Vhost port target portal group tag for TCM */ u16 tport_tpgt; -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 08/11] vhost-scsi: Rename struct vhost_scsi *s to *vs
vs is used everywhere, make the naming more consistent. Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/scsi.c | 56 ++-- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 02ddedd..d4798e1 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1342,63 +1342,63 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features) static int vhost_scsi_open(struct inode *inode, struct file *f) { - struct vhost_scsi *s; + struct vhost_scsi *vs; struct vhost_virtqueue **vqs; int r, i; - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) + vs = kzalloc(sizeof(*vs), GFP_KERNEL); + if (!vs) return -ENOMEM; vqs = kmalloc(VHOST_SCSI_MAX_VQ * sizeof(*vqs), GFP_KERNEL); if (!vqs) { - kfree(s); + kfree(vs); return -ENOMEM; } - vhost_work_init(s-vs_completion_work, vhost_scsi_complete_cmd_work); - vhost_work_init(s-vs_event_work, tcm_vhost_evt_work); + vhost_work_init(vs-vs_completion_work, vhost_scsi_complete_cmd_work); + vhost_work_init(vs-vs_event_work, tcm_vhost_evt_work); - s-vs_events_nr = 0; - s-vs_events_missed = false; + vs-vs_events_nr = 0; + vs-vs_events_missed = false; - vqs[VHOST_SCSI_VQ_CTL] = s-vqs[VHOST_SCSI_VQ_CTL].vq; - vqs[VHOST_SCSI_VQ_EVT] = s-vqs[VHOST_SCSI_VQ_EVT].vq; - s-vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick; - s-vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick; + vqs[VHOST_SCSI_VQ_CTL] = vs-vqs[VHOST_SCSI_VQ_CTL].vq; + vqs[VHOST_SCSI_VQ_EVT] = vs-vqs[VHOST_SCSI_VQ_EVT].vq; + vs-vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick; + vs-vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick; for (i = VHOST_SCSI_VQ_IO; i VHOST_SCSI_MAX_VQ; i++) { - vqs[i] = s-vqs[i].vq; - s-vqs[i].vq.handle_kick = vhost_scsi_handle_kick; + vqs[i] = vs-vqs[i].vq; + vs-vqs[i].vq.handle_kick = vhost_scsi_handle_kick; } - r = vhost_dev_init(s-dev, vqs, VHOST_SCSI_MAX_VQ); + r = vhost_dev_init(vs-dev, vqs, 
VHOST_SCSI_MAX_VQ); - tcm_vhost_init_inflight(s, NULL); + tcm_vhost_init_inflight(vs, NULL); if (r 0) { kfree(vqs); - kfree(s); + kfree(vs); return r; } - f-private_data = s; + f-private_data = vs; return 0; } static int vhost_scsi_release(struct inode *inode, struct file *f) { - struct vhost_scsi *s = f-private_data; + struct vhost_scsi *vs = f-private_data; struct vhost_scsi_target t; - mutex_lock(s-dev.mutex); - memcpy(t.vhost_wwpn, s-vs_vhost_wwpn, sizeof(t.vhost_wwpn)); - mutex_unlock(s-dev.mutex); - vhost_scsi_clear_endpoint(s, t); - vhost_dev_stop(s-dev); - vhost_dev_cleanup(s-dev, false); + mutex_lock(vs-dev.mutex); + memcpy(t.vhost_wwpn, vs-vs_vhost_wwpn, sizeof(t.vhost_wwpn)); + mutex_unlock(vs-dev.mutex); + vhost_scsi_clear_endpoint(vs, t); + vhost_dev_stop(vs-dev); + vhost_dev_cleanup(vs-dev, false); /* Jobs can re-queue themselves in evt kick handler. Do extra flush. */ - vhost_scsi_flush(s); - kfree(s-dev.vqs); - kfree(s); + vhost_scsi_flush(vs); + kfree(vs-dev.vqs); + kfree(vs); return 0; } -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 09/11] vhost-scsi: Make func indention more consistent
Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/scsi.c | 154 +-- 1 file changed, 88 insertions(+), 66 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index d4798e1..d9781ed 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -333,11 +333,12 @@ static u32 tcm_vhost_get_default_depth(struct se_portal_group *se_tpg) return 1; } -static u32 tcm_vhost_get_pr_transport_id(struct se_portal_group *se_tpg, - struct se_node_acl *se_nacl, - struct t10_pr_registration *pr_reg, - int *format_code, - unsigned char *buf) +static u32 +tcm_vhost_get_pr_transport_id(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl, + struct t10_pr_registration *pr_reg, + int *format_code, + unsigned char *buf) { struct tcm_vhost_tpg *tpg = container_of(se_tpg, struct tcm_vhost_tpg, se_tpg); @@ -363,10 +364,11 @@ static u32 tcm_vhost_get_pr_transport_id(struct se_portal_group *se_tpg, format_code, buf); } -static u32 tcm_vhost_get_pr_transport_id_len(struct se_portal_group *se_tpg, - struct se_node_acl *se_nacl, - struct t10_pr_registration *pr_reg, - int *format_code) +static u32 +tcm_vhost_get_pr_transport_id_len(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl, + struct t10_pr_registration *pr_reg, + int *format_code) { struct tcm_vhost_tpg *tpg = container_of(se_tpg, struct tcm_vhost_tpg, se_tpg); @@ -392,10 +394,11 @@ static u32 tcm_vhost_get_pr_transport_id_len(struct se_portal_group *se_tpg, format_code); } -static char *tcm_vhost_parse_pr_out_transport_id(struct se_portal_group *se_tpg, - const char *buf, - u32 *out_tid_len, - char **port_nexus_ptr) +static char * +tcm_vhost_parse_pr_out_transport_id(struct se_portal_group *se_tpg, + const char *buf, + u32 *out_tid_len, + char **port_nexus_ptr) { struct tcm_vhost_tpg *tpg = container_of(se_tpg, struct tcm_vhost_tpg, se_tpg); @@ -421,8 +424,8 @@ static char *tcm_vhost_parse_pr_out_transport_id(struct se_portal_group *se_tpg, port_nexus_ptr); } -static struct se_node_acl 
*tcm_vhost_alloc_fabric_acl( - struct se_portal_group *se_tpg) +static struct se_node_acl * +tcm_vhost_alloc_fabric_acl(struct se_portal_group *se_tpg) { struct tcm_vhost_nacl *nacl; @@ -435,8 +438,9 @@ static struct se_node_acl *tcm_vhost_alloc_fabric_acl( return nacl-se_node_acl; } -static void tcm_vhost_release_fabric_acl(struct se_portal_group *se_tpg, - struct se_node_acl *se_nacl) +static void +tcm_vhost_release_fabric_acl(struct se_portal_group *se_tpg, +struct se_node_acl *se_nacl) { struct tcm_vhost_nacl *nacl = container_of(se_nacl, struct tcm_vhost_nacl, se_node_acl); @@ -531,8 +535,9 @@ static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt) kfree(evt); } -static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs, - u32 event, u32 reason) +static struct tcm_vhost_evt * +tcm_vhost_allocate_evt(struct vhost_scsi *vs, + u32 event, u32 reason) { struct vhost_virtqueue *vq = vs-vqs[VHOST_SCSI_VQ_EVT].vq; struct tcm_vhost_evt *evt; @@ -576,8 +581,8 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd) kfree(tv_cmd); } -static void tcm_vhost_do_evt_work(struct vhost_scsi *vs, - struct tcm_vhost_evt *evt) +static void +tcm_vhost_do_evt_work(struct vhost_scsi *vs, struct tcm_vhost_evt *evt) { struct vhost_virtqueue *vq = vs-vqs[VHOST_SCSI_VQ_EVT].vq; struct virtio_scsi_event *event = evt-event; @@ -698,12 +703,12 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) vhost_signal(vs-dev, vs-vqs[vq].vq); } -static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd( - struct vhost_virtqueue *vq, - struct tcm_vhost_tpg *tv_tpg, - struct virtio_scsi_cmd_req *v_req, - u32 exp_data_len, - int data_direction) +static struct tcm_vhost_cmd * +vhost_scsi_allocate_cmd(struct vhost_virtqueue *vq, + struct tcm_vhost_tpg *tv_tpg, + struct virtio_scsi_cmd_req *v_req, + u32 exp_data_len, + int data_direction) { struct tcm_vhost_cmd *tv_cmd; struct tcm_vhost_nexus *tv_nexus; @@ -734,8 +739,11 @@ static struct 
tcm_vhost_cmd
[PATCH v2 10/11] vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg
Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/scsi.c | 122 +-- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index d9781ed..353145f 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -705,7 +705,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) static struct tcm_vhost_cmd * vhost_scsi_allocate_cmd(struct vhost_virtqueue *vq, - struct tcm_vhost_tpg *tv_tpg, + struct tcm_vhost_tpg *tpg, struct virtio_scsi_cmd_req *v_req, u32 exp_data_len, int data_direction) @@ -713,7 +713,7 @@ vhost_scsi_allocate_cmd(struct vhost_virtqueue *vq, struct tcm_vhost_cmd *tv_cmd; struct tcm_vhost_nexus *tv_nexus; - tv_nexus = tv_tpg-tpg_nexus; + tv_nexus = tpg-tpg_nexus; if (!tv_nexus) { pr_err(Unable to locate active struct tcm_vhost_nexus\n); return ERR_PTR(-EIO); @@ -895,7 +895,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) { struct tcm_vhost_tpg **vs_tpg; struct virtio_scsi_cmd_req v_req; - struct tcm_vhost_tpg *tv_tpg; + struct tcm_vhost_tpg *tpg; struct tcm_vhost_cmd *tv_cmd; u32 exp_data_len, data_first, data_num, data_direction; unsigned out, in, i; @@ -981,10 +981,10 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) /* Extract the tpgt */ target = v_req.lun[1]; - tv_tpg = ACCESS_ONCE(vs_tpg[target]); + tpg = ACCESS_ONCE(vs_tpg[target]); /* Target does not exist, fail the request */ - if (unlikely(!tv_tpg)) { + if (unlikely(!tpg)) { vhost_scsi_send_bad_target(vs, vq, head, out); continue; } @@ -993,7 +993,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) for (i = 0; i data_num; i++) exp_data_len += vq-iov[data_first + i].iov_len; - tv_cmd = vhost_scsi_allocate_cmd(vq, tv_tpg, v_req, + tv_cmd = vhost_scsi_allocate_cmd(vq, tpg, v_req, exp_data_len, data_direction); if (IS_ERR(tv_cmd)) { vq_err(vq, vhost_scsi_allocate_cmd failed %ld\n, @@ -1172,7 +1172,7 @@ vhost_scsi_set_endpoint(struct 
vhost_scsi *vs, struct vhost_scsi_target *t) { struct tcm_vhost_tport *tv_tport; - struct tcm_vhost_tpg *tv_tpg; + struct tcm_vhost_tpg *tpg; struct tcm_vhost_tpg **vs_tpg; struct vhost_virtqueue *vq; int index, ret, i, len; @@ -1199,32 +1199,32 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, if (vs-vs_tpg) memcpy(vs_tpg, vs-vs_tpg, len); - list_for_each_entry(tv_tpg, tcm_vhost_list, tv_tpg_list) { - mutex_lock(tv_tpg-tv_tpg_mutex); - if (!tv_tpg-tpg_nexus) { - mutex_unlock(tv_tpg-tv_tpg_mutex); + list_for_each_entry(tpg, tcm_vhost_list, tv_tpg_list) { + mutex_lock(tpg-tv_tpg_mutex); + if (!tpg-tpg_nexus) { + mutex_unlock(tpg-tv_tpg_mutex); continue; } - if (tv_tpg-tv_tpg_vhost_count != 0) { - mutex_unlock(tv_tpg-tv_tpg_mutex); + if (tpg-tv_tpg_vhost_count != 0) { + mutex_unlock(tpg-tv_tpg_mutex); continue; } - tv_tport = tv_tpg-tport; + tv_tport = tpg-tport; if (!strcmp(tv_tport-tport_name, t-vhost_wwpn)) { - if (vs-vs_tpg vs-vs_tpg[tv_tpg-tport_tpgt]) { + if (vs-vs_tpg vs-vs_tpg[tpg-tport_tpgt]) { kfree(vs_tpg); - mutex_unlock(tv_tpg-tv_tpg_mutex); + mutex_unlock(tpg-tv_tpg_mutex); ret = -EEXIST; goto out; } - tv_tpg-tv_tpg_vhost_count++; - tv_tpg-vhost_scsi = vs; - vs_tpg[tv_tpg-tport_tpgt] = tv_tpg; + tpg-tv_tpg_vhost_count++; + tpg-vhost_scsi = vs; + vs_tpg[tpg-tport_tpgt] = tpg; smp_mb__after_atomic_inc(); match = true; } - mutex_unlock(tv_tpg-tv_tpg_mutex); + mutex_unlock(tpg-tv_tpg_mutex); } if (match) { @@ -1262,7 +1262,7 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs,
[PATCH v2 11/11] vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd
This way, we use cmd for struct tcm_vhost_cmd and evt for struct tcm_vhost_evt. Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/scsi.c | 142 +-- 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 353145f..d860b58 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -499,28 +499,28 @@ static int tcm_vhost_get_cmd_state(struct se_cmd *se_cmd) return 0; } -static void vhost_scsi_complete_cmd(struct tcm_vhost_cmd *tv_cmd) +static void vhost_scsi_complete_cmd(struct tcm_vhost_cmd *cmd) { - struct vhost_scsi *vs = tv_cmd-tvc_vhost; + struct vhost_scsi *vs = cmd-tvc_vhost; - llist_add(tv_cmd-tvc_completion_list, vs-vs_completion_list); + llist_add(cmd-tvc_completion_list, vs-vs_completion_list); vhost_work_queue(vs-dev, vs-vs_completion_work); } static int tcm_vhost_queue_data_in(struct se_cmd *se_cmd) { - struct tcm_vhost_cmd *tv_cmd = container_of(se_cmd, + struct tcm_vhost_cmd *cmd = container_of(se_cmd, struct tcm_vhost_cmd, tvc_se_cmd); - vhost_scsi_complete_cmd(tv_cmd); + vhost_scsi_complete_cmd(cmd); return 0; } static int tcm_vhost_queue_status(struct se_cmd *se_cmd) { - struct tcm_vhost_cmd *tv_cmd = container_of(se_cmd, + struct tcm_vhost_cmd *cmd = container_of(se_cmd, struct tcm_vhost_cmd, tvc_se_cmd); - vhost_scsi_complete_cmd(tv_cmd); + vhost_scsi_complete_cmd(cmd); return 0; } @@ -561,24 +561,24 @@ tcm_vhost_allocate_evt(struct vhost_scsi *vs, return evt; } -static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd) +static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *cmd) { - struct se_cmd *se_cmd = tv_cmd-tvc_se_cmd; + struct se_cmd *se_cmd = cmd-tvc_se_cmd; /* TODO locking against target/backend threads? 
*/ transport_generic_free_cmd(se_cmd, 1); - if (tv_cmd-tvc_sgl_count) { + if (cmd-tvc_sgl_count) { u32 i; - for (i = 0; i tv_cmd-tvc_sgl_count; i++) - put_page(sg_page(tv_cmd-tvc_sgl[i])); + for (i = 0; i cmd-tvc_sgl_count; i++) + put_page(sg_page(cmd-tvc_sgl[i])); - kfree(tv_cmd-tvc_sgl); + kfree(cmd-tvc_sgl); } - tcm_vhost_put_inflight(tv_cmd-inflight); + tcm_vhost_put_inflight(cmd-inflight); - kfree(tv_cmd); + kfree(cmd); } static void @@ -661,7 +661,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) vs_completion_work); DECLARE_BITMAP(signal, VHOST_SCSI_MAX_VQ); struct virtio_scsi_cmd_resp v_rsp; - struct tcm_vhost_cmd *tv_cmd; + struct tcm_vhost_cmd *cmd; struct llist_node *llnode; struct se_cmd *se_cmd; int ret, vq; @@ -669,32 +669,32 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) bitmap_zero(signal, VHOST_SCSI_MAX_VQ); llnode = llist_del_all(vs-vs_completion_list); while (llnode) { - tv_cmd = llist_entry(llnode, struct tcm_vhost_cmd, + cmd = llist_entry(llnode, struct tcm_vhost_cmd, tvc_completion_list); llnode = llist_next(llnode); - se_cmd = tv_cmd-tvc_se_cmd; + se_cmd = cmd-tvc_se_cmd; pr_debug(%s tv_cmd %p resid %u status %#02x\n, __func__, - tv_cmd, se_cmd-residual_count, se_cmd-scsi_status); + cmd, se_cmd-residual_count, se_cmd-scsi_status); memset(v_rsp, 0, sizeof(v_rsp)); v_rsp.resid = se_cmd-residual_count; /* TODO is status_qualifier field needed? 
*/ v_rsp.status = se_cmd-scsi_status; v_rsp.sense_len = se_cmd-scsi_sense_length; - memcpy(v_rsp.sense, tv_cmd-tvc_sense_buf, + memcpy(v_rsp.sense, cmd-tvc_sense_buf, v_rsp.sense_len); - ret = copy_to_user(tv_cmd-tvc_resp, v_rsp, sizeof(v_rsp)); + ret = copy_to_user(cmd-tvc_resp, v_rsp, sizeof(v_rsp)); if (likely(ret == 0)) { struct vhost_scsi_virtqueue *q; - vhost_add_used(tv_cmd-tvc_vq, tv_cmd-tvc_vq_desc, 0); - q = container_of(tv_cmd-tvc_vq, struct vhost_scsi_virtqueue, vq); + vhost_add_used(cmd-tvc_vq, cmd-tvc_vq_desc, 0); + q = container_of(cmd-tvc_vq, struct vhost_scsi_virtqueue, vq); vq = q - vs-vqs; __set_bit(vq, signal); } else pr_err(Faulted on virtio_scsi_cmd_resp\n); -
Re: [PATCH v2 00/11] vhost cleanups
On Mon, May 06, 2013 at 04:38:18PM +0800, Asias He wrote: MST, This is on top of [PATCH 0/2] vhost-net fix ubuf. Acked-by: Michael S. Tsirkin m...@redhat.com Once -rc1 is out I'll fork -next and apply them. Thanks a lot! Nicholas, recently attempts to push patches through both net and target trees resulted in a bit of a mess, so let's stick to the common tree (unless there's a dependency that makes us not to) until rate of changes in the common code calms down a bit. OK? Asias He (11): vhost: Remove vhost_enable_zcopy in vhost.h vhost: Move VHOST_NET_FEATURES to net.c vhost: Make vhost a separate module vhost: Remove comments for hdr in vhost.h vhost: Simplify dev-vqs[i] access vhost-net: Cleanup vhost_ubuf and vhost_zcopy vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration vhost-scsi: Rename struct vhost_scsi *s to *vs vhost-scsi: Make func indention more consistent vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd drivers/vhost/Kconfig | 8 + drivers/vhost/Makefile | 3 +- drivers/vhost/net.c| 64 --- drivers/vhost/scsi.c | 470 ++--- drivers/vhost/vhost.c | 86 +++-- drivers/vhost/vhost.h | 11 +- 6 files changed, 361 insertions(+), 281 deletions(-) -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/3] vhost cleanups and separate module
On Mon, May 06, 2013 at 03:41:36PM +0930, Rusty Russell wrote: Asias He as...@redhat.com writes: Asias He (3): vhost: Remove vhost_enable_zcopy in vhost.h vhost: Move VHOST_NET_FEATURES to net.c vhost: Make vhost a separate module I like these cleanups, MST please apply. Absolutely. Except it's 3.11 material and I can only usefully create a -next branch once -rc1 is out. I have some other cleanups which are on hold for the moment pending MST's vhost_net simplification. MST, how's that going? Not too well. The array of status bytes which was designed to complete packets in order turns out to be a very efficient datastructure: It gives us a way to signal completions that is completely lockless for multiple completers, and using the producer/consumer model saves extra scans for the common case. Overall I can save some memory and clean up some code but can't get rid of the producer/consumer indices (currently named upend/done indices) which is what you asked me to do. Your cleanups basically don't work with zcopy because they ignore the upend/done indices? Would you like to post them, noting they only work with zcopy off, and we'll look for a way to apply them, together? Thanks, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 03/11] vhost: Make vhost a separate module
On Mon, May 06, 2013 at 04:38:21PM +0800, Asias He wrote: Currently, vhost-net and vhost-scsi are sharing the vhost core code. However, vhost-scsi shares the code by including the vhost.c file directly. Making vhost a separate module makes it is easier to share code with other vhost devices. Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/Kconfig | 8 drivers/vhost/Makefile | 3 ++- drivers/vhost/scsi.c | 1 - drivers/vhost/vhost.c | 51 +- drivers/vhost/vhost.h | 2 ++ 5 files changed, 62 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 8b9226d..017a1e8 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -1,6 +1,7 @@ config VHOST_NET tristate Host kernel accelerator for virtio net depends on NET EVENTFD (TUN || !TUN) (MACVTAP || !MACVTAP) + select VHOST select VHOST_RING ---help--- This kernel module can be loaded in host kernel to accelerate @@ -13,6 +14,7 @@ config VHOST_NET config VHOST_SCSI tristate VHOST_SCSI TCM fabric driver depends on TARGET_CORE EVENTFD m + select VHOST select VHOST_RING default n ---help--- @@ -24,3 +26,9 @@ config VHOST_RING ---help--- This option is selected by any driver which needs to access the host side of a virtio ring. + +config VHOST + tristate + ---help--- + This option is selected by any driver which needs to access + the core of vhost. 
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 654e9afb..e0441c3 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -1,7 +1,8 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o -vhost_net-y := vhost.o net.o +vhost_net-y := net.o obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o vhost_scsi-y := scsi.o obj-$(CONFIG_VHOST_RING) += vringh.o +obj-$(CONFIG_VHOST) += vhost.o diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 5179f7a..2dcb94a 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -49,7 +49,6 @@ #include linux/llist.h #include linux/bitmap.h -#include vhost.c #include vhost.h #define TCM_VHOST_VERSION v0.1 diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index de9441a..e406d5f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -25,6 +25,7 @@ #include linux/slab.h #include linux/kthread.h #include linux/cgroup.h +#include linux/module.h #include vhost.h @@ -66,6 +67,7 @@ void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) work-flushing = 0; work-queue_seq = work-done_seq = 0; } +EXPORT_SYMBOL_GPL(vhost_work_init); /* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, @@ -79,6 +81,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, vhost_work_init(poll-work, fn); } +EXPORT_SYMBOL_GPL(vhost_poll_init); /* Start polling a file. We add ourselves to file's wait queue. The caller must * keep a reference to a file until after vhost_poll_stop is called. */ @@ -101,6 +104,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file) return ret; } +EXPORT_SYMBOL_GPL(vhost_poll_start); /* Stop polling a file. After this function returns, it becomes safe to drop the * file reference. You must also flush afterwards. 
*/ @@ -111,6 +115,7 @@ void vhost_poll_stop(struct vhost_poll *poll) poll-wqh = NULL; } } +EXPORT_SYMBOL_GPL(vhost_poll_stop); static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, unsigned seq) @@ -123,7 +128,7 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, return left = 0; } -static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) +void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) { unsigned seq; int flushing; @@ -138,6 +143,7 @@ static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) spin_unlock_irq(dev-work_lock); BUG_ON(flushing 0); } +EXPORT_SYMBOL_GPL(vhost_work_flush); /* Flush any work that has been scheduled. When calling this, don't hold any * locks that are also used by the callback. */ @@ -145,6 +151,7 @@ void vhost_poll_flush(struct vhost_poll *poll) { vhost_work_flush(poll-dev, poll-work); } +EXPORT_SYMBOL_GPL(vhost_poll_flush); void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { @@ -158,11 +165,13 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) } spin_unlock_irqrestore(dev-work_lock, flags); } +EXPORT_SYMBOL_GPL(vhost_work_queue); void vhost_poll_queue(struct vhost_poll *poll) {
Re: [PATCH v2 03/11] vhost: Make vhost a separate module
On Mon, May 06, 2013 at 04:38:21PM +0800, Asias He wrote: Currently, vhost-net and vhost-scsi are sharing the vhost core code. However, vhost-scsi shares the code by including the vhost.c file directly. Making vhost a separate module makes it is easier to share code with other vhost devices. Signed-off-by: Asias He as...@redhat.com Also this will break test.c, right? Let's fix it in the same commit too. --- drivers/vhost/Kconfig | 8 drivers/vhost/Makefile | 3 ++- drivers/vhost/scsi.c | 1 - drivers/vhost/vhost.c | 51 +- drivers/vhost/vhost.h | 2 ++ 5 files changed, 62 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 8b9226d..017a1e8 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -1,6 +1,7 @@ config VHOST_NET tristate Host kernel accelerator for virtio net depends on NET EVENTFD (TUN || !TUN) (MACVTAP || !MACVTAP) + select VHOST select VHOST_RING ---help--- This kernel module can be loaded in host kernel to accelerate @@ -13,6 +14,7 @@ config VHOST_NET config VHOST_SCSI tristate VHOST_SCSI TCM fabric driver depends on TARGET_CORE EVENTFD m + select VHOST select VHOST_RING default n ---help--- @@ -24,3 +26,9 @@ config VHOST_RING ---help--- This option is selected by any driver which needs to access the host side of a virtio ring. + +config VHOST + tristate + ---help--- + This option is selected by any driver which needs to access + the core of vhost. 
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 654e9afb..e0441c3 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -1,7 +1,8 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o -vhost_net-y := vhost.o net.o +vhost_net-y := net.o obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o vhost_scsi-y := scsi.o obj-$(CONFIG_VHOST_RING) += vringh.o +obj-$(CONFIG_VHOST) += vhost.o diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 5179f7a..2dcb94a 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -49,7 +49,6 @@ #include linux/llist.h #include linux/bitmap.h -#include vhost.c #include vhost.h #define TCM_VHOST_VERSION v0.1 diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index de9441a..e406d5f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -25,6 +25,7 @@ #include linux/slab.h #include linux/kthread.h #include linux/cgroup.h +#include linux/module.h #include vhost.h @@ -66,6 +67,7 @@ void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) work-flushing = 0; work-queue_seq = work-done_seq = 0; } +EXPORT_SYMBOL_GPL(vhost_work_init); /* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, @@ -79,6 +81,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, vhost_work_init(poll-work, fn); } +EXPORT_SYMBOL_GPL(vhost_poll_init); /* Start polling a file. We add ourselves to file's wait queue. The caller must * keep a reference to a file until after vhost_poll_stop is called. */ @@ -101,6 +104,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file) return ret; } +EXPORT_SYMBOL_GPL(vhost_poll_start); /* Stop polling a file. After this function returns, it becomes safe to drop the * file reference. You must also flush afterwards. 
*/ @@ -111,6 +115,7 @@ void vhost_poll_stop(struct vhost_poll *poll) poll-wqh = NULL; } } +EXPORT_SYMBOL_GPL(vhost_poll_stop); static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, unsigned seq) @@ -123,7 +128,7 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, return left = 0; } -static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) +void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) { unsigned seq; int flushing; @@ -138,6 +143,7 @@ static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) spin_unlock_irq(dev-work_lock); BUG_ON(flushing 0); } +EXPORT_SYMBOL_GPL(vhost_work_flush); /* Flush any work that has been scheduled. When calling this, don't hold any * locks that are also used by the callback. */ @@ -145,6 +151,7 @@ void vhost_poll_flush(struct vhost_poll *poll) { vhost_work_flush(poll-dev, poll-work); } +EXPORT_SYMBOL_GPL(vhost_poll_flush); void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { @@ -158,11 +165,13 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) } spin_unlock_irqrestore(dev-work_lock, flags); }
Re: [PATCH v2 00/11] vhost cleanups
On Mon, May 06, 2013 at 04:38:18PM +0800, Asias He wrote: MST, This is on top of [PATCH 0/2] vhost-net fix ubuf. Okay, how about making EVENT_IDX work for virtio-scsi? I'm guessing it's some messup with feature negotiation, that's what all event-idx bugs came down to so far. Asias He (11): vhost: Remove vhost_enable_zcopy in vhost.h vhost: Move VHOST_NET_FEATURES to net.c vhost: Make vhost a separate module vhost: Remove comments for hdr in vhost.h vhost: Simplify dev-vqs[i] access vhost-net: Cleanup vhost_ubuf and vhost_zcopy vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration vhost-scsi: Rename struct vhost_scsi *s to *vs vhost-scsi: Make func indention more consistent vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd drivers/vhost/Kconfig | 8 + drivers/vhost/Makefile | 3 +- drivers/vhost/net.c| 64 --- drivers/vhost/scsi.c | 470 ++--- drivers/vhost/vhost.c | 86 +++-- drivers/vhost/vhost.h | 11 +- 6 files changed, 361 insertions(+), 281 deletions(-) -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: virtio performance analysis
On Thu, May 02, 2013 at 09:13:29AM +0530, nitesh narayan lal wrote: Hi, I am currently working on the virtio performance analysis in PowerPC . That's nice. I was looking at the virtio front-end code and had done tracing by adding WARN_ON condition in skb_recv_done(),xmit_skb_done( ) and virtqueue_kick(). That's a bit heavy-handed. Why not just use ftrace? What I had seen is virtqueue_kick() internally it calls iowrite16() which will cause an exit to QEMU, You probably should be looking at vhost-net and not userspace virtio-net if you are interested in performance. now either I send a packets from Guest or receive a packets to guest sart_xmit() will be called and inside start_xmit there is a call to virtqueue_kick() causing Guest exit. Also for every packet or ack sent from Guest there is an exception received while sending or receiving the next packet/ack. Not exactly, we can buffer many events in case guest and host are running in parallel. See vring_need_event and its uses. Due to all of the above factors mentioned their will be an increase in signal, EXTINT and guest exits Increase as compared to what? and hence it will effect the CPU performance. This is what my analysis is so far, it would be great if I could get some help on this such that whether it seems appropriate or not ? Regards Nitesh Narayan Lal ___ Virtualization mailing list virtualizat...@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 06/11] vhost-net: Cleanup vhost_ubuf and vhost_zcopy
On Mon, May 06, 2013 at 04:38:24PM +0800, Asias He wrote: - Rename vhost_ubuf to vhost_net_ubuf - Rename vhost_zcopy_mask to vhost_net_zcopy_mask - Make funcs static Signed-off-by: Asias He as...@redhat.com OK this actually fixes a warning introduced by patch 1, so I'll pull this in too (don't like builds with warnings). Then your patch 1 can go in as is (some warnings during bisect builds this might trigger don't worry me). --- drivers/vhost/net.c | 58 +++-- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 06b2447..2b51e23 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -70,7 +70,7 @@ enum { VHOST_NET_VQ_MAX = 2, }; -struct vhost_ubuf_ref { +struct vhost_net_ubuf_ref { struct kref kref; wait_queue_head_t wait; struct vhost_virtqueue *vq; @@ -93,7 +93,7 @@ struct vhost_net_virtqueue { struct ubuf_info *ubuf_info; /* Reference counting for outstanding ubufs. * Protected by vq mutex. Writers must also take device mutex. 
*/ - struct vhost_ubuf_ref *ubufs; + struct vhost_net_ubuf_ref *ubufs; }; struct vhost_net { @@ -110,24 +110,25 @@ struct vhost_net { bool tx_flush; }; -static unsigned vhost_zcopy_mask __read_mostly; +static unsigned vhost_net_zcopy_mask __read_mostly; -void vhost_enable_zcopy(int vq) +static void vhost_net_enable_zcopy(int vq) { - vhost_zcopy_mask |= 0x1 vq; + vhost_net_zcopy_mask |= 0x1 vq; } -static void vhost_zerocopy_done_signal(struct kref *kref) +static void vhost_net_zerocopy_done_signal(struct kref *kref) { - struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref, - kref); + struct vhost_net_ubuf_ref *ubufs; + + ubufs = container_of(kref, struct vhost_net_ubuf_ref, kref); wake_up(ubufs-wait); } -struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq, - bool zcopy) +static struct vhost_net_ubuf_ref * +vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy) { - struct vhost_ubuf_ref *ubufs; + struct vhost_net_ubuf_ref *ubufs; /* No zero copy backend? Nothing to count. 
*/ if (!zcopy) return NULL; @@ -140,14 +141,14 @@ struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq, return ubufs; } -void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs) +static void vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs) { - kref_put(ubufs-kref, vhost_zerocopy_done_signal); + kref_put(ubufs-kref, vhost_net_zerocopy_done_signal); } -void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs) +static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs) { - kref_put(ubufs-kref, vhost_zerocopy_done_signal); + kref_put(ubufs-kref, vhost_net_zerocopy_done_signal); wait_event(ubufs-wait, !atomic_read(ubufs-kref.refcount)); kfree(ubufs); } @@ -159,7 +160,7 @@ static void vhost_net_clear_ubuf_info(struct vhost_net *n) int i; for (i = 0; i n-dev.nvqs; ++i) { - zcopy = vhost_zcopy_mask (0x1 i); + zcopy = vhost_net_zcopy_mask (0x1 i); if (zcopy) kfree(n-vqs[i].ubuf_info); } @@ -171,7 +172,7 @@ int vhost_net_set_ubuf_info(struct vhost_net *n) int i; for (i = 0; i n-dev.nvqs; ++i) { - zcopy = vhost_zcopy_mask (0x1 i); + zcopy = vhost_net_zcopy_mask (0x1 i); if (!zcopy) continue; n-vqs[i].ubuf_info = kmalloc(sizeof(*n-vqs[i].ubuf_info) * @@ -183,7 +184,7 @@ int vhost_net_set_ubuf_info(struct vhost_net *n) err: while (i--) { - zcopy = vhost_zcopy_mask (0x1 i); + zcopy = vhost_net_zcopy_mask (0x1 i); if (!zcopy) continue; kfree(n-vqs[i].ubuf_info); @@ -305,7 +306,7 @@ static int vhost_zerocopy_signal_used(struct vhost_net *net, static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) { - struct vhost_ubuf_ref *ubufs = ubuf-ctx; + struct vhost_net_ubuf_ref *ubufs = ubuf-ctx; struct vhost_virtqueue *vq = ubufs-vq; int cnt = atomic_read(ubufs-kref.refcount); @@ -322,7 +323,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) /* set len to mark this desc buffers done DMA */ vq-heads[ubuf-desc].len = success ? 
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; - vhost_ubuf_put(ubufs); + vhost_net_ubuf_put(ubufs); } /* Expects to be always run from workqueue - which acts as @@ -345,7 +346,7 @@ static void handle_tx(struct vhost_net *net) int err; size_t hdr_size;
[PATCH] vhost: drop virtio_net.h dependency
There's no net-specific code in vhost.c anymore, don't include the virtio_net.h header. Signed-off-by: Michael S. Tsirkin m...@redhat.com --- This is on top of Asias' patches, already queued so just FYI. drivers/vhost/vhost.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index de9441a..dcde269 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -13,7 +13,6 @@ #include <linux/eventfd.h> #include <linux/vhost.h> -#include <linux/virtio_net.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/miscdevice.h> -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v17 RESEND] pvpanic: pvpanic device driver
On Mon, May 06, 2013 at 11:39:35AM +0800, Hu Tao wrote: On Fri, May 03, 2013 at 06:59:18PM -0300, Marcelo Tosatti wrote: On Fri, May 03, 2013 at 10:47:10AM +0800, Hu Tao wrote: pvpanic device is a qemu simulated device through which guest panic event is sent to host. Signed-off-by: Hu Tao hu...@cn.fujitsu.com --- drivers/platform/x86/Kconfig | 7 +++ drivers/platform/x86/Makefile | 2 + drivers/platform/x86/pvpanic.c | 115 + 3 files changed, 124 insertions(+) create mode 100644 drivers/platform/x86/pvpanic.c diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 3338437..527ed04 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -781,4 +781,11 @@ config APPLE_GMUX graphics as well as the backlight. Currently only backlight control is supported by the driver. +config PVPANIC + tristate pvpanic device support + depends on ACPI + ---help--- + This driver provides support for pvpanic device, which is a qemu + simulated device through which guest panic event is sent to host. + endif # X86_PLATFORM_DEVICES diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index ace2b38..ef0ec74 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -51,3 +51,5 @@ obj-$(CONFIG_INTEL_OAKTRAIL)+= intel_oaktrail.o obj-$(CONFIG_SAMSUNG_Q10)+= samsung-q10.o obj-$(CONFIG_APPLE_GMUX) += apple-gmux.o obj-$(CONFIG_CHROMEOS_LAPTOP)+= chromeos_laptop.o + +obj-$(CONFIG_PVPANIC) += pvpanic.o diff --git a/drivers/platform/x86/pvpanic.c b/drivers/platform/x86/pvpanic.c new file mode 100644 index 000..81c95ec --- /dev/null +++ b/drivers/platform/x86/pvpanic.c @@ -0,0 +1,115 @@ +/* + * pvpanic.c - pvpanic Device Support + * + * Copyright (C) 2013 Fujitsu. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME : fmt + +#include linux/kernel.h +#include linux/module.h +#include linux/init.h +#include linux/types.h +#include acpi/acpi_bus.h +#include acpi/acpi_drivers.h + +MODULE_AUTHOR(Hu Tao hu...@cn.fujitsu.com); +MODULE_DESCRIPTION(pvpanic device driver); +MODULE_LICENSE(GPL); + +static int pvpanic_add(struct acpi_device *device); +static int pvpanic_remove(struct acpi_device *device); + +static const struct acpi_device_id pvpanic_device_ids[] = { + { QEMU0001, 0}, + { , 0}, +}; +MODULE_DEVICE_TABLE(acpi, pvpanic_device_ids); + +#define PVPANIC_PANICKED (1 0) + +static acpi_handle handle; + +static struct acpi_driver pvpanic_driver = { + .name = pvpanic, + .class =QEMU, + .ids = pvpanic_device_ids, + .ops = { + .add = pvpanic_add, + .remove = pvpanic_remove, + }, + .owner =THIS_MODULE, +}; + +static void +pvpanic_send_event(unsigned int event) +{ + union acpi_object arg; + struct acpi_object_list arg_list; + + if (!handle) + return; + + arg.type = ACPI_TYPE_INTEGER; + arg.integer.value = event; + + arg_list.count = 1; + arg_list.pointer = arg; + + acpi_evaluate_object(handle, WRPT, arg_list, NULL); +} Is it safe to call acpi_evaluate_object from a panic notifier? 
For example: - Has it been confirmed that no code invoked via acpi_evaluate_object can panic() ? Confirmed. - acpi_ex_enter_interpreter grabs a mutex. Is that path ever used? Unfortunately yes. As I can tell, there are 2 places in the path to grab a mutex: when searching the namespace for the method, and when executing the method. I didn't find a non-blocking version of acpi_evaluate_object. Then thats a bug because the pvpanic notifier is called with interrupts disabled. Suppose its safer to use an OUT instruction? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo
Re: [uq/master PATCH] kvmvapic: add ioport read accessor
On Sun, May 05, 2013 at 05:51:49PM -0300, Marcelo Tosatti wrote: Necessary since memory region accessor assumes read and write methods are registered. Otherwise reading I/O port 0x7e segfaults. https://bugzilla.redhat.com/show_bug.cgi?id=954306 Signed-off-by: Marcelo Tosatti mtosa...@redhat.com Applied, thanks. diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c index 5b558aa..655483b 100644 --- a/hw/i386/kvmvapic.c +++ b/hw/i386/kvmvapic.c @@ -687,8 +687,14 @@ static void vapic_write(void *opaque, hwaddr addr, uint64_t data, } } +static uint64_t vapic_read(void *opaque, hwaddr addr, unsigned size) +{ +return 0xffffffff; +} + static const MemoryRegionOps vapic_ops = { .write = vapic_write, +.read = vapic_read, .endianness = DEVICE_NATIVE_ENDIAN, }; -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 00/11] vhost cleanups
On Mon, May 06, 2013 at 01:07:46PM +0300, Michael S. Tsirkin wrote: On Mon, May 06, 2013 at 04:38:18PM +0800, Asias He wrote: MST, This is on top of [PATCH 0/2] vhost-net fix ubuf. Okay, how about making EVENT_IDX work for virtio-scsi? I'm guessing it's some messup with feature negotiation, that's what all event-idx bugs came down to so far. Yes, IIRC, EVENT_IDX works for vhost-scsi now. Will cook a patch to enable it. It should go 3.10, right? Asias He (11): vhost: Remove vhost_enable_zcopy in vhost.h vhost: Move VHOST_NET_FEATURES to net.c vhost: Make vhost a separate module vhost: Remove comments for hdr in vhost.h vhost: Simplify dev-vqs[i] access vhost-net: Cleanup vhost_ubuf and vhost_zcopy vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration vhost-scsi: Rename struct vhost_scsi *s to *vs vhost-scsi: Make func indention more consistent vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd drivers/vhost/Kconfig | 8 + drivers/vhost/Makefile | 3 +- drivers/vhost/net.c| 64 --- drivers/vhost/scsi.c | 470 ++--- drivers/vhost/vhost.c | 86 +++-- drivers/vhost/vhost.h | 11 +- 6 files changed, 361 insertions(+), 281 deletions(-) -- 1.8.1.4 -- Asias -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] vhost: drop virtio_net.h dependency
On Mon, May 06, 2013 at 01:37:34PM +0300, Michael S. Tsirkin wrote: There's no net-specific code in vhost.c anymore, don't include the virtio_net.h header. Signed-off-by: Michael S. Tsirkin m...@redhat.com Reviewed-by: Asias He as...@redhat.com --- This is on top of Asias' patches, already queued so just FYI. drivers/vhost/vhost.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index de9441a..dcde269 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -13,7 +13,6 @@ #include <linux/eventfd.h> #include <linux/vhost.h> -#include <linux/virtio_net.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/miscdevice.h> -- MST -- Asias -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 03/11] vhost: Make vhost a separate module
On Mon, May 06, 2013 at 01:03:42PM +0300, Michael S. Tsirkin wrote: On Mon, May 06, 2013 at 04:38:21PM +0800, Asias He wrote: Currently, vhost-net and vhost-scsi are sharing the vhost core code. However, vhost-scsi shares the code by including the vhost.c file directly. Making vhost a separate module makes it is easier to share code with other vhost devices. Signed-off-by: Asias He as...@redhat.com Also this will break test.c, right? Let's fix it in the same commit too. I will fix it up and remove the useless 'return'. --- drivers/vhost/Kconfig | 8 drivers/vhost/Makefile | 3 ++- drivers/vhost/scsi.c | 1 - drivers/vhost/vhost.c | 51 +- drivers/vhost/vhost.h | 2 ++ 5 files changed, 62 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 8b9226d..017a1e8 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -1,6 +1,7 @@ config VHOST_NET tristate Host kernel accelerator for virtio net depends on NET EVENTFD (TUN || !TUN) (MACVTAP || !MACVTAP) + select VHOST select VHOST_RING ---help--- This kernel module can be loaded in host kernel to accelerate @@ -13,6 +14,7 @@ config VHOST_NET config VHOST_SCSI tristate VHOST_SCSI TCM fabric driver depends on TARGET_CORE EVENTFD m + select VHOST select VHOST_RING default n ---help--- @@ -24,3 +26,9 @@ config VHOST_RING ---help--- This option is selected by any driver which needs to access the host side of a virtio ring. + +config VHOST + tristate + ---help--- + This option is selected by any driver which needs to access + the core of vhost. 
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index 654e9afb..e0441c3 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -1,7 +1,8 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o -vhost_net-y := vhost.o net.o +vhost_net-y := net.o obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o vhost_scsi-y := scsi.o obj-$(CONFIG_VHOST_RING) += vringh.o +obj-$(CONFIG_VHOST)+= vhost.o diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 5179f7a..2dcb94a 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -49,7 +49,6 @@ #include linux/llist.h #include linux/bitmap.h -#include vhost.c #include vhost.h #define TCM_VHOST_VERSION v0.1 diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index de9441a..e406d5f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -25,6 +25,7 @@ #include linux/slab.h #include linux/kthread.h #include linux/cgroup.h +#include linux/module.h #include vhost.h @@ -66,6 +67,7 @@ void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) work-flushing = 0; work-queue_seq = work-done_seq = 0; } +EXPORT_SYMBOL_GPL(vhost_work_init); /* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, @@ -79,6 +81,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, vhost_work_init(poll-work, fn); } +EXPORT_SYMBOL_GPL(vhost_poll_init); /* Start polling a file. We add ourselves to file's wait queue. The caller must * keep a reference to a file until after vhost_poll_stop is called. */ @@ -101,6 +104,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file) return ret; } +EXPORT_SYMBOL_GPL(vhost_poll_start); /* Stop polling a file. After this function returns, it becomes safe to drop the * file reference. You must also flush afterwards. 
*/ @@ -111,6 +115,7 @@ void vhost_poll_stop(struct vhost_poll *poll) poll-wqh = NULL; } } +EXPORT_SYMBOL_GPL(vhost_poll_stop); static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, unsigned seq) @@ -123,7 +128,7 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, return left = 0; } -static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) +void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) { unsigned seq; int flushing; @@ -138,6 +143,7 @@ static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) spin_unlock_irq(dev-work_lock); BUG_ON(flushing 0); } +EXPORT_SYMBOL_GPL(vhost_work_flush); /* Flush any work that has been scheduled. When calling this, don't hold any * locks that are also used by the callback. */ @@ -145,6 +151,7 @@ void vhost_poll_flush(struct vhost_poll *poll) { vhost_work_flush(poll-dev, poll-work); } +EXPORT_SYMBOL_GPL(vhost_poll_flush); void vhost_work_queue(struct vhost_dev *dev,
Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages
On Mon, May 06, 2013 at 11:39:11AM +0800, Xiao Guangrong wrote: On 05/04/2013 08:52 AM, Marcelo Tosatti wrote: On Sat, May 04, 2013 at 12:51:06AM +0800, Xiao Guangrong wrote: On 05/03/2013 11:53 PM, Marcelo Tosatti wrote: On Fri, May 03, 2013 at 01:52:07PM +0800, Xiao Guangrong wrote: On 05/03/2013 09:05 AM, Marcelo Tosatti wrote: + +/* + * Fast invalid all shadow pages belong to @slot. + * + * @slot != NULL means the invalidation is caused the memslot specified + * by @slot is being deleted, in this case, we should ensure that rmap + * and lpage-info of the @slot can not be used after calling the function. + * + * @slot == NULL means the invalidation due to other reasons, we need + * not care rmap and lpage-info since they are still valid after calling + * the function. + */ +void kvm_mmu_invalid_memslot_pages(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + spin_lock(kvm-mmu_lock); + kvm-arch.mmu_valid_gen++; + + /* + * All shadow paes are invalid, reset the large page info, + * then we can safely desotry the memslot, it is also good + * for large page used. + */ + kvm_clear_all_lpage_info(kvm); Xiao, I understood it was agreed that simple mmu_lock lockbreak while avoiding zapping of newly instantiated pages upon a if(spin_needbreak) cond_resched_lock() cycle was enough as a first step? And then later introduce root zapping along with measurements. https://lkml.org/lkml/2013/4/22/544 Yes, it is. See the changelog in 0/0: we use lock-break technique to zap all sptes linked on the invalid rmap, it is not very effective but good for the first step. Thanks! Sure, but what is up with zeroing kvm_clear_all_lpage_info(kvm) and zapping the root? Only lock-break technique along with generation number was what was agreed. Marcelo, Please Wait... I am completely confused. :( Let's clarify zeroing kvm_clear_all_lpage_info(kvm) and zapping the root first. Are these changes you wanted? 
void kvm_mmu_invalid_memslot_pages(struct kvm *kvm, struct kvm_memory_slot *slot) { spin_lock(kvm-mmu_lock); kvm-arch.mmu_valid_gen++; /* Zero all root pages.*/ restart: list_for_each_entry_safe(sp, node, kvm-arch.active_mmu_pages, link) { if (!sp-root_count) continue; if (kvm_mmu_prepare_zap_page(kvm, sp, invalid_list)) goto restart; } /* * All shadow paes are invalid, reset the large page info, * then we can safely desotry the memslot, it is also good * for large page used. */ kvm_clear_all_lpage_info(kvm); kvm_mmu_commit_zap_page(kvm, invalid_list); spin_unlock(kvm-mmu_lock); } static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_mmu_page *sp; gfn_t gfn; unsigned long *rmapp; sp = page_header(__pa(spte)); + + /* Let invalid sp do not access its rmap. */ + if (!sp_is_valid(sp)) + return; + gfn = kvm_mmu_page_get_gfn(sp, spte - sp-spt); rmapp = gfn_to_rmap(kvm, gfn, sp-role.level); pte_list_remove(spte, rmapp); } If yes, there is the reason why we can not do this that i mentioned before: after call kvm_mmu_invalid_memslot_pages(), the memslot-rmap will be destroyed. Later, if host reclaim page, the mmu-notify handlers, -invalidate_page and -invalidate_range_start, can not find any spte using the host page, then Accessed/Dirty for host page is missing tracked. (missing call kvm_set_pfn_accessed and kvm_set_pfn_dirty properly.) What's your idea? Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all releases mmu_lock and reacquires it again, only shadow pages from the generation with which kvm_mmu_zap_all started are zapped (this guarantees forward progress and eventual termination). 
kvm_mmu_zap_generation() spin_lock(mmu_lock) int generation = kvm-arch.mmu_generation; for_each_shadow_page(sp) { if (sp-generation == kvm-arch.mmu_generation) zap_page(sp) if (spin_needbreak(mmu_lock)) { kvm-arch.mmu_generation++; cond_resched_lock(mmu_lock); } } kvm_mmu_zap_all() spin_lock(mmu_lock) for_each_shadow_page(sp) { if (spin_needbreak(mmu_lock)) { cond_resched_lock(mmu_lock); } } Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot. Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm. This addresses the main problem: excessively long hold times of kvm_mmu_zap_all with very large
[PATCH 0/1] [PULL] qemu-kvm.git uq/master queue
Anthony please pull if it is not too late for 1.5. The following changes since commit 467b34689d277fa56c09ad07ca0f08d7d7539f6d: Update OpenBIOS images (2013-05-05 09:53:22 +0000) are available in the git repository at: git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git uq/master for you to fetch changes up to 0c1cd0ae2a4faabeb948b9a07ea1696e853de174: kvmvapic: add ioport read accessor (2013-05-06 14:52:26 +0300) Marcelo Tosatti (1): kvmvapic: add ioport read accessor hw/i386/kvmvapic.c |6 ++ 1 file changed, 6 insertions(+) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/1] kvmvapic: add ioport read accessor
From: Marcelo Tosatti mtosa...@redhat.com Necessary since memory region accessor assumes read and write methods are registered. Otherwise reading I/O port 0x7e segfaults. https://bugzilla.redhat.com/show_bug.cgi?id=954306 Signed-off-by: Marcelo Tosatti mtosa...@redhat.com Reviewed-by: Jan Kiszka jan.kis...@siemens.com Signed-off-by: Gleb Natapov g...@redhat.com --- hw/i386/kvmvapic.c |6 ++ 1 file changed, 6 insertions(+) diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c index 5b558aa..655483b 100644 --- a/hw/i386/kvmvapic.c +++ b/hw/i386/kvmvapic.c @@ -687,8 +687,14 @@ static void vapic_write(void *opaque, hwaddr addr, uint64_t data, } } +static uint64_t vapic_read(void *opaque, hwaddr addr, unsigned size) +{ +return 0xffffffff; +} + static const MemoryRegionOps vapic_ops = { .write = vapic_write, +.read = vapic_read, .endianness = DEVICE_NATIVE_ENDIAN, }; -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages
On 05/06/2013 08:36 PM, Gleb Natapov wrote: Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all releases mmu_lock and reacquires it again, only shadow pages from the generation with which kvm_mmu_zap_all started are zapped (this guarantees forward progress and eventual termination). kvm_mmu_zap_generation() spin_lock(mmu_lock) int generation = kvm-arch.mmu_generation; for_each_shadow_page(sp) { if (sp-generation == kvm-arch.mmu_generation) zap_page(sp) if (spin_needbreak(mmu_lock)) { kvm-arch.mmu_generation++; cond_resched_lock(mmu_lock); } } kvm_mmu_zap_all() spin_lock(mmu_lock) for_each_shadow_page(sp) { if (spin_needbreak(mmu_lock)) { cond_resched_lock(mmu_lock); } } Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot. Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm. This addresses the main problem: excessively long hold times of kvm_mmu_zap_all with very large guests. Do you see any problem with this logic? This was what i was thinking we agreed. No. I understand it and it can work. Actually, it is similar with Gleb's idea that zapping stale shadow pages (and uses lock break technique), after some discussion, we thought only zap shadow pages that are reachable from the slot's rmap is better, that is this patchset does. (https://lkml.org/lkml/2013/4/23/73) But this is not what the patch is doing. Close, but not the same :) Okay. :) Instead of zapping shadow pages reachable from slot's rmap the patch does kvm_unmap_rmapp() which drop all spte without zapping shadow pages. That is why you need special code to re-init lpage_info. What I proposed was to call zap_page() on all shadow pages reachable from rmap. This will take care of lpage_info counters. Does this make sense? Unfortunately, no! We still need to care lpage_info. lpage_info is used to count the number of guest page tables in the memslot. 
For example, there is a memslot: memslot[0].based_gfn = 0, memslot[0].npages = 100, and there is a shadow page: sp-role.direct =0, sp-role.level = 4, sp-gfn = 10. this sp is counted in the memslot[0] but it can not be found by walking memslot[0]-rmap since there is no last mapping in this shadow page. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 00/11] vhost cleanups
On Mon, May 06, 2013 at 08:05:26PM +0800, Asias He wrote: On Mon, May 06, 2013 at 01:07:46PM +0300, Michael S. Tsirkin wrote: On Mon, May 06, 2013 at 04:38:18PM +0800, Asias He wrote: MST, This is on top of [PATCH 0/2] vhost-net fix ubuf. Okay, how about making EVENT_IDX work for virtio-scsi? I'm guessing it's some messup with feature negotiation, that's what all event-idx bugs came down to so far. Yes, IIRC, EVENT_IDX works for vhost-scsi now. Will cook a patch to enable it. It should go 3.10, right? If it's early in the cycle, I think it can. Asias He (11): vhost: Remove vhost_enable_zcopy in vhost.h vhost: Move VHOST_NET_FEATURES to net.c vhost: Make vhost a separate module vhost: Remove comments for hdr in vhost.h vhost: Simplify dev-vqs[i] access vhost-net: Cleanup vhost_ubuf and vhost_zcopy vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration vhost-scsi: Rename struct vhost_scsi *s to *vs vhost-scsi: Make func indention more consistent vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd drivers/vhost/Kconfig | 8 + drivers/vhost/Makefile | 3 +- drivers/vhost/net.c| 64 --- drivers/vhost/scsi.c | 470 ++--- drivers/vhost/vhost.c | 86 +++-- drivers/vhost/vhost.h | 11 +- 6 files changed, 361 insertions(+), 281 deletions(-) -- 1.8.1.4 -- Asias -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v17 RESEND] pvpanic: pvpanic device driver
Il 06/05/2013 13:38, Marcelo Tosatti ha scritto: On Mon, May 06, 2013 at 11:39:35AM +0800, Hu Tao wrote: On Fri, May 03, 2013 at 06:59:18PM -0300, Marcelo Tosatti wrote: On Fri, May 03, 2013 at 10:47:10AM +0800, Hu Tao wrote: pvpanic device is a qemu simulated device through which guest panic event is sent to host. Signed-off-by: Hu Tao hu...@cn.fujitsu.com --- drivers/platform/x86/Kconfig | 7 +++ drivers/platform/x86/Makefile | 2 + drivers/platform/x86/pvpanic.c | 115 + 3 files changed, 124 insertions(+) create mode 100644 drivers/platform/x86/pvpanic.c diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 3338437..527ed04 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -781,4 +781,11 @@ config APPLE_GMUX graphics as well as the backlight. Currently only backlight control is supported by the driver. +config PVPANIC + tristate pvpanic device support + depends on ACPI + ---help--- +This driver provides support for pvpanic device, which is a qemu +simulated device through which guest panic event is sent to host. + endif # X86_PLATFORM_DEVICES diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index ace2b38..ef0ec74 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -51,3 +51,5 @@ obj-$(CONFIG_INTEL_OAKTRAIL) += intel_oaktrail.o obj-$(CONFIG_SAMSUNG_Q10) += samsung-q10.o obj-$(CONFIG_APPLE_GMUX) += apple-gmux.o obj-$(CONFIG_CHROMEOS_LAPTOP) += chromeos_laptop.o + +obj-$(CONFIG_PVPANIC) += pvpanic.o diff --git a/drivers/platform/x86/pvpanic.c b/drivers/platform/x86/pvpanic.c new file mode 100644 index 000..81c95ec --- /dev/null +++ b/drivers/platform/x86/pvpanic.c @@ -0,0 +1,115 @@ +/* + * pvpanic.c - pvpanic Device Support + * + * Copyright (C) 2013 Fujitsu. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME : fmt + +#include linux/kernel.h +#include linux/module.h +#include linux/init.h +#include linux/types.h +#include acpi/acpi_bus.h +#include acpi/acpi_drivers.h + +MODULE_AUTHOR(Hu Tao hu...@cn.fujitsu.com); +MODULE_DESCRIPTION(pvpanic device driver); +MODULE_LICENSE(GPL); + +static int pvpanic_add(struct acpi_device *device); +static int pvpanic_remove(struct acpi_device *device); + +static const struct acpi_device_id pvpanic_device_ids[] = { + { QEMU0001, 0}, + { , 0}, +}; +MODULE_DEVICE_TABLE(acpi, pvpanic_device_ids); + +#define PVPANIC_PANICKED (1 0) + +static acpi_handle handle; + +static struct acpi_driver pvpanic_driver = { + .name = pvpanic, + .class =QEMU, + .ids = pvpanic_device_ids, + .ops = { + .add = pvpanic_add, + .remove = pvpanic_remove, + }, + .owner =THIS_MODULE, +}; + +static void +pvpanic_send_event(unsigned int event) +{ + union acpi_object arg; + struct acpi_object_list arg_list; + + if (!handle) + return; + + arg.type = ACPI_TYPE_INTEGER; + arg.integer.value = event; + + arg_list.count = 1; + arg_list.pointer = arg; + + acpi_evaluate_object(handle, WRPT, arg_list, NULL); +} Is it safe to call acpi_evaluate_object from a panic notifier? 
For example: - Has it been confirmed that no code invoked via acpi_evaluate_object can panic() ? Confirmed. - acpi_ex_enter_interpreter grabs a mutex. Is that path ever used? Unfortunately yes. As I can tell, there are 2 places in the path to grab a mutex: when searching the namespace for the method, and when executing the method. I didn't find a non-blocking version of acpi_evaluate_object. Then thats a bug because the pvpanic notifier is called with interrupts disabled. Suppose its safer to use an OUT instruction? Yeah, it probably is... You can still use acpi_walk_resources to find the port number instead of hard-coding 0x505... Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at
Re: [PATCH v2 00/11] vhost cleanups
On Mon, May 06, 2013 at 04:15:35PM +0300, Michael S. Tsirkin wrote: On Mon, May 06, 2013 at 08:05:26PM +0800, Asias He wrote: On Mon, May 06, 2013 at 01:07:46PM +0300, Michael S. Tsirkin wrote: On Mon, May 06, 2013 at 04:38:18PM +0800, Asias He wrote: MST, This is on top of [PATCH 0/2] vhost-net fix ubuf. Okay, how about making EVENT_IDX work for virtio-scsi? I'm guessing it's some messup with feature negotiation, that's what all event-idx bugs came down to so far. Yes, IIRC, EVENT_IDX works for vhost-scsi now. Will cook a patch to enable it. It should go 3.10, right? If it's early in the cycle, I think it can. Well, let's queue it for 3.11. Asias He (11): vhost: Remove vhost_enable_zcopy in vhost.h vhost: Move VHOST_NET_FEATURES to net.c vhost: Make vhost a separate module vhost: Remove comments for hdr in vhost.h vhost: Simplify dev-vqs[i] access vhost-net: Cleanup vhost_ubuf and vhost_zcopy vhost-scsi: Remove unnecessary forward struct vhost_scsi declaration vhost-scsi: Rename struct vhost_scsi *s to *vs vhost-scsi: Make func indention more consistent vhost-scsi: Rename struct tcm_vhost_tpg *tv_tpg to *tpg vhost-scsi: Rename struct tcm_vhost_cmd *tv_cmd to *cmd drivers/vhost/Kconfig | 8 + drivers/vhost/Makefile | 3 +- drivers/vhost/net.c| 64 --- drivers/vhost/scsi.c | 470 ++--- drivers/vhost/vhost.c | 86 +++-- drivers/vhost/vhost.h | 11 +- 6 files changed, 361 insertions(+), 281 deletions(-) -- 1.8.1.4 -- Asias -- Asias -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] vhost: drop virtio_net.h dependency
On Mon, May 06, 2013 at 01:37:34PM +0300, Michael S. Tsirkin wrote: There's no net specific code in vhost.c anymore, don't include the virtio_net.h header. Did you push it to your tree? I am not seeing it. Signed-off-by: Michael S. Tsirkin m...@redhat.com --- This is on top of Asias' patches, already queued so just FYI. drivers/vhost/vhost.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index de9441a..dcde269 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -13,7 +13,6 @@ #include linux/eventfd.h #include linux/vhost.h -#include linux/virtio_net.h #include linux/mm.h #include linux/mmu_context.h #include linux/miscdevice.h -- MST -- Asias -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [v1][KVM][PATCH 1/1] kvm:ppc: enable doorbell exception with E500MC
On 05/06/2013 04:53 AM, Tiejun Chen wrote: Actually E500MC also support doorbell exception, and CONFIG_PPC_E500MC can cover BOOK3E/BOOK3E_64 as well. Signed-off-by: Tiejun Chentiejun.c...@windriver.com --- arch/powerpc/kvm/booke.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 1020119..dc1f590 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -795,7 +795,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, kvmppc_fill_pt_regs(regs); timer_interrupt(regs); break; -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64) +#if defined(CONFIG_PPC_E500MC) I suppose you mean CONFIG_KVM_E500MC here? Why didn't this work for you before? The ifdef above should cover the same range of CPUs. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
KVM call agenda for 2013-05-07
Hi Please send in any agenda topics you are interested in. Later, Juan. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages
On Mon, May 06, 2013 at 09:10:11PM +0800, Xiao Guangrong wrote: On 05/06/2013 08:36 PM, Gleb Natapov wrote: Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all releases mmu_lock and reacquires it again, only shadow pages from the generation with which kvm_mmu_zap_all started are zapped (this guarantees forward progress and eventual termination). kvm_mmu_zap_generation() spin_lock(mmu_lock) int generation = kvm-arch.mmu_generation; for_each_shadow_page(sp) { if (sp-generation == kvm-arch.mmu_generation) zap_page(sp) if (spin_needbreak(mmu_lock)) { kvm-arch.mmu_generation++; cond_resched_lock(mmu_lock); } } kvm_mmu_zap_all() spin_lock(mmu_lock) for_each_shadow_page(sp) { if (spin_needbreak(mmu_lock)) { cond_resched_lock(mmu_lock); } } Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot. Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm. This addresses the main problem: excessively long hold times of kvm_mmu_zap_all with very large guests. Do you see any problem with this logic? This was what i was thinking we agreed. No. I understand it and it can work. Actually, it is similar with Gleb's idea that zapping stale shadow pages (and uses lock break technique), after some discussion, we thought only zap shadow pages that are reachable from the slot's rmap is better, that is this patchset does. (https://lkml.org/lkml/2013/4/23/73) But this is not what the patch is doing. Close, but not the same :) Okay. :) Instead of zapping shadow pages reachable from slot's rmap the patch does kvm_unmap_rmapp() which drop all spte without zapping shadow pages. That is why you need special code to re-init lpage_info. What I proposed was to call zap_page() on all shadow pages reachable from rmap. This will take care of lpage_info counters. Does this make sense? Unfortunately, no! We still need to care lpage_info. 
lpage_info is used to count the number of guest page tables in the memslot. For example, there is a memslot: memslot[0].based_gfn = 0, memslot[0].npages = 100, and there is a shadow page: sp-role.direct =0, sp-role.level = 4, sp-gfn = 10. this sp is counted in the memslot[0] but it can not be found by walking memslot[0]-rmap since there is no last mapping in this shadow page. Right, so what about walking mmu_page_hash for each gfn belonging to the slot that is in process to be removed to find those? -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages
On 05/07/2013 01:24 AM, Gleb Natapov wrote: On Mon, May 06, 2013 at 09:10:11PM +0800, Xiao Guangrong wrote: On 05/06/2013 08:36 PM, Gleb Natapov wrote: Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all releases mmu_lock and reacquires it again, only shadow pages from the generation with which kvm_mmu_zap_all started are zapped (this guarantees forward progress and eventual termination). kvm_mmu_zap_generation() spin_lock(mmu_lock) int generation = kvm-arch.mmu_generation; for_each_shadow_page(sp) { if (sp-generation == kvm-arch.mmu_generation) zap_page(sp) if (spin_needbreak(mmu_lock)) { kvm-arch.mmu_generation++; cond_resched_lock(mmu_lock); } } kvm_mmu_zap_all() spin_lock(mmu_lock) for_each_shadow_page(sp) { if (spin_needbreak(mmu_lock)) { cond_resched_lock(mmu_lock); } } Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot. Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm. This addresses the main problem: excessively long hold times of kvm_mmu_zap_all with very large guests. Do you see any problem with this logic? This was what i was thinking we agreed. No. I understand it and it can work. Actually, it is similar with Gleb's idea that zapping stale shadow pages (and uses lock break technique), after some discussion, we thought only zap shadow pages that are reachable from the slot's rmap is better, that is this patchset does. (https://lkml.org/lkml/2013/4/23/73) But this is not what the patch is doing. Close, but not the same :) Okay. :) Instead of zapping shadow pages reachable from slot's rmap the patch does kvm_unmap_rmapp() which drop all spte without zapping shadow pages. That is why you need special code to re-init lpage_info. What I proposed was to call zap_page() on all shadow pages reachable from rmap. This will take care of lpage_info counters. Does this make sense? Unfortunately, no! We still need to care lpage_info. 
lpage_info is used to count the number of guest page tables in the memslot. For example, there is a memslot: memslot[0].based_gfn = 0, memslot[0].npages = 100, and there is a shadow page: sp-role.direct =0, sp-role.level = 4, sp-gfn = 10. this sp is counted in the memslot[0] but it can not be found by walking memslot[0]-rmap since there is no last mapping in this shadow page. Right, so what about walking mmu_page_hash for each gfn belonging to the slot that is in process to be removed to find those? That will cost lots of time. The size of hashtable is 1 << 10. If the memslot has 4M memory, it will walk all the entries, the cost is the same as walking active_list (maybe a little more). And a memslot has 4M memory is the normal case i think. Another point is that lpage_info stops mmu to use large page. If we do not reset lpage_info, mmu is using 4K page until the invalid-sp is zapped. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages
On Mon, May 06, 2013 at 11:39:11AM +0800, Xiao Guangrong wrote: On 05/04/2013 08:52 AM, Marcelo Tosatti wrote: On Sat, May 04, 2013 at 12:51:06AM +0800, Xiao Guangrong wrote: On 05/03/2013 11:53 PM, Marcelo Tosatti wrote: On Fri, May 03, 2013 at 01:52:07PM +0800, Xiao Guangrong wrote: On 05/03/2013 09:05 AM, Marcelo Tosatti wrote: + +/* + * Fast invalid all shadow pages belong to @slot. + * + * @slot != NULL means the invalidation is caused the memslot specified + * by @slot is being deleted, in this case, we should ensure that rmap + * and lpage-info of the @slot can not be used after calling the function. + * + * @slot == NULL means the invalidation due to other reasons, we need + * not care rmap and lpage-info since they are still valid after calling + * the function. + */ +void kvm_mmu_invalid_memslot_pages(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + spin_lock(kvm-mmu_lock); + kvm-arch.mmu_valid_gen++; + + /* + * All shadow paes are invalid, reset the large page info, + * then we can safely desotry the memslot, it is also good + * for large page used. + */ + kvm_clear_all_lpage_info(kvm); Xiao, I understood it was agreed that simple mmu_lock lockbreak while avoiding zapping of newly instantiated pages upon a if(spin_needbreak) cond_resched_lock() cycle was enough as a first step? And then later introduce root zapping along with measurements. https://lkml.org/lkml/2013/4/22/544 Yes, it is. See the changelog in 0/0: we use lock-break technique to zap all sptes linked on the invalid rmap, it is not very effective but good for the first step. Thanks! Sure, but what is up with zeroing kvm_clear_all_lpage_info(kvm) and zapping the root? Only lock-break technique along with generation number was what was agreed. Marcelo, Please Wait... I am completely confused. :( Let's clarify zeroing kvm_clear_all_lpage_info(kvm) and zapping the root first. Are these changes you wanted? 
void kvm_mmu_invalid_memslot_pages(struct kvm *kvm, struct kvm_memory_slot *slot) { spin_lock(kvm-mmu_lock); kvm-arch.mmu_valid_gen++; /* Zero all root pages.*/ restart: list_for_each_entry_safe(sp, node, kvm-arch.active_mmu_pages, link) { if (!sp-root_count) continue; if (kvm_mmu_prepare_zap_page(kvm, sp, invalid_list)) goto restart; } /* * All shadow paes are invalid, reset the large page info, * then we can safely desotry the memslot, it is also good * for large page used. */ kvm_clear_all_lpage_info(kvm); kvm_mmu_commit_zap_page(kvm, invalid_list); spin_unlock(kvm-mmu_lock); } static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_mmu_page *sp; gfn_t gfn; unsigned long *rmapp; sp = page_header(__pa(spte)); + + /* Let invalid sp do not access its rmap. */ + if (!sp_is_valid(sp)) + return; + gfn = kvm_mmu_page_get_gfn(sp, spte - sp-spt); rmapp = gfn_to_rmap(kvm, gfn, sp-role.level); pte_list_remove(spte, rmapp); } If yes, there is the reason why we can not do this that i mentioned before: after call kvm_mmu_invalid_memslot_pages(), the memslot-rmap will be destroyed. Later, if host reclaim page, the mmu-notify handlers, -invalidate_page and -invalidate_range_start, can not find any spte using the host page, then Accessed/Dirty for host page is missing tracked. (missing call kvm_set_pfn_accessed and kvm_set_pfn_dirty properly.) What's your idea? Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all releases mmu_lock and reacquires it again, only shadow pages from the generation with which kvm_mmu_zap_all started are zapped (this guarantees forward progress and eventual termination). 
kvm_mmu_zap_generation() spin_lock(mmu_lock) int generation = kvm-arch.mmu_generation; for_each_shadow_page(sp) { if (sp-generation == kvm-arch.mmu_generation) zap_page(sp) if (spin_needbreak(mmu_lock)) { kvm-arch.mmu_generation++; cond_resched_lock(mmu_lock); } } kvm_mmu_zap_all() spin_lock(mmu_lock) for_each_shadow_page(sp) { if (spin_needbreak(mmu_lock)) { cond_resched_lock(mmu_lock); } } Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot. Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm. This addresses the main problem: excessively long hold times of kvm_mmu_zap_all with very large
Re: [RFC PATCH 04/11] kvm tools: console: unconditionally output to any console
On 05/03/2013 12:09 PM, Will Deacon wrote: On Fri, May 03, 2013 at 05:02:14PM +0100, Sasha Levin wrote: On 05/03/2013 05:19 AM, Pekka Enberg wrote: On Wed, May 1, 2013 at 6:50 PM, Will Deacon will.dea...@arm.com wrote: From: Marc Zyngier marc.zyng...@arm.com Kvmtool suppresses any output to a console that has not been elected as *the* console. While this makes sense on the input side (we want the input to be sent to one console driver only), it seems to be the wrong thing to do on the output side, as it effectively prevents the guest from switching from one console to another (think earlyprintk using 8250 to virtio console). After all, the guest *does* poke this device and outputs something there. Just remove the kvm-cfg.active_console test from the output paths. Signed-off-by: Marc Zyngier marc.zyng...@arm.com Signed-off-by: Will Deacon will.dea...@arm.com Seems reasonable. Asias, Sasha? I remember at trying it some time ago but dropped it for a reason I don't remember at the moment. Can I have the weekend to play with it to try and figure out why? There's no rush from my point of view (hence the RFC) so take as long as you need! Looks good to me! Thanks, Sasha -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/5] KVM: PPC: iommu: Add missing kvm_iommu_map_pages/kvm_iommu_unmap_pages
On Mon, 2013-05-06 at 17:21 +1000, a...@ozlabs.ru wrote: From: Alexey Kardashevskiy a...@ozlabs.ru The IOMMU API implements groups creating/deletion, device binding and IOMMU map/unmap operations. The PowerPC implementation uses most of the API except map/unmap operations, which are implemented on POWER using hypercalls. However, in order to link a kernel with the CONFIG_IOMMU_API enabled, the empty kvm_iommu_map_pages/kvm_iommu_unmap_pages have to be defined, so this defines them. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Paul Mackerras pau...@samba.org --- arch/powerpc/include/asm/kvm_host.h | 14 ++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index b6a047e..c025d91 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -603,4 +603,18 @@ struct kvm_vcpu_arch { #define __KVM_HAVE_ARCH_WQP +#ifdef CONFIG_IOMMU_API +/* POWERPC does not use IOMMU API for mapping/unmapping */ +static inline int kvm_iommu_map_pages(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + return 0; +} + +static inline void kvm_iommu_unmap_pages(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ +} +#endif /* CONFIG_IOMMU_API */ + #endif /* __POWERPC_KVM_HOST_H__ */ This is no longer needed, Gleb applied my patch for 3.10 that make all of KVM device assignment dependent on a build config option and the top level kvm_host.h now includes this when that is not set. Thanks, Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] mips/kvm: Fix ABI for compatibility with 64-bit guests.
From: David Daney david.da...@cavium.com There are several parts to this: o All registers are 64-bits wide, 32-bit guests use the least significant portion of the register storage fields. o FPU register formats are defined. o CP0 Registers are manipulated via the KVM_GET_MSRS/KVM_SET_MSRS mechanism. The vcpu_ioctl_get_regs and vcpu_ioctl_set_regs function pointers become unused so they were removed. Some IOCTL functions were moved to kvm_trap_emul because the implementations are only for that flavor of KVM host. In the future, if hardware based virtualization is added, they can be hidden behind function pointers as appropriate. Signed-off-by: David Daney david.da...@cavium.com --- arch/mips/include/asm/kvm.h | 106 ++--- arch/mips/include/asm/kvm_host.h | 6 +- arch/mips/kernel/asm-offsets.c | 64 arch/mips/kvm/kvm_mips.c | 124 +++ arch/mips/kvm/kvm_mips_emul.c| 108 ++--- arch/mips/kvm/kvm_trap_emul.c| 330 ++- 6 files changed, 480 insertions(+), 258 deletions(-) diff --git a/arch/mips/include/asm/kvm.h b/arch/mips/include/asm/kvm.h index 85789ea..83c44d8 100644 --- a/arch/mips/include/asm/kvm.h +++ b/arch/mips/include/asm/kvm.h @@ -1,55 +1,113 @@ /* -* This file is subject to the terms and conditions of the GNU General Public -* License. See the file COPYING in the main directory of this archive -* for more details. -* -* Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. -* Authors: Sanjay Lal sanj...@kymasys.com -*/ + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. + * + * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. + * Copyright (C) 2013 Cavium, Inc. + * Authors: Sanjay Lal sanj...@kymasys.com + */ #ifndef __LINUX_KVM_MIPS_H #define __LINUX_KVM_MIPS_H #include linux/types.h -#define __KVM_MIPS - -#define N_MIPS_COPROC_REGS 32 -#define N_MIPS_COPROC_SEL 8 +/* + * KVM MIPS specific structures and definitions. 
+ * + * Some parts derived from the x86 version of this file. + */ /* for KVM_GET_REGS and KVM_SET_REGS */ +/* + * If Config[AT] is zero (32-bit CPU), the register contents are + * stored in the lower 32-bits of the struct kvm_regs fields and sign + * extended to 64-bits. + */ struct kvm_regs { - __u32 gprs[32]; - __u32 hi; - __u32 lo; - __u32 pc; + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 gpr[32]; + __u64 hi, lo; + __u64 pc; +}; - __u32 cp0reg[N_MIPS_COPROC_REGS][N_MIPS_COPROC_SEL]; +/* for KVM_GET_FPU and KVM_SET_FPU */ +/* + * If Status[FR] is zero (32-bit FPU), the upper 32-bits of the FPRs + * are zero filled. + */ +struct kvm_fpu { + __u64 fpr[32]; + __u32 fir; + __u32 fccr; + __u32 fexr; + __u32 fenr; + __u32 fcsr; + __u32 pad; }; -/* for KVM_GET_SREGS and KVM_SET_SREGS */ -struct kvm_sregs { + +/* + * For MIPS, we use the same APIs as x86, where 'msr' corresponds to a + * CP0 register. The index field is broken down as follows: + * + * bits[2..0] - Register 'sel' index. + * bits[7..3] - Register 'rd' index. + * bits[15..8] - Must be zero. + * bits[31..16] - 0 - CP0 registers. + * + * Other sets registers may be added in the future. Each set would + * have its own identifier in bits[31..16]. + * + * For MSRs that are narrower than 64-bits, the value is stored in the + * low order bits of the data field, and sign extended to 64-bits. 
+ */ +#define KVM_MIPS_MSR_CP0 0 +struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; }; -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { +/* for KVM_GET_MSRS and KVM_SET_MSRS */ +struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; }; +/* for KVM_GET_MSR_INDEX_LIST */ +struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; +}; + +/* + * KVM MIPS specific structures and definitions + * + */ struct kvm_debug_exit_arch { + __u64 epc; }; /* for KVM_SET_GUEST_DEBUG */ struct kvm_guest_debug_arch { }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + +/* dummy definition */ +struct kvm_sregs { +}; + struct kvm_mips_interrupt { /* in */ __u32 cpu; __u32 irq; }; -/* definition of registers in kvm_run */ -struct kvm_sync_regs { -}; - #endif /* __LINUX_KVM_MIPS_H */ diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index e68781e..3a5b2c8 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -360,7 +360,7 @@ struct kvm_vcpu_arch { uint32_t guest_inst; /* GPRS */ - unsigned long gprs[32]; + unsigned long gpr[32];
Re: [PATCH] mips/kvm: Fix ABI for compatibility with 64-bit guests.
On May 6, 2013, at 3:39 PM, David Daney wrote: /* for KVM_GET_REGS and KVM_SET_REGS */ +/* + * If Config[AT] is zero (32-bit CPU), the register contents are + * stored in the lower 32-bits of the struct kvm_regs fields and sign + * extended to 64-bits. + */ struct kvm_regs { - __u32 gprs[32]; - __u32 hi; - __u32 lo; - __u32 pc; + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 gpr[32]; + __u64 hi, lo; + __u64 pc; +}; - __u32 cp0reg[N_MIPS_COPROC_REGS][N_MIPS_COPROC_SEL]; Hi David, I'll try out the diff with QEMU and confirm that it works as expected. Could you just leave the GPR field in kvm_regs as gprs. Its a minor change but avoids diffs that just replace gprs with gpr. Regards Sanjay -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] mips/kvm: Fix ABI for compatibility with 64-bit guests.
On 05/06/2013 04:11 PM, Sanjay Lal wrote: On May 6, 2013, at 3:39 PM, David Daney wrote: /* for KVM_GET_REGS and KVM_SET_REGS */ +/* + * If Config[AT] is zero (32-bit CPU), the register contents are + * stored in the lower 32-bits of the struct kvm_regs fields and sign + * extended to 64-bits. + */ struct kvm_regs { - __u32 gprs[32]; - __u32 hi; - __u32 lo; - __u32 pc; + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 gpr[32]; + __u64 hi, lo; + __u64 pc; +}; - __u32 cp0reg[N_MIPS_COPROC_REGS][N_MIPS_COPROC_SEL]; Hi David, I'll try out the diff with QEMU and confirm that it works as expected. Could you just leave the GPR field in kvm_regs as gprs. Its a minor change but avoids diffs that just replace gprs with gpr. Well, there were two changes with respect to 'gprs' vs. 'gpr'. The change you show above only results in a small handful of diff lines. My argument for the change is that it will be part of a public ABI, and should be short and concise, so I changed it to 'gpr'. I also changed the field with the same name in struct kvm_vcpu_arch to match, which causes the changes in asm-offsets.c and quite a few other places as well. One could argue that this one was gratuitous, but I thought it would be nice for them to match. Since it is an internal implementation detail, it is not that important, so I could revert this part if there are strong objections. David Daney -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm/ppc/booke64: Hard disable interrupts when entering the guest
On 05/05/2013 04:03:08 PM, Benjamin Herrenschmidt wrote: On Fri, 2013-05-03 at 18:45 -0500, Scott Wood wrote: kvmppc_lazy_ee_enable() was causing interrupts to be soft-enabled (albeit hard-disabled) in kvmppc_restart_interrupt(). This led to warnings, and possibly breakage if the interrupt state was later saved and then restored (leading to interrupts being hard-and-soft enabled when they should be at least soft-disabled). Simply removing kvmppc_lazy_ee_enable() leaves interrupts only soft-disabled when we enter the guest, but they will be hard-disabled when we exit the guest -- without PACA_IRQ_HARD_DIS ever being set, so the local_irq_enable() fails to hard-enable. While we could just set PACA_IRQ_HARD_DIS after an exit to compensate, instead hard-disable interrupts before entering the guest. This way, we won't have to worry about interactions if we take an interrupt during the guest entry code. While I don't see any obvious interactions, it could change in the future (e.g. it would be bad if the non-hv code were used on 64-bit or if 32-bit guest lazy interrupt disabling, since the non-hv code changes IVPR among other things). Shouldn't the interrupts be marked soft-enabled (even if hard disabled) when entering the guest ? Ie. The last stage of entry will hard enable, so they should be soft-enabled too... if not, latency trackers will consider the whole guest periods as interrupt disabled... OK... I guess we already have that problem on 32-bit as well? Now, kvmppc_lazy_ee_enable() seems to be clearly bogus to me. It will unconditionally set soft_enabled and clear irq_happened from a soft-disabled state, thus potentially losing a pending event. Book3S HV seems to be keeping interrupts fully enabled all the way until the asm hard disables, which would be fine except that I'm worried we are racy vs. need_resched signals. One thing you may be able to do is call prep_irq_for_idle(). 
This will tell you if something happened, giving you a chance to abort/re-enable before you go the guest. As long as we go straight from IRQs fully enabled to hard-disabled, before we check for signals and such, I don't think we need that (and using it would raise the question of what to do on 32-bit). What if we just take this patch, and add trace_hardirqs_on() just before entering the guest? This would be similar to what the 32-bit non-KVM exception return code does (except it would be in C code). Perhaps we could set soft_enabled as well, but then we'd have to clear it again before calling kvmppc_restart_interrupt() -- since the KVM exception handlers don't actually care about soft_enabled (it would just be for consistency), I'd rather just leave soft_enabled off. We also don't want PACA_IRQ_HARD_DIS to be cleared the way prep_irq_for_idle() does, because that's what lets the local_irq_enable() do the hard-enabling after we exit the guest. -Scott -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm/ppc/booke64: Hard disable interrupts when entering the guest
On Mon, 2013-05-06 at 18:53 -0500, Scott Wood wrote: Ie. The last stage of entry will hard enable, so they should be soft-enabled too... if not, latency trackers will consider the whole guest periods as interrupt disabled... OK... I guess we already have that problem on 32-bit as well? 32-bit doesn't do lazy disable, so the situation is a lot easier there. Now, kvmppc_lazy_ee_enable() seems to be clearly bogus to me. It will unconditionally set soft_enabled and clear irq_happened from a soft-disabled state, thus potentially losing a pending event. Book3S HV seems to be keeping interrupts fully enabled all the way until the asm hard disables, which would be fine except that I'm worried we are racy vs. need_resched signals. One thing you may be able to do is call prep_irq_for_idle(). This will tell you if something happened, giving you a chance to abort/re-enable before you go the guest. As long as we go straight from IRQs fully enabled to hard-disabled, before we check for signals and such, I don't think we need that (and using it would raise the question of what to do on 32-bit). Except that you have to mark them as soft enabled before you enter the guest with interrupts on... But yes, I see your point. If interrupts are fully enabled and you call hard_irq_disable(), there should be no chance for anything to mess around with irq_happened. However if you set soft-enabled later on before the rfid that returns to the guest and sets EE, you *must* also clear PACA_IRQ_HARD_DIS in irq_happened. If you get that out of sync bad things will happen later on... To be sure all is well, you might want to WARN_ON(get_paca()-irq_happened == PACA_IRQ_HARD_DIS); (with a comment explaining why so). Another problem is that hard_irq_disable() doesn't call trace_hardirqs_off()... 
We might want to fix that: static inline void hard_irq_disable(void) { __hard_irq_disable(); if (get_paca()-soft_enabled) trace_hardirqs_off(); get_paca()-soft_enabled = 0; get_paca()-irq_happened |= PACA_IRQ_HARD_DIS; } What if we just take this patch, and add trace_hardirqs_on() just before entering the guest? You still want to set soft_enabled I'd say ... though I can see how you may get away without it as long as you call trace_hardirqs_off() right on the way back from the guest, but beware some lockdep bits will choke if they ever spot the discrepancy between the traced irq state and soft_enabled. I'd recommend you just keep it in sync. This would be similar to what the 32-bit non-KVM exception return code does (except it would be in C code). Perhaps we could set soft_enabled as well, but then we'd have to clear it again before calling kvmppc_restart_interrupt() -- since the KVM exception handlers don't actually care about soft_enabled (it would just be for consistency), I'd rather just leave soft_enabled off. We also don't want PACA_IRQ_HARD_DIS to be cleared the way prep_irq_for_idle() does, because that's what lets the local_irq_enable() do the hard-enabling after we exit the guest. Then set it again. Don't leave the kernel in a state where soft_enabled is 1 and irq_happened is non-zero. It might work in the specific KVM case we are looking at now because we know we are coming back via KVM exit and putting things right again but it's fragile, somebody will come back and break it, etc... If necessary, create (or improve existing) helpers that do the right state adjustement. The cost of a couple of byte stores is negligible, I'd rather you make sure everything remains in sync at all times. Cheers, Ben. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/5] KVM: PPC: iommu: Add missing kvm_iommu_map_pages/kvm_iommu_unmap_pages
On 05/07/2013 07:07 AM, Alex Williamson wrote: On Mon, 2013-05-06 at 17:21 +1000, a...@ozlabs.ru wrote: From: Alexey Kardashevskiy a...@ozlabs.ru The IOMMU API implements groups creating/deletion, device binding and IOMMU map/unmap operations. The PowerPC implementation uses most of the API except map/unmap operations, which are implemented on POWER using hypercalls. However, in order to link a kernel with the CONFIG_IOMMU_API enabled, the empty kvm_iommu_map_pages/kvm_iommu_unmap_pages have to be defined, so this defines them. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Paul Mackerras pau...@samba.org --- arch/powerpc/include/asm/kvm_host.h | 14 ++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index b6a047e..c025d91 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -603,4 +603,18 @@ struct kvm_vcpu_arch { #define __KVM_HAVE_ARCH_WQP +#ifdef CONFIG_IOMMU_API +/* POWERPC does not use IOMMU API for mapping/unmapping */ +static inline int kvm_iommu_map_pages(struct kvm *kvm, +struct kvm_memory_slot *slot) +{ +return 0; +} + +static inline void kvm_iommu_unmap_pages(struct kvm *kvm, +struct kvm_memory_slot *slot) +{ +} +#endif /* CONFIG_IOMMU_API */ + #endif /* __POWERPC_KVM_HOST_H__ */ This is no longer needed, Gleb applied my patch for 3.10 that make all of KVM device assignment dependent on a build config option and the top level kvm_host.h now includes this when that is not set. Thanks, Cannot find it, could you point me please where it is on github or git.kernel.org? Thanks. -- Alexey -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [v1][KVM][PATCH 1/1] kvm:ppc: enable doorbell exception with E500MC
On 05/06/2013 10:58 PM, Alexander Graf wrote: On 05/06/2013 04:53 AM, Tiejun Chen wrote: Actually E500MC also supports the doorbell exception, and CONFIG_PPC_E500MC can cover BOOK3E/BOOK3E_64 as well. Signed-off-by: Tiejun Chen tiejun.c...@windriver.com --- arch/powerpc/kvm/booke.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 1020119..dc1f590 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -795,7 +795,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, kvmppc_fill_pt_regs(regs); timer_interrupt(regs); break; -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64) +#if defined(CONFIG_PPC_E500MC) I suppose you mean CONFIG_KVM_E500MC here? Why didn't this work for you before? This works for me. Here I just mean that currently CONFIG_PPC_E500MC is always selected no matter which of CONFIG_PPC_FSL_BOOK3E or CONFIG_PPC_BOOK3E_64 is enabled. And especially, this is already in the arch/powerpc/kvm/booke.c file, so I think one #ifdef (CONFIG_PPC_E500MC) is enough and also makes sense. The ifdef above should cover the same range of CPUs. Or furthermore, #ifdef CONFIG_PPC_DOORBELL is reasonable to cover this. Tiejun -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
irqfd issue
Hi, I am seeing this with linus/master. Any ideas? [ 34.168356] IPv6: ADDRCONF(NETDEV_UP): virbr0: link is not ready [ 36.743758] BUG: unable to handle kernel paging request at 00030029 [ 36.745177] IP: [81c08584] __mutex_lock_slowpath+0x34/0x240 [ 36.746576] PGD 0 [ 36.747962] Oops: [#1] SMP [ 36.749343] Modules linked in: ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat vhost_net vhost [ 36.750753] CPU: 0 PID: 4260 Comm: qemu-kvm Not tainted 3.9.0+ #752 [ 36.752130] Hardware name: Dell Inc. OptiPlex 790/0V5HMK, BIOS A11 12/30/2011 [ 36.753495] task: 88021fba8000 ti: 88021f0c4000 task.ti: 88021f0c4000 [ 36.754847] RIP: 0010:[81c08584] [81c08584] __mutex_lock_slowpath+0x34/0x240 [ 36.756228] RSP: 0018:88021f0c5c88 EFLAGS: 00010202 [ 36.757584] RAX: 0001 RBX: 880223ffb420 RCX: [ 36.758926] RDX: 00030001 RSI: 88021f0c5d60 RDI: 880223ffb420 [ 36.760268] RBP: 88021f0c5cf8 R08: 88021f0c R09: [ 36.761602] R10: 8802209c3f10 R11: R12: 880223ffb420 [ 36.762924] R13: 88022236c000 R14: 8802236213b0 R15: 880223ffb420 [ 36.764232] FS: () GS:88022dc0() knlGS: [ 36.765543] CS: 0010 DS: ES: CR0: 80050033 [ 36.766869] CR2: 00030029 CR3: 0240b000 CR4: 000427f0 [ 36.768220] DR0: DR1: DR2: [ 36.769565] DR3: DR6: 0ff0 DR7: 0400 [ 36.770889] Stack: [ 36.772200] 0092 88021fba8000 88021f0c5cb8 81c0b2c7 [ 36.773551] 8802210a4fc0 8802210a4fc0 88021f0c5d08 810c0f2c [ 36.774884] 000e 880223ffb420 88021f0c5d38 88022236c000 [ 36.776194] Call Trace: [ 36.777482] [81c0b2c7] ? _raw_spin_unlock_irqrestore+0x37/0x40 [ 36.778789] [810c0f2c] ? try_to_wake_up+0x1ec/0x290 [ 36.780107] [81c0852b] mutex_lock+0x2b/0x50 [ 36.781420] [810a9b6d] flush_workqueue+0x9d/0x560 [ 36.782729] [8100933f] kvm_irqfd_release+0x8f/0xa0 [ 36.784046] [8100456d] kvm_vm_release+0x1d/0x30 [ 36.785367] [811a732a] __fput+0xba/0x240 [ 36.786693] [811a751e] fput+0xe/0x10 [ 36.788007] [810af685] task_work_run+0xa5/0xe0 [ 36.789317] [81092cd7] do_exit+0x2d7/0xac0 [ 36.790622] [811a4a04] ? 
fsnotify_modify+0x64/0x80 [ 36.791896] [8140327a] ? trace_hardirqs_off_thunk+0x3a/0x6c [ 36.793141] [81093511] do_group_exit+0x51/0xc0 [ 36.794358] [81093597] SyS_exit_group+0x17/0x20 [ 36.795547] [81c13882] system_call_fastpath+0x16/0x1b [ 36.796731] Code: 55 41 54 53 48 83 ec 48 66 66 66 66 90 65 48 8b 04 25 00 b8 00 00 49 89 fc 48 89 45 98 48 8b 57 18 b8 01 00 00 00 48 85 d2 74 03 8b 42 28 85 c0 0f 84 e6 00 00 00 65 48 8b 04 25 08 b8 00 00 48 [ 36.798194] RIP [81c08584] __mutex_lock_slowpath+0x34/0x240 [ 36.799567] RSP 88021f0c5c88 [ 36.800943] CR2: 00030029 [ 36.813185] ---[ end trace 4877613defb9fc19 ]--- [ 36.813188] Fixing recursive fault but reboot is needed! [ 37.011566] usb 2-1.1: link qh8-0601/880223a9c600 start 3 [1/2 us] [ 70.539341] usb usb1: usb port1's DeviceRemovable is changed to 1 according to platform information. [ 70.539546] usb usb2: usb port1's DeviceRemovable is changed to 1 according to platform information. [ 70.862490] nr_pdflush_threads exported in /proc is scheduled for removal [ 70.862558] sysctl: The scan_unevictable_pages sysctl/node-interface has been disabled for lack of a legitimate use case. If you have one, please send an email to linux...@kvack.org. -- Asias -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/5] KVM: PPC: iommu: Add missing kvm_iommu_map_pages/kvm_iommu_unmap_pages
On Tue, 2013-05-07 at 10:49 +1000, Alexey Kardashevskiy wrote: On 05/07/2013 07:07 AM, Alex Williamson wrote: On Mon, 2013-05-06 at 17:21 +1000, a...@ozlabs.ru wrote: From: Alexey Kardashevskiy a...@ozlabs.ru The IOMMU API implements groups creating/deletion, device binding and IOMMU map/unmap operations. The PowerPC implementation uses most of the API except map/unmap operations, which are implemented on POWER using hypercalls. However, in order to link a kernel with the CONFIG_IOMMU_API enabled, the empty kvm_iommu_map_pages/kvm_iommu_unmap_pages have to be defined, so this defines them. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Paul Mackerras pau...@samba.org --- arch/powerpc/include/asm/kvm_host.h | 14 ++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index b6a047e..c025d91 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -603,4 +603,18 @@ struct kvm_vcpu_arch { #define __KVM_HAVE_ARCH_WQP +#ifdef CONFIG_IOMMU_API +/* POWERPC does not use IOMMU API for mapping/unmapping */ +static inline int kvm_iommu_map_pages(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + return 0; +} + +static inline void kvm_iommu_unmap_pages(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ +} +#endif /* CONFIG_IOMMU_API */ + #endif /* __POWERPC_KVM_HOST_H__ */ This is no longer needed, Gleb applied my patch for 3.10 that make all of KVM device assignment dependent on a build config option and the top level kvm_host.h now includes this when that is not set. Thanks, Cannot find it, could you point me please where it is on github or git.kernel.org? Thanks. 
http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2a5bab1004729f3302c776e53ee7c895b98bb1ce -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v10 7/7] KVM: VMX: Use posted interrupt to deliver virtual interrupt
Yangminqiang wrote on 2013-05-03: Nakajima, Jun wrote on 2013-04-26: Subject: Re: [PATCH v10 7/7] KVM: VMX: Use posted interrupt to deliver virtual interrupt On Fri, Apr 26, 2013 at 2:29 AM, Yangminqiang yangminqi...@huawei.com wrote: Ivytown or newer platforms support it. Ivytown? Do you mean Ivy Bridge? Ivy Town is the codename of Ivy Bridge-based servers. One more question: what is the relationship between x2APIC and APIC virtualization? Does APIC-v require x2APIC, or does APIC-v include x2APIC? If you are using the x2apic way (MSR-based access) inside the guest and want to benefit from the APIC virtualization technology, then you should set the virtual x2APIC bit in the Secondary Processor-Based VM-Execution Controls. Best regards, Yang
Re: [RFC][KVM][PATCH 1/1] kvm:ppc:booke-64: soft-disable interrupts
On 05/07/2013 07:50 AM, Scott Wood wrote: On 05/05/2013 10:13:17 PM, tiejun.chen wrote: On 05/06/2013 11:10 AM, Tiejun Chen wrote: For the external interrupt, the decrementer exception and the doorbell exception, we also need to soft-disable interrupts as the host interrupt handlers do, since the DO_KVM hook is always performed to skip EXCEPTION_COMMON and so misses the original chance to apply 'ints' (INTS_DISABLE). http://patchwork.ozlabs.org/patch/241344/ http://patchwork.ozlabs.org/patch/241412/ :-) I'm observing the same behaviour as well: WARN_ON_ONCE(!irqs_disabled()); Signed-off-by: Tiejun Chen tiejun.c...@windriver.com --- arch/powerpc/kvm/bookehv_interrupts.S |9 + 1 file changed, 9 insertions(+) diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S index e8ed7d6..2fd62bf 100644 --- a/arch/powerpc/kvm/bookehv_interrupts.S +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -33,6 +33,8 @@ #ifdef CONFIG_64BIT #include asm/exception-64e.h +#include asm/hw_irq.h +#include asm/irqflags.h #else #include ../kernel/head_booke.h /* for THREAD_NORMSAVE() */ #endif @@ -469,6 +471,13 @@ _GLOBAL(kvmppc_resume_host) PPC_LLr3, HOST_RUN(r1) mrr5, r14 /* intno */ mrr14, r4 /* Save vcpu pointer. */ +#ifdef CONFIG_64BIT +/* Should we soft-disable interrupts? */ +andi.r6, r5, BOOKE_INTERRUPT_EXTERNAL | BOOKE_INTERRUPT_DECREMENTER | BOOKE_INTERRUPT_DOORBELL +beqskip_soft_dis +SOFT_DISABLE_INTS(r7,r8) +skip_soft_dis: +#endif Why wouldn't we always disable them? kvmppc_handle_exit() will enable interrupts when it's ready. This only disables soft interrupts for kvmppc_restart_interrupt(), which restarts interrupts if they were meant for the host: a. SOFT_DISABLE_INTS() only for BOOKE_INTERRUPT_EXTERNAL | BOOKE_INTERRUPT_DECREMENTER | BOOKE_INTERRUPT_DOORBELL b. bl kvmppc_handle_exit c.
kvmppc_handle_exit() { int r = RESUME_HOST; int s; /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); /* restart interrupts if they were meant for the host */ kvmppc_restart_interrupt(vcpu, exit_nr); local_irq_enable(); == Enable again. And shouldn't we handle kvmppc_restart_interrupt() like the original HOST flow? #define MASKABLE_EXCEPTION(trapnum, intnum, label, hdlr, ack) \ START_EXCEPTION(label); \ NORMAL_EXCEPTION_PROLOG(trapnum, intnum, PROLOG_ADDITION_MASKABLE)\ EXCEPTION_COMMON(trapnum, PACA_EXGEN, *INTS_DISABLE*) \ ... So I think this should be reasonable :) Tiejun -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC][KVM][PATCH 1/1] kvm:ppc:booke-64: soft-disable interrupts
On 05/07/2013 10:06 AM, Scott Wood wrote: On 05/06/2013 08:56:25 PM, tiejun.chen wrote: On 05/07/2013 07:50 AM, Scott Wood wrote: On 05/05/2013 10:13:17 PM, tiejun.chen wrote: On 05/06/2013 11:10 AM, Tiejun Chen wrote: For the external interrupt, the decrementer exception and the doorbell excpetion, we also need to soft-disable interrupts while doing as host interrupt handlers since the DO_KVM hook is always performed to skip EXCEPTION_COMMON then miss this original chance with the 'ints' (INTS_DISABLE). http://patchwork.ozlabs.org/patch/241344/ http://patchwork.ozlabs.org/patch/241412/ :-) I'm observing the same behaviour as well: WARN_ON_ONCE(!irqs_disabled()); So, could you explain the benefits of your approach over what's being discussed in those threads? They're a long thread so I think I need to take time to see :) Why wouldn't we always disable them? kvmppc_handle_exit() will enable interrupts when it's ready. This only disable soft interrupt for kvmppc_restart_interrupt() that restarts interrupts if they were meant for the host: a. SOFT_DISABLE_INTS() only for BOOKE_INTERRUPT_EXTERNAL | BOOKE_INTERRUPT_DECREMENTER | BOOKE_INTERRUPT_DOORBELL Those aren't the only exceptions that can end up going to the host. We could get a TLB miss that results in a heavyweight MMIO exit, etc. This is like host handler, so I'm just disabling soft interrupt during kvmppc_restart_interrupt() for Doorbell interrupt/Decrementer Interrupt/External Input Interrupt. I don't see anything should be disabled for any TLB exception in host handler. And I'd rather see any fix for this problem stay out of the asm code. We already have an appropriate SOFT_DISABLE_INTS so I think we can take this easily :) b. bl kvmppc_handle_exit c. 
kvmppc_handle_exit() { int r = RESUME_HOST; int s; /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); /* restart interrupts if they were meant for the host */ kvmppc_restart_interrupt(vcpu, exit_nr); local_irq_enable();== Enable again. And shouldn't we handle kvmppc_restart_interrupt() like the original HOST flow? #define MASKABLE_EXCEPTION(trapnum, intnum, label, hdlr, ack) \ START_EXCEPTION(label); \ NORMAL_EXCEPTION_PROLOG(trapnum, intnum, PROLOG_ADDITION_MASKABLE)\ EXCEPTION_COMMON(trapnum, PACA_EXGEN, *INTS_DISABLE*) \ ... Could you elaborate on what you mean? In host handler, we always use MASKABLE_EXCEPTION() to define-to-handle some exceptions: Doorbell interrupt/Decrementer Interrupt/External Input Interrupt: #define MASKABLE_EXCEPTION(trapnum, intnum, label, hdlr, ack) \ START_EXCEPTION(label); \ NORMAL_EXCEPTION_PROLOG(trapnum, intnum, PROLOG_ADDITION_MASKABLE)\ EXCEPTION_COMMON(trapnum, PACA_EXGEN, *INTS_DISABLE*) \ This would call INTS_DISABLE, which is equal to SOFT_DISABLE_INTS(), to disable soft interrupt before call all associated handlers: do_IRQ()/timer_interrupt()/doorbell_exception(). But DO_KVM hook always skips INTS_DISABLE. So I think we also need to do INTS_DISABLE for kvmppc_restart_interrupt() since actually that restarts interrupts for the host with a similar way as they are called by host. Tiejun -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 04/11] kvm tools: console: unconditionally output to any console
On Tue, May 7, 2013 at 2:34 AM, Sasha Levin sasha.le...@oracle.com wrote: On 05/03/2013 12:09 PM, Will Deacon wrote: On Fri, May 03, 2013 at 05:02:14PM +0100, Sasha Levin wrote: On 05/03/2013 05:19 AM, Pekka Enberg wrote: On Wed, May 1, 2013 at 6:50 PM, Will Deacon will.dea...@arm.com wrote: From: Marc Zyngier marc.zyng...@arm.com Kvmtool suppresses any output to a console that has not been elected as *the* console. While this makes sense on the input side (we want the input to be sent to one console driver only), it seems to be the wrong thing to do on the output side, as it effectively prevents the guest from switching from one console to another (think earlyprintk using 8250 to virtio console). After all, the guest *does* poke this device and outputs something there. Just remove the kvm-cfg.active_console test from the output paths. Signed-off-by: Marc Zyngier marc.zyng...@arm.com Signed-off-by: Will Deacon will.dea...@arm.com Seems reasonable. Asias, Sasha? I remember at trying it some time ago but dropped it for a reason I don't remember at the moment. Can I have the weekend to play with it to try and figure out why? There's no rush from my point of view (hence the RFC) so take as long as you need! Looks good to me! Thanks, Sasha I am fine with having 8250 emulated by KVMTOOL, but I am more inclined towards having a full para-virtualized (PV) machine emulated by KVMTOOL. Best Regards, Anup -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
KVM: x86: perform kvmclock updates in lockstep
It is necessary for each vcpus system_timestamp memory copy to be updated from one sample of the nanosecond kernel clock. If this is not the case, and NTP changes frequency adjustment, different vcpus will make use of different time bases. Signed-off-by: Marcelo Tosatti mtosa...@redhat.com diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 94f35d2..1ef4287 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1965,7 +1965,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvmclock_reset(vcpu); vcpu-arch.time = data; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ if (!(data 1)) @@ -2665,7 +2665,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(vcpu-arch.tsc_offset_adjustment)) { adjust_tsc_offset_host(vcpu, vcpu-arch.tsc_offset_adjustment); vcpu-arch.tsc_offset_adjustment = 0; - set_bit(KVM_REQ_CLOCK_UPDATE, vcpu-requests); + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, vcpu-requests); } if (unlikely(vcpu-cpu != cpu) || check_tsc_unstable()) { @@ -2684,7 +2684,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * kvmclock on vcpu-cpu migration */ if (!vcpu-kvm-arch.use_master_clock || vcpu-cpu == -1) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); if (vcpu-cpu != cpu) kvm_migrate_timers(vcpu); vcpu-cpu = cpu; @@ -5092,7 +5092,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu-cpu != freq-cpu) continue; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); if (vcpu-cpu != smp_processor_id()) send_ipi = 1; } -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm/ppc/booke64: Hard disable interrupts when entering the guest
On 05/06/2013 07:03:14 PM, Benjamin Herrenschmidt wrote: On Mon, 2013-05-06 at 18:53 -0500, Scott Wood wrote: Ie. The last stage of entry will hard enable, so they should be soft-enabled too... if not, latency trackers will consider the whole guest periods as interrupt disabled... OK... I guess we already have that problem on 32-bit as well? 32-bit doesn't do lazy disable, so the situation is a lot easier there. Right, but it still currently enters the guest with interrupts marked as disabled, so we'd have the same latency tracker issue. Another problem is that hard_irq_disable() doesn't call trace_hardirqs_off()... We might want to fix that: static inline void hard_irq_disable(void) { __hard_irq_disable(); if (get_paca()-soft_enabled) trace_hardirqs_off(); get_paca()-soft_enabled = 0; get_paca()-irq_happened |= PACA_IRQ_HARD_DIS; } Is it possible there are places that assume the current behavior? We also don't want PACA_IRQ_HARD_DIS to be cleared the way prep_irq_for_idle() does, because that's what lets the local_irq_enable() do the hard-enabling after we exit the guest. Then set it again. Don't leave the kernel in a state where soft_enabled is 1 and irq_happened is non-zero. It might work in the specific KVM case we are looking at now because we know we are coming back via KVM exit and putting things right again but it's fragile, somebody will come back and break it, etc... KVM is a pretty special case -- at least on booke, it's required that all exits from guest state go through the KVM exception code. I think it's less likely that that changes, than something breaks in the code to fix up lazy ee state (especially since we've already seen the latter happen). I'll give it a shot, though. If necessary, create (or improve existing) helpers that do the right state adjustement. The cost of a couple of byte stores is negligible, I'd rather you make sure everything remains in sync at all times. 
My concern was mainly about complexity -- it seemed simpler to just say that the during guest execution, CPU is in a special state that is not visible to anything that cares about lazy EE. The fact that EE can actually be *off* and we still take the interrupt supports its specialness. :-) -Scott -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/5] KVM: PPC: iommu: Add missing kvm_iommu_map_pages/kvm_iommu_unmap_pages
On 05/07/2013 11:42 AM, Alex Williamson wrote: On Tue, 2013-05-07 at 10:49 +1000, Alexey Kardashevskiy wrote: On 05/07/2013 07:07 AM, Alex Williamson wrote: On Mon, 2013-05-06 at 17:21 +1000, a...@ozlabs.ru wrote: From: Alexey Kardashevskiy a...@ozlabs.ru The IOMMU API implements groups creating/deletion, device binding and IOMMU map/unmap operations. The PowerPC implementation uses most of the API except map/unmap operations, which are implemented on POWER using hypercalls. However, in order to link a kernel with the CONFIG_IOMMU_API enabled, the empty kvm_iommu_map_pages/kvm_iommu_unmap_pages have to be defined, so this defines them. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Paul Mackerras pau...@samba.org --- arch/powerpc/include/asm/kvm_host.h | 14 ++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index b6a047e..c025d91 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -603,4 +603,18 @@ struct kvm_vcpu_arch { #define __KVM_HAVE_ARCH_WQP +#ifdef CONFIG_IOMMU_API +/* POWERPC does not use IOMMU API for mapping/unmapping */ +static inline int kvm_iommu_map_pages(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + return 0; +} + +static inline void kvm_iommu_unmap_pages(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ +} +#endif /* CONFIG_IOMMU_API */ + #endif /* __POWERPC_KVM_HOST_H__ */ This is no longer needed, Gleb applied my patch for 3.10 that make all of KVM device assignment dependent on a build config option and the top level kvm_host.h now includes this when that is not set. Thanks, Cannot find it, could you point me please where it is on github or git.kernel.org? Thanks. http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2a5bab1004729f3302c776e53ee7c895b98bb1ce Yes, I confirm, this is patch is not need any more. Thanks! 
-- Alexey -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm/ppc: interrupt disabling fixes
booke64 was not maintaining consistent lazy ee state when exiting the guest, leading to warnings and worse. booke32 was less affected due to the absence of lazy ee, but it was still feeding bad information into trace_hardirqs_off/on -- we don't want guest execution to be seen as an IRQs off interval. book3s_pr also has this problem. book3s_pr and booke both used kvmppc_lazy_ee_enable() without hard-disabling EE first, which could lead to races when irq_happened is cleared, or if an interrupt happens after kvmppc_lazy_ee_enable(), and possibly other issues. Now, on book3s_pr and booke, always hard-disable interrupts before kvmppc_prepare_to_enter(), but leave them soft-enabled. On book3s, this should result in the right lazy EE state when the asm code hard-enables on an exit. On booke, we call hard_irq_disable() rather than hard-enable immediately. Signed-off-by: Scott Wood scottw...@freescale.com Cc: Mihai Caraman mihai.cara...@freescale.com Cc: Benjamin Herrenschmidt b...@kernel.crashing.org Cc: Tiejun Chen tiejun.c...@windriver.com --- Only tested on booke (32 and 64 bit). Testers of book3s_pr would be appreciated (particularly with lockdep enabled). --- arch/powerpc/include/asm/kvm_ppc.h |7 +++ arch/powerpc/kvm/book3s_pr.c |6 -- arch/powerpc/kvm/booke.c | 12 ++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index a5287fe..e55d7e5 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -399,6 +399,13 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn) static inline void kvmppc_lazy_ee_enable(void) { #ifdef CONFIG_PPC64 + /* +* To avoid races, the caller must have gone directly from having +* interrupts fully-enabled to hard-disabled. 
+*/ + WARN_ON(local_paca-irq_happened != PACA_IRQ_HARD_DIS); + trace_hardirqs_on(); + /* Only need to enable IRQs by hard enabling them after this */ local_paca-irq_happened = 0; local_paca-soft_enabled = 1; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index d09baf1..a1e70113 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -884,7 +884,8 @@ program_interrupt: * and if we really did time things so badly, then we just exit * again due to a host external interrupt. */ - local_irq_disable(); + hard_irq_disable(); + trace_hardirqs_off(); s = kvmppc_prepare_to_enter(vcpu); if (s = 0) { local_irq_enable(); @@ -1121,7 +1122,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * really did time things so badly, then we just exit again due to * a host external interrupt. */ - local_irq_disable(); + hard_irq_disable(); + trace_hardirqs_off(); ret = kvmppc_prepare_to_enter(vcpu); if (ret = 0) { local_irq_enable(); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index ecbe908..5dc1f53 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -666,7 +666,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return -EINVAL; } - local_irq_disable(); + hard_irq_disable(); + trace_hardirqs_off(); s = kvmppc_prepare_to_enter(vcpu); if (s = 0) { local_irq_enable(); @@ -834,6 +835,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, int s; int idx; +#ifdef CONFIG_PPC64 + WARN_ON(local_paca-irq_happened != 0); +#endif + hard_irq_disable(); + trace_hardirqs_off(); + /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); @@ -1150,7 +1157,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, * aren't already exiting to userspace for some other reason. 
*/ if (!(r RESUME_HOST)) { - local_irq_disable(); + hard_irq_disable(); + trace_hardirqs_off(); s = kvmppc_prepare_to_enter(vcpu); if (s = 0) { local_irq_enable(); -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v18] pvpanic: pvpanic device driver
pvpanic device is a qemu simulated device through which guest panic event is sent to host. Signed-off-by: Hu Tao hu...@cn.fujitsu.com --- v17 - v18: 1. call acpi_walk_resources to get the port, and usb outb instead of acpi_evaluate_oject in panic notifier callback 2. reword help message drivers/platform/x86/Kconfig | 8 +++ drivers/platform/x86/Makefile | 2 + drivers/platform/x86/pvpanic.c | 122 + 3 files changed, 132 insertions(+) create mode 100644 drivers/platform/x86/pvpanic.c diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 3338437..8577261 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -781,4 +781,12 @@ config APPLE_GMUX graphics as well as the backlight. Currently only backlight control is supported by the driver. +config PVPANIC + tristate pvpanic device support + depends on ACPI + ---help--- + This driver provides support for the pvpanic device. pvpanic is + a paravirtualized device provided by QEMU; it lets a virtual machine + (guest) communicate panic events to the host. + endif # X86_PLATFORM_DEVICES diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index ace2b38..ef0ec74 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -51,3 +51,5 @@ obj-$(CONFIG_INTEL_OAKTRAIL) += intel_oaktrail.o obj-$(CONFIG_SAMSUNG_Q10) += samsung-q10.o obj-$(CONFIG_APPLE_GMUX) += apple-gmux.o obj-$(CONFIG_CHROMEOS_LAPTOP) += chromeos_laptop.o + +obj-$(CONFIG_PVPANIC) += pvpanic.o diff --git a/drivers/platform/x86/pvpanic.c b/drivers/platform/x86/pvpanic.c new file mode 100644 index 000..ddec5cb --- /dev/null +++ b/drivers/platform/x86/pvpanic.c @@ -0,0 +1,122 @@ +/* + * pvpanic.c - pvpanic Device Support + * + * Copyright (C) 2013 Fujitsu. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME : fmt + +#include linux/kernel.h +#include linux/module.h +#include linux/init.h +#include linux/types.h +#include acpi/acpi_bus.h +#include acpi/acpi_drivers.h + +MODULE_AUTHOR(Hu Tao hu...@cn.fujitsu.com); +MODULE_DESCRIPTION(pvpanic device driver); +MODULE_LICENSE(GPL); + +static int pvpanic_add(struct acpi_device *device); +static int pvpanic_remove(struct acpi_device *device); + +static const struct acpi_device_id pvpanic_device_ids[] = { + { QEMU0001, 0}, + { , 0}, +}; +MODULE_DEVICE_TABLE(acpi, pvpanic_device_ids); + +#define PVPANIC_PANICKED (1 0) + +static u16 port; + +static struct acpi_driver pvpanic_driver = { + .name = pvpanic, + .class =QEMU, + .ids = pvpanic_device_ids, + .ops = { + .add = pvpanic_add, + .remove = pvpanic_remove, + }, + .owner =THIS_MODULE, +}; + +static void +pvpanic_send_event(unsigned int event) +{ + if (port) + outb(event, port); +} + +static int +pvpanic_panic_notify(struct notifier_block *nb, unsigned long code, +void *unused) +{ + pvpanic_send_event(PVPANIC_PANICKED); + return NOTIFY_DONE; +} + +static struct notifier_block pvpanic_panic_nb = { + .notifier_call = pvpanic_panic_notify, +}; + + +static acpi_status +pvpanic_walk_resources(struct acpi_resource *res, void 
*context) +{ + switch (res-type) { + case ACPI_RESOURCE_TYPE_END_TAG: + return AE_OK; + + case ACPI_RESOURCE_TYPE_IO: + port = res-data.io.minimum; + return AE_OK; + + default: + return AE_ERROR; + } +} + +static int pvpanic_add(struct acpi_device *device) +{ + acpi_status status; + u64 ret; + + status = acpi_evaluate_integer(device-handle, _STA, NULL, + ret); + + if (ACPI_FAILURE(status) || (ret 0x0B) != 0x0B) + return -ENODEV; + + acpi_walk_resources(device-handle, METHOD_NAME__CRS, + pvpanic_walk_resources, NULL); + + atomic_notifier_chain_register(panic_notifier_list, +
Re: [PATCH v4 4/6] KVM: MMU: fast invalid all shadow pages
On 05/07/2013 03:50 AM, Marcelo Tosatti wrote: On Mon, May 06, 2013 at 11:39:11AM +0800, Xiao Guangrong wrote: On 05/04/2013 08:52 AM, Marcelo Tosatti wrote: On Sat, May 04, 2013 at 12:51:06AM +0800, Xiao Guangrong wrote: On 05/03/2013 11:53 PM, Marcelo Tosatti wrote: On Fri, May 03, 2013 at 01:52:07PM +0800, Xiao Guangrong wrote: On 05/03/2013 09:05 AM, Marcelo Tosatti wrote: + +/* + * Fast invalid all shadow pages belong to @slot. + * + * @slot != NULL means the invalidation is caused the memslot specified + * by @slot is being deleted, in this case, we should ensure that rmap + * and lpage-info of the @slot can not be used after calling the function. + * + * @slot == NULL means the invalidation due to other reasons, we need + * not care rmap and lpage-info since they are still valid after calling + * the function. + */ +void kvm_mmu_invalid_memslot_pages(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + spin_lock(kvm-mmu_lock); + kvm-arch.mmu_valid_gen++; + + /* + * All shadow paes are invalid, reset the large page info, + * then we can safely desotry the memslot, it is also good + * for large page used. + */ + kvm_clear_all_lpage_info(kvm); Xiao, I understood it was agreed that simple mmu_lock lockbreak while avoiding zapping of newly instantiated pages upon a if(spin_needbreak) cond_resched_lock() cycle was enough as a first step? And then later introduce root zapping along with measurements. https://lkml.org/lkml/2013/4/22/544 Yes, it is. See the changelog in 0/0: we use lock-break technique to zap all sptes linked on the invalid rmap, it is not very effective but good for the first step. Thanks! Sure, but what is up with zeroing kvm_clear_all_lpage_info(kvm) and zapping the root? Only lock-break technique along with generation number was what was agreed. Marcelo, Please Wait... I am completely confused. :( Let's clarify zeroing kvm_clear_all_lpage_info(kvm) and zapping the root first. Are these changes you wanted? 
void kvm_mmu_invalid_memslot_pages(struct kvm *kvm, struct kvm_memory_slot *slot) { spin_lock(kvm-mmu_lock); kvm-arch.mmu_valid_gen++; /* Zero all root pages.*/ restart: list_for_each_entry_safe(sp, node, kvm-arch.active_mmu_pages, link) { if (!sp-root_count) continue; if (kvm_mmu_prepare_zap_page(kvm, sp, invalid_list)) goto restart; } /* * All shadow paes are invalid, reset the large page info, * then we can safely desotry the memslot, it is also good * for large page used. */ kvm_clear_all_lpage_info(kvm); kvm_mmu_commit_zap_page(kvm, invalid_list); spin_unlock(kvm-mmu_lock); } static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_mmu_page *sp; gfn_t gfn; unsigned long *rmapp; sp = page_header(__pa(spte)); + + /* Let invalid sp do not access its rmap. */ + if (!sp_is_valid(sp)) + return; + gfn = kvm_mmu_page_get_gfn(sp, spte - sp-spt); rmapp = gfn_to_rmap(kvm, gfn, sp-role.level); pte_list_remove(spte, rmapp); } If yes, there is the reason why we can not do this that i mentioned before: after call kvm_mmu_invalid_memslot_pages(), the memslot-rmap will be destroyed. Later, if host reclaim page, the mmu-notify handlers, -invalidate_page and -invalidate_range_start, can not find any spte using the host page, then Accessed/Dirty for host page is missing tracked. (missing call kvm_set_pfn_accessed and kvm_set_pfn_dirty properly.) What's your idea? Step 1) Fix kvm_mmu_zap_all's behaviour: introduce lockbreak via spin_needbreak. Use generation numbers so that in case kvm_mmu_zap_all releases mmu_lock and reacquires it again, only shadow pages from the generation with which kvm_mmu_zap_all started are zapped (this guarantees forward progress and eventual termination). 
kvm_mmu_zap_generation() spin_lock(mmu_lock) int generation = kvm-arch.mmu_generation; for_each_shadow_page(sp) { if (sp-generation == kvm-arch.mmu_generation) zap_page(sp) if (spin_needbreak(mmu_lock)) { kvm-arch.mmu_generation++; cond_resched_lock(mmu_lock); } } kvm_mmu_zap_all() spin_lock(mmu_lock) for_each_shadow_page(sp) { if (spin_needbreak(mmu_lock)) { cond_resched_lock(mmu_lock); } } Use kvm_mmu_zap_generation for kvm_arch_flush_shadow_memslot. Use kvm_mmu_zap_all for kvm_mmu_notifier_release,kvm_destroy_vm. This addresses the main problem: excessively long hold times of kvm_mmu_zap_all with very large guests. Do you see any problem with this logic? This was what i was thinking we agreed. No. I understand it
Re: [PATCH] kvm/ppc/booke64: Hard disable interrupts when entering the guest
On Mon, 2013-05-06 at 22:05 -0500, Scott Wood wrote: On 05/06/2013 07:03:14 PM, Benjamin Herrenschmidt wrote: On Mon, 2013-05-06 at 18:53 -0500, Scott Wood wrote: Ie. The last stage of entry will hard enable, so they should be soft-enabled too... if not, latency trackers will consider the whole guest periods as interrupt disabled... OK... I guess we already have that problem on 32-bit as well? 32-bit doesn't do lazy disable, so the situation is a lot easier there. Right, but it still currently enters the guest with interrupts marked as disabled, so we'd have the same latency tracker issue. Another problem is that hard_irq_disable() doesn't call trace_hardirqs_off()... We might want to fix that: static inline void hard_irq_disable(void) { __hard_irq_disable(); if (get_paca()-soft_enabled) trace_hardirqs_off(); get_paca()-soft_enabled = 0; get_paca()-irq_happened |= PACA_IRQ_HARD_DIS; } Is it possible there are places that assume the current behavior? There aren't many callers, I think this should be safe. Most callers call it with interrupts already soft disabled, so that should be a nop in these cases (idle for example). But I can give it a quick spin today on a machine or two. We also don't want PACA_IRQ_HARD_DIS to be cleared the way prep_irq_for_idle() does, because that's what lets the local_irq_enable() do the hard-enabling after we exit the guest. Then set it again. Don't leave the kernel in a state where soft_enabled is 1 and irq_happened is non-zero. It might work in the specific KVM case we are looking at now because we know we are coming back via KVM exit and putting things right again but it's fragile, somebody will come back and break it, etc... KVM is a pretty special case -- at least on booke, it's required that all exits from guest state go through the KVM exception code. I think it's less likely that that changes, than something breaks in the code to fix up lazy ee state (especially since we've already seen the latter happen). 
I'll give it a shot, though. If necessary, create (or improve existing) helpers that do the right state adjustement. The cost of a couple of byte stores is negligible, I'd rather you make sure everything remains in sync at all times. My concern was mainly about complexity -- it seemed simpler to just say that the during guest execution, CPU is in a special state that is not visible to anything that cares about lazy EE. The fact that EE can actually be *off* and we still take the interrupt supports its specialness. :-) Yeah ... sort of :-) Cheers, Ben. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] kvm/ppc: interrupt disabling fixes
On Mon, 2013-05-06 at 22:32 -0500, Scott Wood wrote: + hard_irq_disable(); + trace_hardirqs_off(); I still think hard_irq_disable() should be fixed to do the right thing here :-) I'll do that standalone patch here and give it a spin. Cheers, Ben. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Fwd: Booting physically installed Windows while in Arch (AHCI support in OVMF?)
06.05.2013 00:42, Evert Heylen wrote: Please, any help? I think the easiest way is to convert your existing system from UEFI back to regular bios + MBR. For that, you need to disable UEFI boot in bios and convert GPT to MBR on the HDD. This is because, as you figured, ahci support in OVMF isn't exactly working, and because generally, UEFI emulation and UEFI support as a whole is a bit too young still. /mjt I'm currently in such a state I won't be able to sleep well before I make some progress on this. I've already described my situation quite precisely, if one needs even more information, just ask. I've now also tried with a separate img containing DUET, so I can use the default seabios to boot DUET, which can boot Windows in UEFI mode. However, DUET just doesn't see my disk at all, be it in IDE or AHCI mode. If I boot the same img *physically* (from a usb), I can enter DUET and I can see my physical disk (which is running in AHCI mode). So I guess this is an issue with KVM/QEMU. Any ideas would be greatly appreciated. On Sun, Apr 28, 2013 at 6:29 PM, Evert Heylen everthey...@gmail.com wrote: Hi all, My situation is the following: My PC (x64) has an UEFI capable motherboard (ASRock Z77). On my hard drive (which is GPT formatted ofc), I have Windows 7 installed on /dev/sda3 and Arch Linux on /dev/sda2. I can boot both OS'es. However, I would like to boot Windows while in Arch, using KVM. I'm using the OVMF images. I tried it right away with this command: qemu-system-x86_64 -enable-kvm -smp 4 -cpu host -m 4096 -hda /dev/sda -L /path/to/ovmf/ It doesn't work. When booting in safe mode in windows, I can see that windows fails when trying to load CLASSPNP.sys . After some googling I found out that it might be because qemu 'mounts' the drive in IDE mode, while windows expects it to be in AHCI mode (because it was installed in AHCI mode). Then, after some more googling, I tried this command, which should (correct me if I'm wrong) mount the drive in AHCI mode. 
qemu-system-x86_64 -enable-kvm -smp 4 -cpu host -m 4096 -L /path/to/ovmf -device ahci,id=ahci0 -drive if=none,file=/dev/sda,format=raw,id=drive-sata0-0-0 -device driver=ide-drive,bus=ahci0.0,drive=drive-sata0-0-0,id=sata0-0-0 However, with this command OVMF doesn't seem to recognise any drive at all, the 'Boot from file' screen is empty. So, I would like to know if OVMF supports AHCI, and if it doesn't, do you have any other ideas? I know it's generally not a good idea to boot a physically installed OS in a vm, but I want to try it anyway. Thanks, Evert -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/6] KVM: PPC: Add support for multiple-TCE hcalls
On Mon, May 06, 2013 at 05:25:53PM +1000, Alexey Kardashevskiy wrote: This adds real mode handlers for the H_PUT_TCE_INDIRECT and H_STUFF_TCE hypercalls for QEMU emulated devices such as virtio devices or emulated PCI. These calls allow adding multiple entries (up to 512) into the TCE table in one call which saves time on transition to/from real mode. This adds a guest physical to host real address converter and calls the existing H_PUT_TCE handler. The converting function is going to be fully utilized by upcoming VFIO supporting patches. This also implements the KVM_CAP_PPC_MULTITCE capability, so in order to support the functionality of this patch, QEMU needs to query for this capability and set the hcall-multi-tce hypertas property only if the capability is present, otherwise there will be serious performance degradation. Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Signed-off-by: Paul Mackerras pau...@samba.org Fwiw, it would be nice to get this patch merged, regardless of the rest of the VFIO/powerpc patches. -- David Gibson| I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson signature.asc Description: Digital signature
Re: [PATCH 5/6] KVM: PPC: Add support for IOMMU in-kernel handling
On Mon, May 06, 2013 at 05:25:56PM +1000, Alexey Kardashevskiy wrote: This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT and H_STUFF_TCE requests without passing them to QEMU, which should save time on switching to QEMU and back. Both real and virtual modes are supported - whenever the kernel fails to handle TCE request, it passes it to the virtual mode. If the virtual mode handlers fail, then the request is passed to the user mode, for example, to QEMU. This adds a new KVM_CAP_SPAPR_TCE_IOMMU ioctl to associate a virtual PCI bus ID (LIOBN) with an IOMMU group, which enables in-kernel handling of IOMMU map/unmap. This adds a special case for huge pages (16MB). The reference counting cannot be easily done for such pages in real mode (when MMU is off) so we added a list of huge pages. It is populated in virtual mode and get_page is called just once per a huge page. Real mode handlers check if the requested page is huge and in the list, then no reference counting is done, otherwise an exit to virtual mode happens. The list is released at KVM exit. At the moment the fastest card available for tests uses up to 9 huge pages so walking through this list is not very expensive. However this can change and we may want to optimize this. This also adds the virt_only parameter to the KVM module for debug and performance check purposes. Tests show that this patch increases transmission speed from 220MB/s to 750..1020MB/s on 10Gb network (Chelsio CXGB3 10Gb ethernet card). 
Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Signed-off-by: Paul Mackerras pau...@samba.org --- Documentation/virtual/kvm/api.txt | 28 arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/include/asm/kvm_ppc.h |2 + arch/powerpc/include/uapi/asm/kvm.h |7 + arch/powerpc/kvm/book3s_64_vio.c| 242 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 192 +++ arch/powerpc/kvm/powerpc.c | 12 ++ include/uapi/linux/kvm.h|2 + 8 files changed, 485 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f621cd6..2039767 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2127,6 +2127,34 @@ written, then `n_invalid' invalid entries, invalidating any previously valid entries found. +4.79 KVM_CREATE_SPAPR_TCE_IOMMU + +Capability: KVM_CAP_SPAPR_TCE_IOMMU +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce_iommu (in) +Returns: 0 on success, -1 on error + +This creates a link between IOMMU group and a hardware TCE (translation +control entry) table. This link lets the host kernel know what IOMMU +group (i.e. TCE table) to use for the LIOBN number passed with +H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls. + +/* for KVM_CAP_SPAPR_TCE_IOMMU */ +struct kvm_create_spapr_tce_iommu { + __u64 liobn; + __u32 iommu_id; Wouldn't it be more in keeping + __u32 flags; +}; + +No flag is supported at the moment. + +When the guest issues TCE call on a liobn for which a TCE table has been +registered, the kernel will handle it in real mode, updating the hardware +TCE table. TCE table calls for other liobns will cause a vm exit and must +be handled by userspace. + + 5. 
The kvm_run structure diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 36ceb0d..2b70cbc 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -178,6 +178,8 @@ struct kvmppc_spapr_tce_table { struct kvm *kvm; u64 liobn; u32 window_size; + bool virtmode_only; I see this is now initialized from the global parameter, but I think it would be better to just check the global (debug) parameter directly, rather than duplicating it here. + struct iommu_group *grp;/* used for IOMMU groups */ struct page *pages[0]; }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index d501246..bdfa140 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -139,6 +139,8 @@ extern void kvmppc_xics_free(struct kvm *kvm); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); +extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm, + struct kvm_create_spapr_tce_iommu *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table( struct kvm_vcpu *vcpu, unsigned long liobn); extern long kvmppc_emulated_h_put_tce(struct kvmppc_spapr_tce_table *stt, diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
Re: [PATCH 5/6] KVM: PPC: Add support for IOMMU in-kernel handling
On 05/07/2013 03:29 PM, David Gibson wrote: On Mon, May 06, 2013 at 05:25:56PM +1000, Alexey Kardashevskiy wrote: This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT and H_STUFF_TCE requests without passing them to QEMU, which should save time on switching to QEMU and back. Both real and virtual modes are supported - whenever the kernel fails to handle TCE request, it passes it to the virtual mode. If it the virtual mode handlers fail, then the request is passed to the user mode, for example, to QEMU. This adds a new KVM_CAP_SPAPR_TCE_IOMMU ioctl to asssociate a virtual PCI bus ID (LIOBN) with an IOMMU group, which enables in-kernel handling of IOMMU map/unmap. This adds a special case for huge pages (16MB). The reference counting cannot be easily done for such pages in real mode (when MMU is off) so we added a list of huge pages. It is populated in virtual mode and get_page is called just once per a huge page. Real mode handlers check if the requested page is huge and in the list, then no reference counting is done, otherwise an exit to virtual mode happens. The list is released at KVM exit. At the moment the fastest card available for tests uses up to 9 huge pages so walking through this list is not very expensive. However this can change and we may want to optimize this. This also adds the virt_only parameter to the KVM module for debug and performance check purposes. Tests show that this patch increases transmission speed from 220MB/s to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card). 
Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Signed-off-by: Paul Mackerras pau...@samba.org --- Documentation/virtual/kvm/api.txt | 28 arch/powerpc/include/asm/kvm_host.h |2 + arch/powerpc/include/asm/kvm_ppc.h |2 + arch/powerpc/include/uapi/asm/kvm.h |7 + arch/powerpc/kvm/book3s_64_vio.c| 242 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 192 +++ arch/powerpc/kvm/powerpc.c | 12 ++ include/uapi/linux/kvm.h|2 + 8 files changed, 485 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f621cd6..2039767 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2127,6 +2127,34 @@ written, then `n_invalid' invalid entries, invalidating any previously valid entries found. +4.79 KVM_CREATE_SPAPR_TCE_IOMMU + +Capability: KVM_CAP_SPAPR_TCE_IOMMU +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce_iommu (in) +Returns: 0 on success, -1 on error + +This creates a link between IOMMU group and a hardware TCE (translation +control entry) table. This link lets the host kernel know what IOMMU +group (i.e. TCE table) to use for the LIOBN number passed with +H_PUT_TCE, H_PUT_TCE_INDIRECT, H_STUFF_TCE hypercalls. + +/* for KVM_CAP_SPAPR_TCE_IOMMU */ +struct kvm_create_spapr_tce_iommu { +__u64 liobn; +__u32 iommu_id; Wouldn't it be more in keeping pardon? +__u32 flags; +}; + +No flag is supported at the moment. + +When the guest issues TCE call on a liobn for which a TCE table has been +registered, the kernel will handle it in real mode, updating the hardware +TCE table. TCE table calls for other liobns will cause a vm exit and must +be handled by userspace. + + 5. 
The kvm_run structure diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 36ceb0d..2b70cbc 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -178,6 +178,8 @@ struct kvmppc_spapr_tce_table { struct kvm *kvm; u64 liobn; u32 window_size; +bool virtmode_only; I see this is now initialized from the global parameter, but I think it would be better to just check the global (debug) parameter directly, rather than duplicating it here. The global parameter is in kvm.ko and the struct above is in the real mode part which cannot go to the module. +struct iommu_group *grp;/* used for IOMMU groups */ struct page *pages[0]; }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index d501246..bdfa140 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -139,6 +139,8 @@ extern void kvmppc_xics_free(struct kvm *kvm); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); +extern long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm, +struct kvm_create_spapr_tce_iommu *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table( struct kvm_vcpu *vcpu, unsigned long liobn); extern
[PATCH 4/5] powerpc/vfio: Implement IOMMU driver for VFIO
From: Alexey Kardashevskiy a...@ozlabs.ru VFIO implements platform independent stuff such as a PCI driver, BAR access (via read/write on a file descriptor or direct mapping when possible) and IRQ signaling. The platform dependent part includes IOMMU initialization and handling. This implements an IOMMU driver for VFIO which does mapping/unmapping pages for the guest IO and provides information about DMA window (required by a POWER guest). Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Signed-off-by: Paul Mackerras pau...@samba.org --- Change log: * no more PPC versions for vfio_iommu_spapr_tce_dma_(un)map (type1 structs reused) * documentation updated * container enable/disable ioctls added * request_module(spapr_iommu) added * various locks fixed * multiple TCE mapping support (no clients for that for now as SPAPR does it in a different way) --- Documentation/vfio.txt | 63 ++ drivers/vfio/Kconfig|6 + drivers/vfio/Makefile |1 + drivers/vfio/vfio.c |1 + drivers/vfio/vfio_iommu_spapr_tce.c | 377 +++ include/uapi/linux/vfio.h | 34 6 files changed, 482 insertions(+) create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt index 8eda363..c55533c 100644 --- a/Documentation/vfio.txt +++ b/Documentation/vfio.txt @@ -283,6 +283,69 @@ a direct pass through for VFIO_DEVICE_* ioctls. The read/write/mmap interfaces implement the device region access defined by the device's own VFIO_DEVICE_GET_REGION_INFO ioctl. + +PPC64 sPAPR implementation note +--- + +This implementation has some specifics: + +1) Only one IOMMU group per container is supported as an IOMMU group +represents the minimal entity which isolation can be guaranteed for and +groups are allocated statically, one per Partitionable Endpoint (PE) +(PE is often a PCI domain but not always). 
+ +2) The hardware supports so called DMA windows - the PCI address range +within which DMA transfer is allowed, any attempt to access address space +out of the window leads to the whole PE isolation. + +3) PPC64 guests are paravirtualized but not fully emulated. There is an API +to map/unmap pages for DMA, and it normally maps 1..32 pages per call and +currently there is no way to reduce the number of calls. In order to make things +faster, the map/unmap handling has been implemented in real mode which provides +excellent performance but has limitations such as the inability to do +locked pages accounting in real time. + +So 3 additional ioctls have been added: + + VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start + of the DMA window on the PCI bus. + + VFIO_IOMMU_ENABLE - enables the container. The locked pages accounting + is done at this point. This lets user first to know what + the DMA window is and adjust rlimit before doing any real job. + + VFIO_IOMMU_DISABLE - disables the container. + + +The code flow from the example above should be slightly changed: + + . + /* Add the group to the container */ + ioctl(group, VFIO_GROUP_SET_CONTAINER, container); + + /* Enable the IOMMU model we want */ + ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU) + + /* Get additional sPAPR IOMMU info */ + vfio_iommu_spapr_tce_info spapr_iommu_info; + ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, spapr_iommu_info); + + if (ioctl(container, VFIO_IOMMU_ENABLE)) + /* Cannot enable container, may be low rlimit */ + + /* Allocate some space and setup a DMA mapping */ + dma_map.vaddr = mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE, +MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + + dma_map.size = 1024 * 1024; + dma_map.iova = 0; /* 1MB starting at 0x0 from device view */ + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + + /* Check here if .iova/.size are within DMA window from spapr_iommu_info */ + + ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map); + . 
+ --- [1] VFIO was originally an acronym for Virtual Function I/O in its diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 7cd5dec..b464687 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1 depends on VFIO default n +config VFIO_IOMMU_SPAPR_TCE + tristate + depends on VFIO SPAPR_TCE_IOMMU + default n + menuconfig VFIO tristate VFIO Non-Privileged userspace driver framework depends on IOMMU_API select VFIO_IOMMU_TYPE1 if X86 + select VFIO_IOMMU_SPAPR_TCE if
[PATCH 5/5] powerpc/vfio: Enable on pSeries platform
From: Alexey Kardashevskiy a...@ozlabs.ru This enables VFIO on the pSeries platform, enabling user space programs to access PCI devices directly. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Paul Mackerras pau...@samba.org --- arch/powerpc/platforms/pseries/iommu.c |4 drivers/iommu/Kconfig |2 +- drivers/vfio/Kconfig |2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index e2685ba..e178acc 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -613,6 +613,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) iommu_table_setparms(pci-phb, dn, tbl); pci-iommu_table = iommu_init_table(tbl, pci-phb-node); + iommu_register_group(tbl, pci_domain_nr(bus), 0); /* Divide the rest (1.75GB) among the children */ pci-phb-dma_window_size = 0x8000ul; @@ -657,6 +658,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) ppci-phb-node); iommu_table_setparms_lpar(ppci-phb, pdn, tbl, dma_window); ppci-iommu_table = iommu_init_table(tbl, ppci-phb-node); + iommu_register_group(tbl, pci_domain_nr(bus), 0); pr_debug( created table: %p\n, ppci-iommu_table); } } @@ -683,6 +685,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) phb-node); iommu_table_setparms(phb, dn, tbl); PCI_DN(dn)-iommu_table = iommu_init_table(tbl, phb-node); + iommu_register_group(tbl, pci_domain_nr(phb-bus), 0); set_iommu_table_base(dev-dev, PCI_DN(dn)-iommu_table); return; } @@ -1145,6 +1148,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) pci-phb-node); iommu_table_setparms_lpar(pci-phb, pdn, tbl, dma_window); pci-iommu_table = iommu_init_table(tbl, pci-phb-node); + iommu_register_group(tbl, pci_domain_nr(pci-phb-bus), 0); pr_debug( created table: %p\n, pci-iommu_table); } else { pr_debug( found DMA window, table: %p\n, pci-iommu_table); diff --git 
a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 175e0f4..2d75ea0 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -189,7 +189,7 @@ config EXYNOS_IOMMU_DEBUG config SPAPR_TCE_IOMMU bool sPAPR TCE IOMMU Support - depends on PPC_POWERNV + depends on PPC_POWERNV || PPC_PSERIES select IOMMU_API help Enables bits of IOMMU API required by VFIO. The iommu_ops diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index b464687..26b3d9d 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -12,7 +12,7 @@ menuconfig VFIO tristate VFIO Non-Privileged userspace driver framework depends on IOMMU_API select VFIO_IOMMU_TYPE1 if X86 - select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV + select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES) help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/5] iommu: Move initialization earlier
From: Alexey Kardashevskiy a...@ozlabs.ru The iommu_init() call initializes IOMMU internal structures and data required for the API to function such as iommu_group_alloc(). It is registered as a subsys_initcall. One of the IOMMU users is a PCI subsystem on POWER which discovers new IOMMU tables during the PCI scan so the most logical place to call iommu_group_alloc() is when a new group is just discovered. However PCI scan is done from subsys_initcall hook as well, which makes use of the IOMMU API impossible. This moves IOMMU subsystem initialization one step earlier. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru Cc: David Gibson da...@gibson.dropbear.id.au Signed-off-by: Paul Mackerras pau...@samba.org --- drivers/iommu/iommu.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5514dfa..0de83eb 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -890,7 +890,7 @@ static int __init iommu_init(void) return 0; } -subsys_initcall(iommu_init); +arch_initcall(iommu_init); int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr attr, void *data) -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html