[kvm-unit-tests] x86: pkeys: add test for PKEYS

2015-11-15 Thread Huaitong Han
Signed-off-by: Huaitong Han 

diff --git a/config/config-x86-common.mak b/config/config-x86-common.mak
index c2f9908..2ef98cc 100644
--- a/config/config-x86-common.mak
+++ b/config/config-x86-common.mak
@@ -36,7 +36,8 @@ tests-common = $(TEST_DIR)/vmexit.flat $(TEST_DIR)/tsc.flat \
$(TEST_DIR)/kvmclock_test.flat  $(TEST_DIR)/eventinj.flat \
$(TEST_DIR)/s3.flat $(TEST_DIR)/pmu.flat \
$(TEST_DIR)/tsc_adjust.flat $(TEST_DIR)/asyncpf.flat \
-   $(TEST_DIR)/init.flat $(TEST_DIR)/smap.flat
+   $(TEST_DIR)/init.flat $(TEST_DIR)/smap.flat \
+   $(TEST_DIR)/pku.flat
 
 ifdef API
 tests-common += api/api-sample
@@ -104,6 +105,8 @@ $(TEST_DIR)/pcid.elf: $(cstart.o) $(TEST_DIR)/pcid.o
 
 $(TEST_DIR)/smap.elf: $(cstart.o) $(TEST_DIR)/smap.o
 
+$(TEST_DIR)/pku.elf: $(cstart.o) $(TEST_DIR)/pku.o
+
 $(TEST_DIR)/vmx.elf: $(cstart.o) $(TEST_DIR)/vmx.o $(TEST_DIR)/vmx_tests.o
 
 $(TEST_DIR)/debug.elf: $(cstart.o) $(TEST_DIR)/debug.o
diff --git a/lib/x86/processor.h b/lib/x86/processor.h
index 7973879..f7aa5ec 100644
--- a/lib/x86/processor.h
+++ b/lib/x86/processor.h
@@ -26,6 +26,7 @@
 #define X86_CR4_PAE0x0020
 #define X86_CR4_PCIDE  0x0002
 #define X86_CR4_SMAP   0x0020
+#define X86_CR4_PKE0x0040
 
 #define X86_IA32_EFER  0xc080
 #define X86_EFER_LMA   (1UL << 8)
diff --git a/x86/pku.c b/x86/pku.c
new file mode 100644
index 000..0e00b99
--- /dev/null
+++ b/x86/pku.c
@@ -0,0 +1,161 @@
+#include "libcflat.h"
+#include "x86/desc.h"
+#include "x86/processor.h"
+#include "x86/vm.h"
+#include "x86/msr.h"
+
+#define X86_FEATURE_PKU  3
+#define CR0_WP_MASK  (1UL << 16)
+#define PTE_PKEY_BIT 59
+#define USER_BASE(1 << 24)
+#define USER_VAR(v)  (*((__typeof__(&(v))) (((unsigned long)&v) + 
USER_BASE)))
+
+volatile int pf_count = 0;
+volatile unsigned save;
+volatile unsigned test;
+
+void set_cr0_wp(int wp)
+{
+unsigned long cr0 = read_cr0();
+
+cr0 &= ~CR0_WP_MASK;
+if (wp)
+cr0 |= CR0_WP_MASK;
+write_cr0(cr0);
+}
+
+static inline u32 read_pkru(void)
+{
+unsigned int eax, edx;
+unsigned int ecx = 0;
+unsigned int pkru;
+
+asm volatile(".byte 0x0f,0x01,0xee\n\t"
+ : "=a" (eax), "=d" (edx)
+ : "c" (ecx));
+pkru = eax;
+return pkru;
+}
+
+static void write_pkru(u32 pkru)
+{
+unsigned int eax = pkru;
+unsigned int ecx = 0;
+unsigned int edx = 0;
+
+asm volatile(".byte 0x0f,0x01,0xef\n\t"
+: : "a" (eax), "c" (ecx), "d" (edx));
+}
+
+void do_pf_tss(unsigned long error_code)
+{
+pf_count++;
+save = test;
+write_pkru(0);
+}
+
+extern void pf_tss(void);
+
+asm ("pf_tss: \n\t"
+#ifdef __x86_64__
+// no task on x86_64, save/restore caller-save regs
+"push %rax; push %rcx; push %rdx; push %rsi; push %rdi\n"
+"push %r8; push %r9; push %r10; push %r11\n"
+#endif
+"call do_pf_tss \n\t"
+#ifdef __x86_64__
+"pop %r11; pop %r10; pop %r9; pop %r8\n"
+"pop %rdi; pop %rsi; pop %rdx; pop %rcx; pop %rax\n"
+#endif
+"add $"S", %"R "sp\n\t" // discard error code
+"iret"W" \n\t"
+"jmp pf_tss\n\t"
+);
+
+static void init_test()
+{
+pf_count = 0;
+
+invlpg(&test);
+invlpg(&USER_VAR(test));
+write_pkru(0);
+set_cr0_wp(0);
+}
+
+int main(int ac, char **av)
+{
+unsigned long i;
+unsigned int pkey = 0x2;
+unsigned int pkru_ad = 0x10;
+unsigned int pkru_wd = 0x20;
+
+if (!(cpuid_indexed(7, 0).c & (1 << X86_FEATURE_PKU))) {
+printf("PKU not enabled, exiting\n");
+exit(1);
+}
+
+setup_vm();
+setup_alt_stack();
+set_intr_alt_stack(14, pf_tss);
+wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_LMA);
+
+for (i = 0; i < USER_BASE; i += PAGE_SIZE) {
+*get_pte(phys_to_virt(read_cr3()), phys_to_virt(i)) &= ~PTE_USER;
+*get_pte(phys_to_virt(read_cr3()), phys_to_virt(i)) |= ((unsigned 
long)pkey << PTE_PKEY_BIT);
+invlpg((void *)i);
+}
+
+for (i = USER_BASE; i < 2 * USER_BASE; i += PAGE_SIZE) {
+*get_pte(phys_to_virt(read_cr3()), phys_to_virt(i)) &= ~USER_BASE;
+*get_pte(phys_to_virt(read_cr3()), phys_to_virt(i)) |= ((unsigned 
long)pkey << PTE_PKEY_BIT);
+invlpg((void *)i);
+}
+
+write_cr4(read_cr4() | X86_CR4_PKE);
+write_cr3(read_cr3());
+
+init_test();
+set_cr0_wp(1);
+write_pkru(pkru_ad);
+test = 21;
+report("write to supervisor page when pkru is ad and wp == 1", pf_count == 
0 && test == 21);
+
+init_test();
+set_cr0_wp(0);
+write_pkru(pkru_ad);
+test = 22;
+report("write to supervisor page when pkru is ad and wp == 0", pf_count == 
0 && test == 22);
+
+init_test();
+set_cr0_wp(1);
+write_pkru(pkru_wd);
+test = 23;
+report("write to supervisor page when pkru is wd and wp == 1", pf_count == 
0 && test == 23);
+
+init_test();
+set_cr0_wp(0);
+write_pkru(pkru_wd);

[PATCH V2 0/7] KVM, pkeys: add memory protection-key support

2015-11-15 Thread Huaitong Han
Changes in v2:
*Add pku.c for kvm-unit-tests.
*Optimize permission_fault codes for patch4.
*Delete is_long_mode and PK for patch5.
*Squash cpuid and cr4 patches.

The protection-key feature provides an additional mechanism by which IA-32e
paging controls access to usermode addresses.

Hardware support for protection keys for user pages is enumerated by the CPUID
feature flag CPUID.7.0.ECX[3]:PKU. Software enablement is reported by
CPUID.7.0.ECX[4]:OSPKE, which reflects the setting of CR4.PKE (bit 22).

When CR4.PKE = 1, every linear address is associated with the 4-bit protection
key located in bits 62:59 of the paging-structure entry that mapped the page
containing the linear address. The PKRU register determines, for each
protection key, whether user-mode addresses with that protection key may be
read or written.

The PKRU register (protection key rights for user pages) is a 32-bit register
with the following format: for each i (0 ≤ i ≤ 15), PKRU[2i] is the
access-disable bit for protection key i (ADi); PKRU[2i+1] is the write-disable
bit for protection key i (WDi).

Software can use the RDPKRU and WRPKRU instructions with ECX = 0 to read and
write PKRU. In addition, the PKRU register is XSAVE-managed state and can thus
be read and written by instructions in the XSAVE feature set.

PFEC.PK (bit 5) is defined to report protection-key violations.

The specification of Protection Keys can be found at SDM (4.6.2, volume 3)
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf.

The native kernel patchset has not yet been merged upstream; you can find it
at git://git.kernel.org/pub/scm/linux/kernel/git/daveh/x86-pkeys.git pkeys-v007.

Huaitong Han (7):
  KVM, pkeys: expose CPUID/CR4 to guest
  KVM, pkeys: disable pkeys for guests in non-paging mode
  KVM, pkeys: update memeory permission bitmask for pkeys
  KVM, pkeys: add pkeys support for permission_fault logic
  KVM, pkeys: Add pkeys support for gva_to_gpa funcions
  KVM, pkeys: add pkeys support for xsave state
  KVM, pkeys: disable PKU feature without ept

 arch/x86/include/asm/kvm_host.h | 11 +---
 arch/x86/kvm/cpuid.c| 23 +++--
 arch/x86/kvm/cpuid.h|  8 ++
 arch/x86/kvm/mmu.c  | 32 +--
 arch/x86/kvm/mmu.h  | 56 +
 arch/x86/kvm/paging_tmpl.h  | 18 ++---
 arch/x86/kvm/vmx.c  | 10 
 arch/x86/kvm/x86.c  | 27 ++--
 arch/x86/kvm/x86.h  |  3 ++-
 9 files changed, 160 insertions(+), 28 deletions(-)

-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[kvm-unit-tests] x86: smap: add smap check to unittests.cfg

2015-11-15 Thread Huaitong Han
Signed-off-by: Huaitong Han 

diff --git a/x86/unittests.cfg b/x86/unittests.cfg
index 14e36a4..6d3dc89 100644
--- a/x86/unittests.cfg
+++ b/x86/unittests.cfg
@@ -72,6 +72,10 @@ groups = vmexit
 file = access.flat
 arch = x86_64
 
+[smap]
+file = smap.flat
+extra_params = -cpu host
+
 [pku]
 file = pku.flat
 arch = x86_64
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2 1/7] KVM, pkeys: expose CPUID/CR4 to guest

2015-11-15 Thread Huaitong Han
This patch exposes CPUID/CR4 to guest.

X86_FEATURE_PKU is referred to as "PKU" in the hardware documentation:
CPUID.7.0.ECX[3]:PKU. X86_FEATURE_OSPKE is software support for pkeys,
enumerated with CPUID.7.0.ECX[4]:OSPKE, and it reflects the setting of
CR4.PKE(bit 22).

Signed-off-by: Huaitong Han 

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c12e845..3bbc1cb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -55,7 +55,8 @@
  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | 
X86_CR4_PCIDE \
  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP))
+ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \
+ | X86_CR4_PKE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 156441b..ece687b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -81,6 +81,17 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
apic->lapic_timer.timer_mode_mask = 1 << 17;
}
 
+   best = kvm_find_cpuid_entry(vcpu, 7, 0);
+   if (!best)
+   return 0;
+
+   /*Update OSPKE bit */
+   if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) {
+   best->ecx &= ~F(OSPKE);
+   if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE))
+   best->ecx |= F(OSPKE);
+   }
+
best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
if (!best) {
vcpu->arch.guest_supported_xcr0 = 0;
@@ -354,6 +365,9 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
const u32 kvm_supported_word10_x86_features =
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
 
+   /* cpuid 7.0.ecx*/
+   const u32 kvm_supported_word11_x86_features = F(PKU) | 0 /*OSPKE*/;
+
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
 
@@ -431,10 +445,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
cpuid_mask(&entry->ebx, 9);
// TSC_ADJUST is emulated
entry->ebx |= F(TSC_ADJUST);
-   } else
+   entry->ecx &= kvm_supported_word11_x86_features;
+   cpuid_mask(&entry->ecx, 13);
+   } else {
entry->ebx = 0;
+   entry->ecx = 0;
+   }
entry->eax = 0;
-   entry->ecx = 0;
entry->edx = 0;
break;
}
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index dd05b9c..7775158 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -70,6 +70,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu 
*vcpu)
return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
 }
 
+static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu)
+{
+   struct kvm_cpuid_entry2 *best;
+
+   best = kvm_find_cpuid_entry(vcpu, 7, 0);
+   return best && (best->ecx & bit(X86_FEATURE_PKU));
+}
+
 static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
 {
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2d4e54d..5181834 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -709,7 +709,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
unsigned long old_cr4 = kvm_read_cr4(vcpu);
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
-  X86_CR4_SMEP | X86_CR4_SMAP;
+  X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
 
if (cr4 & CR4_RESERVED_BITS)
return 1;
@@ -726,6 +726,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
return 1;
 
+   if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE))
+   return 1;
+
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE))
return 1;
@@ -751,7 +754,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
 
-   if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+   if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
kvm_update_cpuid(vcpu);
 
return 0;
@@ -6838,7 +6841,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
-   if (sregs->cr4 & X86_CR4_OSXSAVE

[PATCH V2 7/7] KVM, pkeys: disable PKU feature without ept

2015-11-15 Thread Huaitong Han
This patch disables CPUID:PKU without ept.

Signed-off-by: Huaitong Han 

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ece687b..e1113ae 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -447,6 +447,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
entry->ebx |= F(TSC_ADJUST);
entry->ecx &= kvm_supported_word11_x86_features;
cpuid_mask(&entry->ecx, 13);
+   if (!tdp_enabled)
+   entry->ecx &= ~F(PKU);
} else {
entry->ebx = 0;
entry->ecx = 0;
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2 0/3] target-i386: add memory protection-key support

2015-11-15 Thread Huaitong Han
Changes in v2:
*Fix memcpy error for xsave state.
*Fix TCG_7_0_ECX_FEATURES to 0.
*Make subjects more readable.

The protection-key feature provides an additional mechanism by which IA-32e
paging controls access to usermode addresses.

Hardware support for protection keys for user pages is enumerated by the CPUID
feature flag CPUID.7.0.ECX[3]:PKU. Software enablement is reported by
CPUID.7.0.ECX[4]:OSPKE, which reflects the setting of CR4.PKE (bit 22).

The PKRU register is XSAVE-managed state (CPUID.D.0.EAX[9]); the XSAVE state
component for PKRU is 8 bytes in size and its offset is 0xa80.

The specification of Protection Keys can be found at SDM (4.6.2, volume 3)
http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf.


Huaitong Han (3):
  target-i386: add pkeys support for cpuid handling
  target-i386: add pkeys support for xsave state handling
  target-i386: add pkeys support for vm migration

 target-i386/cpu.c | 23 ++-
 target-i386/cpu.h |  7 +++
 target-i386/kvm.c |  3 +++
 target-i386/machine.c | 23 +++
 4 files changed, 55 insertions(+), 1 deletion(-)

-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2 2/7] KVM, pkeys: disable pkeys for guests in non-paging mode

2015-11-15 Thread Huaitong Han
Pkeys is disabled if the CPU is in non-paging mode in hardware. However, KVM
always uses paging mode to emulate guest non-paging mode with TDP. To
emulate this behavior, pkeys needs to be manually disabled when the guest
switches to non-paging mode.

Signed-off-by: Huaitong Han 

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d019868..9b12c80 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3645,14 +3645,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned 
long cr4)
hw_cr4 &= ~X86_CR4_PAE;
hw_cr4 |= X86_CR4_PSE;
/*
-* SMEP/SMAP is disabled if CPU is in non-paging mode
-* in hardware. However KVM always uses paging mode to
-* emulate guest non-paging mode with TDP.
-* To emulate this behavior, SMEP/SMAP needs to be
+* SMEP/SMAP/PKU is disabled if CPU is in non-paging
+* mode in hardware. However KVM always uses paging
+* mode to emulate guest non-paging mode with TDP.
+* To emulate this behavior, SMEP/SMAP/PKU needs to be
 * manually disabled when guest switches to non-paging
 * mode.
 */
-   hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
+   hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
} else if (!(cr4 & X86_CR4_PAE)) {
hw_cr4 &= ~X86_CR4_PAE;
}
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2 6/7] KVM, pkeys: add pkeys support for xsave state

2015-11-15 Thread Huaitong Han
This patch adds pkeys support for xsave state.

Signed-off-by: Huaitong Han 

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f2afa5f..0f71d5d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -182,7 +182,8 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu 
*vcpu, gfn_t gfn,
 
 #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
-   | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512)
+   | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+   | XFEATURE_MASK_PKRU)
 extern u64 host_xcr0;
 
 extern u64 kvm_supported_xcr0(void);
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2 3/3] target-i386: add pkeys support for vm migration

2015-11-15 Thread Huaitong Han
This patch adds pkeys support for vm migration.

Signed-off-by: Huaitong Han 

diff --git a/target-i386/machine.c b/target-i386/machine.c
index a0df64b..1b190c7 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -725,6 +725,26 @@ static const VMStateDescription vmstate_xss = {
 VMSTATE_END_OF_LIST()
 }
 };
+#ifdef TARGET_X86_64
+static bool pkru_needed(void *opaque)
+{
+X86CPU *cpu = opaque;
+CPUX86State *env = &cpu->env;
+
+return env->pkru != 0;
+}
+
+static const VMStateDescription vmstate_pkru = {
+.name = "cpu/pkru",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = pkru_needed,
+.fields = (VMStateField[]){
+VMSTATE_UINT32(env.pkru, X86CPU),
+VMSTATE_END_OF_LIST()
+}
+};
+#endif
 
 VMStateDescription vmstate_x86_cpu = {
 .name = "cpu",
@@ -844,6 +864,9 @@ VMStateDescription vmstate_x86_cpu = {
 &vmstate_msr_hyperv_time,
 &vmstate_avx512,
 &vmstate_xss,
+#ifdef TARGET_X86_64
+&vmstate_pkru,
+#endif
 NULL
 }
 };
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2 5/7] KVM, pkeys: Add pkeys support for gva_to_gpa funcions

2015-11-15 Thread Huaitong Han
This patch adds pkeys support for the gva_to_gpa functions.

Signed-off-by: Huaitong Han 

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7a84b83..bd942f3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3960,6 +3960,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, 
gva_t gva,
  struct x86_exception *exception)
 {
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+   access |= kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ? PFERR_PK_MASK : 0;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
 
@@ -3976,6 +3977,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, 
gva_t gva,
 {
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
+   access |= kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ? PFERR_PK_MASK : 0;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
 
@@ -4026,10 +4028,13 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt 
*ctxt,
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
+   gpa_t gpa;
+
+   access |= kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ? PFERR_PK_MASK : 0;
 
/* Inline kvm_read_guest_virt_helper for speed.  */
-   gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, 
access|PFERR_FETCH_MASK,
-   exception);
+   gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
+   access | PFERR_FETCH_MASK, exception);
if (unlikely(gpa == UNMAPPED_GVA))
return X86EMUL_PROPAGATE_FAULT;
 
@@ -4050,6 +4055,7 @@ int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 {
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+   access |= kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ? PFERR_PK_MASK : 0;
 
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
  exception);
@@ -4073,9 +4079,13 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt 
*ctxt,
void *data = val;
int r = X86EMUL_CONTINUE;
 
+   u32 access = PFERR_WRITE_MASK;
+
+   access |= kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ? PFERR_PK_MASK : 0;
+
while (bytes) {
gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
-PFERR_WRITE_MASK,
+access,
 exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2 3/7] KVM, pkeys: update memeory permission bitmask for pkeys

2015-11-15 Thread Huaitong Han
Pkeys define a new status bit in the PFEC: PFEC.PK (bit 5). If certain
conditions are true, the fault is considered a PKU violation.

This patch updates the memory permission bitmask for pkeys.

Signed-off-by: Huaitong Han 

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3bbc1cb..8852b9f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -159,12 +159,14 @@ enum {
 #define PFERR_USER_BIT 2
 #define PFERR_RSVD_BIT 3
 #define PFERR_FETCH_BIT 4
+#define PFERR_PK_BIT 5
 
 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
 #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
 #define PFERR_USER_MASK (1U << PFERR_USER_BIT)
 #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
 #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
+#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
 
 /* apic attention bits */
 #define KVM_APIC_CHECK_VAPIC   0
@@ -288,10 +290,12 @@ struct kvm_mmu {
 
/*
 * Bitmap; bit set = permission fault
-* Byte index: page fault error code [4:1]
+* Byte index: page fault error code [5:1]
 * Bit index: pte permissions in ACC_* format
+*
+* Add PFEC.PK (bit 5) for protection-key violations
 */
-   u8 permissions[16];
+   u8 permissions[32];
 
u64 *pae_root;
u64 *lm_root;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 69088a1..0568635 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3793,16 +3793,22 @@ static void update_permission_bitmask(struct kvm_vcpu 
*vcpu,
 {
unsigned bit, byte, pfec;
u8 map;
-   bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0;
+   bool fault, x, w, u, smap = 0, pku = 0;
+   bool wf, uf, ff, smapf, rsvdf, pkuf;
+   bool cr4_smap, cr4_smep, cr4_pku;
 
cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+   cr4_pku = kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
+
for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
pfec = byte << 1;
map = 0;
wf = pfec & PFERR_WRITE_MASK;
uf = pfec & PFERR_USER_MASK;
ff = pfec & PFERR_FETCH_MASK;
+   rsvdf = pfec & PFERR_RSVD_MASK;
+   pkuf = pfec & PFERR_PK_MASK;
/*
 * PFERR_RSVD_MASK bit is set in PFEC if the access is not
 * subject to SMAP restrictions, and cleared otherwise. The
@@ -3841,12 +3847,34 @@ static void update_permission_bitmask(struct kvm_vcpu 
*vcpu,
 *   clearer.
 */
smap = cr4_smap && u && !uf && !ff;
+
+   /*
+   * PKU:additional mechanism by which the paging
+   * controls access to user-mode addresses based
+   * on the value in the PKRU register. A fault is
+   * considered as a PKU violation if all of the
+   * following conditions are true:
+   * 1.CR4_PKE=1.
+   * 2.EFER_LMA=1.
+   * 3.page is present with no reserved bit
+   *   violations.
+   * 4.the access is not an instruction fetch.
+   * 5.the access is to a user page.
+   * 6.PKRU.AD=1
+   *   or The access is a data write and
+   *  PKRU.WD=1 and either CR0.WP=1
+   *  or it is a user access.
+   *
+   * The 2nd and 6th conditions are computed
+   * dynamically in permission_fault.
+   */
+   pku = cr4_pku && !rsvdf && !ff && u;
} else
/* Not really needed: no U/S accesses on ept  */
u = 1;
 
fault = (ff && !x) || (uf && !u) || (wf && !w) ||
-   (smapf && smap);
+   (smapf && smap) || (pkuf && pku);
map |= fault << bit;
}
mmu->permissions[byte] = map;
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2 1/3] target-i386: add pkeys support for cpuid handling

2015-11-15 Thread Huaitong Han
This patch adds pkeys support for cpuid handling.

Signed-off-by: Huaitong Han 

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 4d1b085..2ff73ee 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -264,6 +264,17 @@ static const char *cpuid_7_0_ebx_feature_name[] = {
 NULL, NULL, "avx512pf", "avx512er", "avx512cd", NULL, NULL, NULL,
 };
 
+static const char *cpuid_7_0_ecx_feature_name[] = {
+NULL, NULL, "pku", "ospke",
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+};
+
 static const char *cpuid_apm_edx_feature_name[] = {
 NULL, NULL, NULL, NULL,
 NULL, NULL, NULL, NULL,
@@ -351,6 +362,7 @@ static const char *cpuid_6_feature_name[] = {
   CPUID_7_0_EBX_FSGSBASE, CPUID_7_0_EBX_HLE, CPUID_7_0_EBX_AVX2,
   CPUID_7_0_EBX_ERMS, CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM,
   CPUID_7_0_EBX_RDSEED */
+#define TCG_7_0_ECX_FEATURES 0
 #define TCG_APM_FEATURES 0
 #define TCG_6_EAX_FEATURES CPUID_6_EAX_ARAT
 
@@ -408,6 +420,13 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 .cpuid_reg = R_EBX,
 .tcg_features = TCG_7_0_EBX_FEATURES,
 },
+[FEAT_7_0_ECX] = {
+.feat_names = cpuid_7_0_ecx_feature_name,
+.cpuid_eax = 7,
+.cpuid_needs_ecx = true, .cpuid_ecx = 0,
+.cpuid_reg = R_ECX,
+.tcg_features = TCG_7_0_ECX_FEATURES,
+},
 [FEAT_8000_0007_EDX] = {
 .feat_names = cpuid_apm_edx_feature_name,
 .cpuid_eax = 0x8007,
@@ -2401,7 +2420,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 if (count == 0) {
 *eax = 0; /* Maximum ECX value for sub-leaves */
 *ebx = env->features[FEAT_7_0_EBX]; /* Feature flags */
-*ecx = 0; /* Reserved */
+*ecx = env->features[FEAT_7_0_ECX]; /* Feature flags */
 *edx = 0; /* Reserved */
 } else {
 *eax = 0;
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index ead2832..c2e7501 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -408,6 +408,7 @@ typedef enum FeatureWord {
 FEAT_1_EDX, /* CPUID[1].EDX */
 FEAT_1_ECX, /* CPUID[1].ECX */
 FEAT_7_0_EBX,   /* CPUID[EAX=7,ECX=0].EBX */
+FEAT_7_0_ECX,   /* CPUID[EAX=7,ECX=0].ECX */
 FEAT_8000_0001_EDX, /* CPUID[8000_0001].EDX */
 FEAT_8000_0001_ECX, /* CPUID[8000_0001].ECX */
 FEAT_8000_0007_EDX, /* CPUID[8000_0007].EDX */
@@ -576,6 +577,9 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 #define CPUID_7_0_EBX_AVX512ER (1U << 27) /* AVX-512 Exponential and 
Reciprocal */
 #define CPUID_7_0_EBX_AVX512CD (1U << 28) /* AVX-512 Conflict Detection */
 
+#define CPUID_7_0_ECX_PKU  (1U << 3)
+#define CPUID_7_0_ECX_OSPKE(1U << 4)
+
 #define CPUID_XSAVE_XSAVEOPT   (1U << 0)
 #define CPUID_XSAVE_XSAVEC (1U << 1)
 #define CPUID_XSAVE_XGETBV1(1U << 2)
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2 2/3] target-i386: add pkeys support for xsave state handling

2015-11-15 Thread Huaitong Han
This patch adds pkeys support for xsave state handling.

Signed-off-by: Huaitong Han 

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 2ff73ee..f65f785 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -487,6 +487,8 @@ static const ExtSaveArea ext_save_areas[] = {
 .offset = 0x480, .size = 0x200 },
 [7] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_AVX512F,
 .offset = 0x680, .size = 0x400 },
+[9] = { .feature = FEAT_7_0_ECX, .bits = CPUID_7_0_ECX_PKU,
+.offset = 0xA80, .size = 0x8 },
 };
 
 const char *get_register_name_32(unsigned int reg)
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index c2e7501..2230b3e 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -401,6 +401,7 @@
 #define XSTATE_OPMASK   (1ULL << 5)
 #define XSTATE_ZMM_Hi256(1ULL << 6)
 #define XSTATE_Hi16_ZMM (1ULL << 7)
+#define XSTATE_PKRU (1ULL << 9)
 
 
 /* CPUID feature words */
@@ -984,6 +985,8 @@ typedef struct CPUX86State {
 uint64_t xcr0;
 uint64_t xss;
 
+uint32_t pkru;
+
 TPRAccess tpr_access_type;
 } CPUX86State;
 
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 066d03d..16a8eff 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1092,6 +1092,7 @@ static int kvm_put_fpu(X86CPU *cpu)
 #define XSAVE_OPMASK  272
 #define XSAVE_ZMM_Hi256   288
 #define XSAVE_Hi16_ZMM416
+#define XSAVE_PKRU672
 
 static int kvm_put_xsave(X86CPU *cpu)
 {
@@ -1145,6 +1146,7 @@ static int kvm_put_xsave(X86CPU *cpu)
 #ifdef TARGET_X86_64
 memcpy(&xsave->region[XSAVE_Hi16_ZMM], &env->xmm_regs[16],
 16 * sizeof env->xmm_regs[16]);
+memcpy(&xsave->region[XSAVE_PKRU], &env->pkru, sizeof env->pkru);
 #endif
 r = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
 return r;
@@ -1516,6 +1518,7 @@ static int kvm_get_xsave(X86CPU *cpu)
 #ifdef TARGET_X86_64
 memcpy(&env->xmm_regs[16], &xsave->region[XSAVE_Hi16_ZMM],
16 * sizeof env->xmm_regs[16]);
+memcpy(&env->pkru, &xsave->region[XSAVE_PKRU], sizeof env->pkru);
 #endif
 return 0;
 }
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2 4/7] KVM, pkeys: add pkeys support for permission_fault logic

2015-11-15 Thread Huaitong Han
Protection keys define a new 4-bit protection key field (PKEY) in bits
62:59 of leaf entries of the page tables. The PKEY is an index into the PKRU
register (16 domains); every domain has 2 bits (a write-disable bit and an
access-disable bit).

The static logic is produced in update_permission_bitmask; the dynamic logic
needs to read the pkey from the page table entry, get the PKRU value, and
deduce the correct result.

Signed-off-by: Huaitong Han 

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e4202e4..c76e744 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -3,6 +3,7 @@
 
 #include 
 #include "kvm_cache_regs.h"
+#include "x86.h"
 
 #define PT64_PT_BITS 9
 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -24,6 +25,7 @@
 #define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
 #define PT_PAT_MASK (1ULL << 7)
 #define PT_GLOBAL_MASK (1ULL << 8)
+
 #define PT64_NX_SHIFT 63
 #define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
 
@@ -45,6 +47,10 @@
 #define PT_PAGE_TABLE_LEVEL 1
 #define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
 
+#define PKRU_READ   0
+#define PKRU_WRITE  1
+#define PKRU_ATTRS  2
+
 static inline u64 rsvd_bits(int s, int e)
 {
return ((1ULL << (e - s + 1)) - 1) << s;
@@ -145,10 +151,50 @@ static inline bool is_write_protection(struct kvm_vcpu 
*vcpu)
  * fault with the given access (in ACC_* format)?
  */
 static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-   unsigned pte_access, unsigned pfec)
+   unsigned pte_access, unsigned pte_pkeys, unsigned pfec)
 {
-   int cpl = kvm_x86_ops->get_cpl(vcpu);
-   unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+   unsigned long smap, rflags;
+   u32 pkru, pkru_bits;
+   int cpl, index;
+   bool wf, uf;
+
+   cpl = kvm_x86_ops->get_cpl(vcpu);
+   rflags = kvm_x86_ops->get_rflags(vcpu);
+
+   /*
+   * PKU is computed dynamically in permission_fault.
+   * 2nd and 6th conditions:
+   * 2.EFER_LMA=1
+   * 6.PKRU.AD=1
+   *   or The access is a data write and PKRU.WD=1 and
+   *  either CR0.WP=1 or it is a user mode access
+   */
+   pkru = is_long_mode(vcpu) ? read_pkru() : 0;
+   if (unlikely(pkru) && (pfec & PFERR_PK_MASK))
+   {
+   /*
+   * PKRU defines 32 bits, there are 16 domains and 2 attribute 
bits per
+   * domain in pkru, pkey is the index to a defined domain, so the 
value
+   * of pkey * PKRU_ATTRS is offset of a defined domain.
+   */
+   pkru_bits = (pkru >> (pte_pkeys * PKRU_ATTRS)) & 3;
+
+   wf = pfec & PFERR_WRITE_MASK;
+   uf = pfec & PFERR_USER_MASK;
+
+   /*
+   * Ignore PKRU.WD if not relevant to this access (a read,
+   * or a supervisor mode access if CR0.WP=0).
+   * So 6th conditions is equivalent to "pkru_bits != 0"
+   */
+   if (!wf || (!uf && !is_write_protection(vcpu)))
+   pkru_bits &= ~(1 << PKRU_WRITE);
+
+   /* Flip pfec on PK bit if pkru_bits is zero */
+   pfec ^= pkru_bits ? 0 : PFERR_PK_MASK;
+   }
+   else
+   pfec &= ~PFERR_PK_MASK;
 
/*
 * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1.
@@ -163,8 +209,8 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, 
struct kvm_mmu *mmu,
 * but it will be one in index if SMAP checks are being overridden.
 * It is important to keep this branchless.
 */
-   unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
-   int index = (pfec >> 1) +
+   smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
+   index = (pfec >> 1) +
(smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
 
WARN_ON(pfec & PFERR_RSVD_MASK);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 736e6ab..02daa97 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -253,6 +253,15 @@ static int FNAME(update_accessed_dirty_bits)(struct 
kvm_vcpu *vcpu,
}
return 0;
 }
+static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+   unsigned pkeys = 0;
+#if PTTYPE == 64
+   pte_t pte = {.pte = gpte};
+   pkeys = pte_pkey(pte);
+#endif
+   return pkeys;
+}
 
 /*
  * Fetch a guest pte for a guest virtual address
@@ -265,12 +274,13 @@ static int FNAME(walk_addr_generic)(struct guest_walker 
*walker,
pt_element_t pte;
pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
-   unsigned index, pt_access, pte_access, accessed_dirty;
+   unsigned index, pt_access, pte_access, accessed_dirty, pte_pkeys;
gpa_t pte_gpa;
int offset;
const int write_fault = access & PFERR_WRITE_MASK;
const int user_fault  = access & PFERR_USER_MASK;
const int fetch

Re: [PATCH v1 2/2] dma-mapping-common: add DMA attribute - DMA_ATTR_IOMMU_BYPASS

2015-11-15 Thread Benjamin Serebrin
We looked into Intel IOMMU performance a while ago and learned a few
useful things.  We generally did a parallel 200 thread TCP_RR test, as
this churns through mappings quickly and uses all available cores.

First, the main bottleneck was software performance[1].  This study
preceded the recent patch to break the locks into pools ("Break up
monolithic iommu table/lock into finer graularity pools and lock").
There were several points of lock contention:
- the RB tree is per device (and in the network test, there's one
device).  Every dma_map and unmap holds the lock.
- the RB tree lock is held during invalidations as well.  There's a
250-entry queue for invalidations that doesn't do any batching
intelligence (for example, promote to larger-range invalidations,
flush entire device, etc).  RB tree locks may be held while waiting
for invalidation drains.  Invalidations have even worse behavior with
ATS enabled for a given device.
- the RB tree has one entry per dma_map call (that entry is deleted by
the corresponding dma_unmap).  If we had merged all adjacent entries
when we could, we would have not lost any information that's actually
used by the code today.  (There could be a check that a dma_unmap
actually covers the entire region that was mapped, but there isn't.)
At boot (without network traffic), two common NICs' drivers show tens
of thousands of static dma_maps that never go away; this means the RB
tree is ~14-16 levels deep.  A rbtree walk (holding that big lock) has
a 14-16 level pointer chase through mostly cache-cold entries.  I
wrote a modification to the RB tree handling that merges nodes that
represent abutting IOVA ranges (and unmerges them on dma_unmap), and
the same drivers created around 7 unique entries.  Steady state grew
to a few hundreds and maybe a thousand, but the fragmentation didn't
get worse than that.  This optimization got about a third of the
performance back.

Omer's paper 
(https://www.usenix.org/system/files/conference/atc15/atc15-paper-peleg.pdf)
has some promising approaches.  The magazine avoids the RB tree issue.

I'm interested in seeing if the dynamic 1:1 with a mostly-lock-free
page table cleanup algorithm could do well.

There are correctness fixes and optimizations left in the invalidation
path: I want strict-ish semantics (a page doesn't go back into the
freelist until the last IOTLB/IOMMU TLB entry is invalidated) with
good performance, and that seems to imply that an additional page
reference should be gotten at dma_map time and put back at the
completion of the IOMMU flush routine.  (This is worthy of much
discussion.)

Additionally, we can find ways to optimize the flush routine by
realizing that if we have frequent maps and unmaps, it may be because
the device creates and destroys buffers a lot; these kinds of workloads
use an IOVA for one event and then never come back.  Maybe TLBs don't
do much good and we could just flush the entire IOMMU TLB [and ATS
cache] for that BDF.

We'll try to get free time to do some of these things soon.

Ben


1: We verified that the IOMMU costs are almost entirely software
overheads by forcing software 1:1 mode, where we create page tables
for all physical addresses.  We tested using leaf nodes of size 4KB,
of 2MB, and of 1GB.  In all cases, there is zero runtime maintenance
of the page tables, and no IOMMU invalidations.  We did piles of DMA
maximizing x16 PCIe bandwidth on multiple lanes, to random DRAM
addresses.  At 4KB page size, we could see some bandwidth slowdown,
but at 2MB and 1GB, there was < 1% performance loss as compared with
IOMMU off.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] KVM: x86: Add lowest-priority support for vt-d posted-interrupts

2015-11-15 Thread Wu, Feng
Hi Paolo,

Any comments about this patch? Thanks in advance!

Thanks,
Feng

> -Original Message-
> From: Wu, Feng
> Sent: Monday, November 9, 2015 10:47 AM
> To: pbonz...@redhat.com
> Cc: kvm@vger.kernel.org; linux-ker...@vger.kernel.org; Wu, Feng
> 
> Subject: [PATCH] KVM: x86: Add lowest-priority support for vt-d posted-
> interrupts
> 
> Use vector-hashing to handle lowest-priority interrupts for
> posted-interrupts. As an example, modern Intel CPUs use this
> method to handle lowest-priority interrupts.
> 
> Signed-off-by: Feng Wu 
> ---
>  arch/x86/include/asm/kvm_host.h |  2 ++
>  arch/x86/kvm/irq_comm.c | 52
> +
>  arch/x86/kvm/lapic.c| 57
> +
>  arch/x86/kvm/lapic.h|  2 ++
>  arch/x86/kvm/vmx.c  | 14 --
>  5 files changed, 125 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h
> b/arch/x86/include/asm/kvm_host.h
> index 9265196..e225106 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1258,6 +1258,8 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
> 
>  bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
>struct kvm_vcpu **dest_vcpu);
> +struct kvm_vcpu *kvm_intr_vector_hashing_dest(struct kvm *kvm,
> +   struct kvm_lapic_irq *irq);
> 
>  void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
>struct kvm_lapic_irq *irq);
> diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
> index 84b96d3..8156e45 100644
> --- a/arch/x86/kvm/irq_comm.c
> +++ b/arch/x86/kvm/irq_comm.c
> @@ -266,6 +266,58 @@ out:
>   return r;
>  }
> 
> +/*
> + * This routine handles lowest-priority interrupts using vector-hashing
> + * mechanism. As an example, modern Intel CPUs use this method to
> handle
> + * lowest-priority interrupts.
> + *
> + * Here is the details about the vector-hashing mechanism:
> + * 1. For lowest-priority interrupts, store all the possible destination
> + *vCPUs in an array.
> + * 2. Use "guest vector % max number of destination vCPUs" to find the
> right
> + *destination vCPU in the array for the lowest-priority interrupt.
> + */
> +struct kvm_vcpu *kvm_intr_vector_hashing_dest(struct kvm *kvm,
> +   struct kvm_lapic_irq *irq)
> +
> +{
> + unsigned long
> dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
> + unsigned int dest_vcpus = 0;
> + struct kvm_vcpu *vcpu;
> + unsigned int i, mod, idx = 0;
> +
> + vcpu = kvm_intr_vector_hashing_dest_fast(kvm, irq);
> + if (vcpu)
> + return vcpu;
> +
> + memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (!kvm_apic_present(vcpu))
> + continue;
> +
> + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
> + irq->dest_id, irq->dest_mode))
> + continue;
> +
> + __set_bit(vcpu->vcpu_id, dest_vcpu_bitmap);
> + dest_vcpus++;
> + }
> +
> + if (dest_vcpus == 0)
> + return NULL;
> +
> + mod = irq->vector % dest_vcpus;
> +
> + for (i = 0; i <= mod; i++) {
> + idx = find_next_bit(dest_vcpu_bitmap, KVM_MAX_VCPUS,
> idx) + 1;
> + BUG_ON(idx >= KVM_MAX_VCPUS);
> + }
> +
> + return kvm_get_vcpu(kvm, idx - 1);
> +}
> +EXPORT_SYMBOL_GPL(kvm_intr_vector_hashing_dest);
> +
>  bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
>struct kvm_vcpu **dest_vcpu)
>  {
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index ecd4ea1..4937aa4 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -816,6 +816,63 @@ out:
>   return ret;
>  }
> 
> +struct kvm_vcpu *kvm_intr_vector_hashing_dest_fast(struct kvm *kvm,
> +struct kvm_lapic_irq *irq)
> +{
> + struct kvm_apic_map *map;
> + struct kvm_vcpu *vcpu = NULL;
> +
> + if (irq->shorthand)
> + return NULL;
> +
> + rcu_read_lock();
> + map = rcu_dereference(kvm->arch.apic_map);
> +
> + if (!map)
> + goto out;
> +
> + if ((irq->dest_mode != APIC_DEST_PHYSICAL) &&
> + kvm_lowest_prio_delivery(irq)) {
> + u16 cid;
> + int i, idx = 0;
> + unsigned long bitmap = 1;
> + unsigned int mod, dest_vcpus = 0;
> + struct kvm_lapic **dst = NULL;
> +
> +
> + if (!kvm_apic_logical_map_valid(map))
> + goto out;
> +
> + apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
> +
> + if (cid >= ARRAY_SIZE(map->logical_map))
> + goto out;
> +
> + dst = map->logical_map

Re: [PATCH 06/10] KVM: x86: MMU: Consolidate WARN_ON/BUG_ON checks for reverse-mapped sptes

2015-11-15 Thread Takuya Yoshikawa

On 2015/11/14 7:08, Marcelo Tosatti wrote:

On Thu, Nov 12, 2015 at 08:53:43PM +0900, Takuya Yoshikawa wrote:

At some call sites of rmap_get_first() and rmap_get_next(), BUG_ON is
placed right after the call to detect unrelated sptes which must not be
found in the reverse-mapping list.

Move this check in rmap_get_first/next() so that all call sites, not
just the users of the for_each_rmap_spte() macro, will be checked the
same way.  In addition, change the BUG_ON to WARN_ON since killing the
whole host is the last thing that KVM should try.


It should be a BUG_ON, if KVM continues it will corrupt (more) memory.


In the sense that we cannot predict what kind of corruption it will
cause, I agree with you.

But if it can only corrupt that guest's memory, it is a bit sad to
kill unrelated guests, and host, too.  Anyway, since we cannot say
for sure what a possible bug can cause, I agree with you now.

Thanks,
  Takuya

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM: powerpc: kvmppc_visible_gpa can be boolean

2015-11-15 Thread Yaowei Bai
In another patch kvm_is_visible_gfn is made to return bool because this
function only returns zero or one as its return value; let's also make
kvmppc_visible_gpa return bool to stay consistent.

No functional change.

Signed-off-by: Yaowei Bai 
---
 arch/powerpc/kvm/book3s_pr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 64891b0..70fb08d 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -512,7 +512,7 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct 
kvmppc_pte *pte)
put_page(hpage);
 }
 
-static int kvmppc_visible_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+static bool kvmppc_visible_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
 {
ulong mp_pa = vcpu->arch.magic_page_pa;
 
@@ -521,7 +521,7 @@ static int kvmppc_visible_gpa(struct kvm_vcpu *vcpu, gpa_t 
gpa)
 
gpa &= ~0xFFFULL;
if (unlikely(mp_pa) && unlikely((mp_pa & KVM_PAM) == (gpa & KVM_PAM))) {
-   return 1;
+   return true;
}
 
return kvm_is_visible_gfn(vcpu->kvm, gpa >> PAGE_SHIFT);
-- 
1.9.1



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 05/10] KVM: x86: MMU: Use for_each_rmap_spte macro instead of pte_list_walk()

2015-11-15 Thread Takuya Yoshikawa

On 2015/11/14 18:20, Marcelo Tosatti wrote:


The actual issue is this: a higher level page that had, under its children,
no out of sync pages, now has, due to your addition, a child that is unsync:

initial state:
level1

final state:

level1 -x-> level2 -x-> level3

Where -x-> are the links created by this pagefault fixing round.

If _any_ page under you is unsync (not necessarily the ones this
pagefault is accessing), you have to mark parents unsync.


I understand this, but I don't think my patch will break this.

What kvm_mmu_mark_parents_unsync() does is:

  for each p_i in sp->parent_ptes rmap chain
mark_unsync(p_i);

Then, mark_unsync() finds the parent sp including that p_i to
set ->unsync_child_bitmap and increment ->unsync_children if
necessary.  It may also call kvm_mmu_mark_parents_unsync()
recursively.

I understand we need to tell the parents "you have an unsync
child/descendant" until this information reaches the top level
by that recursive calls.

But since these recursive calls cannot come back to the starting sp,
the child->parent graph has no loop, each mark_unsync(p_i) will not
be affected by other parents in that sp->parent_ptes rmap chain,
from which we started the recursive calls.


As the following code shows, my patch does mark_unsync(parent_pte)
separately, and then mmu_page_add_parent_pte(vcpu, sp, parent_pte):


-   } else if (sp->unsync)
+   if (parent_pte)
+   mark_unsync(parent_pte);
+   } else if (sp->unsync) {
kvm_mmu_mark_parents_unsync(sp);
+   if (parent_pte)
+   mark_unsync(parent_pte);
+   }
+   mmu_page_add_parent_pte(vcpu, sp, parent_pte);


So, as you worried, while each mark_unsync(p_i) is being processed,
this parent_pte does not exist in that sp->parent_ptes rmap chain.

But as I explained above, this does not change anything about what
each mark_unsync(p_i) call does, so it keeps the original behaviour.


By the way, I think "kvm_mmu_mark_parents_unsync" and "mark_unsync"
do not describe well what they actually do. When I first saw the names,
I thought they would just set the parents' sp->unsync.

To reflect the following meaning better, it should be
propagate_unsync(_to_parents) or something:

  Tell the parents "you have an unsync child/descendant"
  until this unsync information reaches the top level


Thanks,
  Takuya


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] KVM: kvm_is_visible_gfn can be boolean

2015-11-15 Thread Yaowei Bai
On Sat, Nov 14, 2015 at 06:07:51PM +0800, Amos Jianjun Kong wrote:
> On Sat, Nov 14, 2015 at 11:21 AM, Yaowei Bai
>  wrote:
> > This patch makes kvm_is_visible_gfn return bool due to this particular
> > function only using either one or zero as its return value.
> >
> > No functional change.
> >
> > Signed-off-by: Yaowei Bai 
> 
> Hi Yaowei,

Hi,

> 
> > ---
> >  include/linux/kvm_host.h | 2 +-
> >  virt/kvm/kvm_main.c  | 6 +++---
> >  2 files changed, 4 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 5706a21..4436539 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -623,7 +623,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct 
> > gfn_to_hva_cache *ghc,
> >  int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
> >  int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
> >  struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
> > -int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
> > +bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
> >  unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
> >  void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
> >
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 484079e..73cbb41 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -1164,15 +1164,15 @@ struct kvm_memory_slot 
> > *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn
> > return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
> >  }
> >
> > -int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
> > +bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
> >  {
> > struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
> >
> > if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
> >   memslot->flags & KVM_MEMSLOT_INVALID)
> > -   return 0;
> > +   return false;
> >
> > -   return 1;
> > +   return true;
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
> 
> 
> kvm_is_visible_gfn() is also used in arch/powerpc/kvm/book3s_pr.c:
> 
> static int kvmppc_visible_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
> {
> ..
> if (unlikely(mp_pa) && unlikely((mp_pa & KVM_PAM) == (gpa & 
> KVM_PAM))) {
> return 1;
> }
> 
> return kvm_is_visible_gfn(vcpu->kvm, gpa >> PAGE_SHIFT);
> }
> 
> Do we still need to update that function?

OK, I'll send another patch to update it, thanks.

> 
> Thanks, Amos


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 107561] 4.2 breaks PCI passthrough in QEMU/KVM

2015-11-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=107561

bre...@gmail.com changed:

   What|Removed |Added

 CC||bre...@gmail.com

--- Comment #3 from bre...@gmail.com ---
Issue is still present in 4.4-rc1 and same conditions exist. After lowering it
below 2.5G of ram, the VM boots normally. Anything over 2.5G will cause it to
boot really slow and utilize a lot of cpu.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v3 1/3] target-i386: add a subsection for migrating vcpu's TSC rate

2015-11-15 Thread Haozhong Zhang
On 11/13/15 13:21, Eduardo Habkost wrote:
> On Fri, Nov 13, 2015 at 10:23:54AM +0800, Haozhong Zhang wrote:
> > On 11/11/15 22:27, Haozhong Zhang wrote:
> > > On 11/11/15 12:16, Eduardo Habkost wrote:
> > [...]
> > > > > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> > > > > index 2f8f396..858ed69 100644
> > > > > --- a/hw/i386/pc_q35.c
> > > > > +++ b/hw/i386/pc_q35.c
> > > > > @@ -385,6 +385,7 @@ static void 
> > > > > pc_q35_2_4_machine_options(MachineClass *m)
> > > > >  pc_q35_2_5_machine_options(m);
> > > > >  m->alias = NULL;
> > > > >  pcmc->broken_reserved_end = true;
> > > > > +pcmc->save_tsc_khz = false;
> > > > 
> > > > I had suggested the PCMachineClass field, but now I've been thinking:
> > > > all other fields related to tsc_khz are in X86CPU, so I believe this
> > > > belongs to X86CPU too. It could be a simple X86CPU property set by
> > > > PC_COMPAT_2_4.
> > > >
> > > 
> > > Reasonable, will update in the next version.
> > 
> > Or maybe no ...
> > 
> > I think there is still a problem to set a X86CPU property in
> > PC_COMPAT_2_4:
> > 
> > if I create a property for save_tsc_khz by adding
> >   DEFINE_PROP_BOOL("save-tsc-freq", X86CPU, save_tsc_khz, true)
> > in x86_cpu_properties and add
> >   {
> >   .driver   = TYPE_X86_CPU,
> >   .property = "save-tsc-freq",
> >   .value= "off",
> >   }
> > in PC_COMPAT_2_4, then "save-tsc-freq" will also become a
> > user-visible cpu option. But we agreed on keeping it as an
> > internal flag in the previous discussion.
> > 
> > Any other ways to set a property in PC_COMPAT_* while keeping that
> > property internal?
> 
> I don't think making it internal is a requirement. It just make
> things simpler because it allowed us to postpone decisions about
> the user-visible parts.
> 
> ...which seems to be a good reason to keep it on PCMachineClass
> by now, if you prefer it that way. The subsection code is already
> on machine.c and not on cpu.c, anyway.
>

Thanks, I'll keep it in PCMachineClass in the next version.

Haozhong
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 05/10] KVM: x86: MMU: Use for_each_rmap_spte macro instead of pte_list_walk()

2015-11-15 Thread Marcelo Tosatti
On Fri, Nov 13, 2015 at 07:47:28PM -0200, Marcelo Tosatti wrote:
> On Thu, Nov 12, 2015 at 08:52:45PM +0900, Takuya Yoshikawa wrote:
> > kvm_mmu_mark_parents_unsync() alone uses pte_list_walk(), which does
> > nearly the same as the for_each_rmap_spte macro.  The only difference
> > is that is_shadow_present_pte() checks cannot be placed there because
> > kvm_mmu_mark_parents_unsync() can be called with a new parent pointer
> > whose entry is not set yet.
> > 
> > By calling mark_unsync() separately for the parent and adding the parent
> > pointer to the parent_ptes chain later in kvm_mmu_get_page(), the macro
> > works with no problem.
> > 
> > Signed-off-by: Takuya Yoshikawa 
> > ---
> >  arch/x86/kvm/mmu.c | 36 +---
> >  1 file changed, 13 insertions(+), 23 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index e8cfdc4..1691171 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -1007,26 +1007,6 @@ static void pte_list_remove(u64 *spte, unsigned long 
> > *pte_list)
> > }
> >  }
> >  
> > -typedef void (*pte_list_walk_fn) (u64 *spte);
> > -static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
> > -{
> > -   struct pte_list_desc *desc;
> > -   int i;
> > -
> > -   if (!*pte_list)
> > -   return;
> > -
> > -   if (!(*pte_list & 1))
> > -   return fn((u64 *)*pte_list);
> > -
> > -   desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> > -   while (desc) {
> > -   for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
> > -   fn(desc->sptes[i]);
> > -   desc = desc->more;
> > -   }
> > -}
> > -
> >  static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
> > struct kvm_memory_slot *slot)
> >  {
> > @@ -1741,7 +1721,12 @@ static struct kvm_mmu_page 
> > *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
> >  static void mark_unsync(u64 *spte);
> >  static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
> >  {
> > -   pte_list_walk(&sp->parent_ptes, mark_unsync);
> > +   u64 *sptep;
> > +   struct rmap_iterator iter;
> > +
> > +   for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
> > +   mark_unsync(sptep);
> > +   }
> >  }
> >  
> >  static void mark_unsync(u64 *spte)
> > @@ -2111,12 +2096,17 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
> > kvm_vcpu *vcpu,
> 
> Faulting a spte, and one of the levels of sptes, either
> 
> 
>   spte-1
>   spte-2
>   spte-3
> 
> has present bit clear. So we're searching for a guest page to shadow, with
> gfn "gfn".
> 
> > if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
> > break;
> 
> If a shadow for gfn exists, but is unsync, sync guest-page ---to--> kvm
> sptes.
> 
> > -   mmu_page_add_parent_pte(vcpu, sp, parent_pte);
> 
> add "gfn" (actually its "struct kvm_mmu_page *sp" pointer) to
> the parent.
> > if (sp->unsync_children) {
> > kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
> > kvm_mmu_mark_parents_unsync(sp);
> 
> kvm_mmu_mark_parents_unsync relied on the links from current level all
> the way to top level to mark all levels unsync, so that on guest entry,
> KVM_REQ_MMU_SYNC is processed and any level is brought from guest -->
> kvm pages. This now fails, because you removed "mmu_page_add_parent_pte"
> (the link is not formed all the way to root).
> 
> Unless I am missing something, this is not correct.

The actual issue is this: a higher level page that had, under its children,
no out of sync pages, now has, due to your addition, a child that is unsync:

initial state:
level1 

final state:

level1 -x-> level2 -x-> level3

Where -x-> are the links created by this pagefault fixing round.

If _any_ page under you is unsync (not necessarily the ones this
pagefault is accessing), you have to mark parents unsync.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 107921] Regression with KVM virtual machine using VFIO for PCI passthrough.

2015-11-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=107921

--- Comment #8 from Jasen Borisov  ---
Created attachment 193121
  --> https://bugzilla.kernel.org/attachment.cgi?id=193121&action=edit
`lspci -vv` output from the host system

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 107921] Regression with KVM virtual machine using VFIO for PCI passthrough.

2015-11-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=107921

--- Comment #7 from Jasen Borisov  ---
Created attachment 193111
  --> https://bugzilla.kernel.org/attachment.cgi?id=193111&action=edit
libvirt virtual machine domain configuration

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 107921] Regression with KVM virtual machine using VFIO for PCI passthrough.

2015-11-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=107921

--- Comment #6 from Jasen Borisov  ---
Created attachment 193101
  --> https://bugzilla.kernel.org/attachment.cgi?id=193101&action=edit
`dmesg` output from 4.3.0

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 107921] Regression with KVM virtual machine using VFIO for PCI passthrough.

2015-11-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=107921

--- Comment #5 from Jasen Borisov  ---
Created attachment 193091
  --> https://bugzilla.kernel.org/attachment.cgi?id=193091&action=edit
`dmesg` output from 4.2.6

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 107921] Regression with KVM virtual machine using VFIO for PCI passthrough.

2015-11-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=107921

--- Comment #4 from Jasen Borisov  ---
Created attachment 193081
  --> https://bugzilla.kernel.org/attachment.cgi?id=193081&action=edit
`dmesg` output from 4.1.13.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 107921] Regression with KVM virtual machine using VFIO for PCI passthrough.

2015-11-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=107921

--- Comment #3 from Jasen Borisov  ---
Created attachment 193071
  --> https://bugzilla.kernel.org/attachment.cgi?id=193071&action=edit
Kernel config for my 4.3.0 test build.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 107921] Regression with KVM virtual machine using VFIO for PCI passthrough.

2015-11-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=107921

--- Comment #2 from Jasen Borisov  ---
Created attachment 193061
  --> https://bugzilla.kernel.org/attachment.cgi?id=193061&action=edit
Kernel config for my 4.2.6 test build.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 107921] Regression with KVM virtual machine using VFIO for PCI passthrough.

2015-11-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=107921

--- Comment #1 from Jasen Borisov  ---
Created attachment 193051
  --> https://bugzilla.kernel.org/attachment.cgi?id=193051&action=edit
Kernel config for my 4.1.13 test build.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 107921] New: Regression with KVM virtual machine using VFIO for PCI passthrough.

2015-11-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=107921

Bug ID: 107921
   Summary: Regression with KVM virtual machine using VFIO for PCI
passthrough.
   Product: Virtualization
   Version: unspecified
Kernel Version: 4.2
  Hardware: x86-64
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: high
  Priority: P1
 Component: kvm
  Assignee: virtualization_...@kernel-bugs.osdl.org
  Reporter: tajj...@gmail.com
Regression: No

KVM-based virtual machine using host PCIe devices via VFIO hangs in kernels 4.2
and later. Works correctly (guest boots and is fully usable) in 4.1.x kernels.
The hang happens soon after the virtual machine is started: during the
TianoCore firmware boot splash, before the guest OS even starts booting.

I have a KVM virtual machine which uses three of my host PCIe devices via VFIO:
my graphics card, its audio component, and an XHCI USB controller.

The virtual machine works flawlessly with the 4.1.x kernel series. However,
when I upgraded to 4.2, the virtual machine started to hang during boot,
becoming unusable.

I tried removing various combinations of VFIO PCI devices from my virtual
machine configuration (only passing through the XHCI controller, only the
graphics card, etc), and the problem persisted, so I know it is not caused by
any one particular device I am trying to pass through.

The guest OS in the virtual machine is Windows 10, but that should not make any
difference with regards to this bug, as the hang happens earlier during the VM
startup (before Windows even starts booting).

I am using pure UEFI firmware (no CSM) on both the host and the guest, so there
is no VGA involved anywhere, and the graphics card is passed through as a
regular PCIe device.

I am using Gentoo Linux, and have tried this on kernels 4.1.13, 4.2.6, and
4.3.0 built from mainline sources. 4.1.13 is the only one of the above versions
which works correctly. I created this virtual machine back when I was using an
earlier 4.1.x revision, and it worked there, too. I initially noticed this
issue with 4.2.0 back when it was first released (which is why I stayed back
with 4.1.x), and I am sorry for not reporting it earlier.

Relevant hardware on my system:
- Intel i7 Extreme 5960X CPU
- ASRock X99M Extreme4 motherboard
- 32GB DDR4 2666MHz RAM
- NVIDIA GeForce GTX 980 graphics card (that I am passing through to the VM)
- AMD Radeon r7 250 (for graphics on the host)
- Generic USB3 (XHCI) PCIe expansion card (based on VIA hardware according to
`lspci`)

I have attached: 1) kernel configs for each kernel version above, 2) `lspci
-vv` output, 3) my libvirt virtual machine configuration, 4) `dmesg` output
from each kernel version, taken after attempting to start the virtual machine
straight after booting the system.

My kernel commandline (same on each kernel version tried):
"resume=/dev/nvme0n1 resume_offset=6681926 rw root=/dev/nvme0n1 rootfstype=f2fs
intel_iommu=on iommu=pt vfio-pci.ids=10de:13c0,10de:0fbb,1106:3483
vfio-pci.disable_vga=1 kvm.ignore_msrs=1 hugepages=6000"

I am booting my kernel with EFISTUB; no separate bootloader present.

My QEMU version is 2.4.1, and my libvirt version is 1.2.21.

This is my first kernel bug report, so please let me know if there is any
additional information I should provide to help diagnose/resolve the issue.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm/vmx: EPTP switching test

2015-11-15 Thread Michael S. Tsirkin
This patch adds a new parameter: eptp_switching_test, which enables
testing EPT switching on VMX if supported by hardware.  All EPT entries
are initialized to the same value so this adds no useful functionality
by itself, but can be used to test VMFUNC performance, and serve as a
basis for future features based on EPTP switching.

Support for nested virt is not enabled.

This was tested using the following code within guest:
#define VMX_VMFUNC ".byte 0x0f,0x01,0xd4"
/*
 * Execute the VMFUNC instruction from inside the guest.
 *
 * nr:  value placed in the "a" register before the instruction runs;
 *      presumably the VM function number -- confirm against the Intel SDM.
 * ept: value placed in the "c" register before the instruction runs;
 *      presumably the index of the EPTP list entry to switch to --
 *      confirm against the Intel SDM.
 */
static void vmfunc(unsigned int nr, unsigned int ept)
{
/* VMX_VMFUNC emits the raw instruction bytes, so no assembler support for the mnemonic is required. */
asm volatile(VMX_VMFUNC
 :
 : "a"(nr), "c"(ept)
 : "memory"); /* "memory" clobber: compiler must not cache memory values across the instruction */
}

VMFUNC instruction cost was measured at ~122 cycles.
(Note: recent versions of the GNU toolchain support
 the vmfunc instruction, removing the need to write
 the opcode bytes manually).

Signed-off-by: Michael S. Tsirkin 
---

I think I'd like to put this upstream so future eptp switching work can
be implemented on top. Comments?

 arch/x86/include/asm/vmx.h |  7 
 arch/x86/kvm/vmx.c | 84 ++
 2 files changed, 91 insertions(+)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 448b7ca..ceb68d9 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,10 +69,13 @@
 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING  0x00000400
 #define SECONDARY_EXEC_ENABLE_INVPCID  0x00001000
+#define SECONDARY_EXEC_ENABLE_VM_FUNCTIONS 0x00002000
 #define SECONDARY_EXEC_SHADOW_VMCS  0x00004000
 #define SECONDARY_EXEC_ENABLE_PML   0x00020000
 #define SECONDARY_EXEC_XSAVES  0x00100000
 
+/* Definitions for VM-function controls */
+#define VM_FUNCTION_EPTP_SWITCHING 0x0001
 
 #define PIN_BASED_EXT_INTR_MASK 0x0001
 #define PIN_BASED_NMI_EXITING   0x0008
@@ -153,6 +156,8 @@ enum vmcs_field {
APIC_ACCESS_ADDR_HIGH   = 0x2015,
POSTED_INTR_DESC_ADDR   = 0x2016,
POSTED_INTR_DESC_ADDR_HIGH  = 0x2017,
+   VM_FUNCTION_CTRL= 0x2018,
+   VM_FUNCTION_CTRL_HIGH   = 0x2019,
EPT_POINTER = 0x201a,
EPT_POINTER_HIGH= 0x201b,
EOI_EXIT_BITMAP0= 0x201c,
@@ -163,6 +168,8 @@ enum vmcs_field {
EOI_EXIT_BITMAP2_HIGH   = 0x2021,
EOI_EXIT_BITMAP3= 0x2022,
EOI_EXIT_BITMAP3_HIGH   = 0x2023,
+   EPTP_LIST_ADDRESS   = 0x2024,
+   EPTP_LIST_ADDRESS_HIGH  = 0x2025,
VMREAD_BITMAP   = 0x2026,
VMWRITE_BITMAP  = 0x2028,
XSS_EXIT_BITMAP = 0x202C,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6a8bc64..3d1f613 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -45,6 +45,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "trace.h"
 #include "pmu.h"
@@ -105,6 +106,9 @@ static u64 __read_mostly host_xss;
 static bool __read_mostly enable_pml = 1;
 module_param_named(pml, enable_pml, bool, S_IRUGO);
 
+static bool __read_mostly enable_eptp_switching = 0;
+module_param_named(eptp_switching_test, enable_eptp_switching, bool, S_IRUGO);
+
 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON   \
@@ -547,6 +551,10 @@ struct vcpu_vmx {
/* Support for PML */
 #define PML_ENTITY_NUM 512
struct page *pml_pg;
+
+   /* Support for EPTP switching */
+#define EPTP_LIST_NUM  512
+   struct page *eptp_list_pg;
 };
 
 enum segment_cache_field {
@@ -1113,6 +1121,22 @@ static inline bool cpu_has_vmx_pml(void)
return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
 }
 
+static inline bool cpu_has_vmx_vm_functions(void)
+{
+   return vmcs_config.cpu_based_2nd_exec_ctrl &
+   SECONDARY_EXEC_ENABLE_VM_FUNCTIONS;
+}
+
+/* check if the cpu supports the EPTP switching VM function */
+static inline bool cpu_has_vmx_eptp_switching(void)
+{
+   u64 vmx_msr;
+
+   rdmsrl(MSR_IA32_VMX_VMFUNC, vmx_msr);
+   /* This MSR has same format as VM-function controls */
+   return vmx_msr & VM_FUNCTION_EPTP_SWITCHING;
+}
+
 static inline bool report_flexpriority(void)
 {
return flexpriority_enabled;
@@ -3011,6 +3035,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_ENABLE_INVPCID |
+   SECONDARY_EXEC_ENABLE_VM_FUNCTIONS |

[PATCH] KVM-async_pf: Delete an unnecessary check before the function call "kmem_cache_destroy"

2015-11-15 Thread SF Markus Elfring
From: Markus Elfring 
Date: Sun, 15 Nov 2015 10:40:36 +0100

The kmem_cache_destroy() function tests whether its argument is NULL
and, if so, returns immediately. Thus the test around the call is not needed.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring 
---
 virt/kvm/async_pf.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 77d42be..3531599 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -57,8 +57,7 @@ int kvm_async_pf_init(void)
 
 void kvm_async_pf_deinit(void)
 {
-   if (async_pf_cache)
-   kmem_cache_destroy(async_pf_cache);
+   kmem_cache_destroy(async_pf_cache);
async_pf_cache = NULL;
 }
 
-- 
2.6.2

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html