[PATCH v3] test: Add IDT framework

2010-06-13 Thread Avi Kivity
Signed-off-by: Sheng Yang sh...@linux.intel.com
Signed-off-by: Avi Kivity a...@redhat.com
---

v3: rearrange printf()s

v2: accurate instruction boundary tests
avoid playing with the stack; use exception tables instead



 kvm/test/config-x86-common.mak |2 +
 kvm/test/config-x86_64.mak |2 +-
 kvm/test/flat.lds  |7 ++-
 kvm/test/lib/x86/idt.h |   19 +
 kvm/test/x86/idt.c |  150 
 kvm/test/x86/idt_test.c|   49 +
 6 files changed, 227 insertions(+), 2 deletions(-)
 create mode 100644 kvm/test/lib/x86/idt.h
 create mode 100644 kvm/test/x86/idt.c
 create mode 100644 kvm/test/x86/idt_test.c

diff --git a/kvm/test/config-x86-common.mak b/kvm/test/config-x86-common.mak
index c97de52..800b635 100644
--- a/kvm/test/config-x86-common.mak
+++ b/kvm/test/config-x86-common.mak
@@ -59,6 +59,8 @@ $(TEST_DIR)/realmode.o: bits = 32
 
 $(TEST_DIR)/msr.flat: $(cstart.o) $(TEST_DIR)/msr.o
 
+$(TEST_DIR)/idt_test.flat: $(cstart.o) $(TEST_DIR)/idt.o $(TEST_DIR)/idt_test.o
+
 arch_clean:
$(RM) $(TEST_DIR)/*.o $(TEST_DIR)/*.flat \
$(TEST_DIR)/.*.d $(TEST_DIR)/lib/.*.d $(TEST_DIR)/lib/*.o
diff --git a/kvm/test/config-x86_64.mak b/kvm/test/config-x86_64.mak
index d8fd2b5..f9cd121 100644
--- a/kvm/test/config-x86_64.mak
+++ b/kvm/test/config-x86_64.mak
@@ -5,6 +5,6 @@ ldarch = elf64-x86-64
 CFLAGS += -D__x86_64__
 
 tests = $(TEST_DIR)/access.flat $(TEST_DIR)/apic.flat \
- $(TEST_DIR)/emulator.flat
+ $(TEST_DIR)/emulator.flat $(TEST_DIR)/idt_test.flat
 
 include config-x86-common.mak
diff --git a/kvm/test/flat.lds b/kvm/test/flat.lds
index 4120595..4888f3a 100644
--- a/kvm/test/flat.lds
+++ b/kvm/test/flat.lds
@@ -4,7 +4,12 @@ SECTIONS
 stext = .;
 .text : { *(.init) *(.text) *(.text.*) }
 . = ALIGN(4K);
-.data : { *(.data) }
+.data : {
+  *(.data)
+  exception_table_start = .;
+  *(.data.ex)
+ exception_table_end = .;
+ }
 . = ALIGN(16);
 .rodata : { *(.rodata) }
 . = ALIGN(16);
diff --git a/kvm/test/lib/x86/idt.h b/kvm/test/lib/x86/idt.h
new file mode 100644
index 000..6babcb4
--- /dev/null
+++ b/kvm/test/lib/x86/idt.h
@@ -0,0 +1,19 @@
+#ifndef __IDT_TEST__
+#define __IDT_TEST__
+
+void setup_idt(void);
+
+#define ASM_TRY(catch)                                  \
+    "movl $0, %%gs:4 \n\t"                              \
+    ".pushsection .data.ex \n\t"                        \
+    ".quad 1111f, " catch "\n\t"                        \
+    ".popsection \n\t"                                  \
+    "1111:"
+
+#define UD_VECTOR   6
+#define GP_VECTOR   13
+
+unsigned exception_vector(void);
+unsigned exception_error_code(void);
+
+#endif
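
For reference, a test wraps the instruction under test with ASM_TRY() and then
checks exception_vector(); a minimal sketch, with an illustrative helper name,
modeled on the xsave test later in this series:

    static int rdmsr_checking(u32 index, u64 *val)
    {
        u32 a, d;

        asm volatile(ASM_TRY("1f")
                     "rdmsr\n\t"
                     "1:"
                     : "=a"(a), "=d"(d) : "c"(index));
        *val = a | ((u64)d << 32);
        return exception_vector();  /* 0 here means no exception was recorded */
    }
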
diff --git a/kvm/test/x86/idt.c b/kvm/test/x86/idt.c
new file mode 100644
index 000..999b3f0
--- /dev/null
+++ b/kvm/test/x86/idt.c
@@ -0,0 +1,150 @@
+#include "idt.h"
+#include "libcflat.h"
+
+typedef struct {
+unsigned short offset0;
+unsigned short selector;
+unsigned short ist : 3;
+unsigned short : 5;
+unsigned short type : 4;
+unsigned short : 1;
+unsigned short dpl : 2;
+unsigned short p : 1;
+unsigned short offset1;
+unsigned offset2;
+unsigned reserved;
+} idt_entry_t;
+
+static idt_entry_t idt[256];
+
+typedef struct {
+unsigned short limit;
+unsigned long linear_addr;
+} __attribute__((packed)) descriptor_table_t;
+
+void lidt(idt_entry_t *idt, int nentries)
+{
+descriptor_table_t dt;
+
+dt.limit = nentries * sizeof(*idt) - 1;
+dt.linear_addr = (unsigned long)idt;
+asm volatile ("lidt %0" : : "m"(dt));
+}
+
+unsigned short read_cs()
+{
+unsigned short r;
+
+asm volatile ("mov %%cs, %0" : "=r"(r));
+return r;
+}
+
+void memset(void *a, unsigned char v, int n)
+{
+unsigned char *x = a;
+
+while (n--)
+   *x++ = v;
+}
+
+void set_idt_entry(idt_entry_t *e, void *addr, int dpl)
+{
+memset(e, 0, sizeof *e);
+e->offset0 = (unsigned long)addr;
+e->selector = read_cs();
+e->ist = 0;
+e->type = 14;
+e->dpl = dpl;
+e->p = 1;
+e->offset1 = (unsigned long)addr >> 16;
+e->offset2 = (unsigned long)addr >> 32;
+}
+
+struct ex_regs {
+unsigned long rax, rcx, rdx, rbx;
+unsigned long dummy, rbp, rsi, rdi;
+unsigned long r8, r9, r10, r11;
+unsigned long r12, r13, r14, r15;
+unsigned long vector;
+unsigned long error_code;
+unsigned long rip;
+unsigned long cs;
+unsigned long rflags;
+};
+
+struct ex_record {
+unsigned long rip;
+unsigned long handler;
+};
+
+extern struct ex_record exception_table_start, exception_table_end;
+
+void do_handle_exception(struct ex_regs *regs)
+{
+struct ex_record *ex;
+unsigned ex_val;
+
+ex_val = regs->vector | (regs->error_code << 16);
+
+asm("mov %0, %%gs:4" : : "r"(ex_val));
+
+for (ex = &exception_table_start; ex != &exception_table_end; ++ex) {
+if (ex->rip == regs->rip) {
+regs->rip 

Re: [PATCH] These are my first patches to the kernel

2010-06-13 Thread Avi Kivity

On 06/12/2010 03:17 PM, K.de Jong wrote:

 From 6b9ac8708d856a425a9aaa598acdea3d89485bde Mon Sep 17 00:00:00 2001
From: UndiFineDk.dej...@undifined.nl
Date: Sat, 12 Jun 2010 00:24:28 +0200
Subject: [PATCH] These are my first patches to the kernel
Janitor patches to kvm
Linus2.6: virt/kvm/

   


Most of the patch is just whitespace changes, which don't improve the 
code in any way.  I much prefer patches that fix bugs or add features.


--
error compiling committee.c: too many arguments to function



Re: [PATCH 2/2] kvm, ept: remove the default write bit

2010-06-13 Thread Avi Kivity

On 06/11/2010 10:50 PM, Marcelo Tosatti wrote:

On Fri, Jun 11, 2010 at 07:30:50PM +0800, Lai Jiangshan wrote:
   

When ept is enabled, the current code sets shadow_base_present_pte
including the write bit, thus all pte entries have the
writable bit, and it means the guest os can always
write to any mapped page (even if the VMM maps RO pages for
the guest.)

We always use get_user_pages(write=1), so this bad code does not
cause any bad result currently.

But it is really bad, so fix it, and we will use RO pages in the future.

We will set the writable bit when it is really writable (determined by
the parameters of set_spte()).

Signed-off-by: Lai Jiangshanla...@cn.fujitsu.com
---
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fdb18cf..c7565ea 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4365,8 +4365,7 @@ static int __init vmx_init(void)

if (enable_ept) {
bypass_guest_pf = 0;
-   kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
-   VMX_EPT_WRITABLE_MASK);
+   kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK);
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
VMX_EPT_EXECUTABLE_MASK);
kvm_enable_tdp();
 

You can remove the call to kvm_mmu_set_base_ptes entirely, because
VMX_EPT_READABLE_MASK == PT_PRESENT_MASK.
   


We can leave that to a later patch which removes kvm_mmu_set_base_ptes() 
entirely.
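
If that cleanup lands, the vmx_init() hunk would presumably end up looking
roughly like this (a sketch, not a posted patch):

        if (enable_ept) {
                bypass_guest_pf = 0;
                kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
                                VMX_EPT_EXECUTABLE_MASK);
                kvm_enable_tdp();
        }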


--
error compiling committee.c: too many arguments to function



Re: [PATCH v3] test: Add XSAVE unit test

2010-06-13 Thread Avi Kivity

On 06/11/2010 10:45 AM, Sheng Yang wrote:

Based on IDT test framework.

   


Nice and comprehensive.


+
+int main(void)
+{
+setup_idt();
+if (check_cpuid_1_ecx(CPUID_1_ECX_XSAVE)) {
+printf("CPU has XSAVE feature\n");
+test_xsave();
+} else {
+printf("CPU don't has XSAVE feature\n");
+test_no_xsave();
+}
+printf("Total test: %d\n", total_tests);
+if (fail_tests == 0)
+printf("ALL PASS!\n");
+else
+printf("Fail %d tests.\n", fail_tests);
+return 1;
+}
   


Need to return 0 if !fail_tests, so we can hook this up to autotest.
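
Concretely, the convention being asked for is the usual shell-style exit status;
the end of main() would look something like this (a sketch, not the actual
follow-up patch):

    printf("Total test: %d\n", total_tests);
    if (fail_tests == 0) {
        printf("ALL PASS!\n");
        return 0;       /* success: zero status so autotest can key off it */
    }
    printf("Fail %d tests.\n", fail_tests);
    return 1;           /* any failure yields a non-zero status */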


--
error compiling committee.c: too many arguments to function



[PATCH v4] test: Add XSAVE unit test

2010-06-13 Thread Sheng Yang
Based on IDT test framework.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 kvm/test/config-x86-common.mak |2 +
 kvm/test/config-x86_64.mak |3 +-
 kvm/test/x86/xsave.c   |  262 
 3 files changed, 266 insertions(+), 1 deletions(-)
 create mode 100644 kvm/test/x86/xsave.c

diff --git a/kvm/test/config-x86-common.mak b/kvm/test/config-x86-common.mak
index 800b635..0e1ccce 100644
--- a/kvm/test/config-x86-common.mak
+++ b/kvm/test/config-x86-common.mak
@@ -61,6 +61,8 @@ $(TEST_DIR)/msr.flat: $(cstart.o) $(TEST_DIR)/msr.o
 
 $(TEST_DIR)/idt_test.flat: $(cstart.o) $(TEST_DIR)/idt.o $(TEST_DIR)/idt_test.o
 
+$(TEST_DIR)/xsave.flat: $(cstart.o) $(TEST_DIR)/idt.o $(TEST_DIR)/xsave.o
+
 arch_clean:
$(RM) $(TEST_DIR)/*.o $(TEST_DIR)/*.flat \
$(TEST_DIR)/.*.d $(TEST_DIR)/lib/.*.d $(TEST_DIR)/lib/*.o
diff --git a/kvm/test/config-x86_64.mak b/kvm/test/config-x86_64.mak
index f9cd121..2da2906 100644
--- a/kvm/test/config-x86_64.mak
+++ b/kvm/test/config-x86_64.mak
@@ -5,6 +5,7 @@ ldarch = elf64-x86-64
 CFLAGS += -D__x86_64__
 
 tests = $(TEST_DIR)/access.flat $(TEST_DIR)/apic.flat \
- $(TEST_DIR)/emulator.flat $(TEST_DIR)/idt_test.flat
+ $(TEST_DIR)/emulator.flat $(TEST_DIR)/idt_test.flat \
+ $(TEST_DIR)/xsave.flat
 
 include config-x86-common.mak
diff --git a/kvm/test/x86/xsave.c b/kvm/test/x86/xsave.c
new file mode 100644
index 000..d5cd2d8
--- /dev/null
+++ b/kvm/test/x86/xsave.c
@@ -0,0 +1,262 @@
+#include "libcflat.h"
+#include "idt.h"
+
+#ifdef __x86_64__
+#define uint64_t unsigned long
+#else
+#define uint64_t unsigned long long
+#endif
+
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+unsigned int *ecx, unsigned int *edx)
+{
+/* ecx is often an input as well as an output. */
+asm volatile("cpuid"
+: "=a" (*eax),
+"=b" (*ebx),
+"=c" (*ecx),
+"=d" (*edx)
+: "0" (*eax), "2" (*ecx));
+}
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+void cpuid(unsigned int op,
+unsigned int *eax, unsigned int *ebx,
+unsigned int *ecx, unsigned int *edx)
+{
+*eax = op;
+*ecx = 0;
+__cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+void cpuid_count(unsigned int op, int count,
+unsigned int *eax, unsigned int *ebx,
+unsigned int *ecx, unsigned int *edx)
+{
+*eax = op;
+*ecx = count;
+__cpuid(eax, ebx, ecx, edx);
+}
+
+int xgetbv_checking(u32 index, u64 *result)
+{
+u32 eax, edx;
+
+asm volatile(ASM_TRY("1f")
+".byte 0x0f,0x01,0xd0\n\t" /* xgetbv */
+"1:"
+: "=a" (eax), "=d" (edx)
+: "c" (index));
+*result = eax + ((u64)edx << 32);
+return exception_vector();
+}
+
+int xsetbv_checking(u32 index, u64 value)
+{
+u32 eax = value;
+u32 edx = value >> 32;
+
+asm volatile(ASM_TRY("1f")
+".byte 0x0f,0x01,0xd1\n\t" /* xsetbv */
+"1:"
+: : "a" (eax), "d" (edx), "c" (index));
+return exception_vector();
+}
+
+unsigned long read_cr4(void)
+{
+unsigned long val;
+asm volatile("mov %%cr4,%0" : "=r" (val));
+return val;
+}
+
+int write_cr4_checking(unsigned long val)
+{
+asm volatile(ASM_TRY("1f")
+"mov %0,%%cr4\n\t"
+"1:": : "r" (val));
+return exception_vector();
+}
+
+#define CPUID_1_ECX_XSAVE  (1 << 26)
+#define CPUID_1_ECX_OSXSAVE (1 << 27)
+int check_cpuid_1_ecx(unsigned int bit)
+{
+unsigned int eax, ebx, ecx, edx;
+cpuid(1, &eax, &ebx, &ecx, &edx);
+if (ecx & bit)
+return 1;
+return 0;
+}
+
+uint64_t get_supported_xcr0(void)
+{
+unsigned int eax, ebx, ecx, edx;
+cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+printf("eax %x, ebx %x, ecx %x, edx %x\n",
+eax, ebx, ecx, edx);
+return eax + ((u64)edx << 32);
+}
+
+#define X86_CR4_OSXSAVE 0x00040000
+#define XCR_XFEATURE_ENABLED_MASK   0x00000000
+#define XCR_XFEATURE_ILLEGAL_MASK   0x00000010
+
+#define XSTATE_FP   0x1
+#define XSTATE_SSE  0x2
+#define XSTATE_YMM  0x4
+
+static int total_tests, fail_tests;
+
+void pass_if(int condition)
+{
+total_tests ++;
+if (condition)
+printf("Pass!\n");
+else {
+printf("Fail!\n");
+fail_tests ++;
+}
+}
+
+void test_xsave(void)
+{
+unsigned long cr4;
+uint64_t supported_xcr0;
+uint64_t test_bits;
+u64 xcr0;
+int r;
+
+printf("Legal instruction testing:\n");
+supported_xcr0 = get_supported_xcr0();
+printf("Supported XCR0 bits: 0x%x\n", supported_xcr0);
+
+printf("Check minimal XSAVE required bits: ");
+test_bits = XSTATE_FP | XSTATE_SSE;
+pass_if((supported_xcr0 & test_bits) == test_bits);
+
+printf("Set CR4 OSXSAVE: ");
+cr4 = read_cr4();
+r = write_cr4_checking(cr4 | 

Re: [PATCH v4] test: Add XSAVE unit test

2010-06-13 Thread Sheng Yang
On Sunday 13 June 2010 16:43:42 Avi Kivity wrote:
 On 06/13/2010 11:32 AM, Sheng Yang wrote:
  Based on IDT test framework.
  +
  +int main(void)
  +{
  +setup_idt();
  +if (check_cpuid_1_ecx(CPUID_1_ECX_XSAVE)) {
  +printf("CPU has XSAVE feature\n");
  +test_xsave();
  +} else {
  +printf("CPU don't has XSAVE feature\n");
  +test_no_xsave();
  +}
  +printf("Total test: %d\n", total_tests);
  +if (fail_tests == 0)
  +printf("ALL PASS!\n");
  +else {
  +printf("Fail %d tests.\n", fail_tests);
  +return 0;
  +}
  +return 1;
  +}
 
 Wrong way, 0 = success, !0 = fail (just like shell exit codes).

Indeed, oops. Finally checked your first comment to this test case patch, and
found I completely misread it...

--
regards
Yang, Sheng


[PATCH v4] test: Add XSAVE unit test

2010-06-13 Thread Sheng Yang
Based on IDT test framework.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 kvm/test/config-x86-common.mak |2 +
 kvm/test/config-x86_64.mak |3 +-
 kvm/test/x86/xsave.c   |  262 
 3 files changed, 266 insertions(+), 1 deletions(-)
 create mode 100644 kvm/test/x86/xsave.c

diff --git a/kvm/test/config-x86-common.mak b/kvm/test/config-x86-common.mak
index 800b635..0e1ccce 100644
--- a/kvm/test/config-x86-common.mak
+++ b/kvm/test/config-x86-common.mak
@@ -61,6 +61,8 @@ $(TEST_DIR)/msr.flat: $(cstart.o) $(TEST_DIR)/msr.o
 
 $(TEST_DIR)/idt_test.flat: $(cstart.o) $(TEST_DIR)/idt.o $(TEST_DIR)/idt_test.o
 
+$(TEST_DIR)/xsave.flat: $(cstart.o) $(TEST_DIR)/idt.o $(TEST_DIR)/xsave.o
+
 arch_clean:
$(RM) $(TEST_DIR)/*.o $(TEST_DIR)/*.flat \
$(TEST_DIR)/.*.d $(TEST_DIR)/lib/.*.d $(TEST_DIR)/lib/*.o
diff --git a/kvm/test/config-x86_64.mak b/kvm/test/config-x86_64.mak
index f9cd121..2da2906 100644
--- a/kvm/test/config-x86_64.mak
+++ b/kvm/test/config-x86_64.mak
@@ -5,6 +5,7 @@ ldarch = elf64-x86-64
 CFLAGS += -D__x86_64__
 
 tests = $(TEST_DIR)/access.flat $(TEST_DIR)/apic.flat \
- $(TEST_DIR)/emulator.flat $(TEST_DIR)/idt_test.flat
+ $(TEST_DIR)/emulator.flat $(TEST_DIR)/idt_test.flat \
+ $(TEST_DIR)/xsave.flat
 
 include config-x86-common.mak
diff --git a/kvm/test/x86/xsave.c b/kvm/test/x86/xsave.c
new file mode 100644
index 000..a22b44c
--- /dev/null
+++ b/kvm/test/x86/xsave.c
@@ -0,0 +1,262 @@
+#include "libcflat.h"
+#include "idt.h"
+
+#ifdef __x86_64__
+#define uint64_t unsigned long
+#else
+#define uint64_t unsigned long long
+#endif
+
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+unsigned int *ecx, unsigned int *edx)
+{
+/* ecx is often an input as well as an output. */
+asm volatile("cpuid"
+: "=a" (*eax),
+"=b" (*ebx),
+"=c" (*ecx),
+"=d" (*edx)
+: "0" (*eax), "2" (*ecx));
+}
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+void cpuid(unsigned int op,
+unsigned int *eax, unsigned int *ebx,
+unsigned int *ecx, unsigned int *edx)
+{
+*eax = op;
+*ecx = 0;
+__cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+void cpuid_count(unsigned int op, int count,
+unsigned int *eax, unsigned int *ebx,
+unsigned int *ecx, unsigned int *edx)
+{
+*eax = op;
+*ecx = count;
+__cpuid(eax, ebx, ecx, edx);
+}
+
+int xgetbv_checking(u32 index, u64 *result)
+{
+u32 eax, edx;
+
+asm volatile(ASM_TRY("1f")
+".byte 0x0f,0x01,0xd0\n\t" /* xgetbv */
+"1:"
+: "=a" (eax), "=d" (edx)
+: "c" (index));
+*result = eax + ((u64)edx << 32);
+return exception_vector();
+}
+
+int xsetbv_checking(u32 index, u64 value)
+{
+u32 eax = value;
+u32 edx = value >> 32;
+
+asm volatile(ASM_TRY("1f")
+".byte 0x0f,0x01,0xd1\n\t" /* xsetbv */
+"1:"
+: : "a" (eax), "d" (edx), "c" (index));
+return exception_vector();
+}
+
+unsigned long read_cr4(void)
+{
+unsigned long val;
+asm volatile("mov %%cr4,%0" : "=r" (val));
+return val;
+}
+
+int write_cr4_checking(unsigned long val)
+{
+asm volatile(ASM_TRY("1f")
+"mov %0,%%cr4\n\t"
+"1:": : "r" (val));
+return exception_vector();
+}
+
+#define CPUID_1_ECX_XSAVE  (1 << 26)
+#define CPUID_1_ECX_OSXSAVE (1 << 27)
+int check_cpuid_1_ecx(unsigned int bit)
+{
+unsigned int eax, ebx, ecx, edx;
+cpuid(1, &eax, &ebx, &ecx, &edx);
+if (ecx & bit)
+return 1;
+return 0;
+}
+
+uint64_t get_supported_xcr0(void)
+{
+unsigned int eax, ebx, ecx, edx;
+cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+printf("eax %x, ebx %x, ecx %x, edx %x\n",
+eax, ebx, ecx, edx);
+return eax + ((u64)edx << 32);
+}
+
+#define X86_CR4_OSXSAVE 0x00040000
+#define XCR_XFEATURE_ENABLED_MASK   0x00000000
+#define XCR_XFEATURE_ILLEGAL_MASK   0x00000010
+
+#define XSTATE_FP   0x1
+#define XSTATE_SSE  0x2
+#define XSTATE_YMM  0x4
+
+static int total_tests, fail_tests;
+
+void pass_if(int condition)
+{
+total_tests ++;
+if (condition)
+printf("Pass!\n");
+else {
+printf("Fail!\n");
+fail_tests ++;
+}
+}
+
+void test_xsave(void)
+{
+unsigned long cr4;
+uint64_t supported_xcr0;
+uint64_t test_bits;
+u64 xcr0;
+int r;
+
+printf("Legal instruction testing:\n");
+supported_xcr0 = get_supported_xcr0();
+printf("Supported XCR0 bits: 0x%x\n", supported_xcr0);
+
+printf("Check minimal XSAVE required bits: ");
+test_bits = XSTATE_FP | XSTATE_SSE;
+pass_if((supported_xcr0 & test_bits) == test_bits);
+
+printf("Set CR4 OSXSAVE: ");
+cr4 = read_cr4();
+r = write_cr4_checking(cr4 | 

RE: [RFC PATCH v7 01/19] Add a new structure for skb buffer from external.

2010-06-13 Thread Xin, Xiaohui
-Original Message-
From: netdev-ow...@vger.kernel.org [mailto:netdev-ow...@vger.kernel.org] On 
Behalf Of
Xin, Xiaohui
Sent: Saturday, June 12, 2010 5:31 PM
To: Herbert Xu
Cc: Stephen Hemminger; net...@vger.kernel.org; kvm@vger.kernel.org;
linux-ker...@vger.kernel.org; m...@redhat.com; mi...@elte.hu; 
da...@davemloft.net;
jd...@linux.intel.com
Subject: RE: [RFC PATCH v7 01/19] Add a new structure for skb buffer from 
external.

-Original Message-
From: Herbert Xu [mailto:herb...@gondor.apana.org.au]
Sent: Friday, June 11, 2010 1:21 PM
To: Xin, Xiaohui
Cc: Stephen Hemminger; net...@vger.kernel.org; kvm@vger.kernel.org;
linux-ker...@vger.kernel.org; m...@redhat.com; mi...@elte.hu; 
da...@davemloft.net;
jd...@linux.intel.com
Subject: Re: [RFC PATCH v7 01/19] Add a new structure for skb buffer from 
external.

On Wed, Jun 09, 2010 at 05:54:02PM +0800, Xin, Xiaohui wrote:

 I'm not sure if I understand your way correctly:
 1) Does the way only deal with driver with SG feature? Since packet
 is non-linear...

No, the hardware doesn't have to support SG.  You just need to
place the entire packet contents in a page instead of skb->head.

 2) Is skb->data still pointing to guest user buffers?
 If yes, how to avoid the modifications to net core change to skb?

skb->data would not point to guest user buffers.  In the common
case the packet is not modified on its way to the guest so this
is not an issue.

In the rare case where it is modified, you only have to copy the
bits which are modified and the cost of that is inconsequential
since you have to write to that memory anyway.

 3) In our way only parts of drivers need be modified to support zero-copy.
 and here, need we modify all the drivers?

If you're asking the portion of each driver supporting zero-copy
that needs to be modified, then AFAICS this doesn't change that
very much at all.

 I think making skb->head empty at first will cause more effort to pass the
 checks on the skb header. Have I missed something here? I really make
 skb->head NULL just before kfree(skb) in skb_release_data(); it's done by a
 callback we have made for the skb.

No I'm not suggesting you set it to NULL.  It should have some
memory allocated, but skb_headlen(skb) should be zero.

Please have a look at how the napi_gro_frags interface works (e.g.,
in drivers/net/cxgb3/sge.c).  This is exactly the model that I am
suggesting.

Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} herb...@gondor.apana.org.au
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
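
For reference, the receive path Herbert describes looks roughly like this
(a sketch modeled on cxgb3's use of napi_get_frags()/napi_gro_frags(); the
helper name is made up for illustration):

    static void rx_one_packet(struct napi_struct *napi, struct page *page,
                              unsigned int offset, unsigned int len)
    {
            /* small skb with an empty linear area: skb_headlen() == 0 */
            struct sk_buff *skb = napi_get_frags(napi);

            if (!skb)
                    return;

            /* the whole packet stays in the page fragment */
            skb_fill_page_desc(skb, 0, page, offset, len);
            skb->len += len;
            skb->data_len += len;
            skb->truesize += len;

            /* GRO pulls whatever headers it needs into skb->data */
            napi_gro_frags(napi);
    }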

Herbert,
I explained what I think the thought in your mind here, please clarify if
something missed.

1) Modify the driver from netdev_alloc_skb() to alloc user pages if the dev is
zero-copied.
  If the driver supports PS mode, then modify alloc_page() too.
2) Add napi_gro_frags() in the driver to receive the user pages instead of the
driver's receiving function.
3) napi_gro_frags() will allocate a small skb and pull the header data from
the first page to skb->data.

Is above the way what you have suggested?
I have thought something in detail about the way.

1) The first page will have an offset after the header is copied into the
allocated kernel skb.
The offset should be recalculated when the user page data is transferred to
the guest. This may modify some of the gro code.

2) napi_gro_frags() may remove a page when its data has been totally pulled,
but we cannot put a user page the normal way. This may modify the gro code too.

3) When the user buffers are returned to the guest, some of them need a vnet
header appended.
That means for some pages, the vnet header room should be reserved when they
are allocated.
But we cannot know which one will be used as the first page when allocating. If
we reserve a vnet header for each page, then since set_skb_frag() in the guest
driver only uses offset 0 for the second and later pages, the page data will
be wrong.

4) Since the user buffer pages should be released, we still need a dtor
callback to do that, and then I still need a place to hold it. What do you
think about putting it in skb_shinfo?

Currently I can only think of this.
What do you think?

Thanks
Xiaohui

Herbert,
In this way, I think we should create 3 functions at least in drivers to 
allocate rx buffer, to receive the rx buffers, and to clean the rx buffers.

We can also have another way here. We can provide a function to only substitute 
alloc_page(), and a function to release the pages when cleaning the rx buffers.
The skb for the rx buffer can be allocated in original way, and when pushing 
the data to guest, the header data will be copied to guest buffer. In this way, 
we 
should reserve sufficient room for the header in the first guest user buffers. 
That need modifications to guest virtio-net kernel. And this way only suitable 
for
PS mode supported driver. Considered the advanced driver mostly has PS mode.
So it should be not a critical issue.

Thanks
Xiaohui
 


Re: [PATCH v3] KVM: x86: XSAVE/XRSTOR live migration support

2010-06-13 Thread Sheng Yang
On Sunday 13 June 2010 16:26:18 Avi Kivity wrote:
 On 06/11/2010 07:36 AM, Sheng Yang wrote:
 This patch enables save/restore of xsave state.
  
  +static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
  +   struct kvm_xsave *guest_xsave)
  +{
  +   u64 xstate_bv =
  +   *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
  +   int size;
  +
  +   if (cpu_has_xsave) {
  +   if (xstate_bv & XSTATE_YMM)
  +   size = XSAVE_YMM_OFFSET + XSAVE_YMM_SIZE;
  +   else
  +   size = XSAVE_HDR_OFFSET + XSAVE_HDR_SIZE;
  +   memcpy(&vcpu->arch.guest_fpu.state->xsave,
  +   guest_xsave->region, size);
 
 This allows userspace to overflow host memory by specifying XSTATE_YMM
 on a host that doesn't support it.
 
 Better to just use the host's size of the structure.

Yes, should be good enough.
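
Concretely, that means sizing the copy from the host rather than from the
user-supplied xstate_bv, roughly (a sketch, not the posted hunk; xstate_size
is the host's xsave area size):

        if (cpu_has_xsave)
                memcpy(&vcpu->arch.guest_fpu.state->xsave,
                        guest_xsave->region, xstate_size);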
 
  +   } else {
  +   if (xstate_bv & ~XSTATE_FPSSE)
  +   return -EINVAL;
  +   size = sizeof(struct i387_fxsave_struct);
  +   memcpy(&vcpu->arch.guest_fpu.state->fxsave,
  +   guest_xsave->region, size);
  +   }
  +   return 0;
  +}
  +
  
  +
  +static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
  +  struct kvm_xcrs *guest_xcrs)
  +{
  +   int i, r = 0;
  +
  +   if (!cpu_has_xsave)
  +   return -EINVAL;
 
 Too strict?

If there is no cpu_has_xsave, KVM_CAP_XCRS would return 0, so this ioctl
shouldn't be called.
 
  +
  +   if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS)
  +   return -EFAULT;
 
 EFAULT is for faults during access to userspace.  EINVAL or E2BIG.
 
 Need to ensure flags is 0 for forward compatibility.

OK.
 
  +
  +   for (i = 0; i < guest_xcrs->nr_xcrs; i++)
  +   /* Only support XCR0 currently */
  +   if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
  +   r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
  +   guest_xcrs->xcrs[0].value);
  +   break;
  +   }
  +   if (r)
  +   r = -EFAULT;
 
 EINVAL

OK
 
  +   return r;
  +}
  +

--
regards
Yang, Sheng


Re: [PATCH v3] KVM: x86: XSAVE/XRSTOR live migration support

2010-06-13 Thread Avi Kivity

On 06/13/2010 12:10 PM, Sheng Yang wrote:



+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+  struct kvm_xcrs *guest_xcrs)
+{
+   int i, r = 0;
+
+   if (!cpu_has_xsave)
+   return -EINVAL;
   

Too strict?
 

If there is no cpu_has_xsave, KVM_CAP_XCRS would return 0, so this ioctl
shouldn't be called.
   


Right.

--
error compiling committee.c: too many arguments to function



[PATCH v4] KVM: x86: XSAVE/XRSTOR live migration support

2010-06-13 Thread Sheng Yang
This patch enables save/restore of xsave state.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 Documentation/kvm/api.txt|   74 ++
 arch/x86/include/asm/kvm.h   |   22 +++
 arch/x86/include/asm/xsave.h |7 ++-
 arch/x86/kvm/x86.c   |  139 ++
 include/linux/kvm.h  |   12 
 5 files changed, 252 insertions(+), 2 deletions(-)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 159b4ef..ffba03f 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -922,6 +922,80 @@ Define which vcpu is the Bootstrap Processor (BSP).  
Values are the same
 as the vcpu id in KVM_CREATE_VCPU.  If this ioctl is not called, the default
 is vcpu 0.
 
+4.41 KVM_GET_XSAVE
+
+Capability: KVM_CAP_XSAVE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xsave (out)
+Returns: 0 on success, -1 on error
+
+struct kvm_xsave {
+   __u32 region[1024];
+};
+
+This ioctl would copy current vcpu's xsave struct to the userspace.
+
+4.42 KVM_SET_XSAVE
+
+Capability: KVM_CAP_XSAVE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xsave (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_xsave {
+   __u32 region[1024];
+};
+
+This ioctl would copy userspace's xsave struct to the kernel.
+
+4.43 KVM_GET_XCRS
+
+Capability: KVM_CAP_XCRS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xcrs (out)
+Returns: 0 on success, -1 on error
+
+struct kvm_xcr {
+   __u32 xcr;
+   __u32 reserved;
+   __u64 value;
+};
+
+struct kvm_xcrs {
+   __u32 nr_xcrs;
+   __u32 flags;
+   struct kvm_xcr xcrs[KVM_MAX_XCRS];
+   __u64 padding[16];
+};
+
+This ioctl would copy current vcpu's xcrs to the userspace.
+
+4.44 KVM_SET_XCRS
+
+Capability: KVM_CAP_XCRS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xcrs (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_xcr {
+   __u32 xcr;
+   __u32 reserved;
+   __u64 value;
+};
+
+struct kvm_xcrs {
+   __u32 nr_xcrs;
+   __u32 flags;
+   struct kvm_xcr xcrs[KVM_MAX_XCRS];
+   __u64 padding[16];
+};
+
+This ioctl would set vcpu's xcr to the value userspace specified.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index ff90055..4d8dcbd 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -22,6 +22,8 @@
 #define __KVM_HAVE_XEN_HVM
 #define __KVM_HAVE_VCPU_EVENTS
 #define __KVM_HAVE_DEBUGREGS
+#define __KVM_HAVE_XSAVE
+#define __KVM_HAVE_XCRS
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -299,4 +301,24 @@ struct kvm_debugregs {
__u64 reserved[9];
 };
 
+/* for KVM_CAP_XSAVE */
+struct kvm_xsave {
+   __u32 region[1024];
+};
+
+#define KVM_MAX_XCRS   16
+
+struct kvm_xcr {
+   __u32 xcr;
+   __u32 reserved;
+   __u64 value;
+};
+
+struct kvm_xcrs {
+   __u32 nr_xcrs;
+   __u32 flags;
+   struct kvm_xcr xcrs[KVM_MAX_XCRS];
+   __u64 padding[16];
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 29ee4e4..32c3666 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -13,8 +13,11 @@
 
 #define FXSAVE_SIZE    512
 
-#define XSTATE_YMM_SIZE 256
-#define XSTATE_YMM_OFFSET (512 + 64)
+#define XSAVE_HDR_SIZE 64
+#define XSAVE_HDR_OFFSET    FXSAVE_SIZE
+
+#define XSAVE_YMM_SIZE 256
+#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
 
 /*
  * These are the features that the OS can handle currently.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7a4073b..682b5f2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1698,6 +1698,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
+   case KVM_CAP_XSAVE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1721,6 +1722,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_MCE:
r = KVM_MAX_MCE_BANKS;
break;
+   case KVM_CAP_XCRS:
+   r = cpu_has_xsave;
+   break;
default:
r = 0;
break;
@@ -2373,6 +2377,77 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct 
kvm_vcpu *vcpu,
return 0;
 }
 
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+struct kvm_xsave *guest_xsave)
+{
+   if (cpu_has_xsave)
+   memcpy(guest_xsave->region,
+   &vcpu->arch.guest_fpu.state->xsave,
+   sizeof(struct xsave_struct));
+   else {
+   memcpy(guest_xsave->region,
+   &vcpu->arch.guest_fpu.state->fxsave,
+   

Re: [PATCH V2] VFIO driver: Non-privileged user level PCI drivers

2010-06-13 Thread Michael S. Tsirkin
On Fri, Jun 11, 2010 at 03:15:53PM -0700, Tom Lyon wrote:
 [ bunch of stuff about MSI-X checking and IOMMUs and config registers...]
 
 OK, here's the thing.  The IOMMU API today does not do squat about
 dealing with interrupts. Interrupts are special because the APIC
 addresses are not each in their own page.  Yes, the IOMMU hardware
 supports it (at least Intel), and there's some Intel intr remapping
 code (not AMD), but it doesn't look like it is enough.

The iommu book from AMD seems to say that interrupt remapping table
address is taken from the device table entry.  So hardware support seems
to be there, and to me it looks like it should be enough.
Need to look at the iommu/msi code some more to figure out
whether what linux does is handling this correctly -
if it doesn't we need to fix that.

 Therefore, we must not allow the user level driver to diddle the MSI
 or MSI-X areas - either in config space or in the device memory space.

It won't help.
Consider that you want to let a userspace driver control
the device with DMA capabilities.

So if there is a range of addresses that device
can write into that can break host, these writes
can be triggered by userspace. Limiting
userspace access to MSI registers won't help:
you need a way to protect host from the device.

  If the device doesn't have its MSI-X registers in nice page aligned
  areas, then it is not well-behaved and it is S.O.L. The SR-IOV spec
  recommends that devices be designed the well-behaved way.
 
 When the code in vfio_pci_config speaks of virtualization it means
 that there are fake registers which the user driver can read or write,
 but do not affect the real registers. BARs are one case, MSI regs
 another. The PCI vendor and device ID are virtual because SR-IOV
 doesn't supply them but I wanted the user driver to find them in the
 same old place.

Sorry, I still don't understand why do we bother.  All this is already
implemented in userspace.  Why can't we just use this existing userspace
implementation?  It seems that all kernel needs to do is prevent
userspace from writing BARs.

Why can't we replace all this complexity with basically:

if (addr <= PCI_BASE_ADDRESS_5 && addr + len >= PCI_BASE_ADDRESS_0)
return -ENOPERM;

And maybe another register or two. Most registers should be fine.

 [ Re: Hotplug and Suspend/Resume]
 There are *plenty* of real drivers - brand new ones - which don't
 bother with these today.  Yeah, I can see adding them to the framework
 someday - but if there's no urgent need then it is way down the
 priority list.

Well, for kernel drivers everything mostly works out of the box, it is
handled by the PCI subsystem.  So some kind of framework will need to be
added for userspace drivers as well.  And I suspect this issue won't be
fixable later without breaking applications.

 Meanwhile, the other uses beckon.

Which other uses? I thought the whole point was fixing
what's broken with current kvm implementation.
So it seems to be we should not rush it ignoring existing issues such as
hotplug.

 And I never heard
 the Infiniband users complaining about not having these things.

I did.

-- 
MST


[PATCH 0/24] Nested VMX, v5

2010-06-13 Thread Nadav Har'El
Hi Avi,

This is a followup of our nested VMX patches that Orit Wasserman posted in
December. We've addressed most of the comments and concerns that you and
others on the mailing list had with the previous patch set. We hope you'll
find these patches easier to understand, and suitable for applying to KVM.


The following 24 patches implement nested VMX support. The patches enable a
guest to use the VMX APIs in order to run its own nested guests. I.e., it
allows running hypervisors (that use VMX) under KVM. We describe the theory
behind this work, our implementation, and its performance characteristics,
in IBM Research report H-0282, The Turtles Project: Design and Implementation
of Nested Virtualization, available at:

http://bit.ly/a0o9te

The current patches support running Linux under a nested KVM using shadow
page table (with bypass_guest_pf disabled). They support multiple nested
hypervisors, which can run multiple guests. Only 64-bit nested hypervisors
are supported. SMP is supported. Additional patches for running Windows under
nested KVM, and Linux under nested VMware server, and support for nested EPT,
are currently running in the lab, and will be sent as follow-on patchsets.

These patches were written by:
 Abel Gordon, abelg at il.ibm.com
 Nadav Har'El, nyh at il.ibm.com
 Orit Wasserman, oritw at il.ibm.com
 Ben-Ami Yassor, benami at il.ibm.com
 Muli Ben-Yehuda, muli at il.ibm.com

With contributions by:
 Anthony Liguori, aliguori at us.ibm.com
 Mike Day, mdday at us.ibm.com

This work was inspired by the nested SVM support by Alexander Graf and Joerg
Roedel.


Changes since v4:
* Rebased to the current KVM tree.
* Support for lazy FPU loading.
* Implemented about 90 requests and suggestions made on the mailing list
  regarding the previous version of this patch set.
* Split the changes into many more, and better documented, patches.

--
Nadav Har'El
IBM Haifa Research Lab


[PATCH 1/24] Move nested option from svm.c to x86.c

2010-06-13 Thread Nadav Har'El
The SVM module had a nested option, on by default, which controls whether
to allow nested virtualization. Now that VMX also supports nested
virtualization, we can move this option to x86.c, for both SVM and VMX.

The nested option takes three possible values. 0 disables nested
virtualization on both SVM and VMX, and 1 enables it on both.
The value 2, which is the default when this module option is not explicitly
set, asks each of SVM or VMX to choose its own default; Currently, VMX
disables nested virtualization in this case, while SVM leaves it enabled.

When nested VMX becomes more mature, this default should probably be changed
to enable nested virtualization on both architectures.
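
With the option living in the common module, enabling nested VMX would look
something like "modprobe kvm nested=1" followed by loading kvm-intel (an
illustrative invocation, not part of the patch).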

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/svm.c  2010-06-13 15:01:28.0 +0300
+++ .after/arch/x86/kvm/svm.c   2010-06-13 15:01:28.0 +0300
@@ -158,9 +158,6 @@ static int npt = 1;
 
 module_param(npt, int, S_IRUGO);
 
-static int nested = 1;
-module_param(nested, int, S_IRUGO);
-
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
--- .before/arch/x86/kvm/x86.c  2010-06-13 15:01:28.0 +0300
+++ .after/arch/x86/kvm/x86.c   2010-06-13 15:01:28.0 +0300
@@ -95,6 +95,17 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
 int ignore_msrs = 0;
 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
+/* If nested=1, nested virtualization is supported. I.e., the guest may use
+ * VMX or SVM (as appropriate) and be a hypervisor for its own guests.
+ * If nested=0, nested virtualization is not supported.
+ * When nested starts as 2 (which is the default), it is later modified by the
+ * specific module used (VMX or SVM). Currently, nested will be left enabled
+ * on SVM, but reset to 0 on VMX.
+ */
+int nested = 2;
+EXPORT_SYMBOL_GPL(nested);
+module_param(nested, int, S_IRUGO);
+
 #define KVM_NR_SHARED_MSRS 16
 
 struct kvm_shared_msrs_global {
--- .before/arch/x86/kvm/x86.h  2010-06-13 15:01:28.0 +0300
+++ .after/arch/x86/kvm/x86.h   2010-06-13 15:01:28.0 +0300
@@ -75,4 +75,6 @@ static inline struct kvm_mem_aliases *kv
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 
+extern int nested;
+
 #endif
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:28.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:28.0 +0300
@@ -4310,6 +4310,12 @@ static int __init vmx_init(void)
 {
int r, i;
 
+   /* By default (when nested==2), turn off nested support. This check
+* should be removed when nested VMX is considered mature enough.
+*/
+   if (nested != 1)
+   nested = 0;
+
rdmsrl_safe(MSR_EFER, host_efer);
 
 for (i = 0; i < NR_VMX_MSR; ++i)


[PATCH 2/24] Add VMX and SVM to list of supported cpuid features

2010-06-13 Thread Nadav Har'El
Add the VMX CPU feature to the list of CPU featuress KVM advertises with
the KVM_GET_SUPPORTED_CPUID ioctl (unless the nested module option is off).

Qemu uses this ioctl, and intersects KVM's list with its own list of desired
cpu features (depending on the -cpu option given to qemu) to determine the
final list of features presented to the guest.
This patch also does the same for SVM: KVM now advertises it supports SVM,
unless the nested module option is off.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/x86.c  2010-06-13 15:01:28.0 +0300
+++ .after/arch/x86/kvm/x86.c   2010-06-13 15:01:28.0 +0300
@@ -1923,7 +1923,7 @@ static void do_cpuid_ent(struct kvm_cpui
/* cpuid 1.ecx */
const u32 kvm_supported_word4_x86_features =
F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
-   0 /* DS-CPL, VMX, SMX, EST */ |
+   0 /* DS-CPL */ | (nested ? F(VMX) : 0) | 0 /* SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
0 /* Reserved, DCA */ | F(XMM4_1) |
@@ -1931,7 +1931,8 @@ static void do_cpuid_ent(struct kvm_cpui
0 /* Reserved, XSAVE, OSXSAVE */;
/* cpuid 0x8001.ecx */
const u32 kvm_supported_word6_x86_features =
-   F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
+   F(LAHF_LM) | F(CMP_LEGACY) | (nested ? F(SVM) : 0) |
+   0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
0 /* SKINIT */ | 0 /* WDT */;


[PATCH 3/24] Implement VMXON and VMXOFF

2010-06-13 Thread Nadav Har'El
This patch allows a guest to use the VMXON and VMXOFF instructions, and
emulates them accordingly. Basically this amounts to checking some
prerequisites, and then remembering whether the guest has enabled or disabled
VMX operation.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:28.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:28.0 +0300
@@ -117,6 +117,16 @@ struct shared_msr_entry {
u64 mask;
 };
 
+/* The nested_vmx structure is part of vcpu_vmx, and holds information we need
+ * for correct emulation of VMX (i.e., nested VMX) on this vcpu. For example,
+ * the current VMCS set by L1, a list of the VMCSs used to run the active
+ * L2 guests on the hardware, and more.
+ */
+struct nested_vmx {
+   /* Has the level1 guest done vmxon? */
+   bool vmxon;
+};
+
 struct vcpu_vmx {
struct kvm_vcpu   vcpu;
struct list_head  local_vcpus_link;
@@ -168,6 +178,9 @@ struct vcpu_vmx {
u32 exit_reason;
 
bool rdtscp_enabled;
+
+   /* Support for guest hypervisors (nested VMX) */
+   struct nested_vmx nested;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -3353,6 +3366,93 @@ static int handle_vmx_insn(struct kvm_vc
return 1;
 }
 
+/* Emulate the VMXON instruction.
+ * Currently, we just remember that VMX is active, and do not save or even
+ * inspect the argument to VMXON (the so-called VMXON pointer) because we
+ * do not currently need to store anything in that guest-allocated memory
+ * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
+ * argument is different from the VMXON pointer (which the spec says they do).
+ */
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+   struct kvm_segment cs;
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+   /* The Intel VMX Instruction Reference lists a bunch of bits that
+* are prerequisite to running VMXON, most notably CR4.VMXE must be
+* set to 1. Otherwise, we should fail with #UD. We test these now:
+*/
+   if (!nested) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   if (!(vcpu->arch.cr4 & X86_CR4_VMXE) ||
+   !(vcpu->arch.cr0 & X86_CR0_PE) ||
+   (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+   if (is_long_mode(vcpu) && !cs.l) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 1;
+   }
+
+   if (vmx_get_cpl(vcpu)) {
+   kvm_inject_gp(vcpu, 0);
+   return 1;
+   }
+
+   vmx->nested.vmxon = 1;
+
+   skip_emulated_instruction(vcpu);
+   return 1;
+}
+
+/*
+ * Intel's VMX Instruction Reference specifies a common set of prerequisites
+ * for running VMX instructions (except VMXON, whose prerequisites are
+ * slightly different). It also specifies what exception to inject otherwise.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+   struct kvm_segment cs;
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+   if (!vmx->nested.vmxon) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 0;
+   }
+
+   vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+   if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+   (is_long_mode(vcpu) && !cs.l)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 0;
+   }
+
+   if (vmx_get_cpl(vcpu)) {
+   kvm_inject_gp(vcpu, 0);
+   return 0;
+   }
+
+   return 1;
+}
+
+/* Emulate the VMXOFF instruction */
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+   if (!nested_vmx_check_permission(vcpu))
+   return 1;
+
+   to_vmx(vcpu)->nested.vmxon = 0;
+
+   skip_emulated_instruction(vcpu);
+   return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3642,8 +3742,8 @@ static int (*kvm_vmx_exit_handlers[])(st
[EXIT_REASON_VMREAD]  = handle_vmx_insn,
[EXIT_REASON_VMRESUME]= handle_vmx_insn,
[EXIT_REASON_VMWRITE] = handle_vmx_insn,
-   [EXIT_REASON_VMOFF]   = handle_vmx_insn,
-   [EXIT_REASON_VMON]= handle_vmx_insn,
+   [EXIT_REASON_VMOFF]   = handle_vmoff,
+   [EXIT_REASON_VMON]= handle_vmon,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD]  = handle_wbinvd,

[PATCH 4/24] Allow setting the VMXE bit in CR4

2010-06-13 Thread Nadav Har'El
This patch allows the guest to enable the VMXE bit in CR4, which is a
prerequisite to running VMXON.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/x86.c  2010-06-13 15:01:28.0 +0300
+++ .after/arch/x86/kvm/x86.c   2010-06-13 15:01:28.0 +0300
@@ -501,7 +501,7 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu,
!load_pdptrs(vcpu, vcpu-arch.cr3))
return 1;
 
-   if (cr4 & X86_CR4_VMXE)
+   if (cr4 & X86_CR4_VMXE && !nested)
return 1;
 
 kvm_x86_ops->set_cr4(vcpu, cr4);


[PATCH 5/24] Introduce vmcs12: a VMCS structure for L1

2010-06-13 Thread Nadav Har'El
An implementation of VMX needs to define a VMCS structure. This structure
is kept in guest memory, but is opaque to the guest (who can only read or
write it with VMX instructions).

This patch starts to define the VMCS structure which our nested VMX
implementation will present to L1. We call it vmcs12, as it is the VMCS
that L1 keeps for its L2 guests.

This patch also adds the notion (as required by the VMX spec) of the current
VMCS, and finally includes utility functions for mapping the guest-allocated
VMCSs in host memory.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:28.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:28.0 +0300
@@ -117,6 +117,29 @@ struct shared_msr_entry {
u64 mask;
 };
 
+#define VMCS12_REVISION 0x11e57ed0
+
+/*
+ * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
+ * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
+ * a VMCS structure (which is opaque to the guest), and vmcs12 is our emulated
+ * VMX's VMCS. This structure is stored in guest memory specified by VMPTRLD,
+ * and accessed by the guest using VMREAD/VMWRITE/VMCLEAR instructions. More
+ * than one of these structures may exist, if L1 runs multiple L2 guests.
+ * nested_vmx_run() will use the data here to build a VMCS for the underlying
+ * hardware which will be used to run L2.
+ * This structure is packed in order to preserve the binary content after live
+ * migration. If there are changes in the content or layout, VMCS12_REVISION
+ * must be changed.
+ */
+struct __attribute__ ((__packed__)) vmcs12 {
+   /* According to the Intel spec, a VMCS region must start with the
+* following two fields. Then follow implementation-specific data.
+*/
+   u32 revision_id;
+   u32 abort;
+};
+
 /* The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu. For example,
  * the current VMCS set by L1, a list of the VMCSs used to run the active
@@ -125,6 +148,11 @@ struct shared_msr_entry {
 struct nested_vmx {
/* Has the level1 guest done vmxon? */
bool vmxon;
+
+   /* The guest-physical address of the current VMCS L1 keeps for L2 */
+   gpa_t current_vmptr;
+   /* The host-usable pointer to the above. Set by nested_map_current() */
+   struct vmcs12 *current_l2_page;
 };
 
 struct vcpu_vmx {
@@ -188,6 +216,61 @@ static inline struct vcpu_vmx *to_vmx(st
return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static struct page *nested_get_page(struct kvm_vcpu *vcpu, u64 vmcs_addr)
+{
+   struct page *vmcs_page =
+   gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
+
+   if (is_error_page(vmcs_page)) {
+   printk(KERN_ERR "%s error allocating page 0x%llx\n",
+  __func__, vmcs_addr);
+   kvm_release_page_clean(vmcs_page);
+   return NULL;
+   }
+   return vmcs_page;
+}
+
+static int nested_map_current(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct page *vmcs_page =
+   nested_get_page(vcpu, vmx->nested.current_vmptr);
+
+   if (vmcs_page == NULL) {
+   printk(KERN_INFO "%s: failure in nested_get_page\n", __func__);
+   return 0;
+   }
+
+   if (vmx->nested.current_l2_page) {
+   printk(KERN_INFO "Shadow vmcs already mapped\n");
+   BUG_ON(1);
+   return 0;
+   }
+
+   vmx->nested.current_l2_page = kmap_atomic(vmcs_page, KM_USER0);
+   return 1;
+}
+
+static void nested_unmap_current(struct kvm_vcpu *vcpu)
+{
+   struct page *page;
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+   if (!vmx->nested.current_l2_page) {
+   printk(KERN_INFO "Shadow vmcs already unmapped\n");
+   BUG_ON(1);
+   return;
+   }
+
+   page = kmap_atomic_to_page(vmx->nested.current_l2_page);
+
+   kunmap_atomic(vmx->nested.current_l2_page, KM_USER0);
+
+   kvm_release_page_dirty(page);
+
+   vmx->nested.current_l2_page = NULL;
+}
+
 static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
@@ -4186,6 +4269,9 @@ static struct kvm_vcpu *vmx_create_vcpu(
goto free_vmcs;
}
 
+   vmx->nested.current_vmptr = -1ull;
+   vmx->nested.current_l2_page = NULL;
+
return vmx-vcpu;
 
 free_vmcs:


[PATCH 6/24] Implement reading and writing of VMX MSRs

2010-06-13 Thread Nadav Har'El
When the guest can use VMX instructions (when the nested module option is
on), it should also be able to read and write VMX MSRs, e.g., to query about
VMX capabilities. This patch adds this support.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/x86.c  2010-06-13 15:01:28.0 +0300
+++ .after/arch/x86/kvm/x86.c   2010-06-13 15:01:28.0 +0300
@@ -702,7 +702,11 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-   MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+   MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+   MSR_IA32_FEATURE_CONTROL,  MSR_IA32_VMX_BASIC,
+   MSR_IA32_VMX_PINBASED_CTLS, MSR_IA32_VMX_PROCBASED_CTLS,
+   MSR_IA32_VMX_EXIT_CTLS, MSR_IA32_VMX_ENTRY_CTLS,
+   MSR_IA32_VMX_PROCBASED_CTLS2, MSR_IA32_VMX_EPT_VPID_CAP,
 };
 
 static unsigned num_msrs_to_save;
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:28.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:28.0 +0300
@@ -1231,6 +1231,98 @@ static void guest_write_tsc(u64 guest_ts
 }
 
 /*
+ * If we allow our guest to use VMX instructions, we should also let it use
+ * VMX-specific MSRs.
+ */
+static int nested_vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+   u64 vmx_msr = 0;
+   u32 vmx_msr_high, vmx_msr_low;
+
+   switch (msr_index) {
+   case MSR_IA32_FEATURE_CONTROL:
+   *pdata = 0;
+   break;
+   case MSR_IA32_VMX_BASIC:
+   /*
+* This MSR reports some information about VMX support of the
+* processor. We should return information about the VMX we
+* emulate for the guest, and the VMCS structure we give it -
+* not about the VMX support of the underlying hardware.
+* However, some capabilities of the underlying hardware are
+* used directly by our emulation (e.g., the physical address
+* width), so these are copied from what the hardware reports.
+*/
+   *pdata = VMCS12_REVISION |
+   (((u64)sizeof(struct vmcs12)) << 32);
+   rdmsrl(MSR_IA32_VMX_BASIC, vmx_msr);
+#define VMX_BASIC_64   0x0001000000000000LLU
+#define VMX_BASIC_MEM_TYPE 0x003c000000000000LLU
+#define VMX_BASIC_INOUT    0x0040000000000000LLU
+   *pdata |= vmx_msr &
+   (VMX_BASIC_64 | VMX_BASIC_MEM_TYPE | VMX_BASIC_INOUT);
+   break;
+#define CORE2_PINBASED_CTLS_MUST_BE_ONE  0x0016
+#define MSR_IA32_VMX_TRUE_PINBASED_CTLS  0x48d
+   case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+   case MSR_IA32_VMX_PINBASED_CTLS:
+   vmx_msr_low  = CORE2_PINBASED_CTLS_MUST_BE_ONE;
+   vmx_msr_high = CORE2_PINBASED_CTLS_MUST_BE_ONE |
+   PIN_BASED_EXT_INTR_MASK |
+   PIN_BASED_NMI_EXITING |
+   PIN_BASED_VIRTUAL_NMIS;
+   *pdata = vmx_msr_low | ((u64)vmx_msr_high << 32);
+   break;
+   case MSR_IA32_VMX_PROCBASED_CTLS:
+   /* This MSR determines which vm-execution controls the L1
+* hypervisor may ask, or may not ask, to enable. Normally we
+* can only allow enabling features which the hardware can
+* support, but we limit ourselves to allowing only known
+* features that were tested nested. We allow disabling any
+* feature (even if the hardware can't disable it).
+*/
+   rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+
+   vmx_msr_low = 0; /* allow disabling any feature */
+   vmx_msr_high &= /* do not expose new untested features */
+   CPU_BASED_HLT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
+   CPU_BASED_CR3_STORE_EXITING | CPU_BASED_USE_IO_BITMAPS |
+   CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING |
+   CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING |
+   CPU_BASED_INVLPG_EXITING | CPU_BASED_TPR_SHADOW |
+   CPU_BASED_USE_MSR_BITMAPS |
+#ifdef CONFIG_X86_64
+   CPU_BASED_CR8_LOAD_EXITING |
+   CPU_BASED_CR8_STORE_EXITING |
+#endif
+   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+   *pdata = vmx_msr_low | ((u64)vmx_msr_high << 32);
+   break;
+   case MSR_IA32_VMX_EXIT_CTLS:
+   *pdata = 0;
+#ifdef CONFIG_X86_64
+   *pdata |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+   break;
+   case MSR_IA32_VMX_ENTRY_CTLS:
+   *pdata = 0;
+   break;
+   case MSR_IA32_VMX_PROCBASED_CTLS2:
+   *pdata = 0;

[PATCH 7/24] Understanding guest pointers to vmcs12 structures

2010-06-13 Thread Nadav Har'El
This patch includes a couple of utility functions for extracting pointer
operands of VMX instructions issued by L1 (a guest hypervisor), and
translating guest-given vmcs12 virtual addresses to guest-physical addresses.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/x86.c  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/x86.c   2010-06-13 15:01:29.0 +0300
@@ -3286,13 +3286,14 @@ static int kvm_fetch_guest_virt(gva_t ad
  access | PFERR_FETCH_MASK, error);
 }
 
-static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
   struct kvm_vcpu *vcpu, u32 *error)
 {
 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
  error);
 }
+EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
 static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int 
bytes,
   struct kvm_vcpu *vcpu, u32 *error)
--- .before/arch/x86/kvm/x86.h  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/x86.h   2010-06-13 15:01:29.0 +0300
@@ -75,6 +75,9 @@ static inline struct kvm_mem_aliases *kv
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 
+int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+   struct kvm_vcpu *vcpu, u32 *error);
+
 extern int nested;
 
 #endif
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:29.0 +0300
@@ -3654,6 +3654,86 @@ static int handle_vmoff(struct kvm_vcpu 
return 1;
 }
 
+/*
+ * Decode the memory-address operand of a vmx instruction, according to the
+ * Intel spec.
+ */
+#define VMX_OPERAND_SCALING(vii)   ((vii) & 3)
+#define VMX_OPERAND_ADDR_SIZE(vii) (((vii) >> 7) & 7)
+#define VMX_OPERAND_IS_REG(vii)    ((vii) & (1u << 10))
+#define VMX_OPERAND_SEG_REG(vii)   (((vii) >> 15) & 7)
+#define VMX_OPERAND_INDEX_REG(vii) (((vii) >> 18) & 0xf)
+#define VMX_OPERAND_INDEX_INVALID(vii) ((vii) & (1u << 22))
+#define VMX_OPERAND_BASE_REG(vii)  (((vii) >> 23) & 0xf)
+#define VMX_OPERAND_BASE_INVALID(vii)  ((vii) & (1u << 27))
+#define VMX_OPERAND_REG(vii)   (((vii) >> 3) & 0xf)
+#define VMX_OPERAND_REG2(vii)  (((vii) >> 28) & 0xf)
+static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu,
+unsigned long exit_qualification,
+u32 vmx_instruction_info)
+{
+   int  scaling = VMX_OPERAND_SCALING(vmx_instruction_info);
+   int  addr_size = VMX_OPERAND_ADDR_SIZE(vmx_instruction_info);
+   bool is_reg = VMX_OPERAND_IS_REG(vmx_instruction_info);
+   int  seg_reg = VMX_OPERAND_SEG_REG(vmx_instruction_info);
+   int  index_reg = VMX_OPERAND_INDEX_REG(vmx_instruction_info);
+   bool index_is_valid = !VMX_OPERAND_INDEX_INVALID(vmx_instruction_info);
+   int  base_reg   = VMX_OPERAND_BASE_REG(vmx_instruction_info);
+   bool base_is_valid  = !VMX_OPERAND_BASE_INVALID(vmx_instruction_info);
+   gva_t addr;
+
+   if (is_reg) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return 0;
+   }
+
+   switch (addr_size) {
+   case 1: /* 32 bit. high bits are undefined according to the spec: */
+   exit_qualification &= 0xffffffff;
+   break;
+   case 2: /* 64 bit */
+   break;
+   default: /* addr_size=0 means 16 bit */
+   return 0;
+   }
+
+   /* Addr = segment_base + offset */
+   /* offset = Base + [Index * Scale] + Displacement */
+   addr = vmx_get_segment_base(vcpu, seg_reg);
+   if (base_is_valid)
+   addr += kvm_register_read(vcpu, base_reg);
+   if (index_is_valid)
+   addr += kvm_register_read(vcpu, index_reg) << scaling;
+   addr += exit_qualification; /* holds the displacement */
+
+   return addr;
+}
+
+static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gpa_t *gpap)
+{
+   int r;
+   gva_t gva = get_vmx_mem_address(vcpu,
+   vmcs_readl(EXIT_QUALIFICATION),
+   vmcs_read32(VMX_INSTRUCTION_INFO));
+   if (gva == 0)
+   return 1;
+   *gpap = 0;
+   r = kvm_read_guest_virt(gva, gpap, sizeof(*gpap), vcpu, NULL);
+   if (r) {
+   printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
+  __func__, gva, r);
+   return r;
+   }
+   /* According to the spec, VMCS addresses must be 4K aligned */
+   if (!IS_ALIGNED(*gpap, PAGE_SIZE)) {
+   printk(KERN_DEBUG "%s addr %llx not aligned\n",
+  __func__, *gpap);
+   return 1;
+   }
+
+   return 0;
+}
+
 static int 

[PATCH 8/24] Hold a vmcs02 for each vmcs12

2010-06-13 Thread Nadav Har'El
In this patch we add a list of L0 (hardware) VMCSs, which we'll use to hold a 
hardware VMCS for each active L1 VMCS (i.e., for each L2 guest).

We call each of these L0 VMCSs a vmcs02, as it is the VMCS that L0 uses
to run its nested guest L2.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:29.0 +0300
@@ -140,6 +140,12 @@ struct __attribute__ ((__packed__)) vmcs
u32 abort;
 };
 
+struct vmcs_list {
+   struct list_head list;
+   gpa_t vmcs_addr;
+   struct vmcs *l2_vmcs;
+};
+
 /* The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu. For example,
  * the current VMCS set by L1, a list of the VMCSs used to run the active
@@ -153,6 +159,10 @@ struct nested_vmx {
gpa_t current_vmptr;
/* The host-usable pointer to the above. Set by nested_map_current() */
struct vmcs12 *current_l2_page;
+
+   /* list of real (hardware) VMCS, one for each L2 guest of L1 */
+   struct list_head l2_vmcs_list; /* a vmcs_list */
+   int l2_vmcs_num;
 };
 
 struct vcpu_vmx {
@@ -1754,6 +1764,84 @@ static void free_vmcs(struct vmcs *vmcs)
free_pages((unsigned long)vmcs, vmcs_config.order);
 }
 
+static struct vmcs *nested_get_current_vmcs(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct vmcs_list *list_item, *n;
+
+   list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list, list)
+   if (list_item->vmcs_addr == vmx->nested.current_vmptr)
+   return list_item->l2_vmcs;
+
+   return NULL;
+}
+
+/* Allocate an L0 VMCS (vmcs02) for the current L1 VMCS (vmcs12), if one
+ * does not already exist. The allocation is done in L0 memory, so to avoid
+ * denial-of-service attack by guests, we limit the number of concurrently-
+ * allocated vmcss. A well-behaving L1 will VMCLEAR unused vmcs12s and not
+ * trigger this limit.
+ */
+static const int NESTED_MAX_VMCS = 256;
+static int nested_create_current_vmcs(struct kvm_vcpu *vcpu)
+{
+   struct vmcs_list *new_l2_guest;
+   struct vmcs *l2_vmcs;
+
+   if (nested_get_current_vmcs(vcpu))
+   return 0; /* nothing to do - we already have a VMCS */
+
+   if (to_vmx(vcpu)->nested.l2_vmcs_num >= NESTED_MAX_VMCS)
+   return -ENOMEM;
+
+   new_l2_guest = (struct vmcs_list *)
+   kmalloc(sizeof(struct vmcs_list), GFP_KERNEL);
+   if (!new_l2_guest)
+   return -ENOMEM;
+
+   l2_vmcs = alloc_vmcs();
+   if (!l2_vmcs) {
+   kfree(new_l2_guest);
+   return -ENOMEM;
+   }
+
+   new_l2_guest->vmcs_addr = to_vmx(vcpu)->nested.current_vmptr;
+   new_l2_guest->l2_vmcs = l2_vmcs;
+   list_add(&(new_l2_guest->list), &(to_vmx(vcpu)->nested.l2_vmcs_list));
+   to_vmx(vcpu)->nested.l2_vmcs_num++;
+   return 0;
+}
+
+/* Free the current L2 VMCS, and remove it from l2_vmcs_list */
+static void nested_free_current_vmcs(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct vmcs_list *list_item, *n;
+
+   list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list, list)
+   if (list_item->vmcs_addr == vmx->nested.current_vmptr) {
+   free_vmcs(list_item->l2_vmcs);
+   list_del(&(list_item->list));
+   kfree(list_item);
+   vmx->nested.l2_vmcs_num--;
+   return;
+   }
+}
+
+static void free_l1_state(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   struct vmcs_list *list_item, *n;
+
+   list_for_each_entry_safe(list_item, n,
+   &vmx->nested.l2_vmcs_list, list) {
+   free_vmcs(list_item->l2_vmcs);
+   list_del(&(list_item->list));
+   kfree(list_item);
+   }
+   vmx->nested.l2_vmcs_num = 0;
+}
+
 static void free_kvm_area(void)
 {
int cpu;
@@ -3606,6 +3694,9 @@ static int handle_vmon(struct kvm_vcpu *
return 1;
}
 
+   INIT_LIST_HEAD(&(vmx->nested.l2_vmcs_list));
+   vmx->nested.l2_vmcs_num = 0;
+
vmx-nested.vmxon = 1;
 
skip_emulated_instruction(vcpu);
@@ -3650,6 +3741,8 @@ static int handle_vmoff(struct kvm_vcpu 
 
to_vmx(vcpu)->nested.vmxon = 0;
 
+   free_l1_state(vcpu);
+
skip_emulated_instruction(vcpu);
return 1;
 }
@@ -4402,6 +4495,8 @@ static void vmx_free_vcpu(struct kvm_vcp
struct vcpu_vmx *vmx = to_vmx(vcpu);
 
free_vpid(vmx);
+   if (vmx->nested.vmxon)
+   free_l1_state(vcpu);
vmx_free_vmcs(vcpu);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
--

[PATCH 9/24] Implement VMCLEAR

2010-06-13 Thread Nadav Har'El
This patch implements the VMCLEAR instruction.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:29.0 +0300
@@ -138,6 +138,8 @@ struct __attribute__ ((__packed__)) vmcs
 */
u32 revision_id;
u32 abort;
+
+   bool launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
 };
 
 struct vmcs_list {
@@ -3827,6 +3829,46 @@ static int read_guest_vmcs_gpa(struct kv
return 0;
 }
 
+static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
+{
+   unsigned long rflags;
+   rflags = vmx_get_rflags(vcpu);
+   rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_ZF);
+   vmx_set_rflags(vcpu, rflags);
+}
+
+/* Emulate the VMCLEAR instruction */
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   gpa_t guest_vmcs_addr, save_current_vmptr;
+
+   if (!nested_vmx_check_permission(vcpu))
+   return 1;
+
+   if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr))
+   return 1;
+
+   save_current_vmptr = vmx->nested.current_vmptr;
+
+   vmx->nested.current_vmptr = guest_vmcs_addr;
+   if (!nested_map_current(vcpu))
+   return 1;
+   vmx->nested.current_l2_page->launch_state = 0;
+   nested_unmap_current(vcpu);
+
+   nested_free_current_vmcs(vcpu);
+
+   if (save_current_vmptr == guest_vmcs_addr)
+   vmx->nested.current_vmptr = -1ull;
+   else
+   vmx->nested.current_vmptr = save_current_vmptr;
+
+   skip_emulated_instruction(vcpu);
+   clear_rflags_cf_zf(vcpu);
+   return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4109,7 +4151,7 @@ static int (*kvm_vmx_exit_handlers[])(st
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_INVLPG]  = handle_invlpg,
[EXIT_REASON_VMCALL]  = handle_vmcall,
-   [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
+   [EXIT_REASON_VMCLEAR] = handle_vmclear,
[EXIT_REASON_VMLAUNCH]= handle_vmx_insn,
[EXIT_REASON_VMPTRLD] = handle_vmx_insn,
[EXIT_REASON_VMPTRST] = handle_vmx_insn,
--


[PATCH 10/24] Implement VMPTRLD

2010-06-13 Thread Nadav Har'El
This patch implements the VMPTRLD instruction.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:29.0 +0300
@@ -3829,6 +3829,26 @@ static int read_guest_vmcs_gpa(struct kv
return 0;
 }
 
+static void set_rflags_to_vmx_fail_invalid(struct kvm_vcpu *vcpu)
+{
+   unsigned long rflags;
+   rflags = vmx_get_rflags(vcpu);
+   rflags |= X86_EFLAGS_CF;
+   rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_ZF &
+   ~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
+   vmx_set_rflags(vcpu, rflags);
+}
+
+static void set_rflags_to_vmx_fail_valid(struct kvm_vcpu *vcpu)
+{
+   unsigned long rflags;
+   rflags = vmx_get_rflags(vcpu);
+   rflags |= X86_EFLAGS_ZF;
+   rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_CF &
+   ~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
+   vmx_set_rflags(vcpu, rflags);
+}
+
 static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
 {
unsigned long rflags;
@@ -3869,6 +3889,57 @@ static int handle_vmclear(struct kvm_vcp
return 1;
 }
 
+static bool verify_vmcs12_revision(struct kvm_vcpu *vcpu, gpa_t 
guest_vmcs_addr)
+{
+   bool ret;
+   struct vmcs12 *vmcs12;
+   struct page *vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
+   if (vmcs_page == NULL)
+   return 0;
+   vmcs12 = (struct vmcs12 *)kmap_atomic(vmcs_page, KM_USER0);
+   if (vmcs12->revision_id == VMCS12_REVISION)
+   ret = 1;
+   else {
+   set_rflags_to_vmx_fail_valid(vcpu);
+   ret = 0;
+   }
+   kunmap_atomic(vmcs12, KM_USER0);
+   kvm_release_page_dirty(vmcs_page);
+   return ret;
+}
+
+/* Emulate the VMPTRLD instruction */
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   gpa_t guest_vmcs_addr;
+
+   if (!nested_vmx_check_permission(vcpu))
+   return 1;
+
+   if (read_guest_vmcs_gpa(vcpu, &guest_vmcs_addr)) {
+   set_rflags_to_vmx_fail_invalid(vcpu);
+   return 1;
+   }
+
+   if (!verify_vmcs12_revision(vcpu, guest_vmcs_addr))
+   return 1;
+
+   if (vmx->nested.current_vmptr != guest_vmcs_addr) {
+   vmx->nested.current_vmptr = guest_vmcs_addr;
+
+   if (nested_create_current_vmcs(vcpu)) {
+   printk(KERN_ERR "%s error could not allocate memory",
+   __func__);
+   return -ENOMEM;
+   }
+   }
+
+   clear_rflags_cf_zf(vcpu);
+   skip_emulated_instruction(vcpu);
+   return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4153,7 +4224,7 @@ static int (*kvm_vmx_exit_handlers[])(st
[EXIT_REASON_VMCALL]  = handle_vmcall,
[EXIT_REASON_VMCLEAR] = handle_vmclear,
[EXIT_REASON_VMLAUNCH]= handle_vmx_insn,
-   [EXIT_REASON_VMPTRLD] = handle_vmx_insn,
+   [EXIT_REASON_VMPTRLD] = handle_vmptrld,
[EXIT_REASON_VMPTRST] = handle_vmx_insn,
[EXIT_REASON_VMREAD]  = handle_vmx_insn,
[EXIT_REASON_VMRESUME]= handle_vmx_insn,
--


[PATCH 12/24] Add VMCS fields to the vmcs12

2010-06-13 Thread Nadav Har'El
In this patch we add to vmcs12 (the VMCS that L1 keeps for L2) all the
standard VMCS fields. These fields are encapsulated in a struct shadow_vmcs.

Later patches will enable L1 to read and write these fields using VMREAD/
VMWRITE, and they will be used during a VMLAUNCH/VMRESUME in preparing a real
VMCS for running L2.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:29.0 +0300
@@ -117,6 +117,136 @@ struct shared_msr_entry {
u64 mask;
 };
 
+/* shadow_vmcs is a structure used in nested VMX for holding a copy of all
+ * standard VMCS fields. It is used for emulating a VMCS for L1 (see vmcs12),
+ * and also for easier access to VMCS data (see l1_shadow_vmcs).
+ */
+struct __attribute__ ((__packed__)) shadow_vmcs {
+   u16 virtual_processor_id;
+   u16 guest_es_selector;
+   u16 guest_cs_selector;
+   u16 guest_ss_selector;
+   u16 guest_ds_selector;
+   u16 guest_fs_selector;
+   u16 guest_gs_selector;
+   u16 guest_ldtr_selector;
+   u16 guest_tr_selector;
+   u16 host_es_selector;
+   u16 host_cs_selector;
+   u16 host_ss_selector;
+   u16 host_ds_selector;
+   u16 host_fs_selector;
+   u16 host_gs_selector;
+   u16 host_tr_selector;
+   u64 io_bitmap_a;
+   u64 io_bitmap_b;
+   u64 msr_bitmap;
+   u64 vm_exit_msr_store_addr;
+   u64 vm_exit_msr_load_addr;
+   u64 vm_entry_msr_load_addr;
+   u64 tsc_offset;
+   u64 virtual_apic_page_addr;
+   u64 apic_access_addr;
+   u64 ept_pointer;
+   u64 guest_physical_address;
+   u64 vmcs_link_pointer;
+   u64 guest_ia32_debugctl;
+   u64 guest_ia32_pat;
+   u64 guest_pdptr0;
+   u64 guest_pdptr1;
+   u64 guest_pdptr2;
+   u64 guest_pdptr3;
+   u64 host_ia32_pat;
+   u32 pin_based_vm_exec_control;
+   u32 cpu_based_vm_exec_control;
+   u32 exception_bitmap;
+   u32 page_fault_error_code_mask;
+   u32 page_fault_error_code_match;
+   u32 cr3_target_count;
+   u32 vm_exit_controls;
+   u32 vm_exit_msr_store_count;
+   u32 vm_exit_msr_load_count;
+   u32 vm_entry_controls;
+   u32 vm_entry_msr_load_count;
+   u32 vm_entry_intr_info_field;
+   u32 vm_entry_exception_error_code;
+   u32 vm_entry_instruction_len;
+   u32 tpr_threshold;
+   u32 secondary_vm_exec_control;
+   u32 vm_instruction_error;
+   u32 vm_exit_reason;
+   u32 vm_exit_intr_info;
+   u32 vm_exit_intr_error_code;
+   u32 idt_vectoring_info_field;
+   u32 idt_vectoring_error_code;
+   u32 vm_exit_instruction_len;
+   u32 vmx_instruction_info;
+   u32 guest_es_limit;
+   u32 guest_cs_limit;
+   u32 guest_ss_limit;
+   u32 guest_ds_limit;
+   u32 guest_fs_limit;
+   u32 guest_gs_limit;
+   u32 guest_ldtr_limit;
+   u32 guest_tr_limit;
+   u32 guest_gdtr_limit;
+   u32 guest_idtr_limit;
+   u32 guest_es_ar_bytes;
+   u32 guest_cs_ar_bytes;
+   u32 guest_ss_ar_bytes;
+   u32 guest_ds_ar_bytes;
+   u32 guest_fs_ar_bytes;
+   u32 guest_gs_ar_bytes;
+   u32 guest_ldtr_ar_bytes;
+   u32 guest_tr_ar_bytes;
+   u32 guest_interruptibility_info;
+   u32 guest_activity_state;
+   u32 guest_sysenter_cs;
+   u32 host_ia32_sysenter_cs;
+   unsigned long cr0_guest_host_mask;
+   unsigned long cr4_guest_host_mask;
+   unsigned long cr0_read_shadow;
+   unsigned long cr4_read_shadow;
+   unsigned long cr3_target_value0;
+   unsigned long cr3_target_value1;
+   unsigned long cr3_target_value2;
+   unsigned long cr3_target_value3;
+   unsigned long exit_qualification;
+   unsigned long guest_linear_address;
+   unsigned long guest_cr0;
+   unsigned long guest_cr3;
+   unsigned long guest_cr4;
+   unsigned long guest_es_base;
+   unsigned long guest_cs_base;
+   unsigned long guest_ss_base;
+   unsigned long guest_ds_base;
+   unsigned long guest_fs_base;
+   unsigned long guest_gs_base;
+   unsigned long guest_ldtr_base;
+   unsigned long guest_tr_base;
+   unsigned long guest_gdtr_base;
+   unsigned long guest_idtr_base;
+   unsigned long guest_dr7;
+   unsigned long guest_rsp;
+   unsigned long guest_rip;
+   unsigned long guest_rflags;
+   unsigned long guest_pending_dbg_exceptions;
+   unsigned long guest_sysenter_esp;
+   unsigned long guest_sysenter_eip;
+   unsigned long host_cr0;
+   unsigned long host_cr3;
+   unsigned long host_cr4;
+   unsigned long host_fs_base;
+   unsigned long host_gs_base;
+   unsigned long host_tr_base;
+   unsigned long host_gdtr_base;
+   unsigned long host_idtr_base;
+   unsigned long host_ia32_sysenter_esp;
+   unsigned long host_ia32_sysenter_eip;
+   unsigned long host_rsp;

[PATCH 13/24] Implement VMREAD and VMWRITE

2010-06-13 Thread Nadav Har'El
Implement the VMREAD and VMWRITE instructions. With these instructions, L1
can read and write to the VMCS it is holding. The values are read or written
to the fields of the shadow_vmcs structure introduced in the previous patch.
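One detail worth calling out is how the emulation classifies a field's width
from its VMCS encoding, since that decides how many bytes VMREAD/VMWRITE
move. A minimal standalone sketch of that classification (field encodings
taken from the Intel SDM; this is illustrative, not the kernel code):

#include <stdio.h>

enum { TYPE_U16, TYPE_U64, TYPE_U32, TYPE_ULONG };

/* Bits 14:13 of a VMCS field encoding give its width; odd encodings are
 * the 32-bit *_HIGH halves of 64-bit fields. */
static int field_type(unsigned long field)
{
    if (field & 0x1)
        return TYPE_U32;
    return (field >> 13) & 0x3;
}

int main(void)
{
    printf("0x6800 -> %d (GUEST_CR0, expect %d = natural width)\n",
           field_type(0x6800), TYPE_ULONG);
    printf("0x2001 -> %d (IO_BITMAP_A_HIGH, expect %d = u32)\n",
           field_type(0x2001), TYPE_U32);
    return 0;
}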

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:29.0 +0300
@@ -299,6 +299,42 @@ struct nested_vmx {
int l2_vmcs_num;
 };
 
+enum vmcs_field_type {
+   VMCS_FIELD_TYPE_U16 = 0,
+   VMCS_FIELD_TYPE_U64 = 1,
+   VMCS_FIELD_TYPE_U32 = 2,
+   VMCS_FIELD_TYPE_ULONG = 3
+};
+
+#define VMCS_FIELD_LENGTH_OFFSET 13
+#define VMCS_FIELD_LENGTH_MASK 0x6000
+
+static inline int vmcs_field_type(unsigned long field)
+{
+   if (0x1 & field)/* one of the *_HIGH fields, all are 32 bit */
+   return VMCS_FIELD_TYPE_U32;
+   return (VMCS_FIELD_LENGTH_MASK & field) >> 13;
+}
+
+static inline int vmcs_field_size(int field_type, struct kvm_vcpu *vcpu)
+{
+   switch (field_type) {
+   case VMCS_FIELD_TYPE_U16:
+   return 2;
+   case VMCS_FIELD_TYPE_U32:
+   return 4;
+   case VMCS_FIELD_TYPE_U64:
+   return 8;
+   case VMCS_FIELD_TYPE_ULONG:
+#ifdef CONFIG_X86_64
+   if (is_long_mode(vcpu))
+   return 8;
+#endif
+   return 4;
+   }
+   return 0; /* should never happen */
+}
+
 struct vcpu_vmx {
struct kvm_vcpu   vcpu;
struct list_head  local_vcpus_link;
@@ -4184,6 +4220,189 @@ static int handle_vmclear(struct kvm_vcp
return 1;
 }
 
+static inline bool nested_vmcs_read_any(struct kvm_vcpu *vcpu,
+   unsigned long field, u64 *ret)
+{
+   short offset = vmcs_field_to_offset(field);
+   char *p;
+
+   if (offset < 0)
+   return 0;
+   if (!to_vmx(vcpu)->nested.current_l2_page)
+   return 0;
+
+   p = ((char *)(get_shadow_vmcs(vcpu))) + offset;
+
+   switch (vmcs_field_type(field)) {
+   case VMCS_FIELD_TYPE_ULONG:
+   *ret = *((unsigned long *)p);
+   return 1;
+   case VMCS_FIELD_TYPE_U16:
+   *ret = (u16) *((unsigned long *)p);
+   return 1;
+   case VMCS_FIELD_TYPE_U32:
+   *ret = (u32) *((unsigned long *)p);
+   return 1;
+   case VMCS_FIELD_TYPE_U64:
+   *ret = *((u64 *)p);
+   return 1;
+   default:
+   return 0; /* can never happen. */
+   }
+}
+
+static int handle_vmread_reg(struct kvm_vcpu *vcpu, int reg,
+unsigned long field)
+{
+   u64 field_value;
+   if (!nested_vmcs_read_any(vcpu, field, &field_value))
+   return 0;
+
+#ifdef CONFIG_X86_64
+   switch (vmcs_field_type(field)) {
+   case VMCS_FIELD_TYPE_U64: case VMCS_FIELD_TYPE_ULONG:
+   if (!is_long_mode(vcpu)) {
+   kvm_register_write(vcpu, reg+1, field_value >> 32);
+   field_value = (u32)field_value;
+   }
+   }
+#endif
+   kvm_register_write(vcpu, reg, field_value);
+   return 1;
+}
+
+static int handle_vmread_mem(struct kvm_vcpu *vcpu, gva_t gva,
+unsigned long field)
+{
+   u64 field_value;
+   if (!nested_vmcs_read_any(vcpu, field, &field_value))
+   return 0;
+
+   /* It's ok to use *_system, because handle_vmread verifies cpl=0 */
+   kvm_write_guest_virt_system(gva, &field_value,
+vmcs_field_size(vmcs_field_type(field), vcpu),
+vcpu, NULL);
+   return 1;
+}
+
+static int handle_vmread(struct kvm_vcpu *vcpu)
+{
+   unsigned long field;
+   unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+   u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+   gva_t gva = 0;
+   int read_succeed;
+
+   if (!nested_vmx_check_permission(vcpu))
+   return 1;
+
+   if (!nested_map_current(vcpu)) {
+   printk(KERN_INFO "%s invalid shadow vmcs\n", __func__);
+   set_rflags_to_vmx_fail_invalid(vcpu);
+   return 1;
+   }
+
+   /* decode instruction info to get the field to read and where to store
+* its value */
+   field = kvm_register_read(vcpu, VMX_OPERAND_REG2(vmx_instruction_info));
+   if (VMX_OPERAND_IS_REG(vmx_instruction_info)) {
+   read_succeed = handle_vmread_reg(vcpu,
+   VMX_OPERAND_REG(vmx_instruction_info), field);
+   } else {
+   gva = get_vmx_mem_address(vcpu, exit_qualification,
+ vmx_instruction_info);
+   if (gva == 0)
+   return 1;
+   read_succeed = handle_vmread_mem(vcpu, gva, field);
+   }
+
+   if (read_succeed) {

[PATCH 11/24] Implement VMPTRST

2010-06-13 Thread Nadav Har'El
This patch implements the VMPTRST instruction. 

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/x86.c  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/x86.c   2010-06-13 15:01:29.0 +0300
@@ -3301,7 +3301,7 @@ static int kvm_read_guest_virt_system(gv
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
 }
 
-static int kvm_write_guest_virt_system(gva_t addr, void *val,
+int kvm_write_guest_virt_system(gva_t addr, void *val,
   unsigned int bytes,
   struct kvm_vcpu *vcpu,
   u32 *error)
@@ -,6 +,7 @@ static int kvm_write_guest_virt_system(g
 out:
return r;
 }
+EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 
 static int emulator_read_emulated(unsigned long addr,
  void *val,
--- .before/arch/x86/kvm/x86.h  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/x86.h   2010-06-13 15:01:29.0 +0300
@@ -78,6 +78,9 @@ void kvm_after_handle_nmi(struct kvm_vcp
 int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 *error);
 
+int kvm_write_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
+struct kvm_vcpu *vcpu, u32 *error);
+
 extern int nested;
 
 #endif
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:29.0 +0300
@@ -3940,6 +3940,33 @@ static int handle_vmptrld(struct kvm_vcp
return 1;
 }
 
+/* Emulate the VMPTRST instruction */
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+   int r = 0;
+   unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+   u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+   gva_t vmcs_gva;
+
+   if (!nested_vmx_check_permission(vcpu))
+   return 1;
+
+   vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
+  vmx_instruction_info);
+   if (vmcs_gva == 0)
+   return 1;
+   r = kvm_write_guest_virt_system(vmcs_gva,
+(void *)&to_vmx(vcpu)->nested.current_vmptr,
+sizeof(u64), vcpu, NULL);
+   if (r) {
+   printk(KERN_INFO "%s failed to write vmptr\n", __func__);
+   return 1;
+   }
+   clear_rflags_cf_zf(vcpu);
+   skip_emulated_instruction(vcpu);
+   return 1;
+}
+
 static int handle_invlpg(struct kvm_vcpu *vcpu)
 {
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4225,7 +4252,7 @@ static int (*kvm_vmx_exit_handlers[])(st
[EXIT_REASON_VMCLEAR] = handle_vmclear,
[EXIT_REASON_VMLAUNCH]= handle_vmx_insn,
[EXIT_REASON_VMPTRLD] = handle_vmptrld,
-   [EXIT_REASON_VMPTRST] = handle_vmx_insn,
+   [EXIT_REASON_VMPTRST] = handle_vmptrst,
[EXIT_REASON_VMREAD]  = handle_vmx_insn,
[EXIT_REASON_VMRESUME]= handle_vmx_insn,
[EXIT_REASON_VMWRITE] = handle_vmx_insn,
--


[PATCH 14/24] Prepare vmcs02 from vmcs01 and vmcs12

2010-06-13 Thread Nadav Har'El
This patch contains code to prepare the VMCS which can be used to actually
run the L2 guest, vmcs02. prepare_vmcs02 appropriately merges the information
in shadow_vmcs that L1 built for L2 (vmcs12), and that in the VMCS that we
built for L1 (vmcs01).

VMREAD/WRITE can only access one VMCS at a time (the current VMCS), which
makes it difficult for us to read from vmcs01 while writing to vmcs02. This
is why we first make a copy of vmcs01 in memory (l1_shadow_vmcs) and then
read that memory copy while writing to vmcs02.
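To make the merge rule concrete, here is a toy sketch of the idea (invented,
simplified types; only two representative fields; the real prepare_vmcs02()
in the patch handles far more state and writes into the hardware vmcs02):

#include <stdint.h>

/* Stand-in for the handful of fields merged below */
struct vmcs_fields {
    uint32_t exception_bitmap;
    uint64_t tsc_offset;
};

/* vmcs01_snapshot is the in-memory copy of vmcs01 (l1_shadow_vmcs in the
 * patch); vmcs12 is what L1 asked for; vmcs02 is what L2 actually runs
 * with. Here all three are modelled as plain memory. */
static void prepare_vmcs02_sketch(const struct vmcs_fields *vmcs01_snapshot,
                                  const struct vmcs_fields *vmcs12,
                                  struct vmcs_fields *vmcs02)
{
    /* trap an exception if either L0 or L1 wants it trapped */
    vmcs02->exception_bitmap =
        vmcs01_snapshot->exception_bitmap | vmcs12->exception_bitmap;
    /* L2's TSC offset combines L0's offset for L1 with L1's offset for L2 */
    vmcs02->tsc_offset =
        vmcs01_snapshot->tsc_offset + vmcs12->tsc_offset;
}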

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:29.0 +0300
@@ -849,6 +849,36 @@ static inline bool report_flexpriority(v
return flexpriority_enabled;
 }
 
+static inline bool nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu *vcpu)
+{
+   return cpu_has_vmx_tpr_shadow() &&
+   get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
+   CPU_BASED_TPR_SHADOW;
+}
+
+static inline bool nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu)
+{
+   return cpu_has_secondary_exec_ctrls() &&
+   get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
+   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
+  *vcpu)
+{
+   return nested_cpu_has_secondary_exec_ctrls(vcpu) &&
+   (get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
+   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+}
+
+static inline bool nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
+{
+   return nested_cpu_has_secondary_exec_ctrls(vcpu) &&
+   (get_shadow_vmcs(vcpu)->secondary_vm_exec_control &
+   SECONDARY_EXEC_ENABLE_EPT);
+}
+
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
int i;
@@ -1292,6 +1322,39 @@ static void vmx_load_host_state(struct v
preempt_enable();
 }
 
+int load_vmcs_host_state(struct shadow_vmcs *src)
+{
+   vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector);
+   vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector);
+   vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector);
+   vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector);
+   vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector);
+   vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector);
+   vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector);
+
+   vmcs_write64(TSC_OFFSET, src->tsc_offset);
+
+   if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT)
+   vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat);
+
+   vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs);
+
+   vmcs_writel(HOST_CR0, src->host_cr0);
+   vmcs_writel(HOST_CR3, src->host_cr3);
+   vmcs_writel(HOST_CR4, src->host_cr4);
+   vmcs_writel(HOST_FS_BASE, src->host_fs_base);
+   vmcs_writel(HOST_GS_BASE, src->host_gs_base);
+   vmcs_writel(HOST_TR_BASE, src->host_tr_base);
+   vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base);
+   vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base);
+   vmcs_writel(HOST_RSP, src->host_rsp);
+   vmcs_writel(HOST_RIP, src->host_rip);
+   vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp);
+   vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip);
+
+   return 0;
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -1922,6 +1985,71 @@ static void vmclear_local_vcpus(void)
__vcpu_clear(vmx);
 }
 
+int load_vmcs_common(struct shadow_vmcs *src)
+{
+   vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
+   vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
+   vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
+   vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
+   vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
+   vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
+   vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
+   vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
+
+   vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
+
+   if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+   vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
+
+   vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
+   vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+src->vm_entry_exception_error_code);
+   vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len);
+
+   vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
+   vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
+   vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
+   vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
+   vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
+   

[PATCH 15/24] Move register-syncing to a function

2010-06-13 Thread Nadav Har'El
Move the code that syncs dirty RSP and RIP registers back to the VMCS into a
function. We will need to call this function from additional places in the
next patch.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:29.0 +0300
@@ -5114,6 +5114,15 @@ static void fixup_rmode_irq(struct vcpu_
| vmx-rmode.irq.vector;
 }
 
+static inline void sync_cached_regs_to_vmcs(struct kvm_vcpu *vcpu)
+{
+   if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+   vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+   if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+   vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+   vcpu->arch.regs_dirty = 0;
+}
+
 #ifdef CONFIG_X86_64
 #define R r
 #define Q q
@@ -5135,10 +5144,7 @@ static void vmx_vcpu_run(struct kvm_vcpu
if (vmx->emulation_required && emulate_invalid_guest_state)
return;
 
-   if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
-   vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
-   if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
-   vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+   sync_cached_regs_to_vmcs(vcpu);
 
/* When single-stepping over STI and MOV SS, we must clear the
 * corresponding interruptibility bits in the guest state. Otherwise
@@ -5246,7 +5252,6 @@ static void vmx_vcpu_run(struct kvm_vcpu
 
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
  | (1 << VCPU_EXREG_PDPTR));
-   vcpu->arch.regs_dirty = 0;
 
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
--


[PATCH 16/24] Implement VMLAUNCH and VMRESUME

2010-06-13 Thread Nadav Har'El
Implement the VMLAUNCH and VMRESUME instructions, allowing a guest
hypervisor to run its own guests.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:29.0 +0300
@@ -272,6 +272,9 @@ struct __attribute__ ((__packed__)) vmcs
struct shadow_vmcs shadow_vmcs;
 
bool launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
+
+   int cpu;
+   int launched;
 };
 
 struct vmcs_list {
@@ -297,6 +300,24 @@ struct nested_vmx {
/* list of real (hardware) VMCS, one for each L2 guest of L1 */
struct list_head l2_vmcs_list; /* a vmcs_list */
int l2_vmcs_num;
+
+   /* Are we running a nested guest now */
+   bool nested_mode;
+   /* Level 1 state for switching to level 2 and back */
+   struct  {
+   u64 efer;
+   unsigned long cr3;
+   unsigned long cr4;
+   u64 io_bitmap_a;
+   u64 io_bitmap_b;
+   u64 msr_bitmap;
+   int cpu;
+   int launched;
+   } l1_state;
+   /* Level 1 shadow vmcs for switching to level 2 and back */
+   struct shadow_vmcs *l1_shadow_vmcs;
+   /* Level 1 vmcs loaded into the processor */
+   struct vmcs *l1_vmcs;
 };
 
 enum vmcs_field_type {
@@ -1407,6 +1428,19 @@ static void vmx_vcpu_load(struct kvm_vcp
new_offset = vmcs_read64(TSC_OFFSET) + delta;
vmcs_write64(TSC_OFFSET, new_offset);
}
+
+   if (vmx->nested.l1_shadow_vmcs != NULL) {
+   struct shadow_vmcs *l1svmcs =
+   vmx->nested.l1_shadow_vmcs;
+   l1svmcs->host_tr_base = vmcs_readl(HOST_TR_BASE);
+   l1svmcs->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
+   l1svmcs->host_ia32_sysenter_esp =
+   vmcs_readl(HOST_IA32_SYSENTER_ESP);
+   if (tsc_this < vcpu->arch.host_tsc)
+   l1svmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
+   if (vmx->nested.nested_mode)
+   load_vmcs_host_state(l1svmcs);
+   }
}
 }
 
@@ -2301,6 +2335,9 @@ static void free_l1_state(struct kvm_vcp
kfree(list_item);
}
vmx->nested.l2_vmcs_num = 0;
+
+   kfree(vmx->nested.l1_shadow_vmcs);
+   vmx->nested.l1_shadow_vmcs = NULL;
 }
 
 static void free_kvm_area(void)
@@ -4158,6 +4195,13 @@ static int handle_vmon(struct kvm_vcpu *
INIT_LIST_HEAD(&(vmx->nested.l2_vmcs_list));
vmx->nested.l2_vmcs_num = 0;

+   vmx->nested.l1_shadow_vmcs = kzalloc(PAGE_SIZE, GFP_KERNEL);
+   if (!vmx->nested.l1_shadow_vmcs) {
+   printk(KERN_INFO
+   "couldn't allocate memory for l1_shadow_vmcs\n");
+   return -ENOMEM;
+   }
+
vmx->nested.vmxon = 1;
 
skip_emulated_instruction(vcpu);
@@ -4348,6 +4392,42 @@ static int handle_vmclear(struct kvm_vcp
return 1;
 }
 
+static int nested_vmx_run(struct kvm_vcpu *vcpu);
+
+static int handle_launch_or_resume(struct kvm_vcpu *vcpu, bool launch)
+{
+   if (!nested_vmx_check_permission(vcpu))
+   return 1;
+
+   if (!nested_map_current(vcpu))
+   return 1;
+   if (to_vmx(vcpu)->nested.current_l2_page->launch_state == launch) {
+   /* Must use VMLAUNCH for the first time, VMRESUME later */
+   set_rflags_to_vmx_fail_valid(vcpu);
+   nested_unmap_current(vcpu);
+   return 1;
+   }
+   nested_unmap_current(vcpu);
+
+   skip_emulated_instruction(vcpu);
+
+   nested_vmx_run(vcpu);
+   return 1;
+}
+
+/* Emulate the VMLAUNCH instruction */
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+   return handle_launch_or_resume(vcpu, true);
+}
+
+/* Emulate the VMRESUME instruction */
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+
+   return handle_launch_or_resume(vcpu, false);
+}
+
 static inline bool nested_vmcs_read_any(struct kvm_vcpu *vcpu,
unsigned long field, u64 *ret)
 {
@@ -4892,11 +4972,11 @@ static int (*kvm_vmx_exit_handlers[])(st
[EXIT_REASON_INVLPG]  = handle_invlpg,
[EXIT_REASON_VMCALL]  = handle_vmcall,
[EXIT_REASON_VMCLEAR] = handle_vmclear,
-   [EXIT_REASON_VMLAUNCH]= handle_vmx_insn,
+   [EXIT_REASON_VMLAUNCH]= handle_vmlaunch,
[EXIT_REASON_VMPTRLD] = handle_vmptrld,
[EXIT_REASON_VMPTRST] = handle_vmptrst,
[EXIT_REASON_VMREAD]  = handle_vmread,
-   [EXIT_REASON_VMRESUME]= handle_vmx_insn,
+   [EXIT_REASON_VMRESUME]= handle_vmresume,

[PATCH 17/24] No need for handle_vmx_insn function any more

2010-06-13 Thread Nadav Har'El
Before nested VMX support, the exit handler for a guest executing a VMX
instruction (vmclear, vmlaunch, vmptrld, vmptrst, vmread, vmresume,
vmwrite, vmon, vmoff), was handle_vmx_insn(). This handler simply threw a #UD
exception. Now that all these exit reasons are properly handled (and emulate
the relevant VMX instruction), nothing calls this dummy handler and it can
be removed.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:29.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:30.0 +0300
@@ -4147,12 +4147,6 @@ static int handle_vmcall(struct kvm_vcpu
return 1;
 }
 
-static int handle_vmx_insn(struct kvm_vcpu *vcpu)
-{
-   kvm_queue_exception(vcpu, UD_VECTOR);
-   return 1;
-}
-
 /* Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
  * inspect the argument to VMXON (the so-called VMXON pointer) because we
--


[PATCH 18/24] Exiting from L2 to L1

2010-06-13 Thread Nadav Har'El
This patch implements nested_vmx_vmexit(), called when the nested L2 guest
exits and we want to run its L1 parent and let it handle this exit.

Note that this will not necessarily be called on every L2 exit. L0 may decide
to handle a particular exit on its own, without L1's involvement; In that
case, L0 will handle the exit, and resume running L2, without running L1 and
without calling nested_vmx_vmexit(). The logic for deciding whether to handle
a particular exit in L1 or in L0, i.e., whether to call nested_vmx_vmexit(),
will appear in the next patch.
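In outline, the exit path publishes the L2 exit information into vmcs12 so
that L1's subsequent VMREADs see it, then restores L1's own context. A toy
sketch of that first step (invented names and simplified types, not the
patch's code; the real prepare_vmcs_12() below reads these values out of the
hardware VMCS, and switching back to vmcs01/L1 state is not modelled here):

#include <stdint.h>
#include <stdbool.h>

struct l2_exit_state { uint32_t reason; uint32_t intr_info; uint64_t rip; };
struct vmcs12_view { uint32_t vm_exit_reason; uint32_t vm_exit_intr_info;
                     uint64_t guest_rip; };

/* Publish the L2 exit into vmcs12 and leave nested mode, so the caller can
 * reload vmcs01 and L1's saved host/guest state before resuming L1. */
static void nested_vmexit_outline(const struct l2_exit_state *exit,
                                  struct vmcs12_view *vmcs12,
                                  bool *nested_mode)
{
    vmcs12->vm_exit_reason    = exit->reason;
    vmcs12->vm_exit_intr_info = exit->intr_info;
    vmcs12->guest_rip         = exit->rip;   /* where L2 stopped */
    *nested_mode = false;                    /* we are about to run L1 */
}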

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:30.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:30.0 +0300
@@ -5080,9 +5080,13 @@ static void vmx_complete_interrupts(stru
int type;
bool idtv_info_valid;
 
+   vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+
+   if (vmx->nested.nested_mode)
+   return;
+
exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);

-   vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);

/* Handle machine checks before interrupts are enabled */
if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
@@ -5978,6 +5982,278 @@ static int nested_vmx_run(struct kvm_vcp
return 1;
 }
 
+/* prepare_vmcs_12 is called when the nested L2 guest exits and we want to
+ * prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), and this
+ * function updates it to reflect the state of the registers during the exit,
+ * and to reflect some changes that happened while L2 was running (and perhaps
+ * made some exits which were handled directly by L0 without going back to L1).
+ */
+void prepare_vmcs_12(struct kvm_vcpu *vcpu)
+{
+   struct shadow_vmcs *vmcs12 = get_shadow_vmcs(vcpu);
+
+   vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+   vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+   vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+   vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+   vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+   vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+   vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+   vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+
+   vmcs12->tsc_offset = vmcs_read64(TSC_OFFSET);
+   vmcs12->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+   vmcs12->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+   vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+   if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+   vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+   vmcs12->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+   vmcs12->vm_entry_intr_info_field =
+   vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+   vmcs12->vm_entry_exception_error_code =
+   vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+   vmcs12->vm_entry_instruction_len =
+   vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+   vmcs12->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+   vmcs12->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+   vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+   vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+   vmcs12->idt_vectoring_info_field =
+   vmcs_read32(IDT_VECTORING_INFO_FIELD);
+   vmcs12->idt_vectoring_error_code =
+   vmcs_read32(IDT_VECTORING_ERROR_CODE);
+   vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+   vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+   vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+   vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+   vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+   vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+   vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+   vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+   vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+   vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+   vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+   vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+   vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+   vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+   vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+   vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+   vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+   vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+   vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+   vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+   vmcs12->guest_interruptibility_info =
+   

[PATCH 19/24] Deciding if L0 or L1 should handle an L2 exit

2010-06-13 Thread Nadav Har'El
This patch contains the logic of whether an L2 exit should be handled by L0
and then L2 should be resumed, or whether L1 should be run to handle this
exit (using the nested_vmx_vmexit() function of the previous patch).

The basic idea is to let L1 handle the exit only if it actually asked to
trap this sort of event. For example, when L2 exits on a change to CR0,
we check L1's CR0_GUEST_HOST_MASK to see if L1 expressed interest in any
bit which changed; If it did, we exit to L1. But if it didn't it means that
it is we (L0) that wished to trap this event, so we handle it ourselves.

The next two patches add additional logic of what to do when an interrupt or
exception is injected: Does L0 need to do it, should we exit to L1 to do it,
or should we resume L2 and keep the exception to be injected later.

We keep a new flag, nested_run_pending, which can override the decision of
which should run next, L1 or L2. nested_run_pending=1 means that we *must* run
L2 next, not L1. This is necessary in several situations where, had L1 run on
bare metal, it would not have expected to be resumed at this stage. One
example is when L1 did a VMLAUNCH of L2 and therefore expects L2 to be run.
Another example is when L2 exits on an #NM exception that L0 asked for
(because of lazy FPU loading), and L0 must deal with the exception and resume
L2, which was in the middle of an instruction, and not resume L1, which does not
expect to see an exit from L2 at this point. nested_run_pending is especially
intended to avoid switching to L1 in the injection decision-point described
above.
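A toy illustration of the decision described above (not the kernel code; the
real nested_vmx_exit_handled() inspects vmcs12's controls, bitmaps and masks,
and struct l1_config and both helpers here are invented for the sketch):

#include <stdbool.h>
#include <stdint.h>

struct l1_config {
    uint32_t exception_bitmap;     /* which exceptions L1 wants to see */
    uint64_t cr0_guest_host_mask;  /* cr0 bits L1 wants to intercept   */
};

/* Exit on an exception: reflect to L1 only if L1 asked for that vector */
static bool l1_wants_exception(const struct l1_config *c, int vector)
{
    return c->exception_bitmap & (1u << vector);
}

/* Exit on a cr0 write: reflect to L1 only if a bit L1 guards has changed */
static bool l1_wants_cr0_write(const struct l1_config *c,
                               uint64_t old_cr0, uint64_t new_cr0)
{
    return (old_cr0 ^ new_cr0) & c->cr0_guest_host_mask;
}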

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:30.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:30.0 +0300
@@ -318,6 +318,8 @@ struct nested_vmx {
struct shadow_vmcs *l1_shadow_vmcs;
/* Level 1 vmcs loaded into the processor */
struct vmcs *l1_vmcs;
+   /* L2 must run next, and mustn't decide to exit to L1. */
+   bool nested_run_pending;
 };
 
 enum vmcs_field_type {
@@ -900,6 +902,24 @@ static inline bool nested_cpu_has_vmx_ep
 }
 
 
+static inline bool nested_cpu_has_vmx_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+   return get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
+   CPU_BASED_USE_MSR_BITMAPS;
+}
+
+static inline bool is_exception(u32 intr_info)
+{
+   return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+   == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_nmi(u32 intr_info)
+{
+   return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+   == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
+}
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
int i;
@@ -3694,6 +3714,8 @@ static void vmx_set_nmi_mask(struct kvm_
}
 }
 
+static int nested_vmx_vmexit(struct kvm_vcpu *vcpu, bool is_interrupt);
+
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -3819,6 +3841,8 @@ static int handle_exception(struct kvm_v
 
if (is_no_device(intr_info)) {
vmx_fpu_activate(vcpu);
+   if (vmx->nested.nested_mode)
+   vmx->nested.nested_run_pending = 1;
return 1;
}
 
@@ -4989,6 +5013,202 @@ static int (*kvm_vmx_exit_handlers[])(st
 static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
 
+/* Return 1 if we should exit from L2 to L1 to handle an MSR access exit,
+ * rather than handle it ourselves in L0. I.e., check L1's MSR bitmap whether
+ * it expressed interest in the current event (read or write a specific MSR).
+ */
+static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
+   struct shadow_vmcs *l2svmcs, u32 exit_code)
+{
+   u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+   struct page *msr_bitmap_page;
+   void *va;
+   bool ret;
+
+   if (!cpu_has_vmx_msr_bitmap() || !nested_cpu_has_vmx_msr_bitmap(vcpu))
+   return 1;
+
+   msr_bitmap_page = nested_get_page(vcpu, l2svmcs->msr_bitmap);
+   if (!msr_bitmap_page) {
+   printk(KERN_INFO "%s error in nested_get_page\n", __func__);
+   return 0;
+   }
+
+   va = kmap_atomic(msr_bitmap_page, KM_USER1);
+   if (exit_code == EXIT_REASON_MSR_WRITE)
+   va += 0x800;
+   if (msr_index >= 0xc0000000) {
+   msr_index -= 0xc0000000;
+   va += 0x400;
+   }
+   if (msr_index > 0x1fff)
+   return 0;
+   ret = test_bit(msr_index, va);
+   kunmap_atomic(va, KM_USER1);
+   return ret;
+}
+
+/* Return 1 if we should exit from L2 to L1 to handle a CR access exit,
+ * rather than handle it ourselves in L0. I.e., check if L1 wanted to
+ * intercept (via guest_host_mask etc.) the current event.
+ */
+static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
+   

[PATCH 20/24] Correct handling of interrupt injection

2010-06-13 Thread Nadav Har'El
When KVM wants to inject an interrupt, the guest should think a real interrupt
has happened. Normally (in the non-nested case) this means checking that the
guest doesn't block interrupts (and if it does, inject when it doesn't - using
the interrupt window VMX mechanism), and setting up the appropriate VMCS
fields for the guest to receive the interrupt.

However, when we are running a nested guest (L2) and its hypervisor (L1)
requested exits on interrupts (as most hypervisors do), the most efficient
thing to do is to exit L2, telling L1 that the exit was caused by an
interrupt, the one we were injecting; only when L1 asked not to be notified
of interrupts should we inject it directly into the running guest L2 (i.e.,
the normal code path).

However, properly doing what is described above requires invasive changes to
the flow of the existing code, which we elected not to do in this stage.
Instead we do something more simplistic and less efficient: we modify
vmx_interrupt_allowed(), which kvm calls to see if it can inject the interrupt
now, to exit from L2 to L1 before continuing the normal code. The normal kvm
code then notices that L1 is blocking interrupts, and sets the interrupt
window to inject the interrupt later to L1. Shortly after, L1 gets the
interrupt while it is itself running, not as an exit from L2. The cost is an
extra L1 exit (the interrupt window).

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:30.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:30.0 +0300
@@ -3591,9 +3591,29 @@ out:
return ret;
 }
 
+/* In nested virtualization, check if L1 asked to exit on external interrupts.
+ * For most existing hypervisors, this will always return true.
+ */
+static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
+{
+   int ret;
+   if (!nested_map_current(vcpu))
+   return 0;
+   ret = get_shadow_vmcs(vcpu)->pin_based_vm_exec_control &
+   PIN_BASED_EXT_INTR_MASK;
+   nested_unmap_current(vcpu);
+   return ret;
+}
+
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
u32 cpu_based_vm_exec_control;
+   if (to_vmx(vcpu)->nested.nested_mode && nested_exit_on_intr(vcpu))
+   /* We can get here when nested_run_pending caused
+* vmx_interrupt_allowed() to return false. In this case, do
+* nothing - the interrupt will be injected later.
+*/
+   return;
 
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -3718,6 +3738,13 @@ static int nested_vmx_vmexit(struct kvm_
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
+   if (to_vmx(vcpu)->nested.nested_mode && nested_exit_on_intr(vcpu)) {
+   if (to_vmx(vcpu)->nested.nested_run_pending)
+   return 0;
+   nested_vmx_vmexit(vcpu, true);
+   /* fall through to normal code, but now in L1, not L2 */
+   }
+
return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
--


[PATCH 21/24] Correct handling of exception injection

2010-06-13 Thread Nadav Har'El
Similar to the previous patch, but concerning injection of exceptions rather
than external interrupts.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:30.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:30.0 +0300
@@ -1564,6 +1564,9 @@ static void skip_emulated_instruction(st
vmx_set_interrupt_shadow(vcpu, 0);
 }
 
+static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr,
+   bool has_error_code, u32 error_code);
+
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject)
@@ -1571,6 +1574,9 @@ static void vmx_queue_exception(struct k
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
+   if (nested_vmx_check_exception(vcpu, nr, has_error_code, error_code))
+   return;
+
if (has_error_code) {
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -3670,6 +3676,9 @@ static void vmx_inject_nmi(struct kvm_vc
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+   if (vmx->nested.nested_mode)
+   return;
+
if (!cpu_has_virtual_nmis()) {
/*
 * Tracking the NMI-blocked state in software is built upon
@@ -6513,6 +6522,26 @@ static int nested_vmx_vmexit(struct kvm_
return 0;
 }
 
+static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr,
+ bool has_error_code, u32 error_code)
+{
+   if (!to_vmx(vcpu)->nested.nested_mode)
+   return 0;
+   if (!nested_vmx_exit_handled(vcpu, false))
+   return 0;
+   nested_vmx_vmexit(vcpu, false);
+   if (!nested_map_current(vcpu))
+   return 1;
+   get_shadow_vmcs(vcpu)->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI;
+   get_shadow_vmcs(vcpu)->vm_exit_intr_info = (nr
+   | INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK
+   | (has_error_code ?  INTR_INFO_DELIVER_CODE_MASK : 0));
+   if (has_error_code)
+   get_shadow_vmcs(vcpu)->vm_exit_intr_error_code = error_code;
+   nested_unmap_current(vcpu);
+   return 1;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
--


[PATCH 22/24] Correct handling of idt vectoring info

2010-06-13 Thread Nadav Har'El
This patch adds correct handling of IDT_VECTORING_INFO_FIELD for the nested
case.

When a guest exits while handling an interrupt or exception, we get this
information in IDT_VECTORING_INFO_FIELD in the VMCS. When L2 exits to L1,
there's nothing we need to do, because L1 will see this field in vmcs12, and
handle it itself. However, when L2 exits and L0 handles the exit itself and
plans to return to L2, L0 must inject this event to L2.

In the normal non-nested case, idt_vectoring_info is handled after
the exit. However, in the nested case a decision of whether to return to L2
or L1 also happens during the injection phase (see the previous patches), so
in the nested case we have to treat the idt_vectoring_info right after the
injection, i.e., in the beginning of vmx_vcpu_run, which is the first time
we know for sure if we're staying in L2 (i.e., nested_mode is true).

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:30.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:30.0 +0300
@@ -320,6 +320,10 @@ struct nested_vmx {
struct vmcs *l1_vmcs;
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
+   /* true if last exit was of L2, and had a valid idt_vectoring_info */
+   bool valid_idt_vectoring_info;
+   /* These are saved if valid_idt_vectoring_info */
+   u32 vm_exit_instruction_len, idt_vectoring_error_code;
 };
 
 enum vmcs_field_type {
@@ -5460,6 +5464,22 @@ static void fixup_rmode_irq(struct vcpu_
| vmx-rmode.irq.vector;
 }
 
+static void nested_handle_valid_idt_vectoring_info(struct vcpu_vmx *vmx)
+{
+   int irq  = vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+   int type = vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+   int errCodeValid = vmx->idt_vectoring_info &
+   VECTORING_INFO_DELIVER_CODE_MASK;
+   vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+   irq | type | INTR_INFO_VALID_MASK | errCodeValid);
+
+   vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+   vmx->nested.vm_exit_instruction_len);
+   if (errCodeValid)
+   vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+   vmx->nested.idt_vectoring_error_code);
+}
+
 static inline void sync_cached_regs_to_vmcs(struct kvm_vcpu *vcpu)
 {
if (test_bit(VCPU_REGS_RSP, (unsigned long *)vcpu-arch.regs_dirty))
@@ -5481,6 +5501,9 @@ static void vmx_vcpu_run(struct kvm_vcpu
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+   if (vmx->nested.nested_mode && vmx->nested.valid_idt_vectoring_info)
+   nested_handle_valid_idt_vectoring_info(vmx);
+
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
vmx->entry_time = ktime_get();
@@ -5600,6 +5623,16 @@ static void vmx_vcpu_run(struct kvm_vcpu
| (1 << VCPU_EXREG_PDPTR));
 
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+   vmx->nested.valid_idt_vectoring_info = vmx->nested.nested_mode &&
+   (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+   if (vmx->nested.valid_idt_vectoring_info) {
+   vmx->nested.vm_exit_instruction_len =
+   vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+   vmx->nested.idt_vectoring_error_code =
+   vmcs_read32(IDT_VECTORING_ERROR_CODE);
+   }
+
if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx);
 
--


[PATCH 23/24] Handling of CR0.TS and #NM for Lazy FPU loading

2010-06-13 Thread Nadav Har'El
KVM's Lazy FPU loading means that sometimes L0 needs to set CR0.TS, even
if a guest didn't set it. Moreover, L0 must also trap CR0.TS changes and
NM exceptions, even if we have a guest hypervisor (L1) who didn't want these
traps. And of course, conversely: If L1 wanted to trap these events, we
must let it, even if L0 is not interested in them.

This patch fixes some existing KVM code (in update_exception_bitmap(),
vmx_fpu_activate(), vmx_fpu_deactivate(), handle_cr()) to do the correct
merging of L0's and L1's needs. Note that new code introduced in previous
patches already handles CR0 correctly (see prepare_vmcs_02(),
prepare_vmcs_12(), and nested_vmx_vmexit()).
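A minimal sketch of the merging rule described above (illustrative only, not
the patch code; both helper names are invented): exceptions are trapped if
either level cares, and a cr0 bit is left to the guest only if neither L0 nor
L1 wants to intercept it.

#include <stdint.h>

static uint32_t merged_exception_bitmap(uint32_t l0_eb, uint32_t l1_eb)
{
    return l0_eb | l1_eb;            /* trap it if either level cares */
}

static uint64_t merged_cr0_guest_owned(uint64_t l0_owned, uint64_t l1_mask)
{
    /* l1_mask plays the role of vmcs12->cr0_guest_host_mask:
     * bits L1 wants to own are removed from what the guest may change */
    return l0_owned & ~l1_mask;
}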

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:30.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:30.0 +0300
@@ -1144,6 +1144,27 @@ static void update_exception_bitmap(stru
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
if (vcpu->fpu_active)
eb &= ~(1u << NM_VECTOR);
+
+   /* When we are running a nested L2 guest and L1 specified for it a
+* certain exception bitmap, we must trap the same exceptions and pass
+* them to L1. When running L2, we will only handle the exceptions
+* specified above if L1 did not want them.
+*/
+   if (to_vmx(vcpu)->nested.nested_mode) {
+   u32 nested_eb;
+   if (to_vmx(vcpu)->nested.current_l2_page)
+   nested_eb = get_shadow_vmcs(vcpu)->exception_bitmap;
+   else {
+   if (!nested_map_current(vcpu)) {
+   to_vmx(vcpu)->fail = 1;
+   return;
+   }
+   nested_eb = get_shadow_vmcs(vcpu)->exception_bitmap;
+   nested_unmap_current(vcpu);
+   }
+   eb |= nested_eb;
+   }
+
vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
@@ -1488,8 +1509,25 @@ static void vmx_fpu_activate(struct kvm_
cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
vmcs_writel(GUEST_CR0, cr0);
-   update_exception_bitmap(vcpu);
vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+   if (to_vmx(vcpu)->nested.nested_mode) {
+   /* While we (L0) no longer care about NM exceptions or cr0.TS
+* changes, our guest hypervisor (L1) might care in which case
+* we must trap them for it.
+*/
+   u32 eb = vmcs_read32(EXCEPTION_BITMAP) & ~(1u << NM_VECTOR);
+   struct shadow_vmcs *vmcs12;
+   if (!nested_map_current(vcpu)) {
+   to_vmx(vcpu)->fail = 1;
+   return;
+   }
+   vmcs12 = get_shadow_vmcs(vcpu);
+   eb |= vmcs12->exception_bitmap;
+   vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
+   nested_unmap_current(vcpu);
+   vmcs_write32(EXCEPTION_BITMAP, eb);
+   } else
+   update_exception_bitmap(vcpu);
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 }
 
@@ -1497,12 +1535,24 @@ static void vmx_decache_cr0_guest_bits(s
 
 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
+   /* Note that there is no vcpu->fpu_active = 0 here. The caller must
+* set this *before* calling this function.
+*/
vmx_decache_cr0_guest_bits(vcpu);
vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
-   update_exception_bitmap(vcpu);
+   vmcs_write32(EXCEPTION_BITMAP,
+   vmcs_read32(EXCEPTION_BITMAP) | (1u  NM_VECTOR));
vcpu->arch.cr0_guest_owned_bits = 0;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-   vmcs_writel(CR0_READ_SHADOW, vcpu-arch.cr0);
+   if (to_vmx(vcpu)->nested.nested_mode)
+   /* Unfortunately in nested mode we play with arch.cr0's PG
+* bit, so we mustn't copy it all, just the relevant TS bit
+*/
+   vmcs_writel(CR0_READ_SHADOW,
+   (vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS) |
+   (vcpu->arch.cr0 & X86_CR0_TS));
+   else
+   vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
 }
 
 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -3998,6 +4048,53 @@ vmx_patch_hypercall(struct kvm_vcpu *vcp
hypercall[2] = 0xc1;
 }
 
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
+static void handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
+{
+   if (to_vmx(vcpu)->nested.nested_mode) {
+   /* When running L2, we usually do what L1 wants: it decides
+* which cr0 bits to intercept, we forward it cr0-change events
+* (see nested_vmx_exit_handled()). We only get here when a cr0
+* bit was 

[PATCH 24/24] Miscellaneous small corrections

2010-06-13 Thread Nadav Har'El
Small corrections of KVM (spelling, etc.) not directly related to nested VMX.

Signed-off-by: Nadav Har'El n...@il.ibm.com
---
--- .before/arch/x86/kvm/vmx.c  2010-06-13 15:01:30.0 +0300
+++ .after/arch/x86/kvm/vmx.c   2010-06-13 15:01:30.0 +0300
@@ -992,7 +992,7 @@ static void vmcs_load(struct vmcs *vmcs)
: "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
if (error)
-   printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+   printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
   vmcs, phys_addr);
 }
 


Re: [PATCH] These are my first patches to the kernel

2010-06-13 Thread K.de Jong
2010/6/13 Avi Kivity a...@redhat.com:
 On 06/12/2010 03:17 PM, K.de Jong wrote:

  From 6b9ac8708d856a425a9aaa598acdea3d89485bde Mon Sep 17 00:00:00 2001
 From: UndiFineDk.dej...@undifined.nl
 Date: Sat, 12 Jun 2010 00:24:28 +0200
 Subject: [PATCH] These are my first patches to the kernel
 Janitor patches to kvm
 Linus2.6: virt/kvm/



 Most of the patch is just whitespace changes, which don't improve the code
 in any way.  I much prefer patches that fix bugs or add features.

Of course everyone prefers bug fixes and new features :-)
I simply watched Greg's "submit a patch" video
and thought: hey, I could do this and help clean up code.
Cleaner code makes it more accessible,
and I get to know where everything is located.
So I am going to continue making patches like these
and become more familiar with the whole kernel.

-- 
Keimpe de Jong


Re: [PATCH] These are my first patches to the kernel

2010-06-13 Thread Wolfram Sang
 So I am going to continue making patches like these
 and become more familiar with the whole kernel.

Better to concentrate on the drivers/staging directory for that. Such patches
are very welcome there, which may not be so true for the rest of the kernel.
You can probably find a couple of mail threads stating the pros and cons.

Regards,

   Wolfram

-- 
Pengutronix e.K.   | Wolfram Sang|
Industrial Linux Solutions | http://www.pengutronix.de/  |




[KVM-AUTOTEST PATCH 01/14] KVM test: tests_base.cfg.sample: remove inline comments

2010-06-13 Thread Michael Goldish
Inline comments are not supported (yet?) and break the parsing of
tests_base.cfg.sample.

Signed-off-by: Michael Goldish mgold...@redhat.com
---
 client/tests/kvm/tests_base.cfg.sample |   12 
 1 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/client/tests/kvm/tests_base.cfg.sample 
b/client/tests/kvm/tests_base.cfg.sample
index ab8922b..84a903b 100644
--- a/client/tests/kvm/tests_base.cfg.sample
+++ b/client/tests/kvm/tests_base.cfg.sample
@@ -86,22 +86,26 @@ variants:
 initrd = initrd.img
 nic_mode = tap
 variants:
-- cdrom: # install guest from cdrom 
+# Install guest from cdrom 
+- cdrom:
 medium = cdrom
 nic_mode = user
 redirs +=  unattended_install
 kernel =
 initrd = 
-- url: # install guest from http/ftp url
+# Install guest from http/ftp url
+- url:
 medium = url
 extra_params +=  --append ks=floppy
 url = REPLACE_THIS_WITH_TREE_URL
-- nfs: # install guest from nfs nfs_server:nfs_dir
+# Install guest from nfs nfs_server:nfs_dir
+- nfs:
 medium = nfs
 extra_params +=  --append ks=floppy
 nfs_server = REPLACE_THIS_WITH_NFS_SERVER
 nfs_dir = REPLACE_THIS_WITH_NFS_DIRECTORY
-- remote_ks: # install guest with a remote kickstart
+# Install guest with a remote kickstart
+- remote_ks:
 medium = url
 extra_params +=  --append ks=REPLACE_THIS_WITH_URL_OF_KS
 url = REPLACE_THIS_WITH_TREE_URL
-- 
1.5.4.1



[KVM-AUTOTEST PATCH 02/14] KVM test: tests_base.cfg.sample: style modifications

2010-06-13 Thread Michael Goldish
Try to exclude tests as soon as possible.
(Also remove the broken linux_s3 exception at the same time.)

Signed-off-by: Michael Goldish mgold...@redhat.com
---
 client/tests/kvm/tests_base.cfg.sample |   22 ++
 1 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/client/tests/kvm/tests_base.cfg.sample 
b/client/tests/kvm/tests_base.cfg.sample
index 84a903b..b302557 100644
--- a/client/tests/kvm/tests_base.cfg.sample
+++ b/client/tests/kvm/tests_base.cfg.sample
@@ -982,7 +982,7 @@ variants:
 
 # Windows section
 - @Windows:
-no autotest linux_s3 vlan_tag
+no autotest linux_s3 vlan_tag ioquit unattended_install.(url|nfs|remote_ks)
 shutdown_command = shutdown /s /f /t 0
 reboot_command = shutdown /r /f /t 0
 status_test_command = echo %errorlevel%
@@ -1368,13 +1368,6 @@ variants:
 md5sum = 9fae22f2666369968a76ef59e9a81ced
 
 
-linux_s3
-only Linux
-
-unattended_install.url|unattended_install.nfs|unattended_install.remote_ks:
-only Linux
-
-
 variants:
 - @up:
 no autotest.npb autotest.tsc
@@ -1414,17 +1407,20 @@ virtio|virtio_blk|e1000|balloon_check:
 variants:
 - @qcow2:
 image_format = qcow2
-post_command =  python scripts/check_image.py;
-remove_image = no
+post_command +=  python scripts/check_image.py;
 post_command_timeout = 600
 post_command_noncritical = yes
+ioquit:
+post_command_noncritical = no
 - vmdk:
+no ioquit
 only Fedora Ubuntu Windows
 only smp2
 only rtl8139
 only acpi
 image_format = vmdk
 - raw:
+no ioquit
 only Fedora Ubuntu Windows
 only smp2
 only rtl8139
@@ -1439,12 +1435,6 @@ variants:
 extra_params +=  -mem-path /mnt/kvm_hugepage
 
 
-ioquit:
-post_command_noncritical = no
-only qcow2
-only Linux
-
-
 variants:
 - @no_pci_assignable:
 pci_assignable = no
-- 
1.5.4.1



[KVM-AUTOTEST PATCH 03/14] KVM test: kvm_utils.py: warn about exceptions raised during unpickling of env

2010-06-13 Thread Michael Goldish
Signed-off-by: Michael Goldish mgold...@redhat.com
---
 client/tests/kvm/kvm_utils.py |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/client/tests/kvm/kvm_utils.py b/client/tests/kvm/kvm_utils.py
index 0da7015..82ecb77 100644
--- a/client/tests/kvm/kvm_utils.py
+++ b/client/tests/kvm/kvm_utils.py
@@ -35,7 +35,8 @@ def load_env(filename, default={}):
 return obj
 # Almost any exception can be raised during unpickling, so let's catch
 # them all
-except:
+except Exception, e:
+logging.warn(e)
 return default
 
 
-- 
1.5.4.1



[KVM-AUTOTEST PATCH 07/14] KVM test: kvm_vm.py: correct add_smp() mistake in make_qemu_command()

2010-06-13 Thread Michael Goldish
add_smp() is defined but not used.

Signed-off-by: Michael Goldish mgold...@redhat.com
---
 client/tests/kvm/kvm_vm.py |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/client/tests/kvm/kvm_vm.py b/client/tests/kvm/kvm_vm.py
index f65d967..039fbff 100755
--- a/client/tests/kvm/kvm_vm.py
+++ b/client/tests/kvm/kvm_vm.py
@@ -350,7 +350,7 @@ class VM:
 
 smp = params.get("smp")
 if smp:
-qemu_cmd += " -smp %s" % smp
+qemu_cmd += add_smp(help, smp)
 
 iso = params.get("cdrom")
 if iso:
-- 
1.5.4.1



[KVM-AUTOTEST PATCH 04/14] KVM test: kvm_utils.py: remove unnecessary imports

2010-06-13 Thread Michael Goldish
Signed-off-by: Michael Goldish mgold...@redhat.com
---
 client/tests/kvm/kvm_utils.py |3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/client/tests/kvm/kvm_utils.py b/client/tests/kvm/kvm_utils.py
index 82ecb77..0ea5a8a 100644
--- a/client/tests/kvm/kvm_utils.py
+++ b/client/tests/kvm/kvm_utils.py
@@ -4,8 +4,7 @@ KVM test utility functions.
 @copyright: 2008-2009 Red Hat Inc.
 
 
-import thread, subprocess, time, string, random, socket, os, signal
-import select, re, logging, commands, cPickle, pty
+import time, string, random, socket, os, signal, re, logging, commands, cPickle
 from autotest_lib.client.bin import utils
 from autotest_lib.client.common_lib import error, logging_config
 import kvm_subprocess
-- 
1.5.4.1



[KVM-AUTOTEST PATCH 09/14] KVM test: remove reference to _screendump_thread at postprocessing

2010-06-13 Thread Michael Goldish
_screendump_thread contains a reference to 'env' which prevents VMs from being
garbage collected.  This makes a difference for multi-iteration tests where
several tests run consecutively in the same process.  Removing the reference
to _screendump_thread also removes a reference to VMs, thus allowing them to be
garbage collected.  This is mainly important for the new monitor classes.

Signed-off-by: Michael Goldish mgold...@redhat.com
---
 client/tests/kvm/kvm_preprocessing.py |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/client/tests/kvm/kvm_preprocessing.py 
b/client/tests/kvm/kvm_preprocessing.py
index 318bf3f..76c8268 100644
--- a/client/tests/kvm/kvm_preprocessing.py
+++ b/client/tests/kvm/kvm_preprocessing.py
@@ -285,6 +285,8 @@ def postprocess(test, params, env):
 logging.debug(Terminating screendump thread...)
 _screendump_thread_termination_event.set()
 _screendump_thread.join(10)
+_screendump_thread = None
+_screendump_thread_termination_event = None
 
 # Warn about corrupt PPM files
 for f in glob.glob(os.path.join(test.debugdir, *.ppm)):
-- 
1.5.4.1



[KVM-AUTOTEST PATCH 12/14] KVM test: use new monitor interface

2010-06-13 Thread Michael Goldish
- Add new monitor definition syntax that allows definition of multiple monitors.
  Monitors are now defined like other objects in the config file:

  monitors = MyMonitor SomeOtherMonitor YetAnotherMonitor   # defines 3 monitors
  monitor_type = human   # default for all monitors
  monitor_type_SomeOtherMonitor = qmp # applies only to SomeOtherMonitor
  monitor_type_YetAnotherMonitor = qmp   # applies only to YetAnotherMonitor
  main_monitor = MyMonitor# defines the main monitor to use
  # in the test

- Use the new syntax in tests_base.cfg.sample.

- Establish monitor connections using kvm_monitor in VM.create().
  Store all monitors in self.monitors.  Store main monitor in self.monitor.

- Replace calls to send_monitor_cmd() with appropriate calls to methods of
  self.monitor (the main monitor); a short sketch of the conversion follows
  this list.

- For now, ignore the parameter screendump_verbose because currently monitor
  commands are always silent (when successful).
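
A rough sketch of the call-site conversion (the helper below is made up for
illustration; the monitor method and the kvm_monitor exception class are the
ones this series introduces):

import logging
import kvm_monitor

def take_screendump(vm, filename):
    # Old style: vm.send_monitor_cmd("screendump %s" % filename)
    # New style: call a method on the persistent main monitor and treat
    # failures as exceptions.
    try:
        vm.monitor.screendump(filename)
    except kvm_monitor.MonitorError, e:
        logging.warn(e)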

Signed-off-by: Michael Goldish mgold...@redhat.com
---
 client/tests/kvm/kvm_preprocessing.py  |   33 ++--
 client/tests/kvm/kvm_test_utils.py |   35 +--
 client/tests/kvm/kvm_vm.py |  226 +---
 client/tests/kvm/tests/balloon_check.py|   12 +-
 client/tests/kvm/tests/boot_savevm.py  |   41 ++--
 client/tests/kvm/tests/ksm_overcommit.py   |8 +-
 client/tests/kvm/tests/pci_hotplug.py  |   13 +-
 client/tests/kvm/tests/physical_resources_check.py |   40 ++--
 client/tests/kvm/tests/shutdown.py |2 +-
 client/tests/kvm/tests/stepmaker.py|   44 ++--
 client/tests/kvm/tests/steps.py|   12 +-
 client/tests/kvm/tests_base.cfg.sample |7 +-
 12 files changed, 232 insertions(+), 241 deletions(-)

diff --git a/client/tests/kvm/kvm_preprocessing.py 
b/client/tests/kvm/kvm_preprocessing.py
index 76c8268..ee3e9b2 100644
--- a/client/tests/kvm/kvm_preprocessing.py
+++ b/client/tests/kvm/kvm_preprocessing.py
@@ -1,7 +1,7 @@
 import sys, os, time, commands, re, logging, signal, glob, threading, shutil
 from autotest_lib.client.bin import test, utils
 from autotest_lib.client.common_lib import error
-import kvm_vm, kvm_utils, kvm_subprocess, ppm_utils
+import kvm_vm, kvm_utils, kvm_subprocess, kvm_monitor, ppm_utils
 try:
 import PIL.Image
 except ImportError:
@@ -83,7 +83,11 @@ def preprocess_vm(test, params, env, name):
 raise error.TestError("Could not start VM")
 
 scrdump_filename = os.path.join(test.debugdir, "pre_%s.ppm" % name)
-vm.send_monitor_cmd("screendump %s" % scrdump_filename)
+try:
+if vm.monitor:
+vm.monitor.screendump(scrdump_filename)
+except kvm_monitor.MonitorError, e:
+logging.warn(e)
 
 
 def postprocess_image(test, params):
@@ -117,7 +121,11 @@ def postprocess_vm(test, params, env, name):
 return
 
 scrdump_filename = os.path.join(test.debugdir, "post_%s.ppm" % name)
-vm.send_monitor_cmd("screendump %s" % scrdump_filename)
+try:
+if vm.monitor:
+vm.monitor.screendump(scrdump_filename)
+except kvm_monitor.MonitorError, e:
+logging.warn(e)
 
 if params.get("kill_vm") == "yes":
 kill_vm_timeout = float(params.get("kill_vm_timeout", 0))
@@ -356,8 +364,9 @@ def postprocess(test, params, env):
 for vm in kvm_utils.env_get_all_vms(env):
 if not vm.is_dead():
 logging.info("VM '%s' is alive.", vm.name)
-logging.info("The monitor unix socket of '%s' is: %s",
- vm.name, vm.monitor_file_name)
+for m in vm.monitors:
+logging.info("'%s' has a %s monitor unix socket at: %s",
+ vm.name, m.protocol, m.filename)
 logging.info("The command line used to start '%s' was:\n%s",
  vm.name, vm.make_qemu_command())
 raise error.JobError(Abort requested (%s) % exc_string)
@@ -403,10 +412,6 @@ def _take_screendumps(test, params, env):
  kvm_utils.generate_random_string(6))
 delay = float(params.get("screendump_delay", 5))
 quality = int(params.get("screendump_quality", 30))
-if params.get("screendump_verbose") == 'yes':
-screendump_verbose = True
-else:
-screendump_verbose = False
 
 cache = {}
 
@@ -414,11 +419,11 @@ def _take_screendumps(test, params, env):
 for vm in kvm_utils.env_get_all_vms(env):
 if vm.is_dead():
 continue
-if screendump_verbose:
-vm.send_monitor_cmd("screendump %s" % temp_filename)
-else:
-vm.send_monitor_cmd("screendump %s" % temp_filename,
-verbose=False)
+try:
+vm.monitor.screendump(temp_filename)

[KVM-AUTOTEST PATCH 14/14] KVM test: migration: support QMP

2010-06-13 Thread Michael Goldish
If the value returned from a monitor method call is a string, treat it as
human monitor output.  Otherwise treat it as QMP output.

Signed-off-by: Michael Goldish mgold...@redhat.com
---
 client/tests/kvm/kvm_test_utils.py |   15 ---
 1 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/client/tests/kvm/kvm_test_utils.py 
b/client/tests/kvm/kvm_test_utils.py
index c3b6b8a..9fdea87 100644
--- a/client/tests/kvm/kvm_test_utils.py
+++ b/client/tests/kvm/kvm_test_utils.py
@@ -120,15 +120,24 @@ def migrate(vm, env=None):
 # Helper functions
 def mig_finished():
 o = vm.monitor.info("migrate")
-return "status: active" not in o
+if isinstance(o, str):
+return "status: active" not in o
+else:
+return o.get("status") != "active"
 
 def mig_succeeded():
 o = vm.monitor.info("migrate")
-return "status: completed" in o
+if isinstance(o, str):
+return "status: completed" in o
+else:
+return o.get("status") == "completed"
 
 def mig_failed():
 o = vm.monitor.info("migrate")
-return "status: failed" in o
+if isinstance(o, str):
+return "status: failed" in o
+else:
+return o.get("status") == "failed"
 
 # Clone the source VM and ask the clone to wait for incoming migration
 dest_vm = vm.clone()
-- 
1.5.4.1



[KVM-AUTOTEST PATCH 13/14] KVM test: kvm_monitor.py: add QMP interface

2010-06-13 Thread Michael Goldish
An initial QMP client implementation.
It should be fully functional and it supports asynchronous events.
However, most tests must be modified to support it, because it returns output
in a different format from the human monitor (the human monitor returns strings
and the QMP one returns dicts or lists).

To enable QMP, set main_monitor to a monitor whose monitor_type is qmp.

For example (a single QMP monitor):

monitors = monitor1
monitor_type_monitor1 = qmp
main_monitor = monitor1

Another example (multiple monitors, both human and QMP):

monitors = MyMonitor SomeOtherMonitor YetAnotherMonitor   # defines 3 monitors
monitor_type = human# default for all monitors
monitor_type_SomeOtherMonitor = qmp # applies only to SomeOtherMonitor
monitor_type_YetAnotherMonitor = qmp# applies only to YetAnotherMonitor
main_monitor = SomeOtherMonitor # the main monitor is a QMP one, so
# the test will use QMP

Note:
Monitor methods now raise exceptions such as MonitorLockError and QMPCmdError.
If this turns out to be a bad idea, it shouldn't be hard to revert to the old
convention of returning a (status, output) tuple.
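
For example, a test that prefers the old tolerant behaviour can wrap a call
itself (a hypothetical snippet; info() is the method the migration patch later
in this series relies on):

import logging
import kvm_monitor

def query_migrate_status(vm):
    # Monitor methods now raise on failure instead of returning (status, output).
    try:
        return vm.monitor.info("migrate")
    except kvm_monitor.MonitorError, e:
        logging.warn("info migrate failed: %s" % e)
        return None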

Signed-off-by: Michael Goldish mgold...@redhat.com
---
 client/tests/kvm/kvm_monitor.py |  275 +++
 client/tests/kvm/kvm_vm.py  |6 +-
 2 files changed, 279 insertions(+), 2 deletions(-)

diff --git a/client/tests/kvm/kvm_monitor.py b/client/tests/kvm/kvm_monitor.py
index c5cf9c3..76a1a83 100644
--- a/client/tests/kvm/kvm_monitor.py
+++ b/client/tests/kvm/kvm_monitor.py
@@ -6,6 +6,11 @@ Interfaces to the QEMU monitor.
 
 import socket, time, threading, logging
 import kvm_utils
+try:
+import json
+except ImportError:
+logging.warning("Could not import json module. "
+"QMP monitor functionality disabled.")
 
 
 class MonitorError(Exception):
@@ -28,6 +33,10 @@ class MonitorProtocolError(MonitorError):
 pass
 
 
+class QMPCmdError(MonitorError):
+pass
+
+
 class Monitor:
 
 Common code for monitor classes.
@@ -114,6 +123,8 @@ class HumanMonitor(Monitor):
 suppress_exceptions is False
 @raise MonitorProtocolError: Raised if the initial (qemu) prompt isn't
 found and suppress_exceptions is False
+@note: Other exceptions may be raised.  See _get_command_output's
+docstring.
 
 try:
 Monitor.__init__(self, filename)
@@ -354,3 +365,267 @@ class HumanMonitor(Monitor):
 @return: The command's output
 
+return self._get_command_output("mouse_button %d" % state)
+
+
+class QMPMonitor(Monitor):
+
+Wraps QMP monitor commands.
+
+
+def __init__(self, filename, suppress_exceptions=False):
+
+Connect to the monitor socket and issue the qmp_capabilities command
+
+@param filename: Monitor socket filename
+@raise MonitorConnectError: Raised if the connection fails and
+suppress_exceptions is False
+@note: Other exceptions may be raised if the qmp_capabilities command
+fails.  See _get_command_output's docstring.
+
+try:
+Monitor.__init__(self, filename)
+
+self.protocol = "qmp"
+self.events = []
+
+# Issue qmp_capabilities
+self._get_command_output("qmp_capabilities")
+
+except MonitorError, e:
+if suppress_exceptions:
+logging.warn(e)
+else:
+raise
+
+
+# Private methods
+
+def _build_cmd(self, cmd, args=None):
+obj = {"execute": cmd}
+if args:
+obj["arguments"] = args
+return obj
+
+
+def _read_objects(self, timeout=5):
+
+Read lines from monitor and try to decode them.
+Stop when all available lines have been successfully decoded, or when
+timeout expires.  If any decoded objects are asynchronous events, store
+them in self.events.  Return all decoded objects.
+
+@param timeout: Time to wait for all lines to decode successfully
+@return: A list of objects
+
+s = ""
+objs = []
+end_time = time.time() + timeout
+while time.time() < end_time:
+s += self._recvall()
+for line in s.splitlines():
+if not line:
+continue
+try:
+obj = json.loads(line)
+except:
+# Found an incomplete or broken line -- keep reading
+break
+objs += [obj]
+else:
+# All lines are OK -- stop reading
+break
+time.sleep(0.1)
+# Keep track of asynchronous events
+self.events += [obj for obj in objs if "event" in obj]
+return objs
+
+
+def 

[KVM-AUTOTEST PATCH 11/14] KVM test: add kvm_monitor.py, an interface to QEMU monitors

2010-06-13 Thread Michael Goldish
This module should replace vm.send_monitor_cmd().  Instead of connecting to the
monitor each time a command is issued, this module maintains a continuous
connection to the monitor.  It disconnects when a test terminates and
reconnects as soon as the next test begins (upon unpickling).

It currently contains only an interface to the human monitor.  A QMP interface
will be introduced in a future patch.
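
As a quick illustration of the reconnect-on-unpickle behaviour (a sketch only;
the socket path is made up, and the method names follow how later patches in
this series use the monitor):

import cPickle
import kvm_monitor

mon = kvm_monitor.HumanMonitor("/tmp/monitor-humanmonitor1-vm1")
mon.screendump("/tmp/pre_test.ppm")

# Pickling keeps only the constructor arguments (see __getinitargs__ below);
# unpickling calls __init__ again, which reconnects to the same socket.
blob = cPickle.dumps(mon)
mon2 = cPickle.loads(blob)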

Signed-off-by: Michael Goldish mgold...@redhat.com
---
 client/tests/kvm/kvm_monitor.py |  356 +++
 1 files changed, 356 insertions(+), 0 deletions(-)
 create mode 100644 client/tests/kvm/kvm_monitor.py

diff --git a/client/tests/kvm/kvm_monitor.py b/client/tests/kvm/kvm_monitor.py
new file mode 100644
index 000..c5cf9c3
--- /dev/null
+++ b/client/tests/kvm/kvm_monitor.py
@@ -0,0 +1,356 @@
+
+Interfaces to the QEMU monitor.
+
+...@copyright: 2008-2010 Red Hat Inc.
+
+
+import socket, time, threading, logging
+import kvm_utils
+
+
+class MonitorError(Exception):
+pass
+
+
+class MonitorConnectError(MonitorError):
+pass
+
+
+class MonitorSendError(MonitorError):
+pass
+
+
+class MonitorLockError(MonitorError):
+pass
+
+
+class MonitorProtocolError(MonitorError):
+pass
+
+
+class Monitor:
+
+Common code for monitor classes.
+
+
+def __init__(self, filename):
+
+Initialize the instance.
+
+@param filename: Monitor socket filename.
+@raise MonitorConnectError: Raised if the connection fails
+
+self.filename = filename
+self.lock = threading.RLock()
+self.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+self.socket.setblocking(False)
+
+try:
+self.socket.connect(filename)
+except socket.error:
+raise MonitorConnectError("Could not connect to monitor socket")
+
+
+def __del__(self):
+# Automatically close the connection when the instance is garbage
+# collected
+try:
+self.socket.shutdown(socket.SHUT_RDWR)
+except socket.error:
+pass
+self.socket.close()
+
+
+# The following two functions are defined to make sure the state is set
+# exclusively by the constructor call as specified in __getinitargs__().
+
+def __getstate__(self):
+pass
+
+
+def __setstate__(self, state):
+pass
+
+
+def __getinitargs__(self):
+# Save some information when pickling -- will be passed to the
+# constructor upon unpickling
+return self.filename, True
+
+
+def _acquire_lock(self, timeout=20):
+end_time = time.time() + timeout
+while time.time() < end_time:
+if self.lock.acquire(False):
+return True
+time.sleep(0.05)
+return False
+
+
+def _recvall(self):
+s = ""
+while True:
+try:
+data = self.socket.recv(1024)
+except socket.error:
+break
+if not data:
+break
+s += data
+return s
+
+
+class HumanMonitor(Monitor):
+
+Wraps human monitor commands.
+
+
+def __init__(self, filename, suppress_exceptions=False):
+
+Connect to the monitor socket and find the (qemu) prompt.
+
+@param filename: Monitor socket filename
+@raise MonitorConnectError: Raised if the connection fails and
+suppress_exceptions is False
+@raise MonitorProtocolError: Raised if the initial (qemu) prompt isn't
+found and suppress_exceptions is False
+
+try:
+Monitor.__init__(self, filename)
+
+self.protocol = "human"
+
+# Find the initial (qemu) prompt
+s, o = self._read_up_to_qemu_prompt(20)
+if not s:
+raise MonitorProtocolError("Could not find (qemu) prompt "
+   "after connecting to monitor. "
+   "Output so far: %r" % o)
+
+# Save the output of 'help' for future use
+self.help = self._get_command_output("help")
+
+except MonitorError, e:
+if suppress_exceptions:
+logging.warn(e)
+else:
+raise
+
+
+# Private methods
+
+def _read_up_to_qemu_prompt(self, timeout=20):
+o = ""
+end_time = time.time() + timeout
+while time.time() < end_time:
+try:
+data = self.socket.recv(1024)
+if not data:
+break
+o += data
+if o.splitlines()[-1].split()[-1] == "(qemu)":
+return True, "\n".join(o.splitlines()[:-1])
+except (socket.error, IndexError):
+time.sleep(0.01)
+return False, "\n".join(o.splitlines())
+
+
+def _send_command(self, command):
+
+Send a command 

[KVM-AUTOTEST PATCH 08/14] KVM test: kvm_vm.py: use shell quoting in make_qemu_command() where appropriate

2010-06-13 Thread Michael Goldish
Signed-off-by: Michael Goldish mgold...@redhat.com
---
 client/tests/kvm/kvm_vm.py |   36 ++--
 1 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/client/tests/kvm/kvm_vm.py b/client/tests/kvm/kvm_vm.py
index 039fbff..e40abb4 100755
--- a/client/tests/kvm/kvm_vm.py
+++ b/client/tests/kvm/kvm_vm.py
@@ -204,7 +204,7 @@ class VM:
 return " -name '%s'" % name
 
 def add_unix_socket_monitor(help, filename):
-return " -monitor unix:%s,server,nowait" % filename
+return " -monitor unix:'%s',server,nowait" % filename
 
 def add_mem(help, mem):
 return " -m %s" % mem
@@ -214,18 +214,18 @@ class VM:
 
 def add_cdrom(help, filename, index=2):
 if has_option(help, "drive"):
-return " -drive file=%s,index=%d,media=cdrom" % (filename,
- index)
+return " -drive file='%s',index=%d,media=cdrom" % (filename,
+   index)
 else:
-return " -cdrom %s" % filename
+return " -cdrom '%s'" % filename
 
 def add_drive(help, filename, format=None, cache=None, werror=None,
   serial=None, snapshot=False, boot=False):
-cmd = " -drive file=%s" % filename
+cmd = " -drive file='%s'" % filename
 if format: cmd += ",if=%s" % format
 if cache: cmd += ",cache=%s" % cache
 if werror: cmd += ",werror=%s" % werror
-if serial: cmd += ",serial=%s" % serial
+if serial: cmd += ",serial='%s'" % serial
 if snapshot: cmd += ",snapshot=on"
 if boot: cmd += ",boot=on"
 return cmd
@@ -233,23 +233,23 @@ class VM:
 def add_nic(help, vlan, model=None, mac=None):
 cmd = " -net nic,vlan=%d" % vlan
 if model: cmd += ",model=%s" % model
-if mac: cmd += ",macaddr=%s" % mac
+if mac: cmd += ",macaddr='%s'" % mac
 return cmd
 
 def add_net(help, vlan, mode, ifname=None, script=None,
 downscript=None):
 cmd = " -net %s,vlan=%d" % (mode, vlan)
 if mode == "tap":
-if ifname: cmd += ",ifname=%s" % ifname
-if script: cmd += ",script=%s" % script
-cmd += ",downscript=%s" % (downscript or "no")
+if ifname: cmd += ",ifname='%s'" % ifname
+if script: cmd += ",script='%s'" % script
+cmd += ",downscript='%s'" % (downscript or "no")
 return cmd
 
 def add_floppy(help, filename):
-return " -fda %s" % filename
+return " -fda '%s'" % filename
 
 def add_tftp(help, filename):
-return " -tftp %s" % filename
+return " -tftp '%s'" % filename
 
 def add_tcp_redir(help, host_port, guest_port):
 return " -redir tcp:%s::%s" % (host_port, guest_port)
@@ -267,22 +267,22 @@ class VM:
 return " -nographic"
 
 def add_uuid(help, uuid):
-return " -uuid %s" % uuid
+return " -uuid '%s'" % uuid
 
 def add_pcidevice(help, host):
-return " -pcidevice host=%s" % host
+return " -pcidevice host='%s'" % host
 
 def add_initrd(help, filename):
-return " -initrd %s" % filename
+return " -initrd '%s'" % filename
 
 def add_kernel(help, filename):
-return " -kernel %s" % filename
+return " -kernel '%s'" % filename
 
 def add_kernel_cmdline(help, cmdline):
-return " -append %s" % cmdline
+return " -append '%s'" % cmdline
 
 def add_testdev(help, filename):
-return (" -chardev file,id=testlog,path=%s"
+return (" -chardev file,id=testlog,path='%s'"
  " -device testdev,chardev=testlog" % filename)
 
 # End of command line option wrappers
-- 
1.5.4.1



[PATCH 1/4] x86, fpu: merge __save_init_fpu() implementations

2010-06-13 Thread Avi Kivity
The two __save_init_fpu() implementations are identical, merge them.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/include/asm/i387.h |   11 ++-
 1 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 815c5b2..df5badf 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -184,12 +184,6 @@ static inline void fpu_save_init(struct fpu *fpu)
fpu_clear(fpu);
 }
 
-static inline void __save_init_fpu(struct task_struct *tsk)
-{
-   fpu_save_init(&tsk->thread.fpu);
-   task_thread_info(tsk)->status &= ~TS_USEDFPU;
-}
-
 #else  /* CONFIG_X86_32 */
 
 #ifdef CONFIG_MATH_EMULATION
@@ -277,15 +271,14 @@ end:
;
 }
 
+#endif /* CONFIG_X86_64 */
+
 static inline void __save_init_fpu(struct task_struct *tsk)
 {
fpu_save_init(&tsk->thread.fpu);
task_thread_info(tsk)->status &= ~TS_USEDFPU;
 }
 
-
-#endif /* CONFIG_X86_64 */
-
 static inline int fpu_fxrstor_checking(struct fpu *fpu)
 {
return fxrstor_checking(&fpu->state->fxsave);
-- 
1.7.1



[PATCH 2/4] x86, fpu: run device not available trap with interrupts enabled

2010-06-13 Thread Avi Kivity
In order to allow a task's fpu state to fully float, we may need to
bring it back from another processor.  To do that, we need interrupts to
be enabled so we can fire off an IPI to that processor.

May break 80386/7 combos with FERR# wired through the interrupt controller.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kernel/traps.c |   13 +
 1 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 142d70c..c7d67cb 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -740,7 +740,6 @@ asmlinkage void math_state_restore(void)
struct task_struct *tsk = thread->task;
 
if (!tsk_used_math(tsk)) {
-   local_irq_enable();
/*
 * does a slab alloc which can sleep
 */
@@ -751,7 +750,6 @@ asmlinkage void math_state_restore(void)
do_group_exit(SIGKILL);
return;
}
-   local_irq_disable();
}
 
clts(); /* Allow maths ops (or we recurse) */
@@ -774,21 +772,20 @@ void math_emulate(struct math_emu_info *info)
 dotraplinkage void __kprobes
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
+   preempt_disable();
+   local_irq_enable();
 #ifdef CONFIG_X86_32
if (read_cr0() & X86_CR0_EM) {
struct math_emu_info info = { };
 
-   conditional_sti(regs);
-
info.regs = regs;
math_emulate(info);
-   } else {
-   math_state_restore(); /* interrupts still off */
-   conditional_sti(regs);
-   }
+   } else
+   math_state_restore();
 #else
math_state_restore();
 #endif
+   preempt_enable();
 }
 
 #ifdef CONFIG_X86_32
-- 
1.7.1



[PATCH 0/4] Really lazy fpu

2010-06-13 Thread Avi Kivity
Currently fpu management is only lazy in one direction.  When we switch into
a task, we may avoid loading the fpu state in the hope that the task will
never use it.  If we guess right we save an fpu load/save cycle; if not,
a Device not Available exception will remind us to load the fpu.

However, in the other direction, fpu management is eager.  When we switch out
of an fpu-using task, we always save its fpu state.

This is wasteful if the task(s) that run until we switch back in all don't use
the fpu, since we could have kept the task's fpu on the cpu all this time
and saved an fpu save/load cycle.  This can be quite common with threaded
interrupts, but will also happen with normal kernel threads and even normal
user tasks.

This patch series converts task fpu management to be fully lazy.  When
switching out of a task, we keep its fpu state on the cpu, only flushing it
if some other task needs the fpu.

Open issues/TODO:

- patch 2 enables interrupts during #NM.  There's a comment that says
  it shouldn't be done, presumably because of old-style #FERR handling.
  Need to fix one way or the other (dropping #FERR support, eagerly saving
  state when #FERR is detected, or dropping the entire optimization on i386)
- flush fpu state on cpu offlining (trivial)
- make sure the AMD FXSAVE workaround still works correctly
- reduce IPIs by flushing fpu state when we know a task is being migrated
  (guidance from scheduler folk appreciated)
- preemptible kernel_fpu_begin() to improve latency on raid and crypto setups
  (will post patches)
- lazy host-side kvm fpu management (will post patches)
- accelerate signal delivery by allocating signal handlers their own fpu
  state, and letting them run with the normal task's fpu until they use
  an fp instruction (will generously leave to interested parties)

Avi Kivity (4):
  x86, fpu: merge __save_init_fpu() implementations
  x86, fpu: run device not available trap with interrupts enabled
  x86, fpu: Let the fpu remember which cpu it is active on
  x86, fpu: don't save fpu state when switching from a task

 arch/x86/include/asm/i387.h  |  126 +-
 arch/x86/include/asm/processor.h |4 +
 arch/x86/kernel/i387.c   |3 +
 arch/x86/kernel/process.c|1 +
 arch/x86/kernel/process_32.c |   12 +++-
 arch/x86/kernel/process_64.c |   13 +++--
 arch/x86/kernel/traps.c  |   13 ++---
 7 files changed, 139 insertions(+), 33 deletions(-)



[PATCH 4/4] x86, fpu: don't save fpu state when switching from a task

2010-06-13 Thread Avi Kivity
Currently, we load the fpu state lazily when switching into a task: usually
we leave the fpu state in memory and only load it on demand.

However, when switching out of an fpu-using task, we eagerly save the fpu
state to memory.  This can be detrimental if we'll switch right back to this
task without touching the fpu again - we'll have run a save/load cycle for
nothing.

This patch changes fpu saving on switch out to be lazy - we simply leave the
fpu state alone.  If we're lucky, when we're back in this task the fpu state
will be loaded.  If not the fpu API will save the current fpu state and load
our state back.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kernel/process_32.c |   12 
 arch/x86/kernel/process_64.c |   13 -
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d12878..4cb5bc4 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -302,10 +302,12 @@ __switch_to(struct task_struct *prev_p, struct 
task_struct *next_p)
 * If the task has used fpu the last 5 timeslices, just do a full
 * restore of the math state immediately to avoid the trap; the
 * chances of needing FPU soon are obviously high now
+*
+* If the fpu is remote, we can't preload it since that requires an
+* IPI.  Let a math exception move it locally.
 */
-   preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
-
-   __unlazy_fpu(prev_p);
+   preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5
+&& !fpu_remote(&next->fpu);
 
/* we're going to use this soon, after a few expensive things */
if (preload_fpu)
@@ -351,8 +353,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct 
*next_p)
 
/* If we're going to preload the fpu context, make sure clts
   is run while we're batching the cpu state updates. */
-   if (preload_fpu)
+   if (preload_fpu || fpu_loaded(&next->fpu))
clts();
+   else
+   stts();
 
/*
 * Leave lazy mode, flushing any hypercalls made here.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3c2422a..65d2130 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -383,8 +383,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct 
*next_p)
 * If the task has used fpu the last 5 timeslices, just do a full
 * restore of the math state immediately to avoid the trap; the
 * chances of needing FPU soon are obviously high now
+*
+* If the fpu is remote, we can't preload it since that requires an
+* IPI.  Let a math exception move it locally.
+*/
-   preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
+   preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5
+&& !fpu_remote(&next->fpu);
 
/* we're going to use this soon, after a few expensive things */
if (preload_fpu)
@@ -418,12 +422,11 @@ __switch_to(struct task_struct *prev_p, struct 
task_struct *next_p)
 
load_TLS(next, cpu);
 
-   /* Must be after DS reload */
-   unlazy_fpu(prev_p);
-
/* Make sure cpu is ready for new context */
-   if (preload_fpu)
+   if (preload_fpu || fpu_loaded(&next->fpu))
clts();
+   else
+   stts();
 
/*
 * Leave lazy mode, flushing any hypercalls made here.
-- 
1.7.1



[PATCH 3/4] x86, fpu: Let the fpu remember which cpu it is active on

2010-06-13 Thread Avi Kivity
Add a member fpu->cpu to struct fpu which contains which cpu has this fpu
register set loaded (or -1 if the registers were flushed to memory in
fpu->state).

The various fpu accessors are modified to IPI the loaded cpu if it
happens to be different from the current cpu.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/include/asm/i387.h  |  115 +++--
 arch/x86/include/asm/processor.h |4 +
 arch/x86/kernel/i387.c   |3 +
 arch/x86/kernel/process.c|1 +
 4 files changed, 116 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index df5badf..124c89d 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -174,7 +174,7 @@ static inline void fpu_fxsave(struct fpu *fpu)
 #endif
 }
 
-static inline void fpu_save_init(struct fpu *fpu)
+static inline void __fpu_save_init(struct fpu *fpu)
 {
if (use_xsave())
fpu_xsave(fpu);
@@ -222,10 +222,7 @@ static inline int fxrstor_checking(struct 
i387_fxsave_struct *fx)
 #define safe_address (kstat_cpu(0).cpustat.user)
 #endif
 
-/*
- * These must be called with preempt disabled
- */
-static inline void fpu_save_init(struct fpu *fpu)
+static inline void __fpu_save_init(struct fpu *fpu)
 {
if (use_xsave()) {
struct xsave_struct *xstate = &fpu->state->xsave;
@@ -273,6 +270,33 @@ end:
 
 #endif /* CONFIG_X86_64 */
 
+static inline bool fpu_loaded(struct fpu *fpu)
+{
+   return fpu->cpu == smp_processor_id();
+}
+
+static inline bool fpu_remote(struct fpu *fpu)
+{
+   return fpu->cpu != -1 && fpu->cpu != smp_processor_id();
+}
+
+/*
+ * These must be called with preempt disabled
+ */
+static inline void fpu_save_init(struct fpu *fpu)
+{
+   ulong flags;
+
+   if (__get_cpu_var(current_fpu) != fpu
+   || fpu->cpu != smp_processor_id())
+   return;
+   local_irq_save(flags);
+   __fpu_save_init(fpu);
+   fpu->cpu = -1;
+   __get_cpu_var(current_fpu) = NULL;
+   local_irq_restore(flags);
+}
+
 static inline void __save_init_fpu(struct task_struct *tsk)
 {
fpu_save_init(&tsk->thread.fpu);
@@ -284,7 +308,7 @@ static inline int fpu_fxrstor_checking(struct fpu *fpu)
return fxrstor_checking(&fpu->state->fxsave);
 }
 
-static inline int fpu_restore_checking(struct fpu *fpu)
+static inline int __fpu_restore_checking(struct fpu *fpu)
 {
if (use_xsave())
return fpu_xrstor_checking(fpu);
@@ -292,6 +316,47 @@ static inline int fpu_restore_checking(struct fpu *fpu)
return fpu_fxrstor_checking(fpu);
 }
 
+static inline void __fpu_unload(void *_fpu)
+{
+   struct fpu *fpu = _fpu;
+   unsigned cr0 = read_cr0();
+
+   if (cr0 & X86_CR0_TS)
+   clts();
+   if (__get_cpu_var(current_fpu) == fpu)
+   fpu_save_init(fpu);
+   if (cr0 & X86_CR0_TS)
+   write_cr0(cr0);
+}
+
+static inline void fpu_unload(struct fpu *fpu)
+{
+   int cpu = ACCESS_ONCE(fpu->cpu);
+
+   if (cpu != -1)
+   smp_call_function_single(cpu, __fpu_unload, fpu, 1);
+}
+
+static inline int fpu_restore_checking(struct fpu *fpu)
+{
+   ulong flags;
+   struct fpu *oldfpu;
+   int ret;
+
+   if (fpu->cpu == smp_processor_id())
+   return 0;
+   fpu_unload(fpu);
+   local_irq_save(flags);
+   oldfpu = __get_cpu_var(current_fpu);
+   if (oldfpu)
+   fpu_save_init(oldfpu);
+   ret = __fpu_restore_checking(fpu);
+   fpu->cpu = smp_processor_id();
+   __get_cpu_var(current_fpu) = fpu;
+   local_irq_restore(flags);
+   return ret;
+}
+
 static inline int restore_fpu_checking(struct task_struct *tsk)
 {
return fpu_restore_checking(&tsk->thread.fpu);
@@ -451,18 +516,46 @@ static bool fpu_allocated(struct fpu *fpu)
return fpu->state != NULL;
 }
 
+static inline void fpu_init_empty(struct fpu *fpu)
+{
+   fpu->state = NULL;
+   fpu->cpu = -1;
+}
+
 static inline int fpu_alloc(struct fpu *fpu)
 {
if (fpu_allocated(fpu))
return 0;
fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
+   fpu->cpu = -1;
if (!fpu->state)
return -ENOMEM;
WARN_ON((unsigned long)fpu->state & 15);
return 0;
 }
 
-static inline void fpu_free(struct fpu *fpu)
+static inline void __fpu_forget(void *_fpu)
+{
+   struct fpu *fpu = _fpu;
+
+   if (fpu->cpu == smp_processor_id()) {
+   fpu->cpu = -1;
+   __get_cpu_var(current_fpu) = NULL;
+   }
+}
+
+static inline void fpu_forget(struct fpu *fpu)
+{
+   int cpu;
+
+   preempt_disable();
+   cpu = ACCESS_ONCE(fpu->cpu);
+   if (cpu != -1)
+   smp_call_function_single(cpu, __fpu_forget, fpu, 1);
+   preempt_enable();
+}
+
+static inline void __fpu_free(struct fpu *fpu)
 {
if (fpu->state) {

Re: [RFC][PATCH 1/2] Linux/Guest unmapped page cache control

2010-06-13 Thread Balbir Singh
* Balbir Singh bal...@linux.vnet.ibm.com [2010-06-08 21:21:46]:

 Selectively control Unmapped Page Cache (nospam version)
 
 From: Balbir Singh bal...@linux.vnet.ibm.com
 
 This patch implements unmapped page cache control via preferred
 page cache reclaim. The current patch hooks into kswapd and reclaims
 page cache if the user has requested for unmapped page control.
 This is useful in the following scenario
 
 - In a virtualized environment with cache=writethrough, we see
   double caching - (one in the host and one in the guest). As
   we try to scale guests, cache usage across the system grows.
   The goal of this patch is to reclaim page cache when Linux is running
   as a guest and get the host to hold the page cache and manage it.
   There might be temporary duplication, but in the long run, memory
   in the guests would be used for mapped pages.
 - The option is controlled via a boot option and the administrator
   can selectively turn it on, on a need to use basis.
 
 A lot of the code is borrowed from zone_reclaim_mode logic for
 __zone_reclaim(). One might argue that with ballooning and
 KSM this feature is not very useful, but even with ballooning,
 we need extra logic to balloon multiple VM machines and it is hard
 to figure out the correct amount of memory to balloon. With these
 patches applied, each guest has a sufficient amount of free memory
 available, that can be easily seen and reclaimed by the balloon driver.
 The additional memory in the guest can be reused for additional
 applications or used to start additional guests/balance memory in
 the host.
 
 KSM currently does not de-duplicate host and guest page cache. The goal
 of this patch is to help automatically balance unmapped page cache when
 instructed to do so.
 
 There are some magic numbers in use in the code, UNMAPPED_PAGE_RATIO
 and the number of pages to reclaim when unmapped_page_control argument
 is supplied. These numbers were chosen to avoid aggressiveness in
 reaping page cache ever so frequently, at the same time providing control.
 
 The sysctl for min_unmapped_ratio provides further control from
 within the guest on the amount of unmapped pages to reclaim.


Are there any major objections to this patch?
 
-- 
Three Cheers,
Balbir


Re: [PATCH 0/4] Really lazy fpu

2010-06-13 Thread Valdis . Kletnieks
On Sun, 13 Jun 2010 18:03:43 +0300, Avi Kivity said:
 Currently fpu management is only lazy in one direction.  When we switch into
 a task, we may avoid loading the fpu state in the hope that the task will
 never use it.  If we guess right we save an fpu load/save cycle; if not,
 a Device not Available exception will remind us to load the fpu.
 
 However, in the other direction, fpu management is eager.  When we switch out
 of an fpu-using task, we always save its fpu state.

Does anybody have numbers on how many clocks it takes a modern CPU design
to do an FPU state save or restore?  I know it must have been painful in the
days before cache memory, having to make added trips out to RAM for 128-bit
registers.  But what's the impact today? (Yes, I see there's the potential
for a painful IPI call - anything else?)

Do we have any numbers on how many saves/restores this will save us when
running the hypothetical standard Gnome desktop environment?  How common is
the case where we go all the way around back to the original single FPU-using task?




Re: [RFC][PATCH 1/2] Linux/Guest unmapped page cache control

2010-06-13 Thread KAMEZAWA Hiroyuki
On Mon, 14 Jun 2010 00:01:45 +0530
Balbir Singh bal...@linux.vnet.ibm.com wrote:

 * Balbir Singh bal...@linux.vnet.ibm.com [2010-06-08 21:21:46]:
 
  Selectively control Unmapped Page Cache (nospam version)
  
  From: Balbir Singh bal...@linux.vnet.ibm.com
  
  This patch implements unmapped page cache control via preferred
  page cache reclaim. The current patch hooks into kswapd and reclaims
  page cache if the user has requested for unmapped page control.
  This is useful in the following scenario
  
  - In a virtualized environment with cache=writethrough, we see
double caching - (one in the host and one in the guest). As
we try to scale guests, cache usage across the system grows.
The goal of this patch is to reclaim page cache when Linux is running
as a guest and get the host to hold the page cache and manage it.
There might be temporary duplication, but in the long run, memory
in the guests would be used for mapped pages.
  - The option is controlled via a boot option and the administrator
can selectively turn it on, on a need to use basis.
  
  A lot of the code is borrowed from zone_reclaim_mode logic for
  __zone_reclaim(). One might argue that the with ballooning and
  KSM this feature is not very useful, but even with ballooning,
  we need extra logic to balloon multiple VM machines and it is hard
  to figure out the correct amount of memory to balloon. With these
  patches applied, each guest has a sufficient amount of free memory
  available, that can be easily seen and reclaimed by the balloon driver.
  The additional memory in the guest can be reused for additional
  applications or used to start additional guests/balance memory in
  the host.
  
  KSM currently does not de-duplicate host and guest page cache. The goal
  of this patch is to help automatically balance unmapped page cache when
  instructed to do so.
  
  There are some magic numbers in use in the code, UNMAPPED_PAGE_RATIO
  and the number of pages to reclaim when unmapped_page_control argument
  is supplied. These numbers were chosen to avoid aggressiveness in
  reaping page cache ever so frequently, at the same time providing control.
  
  The sysctl for min_unmapped_ratio provides further control from
  within the guest on the amount of unmapped pages to reclaim.
 
 
 Are there any major objections to this patch?
  

This kind of patch needs measurements showing how well it works.

- How did you measure the effect of the patch? kernbench is not enough, of
course.
- Why don't you trust the LRU? And if the LRU doesn't work well, should it be
  fixed by a knob rather than by a generic approach?
- No side effects?

- Linux vm guys tend to say "free memory is bad memory". OK, what is the
  free memory created by your patch used for? IOW, I can't see the benefit.
  If the free memory your patch creates is just used for more page cache,
  it will soon be dropped again by your patch itself.

  If your patch only drops duplicated pages that are no longer necessary,
  I agree it may increase the available amount of page cache. But you just
  drop unmapped pages.
  Hmm.

Thanks,
-Kame
 



Re: [Autotest] [KVM-AUTOTEST PATCH 11/14] KVM test: add kvm_monitor.py, an interface to QEMU monitors

2010-06-13 Thread Amos Kong
On Sun, Jun 13, 2010 at 05:33:42PM +0300, Michael Goldish wrote:
 This module should replace vm.send_monitor_cmd().  Instead of connecting to 
 the
 monitor each time a command is issued, this module maintains a continuous
 connection to the monitor.  It disconnects when a test terminates and
 reconnects as soon as the next test begins (upon unpickling).
 
 It currently contains only an interface to the human monitor.  A QMP interface
 will be introduced in a future patch.
 
 Signed-off-by: Michael Goldish mgold...@redhat.com
 ---
  client/tests/kvm/kvm_monitor.py |  356 
 +++
  1 files changed, 356 insertions(+), 0 deletions(-)
  create mode 100644 client/tests/kvm/kvm_monitor.py
 
 diff --git a/client/tests/kvm/kvm_monitor.py b/client/tests/kvm/kvm_monitor.py
 new file mode 100644
 index 000..c5cf9c3
 --- /dev/null
 +++ b/client/tests/kvm/kvm_monitor.py
 @@ -0,0 +1,356 @@
 +
 +Interfaces to the QEMU monitor.
 +
 +...@copyright: 2008-2010 Red Hat Inc.
 +
 +

...

 +class HumanMonitor(Monitor):
 +
 +Wraps human monitor commands.
 +
 +
 +def __init__(self, filename, suppress_exceptions=False):
 +
 +Connect to the monitor socket and find the (qemu) prompt.
 +
 +@param filename: Monitor socket filename
 +@raise MonitorConnectError: Raised if the connection fails and
 +suppress_exceptions is False
 +@raise MonitorProtocolError: Raised if the initial (qemu) prompt 
 isn't
 +found and suppress_exceptions is False
 +
 +try:
 +Monitor.__init__(self, filename)
 +
 +self.protocol = human
 +
 +# Find the initial (qemu) prompt
 +s, o = self._read_up_to_qemu_prompt(20)
 +if not s:
 +raise MonitorProtocolError(Could not find (qemu) prompt 
 +   after connecting to monitor. 
 +   Output so far: %r % o)
 +
 +# Save the output of 'help' for future use
+self.help = self._get_command_output("help")

Hi Michael,

Here, self.help is a string.
But you also define a method self.help() on the class below.

If I call vm.monitor.help() in a test case, it will hit this error:
 TypeError: 'str' object is not callable

How about using self.help_str = self._get_command_output("help")?
And remove the help() method; it is not only a repeated name but a duplicated function.
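
A minimal self-contained illustration of the clash (simplified; the real class
reads the cached string from the monitor socket):

class HumanMonitor(object):
    def __init__(self):
        # Caching the 'help' output as an instance attribute shadows the
        # method of the same name defined on the class.
        self.help = "info status\nscreendump filename\n..."

    def help(self):
        return "help output"

m = HumanMonitor()
print type(m.help)     # <type 'str'>, the attribute wins
try:
    m.help()
except TypeError, e:
    print e            # 'str' object is not callable

Renaming the cached string to self.help_str, as suggested, keeps the help()
method callable.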


Amos

 +
 +except MonitorError, e:
 +if suppress_exceptions:
 +logging.warn(e)
 +else:
 +raise
 +

...

 +
 +def help(self):
 +
 +Send help and return the output.
 +
+return self._get_command_output("help")

...


Re: [Autotest] [KVM-AUTOTEST PATCH 13/14] KVM test: kvm_monitor.py: add QMP interface

2010-06-13 Thread Amos Kong
On Sun, Jun 13, 2010 at 05:33:44PM +0300, Michael Goldish wrote:
 An initial QMP client implementation.
 Should be fully functional and supports asynchronous events.
 However, most tests must be modified to support it, because it returns output
 in a different format from the human monitor (the human monitor returns 
 strings
 and the QMP one returns dicts or lists).
 
 To enable QMP, set main_monitor to a monitor whose monitor_type is qmp.
 
 For example (a single QMP monitor):
 
 monitors = monitor1
 monitor_type_monitor1 = qmp
 main_monitor = monitor1
 
 Another example (multiple monitors, both human and QMP):
 
 monitors = MyMonitor SomeOtherMonitor YetAnotherMonitor   # defines 3 
 monitors
 monitor_type = human# default for all monitors
 monitor_type_SomeOtherMonitor = qmp # applies only to SomeOtherMonitor
 monitor_type_YetAnotherMonitor = qmp# applies only to 
 YetAnotherMonitor
 main_monitor = SomeOtherMonitor # the main monitor is a QMP one, 
 so
 # the test will use QMP
 
 Note:
 Monitor methods now raise exceptions such as MonitorLockError and QMPCmdError.
 If this turns out to be a bad idea, it shouldn't be hard to revert to the old
 convention of returning a (status, output) tuple.
 
 Signed-off-by: Michael Goldish mgold...@redhat.com
 ---
  client/tests/kvm/kvm_monitor.py |  275 
 +++
  client/tests/kvm/kvm_vm.py  |6 +-
  2 files changed, 279 insertions(+), 2 deletions(-)
 
 diff --git a/client/tests/kvm/kvm_monitor.py b/client/tests/kvm/kvm_monitor.py
 index c5cf9c3..76a1a83 100644
 --- a/client/tests/kvm/kvm_monitor.py
 +++ b/client/tests/kvm/kvm_monitor.py
 @@ -6,6 +6,11 @@ Interfaces to the QEMU monitor.
  
  import socket, time, threading, logging
  import kvm_utils
 +try:
 +import json
 +except ImportError:
 +logging.warning(Could not import json module. 
 +QMP monitor functionality disabled.)
  
  
  class MonitorError(Exception):
 @@ -28,6 +33,10 @@ class MonitorProtocolError(MonitorError):
  pass
  
  
 +class QMPCmdError(MonitorError):
 +pass
 +
 +
  class Monitor:
  
  Common code for monitor classes.
 @@ -114,6 +123,8 @@ class HumanMonitor(Monitor):
  suppress_exceptions is False
  @raise MonitorProtocolError: Raised if the initial (qemu) prompt 
 isn't
  found and suppress_exceptions is False
 +@note: Other exceptions may be raised.  See _get_command_output's
 +docstring.
  
  try:
  Monitor.__init__(self, filename)
 @@ -354,3 +365,267 @@ class HumanMonitor(Monitor):
  @return: The command's output
  
  return self._get_command_output(mouse_button %d % state)
 +
 +
 +class QMPMonitor(Monitor):
 +
 +Wraps QMP monitor commands.
 +
 +
 +def __init__(self, filename, suppress_exceptions=False):
 +
 +Connect to the monitor socket and issue the qmp_capabilities command
 +
 +@param filename: Monitor socket filename
 +@raise MonitorConnectError: Raised if the connection fails and
 +suppress_exceptions is False
 +@note: Other exceptions may be raised if the qmp_capabilities command
 +fails.  See _get_command_output's docstring.
 +
 +try:
 +Monitor.__init__(self, filename)
 +
 +self.protocol = qmp
 +self.events = []
 +
 +# Issue qmp_capabilities
 +self._get_command_output(qmp_capabilities)
 +
 +except MonitorError, e:
 +if suppress_exceptions:
 +logging.warn(e)
 +else:
 +raise
 +
 +
 +# Private methods
 +
 +def _build_cmd(self, cmd, args=None):
 +obj = {execute: cmd}
 +if args:
 +obj[arguments] = args
 +return obj
 +
 +
 +def _read_objects(self, timeout=5):
 +
 +Read lines from monitor and try to decode them.
 +Stop when all available lines have been successfully decoded, or when
 +timeout expires.  If any decoded objects are asynchronous events, 
 store
 +them in self.events.  Return all decoded objects.
 +
 +@param timeout: Time to wait for all lines to decode successfully
 +@return: A list of objects
 +
 +s = 
 +objs = []
 +end_time = time.time() + timeout
 +while time.time()  end_time:
 +s += self._recvall()
 +for line in s.splitlines():
 +if not line:
 +continue
 +try:
 +obj = json.loads(line)
 +except:
 +# Found an incomplete or broken line -- keep reading
 +break
 +objs += [obj]
 +else:
 +# All lines are OK -- stop 

[RFC PATCH 0/5] Introduce canonical device hierarchy string

2010-06-13 Thread Alex Williamson
This is a follow-up to my ramblock overhaul RFC series.  In trying to
come up with a useful name to give to a ramblock, we seemed to be leaning
towards something that represents the existing qdev hierarchy rather
than creating an arbitrary new namespace.  I had some pointers that I
should use the savevm name/instance in the interim, but that has a number
of issues (private to savevm, typically setup too late, inconsistent).

So, I decided to look at what should the savevm string be, with hopes that
if we could figure that out, we can then stuff the resulting string into
both the savevm list and the ramblocks.  This is a stab at doing that.

My premise with this attempt is that we walk the hierarchy and use the
names to create the base of the path.  As we get to the device,
particularly to the parent bus of the device, we need to start looking at
properties to ensure uniqueness.  However, we don't want to use all the
properties or else any properties added or removed from a device will cause
migration failures.  For now, the only properties I've tagged as path
properties are PCI bus addresses and MAC addresses.  It turns out the MAC
isn't needed in most cases since they're typically PCI based, but I left
it in in case someone figures out how to make multiple instances of
ne2k_isa work (or non-PCI based NICs on other archs).  In any case, these
seem like they should be stable properties for a device.
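
For illustration, here is a minimal sketch of how another property type could
be tagged the same way; PropertyInfo, PROP_FLAG_PATH and the parse/print hook
signatures come from patch 1/5, while the "chip-select" property and its two
helpers are made up and assume this sits next to the existing property types
in hw/qdev-properties.c:

/* Made-up example property, flagged so its value lands in the device path. */
static int parse_csel(DeviceState *dev, Property *prop, const char *str)
{
    uint32_t *ptr = (void *)((char *)dev + prop->offset);

    *ptr = strtoul(str, NULL, 0);
    return 0;
}

static int print_csel(DeviceState *dev, Property *prop, char *dest, size_t len)
{
    uint32_t *ptr = (void *)((char *)dev + prop->offset);

    return snprintf(dest, len, "%u", *ptr);
}

PropertyInfo qdev_prop_csel = {
    .name  = "chip-select",
    .type  = PROP_TYPE_UINT32,
    .size  = sizeof(uint32_t),
    .flags = PROP_FLAG_PATH,   /* append ",chip-select=<n>" to the path */
    .parse = parse_csel,
    .print = print_csel,
};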

I've compiled all the archs, but I've only actually run x86_64-system.
For a well populated VM, here's what got used as id strings in the
savevm_handlers list:

timer
slirp
slirp
slirp
slirp
slirp
block
ram
cpu_common
cpu
apic
fw_cfg
i8259
i8259
ioapic
PCIBUS
/main-system-bus/pci.0,addr=00.0/i440FX/I440FX
/main-system-bus/pci.0,addr=01.0/PIIX3/PIIX3
/main-system-bus/pci.0,addr=02.0/cirrus-vga/cirrus_vga
/main-system-bus/pci.0/isa.0/mc146818rtc/mc146818rtc
i8254
hpet
/main-system-bus/pci.0/isa.0/isa-serial/serial
ps2kbd
ps2mouse
/main-system-bus/pci.0/isa.0/i8042/pckbd
vmmouse
dma
dma
/main-system-bus/pci.0/isa.0/isa-fdc/fdc
/main-system-bus/pci.0,addr=03.0/i82551,mac=52:54:00:12:34:56/eeprom
/main-system-bus/pci.0,addr=03.0/i82551,mac=52:54:00:12:34:56/i82551
/main-system-bus/pci.0,addr=04.0/virtio-net-pci,mac=52:54:00:12:34:57/virtio-net
/main-system-bus/pci.0,addr=05.0/e1000,mac=52:54:00:12:34:58/e1000
/main-system-bus/pci.0,addr=06.0/rtl8139,mac=52:54:00:12:34:59/rtl8139
/main-system-bus/pci.0,addr=07.0/pcnet,mac=52:54:00:12:34:5a/pcnet
/main-system-bus/pci.0,addr=01.1/piix3-ide/ide
i2c_bus
/main-system-bus/pci.0,addr=01.3/PIIX4_PM/piix4_pm
/main-system-bus/pci.0,addr=08.0/lsi53c895a/lsiscsi
/main-system-bus/pci.0,addr=09.0/virtio-blk-pci/virtio-blk

Let me know what you think.  Thanks,

Alex

---

Alex Williamson (5):
  virtio-net: Incorporate a DeviceState pointer and let savevm track instances
  eepro100: Add a dev field to eeprom new/free functions
  savevm: Make use of the new DeviceState param
  savevm: Add DeviceState param
  qdev: Create qdev_get_dev_path()


 audio/audio.c  |2 -
 block-migration.c  |2 -
 exec.c |4 +-
 hw/adb.c   |4 +-
 hw/ads7846.c   |2 -
 hw/apic.c  |2 -
 hw/arm_gic.c   |2 -
 hw/arm_timer.c |4 +-
 hw/armv7m_nvic.c   |2 -
 hw/cirrus_vga.c|2 -
 hw/cuda.c  |2 -
 hw/dma.c   |4 +-
 hw/eepro100.c  |8 ++--
 hw/eeprom93xx.c|8 ++--
 hw/eeprom93xx.h|4 +-
 hw/fw_cfg.c|2 -
 hw/g364fb.c|2 -
 hw/grackle_pci.c   |4 +-
 hw/gt64xxx.c   |3 +
 hw/heathrow_pic.c  |2 -
 hw/hpet.c  |2 -
 hw/hw.h|   18 +---
 hw/i2c.c   |2 -
 hw/i8254.c |2 -
 hw/i8259.c |2 -
 hw/ide/cmd646.c|2 -
 hw/ide/isa.c   |2 -
 hw/ide/macio.c |2 -
 hw/ide/microdrive.c|2 -
 hw/ide/mmio.c  |2 -
 hw/ide/piix.c  |2 -
 hw/ioapic.c|2 -
 hw/m48t59.c|2 -
 hw/mac_dbdma.c |2 -
 hw/mac_nvram.c |2 -
 hw/max111x.c   |3 +
 hw/mipsnet.c   |4 +-
 hw/mst_fpga.c  |2 -
 hw/nand.c  |2 -
 hw/openpic.c   |5 +-
 hw/pci.c   |2 -
 hw/pckbd.c |2 -
 hw/piix4.c |2 -
 hw/pl011.c |2 -
 hw/pl022.c |2 -
 hw/pl061.c |2 -
 hw/ppc4xx_pci.c|4 +-
 hw/ppce500_pci.c   |4 +-
 hw/ps2.c   |4 +-
 hw/pxa2xx.c|   24 +--
 hw/pxa2xx_dma.c|2 -
 hw/pxa2xx_gpio.c   |2 -
 hw/pxa2xx_keypad.c |2 -
 hw/pxa2xx_lcd.c|2 -
 hw/pxa2xx_mmci.c   |2 -
 hw/pxa2xx_pic.c|2 -
 hw/pxa2xx_timer.c  |2 -
 hw/qdev-properties.c   |2 +
 

[RFC PATCH 1/5] qdev: Create qdev_get_dev_path()

2010-06-13 Thread Alex Williamson
qdev_get_dev_path() is intended to be the canonical utility for creating
a string representing the qdev hierarchy of a device.  The path consists
of bus and device names as well as identified properties of the immediate
parent bus and device.  This results in strings like:

/main-system-bus/pci.0,addr=00.0/i440FX
/main-system-bus/pci.0,addr=01.0/PIIX3
/main-system-bus/pci.0,addr=02.0/cirrus-vga
/main-system-bus/pci.0/isa.0/mc146818rtc
/main-system-bus/pci.0/isa.0/isa-serial
/main-system-bus/pci.0/isa.0/i8042
/main-system-bus/pci.0/isa.0/isa-fdc
/main-system-bus/pci.0,addr=03.0/i82551,mac=52:54:00:12:34:56
/main-system-bus/pci.0,addr=04.0/virtio-net-pci,mac=52:54:00:12:34:57
/main-system-bus/pci.0,addr=05.0/e1000,mac=52:54:00:12:34:58
/main-system-bus/pci.0,addr=06.0/rtl8139,mac=52:54:00:12:34:59
/main-system-bus/pci.0,addr=07.0/pcnet,mac=52:54:00:12:34:5a
/main-system-bus/pci.0,addr=01.1/piix3-ide
/main-system-bus/pci.0,addr=01.3/PIIX4_PM
/main-system-bus/pci.0,addr=08.0/lsi53c895a
/main-system-bus/pci.0,addr=09.0/virtio-blk-pci

There are two primary targets for these strings.  The first is savevm, where
we currently use a device-provided string plus instance number to track
SaveStateEntries.  This fails when we introduce device hotplug, particularly
in cases where we have gaps in the instance numbers that cannot easily be
reproduced on a migration target.  The second is for naming RAMBlocks.  For
these, we would like a string that matches the vmstate string.
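
As a rough sketch of how the savevm side could consume this (the actual
changes live in the "savevm: Add DeviceState param" / "savevm: Make use of
the new DeviceState param" patches, which aren't quoted here, so the helper
name and the exact "<path>/<idstr>" layout below are assumptions on my part):

/*
 * Hedged sketch, not the code from patches 3/4: derive a unique
 * SaveStateEntry identifier from the new DeviceState argument.
 * qdev_get_dev_path() is the function added in this patch; qemu_free()
 * and qemu_strdup() are existing helpers.
 */
static char *build_savevm_idstr(DeviceState *dev, const char *idstr)
{
    char buf[256];

    if (dev) {
        char *path = qdev_get_dev_path(dev);

        snprintf(buf, sizeof(buf), "%s/%s", path, idstr);
        qemu_free(path);
    } else {
        /* legacy callers without a qdev keep their plain idstr */
        snprintf(buf, sizeof(buf), "%s", idstr);
    }
    return qemu_strdup(buf);
}

With something like this in place, registering virtio-net against its
DeviceState would yield an id string such as
/main-system-bus/pci.0,addr=04.0/virtio-net-pci,mac=52:54:00:12:34:57/virtio-net,
matching the list in the cover letter.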

Signed-off-by: Alex Williamson alex.william...@redhat.com
---

 hw/qdev-properties.c |2 ++
 hw/qdev.c|   57 ++
 hw/qdev.h|5 
 3 files changed, 64 insertions(+), 0 deletions(-)

diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
index 9ffdba7..e30df88 100644
--- a/hw/qdev-properties.c
+++ b/hw/qdev-properties.c
@@ -453,6 +453,7 @@ PropertyInfo qdev_prop_macaddr = {
     .name  = "macaddr",
 .type  = PROP_TYPE_MACADDR,
 .size  = sizeof(MACAddr),
+.flags = PROP_FLAG_PATH,
 .parse = parse_mac,
 .print = print_mac,
 };
@@ -496,6 +497,7 @@ PropertyInfo qdev_prop_pci_devfn = {
     .name  = "pci-devfn",
 .type  = PROP_TYPE_UINT32,
 .size  = sizeof(uint32_t),
+.flags = PROP_FLAG_PATH,
 .parse = parse_pci_devfn,
 .print = print_pci_devfn,
 };
diff --git a/hw/qdev.c b/hw/qdev.c
index 36f29ea..dea44fe 100644
--- a/hw/qdev.c
+++ b/hw/qdev.c
@@ -120,6 +120,63 @@ DeviceState *qdev_create(BusState *bus, const char *name)
 return qdev_create_from_info(bus, info);
 }
 
+static int qdev_strprint_parent_path(DeviceState *dev, char *buf, size_t len)
+{
+    int offset = 0;
+
+    if (dev->parent_bus && dev->parent_bus->parent)
+        offset = qdev_strprint_parent_path(dev->parent_bus->parent, buf, len);
+
+    offset += snprintf(buf + offset, len - offset,
+                       "/%s", dev->parent_bus->name);
+    return offset;
+}
+
+static int qdev_strprint_path_props(DeviceState *dev, Property *props,
+                                    char *buf, size_t len)
+{
+    int offset = 0;
+    char pbuf[64];
+
+    if (!props)
+        return 0;
+
+    while (props->name) {
+        if (props->info->flags & PROP_FLAG_PATH) {
+            if (props->info->print) {
+                props->info->print(dev, props, pbuf, sizeof(pbuf));
+                offset += snprintf(buf + offset, len - offset, ",%s=%s",
+                                   props->name, pbuf);
+            }
+        }
+        props++;
+    }
+    return offset;
+}
+
+char *qdev_get_dev_path(DeviceState *dev)
+{
+    char buf[256] = "";
+    int offset;
+
+    if (!dev)
+        return NULL;
+
+    offset = qdev_strprint_parent_path(dev, buf, sizeof(buf));
+
+    offset += qdev_strprint_path_props(dev, dev->parent_bus->info->props,
+                                       buf + offset, sizeof(buf) - offset);
+
+    offset += snprintf(buf + offset, sizeof(buf) - offset, "/%s",
+                       dev->info->name);
+    if (dev->id)
+        offset += snprintf(buf + offset, sizeof(buf) - offset, "-%s", dev->id);
+
+    offset += qdev_strprint_path_props(dev, dev->info->props,
+                                       buf + offset, sizeof(buf) - offset);
+    return qemu_strdup(buf);
+}
+
 static void qdev_print_devinfo(DeviceInfo *info)
 {
     error_printf("name \"%s\", bus %s",
diff --git a/hw/qdev.h b/hw/qdev.h
index a44060e..2702384 100644
--- a/hw/qdev.h
+++ b/hw/qdev.h
@@ -96,6 +96,7 @@ struct PropertyInfo {
 const char *name;
 size_t size;
 enum PropertyType type;
+int flags;
 int (*parse)(DeviceState *dev, Property *prop, const char *str);
 int (*print)(DeviceState *dev, Property *prop, char *dest, size_t len);
 };
@@ -201,6 +202,8 @@ extern PropertyInfo qdev_prop_netdev;
 extern PropertyInfo qdev_prop_vlan;
 extern PropertyInfo qdev_prop_pci_devfn;
 
+#define PROP_FLAG_PATH (1<<0)
+
 #define DEFINE_PROP(_name, _state, _field, _prop, _type) { \
 .name  = (_name),

[RFC PATCH 5/5] virtio-net: Incorporate a DeviceState pointer and let savevm track instances

2010-06-13 Thread Alex Williamson
Stuff a pointer to the DeviceState into the VirtIONet structure so that
we can easily remove the vmstate entry later.  Also, let vmstate track
the instance number (it should always be zero internally since the
device path should now be unique).

Signed-off-by: Alex Williamson alex.william...@redhat.com
---

 hw/virtio-net.c |7 ---
 1 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index e9768e0..f41db45 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -60,6 +60,7 @@ typedef struct VirtIONet
 uint8_t *macs;
 } mac_table;
 uint32_t *vlans;
+DeviceState *qdev;
 } VirtIONet;
 
 /* TODO
@@ -890,7 +891,6 @@ static void virtio_net_vmstate_change(void *opaque, int running, int reason)
 VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
 {
 VirtIONet *n;
-static int virtio_net_id;
 
 n = (VirtIONet *)virtio_common_init("virtio-net", VIRTIO_ID_NET,
 sizeof(struct virtio_net_config),
@@ -923,7 +923,8 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
 
 n->vlans = qemu_mallocz(MAX_VLAN >> 3);
 
-register_savevm(NULL, "virtio-net", virtio_net_id++, VIRTIO_NET_VM_VERSION,
+n->qdev = dev;
+register_savevm(dev, "virtio-net", -1, VIRTIO_NET_VM_VERSION,
 virtio_net_save, virtio_net_load, n);
 n->vmstate = qemu_add_vm_change_state_handler(virtio_net_vmstate_change, n);
 
@@ -941,7 +942,7 @@ void virtio_net_exit(VirtIODevice *vdev)
 
 qemu_purge_queued_packets(&n->nic->nc);
 
-unregister_savevm(NULL, "virtio-net", n);
+unregister_savevm(n->qdev, "virtio-net", n);
 
 qemu_free(n->mac_table.macs);
 qemu_free(n->vlans);
