[COMMIT master] libkvm: make kvm_create_pit static

2009-05-05 Thread Avi Kivity
From: Michael S. Tsirkin m...@redhat.com

libkvm-x86.c:55: warning: no previous prototype for ‘kvm_create_pit’

Signed-off-by: Michael S. Tsirkin m...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/kvm/libkvm/libkvm-x86.c b/kvm/libkvm/libkvm-x86.c
index 2fc4fce..df8cc81 100644
--- a/kvm/libkvm/libkvm-x86.c
+++ b/kvm/libkvm/libkvm-x86.c
@@ -52,7 +52,7 @@ static int kvm_init_tss(kvm_context_t kvm)
return 0;
 }
 
-int kvm_create_pit(kvm_context_t kvm)
+static int kvm_create_pit(kvm_context_t kvm)
 {
 #ifdef KVM_CAP_PIT
int r;
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] Fix loading extboot option rom

2009-05-05 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

The buffer that is used to store the extboot filename is later overwritten
by the vga rom loading code.  Use strdup() to keep our filename.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/hw/pc.c b/hw/pc.c
index db34f53..4b17b9c 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -963,7 +963,7 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size,
 
 if (extboot_drive != -1) {
snprintf(buf, sizeof(buf), %s/%s, bios_dir, EXTBOOT_FILENAME);
-option_rom[nb_option_roms++] = buf;
+option_rom[nb_option_roms++] = strdup(buf);
 }
 
 option_rom_offset = qemu_ram_alloc(0x2);
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] Present kvm with correct apic phys id.

2009-05-05 Thread Avi Kivity
From: Glauber Costa glom...@redhat.com

KVM shifts the value at address 0x20 (APIC_ID) by 24 bits before actually
using it. We currently load phys_id as s->id. After being shifted
by 24 bits, it results in a meaningless value. We should really
be loading s->id << 24, which, after the shift, leads to the correct
value.

This is for the load function. The save function has the inverse problem.

Signed-off-by: Glauber Costa glom...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/hw/apic.c b/hw/apic.c
index 8c059f6..466fb7e 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -835,7 +835,7 @@ static void kvm_kernel_lapic_save_to_user(APICState *s)
 
 kvm_get_lapic(kvm_context, s-cpu_env-cpu_index, kapic);
 
-s-id = kapic_reg(kapic, 0x2);
+s-id = kapic_reg(kapic, 0x2)  24;
 s-tpr = kapic_reg(kapic, 0x8);
 s-arb_id = kapic_reg(kapic, 0x9);
 s-log_dest = kapic_reg(kapic, 0xd)  24;
@@ -868,7 +868,7 @@ static void kvm_kernel_lapic_load_from_user(APICState *s)
 int i;
 
 memset(klapic, 0, sizeof apic);
-kapic_set_reg(klapic, 0x2, s-id);
+kapic_set_reg(klapic, 0x2, s-id  24);
 kapic_set_reg(klapic, 0x8, s-tpr);
 kapic_set_reg(klapic, 0xd, s-log_dest  24);
 kapic_set_reg(klapic, 0xe, s-dest_mode  28 | 0x0fff);
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] Remove the dependency for phys_ram_base for ipf.c

2009-05-05 Thread Avi Kivity
From: Jes Sorensen j...@sgi.com

Fix ia64 code to use copy_physical_memory_{read,write} in hob and
nvram code, removing dependencies of qemu_get_ram_ptr() usage.

This results in cleaned up APIs and removal of unnecessary global
variables.

Signed-off-by: Jes Sorensen j...@sgi.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/hw/ipf.c b/hw/ipf.c
index 248b01d..d051666 100644
--- a/hw/ipf.c
+++ b/hw/ipf.c
@@ -54,8 +54,6 @@ static fdctrl_t *floppy_controller;
 static RTCState *rtc_state;
 static PCIDevice *i440fx_state;
 
-void *gfw_start;
-
 static uint32_t ipf_to_legacy_io(target_phys_addr_t addr)
 {
 return (uint32_t)(((addr0x3ff)  12  2)|((addr)  0x3));
@@ -455,15 +453,12 @@ static void ipf_init1(ram_addr_t ram_size, int 
vga_ram_size,
 if (kvm_enabled()) {
 unsigned long  image_size;
 uint8_t *image = NULL;
-target_phys_addr_t fw_image_start;
-unsigned long nvram_addr = 0;
+unsigned long nvram_addr;
 unsigned long nvram_fd = 0;
 unsigned long type = READ_FROM_NVRAM;
 unsigned long i = 0;
-
-ram_addr  = qemu_ram_alloc(GFW_SIZE);
-gfw_start = qemu_get_ram_ptr(ram_addr);
-cpu_register_physical_memory(GFW_START, GFW_SIZE, ram_addr);
+unsigned long fw_offset;
+ram_addr_t fw_mem = qemu_ram_alloc(GFW_SIZE);
 
 snprintf(buf, sizeof(buf), %s/%s, bios_dir, FW_FILENAME);
 image = read_image(buf, image_size );
@@ -472,26 +467,27 @@ static void ipf_init1(ram_addr_t ram_size, int 
vga_ram_size,
 fprintf(stderr, Please check Guest firmware at %s\n, buf);
 exit(1);
 }
+fw_offset = GFW_START + GFW_SIZE - image_size;
 
-/* Load Guest Firmware to the proper postion. */
-fw_image_start = GFW_START + GFW_SIZE - image_size;
-cpu_physical_memory_write(fw_image_start, image, image_size);
-free(image);
+cpu_register_physical_memory(GFW_START, GFW_SIZE, fw_mem);
+cpu_physical_memory_write(fw_offset, image, image_size);
 
+free(image);
 
 if (nvram) {
 nvram_addr = NVRAM_START;
 nvram_fd = kvm_ia64_nvram_init(type);
 if (nvram_fd != -1) {
-kvm_ia64_copy_from_nvram_to_GFW(nvram_fd, gfw_start);
+kvm_ia64_copy_from_nvram_to_GFW(nvram_fd);
 close(nvram_fd);
 }
 i = atexit((void *)kvm_ia64_copy_from_GFW_to_nvram);
 if (i != 0)
 fprintf(stderr, cannot set exit function\n);
-}
-kvm_ia64_build_hob(ram_size + above_4g_mem_size, smp_cpus,
-   gfw_start, nvram_addr);
+} else
+nvram_addr = 0;
+
+kvm_ia64_build_hob(ram_size + above_4g_mem_size, smp_cpus, nvram_addr);
 }
 
 /*Register legacy io address space, size:64M*/
@@ -512,17 +508,15 @@ static void ipf_init1(ram_addr_t ram_size, int 
vga_ram_size,
 }
 
 if (cirrus_vga_enabled) {
-if (pci_enabled) {
+if (pci_enabled)
 pci_cirrus_vga_init(pci_bus, vga_ram_size);
-} else {
+else
 isa_cirrus_vga_init(vga_ram_size);
-}
 } else {
-if (pci_enabled) {
+if (pci_enabled)
 pci_vga_init(pci_bus, vga_ram_size, 0, 0);
-} else {
+else
 isa_vga_init(vga_ram_size);
-}
 }
 
 rtc_state = rtc_init(0x70, i8259[8], 2000);
diff --git a/target-ia64/firmware.c b/target-ia64/firmware.c
index ba16bd8..79f8464 100644
--- a/target-ia64/firmware.c
+++ b/target-ia64/firmware.c
@@ -91,12 +91,11 @@ static int add_nvram_hob(void *hob_buf, unsigned long 
nvram_addr);
 static int build_hob(void *hob_buf, unsigned long hob_buf_size,
  unsigned long dom_mem_size, unsigned long vcpus,
  unsigned long nvram_addr);
-static int load_hob(void *hob_buf,
-unsigned long dom_mem_size, void *hob_start);
+static int load_hob(void *hob_buf, unsigned long dom_mem_size);
 
 int
 kvm_ia64_build_hob(unsigned long memsize, unsigned long vcpus,
-   void* fw_start, unsigned long nvram_addr)
+   unsigned long nvram_addr)
 {
 char   *hob_buf;
 
@@ -111,7 +110,8 @@ kvm_ia64_build_hob(unsigned long memsize, unsigned long 
vcpus,
 Hob_Output(Could not build hob);
 return -1;
 }
-if (load_hob(hob_buf, memsize, fw_start + HOB_OFFSET)  0) {
+
+if (load_hob(hob_buf, memsize)  0) {
 free(hob_buf);
 Hob_Output(Could not load hob);
 return -1;
@@ -249,7 +249,7 @@ err_out:
 return -1;
 }
 static int
-load_hob(void *hob_buf, unsigned long dom_mem_size, void *hob_start)
+load_hob(void *hob_buf, unsigned long dom_mem_size)
 {
 int hob_size;
 
@@ -263,7 +263,9 @@ load_hob(void *hob_buf, unsigned long dom_mem_size, void 
*hob_start)
 Hob_Output(No enough memory for hob data);
 return 

[COMMIT master] Build extboot

2009-05-05 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/Makefile b/Makefile
index 157b616..6ce206a 100644
--- a/Makefile
+++ b/Makefile
@@ -411,3 +411,14 @@ tarbin:
 
 # Include automatically generated dependency files
 -include $(wildcard *.d audio/*.d slirp/*.d)
+
+.PHONY: kvm/extboot
+
+all: kvm/extboot
+
+kvm/extboot:
+   $(MAKE) -C $@
+   if ! [ -f pc-bios/extboot.bin ] \
+   || ! cmp -s pc-bios/extboot.bin $@/extboot.bin; then \
+   cp $@/extboot.bin pc-bios/extboot.bin; \
+   fi
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] Fix extboot merge

2009-05-05 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Last qemu merge broke extboot completely.  Instead of reading the command,
extboot corrupted the stack.  Instead of writing back the geometry, extboot
wrote nothing.

Fix by reading the command correctly and writing back the results.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/hw/extboot.c b/hw/extboot.c
index f66b6c5..b91d54f 100644
--- a/hw/extboot.c
+++ b/hw/extboot.c
@@ -81,8 +81,8 @@ static void extboot_write_cmd(void *opaque, uint32_t addr, 
uint32_t value)
 int blen = 0;
 void *buf = NULL;
 
-cpu_physical_memory_read((value  0x)  4, (uint8_t *)buf,
- sizeof(buf));
+cpu_physical_memory_read((value  0x)  4, (uint8_t *)cmd,
+ sizeof(cmd));
 
 if (cmd.type == 0x01 || cmd.type == 0x02) {
pa = cmd.xfer.segment * 16 + cmd.xfer.offset;
@@ -98,7 +98,6 @@ static void extboot_write_cmd(void *opaque, uint32_t addr, 
uint32_t value)
cmd.query_geometry.heads = heads;
cmd.query_geometry.sectors = sectors;
cmd.query_geometry.nb_sectors = nb_sectors;
-   cpu_physical_memory_set_dirty((value  0x)  4);
break;
 case 0x01:
err = bdrv_read(bs, cmd.xfer.sector, buf, cmd.xfer.nb_sectors);
@@ -118,6 +117,8 @@ static void extboot_write_cmd(void *opaque, uint32_t addr, 
uint32_t value)
break;
 }
 
+cpu_physical_memory_write((value  0x)  4, (uint8_t *)cmd,
+  sizeof(cmd));
 if (buf)
 qemu_free(buf);
 }
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] KVM: SVM: Fix cross vendor migration issue in segment descriptor

2009-05-05 Thread Avi Kivity
From: Andre Przywara andre.przyw...@amd.com

On AMD CPUs sometimes the DB bit in the stack segment
descriptor is left as 1, although the whole segment has
been made unusable. Clear it here to pass an Intel VMX
entry check when cross vendor migrating.

Signed-off-by: Andre Przywara andre.przyw...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1647e81..61453e6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -804,6 +804,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
if (!var-unusable)
var-type |= 0x1;
break;
+   case VCPU_SREG_SS:
+   /* On AMD CPUs sometimes the DB bit in the segment
+* descriptor is left as 1, although the whole segment has
+* been made unusable. Clear it here to pass an Intel VMX
+* entry check when cross vendor migrating.
+*/
+   if (var-unusable)
+   var-db = 0;
+   break;
}
 }
 
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] KVM: Drop request_nmi from stats

2009-05-05 Thread Avi Kivity
From: Jan Kiszka jan.kis...@siemens.com

The stats entry request_nmi is no longer used as the related user space
interface was dropped. So clean it up.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8e680c3..5322ee6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -442,7 +442,6 @@ struct kvm_vcpu_stat {
u32 halt_exits;
u32 halt_wakeup;
u32 request_irq_exits;
-   u32 request_nmi_exits;
u32 irq_exits;
u32 host_state_reload;
u32 efer_reload;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2d7082c..308d8e9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -91,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ halt_wakeup, VCPU_STAT(halt_wakeup) },
{ hypercalls, VCPU_STAT(hypercalls) },
{ request_irq, VCPU_STAT(request_irq_exits) },
-   { request_nmi, VCPU_STAT(request_nmi_exits) },
{ irq_exits, VCPU_STAT(irq_exits) },
{ host_state_reload, VCPU_STAT(host_state_reload) },
{ efer_reload, VCPU_STAT(efer_reload) },
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/9] Do not allow interrupt injection from userspace if there is a pending event.

2009-05-05 Thread Gleb Natapov

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/x86.c |5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2d7082c..12ab1cc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3080,8 +3080,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
kvm_run-ready_for_interrupt_injection = 1;
else
kvm_run-ready_for_interrupt_injection =
-   (kvm_arch_interrupt_allowed(vcpu) 
-!kvm_cpu_has_interrupt(vcpu));
+   kvm_arch_interrupt_allowed(vcpu) 
+   !kvm_cpu_has_interrupt(vcpu) 
+   !kvm_event_needs_reinjection(vcpu);
 }
 
 static void vapic_enter(struct kvm_vcpu *vcpu)
-- 
1.6.2.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/9] [SVM] skip_emulated_instruction() decode an instruction if size is not known

2009-05-05 Thread Gleb Natapov

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/svm.c |   11 +--
 1 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c1ef2b9..14cdfce 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -207,7 +207,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
 
if (!svm-next_rip) {
-   printk(KERN_DEBUG %s: NOP\n, __func__);
+   if (emulate_instruction(vcpu, vcpu-run, 0, 0, EMULTYPE_SKIP) !=
+   EMULATE_DONE)
+   printk(KERN_DEBUG %s: NOP\n, __func__);
return;
}
if (svm-next_rip - kvm_rip_read(vcpu)  MAX_INST_SIZE)
@@ -1836,11 +1838,8 @@ static int task_switch_interception(struct vcpu_svm *svm,
if (reason != TASK_SWITCH_GATE ||
int_type == SVM_EXITINTINFO_TYPE_SOFT ||
(int_type == SVM_EXITINTINFO_TYPE_EXEPT 
-(int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
-   if (emulate_instruction(svm-vcpu, kvm_run, 0, 0,
-   EMULTYPE_SKIP) != EMULATE_DONE)
-   return 0;
-   }
+(int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
+   skip_emulated_instruction(svm-vcpu);
 
return kvm_task_switch(svm-vcpu, tss_selector, reason);
 }
-- 
1.6.2.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/9] Remove irq_pending bitmap

2009-05-05 Thread Gleb Natapov
Only one interrupt vector can be injected from the userspace irqchip at
any given time, so there is no need to store it in a bitmap. Put it into
the interrupt queue directly.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/include/asm/kvm_host.h |2 --
 arch/x86/kvm/irq.c  |4 ++--
 arch/x86/kvm/x86.c  |   38 +++---
 arch/x86/kvm/x86.h  |   12 
 4 files changed, 13 insertions(+), 43 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8e680c3..cc892f5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -266,8 +266,6 @@ struct kvm_mmu {
 
 struct kvm_vcpu_arch {
u64 host_tsc;
-   unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
-   DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
/*
 * rip and regs accesses must go through
 * kvm_{register,rip}_{read,write} functions.
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 11c2757..96dfbb6 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -50,7 +50,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
struct kvm_pic *s;
 
if (!irqchip_in_kernel(v-kvm))
-   return v-arch.irq_summary;
+   return v-arch.interrupt.pending;
 
if (kvm_apic_has_interrupt(v) == -1) {  /* LAPIC */
if (kvm_apic_accept_pic_intr(v)) {
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
int vector;
 
if (!irqchip_in_kernel(v-kvm))
-   return kvm_pop_irq(v);
+   return v-arch.interrupt.nr;
 
vector = kvm_get_apic_interrupt(v); /* APIC */
if (vector == -1) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 12ab1cc..4596927 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1424,8 +1424,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
return -ENXIO;
vcpu_load(vcpu);
 
-   set_bit(irq-irq, vcpu-arch.irq_pending);
-   set_bit(irq-irq / BITS_PER_LONG, vcpu-arch.irq_summary);
+   kvm_queue_interrupt(vcpu, irq-irq);
 
vcpu_put(vcpu);
 
@@ -3562,12 +3561,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs-efer = vcpu-arch.shadow_efer;
sregs-apic_base = kvm_get_apic_base(vcpu);
 
-   if (irqchip_in_kernel(vcpu-kvm))
-   memset(sregs-interrupt_bitmap, 0,
-  sizeof sregs-interrupt_bitmap);
-   else
-   memcpy(sregs-interrupt_bitmap, vcpu-arch.irq_pending,
-  sizeof sregs-interrupt_bitmap);
+   memset(sregs-interrupt_bitmap, 0, sizeof sregs-interrupt_bitmap);
 
if (vcpu-arch.interrupt.pending)
set_bit(vcpu-arch.interrupt.nr,
@@ -4037,7 +4031,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
  struct kvm_sregs *sregs)
 {
int mmu_reset_needed = 0;
-   int i, pending_vec, max_bits;
+   int pending_vec, max_bits;
struct descriptor_table dt;
 
vcpu_load(vcpu);
@@ -4079,24 +4073,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu);
 
-   if (!irqchip_in_kernel(vcpu-kvm)) {
-   memcpy(vcpu-arch.irq_pending, sregs-interrupt_bitmap,
-  sizeof vcpu-arch.irq_pending);
-   vcpu-arch.irq_summary = 0;
-   for (i = 0; i  ARRAY_SIZE(vcpu-arch.irq_pending); ++i)
-   if (vcpu-arch.irq_pending[i])
-   __set_bit(i, vcpu-arch.irq_summary);
-   } else {
-   max_bits = (sizeof sregs-interrupt_bitmap)  3;
-   pending_vec = find_first_bit(
-   (const unsigned long *)sregs-interrupt_bitmap,
-   max_bits);
-   /* Only pending external irq is handled here */
-   if (pending_vec  max_bits) {
-   kvm_queue_interrupt(vcpu, pending_vec);
-   pr_debug(Set back pending irq %d\n, pending_vec);
-   }
-   kvm_pic_clear_isr_ack(vcpu-kvm);
+   max_bits = (sizeof sregs-interrupt_bitmap)  3;
+   pending_vec = find_first_bit(
+   (const unsigned long *)sregs-interrupt_bitmap, max_bits);
+   if (pending_vec  max_bits) {
+   kvm_queue_interrupt(vcpu, pending_vec);
+   pr_debug(Set back pending irq %d\n, pending_vec);
+   if (irqchip_in_kernel(vcpu-kvm))
+   kvm_pic_clear_isr_ack(vcpu-kvm);
}
 
kvm_set_segment(vcpu, sregs-cs, VCPU_SREG_CS);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 21203d4..c1f1a8c 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -19,18 +19,6 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu 
*vcpu)

[PATCH 5/9] [VMX] Do not re-execute INTn instruction.

2009-05-05 Thread Gleb Natapov
Re-inject the event instead. This is what Intel suggests. Also use the correct
instruction length when re-injecting a soft fault/interrupt.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/include/asm/kvm_host.h |5 -
 arch/x86/kvm/svm.c  |6 +++---
 arch/x86/kvm/vmx.c  |   29 ++---
 arch/x86/kvm/x86.c  |   13 -
 arch/x86/kvm/x86.h  |9 -
 5 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cc892f5..fea0429 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -319,6 +319,8 @@ struct kvm_vcpu_arch {
struct kvm_pio_request pio;
void *pio_data;
 
+   u8 event_exit_inst_len;
+
struct kvm_queued_exception {
bool pending;
bool has_error_code;
@@ -328,6 +330,7 @@ struct kvm_vcpu_arch {
 
struct kvm_queued_interrupt {
bool pending;
+   bool soft;
u8 nr;
} interrupt;
 
@@ -510,7 +513,7 @@ struct kvm_x86_ops {
void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
unsigned char *hypercall_addr);
-   void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+   void (*set_irq)(struct kvm_vcpu *vcpu, int vec, bool soft);
void (*set_nmi)(struct kvm_vcpu *vcpu);
void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 14cdfce..d5173a2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2284,7 +2284,7 @@ static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned 
nr)
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
 }
 
-static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
+static void svm_set_irq(struct kvm_vcpu *vcpu, int irq, bool soft)
 {
struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -2392,7 +2392,7 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
case SVM_EXITINTINFO_TYPE_EXEPT:
/* In case of software exception do not reinject an exception
   vector, but re-execute and instruction instead */
-   if (vector == BP_VECTOR || vector == OF_VECTOR)
+   if (kvm_exception_is_soft(vector))
break;
if (exitintinfo  SVM_EXITINTINFO_VALID_ERR) {
u32 err = svm-vmcb-control.exit_int_info_err;
@@ -2402,7 +2402,7 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
kvm_queue_exception(svm-vcpu, vector);
break;
case SVM_EXITINTINFO_TYPE_INTR:
-   kvm_queue_interrupt(svm-vcpu, vector);
+   kvm_queue_interrupt(svm-vcpu, vector, false);
break;
default:
break;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a9b30e6..092a3ee 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -779,8 +779,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, 
unsigned nr,
return;
}
 
-   if (nr == BP_VECTOR || nr == OF_VECTOR) {
-   vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+   if (kvm_exception_is_soft(nr)) {
+   vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+vmx-vcpu.arch.event_exit_inst_len);
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
} else
intr_info |= INTR_TYPE_HARD_EXCEPTION;
@@ -2429,9 +2430,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 }
 
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
+static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq, bool soft)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
+   uint32_t intr;
 
KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
 
@@ -2446,8 +2448,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int 
irq)
kvm_rip_write(vcpu, vmx-rmode.irq.rip - 1);
return;
}
-   vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-   irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
+   intr = irq | INTR_INFO_VALID_MASK;
+   if (soft) {
+   intr |= INTR_TYPE_SOFT_INTR;
+   vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+vmx-vcpu.arch.event_exit_inst_len);
+   } else
+   intr |= INTR_TYPE_EXT_INTR;
+   vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -3008,6 +3016,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, 
struct kvm_run *kvm_run)
  GUEST_INTR_STATE_NMI);

[PATCH 1/9] Unprotect a page if #PF happens during NMI injection.

2009-05-05 Thread Gleb Natapov
It is done for exception and interrupt already.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/svm.c |3 +--
 arch/x86/kvm/vmx.c |2 +-
 arch/x86/kvm/x86.h |6 ++
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8f411ff..c1ef2b9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1090,8 +1090,7 @@ static int pf_interception(struct vcpu_svm *svm, struct 
kvm_run *kvm_run)
if (npt_enabled)
svm_flush_tlb(svm-vcpu);
else {
-   if (svm-vcpu.arch.interrupt.pending ||
-   svm-vcpu.arch.exception.pending)
+   if (kvm_event_needs_reinjection(svm-vcpu))
kvm_mmu_unprotect_page_virt(svm-vcpu, fault_address);
}
return kvm_mmu_page_fault(svm-vcpu, fault_address, error_code);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e8a5649..a9b30e6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2599,7 +2599,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
cr2 = vmcs_readl(EXIT_QUALIFICATION);
KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
(u32)((u64)cr2  32), handler);
-   if (vcpu-arch.interrupt.pending || 
vcpu-arch.exception.pending)
+   if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, cr2);
return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 39350b2..21203d4 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -30,4 +30,10 @@ static inline u8 kvm_pop_irq(struct kvm_vcpu *vcpu)
clear_bit(word_index, vcpu-arch.irq_summary);
return irq;
 }
+
+static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
+{
+   return vcpu-arch.exception.pending || vcpu-arch.interrupt.pending ||
+   vcpu-arch.nmi_injected;
+}
 #endif
-- 
1.6.2.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 8/9] Replace pending exception by PF if it happens serially.

2009-05-05 Thread Gleb Natapov
Replace the previous exception with the new one in the hope that instruction
re-execution will regenerate the lost exception.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/x86.c |   19 ---
 1 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4ba00ab..a869b89 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -177,16 +177,21 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, 
unsigned long addr,
++vcpu-stat.pf_guest;
 
if (vcpu-arch.exception.pending) {
-   if (vcpu-arch.exception.nr == PF_VECTOR) {
-   printk(KERN_DEBUG kvm: inject_page_fault:
-double fault 0x%lx\n, addr);
-   vcpu-arch.exception.nr = DF_VECTOR;
-   vcpu-arch.exception.error_code = 0;
-   } else if (vcpu-arch.exception.nr == DF_VECTOR) {
+   switch(vcpu-arch.exception.nr) {
+   case DF_VECTOR:
/* triple fault - shutdown */
set_bit(KVM_REQ_TRIPLE_FAULT, vcpu-requests);
+   case PF_VECTOR:
+   vcpu-arch.exception.nr = DF_VECTOR;
+   vcpu-arch.exception.error_code = 0;
+   return;
+   default:
+   /* replace previous exception with a new one in a hope
+  that instruction re-execution will regenerate lost
+  exception */
+   vcpu-arch.exception.pending = false;
+   break;
}
-   return;
}
vcpu-arch.cr2 = addr;
kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
-- 
1.6.2.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/9] IRQ/NMI window should always be requested.

2009-05-05 Thread Gleb Natapov
Currently they are not requested if there is a pending exception.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/x86.c |   30 --
 1 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 023842b..bce49da 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3127,8 +3127,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
kvm_x86_ops-update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
-static void inject_irq(struct kvm_vcpu *vcpu)
+static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+   if (vcpu-guest_debug  KVM_GUESTDBG_SINGLESTEP)
+   kvm_x86_ops-drop_interrupt_shadow(vcpu);
+
/* try to reinject previous events if any */
if (vcpu-arch.nmi_injected) {
kvm_x86_ops-set_nmi(vcpu);
@@ -3158,26 +3161,11 @@ static void inject_irq(struct kvm_vcpu *vcpu)
}
 }
 
-static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-   bool req_int_win = !irqchip_in_kernel(vcpu-kvm) 
-   kvm_run-request_interrupt_window;
-
-   if (vcpu-guest_debug  KVM_GUESTDBG_SINGLESTEP)
-   kvm_x86_ops-drop_interrupt_shadow(vcpu);
-
-   inject_irq(vcpu);
-
-   /* enable NMI/IRQ window open exits if needed */
-   if (vcpu-arch.nmi_pending)
-   kvm_x86_ops-enable_nmi_window(vcpu);
-   else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
-   kvm_x86_ops-enable_irq_window(vcpu);
-}
-
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
int r;
+   bool req_int_win = !irqchip_in_kernel(vcpu-kvm) 
+   kvm_run-request_interrupt_window;
 
if (vcpu-requests)
if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, vcpu-requests))
@@ -3235,6 +3223,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, 
struct kvm_run *kvm_run)
else
inject_pending_irq(vcpu, kvm_run);
 
+   /* enable NMI/IRQ window open exits if needed */
+   if (vcpu-arch.nmi_pending)
+   kvm_x86_ops-enable_nmi_window(vcpu);
+   else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+   kvm_x86_ops-enable_irq_window(vcpu);
+
if (kvm_lapic_enabled(vcpu)) {
if (!vcpu-arch.apic-vapic_addr)
update_cr8_intercept(vcpu);
-- 
1.6.2.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 9/9] [SVM] inject NMI after IRET from a previous NMI, not before.

2009-05-05 Thread Gleb Natapov
If an NMI is received during handling of another NMI, it should be injected
immediately after the IRET from the previous NMI handler, but SVM intercepts
IRET before instruction execution, so we can't inject the pending NMI at this
point and there is no way to request an exit when the NMI window opens. This
patch fixes the SVM code to open the NMI window after IRET by single-stepping
over the IRET instruction.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/include/asm/kvm_host.h |2 +
 arch/x86/kvm/svm.c  |   62 +++---
 2 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fea0429..bcd0857 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -358,6 +358,7 @@ struct kvm_vcpu_arch {
unsigned int time_offset;
struct page *time_page;
 
+   bool singlestep; /* guest is single stepped by KVM */
bool nmi_pending;
bool nmi_injected;
 
@@ -772,6 +773,7 @@ enum {
 #define HF_HIF_MASK(1  1)
 #define HF_VINTR_MASK  (1  2)
 #define HF_NMI_MASK(1  3)
+#define HF_IRET_MASK   (1  4)
 
 /*
  * Hardware virtualization extension instructions may fault if a
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d5173a2..bf10991 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -933,15 +933,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 
 }
 
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void update_db_intercept(struct kvm_vcpu *vcpu)
 {
-   int old_debug = vcpu-guest_debug;
struct vcpu_svm *svm = to_svm(vcpu);
 
-   vcpu-guest_debug = dbg-control;
-
svm-vmcb-control.intercept_exceptions =
~((1  DB_VECTOR) | (1  BP_VECTOR));
+
+   if (vcpu-arch.singlestep)
+   svm-vmcb-control.intercept_exceptions |= (1  DB_VECTOR);
+
if (vcpu-guest_debug  KVM_GUESTDBG_ENABLE) {
if (vcpu-guest_debug 
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -952,6 +953,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct 
kvm_guest_debug *dbg)
1  BP_VECTOR;
} else
vcpu-guest_debug = 0;
+}
+
+static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+{
+   int old_debug = vcpu-guest_debug;
+   struct vcpu_svm *svm = to_svm(vcpu);
+
+   vcpu-guest_debug = dbg-control;
+
+   update_db_intercept(vcpu);
 
if (vcpu-guest_debug  KVM_GUESTDBG_USE_HW_BP)
svm-vmcb-save.dr7 = dbg-arch.debugreg[7];
@@ -1101,14 +1112,30 @@ static int pf_interception(struct vcpu_svm *svm, struct 
kvm_run *kvm_run)
 static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
if (!(svm-vcpu.guest_debug 
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 
+   !svm-vcpu.arch.singlestep) {
kvm_queue_exception(svm-vcpu, DB_VECTOR);
return 1;
}
-   kvm_run-exit_reason = KVM_EXIT_DEBUG;
-   kvm_run-debug.arch.pc = svm-vmcb-save.cs.base + svm-vmcb-save.rip;
-   kvm_run-debug.arch.exception = DB_VECTOR;
-   return 0;
+
+   if (svm-vcpu.arch.singlestep) {
+   svm-vcpu.arch.singlestep = false;
+   if (!(svm-vcpu.guest_debug  KVM_GUESTDBG_SINGLESTEP))
+   svm-vmcb-save.rflags =
+   ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+   update_db_intercept(to_svm(svm));
+   }
+
+   if (svm-vcpu.guest_debug 
+   (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
+   kvm_run-exit_reason = KVM_EXIT_DEBUG;
+   kvm_run-debug.arch.pc =
+   svm-vmcb-save.cs.base + svm-vmcb-save.rip;
+   kvm_run-debug.arch.exception = DB_VECTOR;
+   return 0;
+   }
+
+   return 1;
 }
 
 static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1855,7 +1882,7 @@ static int iret_interception(struct vcpu_svm *svm, struct 
kvm_run *kvm_run)
 {
++svm-vcpu.stat.nmi_window_exits;
svm-vmcb-control.intercept = ~(1UL  INTERCEPT_IRET);
-   svm-vcpu.arch.hflags = ~HF_NMI_MASK;
+   svm-vcpu.arch.hflags |= HF_IRET_MASK;
return 1;
 }
 
@@ -2331,8 +2358,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
 
-   if (svm-vmcb-control.int_state  SVM_INTERRUPT_SHADOW_MASK)
-   enable_irq_window(vcpu);
+   if ((svm-vcpu.arch.hflags  (HF_NMI_MASK | HF_IRET_MASK))
+   == HF_NMI_MASK)
+   return; /* IRET will cause a vm exit */
+
+   /* Something prevents NMI from been injected. Single step over
+  possible problem (IRET or exception injection or 

Re: Paravirtualisation or not?

2009-05-05 Thread Stefan Hajnoczi
 If a set of drivers essentially implementing the virtio framework
 (virtio_pci, virtio_ring, virtio queues) were available for
 windows, that would be *really* neat.
I haven't tried them myself but I think this will give you virtio-net
for Windows:
http://sourceforge.net/project/showfiles.php?group_id=180599package_id=267943

More information:
http://www.linux-kvm.com/content/tip-how-setup-windows-guest-paravirtual-network-drivers

Stefan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 9/9] [SVM] inject NMI after IRET from a previous NMI, not before.

2009-05-05 Thread Jan Kiszka
Gleb Natapov wrote:
 If NMI is received during handling of another NMI it should be injected
 immediately after IRET from previous NMI handler, but SVM intercept IRET
 before instruction execution so we can't inject pending NMI at this
 point and there is not way to request exit when NMI window opens. This
 patch fix SVM code to open NMI window after IRET by single stepping over
 IRET instruction.
 
 Signed-off-by: Gleb Natapov g...@redhat.com
 ---
  arch/x86/include/asm/kvm_host.h |2 +
  arch/x86/kvm/svm.c  |   62 +++---
  2 files changed, 52 insertions(+), 12 deletions(-)
 
 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
 index fea0429..bcd0857 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -358,6 +358,7 @@ struct kvm_vcpu_arch {
   unsigned int time_offset;
   struct page *time_page;
  
 + bool singlestep; /* guest is single stepped by KVM */
   bool nmi_pending;
   bool nmi_injected;
  
 @@ -772,6 +773,7 @@ enum {
  #define HF_HIF_MASK  (1  1)
  #define HF_VINTR_MASK(1  2)
  #define HF_NMI_MASK  (1  3)
 +#define HF_IRET_MASK (1  4)
  
  /*
   * Hardware virtualization extension instructions may fault if a
 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
 index d5173a2..bf10991 100644
 --- a/arch/x86/kvm/svm.c
 +++ b/arch/x86/kvm/svm.c
 @@ -933,15 +933,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
  
  }
  
 -static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug 
 *dbg)
 +static void update_db_intercept(struct kvm_vcpu *vcpu)
  {
 - int old_debug = vcpu-guest_debug;
   struct vcpu_svm *svm = to_svm(vcpu);
  
 - vcpu-guest_debug = dbg-control;
 -
   svm-vmcb-control.intercept_exceptions =
   ~((1  DB_VECTOR) | (1  BP_VECTOR));
 +
 + if (vcpu-arch.singlestep)
 + svm-vmcb-control.intercept_exceptions |= (1  DB_VECTOR);
 +
   if (vcpu-guest_debug  KVM_GUESTDBG_ENABLE) {
   if (vcpu-guest_debug 
   (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
 @@ -952,6 +953,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct 
 kvm_guest_debug *dbg)
   1  BP_VECTOR;
   } else
   vcpu-guest_debug = 0;
 +}
 +
 +static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug 
 *dbg)
 +{
 + int old_debug = vcpu-guest_debug;
 + struct vcpu_svm *svm = to_svm(vcpu);
 +
 + vcpu-guest_debug = dbg-control;
 +
 + update_db_intercept(vcpu);
  
   if (vcpu-guest_debug  KVM_GUESTDBG_USE_HW_BP)
   svm-vmcb-save.dr7 = dbg-arch.debugreg[7];
 @@ -1101,14 +1112,30 @@ static int pf_interception(struct vcpu_svm *svm, 
 struct kvm_run *kvm_run)
  static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
  {
   if (!(svm-vcpu.guest_debug 
 -   (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
 +   (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 
 + !svm-vcpu.arch.singlestep) {
   kvm_queue_exception(svm-vcpu, DB_VECTOR);
   return 1;
   }
 - kvm_run-exit_reason = KVM_EXIT_DEBUG;
 - kvm_run-debug.arch.pc = svm-vmcb-save.cs.base + svm-vmcb-save.rip;
 - kvm_run-debug.arch.exception = DB_VECTOR;
 - return 0;
 +
 + if (svm-vcpu.arch.singlestep) {
 + svm-vcpu.arch.singlestep = false;
 + if (!(svm-vcpu.guest_debug  KVM_GUESTDBG_SINGLESTEP))
 + svm-vmcb-save.rflags =
 + ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
 + update_db_intercept(to_svm(svm));
 + }
 +
 + if (svm-vcpu.guest_debug 
 + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
 + kvm_run-exit_reason = KVM_EXIT_DEBUG;
 + kvm_run-debug.arch.pc =
 + svm-vmcb-save.cs.base + svm-vmcb-save.rip;
 + kvm_run-debug.arch.exception = DB_VECTOR;
 + return 0;
 + }
 +
 + return 1;
  }
  
  static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 @@ -1855,7 +1882,7 @@ static int iret_interception(struct vcpu_svm *svm, 
 struct kvm_run *kvm_run)
  {
   ++svm-vcpu.stat.nmi_window_exits;
   svm-vmcb-control.intercept = ~(1UL  INTERCEPT_IRET);
 - svm-vcpu.arch.hflags = ~HF_NMI_MASK;
 + svm-vcpu.arch.hflags |= HF_IRET_MASK;
   return 1;
  }
  
 @@ -2331,8 +2358,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
  {
   struct vcpu_svm *svm = to_svm(vcpu);
  
 - if (svm-vmcb-control.int_state  SVM_INTERRUPT_SHADOW_MASK)
 - enable_irq_window(vcpu);
 + if ((svm-vcpu.arch.hflags  (HF_NMI_MASK | HF_IRET_MASK))
 + == HF_NMI_MASK)
 + return; /* IRET will cause a vm exit */
 +
 + /* Something prevents NMI from been injected. Single step over
 +possible problem (IRET 

Re: Paravirtualisation or not?

2009-05-05 Thread Pantelis Koukousoulas
On Tue, May 5, 2009 at 11:37 AM, Stefan Hajnoczi stefa...@gmail.com wrote:
 If a set of drivers essentially implementing the virtio framework
 (virtio_pci, virtio_ring, virtio queues) were available for
 windows, that would be *really* neat.
 I haven't tried them myself but I think this will give you virtio-net
 for Windows:
 http://sourceforge.net/project/showfiles.php?group_id=180599package_id=267943

 More information:
 http://www.linux-kvm.com/content/tip-how-setup-windows-guest-paravirtual-network-drivers

Hi Stefan :)

Sure, closed-source virtio-net drivers exist (in fact there is a newer
version than the one
you linked; I think it is from 12/2008, distributed as an ISO). The point
(and the advantage
of Xen in this area) is that Xen provides the source too, under the GPL.

Even if there was source available for the virtio framework only (and
not net at all)
it would still be useful to others wanting to write virtio drivers for windows.

It is harder for a third party to do this job because you would have to make the
decision to either use the Windows DDK and samples (which means you can't
release under GPL and thus you can't reuse or even look at the current virtio
implementations) or use GPL and the current linux virtio code as a base but
in this case you can forget DDK and the samples (at least that is my
understanding).

Cheers,
Pantelis
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/1] qemu-kvm: virtio-net: Re-instate GSO code removed upstream

2009-05-05 Thread Mark McLoughlin
This commit:

   commit 559a8f45f34cc50d1a60b4f67a06614d506b2e01
   Subject: Remove stray GSO code from virtio_net (Mark McLoughlin)

Removed some GSO code from upstream qemu.git, but it needs to
be re-instated in qemu-kvm.git.

Reported-by: Sridhar Samudrala s...@us.ibm.com
Signed-off-by: Mark McLoughlin mar...@redhat.com
---
 hw/virtio-net.c |5 +
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index ac8e030..e5d7add 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -424,6 +424,11 @@ static int receive_filter(VirtIONet *n, const uint8_t 
*buf, int size)
 if (n-promisc)
 return 1;
 
+#ifdef TAP_VNET_HDR
+if (tap_has_vnet_hdr(n-vc-vlan-first_client))
+ptr += sizeof(struct virtio_net_hdr);
+#endif
+
 if (!memcmp(ptr[12], vlan, sizeof(vlan))) {
 int vid = be16_to_cpup((uint16_t *)(ptr + 14))  0xfff;
 if (!(n-vlans[vid  5]  (1U  (vid  0x1f
-- 
1.6.0.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvm] virtio-net not working with the latest qemu-kvm git

2009-05-05 Thread Mark McLoughlin
On Mon, 2009-05-04 at 11:44 -0600, Alex Williamson wrote:
 On Mon, 2009-05-04 at 09:50 -0700, Sridhar Samudrala wrote:
  When i moved to the latest qemu-kvm git tree from kvm-85, i noticed that
  networking stopped working between the host and the guest.
  It started working when i put the device in promiscuos mode by running
  tcpdump in background on the guest. 
  
  After browsing through the recent patches, i found that the following commit
  is causing the regression.
  
  Remove stray GSO code from virtio_net (Mark McLoughlin)
  http://git.kernel.org/?p=virt/kvm/qemu-kvm.git;a=commitdiff;h=559a8f45f34cc50d1a60b4f67a06614d506b2e01
  
  The comment doesn't seem to match with the code that is removed with this 
  patch.
 
 Yep, I agree, the removed code is not bogus.  We have to skip the vnet
 header to to get to the ethernet header, which we do the filtering on.

The code was removed in upstream qemu.git, but we need it re-instated in
qemu-kvm.git. Just sent a patch to do that.

Thanks,
Mark.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 9/9] [SVM] inject NMI after IRET from a previous NMI, not before.

2009-05-05 Thread Gleb Natapov
On Tue, May 05, 2009 at 10:45:20AM +0200, Jan Kiszka wrote:
  @@ -2331,8 +2358,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
   {
  struct vcpu_svm *svm = to_svm(vcpu);
   
  -   if (svm-vmcb-control.int_state  SVM_INTERRUPT_SHADOW_MASK)
  -   enable_irq_window(vcpu);
  +   if ((svm-vcpu.arch.hflags  (HF_NMI_MASK | HF_IRET_MASK))
  +   == HF_NMI_MASK)
  +   return; /* IRET will cause a vm exit */
  +
  +   /* Something prevents NMI from been injected. Single step over
  +  possible problem (IRET or exception injection or interrupt
  +  shadow) */
  +   vcpu-arch.singlestep = true;
  +   svm-vmcb-save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
 
 Can you single-step like this out of an IRQ handler? I mean, IRET will
 restore the flags from the stack, and those settings should be
 overwritten. Or am I missing something?
 
It seems to be working :) Shouldn't the CPU check for single step before
executing IRET, and thus use the old flags value? It would be interesting
to check what the rflags value is immediately after IRET.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 9/9] [SVM] inject NMI after IRET from a previous NMI, not before.

2009-05-05 Thread Jan Kiszka
Gleb Natapov wrote:
 On Tue, May 05, 2009 at 10:45:20AM +0200, Jan Kiszka wrote:
 @@ -2331,8 +2358,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
  {
 struct vcpu_svm *svm = to_svm(vcpu);
  
 -   if (svm-vmcb-control.int_state  SVM_INTERRUPT_SHADOW_MASK)
 -   enable_irq_window(vcpu);
 +   if ((svm-vcpu.arch.hflags  (HF_NMI_MASK | HF_IRET_MASK))
 +   == HF_NMI_MASK)
 +   return; /* IRET will cause a vm exit */
 +
 +   /* Something prevents NMI from been injected. Single step over
 +  possible problem (IRET or exception injection or interrupt
 +  shadow) */
 +   vcpu-arch.singlestep = true;
 +   svm-vmcb-save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
 Can you single-step like this out of an IRQ handler? I mean, IRET will
 restore the flags from the stack, and those settings should be
 overwritten. Or am I missing something?

 It seems to be working :) Shouldn't CPU checks single step before
 executing IRET and thus using old flags value? It is interesting to
 check what rflag value is immediately after IRET.

Hmm, guess I have to re-read some manuals. But regarding
rflags-after-iret, I think it should be cleared due to that restoring
from the stack.

Jan

-- 
Siemens AG, Corporate Technology, CT SE 2
Corporate Competence Center Embedded Linux
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 9/9] [SVM] inject NMI after IRET from a previous NMI, not before.

2009-05-05 Thread Gleb Natapov
On Tue, May 05, 2009 at 11:25:13AM +0200, Jan Kiszka wrote:
 Gleb Natapov wrote:
  On Tue, May 05, 2009 at 10:45:20AM +0200, Jan Kiszka wrote:
  @@ -2331,8 +2358,16 @@ static void enable_nmi_window(struct kvm_vcpu 
  *vcpu)
   {
struct vcpu_svm *svm = to_svm(vcpu);
   
  - if (svm-vmcb-control.int_state  SVM_INTERRUPT_SHADOW_MASK)
  - enable_irq_window(vcpu);
  + if ((svm-vcpu.arch.hflags  (HF_NMI_MASK | HF_IRET_MASK))
  + == HF_NMI_MASK)
  + return; /* IRET will cause a vm exit */
  +
  + /* Something prevents NMI from been injected. Single step over
  +possible problem (IRET or exception injection or interrupt
  +shadow) */
  + vcpu-arch.singlestep = true;
  + svm-vmcb-save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
  Can you single-step like this out of an IRQ handler? I mean, IRET will
  restore the flags from the stack, and those settings should be
  overwritten. Or am I missing something?
 
  It seems to be working :) Shouldn't CPU checks single step before
  executing IRET and thus using old flags value? It is interesting to
  check what rflag value is immediately after IRET.
 
 Hmm, guess I have to re-read some manuals. But regarding
 rflags-after-iret, I think it should be cleared due to that restoring
 from the stack.
 
Just re-tested this once more. DB is intercepted after IRET and TF/RF is
cleared already.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 9/9] [SVM] inject NMI after IRET from a previous NMI, not before.

2009-05-05 Thread Gleb Natapov
I noticed a small bug in previous patch. Use this one instead.
(change update_db_intercept(to_svm(svm)) -> update_db_intercept(&svm->vcpu))


If an NMI is received while another NMI is being handled, it should be injected
immediately after the IRET from the previous NMI handler. However, SVM intercepts
IRET before the instruction executes, so we can't inject the pending NMI at that
point, and there is no way to request an exit when the NMI window opens. This
patch fixes the SVM code to open the NMI window after IRET by single-stepping
over the IRET instruction.

Signed-off-by: Gleb Natapov g...@redhat.com
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fea0429..bcd0857 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -358,6 +358,7 @@ struct kvm_vcpu_arch {
unsigned int time_offset;
struct page *time_page;
 
+   bool singlestep; /* guest is single stepped by KVM */
bool nmi_pending;
bool nmi_injected;
 
@@ -772,6 +773,7 @@ enum {
 #define HF_HIF_MASK(1  1)
 #define HF_VINTR_MASK  (1  2)
 #define HF_NMI_MASK(1  3)
+#define HF_IRET_MASK   (1  4)
 
 /*
  * Hardware virtualization extension instructions may fault if a
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d5173a2..5c00258 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -933,15 +933,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 
 }
 
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void update_db_intercept(struct kvm_vcpu *vcpu)
 {
-   int old_debug = vcpu-guest_debug;
struct vcpu_svm *svm = to_svm(vcpu);
 
-   vcpu-guest_debug = dbg-control;
-
svm-vmcb-control.intercept_exceptions =
~((1  DB_VECTOR) | (1  BP_VECTOR));
+
+   if (vcpu-arch.singlestep)
+   svm-vmcb-control.intercept_exceptions |= (1  DB_VECTOR);
+
if (vcpu-guest_debug  KVM_GUESTDBG_ENABLE) {
if (vcpu-guest_debug 
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -952,6 +953,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct 
kvm_guest_debug *dbg)
1  BP_VECTOR;
} else
vcpu-guest_debug = 0;
+}
+
+static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+{
+   int old_debug = vcpu-guest_debug;
+   struct vcpu_svm *svm = to_svm(vcpu);
+
+   vcpu-guest_debug = dbg-control;
+
+   update_db_intercept(vcpu);
 
if (vcpu-guest_debug  KVM_GUESTDBG_USE_HW_BP)
svm-vmcb-save.dr7 = dbg-arch.debugreg[7];
@@ -1101,14 +1112,30 @@ static int pf_interception(struct vcpu_svm *svm, struct 
kvm_run *kvm_run)
 static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
if (!(svm-vcpu.guest_debug 
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 
+   !svm-vcpu.arch.singlestep) {
kvm_queue_exception(svm-vcpu, DB_VECTOR);
return 1;
}
-   kvm_run-exit_reason = KVM_EXIT_DEBUG;
-   kvm_run-debug.arch.pc = svm-vmcb-save.cs.base + svm-vmcb-save.rip;
-   kvm_run-debug.arch.exception = DB_VECTOR;
-   return 0;
+
+   if (svm-vcpu.arch.singlestep) {
+   svm-vcpu.arch.singlestep = false;
+   if (!(svm-vcpu.guest_debug  KVM_GUESTDBG_SINGLESTEP))
+   svm-vmcb-save.rflags =
+   ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+   update_db_intercept(svm-vcpu);
+   }
+
+   if (svm-vcpu.guest_debug 
+   (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
+   kvm_run-exit_reason = KVM_EXIT_DEBUG;
+   kvm_run-debug.arch.pc =
+   svm-vmcb-save.cs.base + svm-vmcb-save.rip;
+   kvm_run-debug.arch.exception = DB_VECTOR;
+   return 0;
+   }
+
+   return 1;
 }
 
 static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1855,7 +1882,7 @@ static int iret_interception(struct vcpu_svm *svm, struct 
kvm_run *kvm_run)
 {
++svm-vcpu.stat.nmi_window_exits;
svm-vmcb-control.intercept = ~(1UL  INTERCEPT_IRET);
-   svm-vcpu.arch.hflags = ~HF_NMI_MASK;
+   svm-vcpu.arch.hflags |= HF_IRET_MASK;
return 1;
 }
 
@@ -2331,8 +2358,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
 
-   if (svm-vmcb-control.int_state  SVM_INTERRUPT_SHADOW_MASK)
-   enable_irq_window(vcpu);
+   if ((svm-vcpu.arch.hflags  (HF_NMI_MASK | HF_IRET_MASK))
+   == HF_NMI_MASK)
+   return; /* IRET will cause a vm exit */
+
+   /* Something prevents NMI from been injected. Single step over
+  possible problem (IRET or exception injection or interrupt
+  shadow) */
+   

Re: qemu/hw/device-assignment: questions about msix_table_page

2009-05-05 Thread Michael S. Tsirkin
On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote:
 If guest can write to the real device MSI-X table directly, it would
 cause chaos on interrupt delivery, for what guest see is totally
 different with what's host see...
   
Obviously.
   
Thanks,
 

What's the reason that this page is unmapped from the qemu memory space?
Specifically what do these lines do:
int offset = r_dev->msix_table_addr - real_region->base_addr;
ret = munmap(region->u.r_virtbase + offset, TARGET_PAGE_SIZE);

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu/hw/device-assignment: questions about msix_table_page

2009-05-05 Thread Marcelo Tosatti
On Tue, May 05, 2009 at 12:51:36PM +0300, Michael S. Tsirkin wrote:
 On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote:
  If guest can write to the real device MSI-X table directly, it would
  cause chaos on interrupt delivery, for what guest see is totally
  different with what's host see...

 Obviously.

 Thanks,
  
 
 What's the reason that this page is unmapped from the qemu memory space?
 Specifically what do these lines do:
 int offset = r_dev-msix_table_addr - real_region-base_addr;
 ret = munmap(region-u.r_virtbase + offset, TARGET_PAGE_SIZE);

I believe this allows accesses to this page (the MSI-X table), which
is part of the guest address space (through kvm memory slots), to be
trapped by qemu.

Since there is no actual page in this guest address, KVM treats accesses
as MMIO and forwards them to QEMU.



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?

2009-05-05 Thread Michael S. Tsirkin
The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have
been merged for 2.6.30. However, I note that PCI spec allows devices to
support multiple vectors with MSI as well (support will be in linux
2.6.30).

Even though qemu for now only uses a single vector with MSI, it would
seem that it's better to make the kernel/user interface generic straight
away rather than add more ioctls later. What do you think? It might not
be too late to fix this for 2.6.30.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu/hw/device-assignment: questions about msix_table_page

2009-05-05 Thread Marcelo Tosatti
On Tue, May 05, 2009 at 01:34:50PM +0300, Michael S. Tsirkin wrote:
 On Tue, May 05, 2009 at 07:19:45AM -0300, Marcelo Tosatti wrote:
  On Tue, May 05, 2009 at 12:51:36PM +0300, Michael S. Tsirkin wrote:
   On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote:
If guest can write to the real device MSI-X table directly, it 
would
cause chaos on interrupt delivery, for what guest see is totally
different with what's host see...
  
   Obviously.
  
   Thanks,

   
   What's the reason that this page is unmapped from the qemu memory space?
   Specifically what do these lines do:
   int offset = r_dev-msix_table_addr - real_region-base_addr;
   ret = munmap(region-u.r_virtbase + offset, TARGET_PAGE_SIZE);
  
  I believe this allows accesses to this page (the MSI-X table), which
  is part of the guest address space (through kvm memory slots), to be
  trapped by qemu.
  
  Since there is no actual page in this guest address, KVM treats accesses
  as MMIO and forwards them to QEMU.
  
  
 
 I thought about this too.
 But why is this necessary for assigned MSI-X but not for emulated devices 
 such as
 e.g. e1000? All e1000 does seems to be cpu_register_physical_memory ...

Because there is no registered (kvm) memory slot for the range which
e1000 registers its MMIO? Not sure about the address of the MSI-X table
page, but you could achieve the same effect by splitting the slot which
it lives in two, with a 1 page hole between them.

BTW this is why you can't map the MSI-X table page directly, you want
accesses to be trapped.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu/hw/device-assignment: questions about msix_table_page

2009-05-05 Thread Michael S. Tsirkin
On Tue, May 05, 2009 at 07:19:45AM -0300, Marcelo Tosatti wrote:
 On Tue, May 05, 2009 at 12:51:36PM +0300, Michael S. Tsirkin wrote:
  On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote:
   If guest can write to the real device MSI-X table directly, it 
   would
   cause chaos on interrupt delivery, for what guest see is totally
   different with what's host see...
 
  Obviously.
 
  Thanks,
   
  
  What's the reason that this page is unmapped from the qemu memory space?
  Specifically what do these lines do:
  int offset = r_dev-msix_table_addr - real_region-base_addr;
  ret = munmap(region-u.r_virtbase + offset, TARGET_PAGE_SIZE);
 
 I believe this allows accesses to this page (the MSI-X table), which
 is part of the guest address space (through kvm memory slots), to be
 trapped by qemu.
 
 Since there is no actual page in this guest address, KVM treats accesses
 as MMIO and forwards them to QEMU.
 
 

I thought about this too.
But why is this necessary for assigned MSI-X but not for emulated devices such 
as
e.g. e1000? All e1000 does seems to be cpu_register_physical_memory ...

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?

2009-05-05 Thread Marcelo Tosatti
On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote:
 The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have
 been merged for 2.6.30. However, I note that PCI spec allows devices to
 support multiple vectors with MSI as well (support will be in linux
 2.6.30).
 
 Even though qemu for now only uses a single vector with MSI, it would
 seem that it's better to make the kernel/user interface generic straight
 away rather than add more ioctls later. What do you think? It might not
 be too late to fix this for 2.6.30.

Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned
device?

If you can't, it would be better to change the ioctls before 2.6.30 is
released, IMO.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?

2009-05-05 Thread Michael S. Tsirkin
On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote:
 On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote:
  The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have
  been merged for 2.6.30. However, I note that PCI spec allows devices to
  support multiple vectors with MSI as well (support will be in linux
  2.6.30).
  
  Even though qemu for now only uses a single vector with MSI, it would
  seem that it's better to make the kernel/user interface generic straight
  away rather than add more ioctls later. What do you think? It might not
  be too late to fix this for 2.6.30.
 
 Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned
 device?

Sure, but only one KVM_ASSIGN_SET_MSIX_NR.

 If you can't, it would be better to change the ioctls before 2.6.30 is
 release IMO.
 

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM x86_64 with SR-IOV..? (device passthrough with LIO-Target v3.0)

2009-05-05 Thread Nicholas A. Bellinger
On Tue, 2009-05-05 at 03:43 -0700, Nicholas A. Bellinger wrote:
 On Tue, 2009-05-05 at 09:42 +0800, Yu Zhao wrote:
  Hi,
  
  The VF also works in the host if the VF driver is programed properly.
  So it would be easier to develop the VF driver in the host and then
  verify the VF driver in the guest.
  
  BTW, I didn't see the SR-IOV is enabled in your dmesg, did you select
  the CONFIG_PCI_IOV in the kernel .config?
  
  Thanks,
  Yu
  
 
 Greetings Yu and Sheng,
 
 So the original attachment was for the v2.6.29-fc11 host kernel output,
 I ended up jumping to v2.6.30-rc3 (and making sure CONFIG_PCI_IOV was
 enabled) for KVM host with kvm-85 and now things are looking quite
 stable for me.
 
 So far I have been able to successfully push LIO-Target v3.0 traffic
 *inside* a v2.6.29.2 KVM guest via the onboard e1000e (02:00.0) port
 from another Linux/iSCSI Initiator machine using a Intel 1 Gb/sec port.
 I am running badblocks tests to iSCSI Logical Units for RAMDISK_DR and
 FILEIO storage objects (in the KVM Guest), and they are passing
 validation and I am seeing ~500 Mb/sec of throughput and very low CPU
 usage in the KVM guests.
 

Ok I am seeing another issue with the e1000e port on 02:00.0..:

As i start to push multiple badblocks tests RAMDISK_DR iSCSI Logical
units into KVM Guest running LIO v2.6.29.2 from the external Linux/iSCSI
Initiator machine, after about 100 GB of iSCSI traffic, I see the
following exception in KVM host v2.6.30-rc3:

DRHD: handling fault status reg 2
DMAR:[DMA Write] Request device [02:00.0] fault addr 7fc958b01 
DMAR:[fault reason 04] Access beyond MGAW
pci-stub :02:00.0: irq 59 for MSI/MSI-X
pci-stub :02:00.0: irq 60 for MSI/MSI-X
pci-stub :02:00.0: irq 61 for MSI/MSI-X

I am able to restart the LIO-Target KVM Guest and the Linux/iSCSI
Initiators are able to reconnect..  Wow, very cool..

Not sure if this is a bug in the target_core_mod RAMDISK_DR subsystem
plugin (mapping struct iovec to internally allocated struct page) or
what.  I will have to look at the DMAR code to understand what this
exception means..

--nab

 One issue I did notice while using the pci-stub method of
 device-assignment with same e1000 port (02:00.0) was while using an
 iSCSI Initiator (Open-iSCSI) on the KVM Host machine and doing sustained
 traffic into the LIO-Target KVM Guest on the same local KVM host to max
 out traffic between the other onboard e1000e port (03.00.0), I see the
 following:
 
 pci-stub :02:00.0: PCI INT A - GSI 17 (level, low) - IRQ 17
 assign device: host bdf = 2:0:0
 pci-stub :02:00.0: irq 59 for MSI/MSI-X
 pci-stub :02:00.0: irq 59 for MSI/MSI-X
 pci-stub :02:00.0: irq 59 for MSI/MSI-X
 pci-stub :02:00.0: irq 59 for MSI/MSI-X
 pci-stub :02:00.0: irq 59 for MSI/MSI-X
 pci-stub :02:00.0: irq 60 for MSI/MSI-X
 pci-stub :02:00.0: irq 61 for MSI/MSI-X
 scsi4 : iSCSI Initiator over TCP/IP
 scsi 4:0:0:0: Direct-Access LIO-ORG  RAMDISK-DR   3.0  PQ: 0 ANSI: 5
 sd 4:0:0:0: Attached scsi generic sg1 type 0
 scsi 4:0:0:1: Direct-Access LIO-ORG  RAMDISK-DR   3.0  PQ: 0 ANSI: 5
 sd 4:0:0:1: Attached scsi generic sg2 type 0
 sd 4:0:0:0: [sdb] 262144 512-byte hardware sectors: (134 MB/128 MiB)
 sd 4:0:0:1: [sdc] 262144 512-byte hardware sectors: (134 MB/128 MiB)
 sd 4:0:0:0: [sdb] Write Protect is off
 sd 4:0:0:0: [sdb] Mode Sense: 2f 00 00 00
 sd 4:0:0:1: [sdc] Write Protect is off
 sd 4:0:0:1: [sdc] Mode Sense: 2f 00 00 00
 sd 4:0:0:0: [sdb] Write cache: disabled, read cache: enabled, doesn't support 
 DPO or FUA
 sd 4:0:0:1: [sdc] Write cache: disabled, read cache: enabled, doesn't support 
 DPO or FUA
  sdb:6 sdc: unknown partition table
 sd 4:0:0:0: [sdb] Attached SCSI disk
  unknown partition table
 sd 4:0:0:1: [sdc] Attached SCSI disk
 [ cut here ]
 WARNING: at kernel/irq/manage.c:260 enable_irq+0x36/0x50()
 Hardware name: empty
 Unbalanced enable for IRQ 59
 Modules linked in: ipt_REJECT xt_tcpudp bridge stp sunrpc iptable_filter 
 ip_tables xt_state nf_conntrack ip6table_filter ip6_tables x_tables ib_iser 
 rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 iscsi_tcp libiscsi_tcp 
 libiscsi scsi_transport_iscsi cpufreq_ondemand acpi_cpufreq freq_table ext3 
 jbd loop dm_multipath scsi_dh kvm_intel kvm uinput i2c_i801 firewire_ohci 
 joydev firewire_core sg i2c_core 8250_pnp crc_itu_t e1000e 8250 serial_core 
 rtc_cmos pcspkr serio_raw rtc_core rtc_lib button sd_mod dm_snapshot dm_zero 
 dm_mirror dm_region_hash dm_log dm_mod uhci_hcd ohci_hcd ehci_hcd ata_piix 
 libata scsi_mod [last unloaded: microcode]
 Pid: 51, comm: events/0 Tainted: GW  2.6.30-rc3 #11
 Call Trace:
  [80235fee] ? warn_slowpath+0xcb/0xe8
  [80253a7c] ? generic_exec_single+0x6a/0x88
  [8022acec] ? update_curr+0x67/0xeb
  [a0198748] ? vcpu_kick_intr+0x0/0x1 [kvm]
  [8020a5d8] ? __switch_to+0xb6/0x274
  [8022b70a] ? __dequeue_entity+0x1b/0x2f
  [a01ac7e4] ? 

Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?

2009-05-05 Thread Gregory Haskins
Michael S. Tsirkin wrote:
 The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have
 been merged for 2.6.30. However, I note that PCI spec allows devices to
 support multiple vectors with MSI as well (support will be in linux
 2.6.30).

 Even though qemu for now only uses a single vector with MSI, it would
 seem that it's better to make the kernel/user interface generic straight
 away rather than add more ioctls later. What do you think? It might not
 be too late to fix this for 2.6.30.
   

+1




signature.asc
Description: OpenPGP digital signature


Re: qemu/hw/device-assignment: questions about msix_table_page

2009-05-05 Thread Michael S. Tsirkin
On Tue, May 05, 2009 at 07:49:10AM -0300, Marcelo Tosatti wrote:
 On Tue, May 05, 2009 at 01:34:50PM +0300, Michael S. Tsirkin wrote:
  On Tue, May 05, 2009 at 07:19:45AM -0300, Marcelo Tosatti wrote:
   On Tue, May 05, 2009 at 12:51:36PM +0300, Michael S. Tsirkin wrote:
On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote:
 If guest can write to the real device MSI-X table directly, 
 it would
 cause chaos on interrupt delivery, for what guest see is 
 totally
 different with what's host see...
   
Obviously.
   
Thanks,
 

What's the reason that this page is unmapped from the qemu memory space?
Specifically what do these lines do:
int offset = r_dev-msix_table_addr - 
real_region-base_addr;
ret = munmap(region-u.r_virtbase + offset, 
TARGET_PAGE_SIZE);
   
   I believe this allows accesses to this page (the MSI-X table), which
   is part of the guest address space (through kvm memory slots), to be
   trapped by qemu.
   
   Since there is no actual page in this guest address, KVM treats accesses
   as MMIO and forwards them to QEMU.
   
   
  
  I thought about this too.
  But why is this necessary for assigned MSI-X but not for emulated devices 
  such as
  e.g. e1000? All e1000 does seems to be cpu_register_physical_memory ...
 
 Because there is no registered (kvm) memory slot for the range which
 e1000 registers its MMIO?

ret = kvm_register_phys_mem(kvm_context, e_phys,
region-u.r_virtbase,
TARGET_PAGE_ALIGN(e_size), 0);
is what creates this slot, correct?

 Not sure about the address of the MSI-X table
 page, but you could achieve the same effect by splitting the slot which
 it lives in two, with a 1 page hole between them.
 
 BTW this is why you can't map the MSI-X table page directly, you want
 accesses to be trapped.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?

2009-05-05 Thread Avi Kivity

Michael S. Tsirkin wrote:

On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote:
  

On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote:


The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have
been merged for 2.6.30. However, I note that PCI spec allows devices to
support multiple vectors with MSI as well (support will be in linux
2.6.30).

Even though qemu for now only uses a single vector with MSI, it would
seem that it's better to make the kernel/user interface generic straight
away rather than add more ioctls later. What do you think? It might not
be too late to fix this for 2.6.30.
  

Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned
device?



Sure, but only one KVM_ASSIGN_SET_MSIX_NR.

  


MSIX_NR is the size of the table, while MSIX_ENTRY updates a single 
entry, if I read the code correctly.



--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm: fix comment on locking

2009-05-05 Thread Michael S. Tsirkin
__kvm_set_memory_region callers must (and do) take slots_lock, not
mmap_sem, for write. Fix the comment to match this reality.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 virt/kvm/kvm_main.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 605697e..060d86c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -911,7 +911,7 @@ static int kvm_vm_release(struct inode *inode, struct file 
*filp)
  *
  * Discontiguous memory is allowed, mostly for framebuffers.
  *
- * Must be called holding mmap_sem for write.
+ * Must be called holding slots_lock for write.
  */
 int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
-- 
1.6.3.rc3.1.g830204
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?

2009-05-05 Thread Michael S. Tsirkin
On Tue, May 05, 2009 at 02:57:10PM +0300, Avi Kivity wrote:
 Michael S. Tsirkin wrote:
 On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote:
   
 On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote:
 
 The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have
 been merged for 2.6.30. However, I note that PCI spec allows devices to
 support multiple vectors with MSI as well (support will be in linux
 2.6.30).

 Even though qemu for now only uses a single vector with MSI, it would
 seem that it's better to make the kernel/user interface generic straight
 away rather than add more ioctls later. What do you think? It might not
 be too late to fix this for 2.6.30.
   
 Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned
 device?
 

 Sure, but only one KVM_ASSIGN_SET_MSIX_NR.

   

 MSIX_NR is the size of the table, while MSIX_ENTRY updates a single  
 entry, if I read the code correctly.

Right. So we'll need something like this for MSI as well.
Actually maybe MSIX_NR / MSIX_ENTRY should be renamed to MSI_NR / MSI_ENTRY
and changed to do the right thing depending on the IRQ type?

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[ kvm-Bugs-2787205 ] Video: KVM graphics performance dropped

2009-05-05 Thread SourceForge.net
Bugs item #2787205, was opened at 2009-05-05 15:02
Message generated for change (Tracker Item Submitted) made by technologov
You can respond by visiting: 
https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2787205&group_id=180599

Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: None
Group: None
Status: Open
Resolution: None
Priority: 8
Private: No
Submitted By: Technologov (technologov)
Assigned to: Nobody/Anonymous (nobody)
Summary: Video: KVM graphics performance dropped

Initial Comment:
Starting with KVM-84 the video performance dropped to a turtle speed when using 
remote X11 SDL rendering.
KVM-85 is even worse, because in addition to slow speed, it adds flickering.

KVM is basically useless when working over remote X11/SDL.

-Alexey, 5.5.2009.

--

You can respond by visiting: 
https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2787205&group_id=180599
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?

2009-05-05 Thread Avi Kivity

Michael S. Tsirkin wrote:

On Tue, May 05, 2009 at 02:57:10PM +0300, Avi Kivity wrote:
  

Michael S. Tsirkin wrote:


On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote:
  
  

On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote:



The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have
been merged for 2.6.30. However, I note that PCI spec allows devices to
support multiple vectors with MSI as well (support will be in linux
2.6.30).

Even though qemu for now only uses a single vector with MSI, it would
seem that it's better to make the kernel/user interface generic straight
away rather than add more ioctls later. What do you think? It might not
be too late to fix this for 2.6.30.
  
  

Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned
device?



Sure, but only one KVM_ASSIGN_SET_MSIX_NR.

  
  
MSIX_NR is the size of the table, while MSIX_ENTRY updates a single  
entry, if I read the code correctly.



Right. So we'll need something like this for MSI as well.
Actually maybe MSIX_NR MSIX_ENTRY should be renamed to MSI_NR / MSI_ENTRY
and changed to do the right thing depending on the IRQ type?
  


Works for me.  Sheng, is there a reason why it wasn't done like this?

btw, it could be further simplified by using irqfd.  Instead of the host 
device tying directly into kvm, it could just trigger an eventfd; and we 
could terminate the eventfd either in kvm (irqfd) or in qemu.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?

2009-05-05 Thread Michael S. Tsirkin
On Tue, May 05, 2009 at 03:08:40PM +0300, Avi Kivity wrote:
 Michael S. Tsirkin wrote:
 On Tue, May 05, 2009 at 02:57:10PM +0300, Avi Kivity wrote:
   
 Michael S. Tsirkin wrote:
 
 On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote:
 
 On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote:
 
 The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have
 been merged for 2.6.30. However, I note that PCI spec allows devices to
 support multiple vectors with MSI as well (support will be in linux
 2.6.30).

 Even though qemu for now only uses a single vector with MSI, it would
 seem that it's better to make the kernel/user interface generic straight
 away rather than add more ioctls later. What do you think? It might not
 be too late to fix this for 2.6.30.
 
 Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned
 device?
 
 Sure, but only one KVM_ASSIGN_SET_MSIX_NR.

 
 MSIX_NR is the size of the table, while MSIX_ENTRY updates a single   
 entry, if I read the code correctly.
 

 Right. So we'll need something like this for MSI as well.
 Actually maybe MSIX_NR MSIX_ENTRY should be renamed to MSI_NR / MSI_ENTRY
 and changed to do the right thing depending on the IRQ type?
   

 Works for me.  Sheng, is there a reason why it wasn't done like this?

 btw, it could be further simplified by using irqfd.  Instead of the host  
 device tying directly into kvm, it could just trigger an eventfd; and we  
 could terminate the eventfd either in kvm (irqfd) or in qemu.

This probably is outside the scope for 2.6.30 :)

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Serialize qcow2 writes

2009-05-05 Thread Avi Kivity
Commit 641636d (qcow2 corruption: Fix alloc_cluster_link_l2; 4df8f71 on
stable-0.10) exposes a bug with concurrent allocating qcow2 writes: the
writes will trigger a call to free_any_clusters() and corrupt the image.

As a temporary workaround until a real fix is written, this patch serializes
writes to avoid the issue.  With this, I can install Fedora 10 on a virtio
disk.

Signed-off-by: Avi Kivity a...@redhat.com
---
 block-qcow2.c |   27 ++-
 1 files changed, 26 insertions(+), 1 deletions(-)

diff --git a/block-qcow2.c b/block-qcow2.c
index 1f33125..6685915 100644
--- a/block-qcow2.c
+++ b/block-qcow2.c
@@ -157,6 +157,8 @@ typedef struct BDRVQcowState {
 int snapshots_size;
 int nb_snapshots;
 QCowSnapshot *snapshots;
+int write_in_progress;
+TAILQ_HEAD(QCow2DeferredWrites, QCowAIOCB) deferred_writes;
 } BDRVQcowState;
 
 static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
@@ -371,6 +373,9 @@ static int qcow_open(BlockDriverState *bs, const char 
*filename, int flags)
 if (qcow_read_snapshots(bs)  0)
 goto fail;
 
+s-write_in_progress = 0;
+TAILQ_INIT(s-deferred_writes);
+
 #ifdef DEBUG_ALLOC
 check_refcounts(bs);
 #endif
@@ -1274,6 +1279,7 @@ typedef struct QCowAIOCB {
 QEMUIOVector hd_qiov;
 QEMUBH *bh;
 QCowL2Meta l2meta;
+TAILQ_ENTRY(QCowAIOCB) deferred_writes_link;
 } QCowAIOCB;
 
 static void qcow_aio_read_cb(void *opaque, int ret);
@@ -1439,6 +1445,8 @@ static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState 
*bs,
 return acb-common;
 }
 
+static void fire_deferred_writes(BDRVQcowState *s);
+
 static void qcow_aio_write_cb(void *opaque, int ret)
 {
 QCowAIOCB *acb = opaque;
@@ -1509,6 +1517,21 @@ done:
 qemu_vfree(acb-orig_buf);
 acb-common.cb(acb-common.opaque, ret);
 qemu_aio_release(acb);
+
+s-write_in_progress = 0;
+fire_deferred_writes(s);
+}
+
+static void fire_deferred_writes(BDRVQcowState *s)
+{
+QCowAIOCB *acb;
+
+if (!s-write_in_progress  !TAILQ_EMPTY(s-deferred_writes)) {
+s-write_in_progress = 1;
+acb = TAILQ_FIRST(s-deferred_writes);
+TAILQ_REMOVE(s-deferred_writes, acb, deferred_writes_link);
+qcow_aio_write_cb(acb, 0);
+}
 }
 
 static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
@@ -1524,7 +1547,9 @@ static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState 
*bs,
 if (!acb)
 return NULL;
 
-qcow_aio_write_cb(acb, 0);
+TAILQ_INSERT_TAIL(s-deferred_writes, acb, deferred_writes_link);
+fire_deferred_writes(s);
+
 return acb-common;
 }
 
-- 
1.6.1.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu/hw/device-assignment: questions about msix_table_page

2009-05-05 Thread Michael S. Tsirkin
On Tue, May 05, 2009 at 07:49:10AM -0300, Marcelo Tosatti wrote:
 On Tue, May 05, 2009 at 01:34:50PM +0300, Michael S. Tsirkin wrote:
  On Tue, May 05, 2009 at 07:19:45AM -0300, Marcelo Tosatti wrote:
   On Tue, May 05, 2009 at 12:51:36PM +0300, Michael S. Tsirkin wrote:
On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote:
 If guest can write to the real device MSI-X table directly, 
 it would
 cause chaos on interrupt delivery, for what guest see is 
 totally
 different with what's host see...
   
Obviously.
   
Thanks,
 

What's the reason that this page is unmapped from the qemu memory space?
Specifically what do these lines do:
int offset = r_dev-msix_table_addr - 
real_region-base_addr;
ret = munmap(region-u.r_virtbase + offset, 
TARGET_PAGE_SIZE);
   
   I believe this allows accesses to this page (the MSI-X table), which
   is part of the guest address space (through kvm memory slots), to be
   trapped by qemu.
   
   Since there is no actual page in this guest address, KVM treats accesses
   as MMIO and forwards them to QEMU.
   
   
  
  I thought about this too.
  But why is this necessary for assigned MSI-X but not for emulated devices 
  such as
  e.g. e1000? All e1000 does seems to be cpu_register_physical_memory ...
 
 Because there is no registered (kvm) memory slot for the range which
 e1000 registers its MMIO? Not sure about the address of the MSI-X table
 page, but you could achieve the same effect by splitting the slot which
 it lives in two, with a 1 page hole between them.

You could also move the emulated MSI-X table, sticking it on top of the
existing BAR. Since PCI config includes the pointer to the table,
a driver that reads this pointer will continue to work.

Of course, there's no guarantee that guest drivers don't just hard-code
this offset.

 BTW this is why you can't map the MSI-X table page directly, you want
 accesses to be trapped.

BTW current design won't work if the base page size is > 4K, will it?
The hole covers a page, so you'll get faults outside the MSI-X table.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?

2009-05-05 Thread Michael S. Tsirkin
On Tue, May 05, 2009 at 03:08:40PM +0300, Avi Kivity wrote:
 Michael S. Tsirkin wrote:
 On Tue, May 05, 2009 at 02:57:10PM +0300, Avi Kivity wrote:
   
 Michael S. Tsirkin wrote:
 
 On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote:
 
 On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote:
 
 The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls have
 been merged for 2.6.30. However, I note that PCI spec allows devices to
 support multiple vectors with MSI as well (support will be in linux
 2.6.30).

 Even though qemu for now only uses a single vector with MSI, it would
 seem that it's better to make the kernel/user interface generic straight
 away rather than add more ioctls later. What do you think? It might not
 be too late to fix this for 2.6.30.
 
 Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per assigned
 device?
 
 Sure, but only one KVM_ASSIGN_SET_MSIX_NR.

 
 MSIX_NR is the size of the table, while MSIX_ENTRY updates a single   
 entry, if I read the code correctly.
 

 Right. So we'll need something like this for MSI as well.
 Actually maybe MSIX_NR MSIX_ENTRY should be renamed to MSI_NR / MSI_ENTRY
 and changed to do the right thing depending on the IRQ type?
   

 Works for me.  Sheng, is there a reason why it wasn't done like this?

 btw, it could be further simplified by using irqfd.  Instead of the host  
 device tying directly into kvm, it could just trigger an eventfd; and we  
 could terminate the eventfd either in kvm (irqfd) or in qemu.

If you are going wild, you could then split this code out from kvm
into something like a UIO driver. E.g. qemu could then in theory
support assigned devices even without VT-d hardware support in CPU.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 0/3] generic hypercall support

2009-05-05 Thread Gregory Haskins
(Applies to Linus' tree, b4348f32dae3cb6eb4bc21c7ed8f76c0b11e9d6a)

Please see patch 1/3 for a description.  This has been tested with a KVM
guest on x86_64 and appears to work properly.  Comments, please.

-Greg

---

Gregory Haskins (3):
  kvm: add pv_cpu_ops.hypercall support to the guest
  x86: add generic hypercall support
  add generic hypercall support


 arch/Kconfig |3 +
 arch/x86/Kconfig |1 
 arch/x86/include/asm/paravirt.h  |   13 ++
 arch/x86/include/asm/processor.h |6 +++
 arch/x86/kernel/kvm.c|   22 ++
 include/linux/hypercall.h|   83 ++
 6 files changed, 128 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/hypercall.h

-- 
Signature
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 1/3] add generic hypercall support

2009-05-05 Thread Gregory Haskins
We add a generic hypercall() mechanism for use by IO code which is
compatible with a variety of hypervisors, but which prefers to use
hypercalls over other types of hypervisor traps for performance and/or
feature reasons.

For instance, consider an emulated PCI device in KVM.  Today we can choose
to do IO over MMIO or PIO infrastructure, but they each have their own
distinct disadvantages:

*) MMIO causes a page-fault, which must be decoded by the hypervisor and is
   therefore fairly expensive.

*) PIO is more direct than MMIO, but it poses other problems such as:
  a) can have a small limited address space (x86 is 2^16)
  b) is a narrow-band interface (one 8, 16, 32, 64 bit word at a time)
  c) not available on all archs (PCI mentions ppc as problematic) and
 is therefore recommended to avoid.

Hypercalls, on the other hand, offer a direct access path like PIOs, yet
do not suffer the same drawbacks such as a limited address space or a
narrow-band interface.  Hypercalls are much more friendly to software
to software interaction since we can pack multiple registers in a way
that is natural and simple for software to utilize.

The problem with hypercalls today is that there is no generic support.
There is various support for hypervisor specific implementations (for
instance, see  kvm_hypercall0() in arch/x86/include/asm/kvm_para.h).  This
makes it difficult to implement a device that is hypervisor agnostic since
it would not only need to know the hypercall ABI, but also which platform
specific function call it should make.

If we can convey a dynamic binding to a specific hypercall vector in a
generic way (out of the scope of this patch series), then an IO driver
could utilize that dynamic binding to communicate without requiring
hypervisor specific knowledge.  Therefore, we implement a system wide
hypercall() interface based on a variable length list of unsigned longs
(representing registers to pack) and expect that various arch/hypervisor
implementations can fill in the details, if supported.  This is expected
to be done as part of the pv_ops infrastructure, which is the natural
hook-point for hypervisor specific code.  Note, however, that the
generic hypercall() interface does not require the implementation to use
pv_ops if so desired.

Example use case:
--

Consider a PCI device X.  It can already advertise MMIO/PIO regions via
its BAR infrastructure.  With this new model it could also advertise a
hypercall vector in its device-specific upper configuration space.  (The
allocation and assignment of this vector on the backend is beyond the scope
of this series).  The guest-side driver for device X would sense (via
something like a feature-bit) if the hypercall was available and valid,
read the value with a configuration cycle, and proceed to ignore the BARs
in favor of using the hypercall() interface.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 include/linux/hypercall.h |   83 +
 1 files changed, 83 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/hypercall.h

diff --git a/include/linux/hypercall.h b/include/linux/hypercall.h
new file mode 100644
index 000..c8a1492
--- /dev/null
+++ b/include/linux/hypercall.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * Author:
+ *  Gregory Haskins ghask...@novell.com
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_HYPERCALL_H
+#define _LINUX_HYPERCALL_H
+
+#ifdef CONFIG_HAVE_HYPERCALL
+
+long hypercall(unsigned long nr, unsigned long *args, size_t count);
+
+#else
+
+static inline long
+hypercall(unsigned long nr, unsigned long *args, size_t count)
+{
+   return -EINVAL;
+}
+
+#endif /* CONFIG_HAVE_HYPERCALL */
+
+#define hypercall0(nr) hypercall(nr, NULL, 0)
+#define hypercall1(nr, a1)  \
+   ({  \
+   unsigned long __args[] = { a1, };   \
+   long __ret; \
+   __ret = hypercall(nr, __args, ARRAY_SIZE(__args));  \
+   __ret;  \
+   })
+#define hypercall2(nr, a1, a2) \
+   ({   

[RFC PATCH 3/3] kvm: add pv_cpu_ops.hypercall support to the guest

2009-05-05 Thread Gregory Haskins
Signed-off-by: Gregory Haskins ghask...@novell.com
---

 arch/x86/kernel/kvm.c |   22 ++
 1 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33019dd..d299ed5 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -50,6 +50,26 @@ static void kvm_io_delay(void)
 {
 }
 
+static long _kvm_hypercall(unsigned long nr,
+  unsigned long *args,
+  size_t count)
+{
+   switch (count) {
+   case 0:
+   return kvm_hypercall0(nr);
+   case 1:
+   return kvm_hypercall1(nr, args[0]);
+   case 2:
+   return kvm_hypercall2(nr, args[0], args[1]);
+   case 3:
+   return kvm_hypercall3(nr, args[0], args[1], args[2]);
+   case 4:
+   return kvm_hypercall4(nr, args[0], args[1], args[2], args[3]);
+   default:
+   return -EINVAL;
+   }
+}
+
 static void kvm_mmu_op(void *buffer, unsigned len)
 {
int r;
@@ -207,6 +227,8 @@ static void paravirt_ops_setup(void)
if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
pv_cpu_ops.io_delay = kvm_io_delay;
 
+   pv_cpu_ops.hypercall = _kvm_hypercall;
+
if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
pv_mmu_ops.set_pte = kvm_set_pte;
pv_mmu_ops.set_pte_at = kvm_set_pte_at;

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 2/3] x86: add generic hypercall support

2009-05-05 Thread Gregory Haskins
This adds a hypercall() vector to x86 pv_cpu_ops to be optionally filled in
by a hypervisor driver as it loads its other pv_ops components.  We also
declare x86 as CONFIG_HAVE_HYPERCALL to enable the generic hypercall code
whenever the user builds for x86.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 arch/Kconfig |3 +++
 arch/x86/Kconfig |1 +
 arch/x86/include/asm/paravirt.h  |   13 +
 arch/x86/include/asm/processor.h |6 ++
 4 files changed, 23 insertions(+), 0 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 78a35e9..239b658 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -112,3 +112,6 @@ config HAVE_DMA_API_DEBUG
 
 config HAVE_DEFAULT_NO_SPIN_MUTEXES
bool
+
+config HAVE_HYPERCALL
+bool
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index df9e885..3c609cf 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -46,6 +46,7 @@ config X86
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
+   select HAVE_HYPERCALL
 
 config ARCH_DEFCONFIG
string
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 378e369..ed22c84 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -6,6 +6,7 @@
 #ifdef CONFIG_PARAVIRT
 #include asm/pgtable_types.h
 #include asm/asm.h
+#include asm/errno.h
 
 /* Bitmask of what can be clobbered: usually at least eax. */
 #define CLBR_NONE 0
@@ -203,6 +204,8 @@ struct pv_cpu_ops {
 
void (*swapgs)(void);
 
+   long (*hypercall)(unsigned long nr, unsigned long *args, size_t count);
+
struct pv_lazy_ops lazy_mode;
 };
 
@@ -723,6 +726,16 @@ static inline void __cpuid(unsigned int *eax, unsigned int 
*ebx,
PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
 }
 
+static inline long hypercall(unsigned long nr,
+unsigned long *args,
+size_t count)
+{
+   if (!pv_cpu_ops.hypercall)
+   return -EINVAL;
+
+   return pv_cpu_ops.hypercall(nr, args, count);
+}
+
 /*
  * These special macros can be used to get or set a debugging register
  */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c2cceae..8fa988d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -570,6 +570,12 @@ static inline void native_swapgs(void)
 #define __cpuidnative_cpuid
 #define paravirt_enabled() 0
 
+static inline long
+hypercall(unsigned long nr, unsigned long *args, size_t count)
+{
+   return -EINVAL;
+}
+
 /*
  * These special macros can be used to get or set a debugging register
  */

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?

2009-05-05 Thread Avi Kivity

Michael S. Tsirkin wrote:

Works for me.  Sheng, is there a reason why it wasn't done like this?

btw, it could be further simplified by using irqfd.  Instead of the host  
device tying directly into kvm, it could just trigger an eventfd; and we  
could terminate the eventfd either in kvm (irqfd) or in qemu.



If you are going wild, you could then split this code out from kvm
into something like a UIO driver. E.g. qemu could then in theory
support assigned devices even without VT-d hardware support in CPU.
  


That's my thinking.  PCI interrupts don't work because we need to do 
some hacky stuff in there, but MSI should.  Oh, and we could improve UIO 
support for interrupts when using MSI, since there's no need to 
acknowledge the interrupt.


Suppose we can tell the kernel to signal an eventfd whenever an MSI 
fires.  We then ask kvm for an irqfd, and give that irqfd to the kernel 
for the MSI.


Voila, we assign an interrupt from userspace, without the device or kvm 
knowing anything about it.  Like you say, we can assign the device to 
pure qemu, or to a userspace driver.


Beautiful, I finally found something to replace my old Lego set.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] generic hypercall support

2009-05-05 Thread Avi Kivity

Gregory Haskins wrote:

(Applies to Linus' tree, b4348f32dae3cb6eb4bc21c7ed8f76c0b11e9d6a)

Please see patch 1/3 for a description.  This has been tested with a KVM
guest on x86_64 and appears to work properly.  Comments, please.
  


What about the hypercalls in include/asm/kvm_para.h?

In general, hypercalls cannot be generic since each hypervisor 
implements its own ABI.  The abstraction needs to be at a higher level 
(pv_ops is such a level).



--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] generic hypercall support

2009-05-05 Thread Gregory Haskins
Avi Kivity wrote:
 Gregory Haskins wrote:
 (Applies to Linus' tree, b4348f32dae3cb6eb4bc21c7ed8f76c0b11e9d6a)

 Please see patch 1/3 for a description.  This has been tested with a KVM
 guest on x86_64 and appears to work properly.  Comments, please.
   

 What about the hypercalls in include/asm/kvm_para.h?

 In general, hypercalls cannot be generic since each hypervisor
 implements its own ABI.
Please see the prologue to 1/3.  It's all described there, including a
use case which I think answers your questions.  If there is still
ambiguity, let me know.

   The abstraction needs to be at a higher level (pv_ops is such a level).
Yep, agreed.  That's exactly what this series is doing, actually.


-Greg





signature.asc
Description: OpenPGP digital signature


Re: [PATCH] deal with interrupt shadow state for emulated instruction

2009-05-05 Thread Glauber Costa
 Hmm, if the guest runs an infinite emulated 'mov ss', it will keep  
 toggling the MOV_SS bit, but STI will remain set, so we'll never allow  
 an interrupt into the guest kernel.
We have no choice but to return both flags, since svm does not differentiate
between them.

But see below for an alternative path that makes it a non-issue.


 diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
 index d2664fc..797d41f 100644
 --- a/arch/x86/kvm/x86_emulate.c
 +++ b/arch/x86/kvm/x86_emulate.c
 @@ -1618,6 +1618,16 @@ special_insn:
  int err;
  sel = c-src.val;
 +if (c-modrm_reg == VCPU_SREG_SS) {
 +u32 int_shadow =
 +kvm_x86_ops-get_interrupt_shadow(ctxt-vcpu);
 +/* See sti emulation for an explanation of this */
 +if ((int_shadow  X86_SHADOW_INT_MOV_SS))
 +ctxt-interruptibility = 
 ~X86_SHADOW_INT_MOV_SS;
 +else
 +ctxt-interruptibility |= X86_SHADOW_INT_MOV_SS;
 +}
   

 ^=
 =p \o/

After re-reading this, masking the flags in here makes no sense.

I am moving to an approach in which I do

if (!(int_shadow  X86_SHADOW_INT_MOV_SS))
ctxt-interruptibility = X86_SHADOW_INT_MOV_SS;

Even if the next instruction is an sti, this is certainly not an sti; sti
sequence (the current instruction is mov ss, after all). So we should mask it
anyway. This also nicely solves the problem you raised at svm.c.

 @@ -1846,10 +1856,23 @@ special_insn:
  ctxt-eflags = ~X86_EFLAGS_IF;
  c-dst.type = OP_NONE;  /* Disable writeback. */
  break;
 -case 0xfb: /* sti */
 +case 0xfb: { /* sti */
 +u32 int_shadow = kvm_x86_ops-get_interrupt_shadow(ctxt-vcpu);
 +/*
 + * an sti; sti; sequence only disable interrupts for the first
 + * instruction. So, if the last instruction, be it emulated or
 + * not, left the system with the INT_STI flag enabled, it
 + * means that the last instruction is an sti. We should not
 + * leave the flag on in this case
 + */
 +if ((int_shadow  X86_SHADOW_INT_STI))
 +ctxt-interruptibility = ~X86_SHADOW_INT_STI;
 +else
 +ctxt-interruptibility |= X86_SHADOW_INT_STI;
   

 ^=
ditto

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] generic hypercall support

2009-05-05 Thread Avi Kivity

Gregory Haskins wrote:

Avi Kivity wrote:
  

Gregory Haskins wrote:


(Applies to Linus' tree, b4348f32dae3cb6eb4bc21c7ed8f76c0b11e9d6a)

Please see patch 1/3 for a description.  This has been tested with a KVM
guest on x86_64 and appears to work properly.  Comments, please.
  
  

What about the hypercalls in include/asm/kvm_para.h?

In general, hypercalls cannot be generic since each hypervisor
implements its own ABI.


Please see the prologue to 1/3.  Its all described there, including a
use case which I think answers your questions.  If there is still
ambiguity, let me know.

  


Yeah, sorry.


  The abstraction needs to be at a higher level (pv_ops is such a level).


Yep, agreed.  That's exactly what this series is doing, actually.
  


No, it doesn't.  It makes making hypercalls a pv_op, but hypervisors 
don't implement the same ABI.


pv_ops all _use_ hypercalls to implement higher level operations, like 
set_pte (probably the only place set_pte can be considered a high level 
operation).


In this case, the higher level event could be 
hypervisor_dynamic_event(number); each pv_ops implementation would use 
its own hypercalls to implement that.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] generic hypercall support

2009-05-05 Thread Gregory Haskins
Avi Kivity wrote:
 Gregory Haskins wrote:
 Avi Kivity wrote:
  
 Gregory Haskins wrote:

 (Applies to Linus' tree, b4348f32dae3cb6eb4bc21c7ed8f76c0b11e9d6a)

 Please see patch 1/3 for a description.  This has been tested with
 a KVM
 guest on x86_64 and appears to work properly.  Comments, please.
 
 What about the hypercalls in include/asm/kvm_para.h?

 In general, hypercalls cannot be generic since each hypervisor
 implements its own ABI.
 
 Please see the prologue to 1/3.  Its all described there, including a
 use case which I think answers your questions.  If there is still
 ambiguity, let me know.

   

 Yeah, sorry.

   The abstraction needs to be at a higher level (pv_ops is such a
 level).
 
 Yep, agreed.  Thats exactly what this series is doing, actually.
   

 No, it doesn't.  It makes making hypercalls a pv_op, but hypervisors
 don't implement the same ABI.
Yes, that is true, but I think the issue right now is more of
semantics.  I think we are on the same page.

So you would never have someone making a generic
hypercall(KVM_HC_MMU_OP).  I agree.  What I am proposing here is more
akin to PIO-BAR + iowrite()/ioread().  E.g. the infrastructure sets up
the addressing (where in PIO this is literally an address, and for
hypercalls this is a vector), but the device defines the ABI at that
address.  So it's really the device end-point that is defining the ABI
here, not the hypervisor (per se) and that's why I thought it's ok to
declare these generic.  But to your point below...


 pv_ops all _use_ hypercalls to implement higher level operations, like
 set_pte (probably the only place set_pte can be considered a high
 level operation).

 In this case, the higher level event could be
 hypervisor_dynamic_event(number); each pv_ops implementation would use
 its own hypercalls to implement that.

I see.  I had designed it slightly different where KVM could assign any
top level vector it wanted and thus that drove the guest-side interface
you see here to be more generic hypercall.  However, I think your
proposal is perfectly fine too and it makes sense to more narrowly focus
these calls as specifically dynamic...as those are the only vectors that
we could technically use like this anyway.

So rather than allocate a top-level vector, I will add KVM_HC_DYNAMIC
to kvm_para.h, and I will change the interface to follow suit (something
like s/hypercall/dynhc).  Sound good?

Thanks, Avi,
-Greg




signature.asc
Description: OpenPGP digital signature


Re: [KVM PATCH v4 0/2] irqfd

2009-05-05 Thread Davide Libenzi
On Mon, 4 May 2009, Gregory Haskins wrote:

 (Applies to kvm.git:7da2e3ba, plus you will also need Davide Libenzi's
 eventfd_file_create() patch, which you can find here:
 
 http://www.mail-archive.com/kvm@vger.kernel.org/msg13923.html

Ping me back if Al acks the irqfd thing, that I'll take a better look at 
the patch above and make an official post. Without any users, I'd rather 
leave the current code as is.


- Davide


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] generic hypercall support

2009-05-05 Thread Gregory Haskins
Gregory Haskins wrote:
 So rather than allocate a top-level vector, I will add KVM_HC_DYNAMIC
 to kvm_para.h, and I will change the interface to follow suit (something
 like s/hypercall/dynhc).  Sound good?
   

A small ramification of this change will be that I will need to do
something like add a feature-bit to cpuid for detecting if HC_DYNAMIC is
supported on the backend or not.  The current v1 design doesn't suffer
from this requirement because the presence of the dynamic vector itself
is enough to know it's supported.  I like Avi's proposal enough to say
that it's worth this minor inconvenience, but FYI I will have to
additionally submit a userspace patch for v2 if we go this route.

-Greg





signature.asc
Description: OpenPGP digital signature


Re: [KVM PATCH v4 0/2] irqfd

2009-05-05 Thread Gregory Haskins
Davide Libenzi wrote:
 On Mon, 4 May 2009, Gregory Haskins wrote:

   
 (Applies to kvm.git:7da2e3ba, plus you will also need Davide Libenzi's
 eventfd_file_create() patch, which you can find here:

 http://www.mail-archive.com/kvm@vger.kernel.org/msg13923.html
 

 Ping me back if Al acks the irqfd thing, that I'll take a better look at 
 the patch above and make an official post. Without any users, I'd rather 
 leave the current code as is.
   

Will do, Davide.  Thank you.

-Greg




signature.asc
Description: OpenPGP digital signature


[PATCH 0/6] kvm-s390: collection of kvm-s390 fixes

2009-05-05 Thread ehrhardt
From: Christian Ehrhardt ehrha...@de.ibm.com

This is a collection of fixes for kvm-s390 that originate from several tests
made in the last few months. They are now tested a while and should be ready
to be merged.

All six patches were created by either Carsten Otte or Christian Borntraeger.
I'm just the one stumbling across the filled patch queue and cleaning them up
for submission. The patches themselves have proper tags to credit the creator etc.

*Not sending patches for a few weeks makes one somewhat forgetful - I beg pardon from
all on cc who have now received this twice, after I added the kvm list this time.

Patches included:
[PATCH 1/6] kvm-s390: Fix memory slot versus run
[PATCH 2/6] kvm-s390: use hrtimer for clock wakeup from idle
[PATCH 3/6] kvm-s390: optimize float int lock: spin_lock_bh -- spin_lock
[PATCH 4/6] kvm-s390: Unlink vcpu on destroy
[PATCH 5/6] kvm-s390: Sanity check on validity intercept
[PATCH 6/6] kvm-s390: Verify memory in kvm run

Overall-Diffstat:
 arch/s390/include/asm/kvm_host.h |5 ++-
 arch/s390/kvm/intercept.c|   28 ---
 arch/s390/kvm/interrupt.c|   55 ---
 arch/s390/kvm/kvm-s390.c |   50 ---
 arch/s390/kvm/kvm-s390.h |4 ++
 arch/s390/kvm/priv.c |4 +-
 arch/s390/kvm/sigp.c |   16 +--
 7 files changed, 110 insertions(+), 52 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/6] kvm-s390: optimize float int lock: spin_lock_bh -- spin_lock

2009-05-05 Thread ehrhardt
From: Christian Borntraeger borntrae...@de.ibm.com

The floating interrupt lock is only taken in process context. We can
replace all spin_lock_bh with standard spin_lock calls.

Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
---
 arch/s390/kvm/interrupt.c |   20 ++--
 arch/s390/kvm/kvm-s390.c  |4 ++--
 arch/s390/kvm/priv.c  |4 ++--
 arch/s390/kvm/sigp.c  |   16 
 4 files changed, 22 insertions(+), 22 deletions(-)

Index: kvm/arch/s390/kvm/interrupt.c
===
--- kvm.orig/arch/s390/kvm/interrupt.c
+++ kvm/arch/s390/kvm/interrupt.c
@@ -301,13 +301,13 @@ int kvm_cpu_has_interrupt(struct kvm_vcp
}
 
if ((!rc)  atomic_read(fi-active)) {
-   spin_lock_bh(fi-lock);
+   spin_lock(fi-lock);
list_for_each_entry(inti, fi-list, list)
if (__interrupt_is_deliverable(vcpu, inti)) {
rc = 1;
break;
}
-   spin_unlock_bh(fi-lock);
+   spin_unlock(fi-lock);
}
 
if ((!rc)  (vcpu-arch.sie_block-ckc 
@@ -368,7 +368,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu
hrtimer_start(vcpu-arch.ckc_timer, ktime_set (0, sltime) , 
HRTIMER_MODE_REL);
VCPU_EVENT(vcpu, 5, enabled wait via clock comparator: %llx ns, 
sltime);
 no_timer:
-   spin_lock_bh(vcpu-arch.local_int.float_int-lock);
+   spin_lock(vcpu-arch.local_int.float_int-lock);
spin_lock_bh(vcpu-arch.local_int.lock);
add_wait_queue(vcpu-arch.local_int.wq, wait);
while (list_empty(vcpu-arch.local_int.list) 
@@ -377,18 +377,18 @@ no_timer:
!signal_pending(current)) {
set_current_state(TASK_INTERRUPTIBLE);
spin_unlock_bh(vcpu-arch.local_int.lock);
-   spin_unlock_bh(vcpu-arch.local_int.float_int-lock);
+   spin_unlock(vcpu-arch.local_int.float_int-lock);
vcpu_put(vcpu);
schedule();
vcpu_load(vcpu);
-   spin_lock_bh(vcpu-arch.local_int.float_int-lock);
+   spin_lock(vcpu-arch.local_int.float_int-lock);
spin_lock_bh(vcpu-arch.local_int.lock);
}
__unset_cpu_idle(vcpu);
__set_current_state(TASK_RUNNING);
remove_wait_queue(vcpu-wq, wait);
spin_unlock_bh(vcpu-arch.local_int.lock);
-   spin_unlock_bh(vcpu-arch.local_int.float_int-lock);
+   spin_unlock(vcpu-arch.local_int.float_int-lock);
hrtimer_try_to_cancel(vcpu-arch.ckc_timer);
return 0;
 }
@@ -455,7 +455,7 @@ void kvm_s390_deliver_pending_interrupts
if (atomic_read(fi-active)) {
do {
deliver = 0;
-   spin_lock_bh(fi-lock);
+   spin_lock(fi-lock);
list_for_each_entry_safe(inti, n, fi-list, list) {
if (__interrupt_is_deliverable(vcpu, inti)) {
list_del(inti-list);
@@ -466,7 +466,7 @@ void kvm_s390_deliver_pending_interrupts
}
if (list_empty(fi-list))
atomic_set(fi-active, 0);
-   spin_unlock_bh(fi-lock);
+   spin_unlock(fi-lock);
if (deliver) {
__do_deliver_interrupt(vcpu, inti);
kfree(inti);
@@ -531,7 +531,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
 
mutex_lock(kvm-lock);
fi = kvm-arch.float_int;
-   spin_lock_bh(fi-lock);
+   spin_lock(fi-lock);
list_add_tail(inti-list, fi-list);
atomic_set(fi-active, 1);
sigcpu = find_first_bit(fi-idle_mask, KVM_MAX_VCPUS);
@@ -548,7 +548,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
if (waitqueue_active(li-wq))
wake_up_interruptible(li-wq);
spin_unlock_bh(li-lock);
-   spin_unlock_bh(fi-lock);
+   spin_unlock(fi-lock);
mutex_unlock(kvm-lock);
return 0;
 }
Index: kvm/arch/s390/kvm/kvm-s390.c
===
--- kvm.orig/arch/s390/kvm/kvm-s390.c
+++ kvm/arch/s390/kvm/kvm-s390.c
@@ -323,11 +323,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(st
spin_lock_init(vcpu-arch.local_int.lock);
INIT_LIST_HEAD(vcpu-arch.local_int.list);
vcpu-arch.local_int.float_int = kvm-arch.float_int;
-   spin_lock_bh(kvm-arch.float_int.lock);
+   spin_lock(kvm-arch.float_int.lock);
kvm-arch.float_int.local_int[id] = vcpu-arch.local_int;
init_waitqueue_head(vcpu-arch.local_int.wq);
vcpu-arch.local_int.cpuflags = vcpu-arch.sie_block-cpuflags;
-   spin_unlock_bh(kvm-arch.float_int.lock);
+   spin_unlock(kvm-arch.float_int.lock);
 
rc = 

[PATCH 1/6] kvm-s390: Fix memory slot versus run

2009-05-05 Thread ehrhardt
From: Carsten Otte co...@de.ibm.com

This patch fixes an incorrectness in the kvm backend for s390.
In case virtual cpus are being created before the corresponding
memory slot is being registered, we need to update the sie
control blocks for the virtual cpus. In order to do that, we
use the vcpu-mutex to lock out kvm_run and friends. This way
we can ensure a consistent update of the memory for the entire
smp configuration.

Reported-by: Mijo Safradin m...@linux.vnet.ibm.com
Signed-off-by: Carsten Otte co...@de.ibm.com
---
 arch/s390/kvm/kvm-s390.c |   24 
 1 file changed, 20 insertions(+), 4 deletions(-)

Index: kvm/arch/s390/kvm/kvm-s390.c
===
--- kvm.orig/arch/s390/kvm/kvm-s390.c
+++ kvm/arch/s390/kvm/kvm-s390.c
@@ -657,6 +657,8 @@ int kvm_arch_set_memory_region(struct kv
struct kvm_memory_slot old,
int user_alloc)
 {
+   int i;
+
/* A few sanity checks. We can have exactly one memory slot which has
   to start at guest virtual zero and which has to be located at a
   page boundary in userland and which has to end at a page boundary.
@@ -676,13 +678,27 @@ int kvm_arch_set_memory_region(struct kv
if (mem-memory_size  (PAGE_SIZE - 1))
return -EINVAL;
 
+   /* lock all vcpus */
+   for (i = 0; i  KVM_MAX_VCPUS; ++i) {
+   if (kvm-vcpus[i])
+   mutex_lock(kvm-vcpus[i]-mutex);
+   }
+
kvm-arch.guest_origin = mem-userspace_addr;
kvm-arch.guest_memsize = mem-memory_size;
 
-   /* FIXME: we do want to interrupt running CPUs and update their memory
-  configuration now to avoid race conditions. But hey, changing the
-  memory layout while virtual CPUs are running is usually bad
-  programming practice. */
+   /* update sie control blocks, and unlock all vcpus */
+   for (i = 0; i  KVM_MAX_VCPUS; ++i) {
+   if (kvm-vcpus[i]) {
+   kvm-vcpus[i]-arch.sie_block-gmsor =
+   kvm-arch.guest_origin;
+   kvm-vcpus[i]-arch.sie_block-gmslm =
+   kvm-arch.guest_memsize +
+   kvm-arch.guest_origin +
+   VIRTIODESCSPACE - 1ul;
+   mutex_unlock(kvm-vcpus[i]-mutex);
+   }
+   }
 
return 0;
 }
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/6] kvm-s390: use hrtimer for clock wakeup from idle

2009-05-05 Thread ehrhardt
From: Christian Borntraeger borntrae...@de.ibm.com

This patch reworks the s390 clock comparator wakeup to hrtimer. The clock
comparator is a per-cpu value that is compared against the TOD clock. If
ckc <= TOD an external interrupt 1004 is triggered. Since the clock comparator
and the TOD clock have a much higher resolution than jiffies we should use
hrtimers to trigger the wakeup. This speeds up guest nanosleep for small
values.

Since hrtimer callbacks run in hard-irq context, I added a tasklet to do
the actual work with enabled interrupts. 

Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
Signed-off-by: Carsten Otte co...@de.ibm.com
---
 include/asm/kvm_host.h |5 -
 kvm/interrupt.c|   35 +--
 kvm/kvm-s390.c |7 +--
 kvm/kvm-s390.h |4 +++-
 4 files changed, 37 insertions(+), 14 deletions(-)

Index: kvm/arch/s390/include/asm/kvm_host.h
===
--- kvm.orig/arch/s390/include/asm/kvm_host.h   2009-05-05 15:58:45.0 
+0200
+++ kvm/arch/s390/include/asm/kvm_host.h2009-05-05 16:16:49.0 
+0200
@@ -13,6 +13,8 @@
 
 #ifndef ASM_KVM_HOST_H
 #define ASM_KVM_HOST_H
+#include linux/hrtimer.h
+#include linux/interrupt.h
 #include linux/kvm_host.h
 #include asm/debug.h
 #include asm/cpuid.h
@@ -210,7 +212,8 @@
s390_fp_regs  guest_fpregs;
unsigned int  guest_acrs[NUM_ACRS];
struct kvm_s390_local_interrupt local_int;
-   struct timer_list ckc_timer;
+   struct hrtimerckc_timer;
+   struct tasklet_struct tasklet;
union  {
cpuid_t   cpu_id;
u64   stidp_data;
Index: kvm/arch/s390/kvm/interrupt.c
===
--- kvm.orig/arch/s390/kvm/interrupt.c  2009-05-05 15:58:45.0 +0200
+++ kvm/arch/s390/kvm/interrupt.c   2009-05-05 16:18:02.0 +0200
@@ -12,6 +12,8 @@
 
 #include asm/lowcore.h
 #include asm/uaccess.h
+#include linux/hrtimer.h
+#include linux/interrupt.h
 #include linux/kvm_host.h
 #include linux/signal.h
 #include kvm-s390.h
@@ -361,12 +363,12 @@
return 0;
}
 
-   sltime = (vcpu-arch.sie_block-ckc - now) / (0xf424ul / HZ) + 1;
+   sltime = (vcpu-arch.sie_block-ckc - now)/4096*1000;
 
-   vcpu-arch.ckc_timer.expires = jiffies + sltime;
-
-   add_timer(vcpu-arch.ckc_timer);
-   VCPU_EVENT(vcpu, 5, enabled wait timer:%llx jiffies, sltime);
+   hrtimer_start(vcpu-arch.ckc_timer, ktime_set(0, sltime),
+ HRTIMER_MODE_REL);
+   VCPU_EVENT(vcpu, 5, enabled wait via clock comparator: %llx ns,
+  sltime);
 no_timer:
spin_lock_bh(vcpu-arch.local_int.float_int-lock);
spin_lock_bh(vcpu-arch.local_int.lock);
@@ -389,21 +391,34 @@
remove_wait_queue(vcpu-wq, wait);
spin_unlock_bh(vcpu-arch.local_int.lock);
spin_unlock_bh(vcpu-arch.local_int.float_int-lock);
-   del_timer(vcpu-arch.ckc_timer);
+   hrtimer_try_to_cancel(vcpu-arch.ckc_timer);
return 0;
 }
 
-void kvm_s390_idle_wakeup(unsigned long data)
+void kvm_s390_tasklet(unsigned long parm)
 {
-   struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+   struct kvm_vcpu *vcpu = (struct kvm_vcpu *) parm;
 
-   spin_lock_bh(vcpu-arch.local_int.lock);
+   spin_lock(vcpu-arch.local_int.lock);
vcpu-arch.local_int.timer_due = 1;
if (waitqueue_active(vcpu-arch.local_int.wq))
wake_up_interruptible(vcpu-arch.local_int.wq);
-   spin_unlock_bh(vcpu-arch.local_int.lock);
+   spin_unlock(vcpu-arch.local_int.lock);
 }
 
+/*
+ * low level hrtimer wake routine. Because this runs in hardirq context
+ * we schedule a tasklet to do the real work.
+ */
+enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
+{
+   struct kvm_vcpu *vcpu;
+
+   vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
+   tasklet_schedule(vcpu-arch.tasklet);
+
+   return HRTIMER_NORESTART;
+}
 
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 {
Index: kvm/arch/s390/kvm/kvm-s390.c
===
--- kvm.orig/arch/s390/kvm/kvm-s390.c   2009-05-05 16:16:48.0 +0200
+++ kvm/arch/s390/kvm/kvm-s390.c2009-05-05 16:16:49.0 +0200
@@ -15,6 +15,7 @@
 #include linux/compiler.h
 #include linux/err.h
 #include linux/fs.h
+#include linux/hrtimer.h
 #include linux/init.h
 #include linux/kvm.h
 #include linux/kvm_host.h
@@ -286,8 +287,10 @@
vcpu-arch.sie_block-gmsor = vcpu-kvm-arch.guest_origin;
vcpu-arch.sie_block-ecb   = 2;
vcpu-arch.sie_block-eca   = 0xC1002001U;
-   setup_timer(vcpu-arch.ckc_timer, kvm_s390_idle_wakeup,
-(unsigned long) vcpu);
+   hrtimer_init(vcpu-arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+   

[PATCH 5/6] kvm-s390: Sanity check on validity intercept

2009-05-05 Thread ehrhardt
From: Carsten Otte co...@de.ibm.com

This patch adds a sanity check for the content of the guest
prefix register before faulting in the cpu lowcore
that it refers to. The guest might end up in an endless loop
where SIE complains about missing lowcore with incorrect
content of the prefix register without this fix.

Reported-by: Mijo Safradin m...@linux.vnet.ibm.com
Signed-off-by: Carsten Otte co...@de.ibm.com
---
 arch/s390/kvm/intercept.c |   28 ++--
 1 file changed, 18 insertions(+), 10 deletions(-)

Index: kvm/arch/s390/kvm/intercept.c
===
--- kvm.orig/arch/s390/kvm/intercept.c
+++ kvm/arch/s390/kvm/intercept.c
@@ -154,17 +154,25 @@ static int handle_stop(struct kvm_vcpu *
 static int handle_validity(struct kvm_vcpu *vcpu)
 {
int viwhy = vcpu-arch.sie_block-ipb  16;
+   int rc;
+
vcpu-stat.exit_validity++;
-   if (viwhy == 0x37) {
-   fault_in_pages_writeable((char __user *)
-vcpu-kvm-arch.guest_origin +
-vcpu-arch.sie_block-prefix,
-PAGE_SIZE);
-   return 0;
-   }
-   VCPU_EVENT(vcpu, 2, unhandled validity intercept code %d,
-  viwhy);
-   return -ENOTSUPP;
+   if ((viwhy == 0x37)  (vcpu-arch.sie_block-prefix
+   = vcpu-kvm-arch.guest_memsize - 2*PAGE_SIZE)){
+   rc = fault_in_pages_writeable((char __user *)
+vcpu-kvm-arch.guest_origin +
+vcpu-arch.sie_block-prefix,
+2*PAGE_SIZE);
+   if (rc)
+   /* user will receive sigsegv, exit to user */
+   rc = -ENOTSUPP;
+   } else
+   rc = -ENOTSUPP;
+
+   if (rc)
+   VCPU_EVENT(vcpu, 2, unhandled validity intercept code %d,
+  viwhy);
+   return rc;
 }
 
 static int handle_instruction(struct kvm_vcpu *vcpu)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/6] kvm-s390: Unlink vcpu on destroy

2009-05-05 Thread ehrhardt
From: Carsten Otte co...@de.ibm.com

This patch makes sure we do unlink a vcpu's sie control block
from the system control area in kvm_arch_vcpu_destroy. This
prevents illegal accesses to the sie control block from other
virtual cpus after free.

Reported-by: Mijo Safradin m...@linux.vnet.ibm.com
Signed-off-by: Carsten Otte co...@de.ibm.com
---
 arch/s390/kvm/kvm-s390.c |9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

Index: kvm/arch/s390/kvm/kvm-s390.c
===
--- kvm.orig/arch/s390/kvm/kvm-s390.c
+++ kvm/arch/s390/kvm/kvm-s390.c
@@ -195,6 +195,9 @@ out_nokvm:
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
VCPU_EVENT(vcpu, 3, %s, free cpu);
+   if (vcpu-kvm-arch.sca-cpu[vcpu-vcpu_id].sda ==
+   (__u64) vcpu-arch.sie_block)
+   vcpu-kvm-arch.sca-cpu[vcpu-vcpu_id].sda = 0;
free_page((unsigned long)(vcpu-arch.sie_block));
kvm_vcpu_uninit(vcpu);
kfree(vcpu);
@@ -307,8 +310,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(st
 
vcpu-arch.sie_block-icpua = id;
BUG_ON(!kvm-arch.sca);
-   BUG_ON(kvm-arch.sca-cpu[id].sda);
-   kvm-arch.sca-cpu[id].sda = (__u64) vcpu-arch.sie_block;
+   if (!kvm-arch.sca-cpu[id].sda)
+   kvm-arch.sca-cpu[id].sda = (__u64) vcpu-arch.sie_block;
+   else
+   BUG_ON(!kvm-vcpus[id]); /* vcpu does already exist */
vcpu-arch.sie_block-scaoh = (__u32)(((__u64)kvm-arch.sca)  32);
vcpu-arch.sie_block-scaol = (__u32)(__u64)kvm-arch.sca;
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/6] kvm-s390: Verify memory in kvm run

2009-05-05 Thread ehrhardt
From: Carsten Otte co...@de.ibm.com

This check verifies that the guest we're trying to run in KVM_RUN
has some memory assigned to it. Without memory, the guest enters an
endless exception loop.

Reported-by: Mijo Safradin m...@linux.vnet.ibm.com
Signed-off-by: Carsten Otte co...@de.ibm.com
---
 arch/s390/kvm/kvm-s390.c |6 ++
 1 file changed, 6 insertions(+)

Index: kvm/arch/s390/kvm/kvm-s390.c
===
--- kvm.orig/arch/s390/kvm/kvm-s390.c
+++ kvm/arch/s390/kvm/kvm-s390.c
@@ -478,6 +478,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_v
 
vcpu_load(vcpu);
 
+   /* verify, that memory has been registered */
+   if (!vcpu-kvm-arch.guest_memsize) {
+   vcpu_put(vcpu);
+   return -EINVAL;
+   }
+
if (vcpu-sigset_active)
sigprocmask(SIG_SETMASK, vcpu-sigset, sigsaved);
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] generic hypercall support

2009-05-05 Thread Avi Kivity

Gregory Haskins wrote:

I see.  I had designed it slightly different where KVM could assign any
top level vector it wanted and thus that drove the guest-side interface
you see here to be more generic hypercall.  However, I think your
proposal is perfectly fine too and it makes sense to more narrowly focus
these calls as specifically dynamic...as thats the only vectors that
we could technically use like this anyway.

So rather than allocate a top-level vector, I will add KVM_HC_DYNAMIC
to kvm_para.h, and I will change the interface to follow suit (something
like s/hypercall/dynhc).  Sound good?
  


Yeah.

Another couple of points:

- on the host side, we'd rig this to hit an eventfd.  Nothing stops us 
from rigging pio to hit an eventfd as well, giving us kernel handling 
for pio trigger points.
- pio actually has an advantage over hypercalls with nested guests.  
Since hypercalls don't have an associated port number, the lowermost 
hypervisor must interpret a hypercall as going to a guest's hypervisor, 
and not any lower-level hypervisors.  What it boils down to is that you 
cannot use device assignment to give a guest access to a virtio/vbus 
device from a lower level hypervisor.


(Bah, that's totally unreadable.  What I want is

instead of

  hypervisor[eth0/virtio-server]    
intermediate[virtio-driver/virtio-server]   guest[virtio-driver]


do

  hypervisor[eth0/virtio-server]    intermediate[assign virtio 
device]   guest[virtio-driver]


well, it's probably still unreadable)

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH v4 2/2] kvm: add support for irqfd via eventfd-notification interface

2009-05-05 Thread Avi Kivity

Gregory Haskins wrote:

KVM provides a complete virtual system environment for guests, including
support for injecting interrupts modeled after the real exception/interrupt
facilities present on the native platform (such as the IDT on x86).
Virtual interrupts can come from a variety of sources (emulated devices,
pass-through devices, etc) but all must be injected to the guest via
the KVM infrastructure.  This patch adds a new mechanism to inject a specific
interrupt to a guest using a decoupled eventfd mechanism:  Any legal signal
on the irqfd (using eventfd semantics from either userspace or kernel) will
translate into an injected interrupt in the guest at the next available
interrupt window.

 
+struct kvm_irqfd {

+   __u32 gsi;
+   __u32 flags;
+};
+
  


Please add some reserved space here.


+int
+kvm_irqfd(struct kvm *kvm, int gsi, int flags)
+{
+   struct _irqfd *irqfd;
+   struct file *file = NULL;
+   int fd = -1;
+   int ret;
+
+   irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
+   if (!irqfd)
+   return -ENOMEM;
+
+   irqfd-kvm = kvm;
  


You need to increase the refcount on struct kvm here.  Otherwise evil 
userspace will create an irqfd, close the vm and vcpu fds, and inject an 
interrupt.


Otherwise, looks good.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Re: Question about KVM and PC speaker

2009-05-05 Thread Simon Bienlein

Hello Jan,
Am 04.05.2009 23:00, schrieb Jan Kiszka:

Samuel Thibault wrote:

 Jan Kiszka, le Mon 04 May 2009 22:29:39 +0200, a écrit :

 When I boot the VM from the Lenny CD, there is no audible signal tone.

 Hmm, I successfully tested with '-soundbw pcspk' + my patches or
 -no-kvm-pit. There is probably a different, unrelated issue with your setup.


 Remember that the BIOS support for beeps is probably still missing.
 Simon, you should also test beeps from an installed Linux guest.



Yeah, I forgot... :)

Indeed, I only tested the console beep of a fully installed Linux guest.


In this environment, the beep works fine for me as well. However, 
the beep worked without patching the sources when I passed 
-no-kvm-pit. For me as a user there is no noticeable change with the patch.


Is BIOS support being worked on right now?

Thanks in advance for the further help.

Simon
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 1/3] add generic hypercall support

2009-05-05 Thread Hollis Blanchard
On Tue, 2009-05-05 at 09:24 -0400, Gregory Haskins wrote:
 
 *) PIO is more direct than MMIO, but it poses other problems such as:
   a) can have a small limited address space (x86 is 2^16)
   b) is a narrow-band interface (one 8, 16, 32, 64 bit word at a time)
   c) not available on all archs (PCI mentions ppc as problematic) and
  is therefore recommended to avoid.

Side note: I don't know what PCI has to do with this, and problematic
isn't the word I would use. ;) As far as I know, x86 is the only
still-alive architecture that implements instructions for a separate IO
space (not even ia64 does).

-- 
Hollis Blanchard
IBM Linux Technology Center

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] BIOS changes for configuring irq0-inti2 override

2009-05-05 Thread Beth Kon

Beth Kon wrote:

These patches resolve the irq0-inti2 override issue, and get the hpet working
on kvm. 
  

I've found a problem with these patches. I'll resubmit shortly.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH v4 2/2] kvm: add support for irqfd via eventfd-notification interface

2009-05-05 Thread Gregory Haskins
Avi Kivity wrote:
 Gregory Haskins wrote:
 KVM provides a complete virtual system environment for guests, including
 support for injecting interrupts modeled after the real
 exception/interrupt
 facilities present on the native platform (such as the IDT on x86).
 Virtual interrupts can come from a variety of sources (emulated devices,
 pass-through devices, etc) but all must be injected to the guest via
 the KVM infrastructure.  This patch adds a new mechanism to inject a
 specific
 interrupt to a guest using a decoupled eventfd mechanism:  Any legal
 signal
 on the irqfd (using eventfd semantics from either userspace or
 kernel) will
 translate into an injected interrupt in the guest at the next available
 interrupt window.

  
 +struct kvm_irqfd {
 +__u32 gsi;
 +__u32 flags;
 +};
 +
   

 Please add some reserved space here.

Ack.  Any rule of thumb here?  How about a __u8 pad[16] ?


 +int
 +kvm_irqfd(struct kvm *kvm, int gsi, int flags)
 +{
 +struct _irqfd *irqfd;
 +struct file *file = NULL;
 +int fd = -1;
 +int ret;
 +
 +irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
 +if (!irqfd)
 +return -ENOMEM;
 +
 +irqfd-kvm = kvm;
   

 You need to increase the refcount on struct kvm here.  Otherwise evil
 userspace will create an irqfd, close the vm and vcpu fds, and inject
 an interrupt.

Good catch.  Will fix.

Thanks Avi,
-Greg




signature.asc
Description: OpenPGP digital signature


Re: [KVM PATCH v4 2/2] kvm: add support for irqfd via eventfd-notification interface

2009-05-05 Thread Avi Kivity

Gregory Haskins wrote:
 
+struct kvm_irqfd {

+__u32 gsi;
+__u32 flags;
+};
+
  
  

Please add some reserved space here.



Ack.  Any rule of thumb here?  How about a __u8 pad[16] ?
  


I'd round it up so the whole thing is 32 bytes (not that it matters).

--
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM x86_64 with SR-IOV..? (device passthrough with LIO-Target v3.0)

2009-05-05 Thread Nicholas A. Bellinger
On Tue, 2009-05-05 at 04:28 -0700, Nicholas A. Bellinger wrote:
 On Tue, 2009-05-05 at 03:43 -0700, Nicholas A. Bellinger wrote:
  On Tue, 2009-05-05 at 09:42 +0800, Yu Zhao wrote:
   Hi,
   
   The VF also works in the host if the VF driver is programmed properly.
   So it would be easier to develop the VF driver in the host and then
   verify the VF driver in the guest.
   
   BTW, I didn't see the SR-IOV is enabled in your dmesg, did you select
   the CONFIG_PCI_IOV in the kernel .config?
   
   Thanks,
   Yu
   
  
  Greetings Yu and Sheng,
  
  So the original attachment was for the v2.6.29-fc11 host kernel output,
  I ended up jumping to v2.6.30-rc3 (and making sure CONFIG_PCI_IOV was
  enabled) for KVM host with kvm-85 and now things are looking quite
  stable for me.
  
  So far I have been able to successfully push LIO-Target v3.0 traffic
  *inside* a v2.6.29.2 KVM guest via the onboard e1000e (02:00.0) port
  from another Linux/iSCSI Initiator machine using a Intel 1 Gb/sec port.
  I am running badblocks tests to iSCSI Logical Units for RAMDISK_DR and
  FILEIO storage objects (in the KVM Guest), and they are passing
  validation and I am seeing ~500 Mb/sec of throughput and very low CPU
  usage in the KVM guests.
  
 
 Ok I am seeing another issue with the e1000e port on 02:00.0..:
 
 As i start to push multiple badblocks tests RAMDISK_DR iSCSI Logical
 units into KVM Guest running LIO v2.6.29.2 from the external Linux/iSCSI
 Initiator machine, after about 100 GB of iSCSI traffic, I see the
 following exception in KVM host v2.6.30-rc3:
 
 DRHD: handling fault status reg 2
 DMAR:[DMA Write] Request device [02:00.0] fault addr 7fc958b01 
 DMAR:[fault reason 04] Access beyond MGAW
 pci-stub :02:00.0: irq 59 for MSI/MSI-X
 pci-stub :02:00.0: irq 60 for MSI/MSI-X
 pci-stub :02:00.0: irq 61 for MSI/MSI-X
 
 I am able to restart the LIO-Target KVM Guest and the Linux/iSCSI
 Initiators are able to reconnect..  Wow, very cool..
 
 Not sure if this is a bug in the target_core_mod RAMDISK_DR subsystem
 plugin (mapping struct iovec to internally allocated struct page) or
 what.  I will have to look at the DMAR code to understand what this
 exception means..
 

Greetings Yu, Sheng and Co,

So I have been making progress this morning..  So far, I have hooked up
a LSI mpt-function PCIe SAS adapter into the KVM guest with a Sandisk
SATA SSD 32 GB drive.  It is using MSI interrupts (not MSI-X) and I am
able to push ~70 MB/sec from a 2nd Linux/iSCSI Initiator machine
(running Open-iSCSI) with the 1500 byte MTUs on e1000e ports from within
the KVM guest. 

The interesting thing is that I am having to use IBLOCK export (using
submit_bio(), and complete emulation of SCSI control path) for
SATA SSD in order to get I/O running stable.  Using the pSCSI export I am
getting immediate exceptions from scsi_execute_async() in the v2.6.29.2
KVM guest..  Using a 2nd SAS disk I am able to use target_core_mod/pSCSI
export and push badblocks and LTP disktest traffic however..

Here is a bit about how the setup looks:

*) Linux/iSCSI Initiator node accessing KVM Guest LIO-Target v3.0
storage:

subjekt:~# lsscsi
[6:0:0:0]diskATA  ST3250820AS  3.AA  /dev/sda
[10:0:0:0]   cd/dvd  PIONEER  DVD-ROM DVD-305  1.06  /dev/scd1
[18:0:0:0]   cd/dvd  TOSHIBA  DVD/HD  X807616  MC08  /dev/scd2
[32:0:0:0]   diskLIO-ORG  RAMDISK-DR   3.0   /dev/sdb
[32:0:0:1]   diskLIO-ORG  RAMDISK-DR   3.0   /dev/sdc
[32:0:0:2]   diskLIO-ORG  FILEIO   3.0   /dev/sdd
[32:0:0:3]   diskLIO-ORG  IBLOCK   3.0   /dev/sde

subjekt:~# sg_inq -i /dev/sde
VPD INQUIRY: Device Identification page
  Designation descriptor number 1, descriptor length: 20
id_type: NAA,  code_set: Binary
associated with the addressed logical unit
  NAA 6, IEEE Company_id: 0x1405
  Vendor Specific Identifier: 0xa97e4ce21
  Vendor Specific Identifier Extension: 0xc0711de829b000c2
  [0x6001405a97e4ce21c0711de829b000c2]
  Designation descriptor number 2, descriptor length: 52
id_type: T10 vendor identification,  code_set: ASCII
associated with the addressed logical unit
  vendor id: LIO-ORG
  vendor specific: IBLOCK:a97e4ce21c0711de829b000c2943d57b
  Designation descriptor number 3, descriptor length: 8
transport: Internet SCSI (iSCSI)
id_type: Relative target port,  code_set: Binary
associated with the target port
  Relative target port: 0x1
  Designation descriptor number 4, descriptor length: 8
transport: Internet SCSI (iSCSI)
id_type: Target port group,  code_set: Binary
associated with the target port
  Target port group: 0x0
  Designation descriptor number 5, descriptor length: 8
id_type: Logical unit group,  code_set: Binary
associated with the addressed logical unit
  Logical unit group: 0x0
  Designation descriptor number 6, descriptor length: 80
transport: Internet SCSI (iSCSI)
id_type: SCSI name string,  code_set: UTF-8
 

Re: [KVM PATCH v4 2/2] kvm: add support for irqfd via eventfd-notification interface

2009-05-05 Thread Gregory Haskins
Avi Kivity wrote:
 Gregory Haskins wrote:

 +int
 +kvm_irqfd(struct kvm *kvm, int gsi, int flags)
 +{
 +struct _irqfd *irqfd;
 +struct file *file = NULL;
 +int fd = -1;
 +int ret;
 +
 +irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
 +if (!irqfd)
 +return -ENOMEM;
 +
 +irqfd->kvm = kvm;
   

 You need to increase the refcount on struct kvm here.  Otherwise evil
 userspace will create an irqfd, close the vm and vcpu fds, and inject
 an interrupt.

I just reviewed the code in prep for v5, and now I remember why I didn't
take a reference:  I designed it the opposite direction:  the vm-fd owns
a reference to the irqfd, and will decouple the kvm context from the
eventfd on shutdown  (see kvm_irqfd_release()).   I still need to spin a
v5 regardless in order to add the padding as previously discussed.  But
let me know if you still see any holes in light of this alternate object
lifetime approach I am using.

-Greg




signature.asc
Description: OpenPGP digital signature


Re: [KVM PATCH v4 2/2] kvm: add support for irqfd via eventfd-notification interface

2009-05-05 Thread Avi Kivity

Gregory Haskins wrote:

Avi Kivity wrote:
  

Gregory Haskins wrote:



+int
+kvm_irqfd(struct kvm *kvm, int gsi, int flags)
+{
+struct _irqfd *irqfd;
+struct file *file = NULL;
+int fd = -1;
+int ret;
+
+irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
+if (!irqfd)
+return -ENOMEM;
+
+irqfd->kvm = kvm;
  
  

You need to increase the refcount on struct kvm here.  Otherwise evil
userspace will create an irqfd, close the vm and vcpu fds, and inject
an interrupt.



I just reviewed the code in prep for v5, and now I remember why I didnt
take a reference:  I designed it the opposite direction:  the vm-fd owns
a reference to the irqfd, and will decouple the kvm context from the
eventfd on shutdown  (see kvm_irqfd_release()).   I still need to spin a
v5 regardless in order to add the padding as previously discussed.  But
let me know if you still see any holes in light of this alternate object
lifetime approach I am using.
  


Right, irqfd_release works.  But I think refcounting is simpler, since 
we already kvm_get_kvm() and kvm_put_kvm(), and you wouldn't need the 
irqfd list.  On the other hand, I'm not sure you get a callback from 
eventfd on close(), so refcounting may not be implementable.


Drat, irqfd_release doesn't work.  You reference kvm->lock in 
irqfd_inject without taking any locks.


btw, there's still your original idea of creating the eventfd in 
userspace and passing it down.  That would be workable if we can see a 
way to both signal the eventfd and get called back in irq context.  
Maybe that's preferable to what we're doing here, but we need to see how 
it would work.


--
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH v4 2/2] kvm: add support for irqfd via eventfd-notification interface

2009-05-05 Thread Gregory Haskins
Avi Kivity wrote:
 Gregory Haskins wrote:
 Avi Kivity wrote:
  
 Gregory Haskins wrote:


 +int
 +kvm_irqfd(struct kvm *kvm, int gsi, int flags)
 +{
 +struct _irqfd *irqfd;
 +struct file *file = NULL;
 +int fd = -1;
 +int ret;
 +
 +irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
 +if (!irqfd)
 +return -ENOMEM;
 +
 +irqfd->kvm = kvm;
 
 You need to increase the refcount on struct kvm here.  Otherwise evil
 userspace will create an irqfd, close the vm and vcpu fds, and inject
 an interrupt.
 

 I just reviewed the code in prep for v5, and now I remember why I didnt
 take a reference:  I designed it the opposite direction:  the vm-fd owns
 a reference to the irqfd, and will decouple the kvm context from the
 eventfd on shutdown  (see kvm_irqfd_release()).   I still need to spin a
 v5 regardless in order to add the padding as previously discussed.  But
 let me know if you still see any holes in light of this alternate object
 lifetime approach I am using.
   

 Right, irqfd_release works.  But I think refcounting is simpler, since
 we already kvm_get_kvm() and kvm_put_kvm(), and you wouldn't need the
 irqfd list.  On the other hand, I'm not sure you get a callback from
 eventfd on close(), so refcounting may not be implementable.

;)


 Drat, irqfd_release doesn't work.  You reference kvm->lock in
 irqfd_inject without taking any locks.

I *think* this is ok, tho.  I remove myself from the waitq, and then
flush any potentially scheduled deferred work before returning.  This
all happens synchronously to the vm_release() code when the vm-fd is
being dropped, but before we actually release the struct kvm*. 
Therefore, I think kvm->lock is guaranteed to remain valid for the
duration of the irqfd_release(), and we guarantee it won't be accessed
after the irqfd_release() completes.  Or do you have a different concern?

On this topic of proper ref counts, though

I wonder if I need an extra fget() in there.  I presume that
eventfd_file_create() returns with only a single reference, yet I am
handing one to userspace and one to the irqfd, which is broken.  Or
does fd_install() bump that for me (doesn't look like it)?
Al, Davide, any comments?


 btw, there's still your original idea of creating the eventfd in
 userspace and passing it down.  That would be workable if we can see a
 way to both signal the eventfd and get called back in irq context. 
 Maybe that's preferable to what we're doing here, but we need to see
 how it would work.

We can do that, but I don't see it as changing the general problem
here.  However, I think if you find that the above comments about
kvm->lock w.r.t. irqfd_release() are ok, we don't need to worry about
it.  If you prefer the userspace allocation of eventfd() for other
reasons, we can easily go back to that model as well...but it's not
strictly necessary for this particular issue iiuc.

-Greg



signature.asc
Description: OpenPGP digital signature


[PATCH] deal with interrupt shadow state for emulated instruction

2009-05-05 Thread Glauber Costa
We currently unblock shadow interrupt state when we skip an instruction,
but fail to do so when we actually emulate one. This blocks interrupts
in key instruction blocks, in particular sti; hlt; sequences.

If the instruction emulated is an sti, we have to block shadow interrupts.
The same goes for mov ss. pop ss also needs it, but we don't currently
emulate it.

Without this patch, I cannot boot gpxe option roms on vmx machines.
This is described at https://bugzilla.redhat.com/show_bug.cgi?id=494469

Signed-off-by: Glauber Costa glom...@redhat.com
CC: H. Peter Anvin h...@zytor.com
CC: Avi Kivity a...@redhat.com
---
 arch/x86/include/asm/kvm_host.h|2 +
 arch/x86/include/asm/kvm_x86_emulate.h |6 
 arch/x86/kvm/svm.c |   25 +++-
 arch/x86/kvm/vmx.c |   49 ++--
 arch/x86/kvm/x86.c |7 -
 arch/x86/kvm/x86_emulate.c |   21 +-
 6 files changed, 98 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8e680c3..a49d07b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -510,6 +510,8 @@ struct kvm_x86_ops {
void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+   void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+   u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
unsigned char *hypercall_addr);
void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h 
b/arch/x86/include/asm/kvm_x86_emulate.h
index 6a15973..b7ed2c4 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -143,6 +143,9 @@ struct decode_cache {
struct fetch_cache fetch;
 };
 
+#define X86_SHADOW_INT_MOV_SS  1
+#define X86_SHADOW_INT_STI 2
+
 struct x86_emulate_ctxt {
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
@@ -152,6 +155,9 @@ struct x86_emulate_ctxt {
int mode;
u32 cs_base;
 
+   /* interruptibility state, as a result of execution of STI or MOV SS */
+   int interruptibility;
+
/* decode cache */
struct decode_cache decode;
 };
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ef43a18..4941dea 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -202,6 +202,27 @@ static int is_external_interrupt(u32 info)
return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 }
 
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_svm *svm = to_svm(vcpu);
+   u32 ret = 0;
+
+   if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
+   ret |= (X86_SHADOW_INT_STI  X86_SHADOW_INT_MOV_SS);
+   return ret;
+}
+
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+   struct vcpu_svm *svm = to_svm(vcpu);
+
+   if (mask == 0)
+   svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+   else
+   svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+
+}
+
 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
@@ -215,7 +236,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
   __func__, kvm_rip_read(vcpu), svm-next_rip);
 
kvm_rip_write(vcpu, svm-next_rip);
-   svm-vmcb-control.int_state = ~SVM_INTERRUPT_SHADOW_MASK;
+   svm_set_interrupt_shadow(vcpu, 0);
 }
 
 static int has_svm(void)
@@ -2637,6 +2658,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.run = svm_vcpu_run,
.handle_exit = handle_exit,
.skip_emulated_instruction = skip_emulated_instruction,
+   .set_interrupt_shadow= svm_set_interrupt_shadow,
+   .get_interrupt_shadow = svm_get_interrupt_shadow,
.patch_hypercall = svm_patch_hypercall,
.set_irq = svm_set_irq,
.set_nmi = svm_inject_nmi,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e8a5649..bbfe894 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -736,23 +736,52 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, 
unsigned long rflags)
vmcs_writel(GUEST_RFLAGS, rflags);
 }
 
+static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+   u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+   int ret = 0;
+
+   if (interruptibility & GUEST_INTR_STATE_STI)
+   ret |= X86_SHADOW_INT_STI;
+   if (interruptibility & GUEST_INTR_STATE_MOV_SS)
+   ret |= X86_SHADOW_INT_MOV_SS;
+
+   return ret;
+}
+
+static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+   u32 

RE: [PATCH] reserved-ram for pci-passthrough without VT-d capable hardware

2009-05-05 Thread Passera, Pablo R
Andrea,
Thanks for your answers. I already patched the kernel and kvm 
(including rombios). The host boots up and the memory mapping is as explained 
in the patch. Now I am trying to launch a vm using memory mapping but it hangs 
after opening the sdl windows and before showing the bios messages. I am 
running qemu command from a console in the host that is running X and the 
command line is the following:

qemu-system-x86_64 -hda ./dm.img -cdrom /dev/sr0 -m 32 -reserved-ram -boot d

- Is this command line correct?
- Should I run the vm without having started the X in the host machine?
- What should I see after starting the vm? Should the vm take ownership of the 
video card?

Thanks,
Pablo

-Original Message-
From: Andrea Arcangeli [mailto:aarca...@redhat.com]
Sent: Tuesday, April 28, 2009 3:06 PM
To: Passera, Pablo R
Cc: kvm@vger.kernel.org
Subject: Re: [PATCH] reserved-ram for pci-passthrough without VT-d
capable hardware

On Tue, Apr 28, 2009 at 07:35:26AM -0600, Passera, Pablo R wrote:
 - Against which kernel version was this patch generated?

I don't remember exactly (I was just using an upstream hg checkout and
I didn't record its hash value) but I think you can go back to when
e820.c was still shared and it'll likely apply and work.

 - Did you try this on a 32 or 64 bits system?

I only tested it on 64bit but there's no reason why it shouldn't work
on 32bit too.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] BIOS changes for configuring irq0-inti2 override

2009-05-05 Thread Sebastian Herbszt

Beth Kon wrote:

@@ -477,6 +480,7 @@ void wrmsr_smp(uint32_t index, uint64_t val)
#define QEMU_CFG_SIGNATURE  0x00
#define QEMU_CFG_ID 0x01
#define QEMU_CFG_UUID   0x02
+#define QEMU_CFG_IRQ0_OVERRIDE 0x0e


Small thing to consider before you resubmit:
In his patch read-additional-acpi-tables-from-a-vm.patch Gleb introduced:

#define QEMU_CFG_ARCH_LOCAL 0x8000
#define QEMU_CFG_ACPI_TABLES  (QEMU_CFG_ARCH_LOCAL + 0)

I think the idea behind this was to separate the generic part from the arch 
specific part.
The IRQ0 override seems to be arch specific (x86 only?) just like the ACPI 
tables, right?

- Sebastian

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Re: Question about KVM and PC speaker

2009-05-05 Thread Sebastian Herbszt

Simon Bienlein wrote:

Is a support for BIOS worked on right now?


The vgabios (vgabios.c) has a "FIXME should beep". Volker, do you plan to fix 
this?

Which frequency should be used for the beep? Which delay?
Getting a delay using inb(0x61) & 0x10 is still a no go on qemu, right?

- Sebastian

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/3] generic hypercall support

2009-05-05 Thread Chris Wright
* Gregory Haskins (gregory.hask...@gmail.com) wrote:
 So you would never have someone making a generic
 hypercall(KVM_HC_MMU_OP).  I agree.

Which is why I think the interface proposal you've made is wrong.  There's
already hypercall interfaces w/ specific ABI and semantic meaning (which
are typically called directly/indirectly from an existing pv op hook).

But a free-form hypercall(unsigned long nr, unsigned long *args, size_t count)
means hypercall number and arg list must be the same in order for code
to call hypercall() in a hypervisor agnostic way.

The pv_ops level need to have semantic meaning, not a free form
hypercall multiplexor.

thanks,
-chris
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qemu/hw/device-assignment: questions about msix_table_page

2009-05-05 Thread Sheng Yang
On Tuesday 05 May 2009 20:46:04 Michael S. Tsirkin wrote:
 On Tue, May 05, 2009 at 07:49:10AM -0300, Marcelo Tosatti wrote:
  On Tue, May 05, 2009 at 01:34:50PM +0300, Michael S. Tsirkin wrote:
   On Tue, May 05, 2009 at 07:19:45AM -0300, Marcelo Tosatti wrote:
On Tue, May 05, 2009 at 12:51:36PM +0300, Michael S. Tsirkin wrote:
 On Mon, Apr 27, 2009 at 10:30:17PM +0800, Sheng Yang wrote:
  If guest can write to the real device MSI-X table
  directly, it would cause chaos on interrupt delivery, for
  what guest see is totally different with what's host
  see...

 Obviously.

 Thanks,

 What's the reason that this page is unmapped from the qemu memory
 space? Specifically what do these lines do:
 int offset = r_dev-msix_table_addr -
 real_region-base_addr; ret = munmap(region-u.r_virtbase + offset,
 TARGET_PAGE_SIZE);
   
I believe this allows accesses to this page (the MSI-X table), which
is part of the guest address space (through kvm memory slots), to be
trapped by qemu.
   
Since there is no actual page in this guest address, KVM treats
accesses as MMIO and forwards them to QEMU.
  
   I thought about this too.
   But why is this necessary for assigned MSI-X but not for emulated
   devices such as e.g. e1000? All e1000 does seems to be
   cpu_register_physical_memory ...
 
  Because there is no registered (kvm) memory slot for the range which
  e1000 registers its MMIO? Not sure about the address of the MSI-X table
  page, but you could achieve the same effect by splitting the slot which
  it lives in two, with a 1 page hole between them.

 You could also move the emulated MSI-X table, sticking it on top of the
 existing BAR. Since PCI config includes the pointer to the table,
 a driver that reads this pointer will continue to work.

One BAR can contain more than an MSI-X table... The PCI spec only says that 
other information should be page aligned and can't be in the same page as the 
MSI-X table (except the PBA). I think this method makes things more complicated; 
we don't want to, and can't, trap other information in the same BAR...

 Of course, there's no guarantee that guest drivers don't just hard-code
 this offset.

I think this mostly won't happen.

  BTW this is why you can't map the MSI-X table page directly, you want
  accesses to be trapped.

 BTW current design won't work if the base page size is > 4K, will it?
 The hole covers a page, so you'll get faults outside the MSI-X table.

Yes. One entry for MSI-X is 16 bytes, so one page can contain 256 entries. Well, I 
haven't seen a device with more than 100 entries, but because of this limitation, 
maybe we should temporarily limit MSI-X max entries to 256 (rather than the 
current 512 entries)...

-- 
regards
Yang, Sheng
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: PCI device assignment over KVM

2009-05-05 Thread Sheng Yang
On Wednesday 06 May 2009 08:42:37 Tong Liu wrote:
 Hi Sheng,

 I have installed the latest KVM-85 release and the "Failed to assign irq"
 error goes away.
 However my device kernel driver on guest OS (RHEL5u2) can't query my
 card successfully. Our developer said there is memory mapping error
 during KVM device assignment.

Could your developer elaborate it? 

 I am providing all the steps I have done and captures the output as
 pictures here.
 Can you help take a look and see if really my device is not supported by
 KVM yet?

Everything else looks fine. I think the key point should be that memory 
mapping error.

-- 
regards
Yang, Sheng


 Step 1: Before device assignment, /proc/interrupts has my device which
 is not sharing IRQ with others due to MSI-x enabled
 Picture 1.jpg. Even though dmesg shows it is using IRQ 18 during boot
 but /proc shows IRQ 58, etc, I guess it is changed to that number by
 MSI-X after boot.

 Step 2: Unbinding my device :01:00.0 from host and start kvm guest
 Picture 2.jpg, some phy mem error reported but I am assuming it is not
 critical

 At this moment, /proc/interrupts on local machine is changed. My device
 has disappeared and another entry is created: kvm_assigned_intx_device
 as shown in picture 3.jpg

 And dmesg didn't show obvious errors too, it shows pci device assigned.
 Picture 4.jpg

 Log into guest OS(RHEL5u2), run dmesg, it shows errors about my device
 which means it is not passed successfully.
 Picture 5.jpg


 I tried other device :06:00.1 which is OK.

 Regards
 Tong


 -Original Message-
 From: Sheng Yang [mailto:sh...@linux.intel.com]
 Sent: Monday, May 04, 2009 9:35 PM
 To: Tong Liu
 Subject: Re: PCI device assignment over KVM

 On Tuesday 05 May 2009 12:06:09 Tong Liu wrote:
  BTW, as you mentioned, after boot up, 06:00.0 got IRQ 56 because it's
  MSI enabled.
  And 01:00.0 is using IRQ18, actually they are not sharing IRQ with any
 
  others.
  (Even though 06:00.0 was using IRQ 18 with 01:00.0 during boot but it
  is converted to 56 after boot up, so nothing shared)
 
  Why is KVM still trying to enable 01:00.0 INTx (dmesg error I put in
  the first email) if 01:00.0 is not sharing IRQ with any other after

 boot?

 You can use cat /proc/interrupts to know if there are other IRQ
 handler for IRQ 18. Also lspci -v.

 If you are sure that there is no sharing interrupt for IRQ 18, you may
 need to look into INTx enabling part of KVM
 (virt/kvm/kvm_main.c:kvm_vm_ioctl_assign_irq()) to know what's happening
 exactly.

 And please try the latest KVM and qemu-kvm as well (or the kvm-85 release).
 Your dmesg shows that your version of KVM is old - there is nothing like
 "failed to enable INTx device!" in the current code. We reworked the
 framework two months ago.

 --
 regards
 Yang, Sheng

  Thanks
  Tong
 
 
  -Original Message-
  From: Sheng Yang [mailto:sh...@linux.intel.com]
  Sent: Monday, May 04, 2009 6:26 PM
  To: Tong Liu
  Subject: Re: PCI device assignment over KVM
 
  On Tuesday 05 May 2009 05:32:32 Tong Liu wrote:
   Hi Sheng,
  
   My system has VT-d support and I want to assign one PCI-E card to
   guest OS.
   I have an issue with PCI device assignment over KVM.
   I am using latest kernel 2.6.30-rc4.
  
   Here are the steps I have done:
  
   1. Unbind PCI device 01:00.0 from host using steps documented on KVM
  
   website.
   http://www.linux-kvm.org/page/How_to_assign_devices_with_VT-d_in_KVM
   2. Then run the following command: qemu-system-x86_64 -m 4096 -boot
   c -net none -hda vdisk.img -pcidevice host=01:00.0 It reports

 errors:
   Assign_irq: deassign: Invalid argument Failed to assign irq for
   01:00.0: Input/output error Perhaps you are assigning a device
   that shares an IRQ with another device?
   Failed to deassign device 01:00.0 : Invalid argument
  
   In dmesg it shows the following error:
   pci-stub :01:00.0: PCI INT A - GSI 18 (level, low) - IRQ 18
   assign device: host bdf = 1:0:0
   kvm: failed to enable INTx device!
   pci-stub :01:00.0: PCI INT A disabled
   kvm_vm_ioctl_deassign_device: device hasn't been assigned before, so
  
   cannot be deassigned
  
   Info for my device :01:00.0:
   lspci -t:
  
   -[:00]-+-00.0
  +-05.0-[:01]00.0
  
   +-09.0-[:02-07]--+-00.0-[:03-06]--+-00.0-[:04]--
 
  +-01.0-[:05]--
 
   \-02.0-[:06]--+-00.0
  
   \-00.1
  
  |\-00.3-[:07]--
  
   lspci -v:
  
   01:00.0 InfiniBand: Mellanox Technologies MT26428 [ConnectX IB QDR,
   PCIe 2.0 5GT/s] (rev a0)
   Subsystem: Mellanox Technologies Unknown device 0005
   Flags: fast devsel, IRQ 18
   Memory at 9930 (64-bit, non-prefetchable) [size=1M]
   Memory at 9800 (64-bit, prefetchable) [size=8M]
   Capabilities: [40] Power Management version 3
   Capabilities: [48] Vital Product Data
   Capabilities: [9c] MSI-X: Enable+ Mask- TabSize=256
   Capabilities: [60] Express 

Re: KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY should support MSI?

2009-05-05 Thread Sheng Yang
On Tuesday 05 May 2009 20:08:40 Avi Kivity wrote:
 Michael S. Tsirkin wrote:
  On Tue, May 05, 2009 at 02:57:10PM +0300, Avi Kivity wrote:
  Michael S. Tsirkin wrote:
  On Tue, May 05, 2009 at 08:04:15AM -0300, Marcelo Tosatti wrote:
  On Tue, May 05, 2009 at 01:30:28PM +0300, Michael S. Tsirkin wrote:
  The new KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls
  have been merged for 2.6.30. However, I note that PCI spec allows
  devices to support multiple vectors with MSI as well (support will be
  in linux 2.6.30).

Well, one question: when were they merged? IIRC, MSI-X related things are still 
pending for 2.6.31... :)

 
  Even though qemu for now only uses a single vector with MSI, it would
  seem that it's better to make the kernel/user interface generic
  straight away rather than add more ioctls later. What do you think?
  It might not be too late to fix this for 2.6.30.
 
  Can't you use more than one KVM_ASSIGN_SET_MSIX_ENTRY call per
  assigned device?
 
  Sure, but only one KVM_ASSIGN_SET_MSIX_NR.
 
  MSIX_NR is the size of the table, while MSIX_ENTRY updates a single
  entry, if I read the code correctly.
 
  Right. So we'll need something like this for MSI as well.
  Actually maybe MSIX_NR MSIX_ENTRY should be renamed to MSI_NR / MSI_ENTRY
  and changed to do the right thing depending on the IRQ type?

 Works for me.  Sheng, is there a reason why it wasn't done like this?

No, I think it's fine. Also some related structures should be modified. And one 
flag field should be added to kvm_assigned_msix_nr and 
kvm_assigned_msix_entry (using the padding ones) to indicate the interrupt type, 
because we can't determine the irq type from the device's status at that time.

-- 
regards
Yang, Sheng
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM x86_64 with SR-IOV..? (device passthrough with LIO-Target v3.0)

2009-05-05 Thread Sheng Yang
On Tuesday 05 May 2009 18:43:46 Nicholas A. Bellinger wrote:
 On Tue, 2009-05-05 at 09:42 +0800, Yu Zhao wrote:
  Hi,
 
  The VF also works in the host if the VF driver is programed properly.
  So it would be easier to develop the VF driver in the host and then
  verify the VF driver in the guest.
 
  BTW, I didn't see the SR-IOV is enabled in your dmesg, did you select
  the CONFIG_PCI_IOV in the kernel .config?
 
  Thanks,
  Yu

 Greetings Yu and Sheng,

 So the original attachment was for the v2.6.29-fc11 host kernel output,
 I ended up jumping to v2.6.30-rc3 (and making sure CONFIG_PCI_IOV was
 enabled) for KVM host with kvm-85 and now things are looking quite
 stable for me.

 So far I have been able to successfully push LIO-Target v3.0 traffic
 *inside* a v2.6.29.2 KVM guest via the onboard e1000e (02:00.0) port
 from another Linux/iSCSI Initiator machine using a Intel 1 Gb/sec port.
 I am running badblocks tests to iSCSI Logical Units for RAMDISK_DR and
 FILEIO storage objects (in the KVM Guest), and they are passing
 validation and I am seeing ~500 Mb/sec of throughput and very low CPU
 usage in the KVM guests.

 One issue I did notice while using the pci-stub method of
 device-assignment with same e1000 port (02:00.0) was while using an
 iSCSI Initiator (Open-iSCSI) on the KVM Host machine and doing sustained
 traffic into the LIO-Target KVM Guest on the same local KVM host to max
 out traffic between the other onboard e1000e port (03.00.0), I see the
 following:

 pci-stub :02:00.0: PCI INT A - GSI 17 (level, low) - IRQ 17
 assign device: host bdf = 2:0:0
 pci-stub :02:00.0: irq 59 for MSI/MSI-X
 pci-stub :02:00.0: irq 59 for MSI/MSI-X
 pci-stub :02:00.0: irq 59 for MSI/MSI-X
 pci-stub :02:00.0: irq 59 for MSI/MSI-X
 pci-stub :02:00.0: irq 59 for MSI/MSI-X
 pci-stub :02:00.0: irq 60 for MSI/MSI-X
 pci-stub :02:00.0: irq 61 for MSI/MSI-X
 scsi4 : iSCSI Initiator over TCP/IP
 scsi 4:0:0:0: Direct-Access LIO-ORG  RAMDISK-DR   3.0  PQ: 0 ANSI:
 5 sd 4:0:0:0: Attached scsi generic sg1 type 0
 scsi 4:0:0:1: Direct-Access LIO-ORG  RAMDISK-DR   3.0  PQ: 0 ANSI:
 5 sd 4:0:0:1: Attached scsi generic sg2 type 0
 sd 4:0:0:0: [sdb] 262144 512-byte hardware sectors: (134 MB/128 MiB)
 sd 4:0:0:1: [sdc] 262144 512-byte hardware sectors: (134 MB/128 MiB)
 sd 4:0:0:0: [sdb] Write Protect is off
 sd 4:0:0:0: [sdb] Mode Sense: 2f 00 00 00
 sd 4:0:0:1: [sdc] Write Protect is off
 sd 4:0:0:1: [sdc] Mode Sense: 2f 00 00 00
 sd 4:0:0:0: [sdb] Write cache: disabled, read cache: enabled, doesn't
 support DPO or FUA sd 4:0:0:1: [sdc] Write cache: disabled, read cache:
 enabled, doesn't support DPO or FUA sdb:6 sdc: unknown partition table
 sd 4:0:0:0: [sdb] Attached SCSI disk
  unknown partition table
 sd 4:0:0:1: [sdc] Attached SCSI disk
 [ cut here ]
 WARNING: at kernel/irq/manage.c:260 enable_irq+0x36/0x50()
 Hardware name: empty
 Unbalanced enable for IRQ 59
 Modules linked in: ipt_REJECT xt_tcpudp bridge stp sunrpc iptable_filter
 ip_tables xt_state nf_conntrack ip6table_filter ip6_tables x_tables ib_iser
 rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 iscsi_tcp
 libiscsi_tcp libiscsi scsi_transport_iscsi cpufreq_ondemand acpi_cpufreq
 freq_table ext3 jbd loop dm_multipath scsi_dh kvm_intel kvm uinput i2c_i801
 firewire_ohci joydev firewire_core sg i2c_core 8250_pnp crc_itu_t e1000e
 8250 serial_core rtc_cmos pcspkr serio_raw rtc_core rtc_lib button sd_mod
 dm_snapshot dm_zero dm_mirror dm_region_hash dm_log dm_mod uhci_hcd
 ohci_hcd ehci_hcd ata_piix libata scsi_mod [last unloaded: microcode] Pid:
 51, comm: events/0 Tainted: GW  2.6.30-rc3 #11
 Call Trace:
  [80235fee] ? warn_slowpath+0xcb/0xe8
  [80253a7c] ? generic_exec_single+0x6a/0x88
  [8022acec] ? update_curr+0x67/0xeb
  [a0198748] ? vcpu_kick_intr+0x0/0x1 [kvm]
  [8020a5d8] ? __switch_to+0xb6/0x274
  [8022b70a] ? __dequeue_entity+0x1b/0x2f
  [a01ac7e4] ? kvm_irq_delivery_to_apic+0xb3/0xf7 [kvm]
  [a01aa4d4] ? __apic_accept_irq+0x15a/0x173 [kvm]
  [a01ac883] ? kvm_set_msi+0x5b/0x60 [kvm]
  [80266d97] ? enable_irq+0x36/0x50
  [a0195ab5] ? kvm_assigned_dev_interrupt_work_handler+0x6d/0xbc
 [kvm] [802449fa] ? worker_thread+0x182/0x223
  [8024820b] ? autoremove_wake_function+0x0/0x2a
  [80244878] ? worker_thread+0x0/0x223
  [80244878] ? worker_thread+0x0/0x223
  [80247e72] ? kthread+0x54/0x7e
  [8020cb0a] ? child_rip+0xa/0x20
  [804d0af5] ? _spin_lock+0x5/0x8
  [80247e1e] ? kthread+0x0/0x7e
  [8020cb00] ? child_rip+0x0/0x20
 ---[ end trace 3fbc2dd20bf89ef1 ]---
  connection1:0: ping timeout of 5 secs expired, last rx 4295286327, last
 ping 4295285518, now 4295286768 connection1:0: detected conn error (1011)

 Attached are the v2.6.30-rc3 KVM host and v2.6.29.2 KVM guest dmesg
 output.  When the 'Unbalanced enable for IRQ 

Re: KVM x86_64 with SR-IOV..? (device passthrough with LIO-Target v3.0)

2009-05-05 Thread Sheng Yang
On Tuesday 05 May 2009 19:28:15 Nicholas A. Bellinger wrote:
 On Tue, 2009-05-05 at 03:43 -0700, Nicholas A. Bellinger wrote:
  On Tue, 2009-05-05 at 09:42 +0800, Yu Zhao wrote:
   Hi,
  
   The VF also works in the host if the VF driver is programed properly.
   So it would be easier to develop the VF driver in the host and then
   verify the VF driver in the guest.
  
   BTW, I didn't see the SR-IOV is enabled in your dmesg, did you select
   the CONFIG_PCI_IOV in the kernel .config?
  
   Thanks,
   Yu
 
  Greetings Yu and Sheng,
 
  So the original attachment was for the v2.6.29-fc11 host kernel output,
  I ended up jumping to v2.6.30-rc3 (and making sure CONFIG_PCI_IOV was
  enabled) for KVM host with kvm-85 and now things are looking quite
  stable for me.
 
  So far I have been able to successfully push LIO-Target v3.0 traffic
  *inside* a v2.6.29.2 KVM guest via the onboard e1000e (02:00.0) port
  from another Linux/iSCSI Initiator machine using a Intel 1 Gb/sec port.
  I am running badblocks tests to iSCSI Logical Units for RAMDISK_DR and
  FILEIO storage objects (in the KVM Guest), and they are passing
  validation and I am seeing ~500 Mb/sec of throughput and very low CPU
  usage in the KVM guests.

 Ok I am seeing another issue with the e1000e port on 02:00.0..:

 As i start to push multiple badblocks tests RAMDISK_DR iSCSI Logical
 units into KVM Guest running LIO v2.6.29.2 from the external Linux/iSCSI
 Initiator machine, after about 100 GB of iSCSI traffic, I see the
 following exception in KVM host v2.6.30-rc3:

 DRHD: handling fault status reg 2
 DMAR:[DMA Write] Request device [02:00.0] fault addr 7fc958b01
 DMAR:[fault reason 04] Access beyond MGAW

This means the fault address is too big. It is 51 bits wide, which is
far beyond the physical address limit of current IA32e (48 bits).

Don't know how you can get this...

-- 
regards
Yang, Sheng

 pci-stub :02:00.0: irq 59 for MSI/MSI-X
 pci-stub :02:00.0: irq 60 for MSI/MSI-X
 pci-stub :02:00.0: irq 61 for MSI/MSI-X

 I am able to restart the LIO-Target KVM Guest and the Linux/iSCSI
 Initiators are able to reconnect..  Wow, very cool..

 Not sure if this is a bug in the target_core_mod RAMDISK_DR subsystem
 plugin (mapping struct iovec to internally allocated struct page) or
 what.  I will have to look at the DMAR code to understand what this
 exception means..

 --nab

  One issue I did notice while using the pci-stub method of
  device-assignment with same e1000 port (02:00.0) was while using an
  iSCSI Initiator (Open-iSCSI) on the KVM Host machine and doing sustained
  traffic into the LIO-Target KVM Guest on the same local KVM host to max
  out traffic between the other onboard e1000e port (03.00.0), I see the
  following:
 
  pci-stub :02:00.0: PCI INT A - GSI 17 (level, low) - IRQ 17
  assign device: host bdf = 2:0:0
  pci-stub :02:00.0: irq 59 for MSI/MSI-X
  pci-stub :02:00.0: irq 59 for MSI/MSI-X
  pci-stub :02:00.0: irq 59 for MSI/MSI-X
  pci-stub :02:00.0: irq 59 for MSI/MSI-X
  pci-stub :02:00.0: irq 59 for MSI/MSI-X
  pci-stub :02:00.0: irq 60 for MSI/MSI-X
  pci-stub :02:00.0: irq 61 for MSI/MSI-X
  scsi4 : iSCSI Initiator over TCP/IP
  scsi 4:0:0:0: Direct-Access LIO-ORG  RAMDISK-DR   3.0  PQ: 0
  ANSI: 5 sd 4:0:0:0: Attached scsi generic sg1 type 0
  scsi 4:0:0:1: Direct-Access LIO-ORG  RAMDISK-DR   3.0  PQ: 0
  ANSI: 5 sd 4:0:0:1: Attached scsi generic sg2 type 0
  sd 4:0:0:0: [sdb] 262144 512-byte hardware sectors: (134 MB/128 MiB)
  sd 4:0:0:1: [sdc] 262144 512-byte hardware sectors: (134 MB/128 MiB)
  sd 4:0:0:0: [sdb] Write Protect is off
  sd 4:0:0:0: [sdb] Mode Sense: 2f 00 00 00
  sd 4:0:0:1: [sdc] Write Protect is off
  sd 4:0:0:1: [sdc] Mode Sense: 2f 00 00 00
  sd 4:0:0:0: [sdb] Write cache: disabled, read cache: enabled, doesn't
  support DPO or FUA sd 4:0:0:1: [sdc] Write cache: disabled, read cache:
  enabled, doesn't support DPO or FUA sdb:6 sdc: unknown partition table
  sd 4:0:0:0: [sdb] Attached SCSI disk
   unknown partition table
  sd 4:0:0:1: [sdc] Attached SCSI disk
  [ cut here ]
  WARNING: at kernel/irq/manage.c:260 enable_irq+0x36/0x50()
  Hardware name: empty
  Unbalanced enable for IRQ 59
  Modules linked in: ipt_REJECT xt_tcpudp bridge stp sunrpc iptable_filter
  ip_tables xt_state nf_conntrack ip6table_filter ip6_tables x_tables
  ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 iscsi_tcp
  libiscsi_tcp libiscsi scsi_transport_iscsi cpufreq_ondemand acpi_cpufreq
  freq_table ext3 jbd loop dm_multipath scsi_dh kvm_intel kvm uinput
  i2c_i801 firewire_ohci joydev firewire_core sg i2c_core 8250_pnp
  crc_itu_t e1000e 8250 serial_core rtc_cmos pcspkr serio_raw rtc_core
  rtc_lib button sd_mod dm_snapshot dm_zero dm_mirror dm_region_hash dm_log
  dm_mod uhci_hcd ohci_hcd ehci_hcd ata_piix libata scsi_mod [last
  unloaded: microcode] Pid: 51, comm: events/0 Tainted: GW 
  2.6.30-rc3 #11

Re: KVM x86_64 with SR-IOV..? (device passthrough with LIO-Target v3.0)

2009-05-05 Thread Sheng Yang
On Wednesday 06 May 2009 01:45:47 Nicholas A. Bellinger wrote:
 On Tue, 2009-05-05 at 04:28 -0700, Nicholas A. Bellinger wrote:
  On Tue, 2009-05-05 at 03:43 -0700, Nicholas A. Bellinger wrote:
   On Tue, 2009-05-05 at 09:42 +0800, Yu Zhao wrote:
Hi,
   
The VF also works in the host if the VF driver is programed properly.
So it would be easier to develop the VF driver in the host and then
verify the VF driver in the guest.
   
BTW, I didn't see the SR-IOV is enabled in your dmesg, did you select
the CONFIG_PCI_IOV in the kernel .config?
   
Thanks,
Yu
  
   Greetings Yu and Sheng,
  
   So the original attachment was for the v2.6.29-fc11 host kernel output,
   I ended up jumping to v2.6.30-rc3 (and making sure CONFIG_PCI_IOV was
   enabled) for KVM host with kvm-85 and now things are looking quite
   stable for me.
  
   So far I have been able to successfully push LIO-Target v3.0 traffic
   *inside* a v2.6.29.2 KVM guest via the onboard e1000e (02:00.0) port
   from another Linux/iSCSI Initiator machine using a Intel 1 Gb/sec port.
   I am running badblocks tests to iSCSI Logical Units for RAMDISK_DR and
   FILEIO storage objects (in the KVM Guest), and they are passing
   validation and I am seeing ~500 Mb/sec of throughput and very low CPU
   usage in the KVM guests.
 
  Ok I am seeing another issue with the e1000e port on 02:00.0..:
 
  As i start to push multiple badblocks tests RAMDISK_DR iSCSI Logical
  units into KVM Guest running LIO v2.6.29.2 from the external Linux/iSCSI
  Initiator machine, after about 100 GB of iSCSI traffic, I see the
  following exception in KVM host v2.6.30-rc3:
 
  DRHD: handling fault status reg 2
  DMAR:[DMA Write] Request device [02:00.0] fault addr 7fc958b01
  DMAR:[fault reason 04] Access beyond MGAW
  pci-stub :02:00.0: irq 59 for MSI/MSI-X
  pci-stub :02:00.0: irq 60 for MSI/MSI-X
  pci-stub :02:00.0: irq 61 for MSI/MSI-X
 
  I am able to restart the LIO-Target KVM Guest and the Linux/iSCSI
  Initiators are able to reconnect..  Wow, very cool..
 
  Not sure if this is a bug in the target_core_mod RAMDISK_DR subsystem
  plugin (mapping struct iovec to internally allocated struct page) or
  what.  I will have to look at the DMAR code to understand what this
  exception means..

 Greetings Yu, Sheng and Co,

 So I have been making progress this morning..  So far, I have hooked up
 a LSI mpt-function PCIe SAS adapter into the KVM guest with a Sandisk
 SATA SSD 32 GB drive.  It is using MSI interrupts (not MSI-X) and I am
 able to push ~70 MB/sec from a 2nd Linux/iSCSI Initiator machine
 (running Open-iSCSI) with the 1500 byte MTUs on e1000e ports from within
 the KVM guest.

Is it that MSI-X can't be enabled, or does the device only have MSI capability?
Just curious...

 The interesting thing is that I am having to use IBLOCK export (using
 submit_bio(), and complete emulation of SCSI control path) for
 SATA SSD in order to get I/O running stable  Using the pSCSI export I am
 getting immediate exceptions from scsi_execute_async() in the v2.6.29.2
 KVM guest..  

I didn't see the exception in the log below... (And it is buried in iSCSI log
output that I can't understand. Looking forward to help from others...) Does
anything notable show up on the host side? I think the target now is to get
pSCSI working well?

BTW: Maybe you can try the patch from Marcelo titled [patch 0/4] use 
smp_send_reschedule in vcpu_kick / assigned dev host intx race fix.

-- 
regards
Yang, Sheng


 Using a 2nd SAS disk I am able to use target_core_mod/pSCSI
 export and push badblocks and LTP disktest traffic however..

 Here is a bit about how the setup looks,

 *) Linux/iSCSI Initiator node accessing KVM Guest LIO-Target v3.0
 storage:

 subjekt:~# lsscsi
 [6:0:0:0]diskATA  ST3250820AS  3.AA  /dev/sda
 [10:0:0:0]   cd/dvd  PIONEER  DVD-ROM DVD-305  1.06  /dev/scd1
 [18:0:0:0]   cd/dvd  TOSHIBA  DVD/HD  X807616  MC08  /dev/scd2
 [32:0:0:0]   diskLIO-ORG  RAMDISK-DR   3.0   /dev/sdb
 [32:0:0:1]   diskLIO-ORG  RAMDISK-DR   3.0   /dev/sdc
 [32:0:0:2]   diskLIO-ORG  FILEIO   3.0   /dev/sdd
 [32:0:0:3]   diskLIO-ORG  IBLOCK   3.0   /dev/sde

 subjekt:~# sg_inq -i /dev/sde
 VPD INQUIRY: Device Identification page
   Designation descriptor number 1, descriptor length: 20
 id_type: NAA,  code_set: Binary
 associated with the addressed logical unit
   NAA 6, IEEE Company_id: 0x1405
   Vendor Specific Identifier: 0xa97e4ce21
   Vendor Specific Identifier Extension: 0xc0711de829b000c2
   [0x6001405a97e4ce21c0711de829b000c2]
   Designation descriptor number 2, descriptor length: 52
 id_type: T10 vendor identification,  code_set: ASCII
 associated with the addressed logical unit
   vendor id: LIO-ORG
   vendor specific: IBLOCK:a97e4ce21c0711de829b000c2943d57b
   Designation descriptor number 3, descriptor length: 8
 transport: Internet SCSI (iSCSI)
 id_type: Relative target port,  code_set: 

Re: [RFC PATCH 0/3] generic hypercall support

2009-05-05 Thread Gregory Haskins
Chris Wright wrote:
 * Gregory Haskins (gregory.hask...@gmail.com) wrote:
   
 So you would never have someone making a generic
 hypercall(KVM_HC_MMU_OP).  I agree.
 

 Which is why I think the interface proposal you've made is wrong.

I respectfully disagree.  It's only wrong in that the name chosen for the
interface was perhaps too broad/vague.  I still believe the concept is
sound, and the general layering is appropriate. 

   There's
 already hypercall interfaces w/ specific ABI and semantic meaning (which
 are typically called directly/indirectly from an existing pv op hook).
   

Yes, these are different, thus the new interface.

 But a free-form hypercall(unsigned long nr, unsigned long *args, size_t count)
 means hypercall number and arg list must be the same in order for code
 to call hypercall() in a hypervisor agnostic way.
   

Yes, and that is exactly the intention.  I think its perhaps the point
you are missing.

I am well aware that historically the things we do over a hypercall
interface would inherently have meaning only to a specific hypervisor
(e.g. KVM_HC_MMU_OPS (vector 2) via kvm_hypercall()).  However, this
doesn't in any way imply that it is the only use for the general
concept.  It's just the only way they have been exploited to date.

While I acknowledge that the hypervisor certainly must be coordinated
with their use, in their essence hypercalls are just another form of IO
joining the ranks of things like MMIO and PIO.  This is an attempt to
bring them out of the bowels of CONFIG_PARAVIRT to make them a first
class citizen. 

The thing I am building here is really not a general hypercall in the
broad sense.  Rather, it's a subset of the hypercall vector namespace.
It is designed specifically for dynamic binding a synchronous call()
interface to things like virtual devices, and it is therefore these
virtual device models that define the particular ABI within that
namespace.  Thus the ABI in question is explicitly independent of the
underlying hypervisor.  I therefore stand by the proposed design to have
this interface described above the hypervisor support layer (i.e.
pv_ops) (albeit with perhaps a better name like dynamic hypercall as
per my later discussion with Avi).

Consider PIO: The hypervisor (or hardware) and OS negotiate a port
address, but the two end-points are the driver and the device-model (or
real device).  The driver doesn't have to say:

if (kvm)
   kvm_iowrite32(addr, ..);
else if (lguest)
   lguest_iowrite32(addr, ...);
else
   native_iowrite32(addr, ...);

Instead, it just says iowrite32(addr, ...); and the address is used to
route the message appropriately by the platform.  The ABI of that
message, however, is specific to the driver/device and is not
interpreted by kvm/lguest/native-hw infrastructure on the way.

Today, there is no equivalent of a platform agnostic iowrite32() for
hypercalls so the driver would look like the pseudocode above except
substitute with kvm_hypercall(), lguest_hypercall(), etc.  The proposal
is to allow the hypervisor to assign a dynamic vector to resources in
the backend and convey this vector to the guest (such as in PCI
config-space as mentioned in my example use-case).  This provides the
address negotiation function that would normally be done for something
like a pio port-address.   The hypervisor agnostic driver can then use
this globally recognized address-token coupled with other device-private
ABI parameters to communicate with the device.  This can all occur
without the core hypervisor needing to understand the details beyond the
addressing.

What this means to our interface design is that the only thing the
hypervisor really cares about is the first nr parameter.  This acts as
our address-token.  The optional/variable list of args is just payload
as far as the core infrastructure is concerned and are coupled only to
our device ABI.  They were chosen to be an array of ulongs (vs something
like vargs) to reflect the fact that hypercalls are typically passed by
packing registers.

Hope this helps,
-Greg




signature.asc
Description: OpenPGP digital signature


[KVM-AUTOTEST][PATCH] timedrift support

2009-05-05 Thread Bear Yang

Hello everyone,

I would like to submit a patch that adds a new 'time drift check' test for
guests running on KVM.


The TimeDrift design logic is as follows:
1. Set the host up as the NTP server.
2. The guest syncs its clock with the host only *once*, when it boots up.
* If the offset value reported by ntpdate is larger than 1 sec, the guest will
sync its clock with the host.
* If the offset value reported by ntpdate is less than 1 sec, the guest doesn't
need to sync its clock with the host.


3. Then the CPU stress test runs on the guest.
* A C program puts real load on the guest CPU.
4. When the CPU stress test has finished, the command line ntpdate
-q host-ip is run 20 times in total on the guest to query the time from the
host and judge whether the guest clock has drifted or not.


The details of my patch are attached.

thanks.

Bear.
diff -urN kvm_runtest_2.bak/cpu_stress.c kvm_runtest_2/cpu_stress.c
--- kvm_runtest_2.bak/cpu_stress.c	1969-12-31 19:00:00.0 -0500
+++ kvm_runtest_2/cpu_stress.c	2009-05-05 22:35:34.0 -0400
@@ -0,0 +1,61 @@
+#define _GNU_SOURCE
+#include stdio.h
+#include pthread.h
+#include sched.h
+#include stdlib.h
+#include fcntl.h
+#include math.h
+#include unistd.h
+
+#define MAX_CPUS 256
+#define BUFFSIZE 1024
+
+
+void worker_child(int cpu)
+{
+	int cur_freq;
+	int min_freq;
+	int max_freq;
+	int last_freq;
+	cpu_set_t mask;
+	int i;
+	double x;
+int d = 0;
+	/*
+	 * bind this thread to the specified cpu 
+	 */
+	CPU_ZERO(mask);
+	CPU_SET(cpu, mask);
+	sched_setaffinity(0, CPU_SETSIZE, mask);
+
+	while (d++ != 50) {
+			for (i=0; i10; i++)
+x = sqrt(x);
+	}
+
+	_exit(0);
+
+}
+
+
+main() {
+	cpu_set_t mask;
+	int i;
+	int code;
+
+	if (sched_getaffinity(0, CPU_SETSIZE, mask)  0){
+		perror (sched_getaffinity);
+		exit(1);
+	}
+
+	for (i=0; iCPU_SETSIZE; i++)
+		if (CPU_ISSET(i, mask)){
+			printf (CPU%d\n,i);
+			if (fork() == 0)
+worker_child(i);
+		}
+
+
+	wait(code);
+	exit (WEXITSTATUS(code));
+}
diff -urN kvm_runtest_2.bak/kvm_runtest_2.py kvm_runtest_2/kvm_runtest_2.py
--- kvm_runtest_2.bak/kvm_runtest_2.py	2009-04-29 06:17:29.0 -0400
+++ kvm_runtest_2/kvm_runtest_2.py	2009-04-29 08:06:32.0 -0400
@@ -36,6 +36,8 @@
 autotest: test_routine(kvm_tests,   run_autotest),
 kvm_install:  test_routine(kvm_install, run_kvm_install),
 linux_s3: test_routine(kvm_tests,   run_linux_s3),
+ntp_server_setup: test_routine(kvm_tests,   run_ntp_server_setup),
+timedrift:test_routine(kvm_tests,   run_timedrift),
 }
 
 # Make it possible to import modules from the test's bindir
diff -urN kvm_runtest_2.bak/kvm_tests.cfg.sample kvm_runtest_2/kvm_tests.cfg.sample
--- kvm_runtest_2.bak/kvm_tests.cfg.sample	2009-04-29 06:17:29.0 -0400
+++ kvm_runtest_2/kvm_tests.cfg.sample	2009-04-29 08:09:36.0 -0400
@@ -81,6 +81,10 @@
 - linux_s3:  install setup
 type = linux_s3
 
+- ntp_server_setup:
+type = ntp_server_setup
+- timedrift:  ntp_server_setup
+type = timedrift
 # NICs
 variants:
 - @rtl8139:
diff -urN kvm_runtest_2.bak/kvm_tests.py kvm_runtest_2/kvm_tests.py
--- kvm_runtest_2.bak/kvm_tests.py	2009-04-29 06:17:29.0 -0400
+++ kvm_runtest_2/kvm_tests.py	2009-05-05 23:45:57.0 -0400
@@ -394,3 +394,235 @@
 kvm_log.info(VM resumed after S3)
 
 session.close()
+
+def run_ntp_server_setup(test, params, env):
+
+NTP server configuration and related network file modification
+
+kvm_log.debug(run ntp server setup)
+status = 1
+# stop firewall for NTP server if it is running.
+status = os.system(/etc/init.d/iptables status)
+if status == 0:
+os.system(/etc/init.d/iptables stop)
+status = 1
+
+# prevent dhcp client modify the ntp.conf
+kvm_log.info(prevent dhcp client modify the ntp.conf)
+
+config_file = /etc/sysconfig/network
+network_file = open(/etc/sysconfig/network, a)
+string = PEERNTP=no
+
+if os.system(grep %s %s % (string, config_file)):
+network_file.writelines(str(string)+'\n')
+
+network_file.close()
+  
+# start ntp server on host
+kvm_log.info(backup ntp config file)
+
+ntp_filename = os.path.expanduser(/etc/ntp.conf)
+# backup ntp config file
+backup_bootloader_filename = ntp_filename + _backup
+if os.path.exists(ntp_filename):
+os.rename(ntp_filename, backup_bootloader_filename)
+
+status = os.system(/etc/init.d/ntpd status)
+if status == 0:
+os.system(/etc/init.d/ntpd stop)
+status = 1
+
+kvm_log.info(start ntp server on host)
+
+ntp_cmd = '''
+echo restrict default kod nomodify notrap nopeer noquery  /etc/ntp.conf;\
+echo restrict 127.0.0.1  /etc/ntp.conf;\
+echo driftfile /var/lib/ntp/drift  /etc/ntp.conf;\
+echo keys /etc/ntp/keys  /etc/ntp.conf;\
+echo 

Re: [patch 0/4] use smp_send_reschedule in vcpu_kick / assigned dev host intx race fix

2009-05-05 Thread Sheng Yang
On Thursday 30 April 2009 09:59:56 Sheng Yang wrote:
 On Thursday 30 April 2009 08:56:57 Sheng Yang wrote:
  On Thursday 30 April 2009 01:47:57 Marcelo Tosatti wrote:
   On Tue, Apr 28, 2009 at 03:08:46PM +0800, Sheng Yang wrote:
Ack all. This also solved one bug by my hand. Thanks!
   
I observe one point: the performance of high workload interrupt(e.g.
10 gigabyte oplin card) dropped dramatically with
smp_send_reschedule() method... In one environment(the speed of oplin
card also limited by cpu performance), Using
smp_call_function_single() can get more than 1G bit/s stably(native
got 1.2G), but smp_send_reschedule() can only got around 600M
bit/s... And the rescheduling interrupt number is about 2000/second
per cpu. And the interrupt rate is about tens of thousands per second
for the device.
   
Anyway, this method is more elegant and correct. Though there is
still room for optimize - but of course, the correctness is first
priority.
  
   Are you using the compat code or a kvm.git kernel? Can you remove only
   the last patch (the spinlock) to confirm its the cause of the slowdown?
 
  I am using kvm.git.
 
  I said this because I tried the old version of patch(which have warning)
  and it would got more than 1G/sec.
 
  I'd like to take a close look at what's happened.

 Still ACK this patchset.

 And sorry, my memory messed...

 The old version of patch and this one offered the same performance. So the
 problem is not here.

 I got more than 1G per second in one of my own experiments.

 Disable/enable irq is meant to be used with level interrupts, to prevent the
 device from sending the interrupt again after the kernel handler returns, but
 it does not apply to MSI/MSI-X. Some interrupts may be merged into one, but
 AFAIK the driver can handle that well.

 My experiment was to drop the disable/enable IRQ for MSI/MSI-X, which gives
 much better performance for the oplin card: 2x compared with the
 disable/enable version.

 I would prepare a patch for it.

Hi Avi

Is there any issue blocking this patchset?

Thanks!

-- 
regards
Yang, Sheng
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/9] Remove irq_pending bitmap

2009-05-05 Thread Sheng Yang
On Tuesday 05 May 2009 16:14:29 Gleb Natapov wrote:
 Only one interrupt vector can be injected from userspace irqchip at
 any given time so no need to store it in a bitmap. Put it into interrupt
 queue directly.

 Signed-off-by: Gleb Natapov g...@redhat.com
 ---
  arch/x86/include/asm/kvm_host.h |2 --
  arch/x86/kvm/irq.c  |4 ++--
  arch/x86/kvm/x86.c  |   38
 +++--- arch/x86/kvm/x86.h  |  
 12 
  4 files changed, 13 insertions(+), 43 deletions(-)

 diff --git a/arch/x86/include/asm/kvm_host.h
 b/arch/x86/include/asm/kvm_host.h index 8e680c3..cc892f5 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -266,8 +266,6 @@ struct kvm_mmu {

  struct kvm_vcpu_arch {
   u64 host_tsc;
 - unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
 - DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
   /*
* rip and regs accesses must go through
* kvm_{register,rip}_{read,write} functions.
 diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
 index 11c2757..96dfbb6 100644
 --- a/arch/x86/kvm/irq.c
 +++ b/arch/x86/kvm/irq.c
 @@ -50,7 +50,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
   struct kvm_pic *s;

   if (!irqchip_in_kernel(v-kvm))
 - return v-arch.irq_summary;
 + return v-arch.interrupt.pending;

   if (kvm_apic_has_interrupt(v) == -1) {  /* LAPIC */
   if (kvm_apic_accept_pic_intr(v)) {
 @@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
   int vector;

   if (!irqchip_in_kernel(v-kvm))
 - return kvm_pop_irq(v);
 + return v-arch.interrupt.nr;

   vector = kvm_get_apic_interrupt(v); /* APIC */
   if (vector == -1) {
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 12ab1cc..4596927 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -1424,8 +1424,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu
 *vcpu, return -ENXIO;
   vcpu_load(vcpu);

 - set_bit(irq-irq, vcpu-arch.irq_pending);
 - set_bit(irq-irq / BITS_PER_LONG, vcpu-arch.irq_summary);
 + kvm_queue_interrupt(vcpu, irq-irq);

   vcpu_put(vcpu);

 @@ -3562,12 +3561,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu
 *vcpu, sregs-efer = vcpu-arch.shadow_efer;
   sregs-apic_base = kvm_get_apic_base(vcpu);

 - if (irqchip_in_kernel(vcpu-kvm))
 - memset(sregs-interrupt_bitmap, 0,
 -sizeof sregs-interrupt_bitmap);

? When did we discard the saving of pending interrupt for irqchip_in_kernel?

 - else
 - memcpy(sregs-interrupt_bitmap, vcpu-arch.irq_pending,
 -sizeof sregs-interrupt_bitmap);
 + memset(sregs-interrupt_bitmap, 0, sizeof sregs-interrupt_bitmap);

No need to save any pending interrupts? Did I miss anything?

   if (vcpu-arch.interrupt.pending)
   set_bit(vcpu-arch.interrupt.nr,
 @@ -4037,7 +4031,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu
 *vcpu, struct kvm_sregs *sregs)
  {
   int mmu_reset_needed = 0;
 - int i, pending_vec, max_bits;
 + int pending_vec, max_bits;
   struct descriptor_table dt;

   vcpu_load(vcpu);
 @@ -4079,24 +4073,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu
 *vcpu, if (mmu_reset_needed)
   kvm_mmu_reset_context(vcpu);

 - if (!irqchip_in_kernel(vcpu-kvm)) {
 - memcpy(vcpu-arch.irq_pending, sregs-interrupt_bitmap,
 -sizeof vcpu-arch.irq_pending);
 - vcpu-arch.irq_summary = 0;
 - for (i = 0; i  ARRAY_SIZE(vcpu-arch.irq_pending); ++i)
 - if (vcpu-arch.irq_pending[i])
 - __set_bit(i, vcpu-arch.irq_summary);
 - } else {
 - max_bits = (sizeof sregs-interrupt_bitmap)  3;
 - pending_vec = find_first_bit(
 - (const unsigned long *)sregs-interrupt_bitmap,
 - max_bits);
 - /* Only pending external irq is handled here */
 - if (pending_vec  max_bits) {
 - kvm_queue_interrupt(vcpu, pending_vec);
 - pr_debug(Set back pending irq %d\n, pending_vec);
 - }
 - kvm_pic_clear_isr_ack(vcpu-kvm);
 + max_bits = (sizeof sregs-interrupt_bitmap)  3;

If interrupt_bitmap is always zero as above, why do we have this... For
compatibility?

-- 
regards
Yang, Sheng

 + pending_vec = find_first_bit(
 + (const unsigned long *)sregs-interrupt_bitmap, max_bits);
 + if (pending_vec  max_bits) {
 + kvm_queue_interrupt(vcpu, pending_vec);
 + pr_debug(Set back pending irq %d\n, pending_vec);
 + if (irqchip_in_kernel(vcpu-kvm))
 + kvm_pic_clear_isr_ack(vcpu-kvm);
   }

   kvm_set_segment(vcpu, sregs-cs, VCPU_SREG_CS);
 diff --git a/arch/x86/kvm/x86.h