[PATCH] kvm: external module: compatibility for 2.6.27 hosts with backported hrtimer patches

2008-12-29 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Some 2.6.27 kernels (F10) have backported hrtimer patches which interfere
with the external module hrtimer backports.  Rework the backports to allow
them to coexist.

Signed-off-by: Avi Kivity a...@redhat.com
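
For illustration: the hack-module.awk hunks below add the hrtimer accessors to the
list of symbols that receive a kvm_ prefix at build time, so a call site in the
module effectively ends up using the compat names on every host.  A minimal usage
sketch, assuming a hypothetical timer and period that are not part of this patch:

#include <linux/hrtimer.h>
#include "external-module-compat-comm.h"

static struct hrtimer sample_timer;

static void sample_rearm(u64 period_ns)
{
	/* Resolves to the compat inlines on hosts without the native
	 * accessors, and to the plain hrtimer helpers otherwise. */
	kvm_hrtimer_add_expires_ns(&sample_timer, period_ns);
	kvm_hrtimer_start_expires(&sample_timer, HRTIMER_MODE_ABS);
}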

diff --git a/kernel/external-module-compat-comm.h b/kernel/external-module-compat-comm.h
index 27fea15..f2343f6 100644
--- a/kernel/external-module-compat-comm.h
+++ b/kernel/external-module-compat-comm.h
@@ -584,26 +584,33 @@ static inline int get_user_pages_fast(unsigned long start, int nr_pages,
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
 
-static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 delta)
+static inline void kvm_hrtimer_add_expires_ns(struct hrtimer *timer, u64 delta)
 {
 	timer->expires = ktime_add_ns(timer->expires, delta);
 }
 
-static inline ktime_t hrtimer_get_expires(struct hrtimer *timer)
+static inline ktime_t kvm_hrtimer_get_expires(struct hrtimer *timer)
 {
 	return timer->expires;
 }
 
-static inline u64 hrtimer_get_expires_ns(struct hrtimer *timer)
+static inline u64 kvm_hrtimer_get_expires_ns(struct hrtimer *timer)
 {
 	return ktime_to_ns(timer->expires);
 }
 
-static inline void hrtimer_start_expires(struct hrtimer *timer, int mode)
+static inline void kvm_hrtimer_start_expires(struct hrtimer *timer, int mode)
 {
 	hrtimer_start_p(timer, timer->expires, mode);
 }
 
+#else
+
+#define kvm_hrtimer_add_expires_ns hrtimer_add_expires_ns
+#define kvm_hrtimer_get_expires hrtimer_get_expires
+#define kvm_hrtimer_get_expires_ns hrtimer_get_expires_ns
+#define kvm_hrtimer_start_expires hrtimer_start_expires
+
 #endif
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
diff --git a/kernel/ia64/hack-module.awk b/kernel/ia64/hack-module.awk
index 3dd2260..a26d567 100644
--- a/kernel/ia64/hack-module.awk
+++ b/kernel/ia64/hack-module.awk
@@ -1,4 +1,6 @@
 BEGIN { split("INIT_WORK on_each_cpu smp_call_function " \
+	      "hrtimer_add_expires_ns hrtimer_get_expires " \
+	      "hrtimer_get_expires_ns hrtimer_start_expires " \
 	      "request_irq", compat_apis); }
 
 /MODULE_AUTHOR/ {
diff --git a/kernel/x86/hack-module.awk b/kernel/x86/hack-module.awk
index f40c972..1c80543 100644
--- a/kernel/x86/hack-module.awk
+++ b/kernel/x86/hack-module.awk
@@ -1,4 +1,6 @@
 BEGIN { split("INIT_WORK tsc_khz desc_struct ldttss_desc64 desc_ptr " \
+	      "hrtimer_add_expires_ns hrtimer_get_expires " \
+	      "hrtimer_get_expires_ns hrtimer_start_expires " \
 	      "on_each_cpu relay_open request_irq ", compat_apis); }
 
 /^int kvm_init\(/ { anon_inodes = 1 }
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM: introduce kvm_read_guest_virt, kvm_write_guest_virt

2008-12-29 Thread Avi Kivity
From: Izik Eidus iei...@redhat.com

This commit changes the name of emulator_read_std to kvm_read_guest_virt,
and adds a new function, kvm_write_guest_virt, that allows writing to a
guest virtual address.

Signed-off-by: Izik Eidus iei...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index af00b8c..b1e109b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -609,10 +609,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
 void fx_init(struct kvm_vcpu *vcpu);
 
-int emulator_read_std(unsigned long addr,
- void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu);
 int emulator_write_emulated(unsigned long addr,
const void *val,
unsigned int bytes,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 36aa576..3e92230 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1973,10 +1973,8 @@ static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
return dev;
 }
 
-int emulator_read_std(unsigned long addr,
-void *val,
-unsigned int bytes,
-struct kvm_vcpu *vcpu)
+int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
+   struct kvm_vcpu *vcpu)
 {
void *data = val;
int r = X86EMUL_CONTINUE;
@@ -1984,27 +1982,57 @@ int emulator_read_std(unsigned long addr,
while (bytes) {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
unsigned offset = addr & (PAGE_SIZE-1);
-   unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
+   unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
 
if (gpa == UNMAPPED_GVA) {
r = X86EMUL_PROPAGATE_FAULT;
goto out;
}
-   ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
+   ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
if (ret < 0) {
r = X86EMUL_UNHANDLEABLE;
goto out;
}
 
-   bytes -= tocopy;
-   data += tocopy;
-   addr += tocopy;
+   bytes -= toread;
+   data += toread;
+   addr += toread;
}
 out:
return r;
 }
-EXPORT_SYMBOL_GPL(emulator_read_std);
+
+int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+struct kvm_vcpu *vcpu)
+{
+   void *data = val;
+   int r = X86EMUL_CONTINUE;
+
+   while (bytes) {
+   gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
+   unsigned offset = addr & (PAGE_SIZE-1);
+   unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
+   int ret;
+
+   if (gpa == UNMAPPED_GVA) {
+   r = X86EMUL_PROPAGATE_FAULT;
+   goto out;
+   }
+   ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
+   if (ret < 0) {
+   r = X86EMUL_UNHANDLEABLE;
+   goto out;
+   }
+
+   bytes -= towrite;
+   data += towrite;
+   addr += towrite;
+   }
+out:
+   return r;
+}
+
 
 static int emulator_read_emulated(unsigned long addr,
  void *val,
@@ -2026,8 +2054,8 @@ static int emulator_read_emulated(unsigned long addr,
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
 
-   if (emulator_read_std(addr, val, bytes, vcpu)
-   == X86EMUL_CONTINUE)
+   if (kvm_read_guest_virt(addr, val, bytes, vcpu)
+   == X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
@@ -2230,7 +2258,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 
rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
 
-   emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
+   kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
 
printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
@@ -2238,7 +2266,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
 
 static struct x86_emulate_ops emulate_ops = {
-   .read_std= emulator_read_std,
+   .read_std= kvm_read_guest_virt,
.read_emulated   = emulator_read_emulated,
.write_emulated  = emulator_write_emulated,
.cmpxchg_emulated  

[PATCH] KVM: remove the vmap usage

2008-12-29 Thread Avi Kivity
From: Izik Eidus iei...@redhat.com

vmap() on guest pages hides those pages from the Linux mm for an extended
(userspace determined) amount of time.  Get rid of it.

Signed-off-by: Izik Eidus iei...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3e92230..1059ffc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2368,40 +2368,19 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 }
 EXPORT_SYMBOL_GPL(emulate_instruction);
 
-static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
-{
-   int i;
-
-   for (i = 0; i < ARRAY_SIZE(vcpu->arch.pio.guest_pages); ++i)
-   if (vcpu->arch.pio.guest_pages[i]) {
-   kvm_release_page_dirty(vcpu->arch.pio.guest_pages[i]);
-   vcpu->arch.pio.guest_pages[i] = NULL;
-   }
-}
-
 static int pio_copy_data(struct kvm_vcpu *vcpu)
 {
void *p = vcpu->arch.pio_data;
-   void *q;
+   gva_t q = vcpu->arch.pio.guest_gva;
unsigned bytes;
-   int nr_pages = vcpu->arch.pio.guest_pages[1] ? 2 : 1;
+   int ret;
 
-   q = vmap(vcpu->arch.pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
-PAGE_KERNEL);
-   if (!q) {
-   free_pio_guest_pages(vcpu);
-   return -ENOMEM;
-   }
-   q += vcpu->arch.pio.guest_page_offset;
bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
if (vcpu->arch.pio.in)
-   memcpy(q, p, bytes);
+   ret = kvm_write_guest_virt(q, p, bytes, vcpu);
else
-   memcpy(p, q, bytes);
-   q -= vcpu->arch.pio.guest_page_offset;
-   vunmap(q);
-   free_pio_guest_pages(vcpu);
-   return 0;
+   ret = kvm_read_guest_virt(q, p, bytes, vcpu);
+   return ret;
 }
 
 int complete_pio(struct kvm_vcpu *vcpu)
@@ -2512,7 +2491,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.in = in;
vcpu->arch.pio.string = 0;
vcpu->arch.pio.down = 0;
-   vcpu->arch.pio.guest_page_offset = 0;
vcpu->arch.pio.rep = 0;
 
if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
@@ -2540,9 +2518,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
  gva_t address, int rep, unsigned port)
 {
unsigned now, in_page;
-   int i, ret = 0;
-   int nr_pages = 1;
-   struct page *page;
+   int ret = 0;
struct kvm_io_device *pio_dev;
 
vcpu->run->exit_reason = KVM_EXIT_IO;
@@ -2554,7 +2530,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.in = in;
vcpu->arch.pio.string = 1;
vcpu->arch.pio.down = down;
-   vcpu->arch.pio.guest_page_offset = offset_in_page(address);
vcpu->arch.pio.rep = rep;
 
if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
@@ -2574,15 +2549,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
else
in_page = offset_in_page(address) + size;
now = min(count, (unsigned long)in_page / size);
-   if (!now) {
-   /*
-* String I/O straddles page boundary.  Pin two guest pages
-* so that we satisfy atomicity constraints.  Do just one
-* transaction to avoid complexity.
-*/
-   nr_pages = 2;
+   if (!now)
now = 1;
-   }
if (down) {
/*
 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
@@ -2597,15 +2565,7 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
kvm_x86_ops->skip_emulated_instruction(vcpu);
 
-   for (i = 0; i < nr_pages; ++i) {
-   page = gva_to_page(vcpu, address + i * PAGE_SIZE);
-   vcpu->arch.pio.guest_pages[i] = page;
-   if (!page) {
-   kvm_inject_gp(vcpu, 0);
-   free_pio_guest_pages(vcpu);
-   return 1;
-   }
-   }
+   vcpu->arch.pio.guest_gva = address;
 
pio_dev = vcpu_find_pio_dev(vcpu, port,
vcpu->arch.pio.cur_count,
@@ -2613,7 +2573,11 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
if (!vcpu->arch.pio.in) {
/* string PIO write */
ret = pio_copy_data(vcpu);
-   if (ret >= 0 && pio_dev) {
+   if (ret == X86EMUL_PROPAGATE_FAULT) {
+   kvm_inject_gp(vcpu, 0);
+   return 1;
+   }
+   if (ret == 0 && pio_dev) {
pio_string_write(pio_dev, vcpu);
complete_pio(vcpu);
if (vcpu->arch.pio.count == 0)
diff --git a/include/linux/kvm_types.h 

Re: [PATCH][v2] kvm-userspace: Load PCI option ROMs

2008-12-29 Thread Avi Kivity

Liu, Kechao wrote:

Hi Avi,

Thanks for your comments. I've updated the patch according to them.
Please review it. Thank you.

Load assigned devices' PCI option ROMs to the RAM of
guest OS. And pass the corresponding devfns to BIOS.

  


Looks good.

+
+/* Write ROM data and devfn to phys_addr */

+cpu_physical_memory_write_rom(0xd + offset, rom, size);
+cpu_physical_memory_write_rom(0xd + offset + size, devfn, 1);
+   


How is the last bit performed on real hardware?  Obviously the ROM can't 
have devfn embedded.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] remove kvm vmap usage

2008-12-29 Thread Avi Kivity

Izik Eidus wrote:

Remove the vmap usage from kvm, this is needed both for ksm and
get_user_pages != write.
  


applied, thanks.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[ kvm-Bugs-2474501 ] Migration: Linux x64 SMP guests failures

2008-12-29 Thread SourceForge.net
Bugs item #2474501, was opened at 2008-12-29 11:14
Message generated for change (Tracker Item Submitted) made by Item Submitter
You can respond by visiting: 
https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2474501&group_id=180599

Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: None
Group: None
Status: Open
Resolution: None
Priority: 5
Private: No
Submitted By: Technologov (technologov)
Assigned to: Nobody/Anonymous (nobody)
Summary: Migration: Linux x64 SMP guests failures

Initial Comment:

Linux x64 SMP guests failed to migrate.

Host: Fedora 7 x64, AMD Opteron 2352, KVM-81

Guests: Fedora 9 x64, CentOS 5 x64

Qemu/KVM command:
/usr/local/bin/qemu-system-x86_64 -monitor 
unix:/tmp/centos5-64__smp2_dst.monitor,server,nowait -pidfile 
/tmp/centos5-64__smp2_dst.pid -name centos5-64__smp2_dst -m 512 -hda 
/vm/centos5-64.qcow2_centos5-64__smp2 -net 
nic,vlan=0,macaddr=20:20:20:00:00:02,model=rtl8139 -net 
tap,vlan=0,ifname=cen_0_5706_02,script=/root/Linstall/git-kvm-autotest-new/client/tests/kvm_runtest/root/build/etc/kvm/qemu-ifup
 -vnc :27 -smp 2 -incoming tcp:0:5027 < /dev/null > /tmp/centos5-64__smp2_dst.out 2>&1 &

-Alexey, 29.12.2008.

--

You can respond by visiting: 
https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2474501&group_id=180599
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH][v2] kvm-userspace: Load PCI option ROMs

2008-12-29 Thread Liu, Kechao
Hi Avi,

-Original Message-
From: Avi Kivity [mailto:a...@redhat.com]
Sent: 29 December 2008 16:29
To: Liu, Kechao
Cc: kvm@vger.kernel.org; Shan, Haitao
Subject: Re: [PATCH][v2] kvm-userspace: Load PCI option ROMs

Liu, Kechao wrote:
 Hi Avi,

 Thanks for your comments. I've updated the patch according to them.
 Please review it. Thank you.

 Load assigned devices' PCI option ROMs to the RAM of
 guest OS. And pass the corresponding devfns to BIOS.



Looks good.

 +
 +/* Write ROM data and devfn to phys_addr */
 +cpu_physical_memory_write_rom(0xd + offset, rom, size);
 +cpu_physical_memory_write_rom(0xd + offset + size, devfn, 1);
 +

How is the last bit performed on real hardware?  Obviously the ROM can't
have devfn embedded.

On real hardware, the BIOS scans PCI devices, loads the ROMs, and can get the devices'
devfns. Here, in an easier way, we load the option ROMs in QEMU and thus need to
store and pass the devfns.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

Best Regards,
Liu, Kechao


Re: [PATCH][v2] kvm-userspace: Load PCI option ROMs

2008-12-29 Thread Avi Kivity
Liu, Kechao wrote:
 How is the last bit performed on real hardware?  Obviously the ROM can't
 have devfn embedded.
 

 On a real hardware, BIOS scans PCI devices, loads ROMs and it can get 
 devices' devfns. Here, in an easier way, we load option ROMs in QEMU and thus 
 need to
 store and pass the devfns.
   

Well, it may make sense to provide the ROMs as virtual PCI BARs, and
have the bios do the work. This way, if some driver relies on remapping
the BAR (graphic cards?), it can still work.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 15/15] KVM: Fix racy in kvm_free_assigned_irq

2008-12-29 Thread Sheng Yang
On Monday 29 December 2008 13:42:22 Amit Shah wrote:
 On Sun, Dec 28, 2008 at 07:24:02PM +0800, Sheng Yang wrote:
  On Sat, Dec 27, 2008 at 06:06:26PM -0200, Marcelo Tosatti wrote:
   On Fri, Dec 26, 2008 at 10:30:07AM +0800, Sheng Yang wrote:
Thanks to Marcelo's observation, The following code have potential
issue:
   
if (cancel_work_sync(&assigned_dev->interrupt_work))
kvm_put_kvm(kvm);
   
In fact, cancel_work_sync() would return true whether the work struct is
only scheduled or the callback of the work struct is already executing. This code
only considers the former situation.
  
   Why not simply drop the reference inc / dec from irq handler/work
   function?
 
  Sorry, I don't know the answer. After checking the code, I also think
  it's a little strange to increase the reference count here, and I think we
  don't expect the work handler to release the kvm struct.

 At the time of developing that code, this was my observation:

 I see from the call chain kvm_put_kvm->...->kvm_arch_destroy_vm, no locks
 are taken to actually destroy the vm. We can't be in ioctls, sure. But
 shouldn't the mutex be taken to ensure there's nothing else going on while
 destroying?

 At least with the workqueue model, we could be called asynchronously in
 kernel context and I would have held the mutex and about to inject
 interrupts while everything is being wiped off underneath. However, the
 workqueue model tries its best to schedule the work on the same CPU, though
 we can't use that guarantee to ensure things will be fine.

 ---
 So I had to get a ref to the current vm till we had any pending work
 scheduled. I think I put in comments in the code, but sadly most of my
 comments were stripped out before the merge.

Not quite understand...

We free the assigned device in the destroy path of the VM, and likewise free the irq. And we have 
cancel_work_sync() in the irq-freeing path, which can sync with the execution of the scheduled 
work. And now, before cancel_work_sync(), we disable the interrupt so that no 
more work gets scheduled again. So after cancel_work_sync(), everything asynchronous (I 
think that's the irq handler and the scheduled work here) should quiet 
down.

Or did I miss something?

-- 
regards
Yang, Sheng

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: gettimeofday slow in RHEL4 guests

2008-12-29 Thread Yang, Sheng
On Monday 29 December 2008 02:38:07 Marcelo Tosatti wrote:
 On Tue, Nov 25, 2008 at 01:52:59PM +0100, Andi Kleen wrote:
   But yeah - the remapping of HPET timers to virtual HPET timers sounds
   pretty tough. I wonder if one could overcome that with a little
   hardware support though ...
 
  For gettimeofday better make TSC work. Even in the best case (no
  virtualization) it is much faster than HPET because it sits in the CPU,
  while HPET is far away on the external south bridge.

 The tsc clock on older Linux 2.6 kernels compensates for lost ticks.
 The algorithm uses the PIT count (latched) to measure the delay between
 interrupt generation and handling, and sums that value, on the next
 interrupt, to the TSC delta.

 Sheng investigated this problem in the discussions before in-kernel PIT
 was merged:

 http://www.mail-archive.com/kvm-de...@lists.sourceforge.net/msg13873.html

 The algorithm overcompensates for lost ticks and the guest time runs
 faster than the hosts.

 There are two issues:

 1) A bug in the in-kernel PIT which miscalculates the count value.

 2) For the case where more than one interrupt is lost, and later
 reinjected, the value read from PIT count is meaningless for the purpose
 of the tsc algorithm. The count is interpreted as the delay until the
 next interrupt, which is not the case with reinjection.

 As Sheng mentioned in the thread above, Xen pulls back the TSC value
 when reinjecting interrupts. VMWare ESX has a notion of virtual TSC,
 which I believe is similar in this context.

 For KVM I believe the best immediate solution (for now) is to provide an
 option to disable reinjection, behaving similarly to real hardware. The
 advantage is simplicity compared to virtualizing the time sources.

 The QEMU PIT emulation has a limit on the rate of interrupt reinjection,
 perhaps something similar should be investigated in the future.

 The following patch (which contains the bugfix for 1) and disabled
 reinjection) fixes the severe time drift on RHEL4 with clock=tsc.
 What I'm proposing is to condition reinjection with an option
 (-kvm-pit-no-reinject or something).

I agree that it should go with a user space option to disable reinjection, as 
it's hard to overcome the problem that we delayed interrupt injection... 

-- 
regards
Yang, Sheng

 Comments or better ideas?


 diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
 index e665d1c..608af7b 100644
 --- a/arch/x86/kvm/i8254.c
 +++ b/arch/x86/kvm/i8254.c
 @@ -201,13 +201,16 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
   if (!atomic_inc_and_test(&pt->pending))
   set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);

  + if (atomic_read(&pt->pending) > 1)
  + atomic_set(&pt->pending, 1);
  +
   if (vcpu0 && waitqueue_active(&vcpu0->wq))
   wake_up_interruptible(&vcpu0->wq);

   hrtimer_add_expires_ns(&pt->timer, pt->period);
   pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
   if (pt->period)
  - ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer);
  + ps->channels[0].count_load_time = ktime_get();

   return (pt->period == 0 ? 0 : 1);
  }

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: how increase/decrease ram on running vm ?

2008-12-29 Thread Василец Дмитрий
The Xen hypervisor can increase and decrease cpu and ram.
Will this function be available on kvm/qemu?
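
For reference: with a guest that has the virtio_balloon driver loaded and a
qemu/kvm build with balloon support, the adjustment is driven from the QEMU
monitor.  Roughly, assuming the command names described in the ballooning
article quoted below:

    (qemu) info balloon
    (qemu) balloon 256      # shrink the guest to a 256 MB target
    (qemu) balloon 512      # grow it back, never past the -m value given at startup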


On Sat, 27/12/2008 at 01:47 +0900, Ryota OZAKI wrote:
 2008/12/27 Ryota OZAKI ozaki.ry...@gmail.com:
  Have you tried decreasing memory? AFAIK, current ballooning cannot
  increase memory.
 
 oops, i mean ballooning cannot increase memory over the amount of
 memory specified in qemu/kvm arguments.
 
  Regards,
   ozaki-r
 
  2008/12/27 Василец Дмитрий d.vasil...@peterhost.ru:
  i read this, but i don't have a balloon command in the cli.
 
  On Fri, 26/12/2008 at 23:25 +0900, Ryota OZAKI wrote:
  Hi,
 
  http://www.linux-kvm.com/content/memory-ballooning-feature-coming-soon-kvm
 
  This page might help you.
 
  Regards,
ozaki-r
 
  2008/12/26 Василец Дмитрий d.vasil...@peterhost.ru:
   how do I increase/decrease ram on a running vm?
   i found the virtio_balloon module, but don't know how it works.
  
   --
   To unsubscribe from this list: send the line unsubscribe kvm in
   the body of a message to majord...@vger.kernel.org
   More majordomo info at  http://vger.kernel.org/majordomo-info.html
  
  --
  To unsubscribe from this list: send the line unsubscribe kvm in
  the body of a message to majord...@vger.kernel.org
  More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
 
  --
  To unsubscribe from this list: send the line unsubscribe kvm in
  the body of a message to majord...@vger.kernel.org
  More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
 
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: gettimeofday slow in RHEL4 guests

2008-12-29 Thread Avi Kivity

Marcelo Tosatti wrote:

The tsc clock on older Linux 2.6 kernels compensates for lost ticks.
The algorithm uses the PIT count (latched) to measure the delay between
interrupt generation and handling, and sums that value, on the next
interrupt, to the TSC delta.

Sheng investigated this problem in the discussions before in-kernel PIT
was merged:

http://www.mail-archive.com/kvm-de...@lists.sourceforge.net/msg13873.html

The algorithm overcompensates for lost ticks and the guest time runs
faster than the hosts.

There are two issues:

1) A bug in the in-kernel PIT which miscalculates the count value.

2) For the case where more than one interrupt is lost, and later
reinjected, the value read from PIT count is meaningless for the purpose
of the tsc algorithm. The count is interpreted as the delay until the
next interrupt, which is not the case with reinjection.

As Sheng mentioned in the thread above, Xen pulls back the TSC value
when reinjecting interrupts. VMWare ESX has a notion of virtual TSC,
which I believe is similar in this context.

For KVM I believe the best immediate solution (for now) is to provide an
option to disable reinjection, behaving similarly to real hardware. The
advantage is simplicity compared to virtualizing the time sources.

The QEMU PIT emulation has a limit on the rate of interrupt reinjection,
perhaps something similar should be investigated in the future.

The following patch (which contains the bugfix for 1) and disabled
reinjection) fixes the severe time drift on RHEL4 with clock=tsc.
What I'm proposing is to condition reinjection with an option
(-kvm-pit-no-reinject or something).

Comments or better ideas?


diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index e665d1c..608af7b 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -201,13 +201,16 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
if (!atomic_inc_and_test(&pt->pending))
set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
 
+	if (atomic_read(&pt->pending) > 1)

+   atomic_set(&pt->pending, 1);
+
  


Replace the atomic_inc() with atomic_set(, 1) instead? One less test, 
and more important, the logic is scattered less around the source.



if (vcpu0 && waitqueue_active(&vcpu0->wq))
wake_up_interruptible(&vcpu0->wq);
 
 	hrtimer_add_expires_ns(&pt->timer, pt->period);

pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
if (pt->period)
-   ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer);
+   ps->channels[0].count_load_time = ktime_get();
 
 	return (pt->period == 0 ? 0 : 1);

 }
  


I don't like the idea of punting to the user but looks like we don't 
have a choice.  Hopefully vendors will port kvmclock to these kernels 
and release them as updates -- time simply doesn't work well with 
virtualization, especially Linux guests.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 15/15] KVM: Fix racy in kvm_free_assigned_irq

2008-12-29 Thread Avi Kivity

Amit Shah wrote:
I see from the call chain kvm_put_kvm->...->kvm_arch_destroy_vm, no locks are 
taken to actually destroy the vm. We can't be in ioctls, sure. But shouldn't 
the mutex be taken to ensure there's nothing else going on while destroying?


  


Locks are useless to guard against something happening concurrent with 
destruction, since we're about to destroy the lock.


At least with the workqueue model, we could be called asynchronously in kernel 
context and I would have held the mutex and about to inject interrupts while 
everything is being wiped off underneath. However, the workqueue model tries 
its best to schedule the work on the same CPU, though we can't use that 
guarantee to ensure things will be fine.


---
So I had to get a ref to the current vm till we had any pending work scheduled. 


I think that's the right thing to do.


I think I put in comments in the code, but sadly most of my comments were 
stripped out before the merge.
  


Pity.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 15/15] KVM: Fix racy in kvm_free_assigned_irq

2008-12-29 Thread Avi Kivity

Sheng Yang wrote:
We free the assigned device in the destroy path of the VM, and likewise free the irq. And we have 
cancel_work_sync() in the irq-freeing path, which can sync with the execution of the scheduled 
work. And now, before cancel_work_sync(), we disable the interrupt so that no 
more work gets scheduled again. So after cancel_work_sync(), everything asynchronous (I 
think that's the irq handler and the scheduled work here) should quiet 
down.


Or did I miss something?
  


Suppose the work_struct gets scheduled, but is delayed somewhere in the 
scheduler.  Someone kill -9s the VM, and it starts getting destroyed.  
cancel_work_sync() can no longer truly cancel the work, so it has to 
schedule and wait for its completion.


So now we have kvm_assigned_dev_interrupt_work_handler() running in a 
partially destroyed VM.  It may work or not, but it's a fragile 
situation (changing the order of destruction of components will likely 
break things) and it's easy to avoid by keeping the reference count 
elevated.
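
To make that concrete, a rough sketch of the refcount pattern being defended here;
field names loosely follow the assigned-device code, the cancel path (which is the
subtlety under discussion) is omitted, and this is an illustration rather than the
upstream implementation:

static irqreturn_t assigned_dev_intr(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *dev = dev_id;

	kvm_get_kvm(dev->kvm);			/* pin the VM while work is pending */
	schedule_work(&dev->interrupt_work);
	return IRQ_HANDLED;
}

static void assigned_dev_interrupt_work(struct work_struct *work)
{
	struct kvm_assigned_dev_kernel *dev =
		container_of(work, struct kvm_assigned_dev_kernel,
			     interrupt_work);

	/* ... inject the interrupt into the guest ... */
	kvm_put_kvm(dev->kvm);			/* drop the reference when done */
}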


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] KVM: userspace: Remove duplicated functionality for cpuid processing

2008-12-29 Thread Avi Kivity

Amit Shah wrote:

host_cpuid is now available in target-i386/helper.c.
Remove the duplicated code now in kvm-specific code.

  


Applied, thanks.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 15/15] KVM: Fix racy in kvm_free_assigned_irq

2008-12-29 Thread Sheng Yang
On Monday 29 December 2008 21:37:52 Avi Kivity wrote:
 Sheng Yang wrote:
  We free the assigned device in the destroy path of the VM, and likewise free the irq. And
  we have cancel_work_sync() in the irq-freeing path, which can sync with the execution
  of the scheduled work. And now, before cancel_work_sync(), we disable the
  interrupt so that no more work gets scheduled again. So after
  cancel_work_sync(), everything asynchronous (I think that's the irq handler and the scheduled work
  here) should quiet down.
 
  Or did I miss something?

 Suppose the work_struct gets scheduled, but is delayed somewhere in the
 scheduler.  Someone kill -9s the VM, and it starts getting destroyed.
 cancel_work_sync() can no longer truly cancel the work, so it has to
 schedule and wait for its completion.

 So now we have kvm_assigned_dev_interrupt_work_handler() running in a
 partially destroyed VM.  It may work or not, but it's a fragile
 situation (changing the order of destruction of components will likely
 break things) and it's easy to avoid by keeping the reference count
 elevated.

OK, got it. Thanks for explaining!

-- 
regards
Yang, Sheng

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


kvm-82 messages

2008-12-29 Thread Farkas Levente
hi,
in kvm-82, among the host's messages, I see lines like the following (which I
didn't see before):

kvm: 6413: cpu0 unhandled wrmsr: 0xc0010117 data 0
kvm: 6413: cpu0 unhandled rdmsr: 0xc0010117
kvm: 6413: cpu0 unhandled rdmsr: 0xc0010117
kvm: 6413: cpu0 unhandled wrmsr: 0xc0010117 data 0


-- 
  Levente   Si vis pacem para bellum!
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


__purge_vmap_area_lazy crash with CONFIG_PREEMPT_RCU=y

2008-12-29 Thread Marcelo Tosatti
On Wed, Dec 24, 2008 at 04:28:44PM +0100, Andrea Arcangeli wrote:
 On Wed, Dec 24, 2008 at 02:50:57PM +0200, Avi Kivity wrote:
  Marcelo Tosatti wrote:
  The destructor for huge pages uses the backing inode for adjusting
  hugetlbfs accounting.
 
  Hugepage mappings are destroyed by exit_mmap, after
  mmu_notifier_release, so there are no notifications through
  unmap_hugepage_range at this point.
 
  The hugetlbfs inode can be freed with pages backed by it referenced
  by the shadow. When the shadow releases its reference, the huge page
  destructor will access a now freed inode.
 
  Implement the release operation for kvm mmu notifiers to release page
  refs before the hugetlbfs inode is gone.
 

 
  I see this isn't it.  Andrea, comments?
 
 Yeah, the patch looks good, I talked a bit with Marcelo about this by
 PM. The issue is that it's not as straightforward as it seems,
 basically when I implemented the ->release handlers and had the spte
 teardown running before the files were closed (instead of waiting for the
 kvm anon inode release handler to fire) I was getting bugchecks from
 debug options including preempt=y (certain debug checks only become
 functional with preempt enabled, unfortunately), so eventually I
 removed ->release because for kvm ->release wasn't useful, since no
 guest mode can run any more by the time the mmu notifier ->release is
 invoked, and that avoided the issues with the bugchecks.
 
 We'll be using the mmu notifiers' ->release because it's always called
 just before the filehandles are destroyed; it's not really about the
 guest mode or secondary mmu but just an ordering issue with hugetlbfs
 internals.
 
 So in short if no bugcheck triggers this is fine (at least until
 hugetlbfs provides a way to register some callback to invoke at the
 start of the hugetlbfs-release handler).
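
As a rough illustration of the ->release direction discussed above; the helper
that zaps the sptes and drops the guest page references is a hypothetical
placeholder here, not the actual kvm function name:

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = container_of(mn, struct kvm, mmu_notifier);

	/* Drop the page refs held by the shadow before exit_mmap() tears
	 * down the hugetlbfs mappings and the backing inode goes away. */
	kvm_zap_shadow_and_release_pages(kvm);	/* placeholder helper */
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	/* ... existing invalidate callbacks ... */
	.release	= kvm_mmu_notifier_release,
};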

The only bugcheck I see, which triggers on vanilla kvm upstream with 
CONFIG_PREEMPT_DEBUG=y and CONFIG_PREEMPT_RCU=y is:

general protection fault:  [#1] PREEMPT SMP DEBUG_PAGEALLOC4ttyS1: 1 
input overrun(s)

last sysfs file: /sys/class/net/tap0/address
CPU 0 
Modules linked in: tun ipt_MASQUERADE iptable_nat nf_nat bridge stp llc 
nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack xt_tcpudp ipt_REJECT 
iptable_filter ip_tables x_tables dm_multipath kvm_intel kvm scsi_wait_scan 
ata_piix libata dm_snapshot dm_zero dm_mirror dm_region_hash dm_log dm_mod 
shpchp pci_hotplug mptsas mptscsih mptbase scsi_transport_sas uhci_hcd ohci_hcd 
ehci_hcd
Pid: 4768, comm: qemu-system-x86 Not tainted 2.6.28-00165-g4f27e3e-dirty #164
RIP: 0010:[8028a5b6]  [8028a5b6] 
__purge_vmap_area_lazy+0x12c/0x163
RSP: 0018:88021e1f9a38  EFLAGS: 00010202
RAX: 6b6b6b6b6b6b6b6b RBX: 6b6b6b6b6b6b6b2b RCX: 0003
RDX: 80a1dae0 RSI: 880028083980 RDI: 0001
RBP: 88021e1f9a78 R08: 0286 R09: 80a1bf50
R10: 880119c270f8 R11: 88021e1f99b8 R12: 88021e1f9a38
R13: 88021e1f9a90 R14: 88021e1f9a98 R15: 813a
FS:  () GS:8080d900() knlGS:
CS:  0010 DS: 002b ES: 002b CR0: 8005003b
CR2: 008d9828 CR3: 00201000 CR4: 26e0
DR0:  DR1:  DR2: 
DR3:  DR6: 0ff0 DR7: 0400
Process qemu-system-x86 (pid: 4768, threadinfo 88021e1f8000, task 
880119c270f8)
Stack:
 88022bdfd840 880119da11b8 c20011c3 813a
  0001 88022ec11c18 88022f061838
 88021e1f9aa8 8028ab1d 88021e1f9aa8 c20021976000
Call Trace:
 [8028ab1d] free_unmap_vmap_area_noflush+0x69/0x70
 [8028ab49] remove_vm_area+0x25/0x71
 [8028ac54] __vunmap+0x3a/0xca
 [8028ad35] vfree+0x29/0x2b
 [a00f98a3] kvm_free_physmem_slot+0x25/0x7c [kvm]
 [a00f9d75] kvm_free_physmem+0x27/0x36 [kvm]
 [a00fccb4] kvm_arch_destroy_vm+0xa6/0xda [kvm]
 [a00f9e11] kvm_put_kvm+0x8d/0xa7 [kvm]
 [a00fa0e2] kvm_vcpu_release+0x13/0x17 [kvm]
 [802a1c07] __fput+0xeb/0x1a3
 [802a1cd4] fput+0x15/0x17
 [8029f26c] filp_close+0x67/0x72
 [802378a8] put_files_struct+0x74/0xc8
 [80237943] exit_files+0x47/0x4f
 [80238fe5] do_exit+0x1eb/0x7a7
 [80587edf] ? _spin_unlock_irq+0x2b/0x51
 [80239614] do_group_exit+0x73/0xa0
 [80242b10] get_signal_to_deliver+0x30c/0x32c
 [8020b4d5] ? sysret_signal+0x19/0x29
 [8020a80f] do_notify_resume+0x8c/0x851
 [8025b811] ? do_futex+0x90/0x92a
 [80256bd7] ? trace_hardirqs_on_caller+0xf0/0x114
 [80587f51] ? _spin_unlock_irqrestore+0x4c/0x68
 [8026be5c] ? __rcu_read_unlock+0x92/0x9e
 [80256bd7] ? trace_hardirqs_on_caller+0xf0/0x114
 [80256c08] ? trace_hardirqs_on+0xd/0xf
 [8024f300] ? getnstimeofday+0x3a/0x96
 [8024c4f0] ? 

Re: gdbstub: packet reply is too long

2008-12-29 Thread Jan Kiszka
Daniel Jacobowitz wrote:
 On Sun, Dec 21, 2008 at 12:44:04AM +0100, Jan Kiszka wrote:
 And that means setting current_gdbarch while keeping target_gdbarch -
 that's where reality (existing gdb code) bites us. Again, I'm not
 arguing against fixing this, I'm arguing in keeping qemu's workaround
 until this is done. I will look into the gdb part, but one after the other.
 
 No, it does not mean setting current_gdbarch different from
 target_gdbarch.  With the current gdbarch set to a 64-bit one that
 accurately describes the target, GDB should be able to debug code
 running in 32-bit mode.  If it can't, there are simply bugs in GDB to
 fix.

Well, in the current gdb design, current_gdbarch is consulted when
disassembling the code while target_gdbarch defines the register set
that is exchanged with the remote stub.

 
 If you'd like to reach some solution to this problem, which I've seen
 come up on the QEMU list a half-dozen times now, please describe how
 you're using GDB on the g...@sourceware.org mailing list and let's see
 if we can't fix the GDB bugs.  I'm pretty sure that any solution is
 going to involve always transferring the x86-64 register set, though.

I'm pretty sure that the final solution will involve extended x86
register sets in order to inform the frontend about the full target CPU
state so that it can set the right current_gdbarch automatically. That's
one reason (the other is current/target_gdbarch decoupling) why I see no
quick bug fix on the gdb side to actually solve the issue and suggest
the reintroduction of the qemu workaround until gdb is enhanced
appropriately.

But you are right, it's time to start a discussion on the gdb list,
hopefully laying the ground for better x86 low-level support. And
maybe I am actually missing some smart intermediate step towards this.

Jan





Re: [PATCH 15/15] KVM: Fix racy in kvm_free_assigned_irq

2008-12-29 Thread Marcelo Tosatti
On Mon, Dec 29, 2008 at 08:23:28PM +0800, Sheng Yang wrote:
 On Monday 29 December 2008 13:42:22 Amit Shah wrote:
  On Sun, Dec 28, 2008 at 07:24:02PM +0800, Sheng Yang wrote:
   On Sat, Dec 27, 2008 at 06:06:26PM -0200, Marcelo Tosatti wrote:
On Fri, Dec 26, 2008 at 10:30:07AM +0800, Sheng Yang wrote:
 Thanks to Marcelo's observation, The following code have potential
 issue:

 if (cancel_work_sync(&assigned_dev->interrupt_work))
   kvm_put_kvm(kvm);

 In fact, cancel_work_sync() would return true whether the work struct is
 only scheduled or the callback of the work struct is already executing. This code
 only considers the former situation.
   
Why not simply drop the reference inc / dec from irq handler/work
function?
  
   Sorry, I don't know the answer. After checking the code, I also think
    it's a little strange to increase the reference count here, and I think we
    don't expect the work handler to release the kvm struct.
 
  At the time of developing that code, this was my observation:
 
   I see from the call chain kvm_put_kvm->...->kvm_arch_destroy_vm, no locks
  are taken to actually destroy the vm. We can't be in ioctls, sure. But
  shouldn't the mutex be taken to ensure there's nothing else going on while
  destroying?
 
  At least with the workqueue model, we could be called asynchronously in
  kernel context and I would have held the mutex and about to inject
  interrupts while everything is being wiped off underneath. However, the
  workqueue model tries its best to schedule the work on the same CPU, though
  we can't use that guarantee to ensure things will be fine.
 
  ---
  So I had to get a ref to the current vm till we had any pending work
  scheduled. I think I put in comments in the code, but sadly most of my
   comments were stripped out before the merge.
 
 Not quite understand...
 
  We free the assigned device in the destroy path of the VM, and likewise free the irq. And we
  have cancel_work_sync() in the irq-freeing path, which can sync with the execution of the scheduled
  work. And now, before cancel_work_sync(), we disable the interrupt so that no
  more work gets scheduled again. So after cancel_work_sync(), everything asynchronous (I
  think that's the irq handler and the scheduled work here) should quiet
  down.
  
  Or did I miss something?

That's right. As long as you disable the irq and cancel pending work
before freeing the data structures those paths use.

There is one remaining issue: kvm_assigned_dev_interrupt_work_handler
can re-enable the interrupt for the KVM_ASSIGNED_DEV_GUEST_MSI case. Perhaps
you need a new flag to indicate shutdown (so the host IRQ won't be
reenabled).


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: gettimeofday slow in RHEL4 guests

2008-12-29 Thread Dor Laor

Avi Kivity wrote:

Marcelo Tosatti wrote:

The tsc clock on older Linux 2.6 kernels compensates for lost ticks.
The algorithm uses the PIT count (latched) to measure the delay between
interrupt generation and handling, and sums that value, on the next
interrupt, to the TSC delta.

Sheng investigated this problem in the discussions before in-kernel PIT
was merged:

http://www.mail-archive.com/kvm-de...@lists.sourceforge.net/msg13873.html 



The algorithm overcompensates for lost ticks and the guest time runs
faster than the hosts.

There are two issues:

1) A bug in the in-kernel PIT which miscalculates the count value.

2) For the case where more than one interrupt is lost, and later
reinjected, the value read from PIT count is meaningless for the purpose
of the tsc algorithm. The count is interpreted as the delay until the
next interrupt, which is not the case with reinjection.

As Sheng mentioned in the thread above, Xen pulls back the TSC value
when reinjecting interrupts. VMWare ESX has a notion of virtual TSC,
which I believe is similar in this context.

For KVM I believe the best immediate solution (for now) is to provide an
option to disable reinjection, behaving similarly to real hardware. The
advantage is simplicity compared to virtualizing the time sources.

The QEMU PIT emulation has a limit on the rate of interrupt reinjection,
perhaps something similar should be investigated in the future.

The following patch (which contains the bugfix for 1) and disabled
reinjection) fixes the severe time drift on RHEL4 with clock=tsc.
What I'm proposing is to condition reinjection with an option
(-kvm-pit-no-reinject or something).

Comments or better ideas?


diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index e665d1c..608af7b 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -201,13 +201,16 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)

  if (!atomic_inc_and_test(&pt->pending))
  set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
  
+if (atomic_read(&pt->pending) > 1)

+atomic_set(&pt->pending, 1);
+
  


Replace the atomic_inc() with atomic_set(, 1) instead? One less test, 
and more important, the logic is scattered less around the source.
But having only a pending bit instead of a counter will cause kvm to 
drop pit irqs on rare high load situations.

The disable reinjection option is better.



  if (vcpu0 && waitqueue_active(&vcpu0->wq))
  wake_up_interruptible(&vcpu0->wq);
  
  hrtimer_add_expires_ns(&pt->timer, pt->period);

  pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
  if (pt->period)
-ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer);

+ps->channels[0].count_load_time = ktime_get();
  
  return (pt->period == 0 ? 0 : 1);

 }
  


I don't like the idea of punting to the user but looks like we don't 
have a choice.  Hopefully vendors will port kvmclock to these kernels 
and release them as updates -- time simply doesn't work well with 
virtualization, especially Linux guests.


Except for these 'tsc compensate' guests, what are the occasions where 
the guest writes its tsc?

If this is the only case, we can disable reinjection once we trap tsc writes.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: gettimeofday slow in RHEL4 guests

2008-12-29 Thread Avi Kivity

Dor Laor wrote:
 
+if (atomic_read(&pt->pending) > 1)

+atomic_set(&pt->pending, 1);
+
  


Replace the atomic_inc() with atomic_set(, 1) instead? One less test, 
and more important, the logic is scattered less around the source.
But having only a pending bit instead of a counter will cause kvm to 
drop pit irqs on rare high load situations.

The disable reinjection option is better.


Both variants disable reinjection.  Forcing a counter to 1 every time it 
exceeds 1 is equivalent to maintaining a bit.


In both variants, there is a missing 'if (disable_reinjection)' (Marcelo 
mentioned this in the original message).


Except for these 'tsc compensate' guests, what are the occasions where 
the guest writes its tsc?
If this is the only case, we can disable reinjection once we trap tsc 
writes.


I don't think these guests write to the tsc.  Rather, they read the tsc 
and the pit counters and try to correlate.  And fail.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 1/2] KVM: PIT: fix i8254 pending count read

2008-12-29 Thread Marcelo Tosatti
count_load_time assignment is bogus: it's supposed to contain what it
means, not the expiration time.

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

Index: kvm/arch/x86/kvm/i8254.c
===
--- kvm.orig/arch/x86/kvm/i8254.c
+++ kvm/arch/x86/kvm/i8254.c
@@ -207,7 +207,7 @@ static int __pit_timer_fn(struct kvm_kpi
hrtimer_add_expires_ns(&pt->timer, pt->period);
pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
if (pt->period)
-   ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer);
+   ps->channels[0].count_load_time = ktime_get();
 
return (pt->period == 0 ? 0 : 1);
 }

-- 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 0/2] PIT: optionally disable interrupt reinjection

2008-12-29 Thread Marcelo Tosatti
-- 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 2/2] KVM: PIT: provide an option to disable interrupt reinjection

2008-12-29 Thread Marcelo Tosatti
Certain clocks (such as TSC) in older 2.6 guests overaccount for lost
ticks, causing severe time drift. Interrupt reinjection magnifies the
problem.

Provide an option to disable it.

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

Index: kvm/arch/x86/kvm/i8254.c
===
--- kvm.orig/arch/x86/kvm/i8254.c
+++ kvm/arch/x86/kvm/i8254.c
@@ -201,6 +201,9 @@ static int __pit_timer_fn(struct kvm_kpi
if (!atomic_inc_and_test(&pt->pending))
set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
 
+   if (pt->no_reinject)
+   atomic_set(&pt->pending, 1);
+
if (vcpu0 && waitqueue_active(&vcpu0->wq))
wake_up_interruptible(&vcpu0->wq);
 
Index: kvm/arch/x86/kvm/i8254.h
===
--- kvm.orig/arch/x86/kvm/i8254.h
+++ kvm/arch/x86/kvm/i8254.h
@@ -9,6 +9,7 @@ struct kvm_kpit_timer {
s64 period; /* unit: ns */
s64 scheduled;
atomic_t pending;
+   bool no_reinject;
 };
 
 struct kvm_kpit_channel_state {
Index: kvm/arch/x86/kvm/x86.c
===
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -991,6 +991,7 @@ int kvm_dev_ioctl_check_extension(long e
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
+   case KVM_CAP_PIT_NO_REINJECT:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1723,6 +1724,12 @@ static int kvm_vm_ioctl_set_pit(struct k
return r;
 }
 
+static int kvm_vm_ioctl_no_reinject(struct kvm *kvm)
+{
+   kvm->arch.vpit->pit_state.pit_timer.no_reinject = true;
+   return 0;
+}
+
 /*
  * Get (and clear) the dirty memory log for a memory slot.
  */
@@ -1920,6 +1927,16 @@ long kvm_arch_vm_ioctl(struct file *filp
r = 0;
break;
}
+   case KVM_PIT_NO_REINJECT: {
+   r = -ENXIO;
+   if (!kvm->arch.vpit)
+   goto out;
+   r = kvm_vm_ioctl_no_reinject(kvm);
+   if (r)
+   goto out;
+   r = 0;
+   break;
+   }
default:
;
}
Index: kvm/include/linux/kvm.h
===
--- kvm.orig/include/linux/kvm.h
+++ kvm/include/linux/kvm.h
@@ -396,6 +396,9 @@ struct kvm_trace_rec {
 #if defined(CONFIG_X86)
 #define KVM_CAP_SET_GUEST_DEBUG 23
 #endif
+#if defined(CONFIG_X86)
+#define KVM_CAP_PIT_NO_REINJECT 24
+#endif
 
 /*
  * ioctls for VM fds
@@ -429,6 +432,7 @@ struct kvm_trace_rec {
   struct kvm_assigned_pci_dev)
 #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
struct kvm_assigned_irq)
+#define KVM_PIT_NO_REINJECT	  _IO(KVMIO, 0x71)
 
 /*
  * ioctls for vcpu fds

-- 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 1/2] libkvm: pit not reinject support

2008-12-29 Thread Marcelo Tosatti
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

Index: kvm-userspace.pit/libkvm/libkvm-x86.c
===
--- kvm-userspace.pit.orig/libkvm/libkvm-x86.c
+++ kvm-userspace.pit/libkvm/libkvm-x86.c
@@ -75,6 +75,20 @@ int kvm_create_pit(kvm_context_t kvm)
return 0;
 }
 
+int kvm_pit_no_reinjection(kvm_context_t kvm)
+{
+#ifdef KVM_CAP_PIT_NO_REINJECT
+int r;
+
+r = ioctl(kvm->fd, KVM_CHECK_EXTENSION, KVM_CAP_PIT_NO_REINJECT);
+if (r > 0) {
+r = ioctl(kvm->vm_fd, KVM_PIT_NO_REINJECT);
+return r;
+}
+#endif
+return -1;
+}
+
 int kvm_arch_create(kvm_context_t kvm, unsigned long phys_mem_bytes,
void **vm_mem)
 {
Index: kvm-userspace.pit/libkvm/libkvm.h
===
--- kvm-userspace.pit.orig/libkvm/libkvm.h
+++ kvm-userspace.pit/libkvm/libkvm.h
@@ -648,6 +648,8 @@ int kvm_set_pit(kvm_context_t kvm, struc
 
 #endif
 
+int kvm_pit_no_reinjection(kvm_context_t kvm);
+
 #ifdef KVM_CAP_VAPIC
 
 /*!

-- 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 0/2] QEMU/KVM: PIT no interrupt reinjection support

2008-12-29 Thread Marcelo Tosatti
Userspace support for KVM_PIT_NO_REINJECT.

-- 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 2/2] QEMU/KVM: provide an option to disable in-kernel PIT int reinjection

2008-12-29 Thread Marcelo Tosatti
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

Index: kvm-userspace.pit/qemu/qemu-kvm.c
===
--- kvm-userspace.pit.orig/qemu/qemu-kvm.c
+++ kvm-userspace.pit/qemu/qemu-kvm.c
@@ -11,6 +11,7 @@
 int kvm_allowed = 1;
 int kvm_irqchip = 1;
 int kvm_pit = 1;
+int kvm_pit_no_reinject = 0;
 int kvm_nested = 0;
 
 #include assert.h
@@ -795,6 +796,12 @@ int kvm_qemu_create_context(void)
 r = kvm_arch_qemu_create_context();
 if(r < 0)
kvm_qemu_destroy();
+if (kvm_pit_no_reinject) {
+if (kvm_pit_no_reinjection(kvm_context)) {
+fprintf(stderr, "failure to disable in-kernel PIT reinjection\n");
+return -1;
+}
+}
 #ifdef TARGET_I386
 destroy_region_works = kvm_destroy_memory_region_works(kvm_context);
 #endif
Index: kvm-userspace.pit/qemu/vl.c
===
--- kvm-userspace.pit.orig/qemu/vl.c
+++ kvm-userspace.pit/qemu/vl.c
@@ -4071,6 +4071,7 @@ static void help(int exitcode)
 #endif
	   "-no-kvm-irqchip disable KVM kernel mode PIC/IOAPIC/LAPIC\n"
	   "-no-kvm-pit disable KVM kernel mode PIT\n"
+	   "-kvm-pit-no-reinject disable KVM kernel mode PIT interrupt reinjection\n"
	   "-enable-nesting enable support for running a VM inside the VM (AMD only)\n"
 #if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(TARGET_IA64) || defined(__linux__)
	   "-pcidevice host=bus:dev.func[,dma=none][,name=string]\n"
@@ -4202,6 +4203,7 @@ enum {
 QEMU_OPTION_no_kvm,
 QEMU_OPTION_no_kvm_irqchip,
 QEMU_OPTION_no_kvm_pit,
+QEMU_OPTION_kvm_pit_no_reinject,
 #if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(TARGET_IA64) || 
defined(__linux__)
 QEMU_OPTION_pcidevice,
 #endif
@@ -4298,6 +4300,7 @@ static const QEMUOption qemu_options[] =
 #endif
    { "no-kvm-irqchip", 0, QEMU_OPTION_no_kvm_irqchip },
    { "no-kvm-pit", 0, QEMU_OPTION_no_kvm_pit },
+    { "kvm-pit-no-reinject", 0, QEMU_OPTION_kvm_pit_no_reinject },
    { "enable-nesting", 0, QEMU_OPTION_enable_nesting },
 #if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(TARGET_IA64) || defined(__linux__)
    { "pcidevice", HAS_ARG, QEMU_OPTION_pcidevice },
@@ -5267,6 +5270,11 @@ int main(int argc, char **argv, char **e
kvm_pit = 0;
break;
}
+case QEMU_OPTION_kvm_pit_no_reinject: {
+extern int kvm_pit_no_reinject;
+kvm_pit_no_reinject = 1;
+break;
+}
case QEMU_OPTION_enable_nesting: {
kvm_nested = 1;
break;

-- 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Serial ATA Support - will it come?

2008-12-29 Thread Charles Duffy

Let's back up here a bit --

When you say no disk was found at all when using SCSI emulation, do 
you mean the kernel booted and was unable to find the hard drives, or 
did it not even get that far?


The SCSI emulation uses the sym53c8xx driver, which was first developed 
against 2.0.36; it's been in the kernel long enough that I'm certain 
whatever kernel you're using supports it (or it wouldn't run a modern 
userland either). I'm guessing you're running a monolithic kernel with 
only the drivers you need compiled in, instead of any vendor kernel (as 
any and every vendor kernel would have this driver available). Frankly, 
if you want to allow your users to boot their machines on other than 
your usual hardware, it's only sensible to have a kernel that supports 
the secondary hardware as well as what you normally provision them -- 
whether this secondary hardware is physical or virtual.


That said, there's a mechanism by which you can cheat: Boot the guest 
with an externally-provided kernel using -kernel and -append options to 
kvm. This isn't ideal -- you're no longer going through the guest's 
bootloader and so lose any settings included there -- but should be good 
enough for a rescue environment.
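
A concrete form of that cheat; the kernel/initrd paths and the root= argument
below are made up for illustration, so substitute whatever matches the guest image:

    qemu-system-x86_64 -hda guest.img -m 512 \
        -kernel /boot/vmlinuz-rescue -initrd /boot/initrd-rescue.img \
        -append "root=/dev/sda1 console=ttyS0"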


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] hook cpu running at a higher level.

2008-12-29 Thread Glauber Costa
This patch removes the kvm_enabled() check from cpu-exec.c.
This file is highly tcg-specific, and we'll probably want it
out when tcg is not compiled in (coming soon, in a theatre near you).

Instead, we hook in at the main loop level. The amount of code
duplication introduced is, at worst, acceptable, and I believe
it pays off. The tcg main loop is likely to be different from the
hypervisors' ones, since tcg runs all cpus in lockstep. KVM
(and probably xen) will be able to spawn threads for its
vcpus.

Signed-off-by: Glauber Costa glom...@redhat.com
---
 cpu-exec.c |5 -
 kvm-all.c  |   50 ++
 vl.c   |   16 ++--
 3 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index ed1545b..be8ceac 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -333,11 +333,6 @@ int cpu_exec(CPUState *env1)
 }
 #endif
 
-if (kvm_enabled()) {
-kvm_cpu_exec(env);
-longjmp(env-jmp_env, 1);
-}
-
 next_tb = 0; /* force lookup of first TB */
 for(;;) {
 interrupt_request = env-interrupt_request;
diff --git a/kvm-all.c b/kvm-all.c
index 11034df..a279d6c 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -663,3 +663,53 @@ int kvm_has_sync_mmu(void)
 
 return 0;
 }
+
+extern CPUState *cur_cpu;
+extern CPUState *next_cpu;
+
+extern int reset_requested;
+extern int shutdown_requested;
+extern int powerdown_requested;
+
+int kvm_main_loop(void)
+{
+
+int ret, timeout;
+CPUState *env;
+
+cur_cpu = first_cpu;
+next_cpu = first_cpu;
+env = first_cpu;
+
+for(;;) {
+/* get next cpu */
+cpu_single_env = env;
+ret = kvm_cpu_exec(env);
+cpu_single_env = NULL;
+
+if (shutdown_requested)
+break;
+if (reset_requested) {
+reset_requested = 0;
+qemu_system_reset();
+ret = EXCP_INTERRUPT;
+}
+if (powerdown_requested) {
+powerdown_requested = 0;
+qemu_system_powerdown();
+ret = EXCP_INTERRUPT;
+}
+
+if (ret == EXCP_HALTED) {
+timeout = 5000;
+} else {
+timeout = 0;
+}
+
+main_loop_wait(timeout);
+}
+cpu_disable_ticks();
+return ret;
+}
+
+
diff --git a/vl.c b/vl.c
index 0a02151..bcaccc3 100644
--- a/vl.c
+++ b/vl.c
@@ -248,8 +248,8 @@ static struct drive_opt {
 char opt[1024];
 } drives_opt[MAX_DRIVES];
 
-static CPUState *cur_cpu;
-static CPUState *next_cpu;
+CPUState *cur_cpu;
+CPUState *next_cpu;
 static int event_pending = 1;
 /* Conversion factor from emulated instructions to virtual clock ticks.  */
 static int icount_time_shift;
@@ -3452,9 +3452,9 @@ typedef struct QEMUResetEntry {
 } QEMUResetEntry;
 
 static QEMUResetEntry *first_reset_entry;
-static int reset_requested;
-static int shutdown_requested;
-static int powerdown_requested;
+int reset_requested;
+int shutdown_requested;
+int powerdown_requested;
 
 int qemu_shutdown_requested(void)
 {
@@ -5535,7 +5535,11 @@ int main(int argc, char **argv, char **envp)
close(fd);
 }
 
-main_loop();
+if (kvm_enabled())
+kvm_main_loop();
+else
+main_loop();
+
 quit_timers();
 net_cleanup();
 
-- 
1.5.6.5



Re: how increase/decrease ram on running vm ?

2008-12-29 Thread Charles Duffy

Василец Дмитрий wrote:

The Xen hypervisor can increase and decrease cpu and ram.
Will this be possible on kvm/qemu?


Last time I was familiar with Xen's balloon driver, it had the exact 
same limitation as the kvm one.




Re: Nested KVM

2008-12-29 Thread Todd Deshane
On Wed, Dec 24, 2008 at 4:20 AM, Alexander Graf ag...@suse.de wrote:


 Ugh. Looks like the emulation part is still broken :-(. Please use the
 attached patch to disable the emulation optimization for now.

 Avi, could you please apply that patch for kvm-82 too, so we get something
 working out? I'll take a closer look at what's broken exactly later on.

 Alex



So I am working with the latest git, from today.

The emulation error went away and the nested KVM guest partially works.

The errors that I am seeing late in the normal guest boot (which seem
non-fatal) are:
Dec 29 18:33:31 amdbox kernel: [ 1060.446054] bad partial csum:
csum=5888/5694 len=80
Dec 29 18:33:33 amdbox kernel: [ 1061.934164] bad partial csum:
csum=5888/5694 len=80
Dec 29 18:33:33 amdbox kernel: [ 1062.170127] bad partial csum:
csum=5888/5694 len=60
Dec 29 18:33:34 amdbox kernel: [ 1063.419124] bad partial csum:
csum=5888/5694 len=270
Dec 29 18:33:35 amdbox kernel: [ 1063.667817] bad partial csum:
csum=5888/5694 len=270
Dec 29 18:33:35 amdbox kernel: [ 1063.927839] bad partial csum:
csum=5888/5694 len=270
Dec 29 18:33:35 amdbox kernel: [ 1064.126336] bad partial csum:
csum=5888/5694 len=252
Dec 29 18:33:35 amdbox kernel: [ 1064.274429] bad partial csum:
csum=5888/5694 len=152
Dec 29 18:33:35 amdbox kernel: [ 1064.522702] bad partial csum:
csum=5888/5694 len=152
Dec 29 18:33:36 amdbox kernel: [ 1064.776290] bad partial csum:
csum=5888/5694 len=152
Dec 29 18:33:38 amdbox kernel: [ 1067.309123] __ratelimit: 4 callbacks
suppressed
Dec 29 18:33:38 amdbox kernel: [ 1067.309126] bad partial csum:
csum=5888/5694 len=252
Dec 29 18:33:39 amdbox kernel: [ 1068.160737] bad partial csum:
csum=5888/5694 len=241
Dec 29 18:33:41 amdbox kernel: [ 1070.170049] bad partial csum:
csum=5888/5694 len=60

After that I am able to start the nested guest with:
sudo qemu-system-x86_64 -hda ubuntu-server.img -cdrom
Desktop/ubuntu-8.10-server-amd64.iso

The nested guest also has the latest git checkout.

The nested guest shows the Ubuntu install CD welcome screen; after
selecting a language and starting the boot process, it gets a little
way in and then the screen goes black.

The nested guest doesn't crash, but becomes very unresponsive: I can't
ping it, can't ssh to it, etc. It seems like it only runs for a short
time (less than 30 seconds) before it becomes unresponsive.

I can attach gdb to the qemu-system-x86_64 process:

(gdb) where
#0  0x7fa8cc4a1482 in select () from /lib/libc.so.6
#1  0x00408bcb in main_loop_wait (timeout=0)
at /backup/src/kvm-src/kvm-userspace/qemu/vl.c:3617
#2  0x005160fa in kvm_main_loop ()
at /backup/src/kvm-src/kvm-userspace/qemu/qemu-kvm.c:599
#3  0x0040d106 in main (argc=value optimized out,
argv=0x7fffd58e9f48, envp=value optimized out)
at /backup/src/kvm-src/kvm-userspace/qemu/vl.c:3779

After some time, the qemu-system-x86_64 process starts to take
between 97 and 100% of the CPU.

The base system is still running OK, but no new messages are printed
in /var/log/syslog

I am sure there are more KVM debugging tricks I could try.

Any suggestions?

Thanks,
Todd

-- 
Todd Deshane
http://todddeshane.net
http://runningxen.com


RE: [PATCH][v2] kvm-userspace: Load PCI option ROMs

2008-12-29 Thread Shan, Haitao
Avi Kivity wrote:
 Liu, Kechao wrote:
 How is the last bit performed on real hardware?  Obviously the ROM
 can't have devfn embedded. 
 
 
 On real hardware, the BIOS scans PCI devices, loads their ROMs, and can
 get the devices' devfns. Here, as an easier approach, we load the option
 ROMs in QEMU and thus need to store and pass the devfns.
 
 
 Well, it may make sense to provide the ROMs as virtual PCI BARs, and
 have the bios do the work. This way, if some driver relies on
 remapping the BAR (graphic cards?), it can still work.

I do not quite understand this. Can you elaborate?

Best Regards
Shan Haitao


Re: [PATCH 15/15] KVM: Fix racy in kvm_free_assigned_irq

2008-12-29 Thread Sheng Yang
On Monday 29 December 2008 23:20:57 Marcelo Tosatti wrote:
 On Mon, Dec 29, 2008 at 08:23:28PM +0800, Sheng Yang wrote:
  On Monday 29 December 2008 13:42:22 Amit Shah wrote:
   On Sun, Dec 28, 2008 at 07:24:02PM +0800, Sheng Yang wrote:
On Sat, Dec 27, 2008 at 06:06:26PM -0200, Marcelo Tosatti wrote:
 On Fri, Dec 26, 2008 at 10:30:07AM +0800, Sheng Yang wrote:
   Thanks to Marcelo's observation, the following code has a
   potential issue:
  
   if (cancel_work_sync(&assigned_dev->interrupt_work))
           kvm_put_kvm(kvm);
  
   In fact, cancel_work_sync() would return true either when the work
   struct was merely scheduled or when the callback of the work struct
   was executed. This code only considers the former situation.

 Why not simply drop the reference inc / dec from irq handler/work
 function?
   
    Sorry, I don't know the answer. After checking the code, I also think
    it's a little strange to increase the reference count here, and I don't
    think we should expect the work handler to release the kvm struct.
  
   At the time of developing that code, this was my observation:
  
    I see from the call chain kvm_put_kvm -> ... -> kvm_arch_destroy_vm that no
   locks are taken to actually destroy the vm. We can't be in ioctls,
   sure. But shouldn't the mutex be taken to ensure there's nothing else
   going on while destroying?
  
    At least with the workqueue model, we could be called asynchronously in
    kernel context, and I would have been holding the mutex and about to inject
    interrupts while everything was being wiped out underneath. However, the
    workqueue model tries its best to schedule the work on the same CPU,
    though we can't use that guarantee to ensure things will be fine.
  
   ---
    So I had to hold a ref to the current vm for as long as there was any
    pending work scheduled. I think I put comments in the code, but sadly
    most of my comments were stripped out before the merge.
 
   I don't quite understand...
  
   We free the assigned device in the VM destroy path, and free the irq
   there as well. Freeing the irq calls cancel_work_sync(), which
   synchronizes with the execution of any scheduled work. And before
   cancel_work_sync(), we disable the interrupt so that no more work can
   be scheduled. So after cancel_work_sync(), everything asynchronous
   (that is, the irq handler and the scheduled work) should have quieted
   down.
  
   Or am I missing something?

 That's right. As long as you disable the irq and cancel pending work
 before freeing the data structures those paths use.

 There is one remaining issue: kvm_assigned_dev_interrupt_work_handler
 can re-enable the interrupt for the KVM_ASSIGNED_DEV_GUEST_MSI case. Perhaps
 you need a new flag to indicate shutdown (so the host IRQ won't be
 re-enabled).

Is that already covered by the disable_irq_nosync() before cancel_work_sync()?
I've noted this in my comment: the irq may be disabled in a nested fashion
(once for MSI and twice for INTx), but I think that's fine since we're going
to free it anyway.
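
To make the intended ordering concrete, here is a minimal sketch (not the
actual kvm_free_assigned_irq() code; it assumes the kvm-internal struct
definitions and omits the MSI/INTx details):

static void shutdown_assigned_irq_sketch(struct kvm_assigned_dev_kernel *adev)
{
        /* 1. Mask the host irq without waiting, so the handler cannot
         *    schedule interrupt_work any more. */
        disable_irq_nosync(adev->host_irq);

        /* 2. Wait for any already-scheduled or currently-running work
         *    item to finish; afterwards nothing asynchronous touches
         *    adev. */
        cancel_work_sync(&adev->interrupt_work);

        /* 3. Only now release the irq and tear down the rest. */
        free_irq(adev->host_irq, (void *)adev);
}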

-- 
regards
Yang, Sheng


Re: [patch 2/2] QEMU/KVM: provide an option to disable in-kernel PIT int reinjection

2008-12-29 Thread Sheng Yang
On Tuesday 30 December 2008 01:42:35 Marcelo Tosatti wrote:
 Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

--
  #if defined(TARGET_I386) || defined(TARGET_X86_64) || defined(TARGET_IA64)
 || defined(__linux__) { pcidevice, HAS_ARG, QEMU_OPTION_pcidevice },
 @@ -5267,6 +5270,11 @@ int main(int argc, char **argv, char **e
   kvm_pit = 0;
   break;
   }
 +case QEMU_OPTION_kvm_pit_no_reinject: {
 +extern int kvm_pit_no_reinject;
 +kvm_pit_no_reinject = 1;
 +break;
 +}
   case QEMU_OPTION_enable_nesting: {
   kvm_nested = 1;
   break;

Do we need to check for a conflict between --kvm-pit-no-reinject and
--no-kvm-pit/--no-kvm-irqchip? Checking it ahead of
kvm_qemu_create_context() seems a little better...
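
A minimal sketch of the kind of check I mean (untested, and assuming the
existing kvm_pit/kvm_irqchip flags in vl.c), placed before
kvm_qemu_create_context():

    /* sketch only: refuse contradictory PIT options up front */
    if (kvm_pit_no_reinject && (!kvm_pit || !kvm_irqchip)) {
        fprintf(stderr, "-kvm-pit-no-reinject requires the in-kernel PIT "
                "and irqchip (don't combine it with -no-kvm-pit or "
                "-no-kvm-irqchip)\n");
        exit(1);
    }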

-- 
regards
Yang, Sheng





Re: __purge_vmap_area_lazy crash with CONFIG_PREEMPT_RCU=y

2008-12-29 Thread Nick Piggin
On Tuesday 30 December 2008 01:58:21 Marcelo Tosatti wrote:
 On Wed, Dec 24, 2008 at 04:28:44PM +0100, Andrea Arcangeli wrote:
  On Wed, Dec 24, 2008 at 02:50:57PM +0200, Avi Kivity wrote:
   Marcelo Tosatti wrote:
   The destructor for huge pages uses the backing inode for adjusting
   hugetlbfs accounting.
  
   Hugepage mappings are destroyed by exit_mmap, after
   mmu_notifier_release, so there are no notifications through
   unmap_hugepage_range at this point.
  
   The hugetlbfs inode can be freed with pages backed by it referenced
   by the shadow. When the shadow releases its reference, the huge page
   destructor will access a now freed inode.
  
   Implement the release operation for kvm mmu notifiers to release page
   refs before the hugetlbfs inode is gone.
  
   I see this isn't it.  Andrea, comments?
 
  Yeah, the patch looks good; I talked a bit with Marcelo about this by
  PM. The issue is that it's not as straightforward as it seems.
  Basically, when I implemented the ->release handlers and had spte
  teardown running before the files were closed (instead of waiting for
  the kvm anon inode release handler to fire), I was getting bugchecks
  from debug options including preempt=y (certain debug checks only
  become functional with preempt enabled, unfortunately). So eventually
  I removed ->release, because for kvm ->release wasn't useful (no guest
  mode can run any more by the time the mmu notifier ->release is
  invoked), and that avoided the issues with the bugchecks.
 
  We'll be using the mmu notifiers' ->release because it's always called
  just before the file handles are destroyed; it's not really about
  guest mode or the secondary mmu, but just an ordering issue with
  hugetlbfs internals.
 
  So in short, if no bugcheck triggers, this is fine (at least until
  hugetlbfs provides a way to register some callback to invoke at the
  start of the hugetlbfs ->release handler).

 The only bugcheck I see, which triggers on vanilla kvm upstream with
 CONFIG_PREEMPT_DEBUG=y and CONFIG_PREEMPT_RCU=y is:

 general protection fault:  [#1] PREEMPT SMP DEBUG_PAGEALLOC4ttyS1: 1
 input overrun(s)

 last sysfs file: /sys/class/net/tap0/address
 CPU 0
 Modules linked in: tun ipt_MASQUERADE iptable_nat nf_nat bridge stp llc
 nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack xt_tcpudp ipt_REJECT
 iptable_filter ip_tables x_tables dm_multipath kvm_intel kvm scsi_wait_scan
 ata_piix libata dm_snapshot dm_zero dm_mirror dm_region_hash dm_log dm_mod
 shpchp pci_hotplug mptsas mptscsih mptbase scsi_transport_sas uhci_hcd
 ohci_hcd ehci_hcd Pid: 4768, comm: qemu-system-x86 Not tainted
 2.6.28-00165-g4f27e3e-dirty #164 RIP: 0010:[8028a5b6] 
 [8028a5b6] __purge_vmap_area_lazy+0x12c/0x163 RSP:
 0018:88021e1f9a38  EFLAGS: 00010202
 RAX: 6b6b6b6b6b6b6b6b RBX: 6b6b6b6b6b6b6b2b RCX: 0003
 RDX: 80a1dae0 RSI: 880028083980 RDI: 0001
 RBP: 88021e1f9a78 R08: 0286 R09: 80a1bf50
 R10: 880119c270f8 R11: 88021e1f99b8 R12: 88021e1f9a38
 R13: 88021e1f9a90 R14: 88021e1f9a98 R15: 813a
 FS:  () GS:8080d900()
 knlGS: CS:  0010 DS: 002b ES: 002b CR0: 8005003b
 CR2: 008d9828 CR3: 00201000 CR4: 26e0
 DR0:  DR1:  DR2: 
 DR3:  DR6: 0ff0 DR7: 0400
 Process qemu-system-x86 (pid: 4768, threadinfo 88021e1f8000, task
 880119c270f8) Stack:
  88022bdfd840 880119da11b8 c20011c3 813a
   0001 88022ec11c18 88022f061838
  88021e1f9aa8 8028ab1d 88021e1f9aa8 c20021976000
 Call Trace:
  [8028ab1d] free_unmap_vmap_area_noflush+0x69/0x70
  [8028ab49] remove_vm_area+0x25/0x71
  [8028ac54] __vunmap+0x3a/0xca
  [8028ad35] vfree+0x29/0x2b
  [a00f98a3] kvm_free_physmem_slot+0x25/0x7c [kvm]
  [a00f9d75] kvm_free_physmem+0x27/0x36 [kvm]
  [a00fccb4] kvm_arch_destroy_vm+0xa6/0xda [kvm]
  [a00f9e11] kvm_put_kvm+0x8d/0xa7 [kvm]
  [a00fa0e2] kvm_vcpu_release+0x13/0x17 [kvm]
  [802a1c07] __fput+0xeb/0x1a3
  [802a1cd4] fput+0x15/0x17
  [8029f26c] filp_close+0x67/0x72
  [802378a8] put_files_struct+0x74/0xc8
  [80237943] exit_files+0x47/0x4f
  [80238fe5] do_exit+0x1eb/0x7a7
  [80587edf] ? _spin_unlock_irq+0x2b/0x51
  [80239614] do_group_exit+0x73/0xa0
  [80242b10] get_signal_to_deliver+0x30c/0x32c
  [8020b4d5] ? sysret_signal+0x19/0x29
  [8020a80f] do_notify_resume+0x8c/0x851
  [8025b811] ? do_futex+0x90/0x92a
  [80256bd7] ? trace_hardirqs_on_caller+0xf0/0x114
  [80587f51] ? _spin_unlock_irqrestore+0x4c/0x68
  [8026be5c] ? __rcu_read_unlock+0x92/0x9e
  [80256bd7] ? trace_hardirqs_on_caller+0xf0/0x114
  

[0/3][RESEND] Device assignment code clean up and MSI disable support

2008-12-29 Thread Sheng Yang
I split the former patchset into 3 smaller ones. Here is the first one.


[PATCH 1/3] KVM: Add MSI_ACTION flag for assigned irq

2008-12-29 Thread Sheng Yang
This is for the MSI disable feature later in this series.

Notice that I changed the ABI here, but since there is no userspace patch
using it yet, I think it's OK.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 include/linux/kvm.h |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 42f51dc..c24f207 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -546,6 +546,7 @@ struct kvm_assigned_irq {
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU(1  0)
 
-#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI  (1  0)
+#define KVM_DEV_IRQ_ASSIGN_MSI_ACTION  (1  0)
+#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI  (1  1)
 
 #endif
-- 
1.5.4.5



[PATCH 3/3] KVM: Add support to disable MSI for assigned device

2008-12-29 Thread Sheng Yang
MSI is always enabled by default when msi2intx=1. But if msi2intx=0, we
have to disable MSI when the guest requests it.

The patch also discards the unnecessary msi2intx check when the guest
wants to update the MSI state.
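
For reference, a rough sketch of how userspace would request the disable
with the new flag (vm_fd, dev_id and guest_intx_gsi are placeholders for
values the caller already has from device assignment):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int disable_assigned_msi(int vm_fd, __u32 dev_id, __u32 guest_intx_gsi)
{
        struct kvm_assigned_irq airq;

        memset(&airq, 0, sizeof(airq));
        airq.assigned_dev_id = dev_id;
        airq.guest_irq = guest_intx_gsi;  /* INTx gsi to fall back to */
        /* MSI_ACTION without ENABLE_MSI means "act on MSI: disable it" */
        airq.flags = KVM_DEV_IRQ_ASSIGN_MSI_ACTION;

        return ioctl(vm_fd, KVM_ASSIGN_IRQ, &airq);
}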

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 virt/kvm/kvm_main.c |   12 ++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cd84b3e..111738b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -328,6 +328,15 @@ static int assigned_device_update_msi(struct kvm *kvm,
adev-irq_requested_type = ~KVM_ASSIGNED_DEV_GUEST_MSI;
adev-guest_irq = airq-guest_irq;
adev-ack_notifier.gsi = airq-guest_irq;
+   } else {
+   /*
+* Guest require to disable device MSI, we disable MSI and
+* re-enable INTx by default again. Notice it's only for
+* non-msi2intx.
+*/
+   kvm_free_assigned_irq(kvm, adev);
+   assigned_device_update_intx(kvm, adev, airq);
+   return 0;
}
 
if (adev-irq_requested_type  KVM_ASSIGNED_DEV_HOST_MSI)
@@ -399,8 +408,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
}
}
 
-   if ((!msi2intx 
-(assigned_irq-flags  KVM_DEV_IRQ_ASSIGN_ENABLE_MSI)) ||
+   if ((assigned_irq-flags  KVM_DEV_IRQ_ASSIGN_MSI_ACTION) ||
(msi2intx  match-dev-msi_enabled)) {
 #ifdef CONFIG_X86
r = assigned_device_update_msi(kvm, match, assigned_irq);
-- 
1.5.4.5



[PATCH 2/3] KVM: Use kvm_free_assigned_irq() for free irq

2008-12-29 Thread Sheng Yang
Which is more convenient...

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 virt/kvm/kvm_main.c |   10 ++
 1 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ffd261d..cd84b3e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -284,11 +284,7 @@ static int assigned_device_update_intx(struct kvm *kvm,
return 0;
 
if (irqchip_in_kernel(kvm)) {
-   if (!msi2intx 
-   adev-irq_requested_type  KVM_ASSIGNED_DEV_HOST_MSI) {
-   free_irq(adev-host_irq, (void *)kvm);
-   pci_disable_msi(adev-dev);
-   }
+   kvm_free_assigned_irq(kvm, adev);
 
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
@@ -339,9 +335,7 @@ static int assigned_device_update_msi(struct kvm *kvm,
 
if (irqchip_in_kernel(kvm)) {
if (!msi2intx) {
-   if (adev-irq_requested_type 
-   KVM_ASSIGNED_DEV_HOST_INTX)
-   free_irq(adev-host_irq, (void *)adev);
+   kvm_free_assigned_irq(kvm, adev);
 
r = pci_enable_msi(adev-dev);
if (r)
-- 
1.5.4.5



[PATCH 0/10][v3] GSI-MSG route layer for MSI/MSI-X

2008-12-29 Thread Sheng Yang
Update from v2:
Add gsi_msg_pending_bitmap, in order to support MSI-X multiple interrupts.

And this one depends on the earlier "Device assignment code clean up and MSI
disable support" patchset.
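
As a reminder of the intended userspace flow (a sketch using the
definitions from patch 1/10; error handling omitted, and vm_fd plus the
MSI message values are placeholders):

struct kvm_assigned_gsi_msg gmsg;

memset(&gmsg, 0, sizeof(gmsg));
gmsg.gsi         = 0;            /* ask the kernel to allocate a gsi */
gmsg.msg.addr_lo = msi_addr_lo;  /* MSI message programmed by the guest */
gmsg.msg.addr_hi = msi_addr_hi;
gmsg.msg.data    = msi_data;

/* The kernel allocates a gsi with KVM_GSI_MSG_MASK set; that value is then
 * used wherever a plain gsi would be (e.g. as an assigned device's
 * guest_irq), and kvm_set_irq() looks the message back up when injecting. */
ioctl(vm_fd, KVM_REQUEST_GSI_MSG, &gmsg);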

--
regards
Yang, Sheng


[PATCH 03/10] KVM: Improve MSI dispatch function

2008-12-29 Thread Sheng Yang
Prepare to merge with kvm_set_irq().

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 virt/kvm/kvm_main.c |8 
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3494861..599257e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -87,7 +87,7 @@ static bool kvm_rebooting;
 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
 
 #ifdef CONFIG_X86
-static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev)
+static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev, 
u32 gsi)
 {
int vcpu_id;
struct kvm_vcpu *vcpu;
@@ -99,7 +99,7 @@ static void assigned_device_msi_dispatch(struct 
kvm_assigned_dev_kernel *dev)
BUG_ON(!ioapic);
 
mutex_lock(dev-kvm-gsi_msg_lock);
-   gsi_msg = kvm_find_gsi_msg(dev-kvm, dev-guest_irq);
+   gsi_msg = kvm_find_gsi_msg(dev-kvm, gsi);
if (!gsi_msg) {
printk(KERN_WARNING kvm: fail to find correlated gsi_msg\n);
return;
@@ -143,7 +143,7 @@ static void assigned_device_msi_dispatch(struct 
kvm_assigned_dev_kernel *dev)
}
 }
 #else
-static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) 
{}
+static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev, 
u32 gsi) {}
 #endif
 
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head 
*head,
@@ -178,7 +178,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct 
work_struct *work)
assigned_dev-guest_irq, 1);
else if (assigned_dev-irq_requested_type 
KVM_ASSIGNED_DEV_GUEST_MSI) {
-   assigned_device_msi_dispatch(assigned_dev);
+   assigned_device_msi_dispatch(assigned_dev, 
assigned_dev-guest_irq);
enable_irq(assigned_dev-host_irq);
assigned_dev-host_irq_disabled = false;
}
-- 
1.5.4.5



[PATCH 04/10] KVM: Using ioapic_irqchip() macro for kvm_set_irq

2008-12-29 Thread Sheng Yang

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 virt/kvm/irq_comm.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index abfab46..47243ef 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -39,7 +39,7 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, 
int level)
 * IOAPIC.  So set the bit in both. The guest will ignore
 * writes to the unused one.
 */
-   kvm_ioapic_set_irq(kvm-arch.vioapic, irq, !!(*irq_state));
+   kvm_ioapic_set_irq(ioapic_irqchip(kvm), irq, !!(*irq_state));
 #ifdef CONFIG_X86
kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state));
 #endif
-- 
1.5.4.5



[PATCH 10/10] KVM: bit ops for deliver_bitmap

2008-12-29 Thread Sheng Yang
It will also be convenient when we extend the number of vcpus KVM supports in the future.
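
For anyone not used to the bitmap API, the pattern used below boils down
to this (a self-contained illustration, not kvm code):

DECLARE_BITMAP(deliver_bitmap, KVM_MAX_VCPUS);
int vcpu_id;

bitmap_zero(deliver_bitmap, KVM_MAX_VCPUS);
set_bit(0, deliver_bitmap);
set_bit(2, deliver_bitmap);

/* walk and clear the set bits, independent of how large
 * KVM_MAX_VCPUS grows */
while ((vcpu_id = find_first_bit(deliver_bitmap, KVM_MAX_VCPUS))
       < KVM_MAX_VCPUS) {
        clear_bit(vcpu_id, deliver_bitmap);
        /* ... deliver to vcpus[vcpu_id] ... */
}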

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 arch/x86/kvm/lapic.c |7 ---
 virt/kvm/ioapic.c|   24 +---
 virt/kvm/irq_comm.c  |   16 
 3 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c1e4935..359e02c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -477,9 +477,10 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 
struct kvm_vcpu *target;
struct kvm_vcpu *vcpu;
-   unsigned long lpr_map = 0;
+   DECLARE_BITMAP(lpr_map, KVM_MAX_VCPUS);
int i;
 
+   bitmap_zero(lpr_map, KVM_MAX_VCPUS);
apic_debug(icr_high 0x%x, icr_low 0x%x, 
   short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, 
   dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n,
@@ -494,7 +495,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
if (vcpu-arch.apic 
apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
if (delivery_mode == APIC_DM_LOWEST)
-   set_bit(vcpu-vcpu_id, lpr_map);
+   set_bit(vcpu-vcpu_id, lpr_map);
else
__apic_accept_irq(vcpu-arch.apic, 
delivery_mode,
  vector, level, trig_mode);
@@ -502,7 +503,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
}
 
if (delivery_mode == APIC_DM_LOWEST) {
-   target = kvm_get_lowest_prio_vcpu(vcpu-kvm, vector, lpr_map);
+   target = kvm_get_lowest_prio_vcpu(vcpu-kvm, vector, lpr_map);
if (target != NULL)
__apic_accept_irq(target-arch.apic, delivery_mode,
  vector, level, trig_mode);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 164a746..bf83f5e 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -195,7 +195,7 @@ void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic 
*ioapic, u8 dest,
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 {
union kvm_ioapic_redirect_entry entry = ioapic-redirtbl[irq];
-   unsigned long deliver_bitmask;
+   DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
struct kvm_vcpu *vcpu;
int vcpu_id, r = 0;
 
@@ -205,22 +205,24 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int 
irq)
 entry.fields.delivery_mode, entry.fields.vector,
 entry.fields.trig_mode);
 
-   kvm_get_intr_delivery_bitmask(ioapic, entry, deliver_bitmask);
-   if (!deliver_bitmask) {
-   ioapic_debug(no target on destination\n);
-   return 0;
-   }
+   bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
 
/* Always delivery PIT interrupt to vcpu 0 */
 #ifdef CONFIG_X86
if (irq == 0)
-   deliver_bitmask = 1  0;
+   set_bit(0, deliver_bitmask);
+   else
 #endif
+   kvm_get_intr_delivery_bitmask(ioapic, entry, deliver_bitmask);
+
+   if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) = KVM_MAX_VCPUS) {
+   ioapic_debug(no target on destination\n);
+   return 0;
+   }
 
-   for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-   if (!(deliver_bitmask  (1  vcpu_id)))
-   continue;
-   deliver_bitmask = ~(1  vcpu_id);
+   while ((vcpu_id = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS))
+KVM_MAX_VCPUS) {
+   clear_bit(vcpu_id, deliver_bitmask);
vcpu = ioapic-kvm-vcpus[vcpu_id];
if (vcpu) {
if (entry.fields.delivery_mode ==
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index e74d679..ecda2c1 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -42,7 +42,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
case IOAPIC_LOWEST_PRIORITY:
vcpu = kvm_get_lowest_prio_vcpu(ioapic-kvm,
entry-fields.vector, deliver_bitmask);
-   *deliver_bitmask = 1  vcpu-vcpu_id;
+   set_bit(vcpu-vcpu_id, deliver_bitmask);
break;
case IOAPIC_FIXED:
case IOAPIC_NMI:
@@ -63,11 +63,12 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 
gsi, int level)
struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
struct kvm_gsi_msg *gsi_msg;
union kvm_ioapic_redirect_entry entry;
-   unsigned long deliver_bitmask;
+   DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
 
BUG_ON(!ioapic);
 #endif
 
+   bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
if (!(gsi  KVM_GSI_MSG_MASK)) {
int irq = gsi;
 
@@ -111,16 +112,15 @@ void 

[PATCH 08/10] KVM: Change API of kvm_ioapic_get_delivery_bitmask

2008-12-29 Thread Sheng Yang
In order to use it with bit ops.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 virt/kvm/ioapic.c   |   17 -
 virt/kvm/ioapic.h   |4 ++--
 virt/kvm/irq_comm.c |5 +++--
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index af9f5de..ebd5ba6 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -153,22 +153,22 @@ static void ioapic_inj_nmi(struct kvm_vcpu *vcpu)
kvm_vcpu_kick(vcpu);
 }
 
-u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-   u8 dest_mode)
+void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
+u8 dest_mode, u32 *mask)
 {
-   u32 mask = 0;
int i;
struct kvm *kvm = ioapic-kvm;
struct kvm_vcpu *vcpu;
 
ioapic_debug(dest %d dest_mode %d\n, dest, dest_mode);
 
+   *mask = 0;
if (dest_mode == 0) {   /* Physical mode. */
if (dest == 0xFF) { /* Broadcast. */
for (i = 0; i  KVM_MAX_VCPUS; ++i)
if (kvm-vcpus[i]  kvm-vcpus[i]-arch.apic)
-   mask |= 1  i;
-   return mask;
+   *mask |= 1  i;
+   return;
}
for (i = 0; i  KVM_MAX_VCPUS; ++i) {
vcpu = kvm-vcpus[i];
@@ -176,7 +176,7 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic 
*ioapic, u8 dest,
continue;
if (kvm_apic_match_physical_addr(vcpu-arch.apic, 
dest)) {
if (vcpu-arch.apic)
-   mask = 1  i;
+   *mask = 1  i;
break;
}
}
@@ -187,10 +187,9 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic 
*ioapic, u8 dest,
continue;
if (vcpu-arch.apic 
kvm_apic_match_logical_addr(vcpu-arch.apic, dest))
-   mask |= 1  vcpu-vcpu_id;
+   *mask |= 1  vcpu-vcpu_id;
}
-   ioapic_debug(mask %x\n, mask);
-   return mask;
+   ioapic_debug(mask %x\n, *mask);
 }
 
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index ee5b0bd..e107dbb 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -70,7 +70,7 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int 
trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-   u8 dest_mode);
+void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
+u8 dest_mode, u32 *mask);
 
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index d89d8b2..1949587 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -35,8 +35,9 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 {
struct kvm_vcpu *vcpu;
 
-   *deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
-   entry-fields.dest_id, entry-fields.dest_mode);
+   kvm_ioapic_get_delivery_bitmask(ioapic, entry-fields.dest_id,
+   entry-fields.dest_mode,
+   deliver_bitmask);
switch (entry-fields.delivery_mode) {
case IOAPIC_LOWEST_PRIORITY:
vcpu = kvm_get_lowest_prio_vcpu(ioapic-kvm,
-- 
1.5.4.5



[PATCH 09/10] KVM: Update intr delivery func to accept unsigned long* bitmap

2008-12-29 Thread Sheng Yang
Would be used with bit ops, and would be easily extended if KVM_MAX_VCPUS is
increased.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 arch/x86/kvm/lapic.c |8 
 include/linux/kvm_host.h |2 +-
 virt/kvm/ioapic.c|4 ++--
 virt/kvm/ioapic.h|4 ++--
 virt/kvm/irq_comm.c  |6 +++---
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afac68c..c1e4935 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -403,7 +403,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int 
delivery_mode,
 }
 
 static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
-  unsigned long bitmap)
+  unsigned long *bitmap)
 {
int last;
int next;
@@ -415,7 +415,7 @@ static struct kvm_lapic *kvm_apic_round_robin(struct kvm 
*kvm, u8 vector,
do {
if (++next == KVM_MAX_VCPUS)
next = 0;
-   if (kvm-vcpus[next] == NULL || !test_bit(next, bitmap))
+   if (kvm-vcpus[next] == NULL || !test_bit(next, bitmap))
continue;
apic = kvm-vcpus[next]-arch.apic;
if (apic  apic_enabled(apic))
@@ -431,7 +431,7 @@ static struct kvm_lapic *kvm_apic_round_robin(struct kvm 
*kvm, u8 vector,
 }
 
 struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-   unsigned long bitmap)
+   unsigned long *bitmap)
 {
struct kvm_lapic *apic;
 
@@ -502,7 +502,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
}
 
if (delivery_mode == APIC_DM_LOWEST) {
-   target = kvm_get_lowest_prio_vcpu(vcpu-kvm, vector, lpr_map);
+   target = kvm_get_lowest_prio_vcpu(vcpu-kvm, vector, lpr_map);
if (target != NULL)
__apic_accept_irq(target-arch.apic, delivery_mode,
  vector, level, trig_mode);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4f92317..fbf102c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -332,7 +332,7 @@ struct kvm_gsi_msg {
 
 void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
   union kvm_ioapic_redirect_entry *entry,
-  u32 *deliver_bitmask);
+  unsigned long *deliver_bitmask);
 void kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 gsi, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index ebd5ba6..164a746 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -154,7 +154,7 @@ static void ioapic_inj_nmi(struct kvm_vcpu *vcpu)
 }
 
 void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-u8 dest_mode, u32 *mask)
+u8 dest_mode, unsigned long *mask)
 {
int i;
struct kvm *kvm = ioapic-kvm;
@@ -195,7 +195,7 @@ void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic 
*ioapic, u8 dest,
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 {
union kvm_ioapic_redirect_entry entry = ioapic-redirtbl[irq];
-   u32 deliver_bitmask;
+   unsigned long deliver_bitmask;
struct kvm_vcpu *vcpu;
int vcpu_id, r = 0;
 
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index e107dbb..c418a7f 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -65,12 +65,12 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm 
*kvm)
 }
 
 struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-  unsigned long bitmap);
+  unsigned long *bitmap);
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
 void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-u8 dest_mode, u32 *mask);
+u8 dest_mode, unsigned long *mask);
 
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 1949587..e74d679 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -31,7 +31,7 @@
 
 void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
   union kvm_ioapic_redirect_entry *entry,
-  u32 *deliver_bitmask)
+  unsigned long *deliver_bitmask)
 {
struct kvm_vcpu *vcpu;
 
@@ -41,7 +41,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
switch 

[PATCH 01/10] KVM: Add a route layer to convert MSI message to GSI

2008-12-29 Thread Sheng Yang
Per Avi's suggestion, use a single kvm_set_irq() to deal with all interrupts,
including MSI. So here it is.

struct kvm_gsi_msg is a mapping from a special gsi (one with
KVM_GSI_MSG_MASK set) to an MSI/MSI-X message address/data pair.

We now support up to 256 gsi_msg mappings; they are allocated by the kernel,
and two ioctls are provided to userspace, which is more flexible.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 include/linux/kvm.h  |   14 +
 include/linux/kvm_host.h |   16 ++
 virt/kvm/irq_comm.c  |   70 ++
 virt/kvm/kvm_main.c  |   66 +++
 4 files changed, 166 insertions(+), 0 deletions(-)

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index c24f207..a75e01f 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -396,6 +396,9 @@ struct kvm_trace_rec {
 #if defined(CONFIG_X86)
 #define KVM_CAP_SET_GUEST_DEBUG 23
 #endif
+#if defined(CONFIG_X86)
+#define KVM_CAP_GSI_MSG 24
+#endif
 
 /*
  * ioctls for VM fds
@@ -429,6 +432,8 @@ struct kvm_trace_rec {
   struct kvm_assigned_pci_dev)
 #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
struct kvm_assigned_irq)
+#define KVM_REQUEST_GSI_MSG _IOWR(KVMIO, 0x71, struct kvm_assigned_gsi_msg)
+#define KVM_FREE_GSI_MSG _IOR(KVMIO, 0x72, struct kvm_assigned_gsi_msg)
 
 /*
  * ioctls for vcpu fds
@@ -549,4 +554,13 @@ struct kvm_assigned_irq {
 #define KVM_DEV_IRQ_ASSIGN_MSI_ACTION  (1  0)
 #define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI  (1  1)
 
+struct kvm_assigned_gsi_msg {
+   __u32 gsi;
+   struct {
+   __u32 addr_lo;
+   __u32 addr_hi;
+   __u32 data;
+   } msg;
+};
+
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d63e9a4..0e5741a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -132,6 +132,10 @@ struct kvm {
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
 #endif
+   struct hlist_head gsi_msg_list;
+   struct mutex gsi_msg_lock;
+#define KVM_NR_GSI_MSG 256
+   DECLARE_BITMAP(gsi_msg_bitmap, KVM_NR_GSI_MSG);
 };
 
 /* The guest did something we don't support. */
@@ -319,6 +323,14 @@ struct kvm_assigned_dev_kernel {
struct pci_dev *dev;
struct kvm *kvm;
 };
+
+#define KVM_GSI_MSG_MASK0x100ull
+struct kvm_gsi_msg {
+   u32 gsi;
+   struct msi_msg msg;
+   struct hlist_node link;
+};
+
 void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
@@ -326,6 +338,10 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
 void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian);
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
+int kvm_update_gsi_msg(struct kvm *kvm, struct kvm_gsi_msg *gsi_msg);
+struct kvm_gsi_msg *kvm_find_gsi_msg(struct kvm *kvm, u32 gsi);
+void kvm_free_gsi_msg(struct kvm *kvm, struct kvm_gsi_msg *gsi_msg);
+void kvm_free_gsi_msg_list(struct kvm *kvm);
 
 #ifdef CONFIG_DMAR
 int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index aa5d1e5..abfab46 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -99,3 +99,73 @@ void kvm_free_irq_source_id(struct kvm *kvm, int 
irq_source_id)
clear_bit(irq_source_id, kvm-arch.irq_states[i]);
clear_bit(irq_source_id, kvm-arch.irq_sources_bitmap);
 }
+
+int kvm_update_gsi_msg(struct kvm *kvm, struct kvm_gsi_msg *gsi_msg)
+{
+   struct kvm_gsi_msg *found_msg, *new_gsi_msg;
+   int r, gsi;
+
+   mutex_lock(kvm-gsi_msg_lock);
+   /* Find whether we need a update or a new entry */
+   found_msg = kvm_find_gsi_msg(kvm, gsi_msg-gsi);
+   if (found_msg)
+   *found_msg = *gsi_msg;
+   else {
+   gsi = find_first_zero_bit(kvm-gsi_msg_bitmap, KVM_NR_GSI_MSG);
+   if (gsi = KVM_NR_GSI_MSG) {
+   r = -ENOSPC;
+   goto out;
+   }
+   __set_bit(gsi, kvm-gsi_msg_bitmap);
+   gsi_msg-gsi = gsi | KVM_GSI_MSG_MASK;
+   new_gsi_msg = kzalloc(sizeof(*new_gsi_msg), GFP_KERNEL);
+   if (!new_gsi_msg) {
+   r = -ENOMEM;
+   goto out;
+   }
+   *new_gsi_msg = *gsi_msg;
+   hlist_add_head(new_gsi_msg-link, kvm-gsi_msg_list);
+   }
+   r = 0;
+out:
+   mutex_unlock(kvm-gsi_msg_lock);
+   return r;
+}
+
+/* Call with kvm-gsi_msg_lock hold */
+struct kvm_gsi_msg *kvm_find_gsi_msg(struct kvm *kvm, u32 gsi)
+{
+   struct kvm_gsi_msg *gsi_msg;
+   struct hlist_node *n;
+
+   if (!(gsi  KVM_GSI_MSG_MASK))
+   return NULL;
+   

[PATCH 06/10] KVM: Split IOAPIC structure

2008-12-29 Thread Sheng Yang
Preparation for reusing ioapic_redir_entry for MSI.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 include/linux/kvm_types.h |   17 +
 virt/kvm/ioapic.c |6 +++---
 virt/kvm/ioapic.h |   17 +
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 5f4a18c..46e3d8d 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -52,4 +52,21 @@ struct kvm_pio_request {
int rep;
 };
 
+union kvm_ioapic_redirect_entry {
+   u64 bits;
+   struct {
+   u8 vector;
+   u8 delivery_mode:3;
+   u8 dest_mode:1;
+   u8 delivery_status:1;
+   u8 polarity:1;
+   u8 remote_irr:1;
+   u8 trig_mode:1;
+   u8 mask:1;
+   u8 reserve:7;
+   u8 reserved[4];
+   u8 dest_id;
+   } fields;
+};
+
 #endif /* __KVM_TYPES_H__ */
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 23b81cf..ebb2ab5 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -85,7 +85,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic 
*ioapic,
 
 static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
 {
-   union ioapic_redir_entry *pent;
+   union kvm_ioapic_redirect_entry *pent;
 
pent = ioapic-redirtbl[idx];
 
@@ -272,7 +272,7 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, 
int level)
 {
u32 old_irr = ioapic-irr;
u32 mask = 1  irq;
-   union ioapic_redir_entry entry;
+   union kvm_ioapic_redirect_entry entry;
 
if (irq = 0  irq  IOAPIC_NUM_PINS) {
entry = ioapic-redirtbl[irq];
@@ -291,7 +291,7 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, 
int level)
 static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi,
int trigger_mode)
 {
-   union ioapic_redir_entry *ent;
+   union kvm_ioapic_redirect_entry *ent;
 
ent = ioapic-redirtbl[gsi];
 
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 49c9581..ee5b0bd 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -40,22 +40,7 @@ struct kvm_ioapic {
u32 id;
u32 irr;
u32 pad;
-   union ioapic_redir_entry {
-   u64 bits;
-   struct {
-   u8 vector;
-   u8 delivery_mode:3;
-   u8 dest_mode:1;
-   u8 delivery_status:1;
-   u8 polarity:1;
-   u8 remote_irr:1;
-   u8 trig_mode:1;
-   u8 mask:1;
-   u8 reserve:7;
-   u8 reserved[4];
-   u8 dest_id;
-   } fields;
-   } redirtbl[IOAPIC_NUM_PINS];
+   union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
struct kvm_io_device dev;
struct kvm *kvm;
void (*ack_notifier)(void *opaque, int irq);
-- 
1.5.4.5



[PATCH 05/10] KVM: Merge MSI handling to kvm_set_irq

2008-12-29 Thread Sheng Yang
Using kvm_set_irq to handle all interrupt injection.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 include/linux/kvm_host.h |2 +-
 virt/kvm/irq_comm.c  |   98 +++---
 virt/kvm/kvm_main.c  |   77 +++-
 3 files changed, 90 insertions(+), 87 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index aa2606b..5b671b6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -330,7 +330,7 @@ struct kvm_gsi_msg {
struct hlist_node link;
 };
 
-void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
+void kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 gsi, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
   struct kvm_irq_ack_notifier *kian);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 47243ef..63cdf01 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -20,28 +20,96 @@
  */
 
 #include linux/kvm_host.h
+
+#ifdef CONFIG_X86
+#include asm/msidef.h
+#endif
+
 #include irq.h
 
 #include ioapic.h
 
 /* This should be called with the kvm-lock mutex held */
-void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level)
+void kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 gsi, int level)
 {
-   unsigned long *irq_state = (unsigned long *)kvm-arch.irq_states[irq];
-
-   /* Logical OR for level trig interrupt */
-   if (level)
-   set_bit(irq_source_id, irq_state);
-   else
-   clear_bit(irq_source_id, irq_state);
-
-   /* Not possible to detect if the guest uses the PIC or the
-* IOAPIC.  So set the bit in both. The guest will ignore
-* writes to the unused one.
-*/
-   kvm_ioapic_set_irq(ioapic_irqchip(kvm), irq, !!(*irq_state));
+   unsigned long *irq_state;
+#ifdef CONFIG_X86
+   int vcpu_id;
+   struct kvm_vcpu *vcpu;
+   struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+   struct kvm_gsi_msg *gsi_msg;
+   int dest_id, vector, dest_mode, trig_mode, delivery_mode;
+   u32 deliver_bitmask;
+
+   BUG_ON(!ioapic);
+#endif
+
+   if (!(gsi  KVM_GSI_MSG_MASK)) {
+   int irq = gsi;
+
+   irq_state = (unsigned long *)kvm-arch.irq_states[irq];
+
+   /* Logical OR for level trig interrupt */
+   if (level)
+   set_bit(irq_source_id, irq_state);
+   else
+   clear_bit(irq_source_id, irq_state);
+
+   /* Not possible to detect if the guest uses the PIC or the
+* IOAPIC.  So set the bit in both. The guest will ignore
+* writes to the unused one.
+*/
+   kvm_ioapic_set_irq(ioapic, irq, !!(*irq_state));
 #ifdef CONFIG_X86
-   kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state));
+   kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state));
+#endif
+   return;
+   }
+
+#ifdef CONFIG_X86
+   mutex_lock(kvm-gsi_msg_lock);
+   gsi_msg = kvm_find_gsi_msg(kvm, gsi);
+   mutex_unlock(kvm-gsi_msg_lock);
+   if (!gsi_msg) {
+   printk(KERN_WARNING kvm: fail to find correlated gsi_msg\n);
+   return;
+   }
+
+   dest_id = (gsi_msg-msg.address_lo  MSI_ADDR_DEST_ID_MASK)
+MSI_ADDR_DEST_ID_SHIFT;
+   vector = (gsi_msg-msg.data  MSI_DATA_VECTOR_MASK)
+MSI_DATA_VECTOR_SHIFT;
+   dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
+   (unsigned long *)gsi_msg-msg.address_lo);
+   trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
+   (unsigned long *)gsi_msg-msg.data);
+   delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT,
+   (unsigned long *)gsi_msg-msg.data);
+   deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
+   dest_id, dest_mode);
+   /* IOAPIC delivery mode value is the same as MSI here */
+   switch (delivery_mode) {
+   case IOAPIC_LOWEST_PRIORITY:
+   vcpu = kvm_get_lowest_prio_vcpu(ioapic-kvm, vector,
+   deliver_bitmask);
+   if (vcpu != NULL)
+   kvm_apic_set_irq(vcpu, vector, trig_mode);
+   else
+   printk(KERN_INFO kvm: null lowest priority vcpu!\n);
+   break;
+   case IOAPIC_FIXED:
+   for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
+   if (!(deliver_bitmask  (1  vcpu_id)))
+   continue;
+   deliver_bitmask = ~(1  vcpu_id);
+   vcpu = ioapic-kvm-vcpus[vcpu_id];
+   if (vcpu)
+   kvm_apic_set_irq(vcpu, vector, 

[PATCH 02/10] KVM: Using gsi_msg mapping for MSI device assignment

2008-12-29 Thread Sheng Yang
Convert the MSI userspace interface to support gsi_msg mapping (and nobody
should be using the old interface yet...).

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 include/linux/kvm_host.h |1 -
 virt/kvm/kvm_main.c  |   35 ++-
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0e5741a..aa2606b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -313,7 +313,6 @@ struct kvm_assigned_dev_kernel {
int host_irq;
bool host_irq_disabled;
int guest_irq;
-   struct msi_msg guest_msi;
 #define KVM_ASSIGNED_DEV_GUEST_INTX(1  0)
 #define KVM_ASSIGNED_DEV_GUEST_MSI (1  1)
 #define KVM_ASSIGNED_DEV_HOST_INTX (1  8)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 26bccf9..3494861 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -92,20 +92,30 @@ static void assigned_device_msi_dispatch(struct 
kvm_assigned_dev_kernel *dev)
int vcpu_id;
struct kvm_vcpu *vcpu;
struct kvm_ioapic *ioapic = ioapic_irqchip(dev-kvm);
-   int dest_id = (dev-guest_msi.address_lo  MSI_ADDR_DEST_ID_MASK)
-MSI_ADDR_DEST_ID_SHIFT;
-   int vector = (dev-guest_msi.data  MSI_DATA_VECTOR_MASK)
-MSI_DATA_VECTOR_SHIFT;
-   int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
-   (unsigned long *)dev-guest_msi.address_lo);
-   int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
-   (unsigned long *)dev-guest_msi.data);
-   int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT,
-   (unsigned long *)dev-guest_msi.data);
+   struct kvm_gsi_msg *gsi_msg;
+   int dest_id, vector, dest_mode, trig_mode, delivery_mode;
u32 deliver_bitmask;
 
BUG_ON(!ioapic);
 
+   mutex_lock(dev-kvm-gsi_msg_lock);
+   gsi_msg = kvm_find_gsi_msg(dev-kvm, dev-guest_irq);
+   if (!gsi_msg) {
+   printk(KERN_WARNING kvm: fail to find correlated gsi_msg\n);
+   return;
+   }
+   mutex_unlock(dev-kvm-gsi_msg_lock);
+
+   dest_id = (gsi_msg-msg.address_lo  MSI_ADDR_DEST_ID_MASK)
+MSI_ADDR_DEST_ID_SHIFT;
+   vector = (gsi_msg-msg.data  MSI_DATA_VECTOR_MASK)
+MSI_DATA_VECTOR_SHIFT;
+   dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
+   (unsigned long *)gsi_msg-msg.address_lo);
+   trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
+   (unsigned long *)gsi_msg-msg.data);
+   delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT,
+   (unsigned long *)gsi_msg-msg.data);
deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
dest_id, dest_mode);
/* IOAPIC delivery mode value is the same as MSI here */
@@ -316,17 +326,16 @@ static int assigned_device_update_msi(struct kvm *kvm,
 {
int r;
 
+   adev-guest_irq = airq-guest_irq;
+
if (airq-flags  KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) {
/* x86 don't care upper address of guest msi message addr */
adev-irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI;
adev-irq_requested_type = ~KVM_ASSIGNED_DEV_GUEST_INTX;
-   adev-guest_msi.address_lo = airq-guest_msi.addr_lo;
-   adev-guest_msi.data = airq-guest_msi.data;
adev-ack_notifier.gsi = -1;
} else if (msi2intx) {
adev-irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX;
adev-irq_requested_type = ~KVM_ASSIGNED_DEV_GUEST_MSI;
-   adev-guest_irq = airq-guest_irq;
adev-ack_notifier.gsi = airq-guest_irq;
} else {
/*
-- 
1.5.4.5



[PATCH 07/10] KVM: Unified the delivery of IOAPIC and MSI

2008-12-29 Thread Sheng Yang

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 include/linux/kvm_host.h |3 ++
 virt/kvm/ioapic.c|   84 +-
 virt/kvm/irq_comm.c  |   75 
 3 files changed, 79 insertions(+), 83 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5b671b6..4f92317 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -330,6 +330,9 @@ struct kvm_gsi_msg {
struct hlist_node link;
 };
 
+void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+  union kvm_ioapic_redirect_entry *entry,
+  u32 *deliver_bitmask);
 void kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 gsi, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index ebb2ab5..af9f5de 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -195,75 +195,53 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic 
*ioapic, u8 dest,
 
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 {
-   u8 dest = ioapic-redirtbl[irq].fields.dest_id;
-   u8 dest_mode = ioapic-redirtbl[irq].fields.dest_mode;
-   u8 delivery_mode = ioapic-redirtbl[irq].fields.delivery_mode;
-   u8 vector = ioapic-redirtbl[irq].fields.vector;
-   u8 trig_mode = ioapic-redirtbl[irq].fields.trig_mode;
+   union kvm_ioapic_redirect_entry entry = ioapic-redirtbl[irq];
u32 deliver_bitmask;
struct kvm_vcpu *vcpu;
int vcpu_id, r = 0;
 
ioapic_debug(dest=%x dest_mode=%x delivery_mode=%x 
 vector=%x trig_mode=%x\n,
-dest, dest_mode, delivery_mode, vector, trig_mode);
+entry.fields.dest, entry.fields.dest_mode,
+entry.fields.delivery_mode, entry.fields.vector,
+entry.fields.trig_mode);
 
-   deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, dest,
- dest_mode);
+   kvm_get_intr_delivery_bitmask(ioapic, entry, deliver_bitmask);
if (!deliver_bitmask) {
ioapic_debug(no target on destination\n);
return 0;
}
 
-   switch (delivery_mode) {
-   case IOAPIC_LOWEST_PRIORITY:
-   vcpu = kvm_get_lowest_prio_vcpu(ioapic-kvm, vector,
-   deliver_bitmask);
+   /* Always delivery PIT interrupt to vcpu 0 */
 #ifdef CONFIG_X86
-   if (irq == 0)
-   vcpu = ioapic-kvm-vcpus[0];
+   if (irq == 0)
+   deliver_bitmask = 1  0;
 #endif
-   if (vcpu != NULL)
-   r = ioapic_inj_irq(ioapic, vcpu, vector,
-  trig_mode, delivery_mode);
-   else
-   ioapic_debug(null lowest prio vcpu: 
-mask=%x vector=%x delivery_mode=%x\n,
-deliver_bitmask, vector, 
IOAPIC_LOWEST_PRIORITY);
-   break;
-   case IOAPIC_FIXED:
-#ifdef CONFIG_X86
-   if (irq == 0)
-   deliver_bitmask = 1;
-#endif
-   for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-   if (!(deliver_bitmask  (1  vcpu_id)))
-   continue;
-   deliver_bitmask = ~(1  vcpu_id);
-   vcpu = ioapic-kvm-vcpus[vcpu_id];
-   if (vcpu) {
-   r = ioapic_inj_irq(ioapic, vcpu, vector,
-  trig_mode, delivery_mode);
-   }
-   }
-   break;
-   case IOAPIC_NMI:
-   for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-   if (!(deliver_bitmask  (1  vcpu_id)))
-   continue;
-   deliver_bitmask = ~(1  vcpu_id);
-   vcpu = ioapic-kvm-vcpus[vcpu_id];
-   if (vcpu)
+
+   for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
+   if (!(deliver_bitmask  (1  vcpu_id)))
+   continue;
+   deliver_bitmask = ~(1  vcpu_id);
+   vcpu = ioapic-kvm-vcpus[vcpu_id];
+   if (vcpu) {
+   if (entry.fields.delivery_mode ==
+   IOAPIC_LOWEST_PRIORITY ||
+   entry.fields.delivery_mode == IOAPIC_FIXED)
+   r = ioapic_inj_irq(ioapic, vcpu,
+  entry.fields.vector,
+  entry.fields.trig_mode,
+  

Re: [PATCH 0/10][v3] GSI-MSG route layer for MSI/MSI-X

2008-12-29 Thread Sheng Yang
On Tuesday 30 December 2008 13:55:52 Sheng Yang wrote:
 Update from v2:
 Add gsi_msg_pending_bitmap, in order to support MSI-X multiple interrupts.

 And this one depends on the former Device assignment code clean up and MSI
 disable support patchset.

Sorry, a little chaos here. The gsi_msg_pending_bitmap is actually in the
MSI-X patchset...

So this one should be almost the same as v2. Just a resend.

-- 
regards
Yang, Sheng




[Patch 0/3][v2] Userspace for MSI-X

2008-12-29 Thread Sheng Yang
Changes from v2:
Move MMIO intercepting to userspace, and add two new ioctls.

Since kernel-space MSI-X depends on a lot of related patches I sent before,
this time I have kept it and will wait for the others to be checked in.

Please help to review, thanks.
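
For context, the rough flow on the userspace side is expected to be (a
sketch based on patches 2/3 and 3/3; dev_id and the per-entry values are
placeholders, error handling omitted):

struct kvm_assigned_msix_nr msix_nr;

/* 1. Tell the kernel how many MSI-X entries are actually in use
 *    (unmasked, non-zero entries in the device's MSI-X table). */
msix_nr.assigned_dev_id = dev_id;   /* same id used at assignment time */
msix_nr.entry_nr        = used_entries;
kvm_set_msix_nr(kvm_context, &msix_nr);

/* 2. For each used table entry, request a gsi for its MSI message via
 *    KVM_REQUEST_GSI_MSG and bind it to the entry with
 *    kvm_set_msix_entry() (see patch 3/3 for the full loop). */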

--
regards
Yang, Sheng


[PATCH 1/3] Add MSI-X related macro to pci.c

2008-12-29 Thread Sheng Yang

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 qemu/hw/pci.h |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/qemu/hw/pci.h b/qemu/hw/pci.h
index f2a622c..22c5de1 100644
--- a/qemu/hw/pci.h
+++ b/qemu/hw/pci.h
@@ -87,6 +87,7 @@ typedef struct PCIIORegion {
 #define PCI_CAPABILITY_CONFIG_MAX_LENGTH 0x60
 #define PCI_CAPABILITY_CONFIG_DEFAULT_START_ADDR 0x40
 #define PCI_CAPABILITY_CONFIG_MSI_LENGTH 0x10
+#define PCI_CAPABILITY_CONFIG_MSIX_LENGTH 0x10
 
 struct PCIDevice {
 /* PCI config space */
-- 
1.5.4.5



[PATCH 3/3] kvm: enable MSI-X capabilty for assigned device

2008-12-29 Thread Sheng Yang
The most important part here is that we emulate one page of the MMIO region
with a page of ordinary memory. That's because the MSI-X table lives in that
region and we have to intercept accesses to it.
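
For reference, the 16-byte-per-entry layout that the offsets in the code
below assume is (an illustrative struct, not part of the patch):

struct msix_table_entry {
        uint32_t addr_lo;   /* offset 0:  message address, low 32 bits  */
        uint32_t addr_hi;   /* offset 4:  message address, high 32 bits */
        uint32_t data;      /* offset 8:  message data                  */
        uint32_t ctrl;      /* offset 12: vector control, bit 0 = mask  */
};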

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 qemu/hw/device-assignment.c |  275 ++-
 qemu/hw/device-assignment.h |6 +
 2 files changed, 276 insertions(+), 5 deletions(-)

diff --git a/qemu/hw/device-assignment.c b/qemu/hw/device-assignment.c
index 2d3e67e..dc2020a 100644
--- a/qemu/hw/device-assignment.c
+++ b/qemu/hw/device-assignment.c
@@ -146,6 +146,7 @@ static void assigned_dev_iomem_map(PCIDevice *pci_dev, int 
region_num,
 {
 AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
 AssignedDevRegion *region = r_dev-v_addrs[region_num];
+PCIRegion *real_region = r_dev-real_device.regions[region_num];
 uint32_t old_ephys = region-e_physbase;
 uint32_t old_esize = region-e_size;
 int first_map = (region-e_size == 0);
@@ -164,10 +165,27 @@ static void assigned_dev_iomem_map(PCIDevice *pci_dev, 
int region_num,
  TARGET_PAGE_ALIGN(old_esize));
 }
 
-    if (e_size > 0)
+    if (e_size > 0) {
+        /* deal with MSI-X MMIO page */
+        if (real_region->base_addr <= r_dev->msix_table_addr &&
+            real_region->base_addr + real_region->size >=
+                r_dev->msix_table_addr) {
+            int offset = r_dev->msix_table_addr - real_region->base_addr;
+            ret = munmap(region->u.r_virtbase + offset, TARGET_PAGE_SIZE);
+            if (ret == 0)
+                DEBUG("munmap done, virt_base 0x%p\n",
+                      region->u.r_virtbase + offset);
+            else {
+                fprintf(stderr, "%s: fail munmap msix table!\n", __func__);
+                exit(1);
+            }
+            cpu_register_physical_memory(e_phys + offset,
+                                         TARGET_PAGE_SIZE, r_dev->mmio_index);
+        }
         ret = kvm_register_phys_mem(kvm_context, e_phys,
                                     region->u.r_virtbase,
                                     TARGET_PAGE_ALIGN(e_size), 0);
+    }
 
     if (ret != 0) {
         fprintf(stderr, "%s: Error: create new mapping failed\n", __func__);
@@ -570,7 +588,9 @@ void assigned_dev_update_irq(PCIDevice *d)
 }
 }
 
-#if defined(KVM_CAP_DEVICE_MSI) && defined (KVM_CAP_GSI_MSG)
+#ifdef KVM_CAP_GSI_MSG
+
+#ifdef KVM_CAP_DEVICE_MSI
 static void assigned_dev_update_msi(PCIDevice *pci_dev, unsigned int ctrl_pos)
 {
 struct kvm_assigned_irq assigned_irq_data;
@@ -610,14 +630,140 @@ static void assigned_dev_update_msi(PCIDevice *pci_dev, unsigned int ctrl_pos)
 }
 #endif
 
-void assigned_device_pci_cap_write_config(PCIDevice *pci_dev, uint32_t address,
+#ifdef KVM_CAP_DEVICE_MSIX
+static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
+{
+    AssignedDevice *adev = container_of(pci_dev, AssignedDevice, dev);
+    u16 entries_nr = 0, entries_max_nr;
+    int pos = 0, i, r = 0;
+    u32 msg_addr, msg_upper_addr, msg_data, msg_ctrl;
+    struct kvm_assigned_msix_nr msix_nr;
+    struct kvm_assigned_msix_entry msix_entry;
+    struct kvm_assigned_gsi_msg gsi_msg;
+    void *va = adev->msix_table_page;
+
+    if (adev->cap.available & ASSIGNED_DEVICE_CAP_MSI)
+        pos = PCI_CAPABILITY_CONFIG_MSI_LENGTH;
+    entries_max_nr = pci_dev->cap.config[pos + 2];
+    entries_max_nr &= PCI_MSIX_TABSIZE;
+
+    /* Get the usable entry number for allocating */
+    for (i = 0; i < entries_max_nr; i++) {
+        memcpy(&msg_ctrl, va + i * 16 + 12, 4);
+        /* 0x1 is mask bit for per vector */
+        if (msg_ctrl & 0x1)
+            continue;
+        memcpy(&msg_data, va + i * 16 + 8, 4);
+        /* Ignore unused entry even it's unmasked */
+        if (msg_data == 0)
+            continue;
+        entries_nr ++;
+    }
+
+    msix_nr.assigned_dev_id = calc_assigned_dev_id(adev->h_busnr,
+                                          (uint8_t)adev->h_devfn);
+    msix_nr.entry_nr = entries_nr;
+    r = kvm_set_msix_nr(kvm_context, &msix_nr);
+    if (r != 0) {
+        fprintf(stderr, "fail to set MSI-X entry number for MSIX! %s\n",
+                strerror(-r));
+        return r;
+    }
+
+    msix_entry.assigned_dev_id = msix_nr.assigned_dev_id;
+    entries_nr = 0;
+    for (i = 0; i < entries_max_nr; i++) {
+        if (entries_nr >= msix_nr.entry_nr)
+            break;
+        memcpy(&msg_ctrl, va + i * 16 + 12, 4);
+        if (msg_ctrl & 0x1)
+            continue;
+        memcpy(&msg_data, va + i * 16 + 8, 4);
+        if (msg_data == 0)
+            continue;
+
+        memcpy(&msg_addr, va + i * 16, 4);
+        memcpy(&msg_upper_addr, va + i * 16 + 4, 4);
+
+        gsi_msg.gsi = 0;
+        gsi_msg.msg.addr_lo = msg_addr;
+        gsi_msg.msg.addr_hi = msg_upper_addr;
+        gsi_msg.msg.data = msg_data;
+        r = kvm_request_gsi_msg(kvm_context, &gsi_msg);
+        if (r) {
+            fprintf(stderr, "fail to request gsi msg for MSIX! %s\n",
+   

[PATCH 2/3] kvm: add ioctl KVM_SET_MSIX_ENTRY_NR and KVM_SET_MSIX_ENTRY

2008-12-29 Thread Sheng Yang

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 libkvm/libkvm.c |   26 ++
 libkvm/libkvm.h |6 ++
 2 files changed, 32 insertions(+), 0 deletions(-)

diff --git a/libkvm/libkvm.c b/libkvm/libkvm.c
index ddcc929..ad218e1 100644
--- a/libkvm/libkvm.c
+++ b/libkvm/libkvm.c
@@ -1171,3 +1171,29 @@ int kvm_free_gsi_msg(kvm_context_t kvm, uint32_t gsi)
 }
 
 #endif
+
+#ifdef KVM_CAP_DEVICE_MSIX
+int kvm_set_msix_nr(kvm_context_t kvm,
+                    struct kvm_assigned_msix_nr *msix_nr)
+{
+    int ret;
+
+    ret = ioctl(kvm->vm_fd, KVM_SET_MSIX_NR, msix_nr);
+    if (ret < 0)
+        return -errno;
+
+    return ret;
+}
+
+int kvm_set_msix_entry(kvm_context_t kvm,
+                       struct kvm_assigned_msix_entry *entry)
+{
+    int ret;
+
+    ret = ioctl(kvm->vm_fd, KVM_SET_MSIX_ENTRY, entry);
+    if (ret < 0)
+        return -errno;
+
+    return ret;
+}
+#endif
diff --git a/libkvm/libkvm.h b/libkvm/libkvm.h
index 53e57c5..af704f7 100644
--- a/libkvm/libkvm.h
+++ b/libkvm/libkvm.h
@@ -725,4 +725,10 @@ int kvm_request_gsi_msg(kvm_context_t kvm,
 int kvm_free_gsi_msg(kvm_context_t kvm, uint32_t gsi);
 #endif
 
+#ifdef KVM_CAP_DEVICE_MSIX
+int kvm_set_msix_nr(kvm_context_t kvm,
+                    struct kvm_assigned_msix_nr *msix_nr);
+int kvm_set_msix_entry(kvm_context_t kvm,
+                       struct kvm_assigned_msix_entry *entry);
+#endif
 #endif
-- 
1.5.4.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html