Re: [PATCH v5 3/4] kvm: Create kvm_clear_irq()

2012-07-18 Thread Gleb Natapov
On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
  _Seems_ racy, or _is_ racy?  Please identify the race.
 
 Look at this:
 
 static inline int kvm_irq_line_state(unsigned long *irq_state,
  int irq_source_id, int level)
 {
 /* Logical OR for level trig interrupt */
 if (level)
 set_bit(irq_source_id, irq_state);
 else
 clear_bit(irq_source_id, irq_state);
 
 return !!(*irq_state);
 }
 
 
 Now:
 If other CPU changes some other bit after the atomic change,
 it looks like !!(*irq_state) might return a stale value.
 
 CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
 If CPU 0 sees a stale value now it will return 0 here
 and interrupt will get cleared.
 
This will hardly happen on x86, especially since the bit is set with a
serializing instruction. But there is actually a race here.
CPU 0 clears bit 0. CPU 0 reads irq_state as 0. CPU 1 sets level to 1.
CPU 1 calls kvm_ioapic_set_irq(1). CPU 0 calls kvm_ioapic_set_irq(0).
Now ioapic thinks the level is 0 but irq_state is not 0.

This untested and un-compiled patch should fix it.

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ef91d79..e22c78b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -825,7 +825,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct 
kvm_mmu *mmu,
 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
 
-int kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_set_irq(void *opaque, int irq);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 81cf4fa..0d6988f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -188,12 +188,13 @@ void kvm_pic_update_irq(struct kvm_pic *s)
pic_unlock(s);
 }
 
-int kvm_pic_set_irq(void *opaque, int irq, int level)
+int kvm_pic_set_irq(void *opaque, int irq)
 {
struct kvm_pic *s = opaque;
-   int ret = -1;
+   int ret = -1, level;
 
pic_lock(s);
+   level = !!s-irq_states[irq];
if (irq = 0  irq  PIC_NUM_PINS) {
ret = pic_set_irq1(s-pics[irq  3], irq  7, level);
pic_update_irq(s);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 26fd54d..6ad6a6b 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -191,14 +191,15 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int 
irq)
return kvm_irq_delivery_to_apic(ioapic-kvm, NULL, irqe);
 }
 
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq)
 {
u32 old_irr;
u32 mask = 1  irq;
union kvm_ioapic_redirect_entry entry;
-   int ret = 1;
+   int ret = 1, level;
 
spin_lock(ioapic-lock);
+   level = !!ioapic-irq_states[irq];
old_irr = ioapic-irr;
if (irq = 0  irq  IOAPIC_NUM_PINS) {
entry = ioapic-redirtbl[irq];
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 32872a0..65894dd 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -74,7 +74,7 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int 
trigger_mode);
 bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_destroy(struct kvm *kvm);
-int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index a6a0365..db0ccef 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -33,7 +33,7 @@
 
 #include ioapic.h
 
-static inline int kvm_irq_line_state(unsigned long *irq_state,
+static inline void kvm_irq_line_state(unsigned long *irq_state,
 int irq_source_id, int level)
 {
/* Logical OR for level trig interrupt */
@@ -41,8 +41,6 @@ static inline int kvm_irq_line_state(unsigned long *irq_state,
set_bit(irq_source_id, irq_state);
else
clear_bit(irq_source_id, irq_state);
-
-   return !!(*irq_state);
 }
 
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
@@ -50,9 +48,9 @@ static int kvm_set_pic_irq(struct 
kvm_kernel_irq_routing_entry *e,
 {
 #ifdef CONFIG_X86
struct kvm_pic *pic = pic_irqchip(kvm);
-   level = kvm_irq_line_state(pic-irq_states[e-irqchip.pin],
+   kvm_irq_line_state(pic-irq_states[e-irqchip.pin],
   irq_source_id, level);
-   return kvm_pic_set_irq(pic, e-irqchip.pin, level);
+   return kvm_pic_set_irq(pic, e-irqchip.pin);
 #else
return -1;
 

Biweekly KVM Test report, kernel 58d8b172... qemu 28c3a9b1..

2012-07-18 Thread Ren, Yongjie
Hi All,

This is KVM upstream test result against kvm.git next branch and qemu-kvm.git 
master branch.
 kvm.git next branch: 58d8b1728ea3da391ef01c43a384ea06ce4b7c8a based on 
kernel 3.5.0-rc1
 qemu-kvm.git master branch: 28c3a9b197900c88f27b14f8862a7a15c00dc7f0

We found no new bug and no bug got fixed in the past two weeks. 
The block issue for device assignment has been fixed in kvm master branch.
As we are using kvm next branch, we still leave it open until fix is included 
in next branch.

New issue (0):

Fixed issue (0):

Old issues (4):
--
1. (Nested-virt)L1 (kvm on kvm)guest panic with parameter -cpu host in qemu 
command line.
  https://bugs.launchpad.net/qemu/+bug/994378
2. Can't install or boot up 32bit win8 guest.
  https://bugs.launchpad.net/qemu/+bug/1007269
3. VT-d/SR-IOV doesn't work in the guest
  https://bugzilla.kernel.org/show_bug.cgi?id=43328
4. vCPU hot-add makes the guest abort. 
  https://bugs.launchpad.net/qemu/+bug/1019179

Test environment:
==
  Platform      Westmere-EP    Sandybridge-EP
  CPU Cores     24             32
  Memory size   24G            32G


Best Regards,
 Yongjie Ren  (Jay)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/5] Add vhost-blk support

2012-07-18 Thread Asias He

On 07/17/2012 07:11 PM, Stefan Hajnoczi wrote:

On Tue, Jul 17, 2012 at 10:21 AM, Asias He as...@redhat.com wrote:

On 07/17/2012 04:52 PM, Paolo Bonzini wrote:


Il 17/07/2012 10:29, Asias He ha scritto:


So, vhost-blk at least saves ~6 syscalls for us in each request.



Are they really 6?  If I/O is coalesced by a factor of 3, for example
(i.e. each exit processes 3 requests), it's really 2 syscalls per request.



Well. I am counting the number of syscalls in one notify and response
process. Sure the IO can be coalesced.


Linux AIO also supports batching in io_submit() and io_getevents().
Depending on the request pattern in the vring when you process it, you
should be able to do better than 1 set of syscalls per host I/O
request.

Are you taking advantage of that at the moment in your userspace benchmark?


OK. I know about batching in io_submit() and io_getevents(). There was a
patch for kvm tool a long time ago. Now, neither vhost-blk nor kvm tool is
taking advantage of that atm. There are issues: e.g. how many requests
do we want to batch? Does this batching hurt latency?


--
Asias


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Re: How to get the real device in guest os after attached a disk?

2012-07-18 Thread Stefan Hajnoczi
On Wed, Jul 18, 2012 at 5:01 AM, Wangpan hzwang...@corp.netease.com wrote:
 But how can I get the serial info in the guest os?

General documentation on persistent block device naming (just grabbed
the first useful link of Google):
https://wiki.archlinux.org/index.php/Persistent_block_device_naming

If your guest is recent enough it will automatically show the
virtio-blk device in /dev/disk/by-id.

If not, you may be able to check /sys/block/vda/serial yourself.  If
that file is not present either, your guest doesn't support virtio-blk
serial: either upgrade to a newer guest OS version or use one of the
other persistent naming mechanisms explained in the link above.

Stefan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/5] Add vhost-blk support

2012-07-18 Thread Stefan Hajnoczi
On Wed, Jul 18, 2012 at 9:12 AM, Asias He as...@redhat.com wrote:
 On 07/17/2012 07:11 PM, Stefan Hajnoczi wrote:

 On Tue, Jul 17, 2012 at 10:21 AM, Asias He as...@redhat.com wrote:

 On 07/17/2012 04:52 PM, Paolo Bonzini wrote:


 Il 17/07/2012 10:29, Asias He ha scritto:


 So, vhost-blk at least saves ~6 syscalls for us in each request.



 Are they really 6?  If I/O is coalesced by a factor of 3, for example
 (i.e. each exit processes 3 requests), it's really 2 syscalls per
 request.



 Well. I am counting the number of syscalls in one notify and response
 process. Sure the IO can be coalesced.


 Linux AIO also supports batching in io_submit() and io_getevents().
 Depending on the request pattern in the vring when you process it, you
 should be able to do better than 1 set of syscalls per host I/O
 request.

 Are you taking advantage of that at the moment in your userspace
 benchmark?


 OK. I know about batching in io_submit() and io_getevents(). There was a
 patch for kvm tool a long time ago. Now, neither vhost-blk nor kvm tool is
 taking advantage of that atm. There are issues: e.g. how many requests
 do we want to batch? Does this batching hurt latency?

I didn't mean introducing a delay so that multiple requests can be batched.

I was just thinking of the simple case: when there are a lot of
parallel requests the chance increases that a single vring interrupt
provides several I/O requests.  In that case it's easy for the
virtio-blk implementation to issue them all in one io_submit(2) call.
The same is true for io_getevents(2), there might be several completed
host I/O requests.

The reason I mentioned this was because the actual syscall pattern per
request might not require 1 io_submit(2)/io_getevents(2) if you are
processing a lot of requests in parallel.  The only way to know why
kvmtool is slower is by profiling...

Stefan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] qemu kvm: Recognize PCID feature

2012-07-18 Thread Mao, Junjie
Hi, Avi

Any comments on this patch? :)

 -Original Message-
 From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On
 Behalf Of Mao, Junjie
 Sent: Friday, July 13, 2012 12:58 PM
 To: 'kvm@vger.kernel.org'
 Subject: [PATCH] qemu kvm: Recognize PCID feature
 
 This patch makes Qemu recognize the PCID feature specified from
 configuration or command line options.
 
 Signed-off-by: Junjie Mao junjie@intel.com
 ---
  target-i386/cpu.c |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)
 
 diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 5521709..efc6ece
 100644
 --- a/target-i386/cpu.c
 +++ b/target-i386/cpu.c
 @@ -50,7 +50,7 @@ static const char *ext_feature_name[] = {
  ds_cpl, vmx, smx, est,
  tm2, ssse3, cid, NULL,
  fma, cx16, xtpr, pdcm,
 -NULL, NULL, dca, sse4.1|sse4_1,
 +NULL, pcid, dca, sse4.1|sse4_1,
  sse4.2|sse4_2, x2apic, movbe, popcnt,
  tsc-deadline, aes, xsave, osxsave,
  avx, NULL, NULL, hypervisor,
 --
 1.7.1
 --
 To unsubscribe from this list: send the line unsubscribe kvm in the body of 
 a
 message to majord...@vger.kernel.org More majordomo info at
 http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/5] Add vhost-blk support

2012-07-18 Thread Asias He

On 07/17/2012 09:02 PM, Paolo Bonzini wrote:

Il 17/07/2012 14:48, Michael S. Tsirkin ha scritto:

On Tue, Jul 17, 2012 at 01:03:39PM +0100, Stefan Hajnoczi wrote:

On Tue, Jul 17, 2012 at 12:54 PM, Michael S. Tsirkin m...@redhat.com wrote:

Knowing the answer to that is important before anyone can say whether
this approach is good or not.

Stefan


Why is it?


Because there might be a fix to kvmtool which closes the gap.  It
would be embarrassing if vhost-blk was pushed just because no one
looked into what is actually going on.


Embarrassing to whom? Is someone working on an optimization that
makes the work in question redundant, with posting just around
the corner? Then maybe the thing to do is just wait a bit.


Of course there is work going on to make QEMU perform better.  Not sure
about lkvm.


Of course for lkvm also.


And on the flipside, hard evidence of an overhead that cannot be
resolved could be good reason to do more vhost devices in the future.


How can one have hard evidence of an overhead that cannot be resolved?


Since we do have two completely independent userspaces (lkvm and
data-plane QEMU), you can build up some compelling evidence of an
overhead that cannot be resolved in user space.


This does not build the hard evidence either. How can one prove that 
userspace lkvm and data-plane QEMU can not be improved further? The same 
for vhost-blk.



Either way, it's useful to do this before going further.


I think each work should be discussed on its own merits.  Maybe
vhost-blk is just well written. So? What is your conclusion?


If it's just that vhost-blk is written well, my conclusion is that lkvm
people should look into improving their virtio-blk userspace.  We take
hints from each other all the time, for example virtio-scsi will have
unlocked kick in 3.6.

Why can't vhost-* just get into staging, and we call it a day?


OK. I'm fine with staging.

--
Asias


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] x86, hyper: fix build with !CONFIG_KVM_GUEST

2012-07-18 Thread Avi Kivity
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kernel/cpu/hypervisor.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 6d6dd7a..a8f8fa9 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -37,7 +37,9 @@
 #endif
x86_hyper_vmware,
x86_hyper_ms_hyperv,
+#ifdef CONFIG_KVM_GUEST
x86_hyper_kvm,
+#endif
 };
 
 const struct hypervisor_x86 *x86_hyper;
-- 
1.7.11.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/7 v6] introduce a new qom device to deal with panicked event

2012-07-18 Thread Jan Kiszka
On 2012-07-18 03:54, Wen Congyang wrote:
 At 07/06/2012 07:05 PM, Jan Kiszka Wrote:
 On 2012-07-06 11:41, Wen Congyang wrote:
 If the target is x86/x86_64, the guest's kernel will write 0x01 to the
 port KVM_PV_PORT when it is panicked. This patch introduces a new qom
 device kvm_pv_ioport to listen on this I/O port, and deal with panicked
 event according to panicked_action's value. The possible actions are:
 1. emit QEVENT_GUEST_PANICKED only
 2. emit QEVENT_GUEST_PANICKED and pause the guest
 3. emit QEVENT_GUEST_PANICKED and poweroff the guest
 4. emit QEVENT_GUEST_PANICKED and reset the guest

 I/O ports do not work for some targets (for example: s390). You
 can implement another qom device, and include its code in pv_event.c
 for such a target.

 Note: if we emit QEVENT_GUEST_PANICKED only, and the management
 application does not receive this event(the management may not
 run when the event is emitted), the management won't know the
 guest is panicked.

 Signed-off-by: Wen Congyang we...@cn.fujitsu.com
 ---
  hw/kvm/Makefile.objs |2 +-
  hw/kvm/pv_event.c|   73 +++
  hw/kvm/pv_ioport.c   |  133 
 ++
  kvm-stub.c   |9 +++
  kvm.h|3 +
  vl.c |4 ++
  6 files changed, 223 insertions(+), 1 deletions(-)
  create mode 100644 hw/kvm/pv_event.c
  create mode 100644 hw/kvm/pv_ioport.c

 diff --git a/hw/kvm/Makefile.objs b/hw/kvm/Makefile.objs
 index 226497a..23e3b30 100644
 --- a/hw/kvm/Makefile.objs
 +++ b/hw/kvm/Makefile.objs
 @@ -1 +1 @@
 -obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o
 +obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o pv_event.o
 diff --git a/hw/kvm/pv_event.c b/hw/kvm/pv_event.c
 new file mode 100644
 index 000..d7ded37
 --- /dev/null
 +++ b/hw/kvm/pv_event.c
 @@ -0,0 +1,73 @@
 +/*
 + * QEMU KVM support, paravirtual event device
 + *
 + * Copyright Fujitsu, Corp. 2012
 + *
 + * Authors:
 + * Wen Congyang we...@cn.fujitsu.com
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or 
 later.
 + * See the COPYING file in the top-level directory.
 + *
 + */
 +
 +#include linux/kvm_para.h
 +#include asm/kvm_para.h
 +#include qobject.h
 +#include qjson.h
 +#include monitor.h
 +#include sysemu.h
 +#include kvm.h
 +
 +/* Possible values for action parameter. */
 +#define PANICKED_REPORT 1   /* emit QEVENT_GUEST_PANICKED only */
 +#define PANICKED_PAUSE  2   /* emit QEVENT_GUEST_PANICKED and pause VM 
 */
 +#define PANICKED_POWEROFF   3   /* emit QEVENT_GUEST_PANICKED and quit VM 
 */
 +#define PANICKED_RESET  4   /* emit QEVENT_GUEST_PANICKED and reset VM 
 */
 +
 +static int panicked_action = PANICKED_REPORT;

 Avoid global variables please when there are device states. This one is
 unneeded anyway (and will generate warnings when build without KVM_PV_PORT).
 
 Hmm, do you mean introduce another qom device to store event action?

I think you should be fine with one device per bus binding, but those
will consist of a common event layer and just different I/O layers (for
bus registration and access).

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: vga passthrough // questions about pci passthrough

2012-07-18 Thread Jan Kiszka
On 2012-07-18 07:45, Martin Wolf wrote:
 Hello,
 
 i was able to passthrough an AMD 7870 videocard to my win7 guest machine.

Would you add it to http://www.linux-kvm.org/page/VGA_device_assignment?

 my host is ubuntu 12.04 with stock kernel.
 my system contains:
 dq67sw q67 mainboard
 i5-2400s cpu
 sapphire 7870 amd videocard
 xonar d2x (problems to passthrough)
 
 for full functionality i just needed two options
 
 - kernel : iommu=on
 - kvm module: ignore_msrs=1
 (if i would not set it the guest os would crash with a bluescreen)

Can you report (= kernel log) which MSRs are unknown to KVM?

 
 the unigine benchmark ran flawlessly
 also the benchmark included in windows gave my videocard
 similar values (7.7) comparable with my native win7 (7.9)
 
 
 now to my questions...
 1. is it possible to reset the videocard properly to be able to
 reboot the vm?

Which versions of kernel and qemu-kvm are involved via your distro? Can
you retry with latest Linux (3.5-rcX) / latest qemu-kvm? Maybe
something got fixed meanwhile.

In general, there are many adapters that require special procedures to
perform resets. This one may fall into that category as well.

 
 2.the xonar d2x is a very nice audio card, it would be very handy
 to be able to use it in the vm. in my opinion the card is a
 d2 with a pci-e to pci bridge.
 i tried to passthrough the card alone and with the pci-bridge
 that was shown though lspci, but i had no success.
 maybe you guys here have an idea on that topic?

Any further details about the error? Does the adapter work with a Linux
guest or provide more information that way?

Jan


-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/7 v6] introduce a new qom device to deal with panicked event

2012-07-18 Thread Jan Kiszka
On 2012-07-18 11:19, Jan Kiszka wrote:
 On 2012-07-18 03:54, Wen Congyang wrote:
 At 07/06/2012 07:05 PM, Jan Kiszka Wrote:
 On 2012-07-06 11:41, Wen Congyang wrote:
 If the target is x86/x86_64, the guest's kernel will write 0x01 to the
 port KVM_PV_PORT when it is panicked. This patch introduces a new qom
 device kvm_pv_ioport to listen on this I/O port, and deal with panicked
 event according to panicked_action's value. The possible actions are:
 1. emit QEVENT_GUEST_PANICKED only
 2. emit QEVENT_GUEST_PANICKED and pause the guest
 3. emit QEVENT_GUEST_PANICKED and poweroff the guest
 4. emit QEVENT_GUEST_PANICKED and reset the guest

 I/O ports do not work for some targets (for example: s390). You
 can implement another qom device, and include its code in pv_event.c
 for such a target.

 Note: if we emit QEVENT_GUEST_PANICKED only, and the management
 application does not receive this event(the management may not
 run when the event is emitted), the management won't know the
 guest is panicked.

 Signed-off-by: Wen Congyang we...@cn.fujitsu.com
 ---
  hw/kvm/Makefile.objs |2 +-
  hw/kvm/pv_event.c|   73 +++
  hw/kvm/pv_ioport.c   |  133 
 ++
  kvm-stub.c   |9 +++
  kvm.h|3 +
  vl.c |4 ++
  6 files changed, 223 insertions(+), 1 deletions(-)
  create mode 100644 hw/kvm/pv_event.c
  create mode 100644 hw/kvm/pv_ioport.c

 diff --git a/hw/kvm/Makefile.objs b/hw/kvm/Makefile.objs
 index 226497a..23e3b30 100644
 --- a/hw/kvm/Makefile.objs
 +++ b/hw/kvm/Makefile.objs
 @@ -1 +1 @@
 -obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o
 +obj-$(CONFIG_KVM) += clock.o apic.o i8259.o ioapic.o i8254.o pv_event.o
 diff --git a/hw/kvm/pv_event.c b/hw/kvm/pv_event.c
 new file mode 100644
 index 000..d7ded37
 --- /dev/null
 +++ b/hw/kvm/pv_event.c
 @@ -0,0 +1,73 @@
 +/*
 + * QEMU KVM support, paravirtual event device
 + *
 + * Copyright Fujitsu, Corp. 2012
 + *
 + * Authors:
 + * Wen Congyang we...@cn.fujitsu.com
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2 or 
 later.
 + * See the COPYING file in the top-level directory.
 + *
 + */
 +
 +#include linux/kvm_para.h
 +#include asm/kvm_para.h
 +#include qobject.h
 +#include qjson.h
 +#include monitor.h
 +#include sysemu.h
 +#include kvm.h
 +
 +/* Possible values for action parameter. */
 +#define PANICKED_REPORT 1   /* emit QEVENT_GUEST_PANICKED only */
 +#define PANICKED_PAUSE  2   /* emit QEVENT_GUEST_PANICKED and pause 
 VM */
 +#define PANICKED_POWEROFF   3   /* emit QEVENT_GUEST_PANICKED and quit VM 
 */
 +#define PANICKED_RESET  4   /* emit QEVENT_GUEST_PANICKED and reset 
 VM */
 +
 +static int panicked_action = PANICKED_REPORT;

 Avoid global variables please when there are device states. This one is
 unneeded anyway (and will generate warnings when build without KVM_PV_PORT).

 Hmm, do you mean introduce another qom device to store event action?
 
 I think you should be fine with one device per bus binding, but those
 will consist of a common event layer and just different I/O layers (for
 bus registration and access).

To make this clearer: the I/O layer should embed a common state
structure of the event layer in its device state so that the event layer
can keep things like the action mode there.

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] qemu kvm: Recognize PCID feature

2012-07-18 Thread Jan Kiszka
On 2012-07-18 10:44, Mao, Junjie wrote:
 Hi, Avi
 
 Any comments on this patch? :)

Always include qemu-devel when you are changing QEMU; qemu-kvm is just
staging for the latter. This patch can actually go into upstream
directly, maybe even via qemu-trivial as it just makes that flag selectable.

Jan

 
 -Original Message-
 From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On
 Behalf Of Mao, Junjie
 Sent: Friday, July 13, 2012 12:58 PM
 To: 'kvm@vger.kernel.org'
 Subject: [PATCH] qemu kvm: Recognize PCID feature

 This patch makes Qemu recognize the PCID feature specified from
 configuration or command line options.

 Signed-off-by: Junjie Mao junjie@intel.com
 ---
  target-i386/cpu.c |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)

 diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 5521709..efc6ece
 100644
 --- a/target-i386/cpu.c
 +++ b/target-i386/cpu.c
 @@ -50,7 +50,7 @@ static const char *ext_feature_name[] = {
  ds_cpl, vmx, smx, est,
  tm2, ssse3, cid, NULL,
  fma, cx16, xtpr, pdcm,
 -NULL, NULL, dca, sse4.1|sse4_1,
 +NULL, pcid, dca, sse4.1|sse4_1,
  sse4.2|sse4_2, x2apic, movbe, popcnt,
  tsc-deadline, aes, xsave, osxsave,
  avx, NULL, NULL, hypervisor,
 --
 1.7.1
 --
 To unsubscribe from this list: send the line unsubscribe kvm in the body 
 of a
 message to majord...@vger.kernel.org More majordomo info at
 http://vger.kernel.org/majordomo-info.html
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2 v3] KVM: PPC: booke: Add watchdog emulation

2012-07-18 Thread Bharat Bhushan
This patch adds watchdog emulation to KVM. The watchdog
emulation is enabled by the KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
A kernel timer is used for watchdog emulation and emulates the
h/w watchdog state machine. On watchdog timer expiry, it exits to QEMU
if TCR.WRC is non-zero. QEMU can reset/shutdown etc. depending upon how
it is configured.

Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
---
v3:
 - Using KVM_REQ_WATCHDOG for userspace exit.
 - TSR changes left for vcpu thread.
 - Other review comments on v2

 arch/powerpc/include/asm/kvm_host.h  |3 +
 arch/powerpc/include/asm/kvm_ppc.h   |3 +
 arch/powerpc/include/asm/reg_booke.h |7 ++
 arch/powerpc/kvm/booke.c |  140 ++
 arch/powerpc/kvm/booke_emulate.c |8 ++
 arch/powerpc/kvm/powerpc.c   |   27 ++-
 include/linux/kvm.h  |2 +
 include/linux/kvm_host.h |1 +
 8 files changed, 187 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 50ea12f..01047f4 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -467,6 +467,8 @@ struct kvm_vcpu_arch {
ulong fault_esr;
ulong queued_dear;
ulong queued_esr;
+   spinlock_t wdt_lock;
+   struct timer_list wdt_timer;
u32 tlbcfg[4];
u32 mmucfg;
u32 epr;
@@ -482,6 +484,7 @@ struct kvm_vcpu_arch {
u8 osi_needed;
u8 osi_enabled;
u8 papr_enabled;
+   u8 watchdog_enable;
u8 sane;
u8 cpu_type;
u8 hcall_needed;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 0124937..e5cf4b9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, struct 
kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(unsigned long data);
+extern void kvmppc_watchdog_func(unsigned long data);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 
 /* Core-specific hooks */
@@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu 
*vcpu,
struct kvm_interrupt *irq);
 extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
  struct kvm_interrupt *irq);
+extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
   unsigned int op, int *advance);
diff --git a/arch/powerpc/include/asm/reg_booke.h 
b/arch/powerpc/include/asm/reg_booke.h
index 2d916c4..e07e6af 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -539,6 +539,13 @@
 #define TCR_FIE0x0080  /* FIT Interrupt Enable */
 #define TCR_ARE0x0040  /* Auto Reload Enable */
 
+#ifdef CONFIG_E500
+#define TCR_GET_WP(tcr)  tcr)  0xC000)  30) | \
+ (((tcr)  0x1E)  15))
+#else
+#define TCR_GET_WP(tcr)  (((tcr)  0xC000)  30)
+#endif
+
 /* Bit definitions for the TSR. */
 #define TSR_ENW0x8000  /* Enable Next Watchdog */
 #define TSR_WIS0x4000  /* WDT Interrupt Status */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index d25a097..9682506 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, vcpu-arch.pending_exceptions);
 }
 
+void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
+{
+   kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
+}
+
+void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
+{
+   clear_bit(BOOKE_IRQPRIO_WATCHDOG, vcpu-arch.pending_exceptions);
+}
+
 static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
 {
 #ifdef CONFIG_KVM_BOOKE_HV
@@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu 
*vcpu,
msr_mask = MSR_CE | MSR_ME | MSR_DE;
int_class = INT_CLASS_NONCRIT;
break;
+   case BOOKE_IRQPRIO_WATCHDOG:
case BOOKE_IRQPRIO_CRITICAL:
case BOOKE_IRQPRIO_DBELL_CRIT:
allowed = vcpu-arch.shared-msr  MSR_CE;
@@ -404,12 +415,112 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu 
*vcpu,
return allowed;
 }
 
+/*
+ * Return the number of jiffies until the next timeout.  If the timeout is
+ * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA
+ * because the larger value can break the timer APIs.
+ */

[PATCH 2/2 v4] KVM: PPC: booke: Add watchdog emulation

2012-07-18 Thread Bharat Bhushan
This patch adds watchdog emulation to KVM. The watchdog
emulation is enabled by the KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
A kernel timer is used for watchdog emulation and emulates the
h/w watchdog state machine. On watchdog timer expiry, it exits to QEMU
if TCR.WRC is non-zero. QEMU can reset/shutdown etc. depending upon how
it is configured.

Signed-off-by: Liu Yu yu@freescale.com
Signed-off-by: Scott Wood scottw...@freescale.com
Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com
---
v4:
 - in v3 i forgot to add Scott Wood and Liu Yu signoff

v3:
 - Using KVM_REQ_WATCHDOG for userspace exit.
 - TSR changes left for vcpu thread.
 - Other review comments on v2

 arch/powerpc/include/asm/kvm_host.h  |3 +
 arch/powerpc/include/asm/kvm_ppc.h   |3 +
 arch/powerpc/include/asm/reg_booke.h |7 ++
 arch/powerpc/kvm/booke.c |  140 ++
 arch/powerpc/kvm/booke_emulate.c |8 ++
 arch/powerpc/kvm/powerpc.c   |   27 ++-
 include/linux/kvm.h  |2 +
 include/linux/kvm_host.h |1 +
 8 files changed, 187 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 50ea12f..01047f4 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -467,6 +467,8 @@ struct kvm_vcpu_arch {
ulong fault_esr;
ulong queued_dear;
ulong queued_esr;
+   spinlock_t wdt_lock;
+   struct timer_list wdt_timer;
u32 tlbcfg[4];
u32 mmucfg;
u32 epr;
@@ -482,6 +484,7 @@ struct kvm_vcpu_arch {
u8 osi_needed;
u8 osi_enabled;
u8 papr_enabled;
+   u8 watchdog_enable;
u8 sane;
u8 cpu_type;
u8 hcall_needed;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 0124937..e5cf4b9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, struct 
kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
 extern void kvmppc_decrementer_func(unsigned long data);
+extern void kvmppc_watchdog_func(unsigned long data);
 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
 
 /* Core-specific hooks */
@@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu 
*vcpu,
struct kvm_interrupt *irq);
 extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
  struct kvm_interrupt *irq);
+extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
   unsigned int op, int *advance);
diff --git a/arch/powerpc/include/asm/reg_booke.h 
b/arch/powerpc/include/asm/reg_booke.h
index 2d916c4..e07e6af 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -539,6 +539,13 @@
 #define TCR_FIE0x0080  /* FIT Interrupt Enable */
 #define TCR_ARE0x0040  /* Auto Reload Enable */
 
+#ifdef CONFIG_E500
+#define TCR_GET_WP(tcr)  tcr)  0xC000)  30) | \
+ (((tcr)  0x1E)  15))
+#else
+#define TCR_GET_WP(tcr)  (((tcr)  0xC000)  30)
+#endif
+
 /* Bit definitions for the TSR. */
 #define TSR_ENW0x8000  /* Enable Next Watchdog */
 #define TSR_WIS0x4000  /* WDT Interrupt Status */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index d25a097..9682506 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, vcpu-arch.pending_exceptions);
 }
 
+void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
+{
+   kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
+}
+
+void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
+{
+   clear_bit(BOOKE_IRQPRIO_WATCHDOG, vcpu-arch.pending_exceptions);
+}
+
 static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
 {
 #ifdef CONFIG_KVM_BOOKE_HV
@@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu 
*vcpu,
msr_mask = MSR_CE | MSR_ME | MSR_DE;
int_class = INT_CLASS_NONCRIT;
break;
+   case BOOKE_IRQPRIO_WATCHDOG:
case BOOKE_IRQPRIO_CRITICAL:
case BOOKE_IRQPRIO_DBELL_CRIT:
allowed = vcpu-arch.shared-msr  MSR_CE;
@@ -404,12 +415,112 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu 
*vcpu,
return allowed;
 }
 
+/*
+ * Return the number of jiffies until the next timeout.  If the 

Re: [PATCH 0/5] Add vhost-blk support

2012-07-18 Thread Ronen Hod

On 07/17/2012 12:21 PM, Asias He wrote:

On 07/17/2012 04:52 PM, Paolo Bonzini wrote:

Il 17/07/2012 10:29, Asias He ha scritto:

So, vhost-blk at least saves ~6 syscalls for us in each request.


Are they really 6?  If I/O is coalesced by a factor of 3, for example
(i.e. each exit processes 3 requests), it's really 2 syscalls per request.


Well. I am counting the number of syscalls in one notify and response process. 
Sure the IO can be coalesced.


Note that Asias is using very fast disks (FusionIO & Ramdisk).
- This might affect the level of coalescing both ways, depending on the 
scenario and algorithm.
- This also means that the 5%-15% gain will probably be lower in real life.
Ronen.




Also, is there anything we can improve? Perhaps we can modify epoll and
ask it to clear the eventfd for us (would save 2 reads)?  Or
io_getevents (would save 1)?


I guess you mean qemu here. Yes, in theory, qemu's block layer can be
improved to achieve similar performance as vhost-blk or kvm tool's
userspace virtio-blk has. But I think it makes no sense to prevent one
solution because there is another in theory solution called: we can do
similar in qemu.


It depends.  Like vhost-scsi, vhost-blk has the problem of a crippled
feature set: no support for block device formats, non-raw protocols,
etc.  This makes it different from vhost-net.


Data-plane qemu also has this crippled feature set problem, no? Does user 
always choose to use block devices format like qcow2? What if they prefer raw 
image or raw block device?



So it begs the question, is it going to be used in production, or just a
useful reference tool?


This should be decided by user, I can not speak for them. What is wrong with 
adding one option for user which they can decide?




--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 3/4] kvm: Create kvm_clear_irq()

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
 On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
   _Seems_ racy, or _is_ racy?  Please identify the race.
  
  Look at this:
  
  static inline int kvm_irq_line_state(unsigned long *irq_state,
   int irq_source_id, int level)
  {
  /* Logical OR for level trig interrupt */
  if (level)
  set_bit(irq_source_id, irq_state);
  else
  clear_bit(irq_source_id, irq_state);
  
  return !!(*irq_state);
  }
  
  
  Now:
  If other CPU changes some other bit after the atomic change,
  it looks like !!(*irq_state) might return a stale value.
  
  CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
  If CPU 0 sees a stale value now it will return 0 here
  and interrupt will get cleared.
  
 This will hardly happen on x86 especially since bit is set with
 serialized instruction.

Probably. But it does make me a bit uneasy.  Why don't we pass
irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
__set_bit/__clear_bit in kvm_irq_line_state, making the ordering simpler
and saving an atomic op in the process.

 But there is actually a race here.
 CPU 0 clears bit 0. CPU 0 read irq_state as 0. CPU 1 sets level to 1.
 CPU 1 calls kvm_ioapic_set_irq(1). CPU 0 calls kvm_ioapic_set_irq(0).
 Now ioapic thinks the level is 0 but irq_state is not 0.
 
 This untested and un-compiled patch should fix it.
 
 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
 index ef91d79..e22c78b 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -825,7 +825,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct 
 kvm_mmu *mmu,
  void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
  bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
  
 -int kvm_pic_set_irq(void *opaque, int irq, int level);
 +int kvm_pic_set_irq(void *opaque, int irq);
  
  void kvm_inject_nmi(struct kvm_vcpu *vcpu);
  
 diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
 index 81cf4fa..0d6988f 100644
 --- a/arch/x86/kvm/i8259.c
 +++ b/arch/x86/kvm/i8259.c
 @@ -188,12 +188,13 @@ void kvm_pic_update_irq(struct kvm_pic *s)
   pic_unlock(s);
  }
  
 -int kvm_pic_set_irq(void *opaque, int irq, int level)
 +int kvm_pic_set_irq(void *opaque, int irq)
  {
   struct kvm_pic *s = opaque;
 - int ret = -1;
 + int ret = -1, level;
  
   pic_lock(s);
 + level = !!s-irq_states[irq];
   if (irq = 0  irq  PIC_NUM_PINS) {
   ret = pic_set_irq1(s-pics[irq  3], irq  7, level);
   pic_update_irq(s);
 diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
 index 26fd54d..6ad6a6b 100644
 --- a/virt/kvm/ioapic.c
 +++ b/virt/kvm/ioapic.c
 @@ -191,14 +191,15 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, 
 int irq)
   return kvm_irq_delivery_to_apic(ioapic-kvm, NULL, irqe);
  }
  
 -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq)
  {
   u32 old_irr;
   u32 mask = 1  irq;
   union kvm_ioapic_redirect_entry entry;
 - int ret = 1;
 + int ret = 1, level;
  
   spin_lock(ioapic-lock);
 + level = !!ioapic-irq_states[irq];
   old_irr = ioapic-irr;
   if (irq = 0  irq  IOAPIC_NUM_PINS) {
   entry = ioapic-redirtbl[irq];
 diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
 index 32872a0..65894dd 100644
 --- a/virt/kvm/ioapic.h
 +++ b/virt/kvm/ioapic.h
 @@ -74,7 +74,7 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int 
 trigger_mode);
  bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
  int kvm_ioapic_init(struct kvm *kvm);
  void kvm_ioapic_destroy(struct kvm *kvm);
 -int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 +int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq);
  void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
  int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
   struct kvm_lapic_irq *irq);
 diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
 index a6a0365..db0ccef 100644
 --- a/virt/kvm/irq_comm.c
 +++ b/virt/kvm/irq_comm.c
 @@ -33,7 +33,7 @@
  
  #include ioapic.h
  
 -static inline int kvm_irq_line_state(unsigned long *irq_state,
 +static inline void kvm_irq_line_state(unsigned long *irq_state,
int irq_source_id, int level)
  {
   /* Logical OR for level trig interrupt */
 @@ -41,8 +41,6 @@ static inline int kvm_irq_line_state(unsigned long 
 *irq_state,
   set_bit(irq_source_id, irq_state);
   else
   clear_bit(irq_source_id, irq_state);
 -
 - return !!(*irq_state);
  }
  
  static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 @@ -50,9 +48,9 @@ static int 

Re: [PATCH v5 3/4] kvm: Create kvm_clear_irq()

2012-07-18 Thread Gleb Natapov
On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
  On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
_Seems_ racy, or _is_ racy?  Please identify the race.
   
   Look at this:
   
   static inline int kvm_irq_line_state(unsigned long *irq_state,
int irq_source_id, int level)
   {
   /* Logical OR for level trig interrupt */
   if (level)
   set_bit(irq_source_id, irq_state);
   else
   clear_bit(irq_source_id, irq_state);
   
   return !!(*irq_state);
   }
   
   
   Now:
   If other CPU changes some other bit after the atomic change,
   it looks like !!(*irq_state) might return a stale value.
   
   CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
   If CPU 0 sees a stale value now it will return 0 here
   and interrupt will get cleared.
   
  This will hardly happen on x86 especially since bit is set with
  serialized instruction.
 
 Probably. But it does make me a bit uneasy.  Why don't we pass
 irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
 kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
 __set_bit/__clear_bit in kvm_irq_line_state, making the ordering simpler
 and saving an atomic op in the process.
 
With my patch I do not see why we can't change them to unlocked variant
without moving them anywhere. The only requirement is to not use RMW
sequence to set/clear bits. The ordering of setting does not matter. The
ordering of reading does.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 2/4] kvm: KVM_EOIFD, an eventfd for EOIs

2012-07-18 Thread Michael S. Tsirkin
On Tue, Jul 17, 2012 at 08:44:04PM -0600, Alex Williamson wrote:
 On Wed, 2012-07-18 at 01:24 +0300, Michael S. Tsirkin wrote:
  On Tue, Jul 17, 2012 at 04:09:25PM -0600, Alex Williamson wrote:
   On Wed, 2012-07-18 at 00:23 +0300, Michael S. Tsirkin wrote:
On Tue, Jul 17, 2012 at 02:03:05PM -0600, Alex Williamson wrote:
 On Tue, 2012-07-17 at 21:58 +0300, Michael S. Tsirkin wrote:
  On Tue, Jul 17, 2012 at 10:52:16AM -0600, Alex Williamson wrote:
   On Tue, 2012-07-17 at 19:19 +0300, Michael S. Tsirkin wrote:
On Tue, Jul 17, 2012 at 10:06:01AM -0600, Alex Williamson wrote:
 On Tue, 2012-07-17 at 18:53 +0300, Michael S. Tsirkin wrote:
  On Tue, Jul 17, 2012 at 09:41:09AM -0600, Alex Williamson 
  wrote:
   On Tue, 2012-07-17 at 18:13 +0300, Michael S. Tsirkin 
   wrote:
On Tue, Jul 17, 2012 at 08:57:04AM -0600, Alex 
Williamson wrote:
 On Tue, 2012-07-17 at 17:42 +0300, Michael S. Tsirkin 
 wrote:
  On Tue, Jul 17, 2012 at 08:29:43AM -0600, Alex 
  Williamson wrote:
   On Tue, 2012-07-17 at 17:10 +0300, Michael S. 
   Tsirkin wrote:
On Tue, Jul 17, 2012 at 07:59:16AM -0600, Alex 
Williamson wrote:
 On Tue, 2012-07-17 at 13:21 +0300, Michael S. 
 Tsirkin wrote:
  On Mon, Jul 16, 2012 at 02:33:55PM -0600, 
  Alex Williamson wrote:
   + if (args-flags  
   KVM_EOIFD_FLAG_LEVEL_IRQFD) {
   + struct _irqfd *irqfd = 
   _irqfd_fdget_lock(kvm, args-irqfd);
   + if (IS_ERR(irqfd)) {
   + ret = PTR_ERR(irqfd);
   + goto fail;
   + }
   +
   + gsi = irqfd-gsi;
   + level_irqfd = 
   eventfd_ctx_get(irqfd-eventfd);
   + source = 
   _irq_source_get(irqfd-source);
   + _irqfd_put_unlock(irqfd);
   + if (!source) {
   + ret = -EINVAL;
   + goto fail;
   + }
   + } else {
   + ret = -EINVAL;
   + goto fail;
   + }
   +
   + INIT_LIST_HEAD(eoifd-list);
   + eoifd-kvm = kvm;
   + eoifd-eventfd = eventfd;
   + eoifd-source = source;
   + eoifd-level_irqfd = level_irqfd;
   + eoifd-notifier.gsi = gsi;
   + eoifd-notifier.irq_acked = eoifd_event;
  
  OK so this means eoifd keeps a reference to 
  the irqfd.
  And since this is the case, can't we drop 
  the reference counting
  around source ids now? Everything is 
  referenced through irqfd.
 
 Holding a reference and using it as a 
 reference count are not the same
 thing.  What if another module holds a 
 reference to this eventfd?  How
 do we do anything on release?

We don't as there is no release, and using kref 
on source id does not
help: it just never gets invoked.
   
   Please work out how you think it should work and 
   let me know, I don't
   see it.  We have an irq source id that needs to 
   be allocated by irqfd
   and returned when it's unused.  It becomes unused 
   when neither irqfd nor
   eoifd are making use of it.  irqfd and eoifd may 
   be closed in any order.
   Use of the source id is what we're reference 
   counting, which is why it's
   in struct _irq_source.  How can I use an eventfd 
   reference for the same?
   I don't know when it's unused.  I don't know who 
   else holds a reference
   to it...  Doesn't make sense to me.  Feels like 
   attempting to squat on
   someone else's object.
   
   
  
  eoifd should prevent irqfd from being released.
 
 Why?  Note that this is actually quite difficult too. 
  We can't fail a
 release, nobody checks close(3p) return.  Blocking a 
 release is likely
 to cause all sorts of problems, so what you mean is 
 that irqfd should
 linger around until there are no references to it... 
 

Re: [PATCH v5 3/4] kvm: Create kvm_clear_irq()

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 01:27:39PM +0300, Gleb Natapov wrote:
 On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
  On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
   On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
 _Seems_ racy, or _is_ racy?  Please identify the race.

Look at this:

static inline int kvm_irq_line_state(unsigned long *irq_state,
 int irq_source_id, int level)
{
/* Logical OR for level trig interrupt */
if (level)
set_bit(irq_source_id, irq_state);
else
clear_bit(irq_source_id, irq_state);

return !!(*irq_state);
}


Now:
If other CPU changes some other bit after the atomic change,
it looks like !!(*irq_state) might return a stale value.

CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
If CPU 0 sees a stale value now it will return 0 here
and interrupt will get cleared.

   This will hardly happen on x86 especially since bit is set with
   serialized instruction.
  
  Probably. But it does make me a bit uneasy.  Why don't we pass
  irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
  kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
  __set_bit/__clear_bit in kvm_irq_line_state, making the ordering simpler
  and saving an atomic op in the process.
  
 With my patch I do not see why we can't change them to unlocked variant
 without moving them anywhere. The only requirement is to not use RMW
 sequence to set/clear bits. The ordering of setting does not matter. The
 ordering of reading is.

You want to use __set_bit/__clear_bit on the same word
from multiple CPUs, without locking?
Why won't this lose information?

In any case, it seems simpler and safer to do accesses under lock
than rely on specific use.

 --
   Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 3/4] kvm: Create kvm_clear_irq()

2012-07-18 Thread Gleb Natapov
On Wed, Jul 18, 2012 at 01:33:35PM +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 18, 2012 at 01:27:39PM +0300, Gleb Natapov wrote:
  On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
   On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
  _Seems_ racy, or _is_ racy?  Please identify the race.
 
 Look at this:
 
 static inline int kvm_irq_line_state(unsigned long *irq_state,
  int irq_source_id, int level)
 {
 /* Logical OR for level trig interrupt */
 if (level)
 set_bit(irq_source_id, irq_state);
 else
 clear_bit(irq_source_id, irq_state);
 
 return !!(*irq_state);
 }
 
 
 Now:
 If other CPU changes some other bit after the atomic change,
 it looks like !!(*irq_state) might return a stale value.
 
 CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
 If CPU 0 sees a stale value now it will return 0 here
 and interrupt will get cleared.
 
This will hardly happen on x86 especially since bit is set with
serialized instruction.
   
   Probably. But it does make me a bit uneasy.  Why don't we pass
   irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
   kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
   __set_bit/__clear_bit in kvm_irq_line_state, making the ordering simpler
   and saving an atomic op in the process.
   
  With my patch I do not see why we can't change them to unlocked variant
  without moving them anywhere. The only requirement is to not use RMW
  sequence to set/clear bits. The ordering of setting does not matter. The
  ordering of reading is.
 
 You want to use __set_bit/__clear_bit on the same word
 from multiple CPUs, without locking?
 Why won't this lose information?
Because it is not RMW. If it is then yes, you can't do that.
 
 In any case, it seems simpler and safer to do accesses under lock
 than rely on specific use.
 
  --
  Gleb.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: UIO: missing resource mapping

2012-07-18 Thread Dominic Eschweiler
Am Montag, den 16.07.2012, 23:58 +0200 schrieb Hans J. Koch:
 Try to hack up a patch to add generic BAR mapping to uio_pci_generic.c
 and post it for review.
 

Here we go ...
 
Signed-off-by: Dominic Eschweiler eschwei...@fias.uni-frankfurt.de
diff --git a/drivers/uio/uio_pci_generic.c
b/drivers/uio/uio_pci_generic.c
index 0bd08ef..e25991e 100644
--- a/drivers/uio/uio_pci_generic.c
+++ b/drivers/uio/uio_pci_generic.c
@@ -25,10 +25,12 @@
 #include linux/slab.h
 #include linux/uio_driver.h
 
-#define DRIVER_VERSION 0.01.0
+#define DRIVER_VERSION 0.02.0
 #define DRIVER_AUTHOR  Michael S. Tsirkin m...@redhat.com
 #define DRIVER_DESCGeneric UIO driver for PCI 2.3 devices
 
+#define DRV_NAME uio_pci_generic
+
 struct uio_pci_generic_dev {
struct uio_info info;
struct pci_dev *pdev;
@@ -58,6 +60,7 @@ static int __devinit probe(struct pci_dev *pdev,
 {
struct uio_pci_generic_dev *gdev;
int err;
+   int i;
 
err = pci_enable_device(pdev);
if (err) {
@@ -67,8 +70,7 @@ static int __devinit probe(struct pci_dev *pdev,
}
 
if (!pdev-irq) {
-   dev_warn(pdev-dev, No IRQ assigned to device: 
-no support for interrupts?\n);
+   dev_warn(pdev-dev, No IRQ assigned to device: no support for
interrupts?\n);
pci_disable_device(pdev);
return -ENODEV;
}
@@ -91,10 +93,31 @@ static int __devinit probe(struct pci_dev *pdev,
gdev-info.handler = irqhandler;
gdev-pdev = pdev;
 
+   /* request regions */
+   err = pci_request_regions(pdev, DRV_NAME);
+   if (err) {
+   dev_err(pdev-dev, Couldn't get PCI resources, aborting\n);
+   return err;
+   }
+
+   /* create attributes for BAR mappings */
+   for (i = 0; i  PCI_NUM_RESOURCES; i++) {
+   if (pdev-resource[i].flags 
+   (pdev-resource[i].flags  IORESOURCE_MEM)) {
+   gdev-info.mem[i].addr = pci_resource_start(pdev, i);
+   gdev-info.mem[i].size = pci_resource_len(pdev, i);
+   gdev-info.mem[i].internal_addr = NULL;
+   gdev-info.mem[i].memtype = UIO_MEM_PHYS;
+   }
+   }
+
if (uio_register_device(pdev-dev, gdev-info))
goto err_register;
pci_set_drvdata(pdev, gdev);
 
+   pr_info(UIO_PCI_GENERIC : initialized new device (%x %x)\n,
+   pdev-vendor, pdev-device);
+
return 0;
 err_register:
kfree(gdev);
@@ -107,17 +130,21 @@ err_verify:
 static void remove(struct pci_dev *pdev)
 {
struct uio_pci_generic_dev *gdev = pci_get_drvdata(pdev);
-
uio_unregister_device(gdev-info);
+
+   pci_release_regions(pdev);
pci_disable_device(pdev);
kfree(gdev);
+
+   pr_info(UIO_PCI_GENERIC : removed device (%x %x)\n,
+   pdev-vendor, pdev-device);
 }
 
 static struct pci_driver driver = {
-   .name = uio_pci_generic,
+   .name = DRV_NAME,
.id_table = NULL, /* only dynamic id's */
-   .probe = probe,
-   .remove = remove,
+   .probe= probe,
+   .remove   = remove,
 };
 
 static int __init init(void)

-- 
Gruß
  Dominic

Frankfurt Institute for Advanced Studies (FIAS)
Ruth-Moufang-Straße 1
D-60438 Frankfurt am Main
Germany

Phone:  +49 69 79844114

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Michael S. Tsirkin
On Mon, Jul 16, 2012 at 02:33:47PM -0600, Alex Williamson wrote:
 In order to inject a level interrupt from an external source using an
 irqfd, we need to allocate a new irq_source_id.  This allows us to
 assert and (later) de-assert an interrupt line independently from
 users of KVM_IRQ_LINE and avoid lost interrupts.
 
 We also add what may appear like a bit of excessive infrastructure
 around an object for storing this irq_source_id.  However, notice
 that we only provide a way to assert the interrupt here.  A follow-on
 interface will make use of the same irq_source_id to allow de-assert.
 
 Signed-off-by: Alex Williamson alex.william...@redhat.com
 ---
 
  Documentation/virtual/kvm/api.txt |6 ++
  arch/x86/kvm/x86.c|1 
  include/linux/kvm.h   |3 +
  virt/kvm/eventfd.c|  114 
 -
  4 files changed, 120 insertions(+), 4 deletions(-)
 
 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index 100acde..c7267d5 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -1981,6 +1981,12 @@ the guest using the specified gsi pin.  The irqfd is 
 removed using
  the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd
  and kvm_irqfd.gsi.
  
 +The KVM_IRQFD_FLAG_LEVEL flag indicates the gsi input is for a level
 +triggered interrupt.  In this case a new irqchip input is allocated
 +which is logically OR'd with other inputs allowing multiple sources
 +to independently assert level interrupts.  The KVM_IRQFD_FLAG_LEVEL
 +is only necessary on setup, teardown is identical to that above.
 +KVM_IRQFD_FLAG_LEVEL support is indicated by KVM_CAP_IRQFD_LEVEL.
  
  5. The kvm_run structure
  
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index a01a424..80bed07 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -2148,6 +2148,7 @@ int kvm_dev_ioctl_check_extension(long ext)
   case KVM_CAP_GET_TSC_KHZ:
   case KVM_CAP_PCI_2_3:
   case KVM_CAP_KVMCLOCK_CTRL:
 + case KVM_CAP_IRQFD_LEVEL:
   r = 1;
   break;
   case KVM_CAP_COALESCED_MMIO:
 diff --git a/include/linux/kvm.h b/include/linux/kvm.h
 index 2ce09aa..b2e6e4f 100644
 --- a/include/linux/kvm.h
 +++ b/include/linux/kvm.h
 @@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info {
  #define KVM_CAP_PPC_GET_SMMU_INFO 78
  #define KVM_CAP_S390_COW 79
  #define KVM_CAP_PPC_ALLOC_HTAB 80
 +#define KVM_CAP_IRQFD_LEVEL 81
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
 @@ -683,6 +684,8 @@ struct kvm_xen_hvm_config {
  #endif
  
  #define KVM_IRQFD_FLAG_DEASSIGN (1  0)
 +/* Available with KVM_CAP_IRQFD_LEVEL */
 +#define KVM_IRQFD_FLAG_LEVEL (1  1)
  
  struct kvm_irqfd {
   __u32 fd;
 diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
 index 7d7e2aa..ecdbfea 100644
 --- a/virt/kvm/eventfd.c
 +++ b/virt/kvm/eventfd.c
 @@ -36,6 +36,68 @@
  #include iodev.h
  
  /*
 + * An irq_source_id can be created from KVM_IRQFD for level interrupt
 + * injections and shared with other interfaces for EOI or de-assert.
 + * Create an object with reference counting to make it easy to use.
 + */
 +struct _irq_source {
 + int id; /* the IRQ source ID */
 + bool level_asserted; /* Track assertion state and protect with lock */
 + spinlock_t lock; /* to avoid unnecessary re-assert/spurious eoi. */
 + struct kvm *kvm;
 + struct kref kref;
 +};
 +
 +static void _irq_source_release(struct kref *kref)
 +{
 + struct _irq_source *source;
 +
 + source = container_of(kref, struct _irq_source, kref);
 +
 + /* This also de-asserts */
 + kvm_free_irq_source_id(source-kvm, source-id);
 + kfree(source);
 +}
 +
 +static void _irq_source_put(struct _irq_source *source)
 +{
 + if (source)
 + kref_put(source-kref, _irq_source_release);
 +}
 +
 +static struct _irq_source *__attribute__ ((used)) /* white lie for now */
 +_irq_source_get(struct _irq_source *source)
 +{
 + if (source)
 + kref_get(source-kref);
 +
 + return source;
 +}
 +
 +static struct _irq_source *_irq_source_alloc(struct kvm *kvm)
 +{
 + struct _irq_source *source;
 + int id;
 +
 + source = kzalloc(sizeof(*source), GFP_KERNEL);
 + if (!source)
 + return ERR_PTR(-ENOMEM);
 +
 + id = kvm_request_irq_source_id(kvm);
 + if (id  0) {
 + kfree(source);
 + return ERR_PTR(id);
 + }
 +
 + kref_init(source-kref);
 + spin_lock_init(source-lock);
 + source-kvm = kvm;
 + source-id = id;
 +
 + return source;
 +}
 +
 +/*
   * 
   * irqfd: Allows an fd to be used to inject an interrupt to the guest
   *
 @@ -52,6 +114,8 @@ struct _irqfd {
   /* Used for level IRQ fast-path */
   int gsi;
   struct work_struct inject;
 + /* IRQ source ID for level triggered irqfds */
 + struct 

Re: [PATCH v5 0/4] kvm: level irqfd and new eoifd

2012-07-18 Thread Michael S. Tsirkin
On Mon, Jul 16, 2012 at 02:33:38PM -0600, Alex Williamson wrote:
 v5:
  - irqfds now have a one-to-one mapping with eoifds to prevent users
from consuming all of kernel memory by repeatedly creating eoifds
from a single irqfd.
  - implement a kvm_clear_irq() which does a test_and_clear_bit of
the irq_state, only updating the pic/ioapic if changes and allowing
the caller to know if anything was done.  I added this onto the end
as it's essentially an optimization on the previous design.  It's
hard to tell if there's an actual performance benefit to this.

I have to agree to this, but we need to avoid invoking kvm_set_irq in
atomic context, without introducing spurious EOIs.

Can the bool + spinlock that the previous patch has be replaced by an atomic?

  - dropped eoifd gsi support patch as it was only an FYI.
 
 Thanks,
 
 Alex
 
 ---
 
 Alex Williamson (4):
   kvm: Convert eoifd to use kvm_clear_irq
   kvm: Create kvm_clear_irq()
   kvm: KVM_EOIFD, an eventfd for EOIs
   kvm: Extend irqfd to support level interrupts
 
 
  Documentation/virtual/kvm/api.txt |   28 +++
  arch/x86/kvm/x86.c|3 
  include/linux/kvm.h   |   18 ++
  include/linux/kvm_host.h  |   16 ++
  virt/kvm/eventfd.c|  333 
 +
  virt/kvm/irq_comm.c   |   78 +
  virt/kvm/kvm_main.c   |   11 +
  7 files changed, 483 insertions(+), 4 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Gleb Natapov
On Wed, Jul 18, 2012 at 01:41:14PM +0300, Michael S. Tsirkin wrote:
 On Mon, Jul 16, 2012 at 02:33:47PM -0600, Alex Williamson wrote:
  In order to inject a level interrupt from an external source using an
  irqfd, we need to allocate a new irq_source_id.  This allows us to
  assert and (later) de-assert an interrupt line independently from
  users of KVM_IRQ_LINE and avoid lost interrupts.
  
  We also add what may appear like a bit of excessive infrastructure
  around an object for storing this irq_source_id.  However, notice
  that we only provide a way to assert the interrupt here.  A follow-on
  interface will make use of the same irq_source_id to allow de-assert.
  
  Signed-off-by: Alex Williamson alex.william...@redhat.com
  ---
  
   Documentation/virtual/kvm/api.txt |6 ++
   arch/x86/kvm/x86.c|1 
   include/linux/kvm.h   |3 +
   virt/kvm/eventfd.c|  114 
  -
   4 files changed, 120 insertions(+), 4 deletions(-)
  
  diff --git a/Documentation/virtual/kvm/api.txt 
  b/Documentation/virtual/kvm/api.txt
  index 100acde..c7267d5 100644
  --- a/Documentation/virtual/kvm/api.txt
  +++ b/Documentation/virtual/kvm/api.txt
  @@ -1981,6 +1981,12 @@ the guest using the specified gsi pin.  The irqfd is 
  removed using
   the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd
   and kvm_irqfd.gsi.
   
  +The KVM_IRQFD_FLAG_LEVEL flag indicates the gsi input is for a level
  +triggered interrupt.  In this case a new irqchip input is allocated
  +which is logically OR'd with other inputs allowing multiple sources
  +to independently assert level interrupts.  The KVM_IRQFD_FLAG_LEVEL
  +is only necessary on setup, teardown is identical to that above.
  +KVM_IRQFD_FLAG_LEVEL support is indicated by KVM_CAP_IRQFD_LEVEL.
   
   5. The kvm_run structure
   
  diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
  index a01a424..80bed07 100644
  --- a/arch/x86/kvm/x86.c
  +++ b/arch/x86/kvm/x86.c
  @@ -2148,6 +2148,7 @@ int kvm_dev_ioctl_check_extension(long ext)
  case KVM_CAP_GET_TSC_KHZ:
  case KVM_CAP_PCI_2_3:
  case KVM_CAP_KVMCLOCK_CTRL:
  +   case KVM_CAP_IRQFD_LEVEL:
  r = 1;
  break;
  case KVM_CAP_COALESCED_MMIO:
  diff --git a/include/linux/kvm.h b/include/linux/kvm.h
  index 2ce09aa..b2e6e4f 100644
  --- a/include/linux/kvm.h
  +++ b/include/linux/kvm.h
  @@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info {
   #define KVM_CAP_PPC_GET_SMMU_INFO 78
   #define KVM_CAP_S390_COW 79
   #define KVM_CAP_PPC_ALLOC_HTAB 80
  +#define KVM_CAP_IRQFD_LEVEL 81
   
   #ifdef KVM_CAP_IRQ_ROUTING
   
  @@ -683,6 +684,8 @@ struct kvm_xen_hvm_config {
   #endif
   
   #define KVM_IRQFD_FLAG_DEASSIGN (1  0)
  +/* Available with KVM_CAP_IRQFD_LEVEL */
  +#define KVM_IRQFD_FLAG_LEVEL (1  1)
   
   struct kvm_irqfd {
  __u32 fd;
  diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
  index 7d7e2aa..ecdbfea 100644
  --- a/virt/kvm/eventfd.c
  +++ b/virt/kvm/eventfd.c
  @@ -36,6 +36,68 @@
   #include iodev.h
   
   /*
  + * An irq_source_id can be created from KVM_IRQFD for level interrupt
  + * injections and shared with other interfaces for EOI or de-assert.
  + * Create an object with reference counting to make it easy to use.
  + */
  +struct _irq_source {
  +   int id; /* the IRQ source ID */
  +   bool level_asserted; /* Track assertion state and protect with lock */
  +   spinlock_t lock; /* to avoid unnecessary re-assert/spurious eoi. */
  +   struct kvm *kvm;
  +   struct kref kref;
  +};
  +
  +static void _irq_source_release(struct kref *kref)
  +{
  +   struct _irq_source *source;
  +
  +   source = container_of(kref, struct _irq_source, kref);
  +
  +   /* This also de-asserts */
  +   kvm_free_irq_source_id(source->kvm, source->id);
  +   kfree(source);
  +}
  +
  +static void _irq_source_put(struct _irq_source *source)
  +{
  +   if (source)
  +   kref_put(&source->kref, _irq_source_release);
  +}
  +
  +static struct _irq_source *__attribute__ ((used)) /* white lie for now */
  +_irq_source_get(struct _irq_source *source)
  +{
  +   if (source)
  +   kref_get(&source->kref);
  +
  +   return source;
  +}
  +
  +static struct _irq_source *_irq_source_alloc(struct kvm *kvm)
  +{
  +   struct _irq_source *source;
  +   int id;
  +
  +   source = kzalloc(sizeof(*source), GFP_KERNEL);
  +   if (!source)
  +   return ERR_PTR(-ENOMEM);
  +
  +   id = kvm_request_irq_source_id(kvm);
  +   if (id < 0) {
  +   kfree(source);
  +   return ERR_PTR(id);
  +   }
  +
  +   kref_init(&source->kref);
  +   spin_lock_init(&source->lock);
  +   source->kvm = kvm;
  +   source->id = id;
  +
  +   return source;
  +}
  +
  +/*
* 
* irqfd: Allows an fd to be used to inject an interrupt to the guest
*
  @@ -52,6 +114,8 @@ struct _irqfd {
  /* Used for level IRQ 

Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 01:44:29PM +0300, Gleb Natapov wrote:
 On Wed, Jul 18, 2012 at 01:41:14PM +0300, Michael S. Tsirkin wrote:
  On Mon, Jul 16, 2012 at 02:33:47PM -0600, Alex Williamson wrote:
   In order to inject a level interrupt from an external source using an
   irqfd, we need to allocate a new irq_source_id.  This allows us to
   assert and (later) de-assert an interrupt line independently from
   users of KVM_IRQ_LINE and avoid lost interrupts.
   
   We also add what may appear like a bit of excessive infrastructure
   around an object for storing this irq_source_id.  However, notice
   that we only provide a way to assert the interrupt here.  A follow-on
   interface will make use of the same irq_source_id to allow de-assert.
   
   Signed-off-by: Alex Williamson alex.william...@redhat.com
   ---
   
Documentation/virtual/kvm/api.txt |6 ++
arch/x86/kvm/x86.c|1 
include/linux/kvm.h   |3 +
virt/kvm/eventfd.c|  114 
   -
4 files changed, 120 insertions(+), 4 deletions(-)
   
   diff --git a/Documentation/virtual/kvm/api.txt 
   b/Documentation/virtual/kvm/api.txt
   index 100acde..c7267d5 100644
   --- a/Documentation/virtual/kvm/api.txt
   +++ b/Documentation/virtual/kvm/api.txt
   @@ -1981,6 +1981,12 @@ the guest using the specified gsi pin.  The irqfd 
   is removed using
the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd
and kvm_irqfd.gsi.

   +The KVM_IRQFD_FLAG_LEVEL flag indicates the gsi input is for a level
   +triggered interrupt.  In this case a new irqchip input is allocated
   +which is logically OR'd with other inputs allowing multiple sources
   +to independently assert level interrupts.  The KVM_IRQFD_FLAG_LEVEL
   +is only necessary on setup, teardown is identical to that above.
   +KVM_IRQFD_FLAG_LEVEL support is indicated by KVM_CAP_IRQFD_LEVEL.

5. The kvm_run structure

   diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
   index a01a424..80bed07 100644
   --- a/arch/x86/kvm/x86.c
   +++ b/arch/x86/kvm/x86.c
   @@ -2148,6 +2148,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 case KVM_CAP_GET_TSC_KHZ:
 case KVM_CAP_PCI_2_3:
 case KVM_CAP_KVMCLOCK_CTRL:
   + case KVM_CAP_IRQFD_LEVEL:
 r = 1;
 break;
 case KVM_CAP_COALESCED_MMIO:
   diff --git a/include/linux/kvm.h b/include/linux/kvm.h
   index 2ce09aa..b2e6e4f 100644
   --- a/include/linux/kvm.h
   +++ b/include/linux/kvm.h
   @@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_PPC_GET_SMMU_INFO 78
#define KVM_CAP_S390_COW 79
#define KVM_CAP_PPC_ALLOC_HTAB 80
   +#define KVM_CAP_IRQFD_LEVEL 81

#ifdef KVM_CAP_IRQ_ROUTING

   @@ -683,6 +684,8 @@ struct kvm_xen_hvm_config {
#endif

#define KVM_IRQFD_FLAG_DEASSIGN (1  0)
   +/* Available with KVM_CAP_IRQFD_LEVEL */
   +#define KVM_IRQFD_FLAG_LEVEL (1  1)

struct kvm_irqfd {
 __u32 fd;
   diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
   index 7d7e2aa..ecdbfea 100644
   --- a/virt/kvm/eventfd.c
   +++ b/virt/kvm/eventfd.c
   @@ -36,6 +36,68 @@
#include iodev.h

/*
   + * An irq_source_id can be created from KVM_IRQFD for level interrupt
   + * injections and shared with other interfaces for EOI or de-assert.
   + * Create an object with reference counting to make it easy to use.
   + */
   +struct _irq_source {
   + int id; /* the IRQ source ID */
   + bool level_asserted; /* Track assertion state and protect with lock */
   + spinlock_t lock; /* to avoid unnecessary re-assert/spurious eoi. */
   + struct kvm *kvm;
   + struct kref kref;
   +};
   +
   +static void _irq_source_release(struct kref *kref)
   +{
   + struct _irq_source *source;
   +
   + source = container_of(kref, struct _irq_source, kref);
   +
   + /* This also de-asserts */
   + kvm_free_irq_source_id(source-kvm, source-id);
   + kfree(source);
   +}
   +
   +static void _irq_source_put(struct _irq_source *source)
   +{
   + if (source)
   + kref_put(source-kref, _irq_source_release);
   +}
   +
   +static struct _irq_source *__attribute__ ((used)) /* white lie for now */
   +_irq_source_get(struct _irq_source *source)
   +{
   + if (source)
   + kref_get(source-kref);
   +
   + return source;
   +}
   +
   +static struct _irq_source *_irq_source_alloc(struct kvm *kvm)
   +{
   + struct _irq_source *source;
   + int id;
   +
   + source = kzalloc(sizeof(*source), GFP_KERNEL);
   + if (!source)
   + return ERR_PTR(-ENOMEM);
   +
   + id = kvm_request_irq_source_id(kvm);
   + if (id  0) {
   + kfree(source);
   + return ERR_PTR(id);
   + }
   +
   + kref_init(source-kref);
   + spin_lock_init(source-lock);
   + source-kvm = kvm;
   + source-id = id;
   +
   + return source;
   +}
   +
   +/*
 * 
 * irqfd: 

Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Gleb Natapov
On Wed, Jul 18, 2012 at 01:48:44PM +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 18, 2012 at 01:44:29PM +0300, Gleb Natapov wrote:
  On Wed, Jul 18, 2012 at 01:41:14PM +0300, Michael S. Tsirkin wrote:
   On Mon, Jul 16, 2012 at 02:33:47PM -0600, Alex Williamson wrote:
In order to inject a level interrupt from an external source using an
irqfd, we need to allocate a new irq_source_id.  This allows us to
assert and (later) de-assert an interrupt line independently from
users of KVM_IRQ_LINE and avoid lost interrupts.

We also add what may appear like a bit of excessive infrastructure
around an object for storing this irq_source_id.  However, notice
that we only provide a way to assert the interrupt here.  A follow-on
interface will make use of the same irq_source_id to allow de-assert.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---

 Documentation/virtual/kvm/api.txt |6 ++
 arch/x86/kvm/x86.c|1 
 include/linux/kvm.h   |3 +
 virt/kvm/eventfd.c|  114 
-
 4 files changed, 120 insertions(+), 4 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 100acde..c7267d5 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1981,6 +1981,12 @@ the guest using the specified gsi pin.  The 
irqfd is removed using
 the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd
 and kvm_irqfd.gsi.
 
+The KVM_IRQFD_FLAG_LEVEL flag indicates the gsi input is for a level
+triggered interrupt.  In this case a new irqchip input is allocated
+which is logically OR'd with other inputs allowing multiple sources
+to independently assert level interrupts.  The KVM_IRQFD_FLAG_LEVEL
+is only necessary on setup, teardown is identical to that above.
+KVM_IRQFD_FLAG_LEVEL support is indicated by KVM_CAP_IRQFD_LEVEL.
 
 5. The kvm_run structure
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a01a424..80bed07 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2148,6 +2148,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_PCI_2_3:
case KVM_CAP_KVMCLOCK_CTRL:
+   case KVM_CAP_IRQFD_LEVEL:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 2ce09aa..b2e6e4f 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
 #define KVM_CAP_S390_COW 79
 #define KVM_CAP_PPC_ALLOC_HTAB 80
+#define KVM_CAP_IRQFD_LEVEL 81
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -683,6 +684,8 @@ struct kvm_xen_hvm_config {
 #endif
 
 #define KVM_IRQFD_FLAG_DEASSIGN (1  0)
+/* Available with KVM_CAP_IRQFD_LEVEL */
+#define KVM_IRQFD_FLAG_LEVEL (1  1)
 
 struct kvm_irqfd {
__u32 fd;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 7d7e2aa..ecdbfea 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -36,6 +36,68 @@
 #include iodev.h
 
 /*
+ * An irq_source_id can be created from KVM_IRQFD for level interrupt
+ * injections and shared with other interfaces for EOI or de-assert.
+ * Create an object with reference counting to make it easy to use.
+ */
+struct _irq_source {
+   int id; /* the IRQ source ID */
+   bool level_asserted; /* Track assertion state and protect with 
lock */
+   spinlock_t lock; /* to avoid unnecessary re-assert/spurious 
eoi. */
+   struct kvm *kvm;
+   struct kref kref;
+};
+
+static void _irq_source_release(struct kref *kref)
+{
+   struct _irq_source *source;
+
+   source = container_of(kref, struct _irq_source, kref);
+
+   /* This also de-asserts */
+   kvm_free_irq_source_id(source-kvm, source-id);
+   kfree(source);
+}
+
+static void _irq_source_put(struct _irq_source *source)
+{
+   if (source)
+   kref_put(source-kref, _irq_source_release);
+}
+
+static struct _irq_source *__attribute__ ((used)) /* white lie for now 
*/
+_irq_source_get(struct _irq_source *source)
+{
+   if (source)
+   kref_get(source-kref);
+
+   return source;
+}
+
+static struct _irq_source *_irq_source_alloc(struct kvm *kvm)
+{
+   struct _irq_source *source;
+   int id;
+
+   source = kzalloc(sizeof(*source), GFP_KERNEL);
+   if (!source)
+   return 

Re: [PATCH v5 3/4] kvm: Create kvm_clear_irq()

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 01:36:08PM +0300, Gleb Natapov wrote:
 On Wed, Jul 18, 2012 at 01:33:35PM +0300, Michael S. Tsirkin wrote:
  On Wed, Jul 18, 2012 at 01:27:39PM +0300, Gleb Natapov wrote:
   On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
 On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
   _Seems_ racy, or _is_ racy?  Please identify the race.
  
  Look at this:
  
  static inline int kvm_irq_line_state(unsigned long *irq_state,
   int irq_source_id, int level)
  {
  /* Logical OR for level trig interrupt */
  if (level)
  set_bit(irq_source_id, irq_state);
  else
  clear_bit(irq_source_id, irq_state);
  
  return !!(*irq_state);
  }
  
  
  Now:
  If other CPU changes some other bit after the atomic change,
  it looks like !!(*irq_state) might return a stale value.
  
  CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
  If CPU 0 sees a stale value now it will return 0 here
  and interrupt will get cleared.
  
 This will hardly happen on x86 especially since bit is set with
 serialized instruction.

Probably. But it does make me a bit uneasy.  Why don't we pass
irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
__set_bit/__clear_bit in kvm_irq_line_state, making the ordering simpler
and saving an atomic op in the process.

   With my patch I do not see why we can't change them to unlocked variant
   without moving them anywhere. The only requirement is to not use RMW
   sequence to set/clear bits. The ordering of setting does not matter. The
   ordering of reading is.
  
  You want to use __set_bit/__clear_bit on the same word
  from multiple CPUs, without locking?
  Why won't this lose information?
 Because it is not RMW. If it is then yes, you can't do that.

You are saying __set_bit does not do RMW on x86? Interesting.
It's probably not a good idea to rely on this I think.

  
  In any case, it seems simpler and safer to do accesses under lock
  than rely on specific use.
  
   --
 Gleb.
 
 --
   Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 01:49:06PM +0300, Gleb Natapov wrote:
 On Wed, Jul 18, 2012 at 01:48:44PM +0300, Michael S. Tsirkin wrote:
  On Wed, Jul 18, 2012 at 01:44:29PM +0300, Gleb Natapov wrote:
   On Wed, Jul 18, 2012 at 01:41:14PM +0300, Michael S. Tsirkin wrote:
On Mon, Jul 16, 2012 at 02:33:47PM -0600, Alex Williamson wrote:
 In order to inject a level interrupt from an external source using an
 irqfd, we need to allocate a new irq_source_id.  This allows us to
 assert and (later) de-assert an interrupt line independently from
 users of KVM_IRQ_LINE and avoid lost interrupts.
 
 We also add what may appear like a bit of excessive infrastructure
 around an object for storing this irq_source_id.  However, notice
 that we only provide a way to assert the interrupt here.  A follow-on
 interface will make use of the same irq_source_id to allow de-assert.
 
 Signed-off-by: Alex Williamson alex.william...@redhat.com
 ---
 
  Documentation/virtual/kvm/api.txt |6 ++
  arch/x86/kvm/x86.c|1 
  include/linux/kvm.h   |3 +
  virt/kvm/eventfd.c|  114 
 -
  4 files changed, 120 insertions(+), 4 deletions(-)
 
 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index 100acde..c7267d5 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -1981,6 +1981,12 @@ the guest using the specified gsi pin.  The 
 irqfd is removed using
  the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd
  and kvm_irqfd.gsi.
  
 +The KVM_IRQFD_FLAG_LEVEL flag indicates the gsi input is for a level
 +triggered interrupt.  In this case a new irqchip input is allocated
 +which is logically OR'd with other inputs allowing multiple sources
 +to independently assert level interrupts.  The KVM_IRQFD_FLAG_LEVEL
 +is only necessary on setup, teardown is identical to that above.
 +KVM_IRQFD_FLAG_LEVEL support is indicated by KVM_CAP_IRQFD_LEVEL.
  
  5. The kvm_run structure
  
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index a01a424..80bed07 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -2148,6 +2148,7 @@ int kvm_dev_ioctl_check_extension(long ext)
   case KVM_CAP_GET_TSC_KHZ:
   case KVM_CAP_PCI_2_3:
   case KVM_CAP_KVMCLOCK_CTRL:
 + case KVM_CAP_IRQFD_LEVEL:
   r = 1;
   break;
   case KVM_CAP_COALESCED_MMIO:
 diff --git a/include/linux/kvm.h b/include/linux/kvm.h
 index 2ce09aa..b2e6e4f 100644
 --- a/include/linux/kvm.h
 +++ b/include/linux/kvm.h
 @@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info {
  #define KVM_CAP_PPC_GET_SMMU_INFO 78
  #define KVM_CAP_S390_COW 79
  #define KVM_CAP_PPC_ALLOC_HTAB 80
 +#define KVM_CAP_IRQFD_LEVEL 81
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
 @@ -683,6 +684,8 @@ struct kvm_xen_hvm_config {
  #endif
  
  #define KVM_IRQFD_FLAG_DEASSIGN (1  0)
 +/* Available with KVM_CAP_IRQFD_LEVEL */
 +#define KVM_IRQFD_FLAG_LEVEL (1  1)
  
  struct kvm_irqfd {
   __u32 fd;
 diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
 index 7d7e2aa..ecdbfea 100644
 --- a/virt/kvm/eventfd.c
 +++ b/virt/kvm/eventfd.c
 @@ -36,6 +36,68 @@
  #include iodev.h
  
  /*
 + * An irq_source_id can be created from KVM_IRQFD for level interrupt
 + * injections and shared with other interfaces for EOI or de-assert.
 + * Create an object with reference counting to make it easy to use.
 + */
 +struct _irq_source {
 + int id; /* the IRQ source ID */
 + bool level_asserted; /* Track assertion state and protect with 
 lock */
 + spinlock_t lock; /* to avoid unnecessary re-assert/spurious 
 eoi. */
 + struct kvm *kvm;
 + struct kref kref;
 +};
 +
 +static void _irq_source_release(struct kref *kref)
 +{
 + struct _irq_source *source;
 +
 + source = container_of(kref, struct _irq_source, kref);
 +
 + /* This also de-asserts */
 + kvm_free_irq_source_id(source-kvm, source-id);
 + kfree(source);
 +}
 +
 +static void _irq_source_put(struct _irq_source *source)
 +{
 + if (source)
 + kref_put(source-kref, _irq_source_release);
 +}
 +
 +static struct _irq_source *__attribute__ ((used)) /* white lie for 
 now */
 +_irq_source_get(struct _irq_source *source)
 +{
 + if (source)
 + kref_get(source-kref);
 +
 + return source;
 +}
 +
 +static struct _irq_source *_irq_source_alloc(struct kvm *kvm)
 +{
 + struct _irq_source *source;
 + 

Re: [PATCH v5 3/4] kvm: Create kvm_clear_irq()

2012-07-18 Thread Gleb Natapov
On Wed, Jul 18, 2012 at 01:51:05PM +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 18, 2012 at 01:36:08PM +0300, Gleb Natapov wrote:
  On Wed, Jul 18, 2012 at 01:33:35PM +0300, Michael S. Tsirkin wrote:
   On Wed, Jul 18, 2012 at 01:27:39PM +0300, Gleb Natapov wrote:
On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
  On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin wrote:
_Seems_ racy, or _is_ racy?  Please identify the race.
   
   Look at this:
   
   static inline int kvm_irq_line_state(unsigned long *irq_state,
int irq_source_id, int level)
   {
   /* Logical OR for level trig interrupt */
   if (level)
   set_bit(irq_source_id, irq_state);
   else
   clear_bit(irq_source_id, irq_state);
   
   return !!(*irq_state);
   }
   
   
   Now:
   If other CPU changes some other bit after the atomic change,
   it looks like !!(*irq_state) might return a stale value.
   
   CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
   If CPU 0 sees a stale value now it will return 0 here
   and interrupt will get cleared.
   
  This will hardly happen on x86 especially since bit is set with
  serialized instruction.
 
 Probably. But it does make me a bit uneasy.  Why don't we pass
 irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
 kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
 __set_bit/__clear_bit in kvm_irq_line_state, making the ordering 
 simpler
 and saving an atomic op in the process.
 
With my patch I do not see why we can't change them to unlocked variant
without moving them anywhere. The only requirement is to not use RMW
sequence to set/clear bits. The ordering of setting does not matter. The
ordering of reading is.
   
   You want to use __set_bit/__clear_bit on the same word
   from multiple CPUs, without locking?
   Why won't this lose information?
  Because it is not RMW. If it is then yes, you can't do that.
 
 You are saying __set_bit does not do RMW on x86? Interesting.
I think it doesn't.

 It's probably not a good idea to rely on this I think.
 
The code is not in arch/x86, so probably not. Although it is used only on
x86 (and ia64, which has broken kvm anyway).

   
   In any case, it seems simpler and safer to do accesses under lock
   than rely on specific use.
   
--
Gleb.
  
  --
  Gleb.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Gleb Natapov
On Wed, Jul 18, 2012 at 01:53:11PM +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 18, 2012 at 01:49:06PM +0300, Gleb Natapov wrote:
  On Wed, Jul 18, 2012 at 01:48:44PM +0300, Michael S. Tsirkin wrote:
   On Wed, Jul 18, 2012 at 01:44:29PM +0300, Gleb Natapov wrote:
On Wed, Jul 18, 2012 at 01:41:14PM +0300, Michael S. Tsirkin wrote:
 On Mon, Jul 16, 2012 at 02:33:47PM -0600, Alex Williamson wrote:
  In order to inject a level interrupt from an external source using 
  an
  irqfd, we need to allocate a new irq_source_id.  This allows us to
  assert and (later) de-assert an interrupt line independently from
  users of KVM_IRQ_LINE and avoid lost interrupts.
  
  We also add what may appear like a bit of excessive infrastructure
  around an object for storing this irq_source_id.  However, notice
  that we only provide a way to assert the interrupt here.  A 
  follow-on
  interface will make use of the same irq_source_id to allow 
  de-assert.
  
  Signed-off-by: Alex Williamson alex.william...@redhat.com
  ---
  
   Documentation/virtual/kvm/api.txt |6 ++
   arch/x86/kvm/x86.c|1 
   include/linux/kvm.h   |3 +
   virt/kvm/eventfd.c|  114 
  -
   4 files changed, 120 insertions(+), 4 deletions(-)
  
  diff --git a/Documentation/virtual/kvm/api.txt 
  b/Documentation/virtual/kvm/api.txt
  index 100acde..c7267d5 100644
  --- a/Documentation/virtual/kvm/api.txt
  +++ b/Documentation/virtual/kvm/api.txt
  @@ -1981,6 +1981,12 @@ the guest using the specified gsi pin.  The 
  irqfd is removed using
   the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd
   and kvm_irqfd.gsi.
   
  +The KVM_IRQFD_FLAG_LEVEL flag indicates the gsi input is for a 
  level
  +triggered interrupt.  In this case a new irqchip input is allocated
  +which is logically OR'd with other inputs allowing multiple sources
  +to independently assert level interrupts.  The KVM_IRQFD_FLAG_LEVEL
  +is only necessary on setup, teardown is identical to that above.
  +KVM_IRQFD_FLAG_LEVEL support is indicated by KVM_CAP_IRQFD_LEVEL.
   
   5. The kvm_run structure
   
  diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
  index a01a424..80bed07 100644
  --- a/arch/x86/kvm/x86.c
  +++ b/arch/x86/kvm/x86.c
  @@ -2148,6 +2148,7 @@ int kvm_dev_ioctl_check_extension(long ext)
  case KVM_CAP_GET_TSC_KHZ:
  case KVM_CAP_PCI_2_3:
  case KVM_CAP_KVMCLOCK_CTRL:
  +   case KVM_CAP_IRQFD_LEVEL:
  r = 1;
  break;
  case KVM_CAP_COALESCED_MMIO:
  diff --git a/include/linux/kvm.h b/include/linux/kvm.h
  index 2ce09aa..b2e6e4f 100644
  --- a/include/linux/kvm.h
  +++ b/include/linux/kvm.h
  @@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info {
   #define KVM_CAP_PPC_GET_SMMU_INFO 78
   #define KVM_CAP_S390_COW 79
   #define KVM_CAP_PPC_ALLOC_HTAB 80
  +#define KVM_CAP_IRQFD_LEVEL 81
   
   #ifdef KVM_CAP_IRQ_ROUTING
   
  @@ -683,6 +684,8 @@ struct kvm_xen_hvm_config {
   #endif
   
   #define KVM_IRQFD_FLAG_DEASSIGN (1  0)
  +/* Available with KVM_CAP_IRQFD_LEVEL */
  +#define KVM_IRQFD_FLAG_LEVEL (1  1)
   
   struct kvm_irqfd {
  __u32 fd;
  diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
  index 7d7e2aa..ecdbfea 100644
  --- a/virt/kvm/eventfd.c
  +++ b/virt/kvm/eventfd.c
  @@ -36,6 +36,68 @@
   #include iodev.h
   
   /*
  + * An irq_source_id can be created from KVM_IRQFD for level 
  interrupt
  + * injections and shared with other interfaces for EOI or 
  de-assert.
  + * Create an object with reference counting to make it easy to use.
  + */
  +struct _irq_source {
  +   int id; /* the IRQ source ID */
  +   bool level_asserted; /* Track assertion state and protect with 
  lock */
  +   spinlock_t lock; /* to avoid unnecessary re-assert/spurious 
  eoi. */
  +   struct kvm *kvm;
  +   struct kref kref;
  +};
  +
  +static void _irq_source_release(struct kref *kref)
  +{
  +   struct _irq_source *source;
  +
  +   source = container_of(kref, struct _irq_source, kref);
  +
  +   /* This also de-asserts */
  +   kvm_free_irq_source_id(source-kvm, source-id);
  +   kfree(source);
  +}
  +
  +static void _irq_source_put(struct _irq_source *source)
  +{
  +   if (source)
  +   kref_put(source-kref, _irq_source_release);
  +}
  +
  +static struct _irq_source *__attribute__ ((used)) /* white lie for 
  now */
  +_irq_source_get(struct _irq_source *source)
  +{
  +   if (source)
  +   

Ubuntu 12.04 (kvm host) Centos6.3 (guest) , rebooting from inside centos6.3 VM - gets stuck on seabios/grub loop - previous kernels fine - occurs on multiple servers

2012-07-18 Thread Morgan Cox
I have a really annoying bug, I can reproduce often (although it is a
bit random).

I have an Ubuntu 12.04 KVM server, using Centos 6 guests - when I
install the latest kernel for centos 6.3 (2.6.32-279.1.1.el6) - if you
reboot from inside a Centos6 vm it gets stuck in a loop between
seabios/grub - this doesn't happen 100% of the time - there is a high
chance it will though - usually after 3 reboots it will get stuck in
the loop - it will NEVER reboot without manual intervention (virsh
destroy..)
It seems to fail at the kernel initialise stage - if I use vga=normal I
can see the words 'Probing EDD...' for a sec (then it reboots)

If I use virsh/virt-manager to reboot it's fine; only from inside a
centos6 vm (with latest centos kernel) does this occur

As a test I installed centos 6.2 - this was 100% fine *until* I did a
yum update then I got the same issue.

Ubuntu 10.04 KVM host / Centos 6.3 (guest) is fine - so I'm unsure
where the fault is. Likewise a Centos 6.3 kvm host and centos 6.3
guest is also fine...

I have installed a 2nd Ubuntu 12.04 KVM server and the exact same
thing occurs (i.e 2 different servers = same issue)

How can I troubleshoot this? I have already enabled the boot
option 'console=ttyS0' (with which I can access VMs using virsh console
<id>), however this gives no output (as it crashes when initialising the
kernel)

I have also tried installing the latest qemu-kvm (1.1) from source on
the Ubuntu 12.04 kvm server, the same thing occurs .

I have reported bugs @ Ubuntu and Centos (only Ubuntu have bothered to respond)

https://bugs.launchpad.net/centos/+bug/1025188
http://bugs.centos.org/view.php?id=5841

There is a vid showing the issue here

https://launchpadlibrarian.net/110374291/out.ogv

(you can't see the boot msgs as I am using console=ttyS0 on the boot options)

Any info, help, hints at where to look next would be useful

Regards
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 3/4] kvm: Create kvm_clear_irq()

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 01:53:15PM +0300, Gleb Natapov wrote:
 On Wed, Jul 18, 2012 at 01:51:05PM +0300, Michael S. Tsirkin wrote:
  On Wed, Jul 18, 2012 at 01:36:08PM +0300, Gleb Natapov wrote:
   On Wed, Jul 18, 2012 at 01:33:35PM +0300, Michael S. Tsirkin wrote:
On Wed, Jul 18, 2012 at 01:27:39PM +0300, Gleb Natapov wrote:
 On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
  On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
   On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin 
   wrote:
 _Seems_ racy, or _is_ racy?  Please identify the race.

Look at this:

static inline int kvm_irq_line_state(unsigned long *irq_state,
 int irq_source_id, int 
level)
{
/* Logical OR for level trig interrupt */
if (level)
set_bit(irq_source_id, irq_state);
else
clear_bit(irq_source_id, irq_state);

return !!(*irq_state);
}


Now:
If other CPU changes some other bit after the atomic change,
it looks like !!(*irq_state) might return a stale value.

CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
If CPU 0 sees a stale value now it will return 0 here
and interrupt will get cleared.

   This will hardly happen on x86 especially since bit is set with
   serialized instruction.
  
  Probably. But it does make me a bit uneasy.  Why don't we pass
  irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
  kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
  __set_bit/__clear_bit in kvm_irq_line_state, making the ordering 
  simpler
  and saving an atomic op in the process.
  
 With my patch I do not see why we can't change them to unlocked 
 variant
 without moving them anywhere. The only requirement is to not use RMW
 sequence to set/clear bits. The ordering of setting does not matter. 
 The
 ordering of reading is.

You want to use __set_bit/__clear_bit on the same word
from multiple CPUs, without locking?
Why won't this lose information?
   Because it is not RMW. If it is then yes, you can't do that.
  
  You are saying __set_bit does not do RMW on x86? Interesting.
 I think it doesn't.

Anywhere I can read about this?

  It's probably not a good idea to rely on this I think.
  
 The code is no in arch/x86 so probably no. Although it is used only on
 x86 (and ia64 which has broken kvm anyway).

Yes but exactly the reverse is documented.

/**
 * __set_bit - Set a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 *
 * Unlike set_bit(), this function is non-atomic and may be reordered.


 pls note the below

 * If it's called on the same region of memory simultaneously, the effect
 * may be that only one operation succeeds.
 until here

 */
static inline void __set_bit(int nr, volatile unsigned long *addr)
{
asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
}





In any case, it seems simpler and safer to do accesses under lock
than rely on specific use.

 --
   Gleb.
   
   --
 Gleb.
 
 --
   Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 01:55:30PM +0300, Gleb Natapov wrote:
 On Wed, Jul 18, 2012 at 01:53:11PM +0300, Michael S. Tsirkin wrote:
  On Wed, Jul 18, 2012 at 01:49:06PM +0300, Gleb Natapov wrote:
   On Wed, Jul 18, 2012 at 01:48:44PM +0300, Michael S. Tsirkin wrote:
On Wed, Jul 18, 2012 at 01:44:29PM +0300, Gleb Natapov wrote:
 On Wed, Jul 18, 2012 at 01:41:14PM +0300, Michael S. Tsirkin wrote:
  On Mon, Jul 16, 2012 at 02:33:47PM -0600, Alex Williamson wrote:
   In order to inject a level interrupt from an external source 
   using an
   irqfd, we need to allocate a new irq_source_id.  This allows us to
   assert and (later) de-assert an interrupt line independently from
   users of KVM_IRQ_LINE and avoid lost interrupts.
   
   We also add what may appear like a bit of excessive infrastructure
   around an object for storing this irq_source_id.  However, notice
   that we only provide a way to assert the interrupt here.  A 
   follow-on
   interface will make use of the same irq_source_id to allow 
   de-assert.
   
   Signed-off-by: Alex Williamson alex.william...@redhat.com
   ---
   
Documentation/virtual/kvm/api.txt |6 ++
arch/x86/kvm/x86.c|1 
include/linux/kvm.h   |3 +
virt/kvm/eventfd.c|  114 
   -
4 files changed, 120 insertions(+), 4 deletions(-)
   
   diff --git a/Documentation/virtual/kvm/api.txt 
   b/Documentation/virtual/kvm/api.txt
   index 100acde..c7267d5 100644
   --- a/Documentation/virtual/kvm/api.txt
   +++ b/Documentation/virtual/kvm/api.txt
   @@ -1981,6 +1981,12 @@ the guest using the specified gsi pin.  
   The irqfd is removed using
the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd
and kvm_irqfd.gsi.

   +The KVM_IRQFD_FLAG_LEVEL flag indicates the gsi input is for a 
   level
   +triggered interrupt.  In this case a new irqchip input is 
   allocated
   +which is logically OR'd with other inputs allowing multiple 
   sources
   +to independently assert level interrupts.  The 
   KVM_IRQFD_FLAG_LEVEL
   +is only necessary on setup, teardown is identical to that above.
   +KVM_IRQFD_FLAG_LEVEL support is indicated by KVM_CAP_IRQFD_LEVEL.

5. The kvm_run structure

   diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
   index a01a424..80bed07 100644
   --- a/arch/x86/kvm/x86.c
   +++ b/arch/x86/kvm/x86.c
   @@ -2148,6 +2148,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 case KVM_CAP_GET_TSC_KHZ:
 case KVM_CAP_PCI_2_3:
 case KVM_CAP_KVMCLOCK_CTRL:
   + case KVM_CAP_IRQFD_LEVEL:
 r = 1;
 break;
 case KVM_CAP_COALESCED_MMIO:
   diff --git a/include/linux/kvm.h b/include/linux/kvm.h
   index 2ce09aa..b2e6e4f 100644
   --- a/include/linux/kvm.h
   +++ b/include/linux/kvm.h
   @@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_PPC_GET_SMMU_INFO 78
#define KVM_CAP_S390_COW 79
#define KVM_CAP_PPC_ALLOC_HTAB 80
   +#define KVM_CAP_IRQFD_LEVEL 81

#ifdef KVM_CAP_IRQ_ROUTING

   @@ -683,6 +684,8 @@ struct kvm_xen_hvm_config {
#endif

#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
   +/* Available with KVM_CAP_IRQFD_LEVEL */
   +#define KVM_IRQFD_FLAG_LEVEL (1 << 1)

struct kvm_irqfd {
 __u32 fd;
   diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
   index 7d7e2aa..ecdbfea 100644
   --- a/virt/kvm/eventfd.c
   +++ b/virt/kvm/eventfd.c
   @@ -36,6 +36,68 @@
#include "iodev.h"

/*
   + * An irq_source_id can be created from KVM_IRQFD for level 
   interrupt
   + * injections and shared with other interfaces for EOI or 
   de-assert.
   + * Create an object with reference counting to make it easy to 
   use.
   + */
   +struct _irq_source {
   + int id; /* the IRQ source ID */
   + bool level_asserted; /* Track assertion state and protect with 
   lock */
   + spinlock_t lock; /* to avoid unnecessary re-assert/spurious 
   eoi. */
   + struct kvm *kvm;
   + struct kref kref;
   +};
   +
   +static void _irq_source_release(struct kref *kref)
   +{
   + struct _irq_source *source;
   +
   + source = container_of(kref, struct _irq_source, kref);
   +
   + /* This also de-asserts */
   + kvm_free_irq_source_id(source->kvm, source->id);
   + kfree(source);
   +}
   +
   +static void _irq_source_put(struct _irq_source *source)
   +{
   + if (source)
   + kref_put(&source->kref, _irq_source_release);
   +}
   +
   +static 

Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 02:22:19PM +0300, Michael S. Tsirkin wrote:
   So as was discussed kvm_set_irq under spinlock is bad for 
   scalability
   with multiple VCPUs.  Why do we need a spinlock simply to protect
   level_asserted?  Let's use an atomic test and set/test and clear 
   and the
   problem goes away.
   
  That sad reality is that for level interrupt we already scan all 
  vcpus
  under spinlock.
 
 Where?
 
ioapic
   
   $ grep kvm_for_each_vcpu virt/kvm/ioapic.c
   $
   
   ?
   
  
  Come on Michael. You can do better than grep and actually look at what
  code does. The code that loops over all vcpus while delivering an irq is
  in kvm_irq_delivery_to_apic(). Now grep for that.
 
 Hmm, I see, it's actually done for edge if injected from ioapic too,
 right?
 
 So set_irq does a linear scan, and for each matching CPU it calls
 kvm_irq_delivery_to_apic which is another scan?
 So it's actually N^2 worst case for a broadcast?

No it isn't, I misread the code.


Anyway, maybe not trivially but this looks fixable to me: we could drop
the ioapic lock before calling kvm_irq_delivery_to_apic.

  --
  Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND 0/5] Add vhost-blk support

2012-07-18 Thread Stefan Hajnoczi
On Tue, Jul 17, 2012 at 4:09 PM, Michael S. Tsirkin m...@redhat.com wrote:
 On Fri, Jul 13, 2012 at 04:55:06PM +0800, Asias He wrote:

 Hi folks,

 [I am resending to fix the broken thread in the previous one.]

 This patchset adds vhost-blk support. vhost-blk is an in-kernel virtio-blk
 device accelerator. Compared to userspace virtio-blk implementation, 
 vhost-blk
 gives about 5% to 15% performance improvement.

 Same thing as tcm_host comment:

 It seems not 100% clear whether this driver will have major
 userspace using it. And if not, it would be very hard to support a
 driver when recent userspace does not use it in the end.

 I think a good idea for 3.6 would be to make it depend on
 CONFIG_STAGING.  Then we don't commit to an ABI.  For this, you can 
 add
 a separate Kconfig and source it from drivers/staging/Kconfig.  Maybe 
 it
 needs to be in a separate directory drivers/vhost/staging/Kconfig.

 I Cc'd the list of tcm_host in the hope that you can cooperate on this.


Adding it to staging allows more people to try it out, so that's a
good thing.  If I get a moment to play with it I'll let you know the
results.

Stefan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Gleb Natapov
On Wed, Jul 18, 2012 at 02:39:10PM +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 18, 2012 at 02:22:19PM +0300, Michael S. Tsirkin wrote:
So as was discussed kvm_set_irq under spinlock is bad for 
scalability
with multiple VCPUs.  Why do we need a spinlock simply to 
protect
level_asserted?  Let's use an atomic test and set/test and 
clear and the
problem goes away.

   That sad reality is that for level interrupt we already scan all 
   vcpus
   under spinlock.
  
  Where?
  
 ioapic

$ grep kvm_for_each_vcpu virt/kvm/ioapic.c
$

?

   
   Come on Michael. You can do better than grep and actually look at what
   code does. The code that loops over all vcpus while delivering an irq is
   in kvm_irq_delivery_to_apic(). Now grep for that.
  
  Hmm, I see, it's actually done for edge if injected from ioapic too,
  right?
  
  So set_irq does a linear scan, and for each matching CPU it calls
  kvm_irq_delivery_to_apic which is another scan?
  So it's actually N^2 worst case for a broadcast?
 
 No it isn't, I misread the code.
 
 
 Anyway, maybe not trivially but this looks fixable to me: we could drop
 the ioapic lock before calling kvm_irq_delivery_to_apic.
 
Maybe, maybe not. Just saying "let's drop the lock whenever we don't feel
like holding one" does not cut it. Back to the original point, though: the current
situation is that calling kvm_set_irq() under spinlock is not worse for
scalability than calling it not under one.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 3/4] kvm: Create kvm_clear_irq()

2012-07-18 Thread Gleb Natapov
On Wed, Jul 18, 2012 at 02:08:43PM +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 18, 2012 at 01:53:15PM +0300, Gleb Natapov wrote:
  On Wed, Jul 18, 2012 at 01:51:05PM +0300, Michael S. Tsirkin wrote:
   On Wed, Jul 18, 2012 at 01:36:08PM +0300, Gleb Natapov wrote:
On Wed, Jul 18, 2012 at 01:33:35PM +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 18, 2012 at 01:27:39PM +0300, Gleb Natapov wrote:
  On Wed, Jul 18, 2012 at 01:20:29PM +0300, Michael S. Tsirkin wrote:
   On Wed, Jul 18, 2012 at 09:27:42AM +0300, Gleb Natapov wrote:
On Tue, Jul 17, 2012 at 07:14:52PM +0300, Michael S. Tsirkin 
wrote:
  _Seems_ racy, or _is_ racy?  Please identify the race.
 
 Look at this:
 
 static inline int kvm_irq_line_state(unsigned long *irq_state,
  int irq_source_id, int 
 level)
 {
 /* Logical OR for level trig interrupt */
 if (level)
 set_bit(irq_source_id, irq_state);
 else
 clear_bit(irq_source_id, irq_state);
 
 return !!(*irq_state);
 }
 
 
 Now:
 If other CPU changes some other bit after the atomic change,
 it looks like !!(*irq_state) might return a stale value.
 
 CPU 0 clears bit 0. CPU 1 sets bit 1. CPU 1 sets level to 1.
 If CPU 0 sees a stale value now it will return 0 here
 and interrupt will get cleared.
 
This will hardly happen on x86 especially since bit is set with
serialized instruction.
   
   Probably. But it does make me a bit uneasy.  Why don't we pass
   irq_source_id to kvm_pic_set_irq/kvm_ioapic_set_irq, and move
   kvm_irq_line_state to under pic_lock/ioapic_lock?  We can then use
   __set_bit/__clear_bit in kvm_irq_line_state, making the ordering 
   simpler
   and saving an atomic op in the process.
   
  With my patch I do not see why we can't change them to unlocked 
  variant
  without moving them anywhere. The only requirement is to not use RMW
  sequence to set/clear bits. The ordering of setting does not 
  matter. The
  ordering of reading is.
 
 You want to use __set_bit/__clear_bit on the same word
 from multiple CPUs, without locking?
 Why won't this lose information?
Because it is not RMW. If it is then yes, you can't do that.
   
   You are saying __set_bit does not do RMW on x86? Interesting.
  I think it doesn't.
 
 Anywhere I can read about this?
 
Well actually SDM says LOCK prefix is needed, so yes we cannot use
__set_bit/__clear_bit without moving it under lock.

   It's probably not a good idea to rely on this I think.
   
  The code is not in arch/x86 so probably not. Although it is used only on
  x86 (and ia64 which has broken kvm anyway).
 
 Yes but exactly the reverse is documented.
 
 /**
  * __set_bit - Set a bit in memory
  * @nr: the bit to set
  * @addr: the address to start counting from
  *
  * Unlike set_bit(), this function is non-atomic and may be reordered.
 
 
  pls note the below
 
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  until here
 
  */
 static inline void __set_bit(int nr, volatile unsigned long *addr)
 {
 asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
 }
 
 
 
 
 
 In any case, it seems simpler and safer to do accesses under lock
 than rely on specific use.
 
  --
  Gleb.

--
Gleb.
  
  --
  Gleb.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 02:48:44PM +0300, Gleb Natapov wrote:
 On Wed, Jul 18, 2012 at 02:39:10PM +0300, Michael S. Tsirkin wrote:
  On Wed, Jul 18, 2012 at 02:22:19PM +0300, Michael S. Tsirkin wrote:
 So as was discussed kvm_set_irq under spinlock is bad for 
 scalability
 with multiple VCPUs.  Why do we need a spinlock simply to 
 protect
 level_asserted?  Let's use an atomic test and set/test and 
 clear and the
 problem goes away.
 
That sad reality is that for level interrupt we already scan 
all vcpus
under spinlock.
   
   Where?
   
  ioapic
 
 $ grep kvm_for_each_vcpu virt/kvm/ioapic.c
 $
 
 ?
 

Come on Michael. You can do better than grep and actually look at what
code does. The code that loops over all vcpus while delivering an irq is
in kvm_irq_delivery_to_apic(). Now grep for that.
   
   Hmm, I see, it's actually done for edge if injected from ioapic too,
   right?
   
   So set_irq does a linear scan, and for each matching CPU it calls
   kvm_irq_delivery_to_apic which is another scan?
   So it's actually N^2 worst case for a broadcast?
  
  No it isn't, I misread the code.
  
  
  Anyway, maybe not trivially but this looks fixable to me: we could drop
  the ioapic lock before calling kvm_irq_delivery_to_apic.
  
 May be, may be not. Just saying lets drop lock whenever we don't feel
 like holding one does not cut it.

One thing we do is set remote_irr if interrupt was injected.
I agree these things are tricky.

One other question:

static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
{
union kvm_ioapic_redirect_entry *pent;
int injected = -1;

pent = &ioapic->redirtbl[idx];

if (!pent->fields.mask) {
injected = ioapic_deliver(ioapic, idx);
if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
pent->fields.remote_irr = 1;
}

return injected;
}


This if (injected) looks a bit strange since ioapic_deliver returns
-1 if no matching destinations. Should be if (injected > 0)?



 Back to original point though current
 situation is that calling kvm_set_irq() under spinlock is not worse for
 scalability than calling it not under one.

Yes. Still the specific use can just use an atomic flag,
lock+bool is not needed, and we won't need to undo it later.

 --
   Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: vga passthrough // questions about pci passthrough

2012-07-18 Thread Martin Wolf

On 18.07.2012 11:26, Jan Kiszka wrote:

On 2012-07-18 07:45, Martin Wolf wrote:

Hello,

i was able to passthrough an AMD 7870 videocard to my win7 guest machine.

Would you add it to http://www.linux-kvm.org/page/VGA_device_assignment?

sure, i will prepare something



my host is ubuntu 12.04 with stock kernel.
my system contains:
dq67sw q67 mainboard
i5-2400s cpu
sapphire 7870 amd videocard
xonar d2x (problems to passthrough)

for full functionality i just needed two options

- kernel : iommu=on
- kvm module: ignore_msrs=1
(if i would not set it the guest os would crash with a bluescreen)

Can you report (= kernel log) which MSRs are unknown to KVM?
Jul 18 14:03:33 kvm-xen kernel: [  437.309931] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.522724] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.522733] kvm: 3347: cpu1 ignored 
rdmsr: 0x1c9
Jul 18 14:03:33 kvm-xen kernel: [  437.522736] kvm: 3347: cpu1 ignored 
rdmsr: 0x60
Jul 18 14:03:33 kvm-xen kernel: [  437.522752] kvm: 3347: cpu1 ignored 
rdmsr: 0x1c9
Jul 18 14:03:33 kvm-xen kernel: [  437.522755] kvm: 3347: cpu1 ignored 
rdmsr: 0x60
Jul 18 14:03:33 kvm-xen kernel: [  437.522821] kvm: 3347: cpu1 ignored 
rdmsr: 0x1c9
Jul 18 14:03:33 kvm-xen kernel: [  437.522823] kvm: 3347: cpu1 ignored 
rdmsr: 0x60
Jul 18 14:03:33 kvm-xen kernel: [  437.522834] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.522840] kvm: 3347: cpu1 ignored 
rdmsr: 0x1c9
Jul 18 14:03:33 kvm-xen kernel: [  437.522842] kvm: 3347: cpu1 ignored 
rdmsr: 0x60
Jul 18 14:03:33 kvm-xen kernel: [  437.522865] kvm: 3347: cpu1 ignored 
rdmsr: 0x1c9
Jul 18 14:03:33 kvm-xen kernel: [  437.522867] kvm: 3347: cpu1 ignored 
rdmsr: 0x60
Jul 18 14:03:33 kvm-xen kernel: [  437.522921] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.523005] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.523081] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.523175] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.523248] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.52] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.523430] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop


i hope thats the info you need, i booted it with ignore_msrs=1 since if 
i dont do that i get less output.

(do you need it without the option?)




the unigine benchmark ran flawlessly
also the benchmark included in windows gave my videocard
similar values (7.7) comparable with my native win7 (7.9)


now to my questions...
1. is it possible to reset the videocard properly to be able to
 reboot the vm?

Which versions of kernel and qemu-kvm are involved via your distro? Can
you retry with latest Linux (3.5-rcX) / latest qemu-kvm? Maybe
something got fixed meanwhile.

In general, there are many adapters that require special procedures to
perform resets. This one may fall into that category as well.

i will do a test today.

2.the xonar d2x is a very nice audio card, it would be very handy
 to be able to use it in the vm. in my opinion the card is a
 d2 with a pci-e to pci bridge.
 i tried to passthrough the card alone and with the pci-bridge
 that was shown though lspci, but i had no success.
 maybe you guys here have an idea on that topic?

Any further details about the error? Does the adapter work with a Linux
guest or provide more information that way?

Jan

i will also add info here later




--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kvm build error: undefined reference to `x86_hyper_kvm'

2012-07-18 Thread Prarit Bhargava


On 07/17/2012 10:42 PM, Ren, Yongjie wrote:
 kvm.git next branch
 commit: ebf7d2e9
 
 arch/x86/built-in.o: In function `init_hypervisor_platform':
 (.init.text+0x56b1): undefined reference to `x86_hyper_kvm'
 arch/x86/built-in.o: In function `init_hypervisor_platform':
 (.init.text+0x56bc): undefined reference to `x86_hyper_kvm'
 make: *** [vmlinux] Error 1
 

That's odd, can you send me your .config?  It compiles for me...

P.

 
 Best Regards,
  Yongjie (Jay)
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kvm build error: undefined reference to `x86_hyper_kvm'

2012-07-18 Thread Prarit Bhargava


On 07/17/2012 10:42 PM, Ren, Yongjie wrote:
 kvm.git next branch
 commit: ebf7d2e9
 
 arch/x86/built-in.o: In function `init_hypervisor_platform':
 (.init.text+0x56b1): undefined reference to `x86_hyper_kvm'
 arch/x86/built-in.o: In function `init_hypervisor_platform':
 (.init.text+0x56bc): undefined reference to `x86_hyper_kvm'
 make: *** [vmlinux] Error 1
 

Ah, never mind my previous email.  I see that Avi quickly found the issue.

http://marc.info/?l=kvm&m=134260159617733&w=2

Sorry 'bout that,

P.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: vga passthrough // questions about pci passthrough

2012-07-18 Thread Martin Wolf

soundcard logs added

On 18.07.2012 14:08, Martin Wolf wrote:

On 18.07.2012 11:26, Jan Kiszka wrote:

On 2012-07-18 07:45, Martin Wolf wrote:

Hello,

i was able to passthrough an AMD 7870 videocard to my win7 guest 
machine.

Would you add it to http://www.linux-kvm.org/page/VGA_device_assignment?

sure, i will prepare something



my host is ubuntu 12.04 with stock kernel.
my system contains:
dq67sw q67 mainboard
i5-2400s cpu
sapphire 7870 amd videocard
xonar d2x (problems to passthrough)

for full functionality i just needed two options

- kernel : iommu=on
- kvm module: ignore_msrs=1
(if i would not set it the guest os would crash with a bluescreen)

Can you report (= kernel log) which MSRs are unknown to KVM?
Jul 18 14:03:33 kvm-xen kernel: [  437.309931] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.522724] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.522733] kvm: 3347: cpu1 ignored 
rdmsr: 0x1c9
Jul 18 14:03:33 kvm-xen kernel: [  437.522736] kvm: 3347: cpu1 ignored 
rdmsr: 0x60
Jul 18 14:03:33 kvm-xen kernel: [  437.522752] kvm: 3347: cpu1 ignored 
rdmsr: 0x1c9
Jul 18 14:03:33 kvm-xen kernel: [  437.522755] kvm: 3347: cpu1 ignored 
rdmsr: 0x60
Jul 18 14:03:33 kvm-xen kernel: [  437.522821] kvm: 3347: cpu1 ignored 
rdmsr: 0x1c9
Jul 18 14:03:33 kvm-xen kernel: [  437.522823] kvm: 3347: cpu1 ignored 
rdmsr: 0x60
Jul 18 14:03:33 kvm-xen kernel: [  437.522834] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.522840] kvm: 3347: cpu1 ignored 
rdmsr: 0x1c9
Jul 18 14:03:33 kvm-xen kernel: [  437.522842] kvm: 3347: cpu1 ignored 
rdmsr: 0x60
Jul 18 14:03:33 kvm-xen kernel: [  437.522865] kvm: 3347: cpu1 ignored 
rdmsr: 0x1c9
Jul 18 14:03:33 kvm-xen kernel: [  437.522867] kvm: 3347: cpu1 ignored 
rdmsr: 0x60
Jul 18 14:03:33 kvm-xen kernel: [  437.522921] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.523005] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.523081] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.523175] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.523248] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.52] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
Jul 18 14:03:33 kvm-xen kernel: [  437.523430] kvm: 3347: cpu1 
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop


i hope thats the info you need, i booted it with ignore_msrs=1 since 
if i dont do that i get less output.

(do you need it without the option?)




the unigine benchmark ran flawlessly
also the benchmark included in windows gave my videocard
similar values (7.7) comparable with my native win7 (7.9)


now to my questions...
1. is it possible to reset the videocard properly to be able to
 reboot the vm?

Which versions of kernel and qemu-kvm are involved via your distro? Can
you retry with latest Linux (3.5-rcX) / latest qemu-kvm? Maybe
something got fixed meanwhile.

In general, there are many adapters that require special procedures to
perform resets. This one may fall into that category as well.

i will do a test today.

2.the xonar d2x is a very nice audio card, it would be very handy
 to be able to use it in the vm. in my opinion the card is a
 d2 with a pci-e to pci bridge.
 i tried to passthrough the card alone and with the pci-bridge
 that was shown though lspci, but i had no success.
 maybe you guys here have an idea on that topic?

Any further details about the error? Does the adapter work with a Linux
guest or provide more information that way?

Jan


02:00.0 PCI bridge: PLX Technology, Inc. PEX8112 x1 Lane PCI 
Express-to-PCI Bridge (rev aa) (prog-if 00 [Normal decode])
Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- 
ParErr- Stepping- SERR- FastB2B- DisINTx-
Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- 
TAbort- MAbort- SERR- PERR- INTx-

Latency: 0, Cache Line Size: 64 bytes
Bus: primary=02, secondary=03, subordinate=03, sec-latency=32
I/O behind bridge: d000-dfff
Memory behind bridge: fff0-000f
Prefetchable memory behind bridge: fff0-000f
Secondary status: 66MHz+ FastB2B- ParErr- DEVSEL=medium 
TAbort- TAbort- MAbort- SERR- PERR-

BridgeCtl: Parity- SERR- NoISA- VGA- MAbort- Reset- FastB2B-
PriDiscTmr- SecDiscTmr- DiscTmrStat- DiscTmrSERREn-
Capabilities: access denied
Kernel driver in use: pci-stub
Kernel modules: shpchp

03:04.0 Multimedia audio controller: C-Media Electronics Inc CMI8788 
[Oxygen HD Audio]

Subsystem: ASUSTeK 

Re: [RFC-v3 0/4] tcm_vhost+cmwq fabric driver code for-3.6

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 12:59:28AM +, Nicholas A. Bellinger wrote:
 From: Nicholas Bellinger n...@linux-iscsi.org
 
 Hi folks,
 
 The following is the RFC-v3 series of tcm_vhost target fabric driver code
 currently in-flight for-3.6 mainline code.
 
 With the merge window opening soon, the tcm_vhost code has started seeing
 time in linux-next.  The v2 -> v3 changelog from the last week is currently
 looking like:
 
  *) Unlock on error in tcm_vhost_drop_nexus() (DanC)
  *) Fix strlen() doesn't count the terminator (DanC)
  *) Call kfree() on an error path (DanC)
  *) Convert tcm_vhost_write_pending to use target_execute_cmd (hch + nab)
  *) Fix another strlen() off by one in tcm_vhost_make_tport (DanC)
  *) Add option under drivers/staging/Kconfig, and move to drivers/vhost/tcm/
 as requested by MST (nab)

I actually only wanted to have a separate Kconfig (in a separate
directory or Kconfig.tcm), so you do not need to move code back to move
driver out of staging.  But if you prefer this, I'm fine with it too for
now.

 Thanks to Dan Carpenter for his smatch fixes this past round.
 
 Also as requested by MST, the code has been moved to a separate tcm/ 
 subdirectory
 under drivers/vhost/ so that it can be included under staging's config options
 until we can settle on the necessary userspace bits for QEMU and kvm-tool.
 
 The updated series will be going out shortly to target-pending/for-next-merge.
 
 Please have another look and let us know if you have any concerns.
 
 Thanks!
 
 Nicholas Bellinger (2):
   vhost: Add vhost_scsi specific defines
   tcm_vhost: Initial merge for vhost level target fabric driver
 
 Stefan Hajnoczi (2):
   vhost: Separate vhost-net features from vhost features
   vhost: make vhost work queue visible
 
  drivers/staging/Kconfig   |2 +
  drivers/vhost/Makefile|2 +
  drivers/vhost/net.c   |4 +-
  drivers/vhost/tcm/Kconfig |6 +
  drivers/vhost/tcm/Makefile|1 +
  drivers/vhost/tcm/tcm_vhost.c | 1611 
 +
  drivers/vhost/tcm/tcm_vhost.h |   74 ++
  drivers/vhost/test.c  |4 +-
  drivers/vhost/vhost.c |5 +-
  drivers/vhost/vhost.h |6 +-
  include/linux/vhost.h |9 +
  11 files changed, 1716 insertions(+), 8 deletions(-)
  create mode 100644 drivers/vhost/tcm/Kconfig
  create mode 100644 drivers/vhost/tcm/Makefile
  create mode 100644 drivers/vhost/tcm/tcm_vhost.c
  create mode 100644 drivers/vhost/tcm/tcm_vhost.h
 
 -- 
 1.7.2.5
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v3 3/4] vhost: Add vhost_scsi specific defines

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 12:59:31AM +, Nicholas A. Bellinger wrote:
 From: Nicholas Bellinger n...@risingtidesystems.com
 
 This patch adds the initial vhost_scsi_ioctl() callers for 
 VHOST_SCSI_SET_ENDPOINT
 and VHOST_SCSI_CLEAR_ENDPOINT respectively, and also adds struct 
 vhost_vring_target
 that is used by tcm_vhost code when locating target ports during qemu setup.
 
 Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
 Cc: Zhi Yong Wu wu...@cn.ibm.com
 Cc: Michael S. Tsirkin m...@redhat.com
 Cc: Paolo Bonzini pbonz...@redhat.com,
 Signed-off-by: Nicholas A. Bellinger n...@risingtidesystems.com
 ---
  include/linux/vhost.h |9 +
  1 files changed, 9 insertions(+), 0 deletions(-)
 
 diff --git a/include/linux/vhost.h b/include/linux/vhost.h
 index e847f1e..33b313b 100644
 --- a/include/linux/vhost.h
 +++ b/include/linux/vhost.h
 @@ -24,7 +24,11 @@ struct vhost_vring_state {
  struct vhost_vring_file {
   unsigned int index;
   int fd; /* Pass -1 to unbind from file. */
 +};
  
 +struct vhost_vring_target {

Can this be renamed vhost_scsi_target?

 + unsigned char vhost_wwpn[224];

224? I am guessing ISCSI_NAME_LEN from include/scsi/iscsi_proto.h?
Unfortunately we can't include iscsi_proto.h here as it
is not exported to users. But let's add a comment for now.

 + unsigned short vhost_tpgt;
  };
  
  struct vhost_vring_addr {
 @@ -121,6 +125,11 @@ struct vhost_memory {
   * device.  This can be used to stop the ring (e.g. for migration). */
  #define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct 
 vhost_vring_file)
  
 +/* VHOST_SCSI specific defines */
 +
 +#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct 
 vhost_vring_target)
 +#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct 
 vhost_vring_target)
 +
  /* Feature bits */
  /* Log all write descriptors. Can be changed while device is active. */

Can these go into appropriate ifdef CONFIG_TCP_VHOST please?

  #define VHOST_F_LOG_ALL 26
 -- 
 1.7.2.5
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC V5 0/3] kvm: Improving directed yield in PLE handler

2012-07-18 Thread Raghavendra K T

Currently Pause Loop Exit (PLE) handler is doing directed yield to a
random vcpu on pl-exit. We already have filtering while choosing
the candidate to yield_to. This change adds more checks while choosing
a candidate to yield_to.

On a large vcpu guests, there is a high probability of
yielding to the same vcpu who had recently done a pause-loop exit. 
Such a yield can lead to the vcpu spinning again.

The patchset keeps track of the pause loop exit and gives chance to a
vcpu which has:

 (a) Not done pause loop exit at all (probably he is preempted lock-holder)

 (b) vcpu skipped in last iteration because it did pause loop exit, and
 probably has become eligible now (next eligible lock holder)

This concept also helps in cpu relax interception cases which use same handler.

Changes since V4:
 - Naming Change (Avi):
  struct ple => struct spin_loop
  cpu_relax_intercepted => in_spin_loop
  vcpu_check_and_update_eligible => vcpu_eligible_for_directed_yield
 - mark vcpu in spinloop as not eligible to avoid influence of previous exit

Changes since V3:
 - arch specific fix/changes (Christian)

Changes since v2:
 - Move ple structure to common code (Avi)
 - rename pause_loop_exited to cpu_relax_intercepted (Avi)
 - add config HAVE_KVM_CPU_RELAX_INTERCEPT (Avi)
 - Drop superfluous curly braces (Ingo)

Changes since v1:
 - Add more documentation for structure and algorithm and Rename
   plo == ple (Rik).
 - change dy_eligible initial value to false. (otherwise very first directed
yield will not be skipped. (Nikunj)
 - fixup signoff/from issue

Future enhancements:
  (1) Currently we have a boolean to decide on eligibility of vcpu. It
would be nice if I get feedback on guest (32 vcpu) whether we can
improve better with integer counter. (with counter = say f(log n )).
  
  (2) We have not considered system load during iteration of vcpu. With
   that information we can limit the scan and also decide whether schedule()
   is better. [ I am able to use #kicked vcpus to decide on this But may
   be there are better ideas like information from global loadavg.]

  (3) We can exploit this further with PV patches since it also knows about
   next eligible lock-holder.

Summary: There is a very good improvement for kvm based guest on PLE machine.
The V5 has huge improvement for kbench.

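+-----------+-----------+-----------+------------+-----------+
   base_rik    stdev       patched     stdev       %improve
+-----------+-----------+-----------+------------+-----------+
  kernbench (time in sec lesser is better)
+-----------+-----------+-----------+------------+-----------+
 1x    49.2300     1.0171    22.6842     0.3073    117.0233 %
 2x    91.9358     1.7768    53.9608     1.0154     70.37516 %
+-----------+-----------+-----------+------------+-----------+

+-----------+-----------+-----------+------------+-----------+
  ebizzy (records/sec more is better)
+-----------+-----------+-----------+------------+-----------+
 1x  1129.2500    28.6793  2125.6250    32.8239     88.23334 %
 2x  1892.3750    75.1112  2377.1250   181.6822     25.61596 %
+-----------+-----------+-----------+------------+-----------+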

Note: The patches are tested on x86.

 Links
  V4: https://lkml.org/lkml/2012/7/16/80
  V3: https://lkml.org/lkml/2012/7/12/437
  V2: https://lkml.org/lkml/2012/7/10/392
  V1: https://lkml.org/lkml/2012/7/9/32

 Raghavendra K T (3):
   config: Add config to support ple or cpu relax optimzation 
   kvm : Note down when cpu relax intercepted or pause loop exited 
   kvm : Choose a better candidate for directed yield 
---
 arch/s390/kvm/Kconfig|1 +
 arch/x86/kvm/Kconfig |1 +
 include/linux/kvm_host.h |   39 +++
 virt/kvm/Kconfig |3 +++
 virt/kvm/kvm_main.c  |   41 +
 5 files changed, 85 insertions(+), 0 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC V5 1/3] kvm/config: Add config to support ple or cpu relax optimzation

2012-07-18 Thread Raghavendra K T
From: Raghavendra K T raghavendra...@linux.vnet.ibm.com

Suggested-by: Avi Kivity a...@redhat.com
Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com
---
 arch/s390/kvm/Kconfig |1 +
 arch/x86/kvm/Kconfig  |1 +
 virt/kvm/Kconfig  |3 +++
 3 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 78eb984..a6e2677 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
depends on HAVE_KVM && EXPERIMENTAL
select PREEMPT_NOTIFIERS
select ANON_INODES
+   select HAVE_KVM_CPU_RELAX_INTERCEPT
---help---
  Support hosting paravirtualized guest machines using the SIE
  virtualization capability on the mainframe. This should work
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a28f338..45c044f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -37,6 +37,7 @@ config KVM
select TASK_DELAY_ACCT
select PERF_EVENTS
select HAVE_KVM_MSI
+   select HAVE_KVM_CPU_RELAX_INTERCEPT
---help---
  Support hosting fully virtualized guest machines using hardware
  virtualization extensions.  You will need a fairly recent
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 28694f4..d01b24b 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -21,3 +21,6 @@ config KVM_ASYNC_PF
 
 config HAVE_KVM_MSI
bool
+
+config HAVE_KVM_CPU_RELAX_INTERCEPT
+   bool

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC V5 3/3] kvm: Choose better candidate for directed yield

2012-07-18 Thread Raghavendra K T
From: Raghavendra K T raghavendra...@linux.vnet.ibm.com

Currently, on guests with a large number of vcpus, there is a high
probability of yielding to the same vcpu that recently did a pause-loop
exit or had its cpu relax intercepted. Such a yield can lead to the vcpu
spinning again and hence degrade performance.

The patchset keeps track of the pause loop exit/cpu relax interception
and gives chance to a vcpu which:
 (a) Has not done pause loop exit or cpu relax intercepted at all
 (probably he is preempted lock-holder)
 (b) Was skipped in last iteration because it did pause loop exit or
 cpu relax intercepted, and probably has become eligible now
 (next eligible lock holder)

Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com
---
V2 was:
Reviewed-by: Rik van Riel r...@redhat.com

 include/linux/kvm_host.h |5 +
 virt/kvm/kvm_main.c  |   36 
 2 files changed, 41 insertions(+), 0 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 34ce296..952427d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -923,6 +923,11 @@ static inline void kvm_vcpu_set_dy_eligible(struct 
kvm_vcpu *vcpu, bool val)
 {
 }
 
+static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+   return true;
+}
+
 #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
 #endif
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3d6ffc8..bf9fb97 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1571,6 +1571,39 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
 
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+/*
+ * Helper that checks whether a VCPU is eligible for directed yield.
+ * Most eligible candidate to yield is decided by following heuristics:
+ *
+ *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
+ *  (preempted lock holder), indicated by @in_spin_loop.
+ *  Set at the beginning and cleared at the end of interception/PLE handler.
+ *
+ *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
+ *  chance last time (mostly it has become eligible now since we have probably
+ *  yielded to lockholder in last iteration. This is done by toggling
+ *  @dy_eligible each time a VCPU checked for eligibility.)
+ *
+ *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
+ *  to preempted lock-holder could result in wrong VCPU selection and CPU
+ *  burning. Giving priority for a potential lock-holder increases lock
+ *  progress.
+ */
+bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+   bool eligible;
+
+   eligible = !vcpu->spin_loop.in_spin_loop ||
+   (vcpu->spin_loop.in_spin_loop &&
+vcpu->spin_loop.dy_eligible);
+
+   if (vcpu->spin_loop.in_spin_loop)
+   vcpu->spin_loop.dy_eligible = !vcpu->spin_loop.dy_eligible;
+
+   return eligible;
+}
+#endif
 void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 {
struct kvm *kvm = me-kvm;
@@ -1599,6 +1632,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
continue;
if (waitqueue_active(vcpu-wq))
continue;
+   if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+   continue;
if (kvm_vcpu_yield_to(vcpu)) {
kvm-last_boosted_vcpu = i;
yielded = 1;
@@ -1607,6 +1642,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
}
}
kvm_vcpu_set_in_spin_loop(me, false);
+   kvm_vcpu_set_dy_eligible(me, false);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC V5 2/3] kvm: Note down when cpu relax intercepted or pause loop exited

2012-07-18 Thread Raghavendra K T
From: Raghavendra K T raghavendra...@linux.vnet.ibm.com

Noting which vcpus did a pause loop exit or had cpu relax intercepted
helps in filtering the right candidate to yield to. Wrong selection of a
vcpu, i.e., a vcpu that just did a pl-exit or had cpu relax intercepted,
may contribute to performance degradation.

Signed-off-by: Raghavendra K T raghavendra...@linux.vnet.ibm.com
---
V2 was:
Reviewed-by: Rik van Riel r...@redhat.com

 include/linux/kvm_host.h |   34 ++
 virt/kvm/kvm_main.c  |5 +
 2 files changed, 39 insertions(+), 0 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c446435..34ce296 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -183,6 +183,18 @@ struct kvm_vcpu {
} async_pf;
 #endif
 
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+   /*
+* Cpu relax intercept or pause loop exit optimization
+* in_spin_loop: set when a vcpu does a pause loop exit
+*  or cpu relax intercepted.
+* dy_eligible: indicates whether vcpu is eligible for directed yield.
+*/
+   struct {
+   bool in_spin_loop;
+   bool dy_eligible;
+   } spin_loop;
+#endif
struct kvm_vcpu_arch arch;
 };
 
@@ -890,5 +902,27 @@ static inline bool kvm_check_request(int req, struct 
kvm_vcpu *vcpu)
}
 }
 
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+
+static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+{
+   vcpu->spin_loop.in_spin_loop = val;
+}
+static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+{
+   vcpu->spin_loop.dy_eligible = val;
+}
+
+#else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
+
+static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+{
+}
+
+static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+{
+}
+
+#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
 #endif
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7e14068..3d6ffc8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -236,6 +236,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, 
unsigned id)
}
vcpu-run = page_address(page);
 
+   kvm_vcpu_set_in_spin_loop(vcpu, false);
+   kvm_vcpu_set_dy_eligible(vcpu, false);
+
r = kvm_arch_vcpu_init(vcpu);
if (r  0)
goto fail_free_run;
@@ -1577,6 +1580,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
int pass;
int i;
 
+   kvm_vcpu_set_in_spin_loop(me, true);
/*
 * We boost the priority of a VCPU that is runnable but not
 * currently running, because it got preempted by something
@@ -1602,6 +1606,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
}
}
}
+   kvm_vcpu_set_in_spin_loop(me, false);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v2 0/4] tcm_vhost+cmwq fabric driver code for-3.6

2012-07-18 Thread Anthony Liguori

On 07/17/2012 04:50 PM, Nicholas A. Bellinger wrote:

On Tue, 2012-07-17 at 13:55 -0500, Anthony Liguori wrote:

On 07/17/2012 10:05 AM, Michael S. Tsirkin wrote:

On Wed, Jul 11, 2012 at 09:15:00PM +, Nicholas A. Bellinger wrote:


SNIP



It still seems not 100% clear whether this driver will have major
userspace using it. And if not, it would be very hard to support a driver
when recent userspace does not use it in the end.


I don't think this is a good reason to exclude something from the kernel.
However, there are good reasons why this doesn't make sense for something like
QEMU--specifically because we have a large number of features in our block layer
that tcm_vhost would bypass.



I can definitely appreciate your concern here as the QEMU maintainer.


But perhaps it makes sense for something like native kvm tool.  And if it did go
into the kernel, we would certainly support it in QEMU.



...


But I do think the kernel should carefully consider whether it wants to support
an interface like this.  This an extremely complicated ABI with a lot of subtle
details around state and compatibility.

Are you absolutely confident that you can support a userspace application that
expects to get exactly the same response from all possible commands in 20 kernel
versions from now?  Virtualization requires absolutely precise compatibility in
terms of bugs and features.  This is probably not something the TCM stack has
had to consider yet.



We most certainly have thought about long term userspace compatibility
with TCM.  Our userspace code (that's now available in all major
distros) is completely forward-compatible with new fabric modules such
as tcm_vhost.  No update required.


I'm not sure we're talking about the same thing when we say compatibility.

I'm not talking about the API.  I'm talking about the behavior of the commands 
that tcm_vhost supports.


If you add support for a new command, you need to provide userspace a way to 
disable this command.  If you change what gets reported for VPD, you need to 
provide userspace a way to make VPD look like what it did in a previous version.


Basically, you need to be able to make a TCM device behave 100% the same as it 
did in an older version of the kernel.


This is unique to virtualization due to live migration.  If you migrate from a 
3.6 kernel to a 3.8 kernel, you need to make sure that the 3.8 kernel's TCM 
device behaves exactly like the 3.6 kernel because the guest that is interacting 
with it does not realize that live migration happened.


Yes, you can add knobs via configfs to control this behavior, but I think the 
question is, what's the plan for this?


BTW, I think this is a good thing to cover in Documentation/vhost/tcm_vhost.txt. 
 I think that's probably the only change that's needed here.


Regards,

Anthony Liguori



Also, by virtue of the fact that we are using configfs + rtslib (python
object library) on top, it's very easy to keep any type of compatibility
logic around in python code.  With rtslib, we are able to hide configfs
ABI changes from higher level apps.

So far we've had a track record of 100% userspace ABI compatibility in
mainline since .38, and I don't intend to merge a patch that breaks this
any time soon.  But if that ever happens, apps using rtslib are not
going to be effected.


I think a good idea for 3.6 would be to make it depend on CONFIG_STAGING.
Then we don't commit to an ABI.


I think this is a good idea.  Even if it goes in, a really clear policy would be
needed wrt the userspace ABI.

While tcm_vhost is probably more useful than vhost_blk, it's a much more complex
ABI to maintain.



As far as I am concerned, the kernel API (eg: configfs directory layout)
as it is now in sys/kernel/config/target/vhost/ is not going to change.
It's based on the same drivers/target/target_core_fabric_configfs.c
generic layout that we've had since .38.

The basic functional fabric layout in configfs is identical (with fabric
dependent WWPN naming of course) regardless of fabric driver, and by
virtue of being generic it means we can add things like fabric dependent
attributes + parameters in the future for existing fabrics without
breaking userspace.

So while I agree the ABI is more complex than vhost-blk, the logic in
target_core_fabric_configfs.c is a basic ABI fabric definition that we
are enforcing across all fabric modules in mainline for long term
compatibility.

--nab



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v2 0/4] tcm_vhost+cmwq fabric driver code for-3.6

2012-07-18 Thread Paolo Bonzini
Il 18/07/2012 15:42, Anthony Liguori ha scritto:
 If you add support for a new command, you need to provide userspace a
 way to disable this command.  If you change what gets reported for VPD,
 you need to provide userspace a way to make VPD look like what it did in
 a previous version.

The QEMU target is not enforcing this to this level.  We didn't for
CD-ROM ATAPI, and we're not doing it for SCSI.

It may indeed be useful for changes to VPD pages or major features.
However, so far we've never introduced any feature that deserved it.
This is also because OSes typically don't care: they use a small subset
of the features and all the remaining decorations are only needed to
be pedantically compliant to the spec.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: vga passthrough // questions about pci passthrough

2012-07-18 Thread Alex Williamson
On Wed, 2012-07-18 at 14:37 +0200, Martin Wolf wrote:
 soundcard logs added
 
 On 18.07.2012 14:08, Martin Wolf wrote:
  On 18.07.2012 11:26, Jan Kiszka wrote:
  On 2012-07-18 07:45, Martin Wolf wrote:
  Hello,
 
  i was able to passthrough an AMD 7870 videocard to my win7 guest 
  machine.
  Would you add it to http://www.linux-kvm.org/page/VGA_device_assignment?
  sure, i will prepare something
 
  my host is ubuntu 12.04 with stock kernel.
  my system contains:
  dq67sw q67 mainboard
  i5-2400s cpu
  sapphire 7870 amd videocard
  xonar d2x (problems to passthrough)
 
  for full functionality i just needed two options
 
  - kernel : iommu=on
  - kvm module: ignore_msrs=1
  (if i would not set it the guest os would crash with a bluescreen)
  Can you report (= kernel log) which MSRs are unknown to KVM?
  Jul 18 14:03:33 kvm-xen kernel: [  437.309931] kvm: 3347: cpu1 
  kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
  Jul 18 14:03:33 kvm-xen kernel: [  437.522724] kvm: 3347: cpu1 
  kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
  Jul 18 14:03:33 kvm-xen kernel: [  437.522733] kvm: 3347: cpu1 ignored 
  rdmsr: 0x1c9
  Jul 18 14:03:33 kvm-xen kernel: [  437.522736] kvm: 3347: cpu1 ignored 
  rdmsr: 0x60
  Jul 18 14:03:33 kvm-xen kernel: [  437.522752] kvm: 3347: cpu1 ignored 
  rdmsr: 0x1c9
  Jul 18 14:03:33 kvm-xen kernel: [  437.522755] kvm: 3347: cpu1 ignored 
  rdmsr: 0x60
  Jul 18 14:03:33 kvm-xen kernel: [  437.522821] kvm: 3347: cpu1 ignored 
  rdmsr: 0x1c9
  Jul 18 14:03:33 kvm-xen kernel: [  437.522823] kvm: 3347: cpu1 ignored 
  rdmsr: 0x60
  Jul 18 14:03:33 kvm-xen kernel: [  437.522834] kvm: 3347: cpu1 
  kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
  Jul 18 14:03:33 kvm-xen kernel: [  437.522840] kvm: 3347: cpu1 ignored 
  rdmsr: 0x1c9
  Jul 18 14:03:33 kvm-xen kernel: [  437.522842] kvm: 3347: cpu1 ignored 
  rdmsr: 0x60
  Jul 18 14:03:33 kvm-xen kernel: [  437.522865] kvm: 3347: cpu1 ignored 
  rdmsr: 0x1c9
  Jul 18 14:03:33 kvm-xen kernel: [  437.522867] kvm: 3347: cpu1 ignored 
  rdmsr: 0x60
  Jul 18 14:03:33 kvm-xen kernel: [  437.522921] kvm: 3347: cpu1 
  kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
  Jul 18 14:03:33 kvm-xen kernel: [  437.523005] kvm: 3347: cpu1 
  kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
  Jul 18 14:03:33 kvm-xen kernel: [  437.523081] kvm: 3347: cpu1 
  kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
  Jul 18 14:03:33 kvm-xen kernel: [  437.523175] kvm: 3347: cpu1 
  kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
  Jul 18 14:03:33 kvm-xen kernel: [  437.523248] kvm: 3347: cpu1 
  kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
  Jul 18 14:03:33 kvm-xen kernel: [  437.52] kvm: 3347: cpu1 
  kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
  Jul 18 14:03:33 kvm-xen kernel: [  437.523430] kvm: 3347: cpu1 
  kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x1, nop
 
  i hope thats the info you need, i booted it with ignore_msrs=1 since 
  if i dont do that i get less output.
  (do you need it without the option?)
 
 
  the unigine benchmark ran flawlessly
  also the benchmark included in windows gave my videocard
  similar values (7.7) comparable with my native win7 (7.9)
 
 
  now to my questions...
  1. is it possible to reset the videocard properly to be able to
   reboot the vm?
  Which versions of kernel and qemu-kvm are involved via your distro? Can
  you retry with latest Linux (3.5-rcX) / latest qemu-kvm? Maybe
  something got fixed meanwhile.
 
  In general, there are many adapters that require special procedures to
  perform resets. This one may fall into that category as well.
  i will do a test today.
  2.the xonar d2x is a very nice audio card, it would be very handy
   to be able to use it in the vm. in my opinion the card is a
   d2 with a pci-e to pci bridge.
   i tried to passthrough the card alone and with the pci-bridge
   that was shown through lspci, but i had no success.
   maybe you guys here have an idea on that topic?
  Any further details about the error? Does the adapter work with a Linux
  guest or provide more information that way?
 
  Jan
 
 02:00.0 PCI bridge: PLX Technology, Inc. PEX8112 x1 Lane PCI 
 Express-to-PCI Bridge (rev aa) (prog-if 00 [Normal decode])
  Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- 
 ParErr- Stepping- SERR- FastB2B- DisINTx-
  Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- 
 TAbort- MAbort- SERR- PERR- INTx-
  Latency: 0, Cache Line Size: 64 bytes
  Bus: primary=02, secondary=03, subordinate=03, sec-latency=32
  I/O behind bridge: d000-dfff
  Memory behind bridge: fff0-000f
  Prefetchable memory behind bridge: fff0-000f
  Secondary status: 66MHz+ FastB2B- ParErr- DEVSEL=medium 
  TAbort- TAbort- MAbort- SERR- PERR-
  BridgeCtl: Parity- SERR- NoISA- VGA- MAbort- Reset- FastB2B-
  PriDiscTmr- SecDiscTmr- DiscTmrStat- 

Re: [PATCH RESEND 5/5] vhost-blk: Add vhost-blk support

2012-07-18 Thread Jeff Moyer
Asias He as...@redhat.com writes:

 On 07/18/2012 03:10 AM, Jeff Moyer wrote:
 Asias He as...@redhat.com writes:

 vhost-blk is an in-kernel virtio-blk device accelerator.

 This patch is based on Liu Yuan's implementation with various
 improvements and bug fixes. Notably, this patch makes guest notify and
 host completion processing in parallel which gives about 60% performance
 improvement compared to Liu Yuan's implementation.

 So, first off, some basic questions.  Is it correct to assume that you
 tested this with buffered I/O (files opened *without* O_DIRECT)?
  I'm pretty sure that if you used O_DIRECT, you'd run into problems (which
 are solved by the patch set posted by Shaggy, based on Zach Brown's work
 of many moons ago).  Note that, with buffered I/O, the submission path
 is NOT asynchronous.  So, any speedups you've reported are extremely
 suspect.  ;-)

 I always used O_DIRECT to test this patchset. And I mostly used raw
 block device as guest image. Is this the reason why I did not hit the
 problem you mentioned. Btw, I do have run this patchset on image based
 file. I still do not see problems like IO hangs.

Hmm, so do the iovec's passed in point to buffers in userspace?  I
thought they were kernel buffers, which would have blown up in
get_user_pages_fast.

Cheers,
Jeff
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC V5 3/3] kvm: Choose better candidate for directed yield

2012-07-18 Thread Raghavendra K T

On 07/18/2012 07:08 PM, Raghavendra K T wrote:

From: Raghavendra K Traghavendra...@linux.vnet.ibm.com
+bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+   bool eligible;
+
+   eligible = !vcpu->spin_loop.in_spin_loop ||
+   (vcpu->spin_loop.in_spin_loop &&
+vcpu->spin_loop.dy_eligible);
+
+   if (vcpu->spin_loop.in_spin_loop)
+   vcpu->spin_loop.dy_eligible = !vcpu->spin_loop.dy_eligible;
+
+   return eligible;
+}


I should have added a comment like:
Since the algorithm is based on heuristics, accessing another vcpu's data
without locking does no harm. It may result in trying to yield to the same
VCPU, failing, and continuing with the next one, and so on.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RESEND 5/5] vhost-blk: Add vhost-blk support

2012-07-18 Thread Asias He

On 07/18/2012 10:31 PM, Jeff Moyer wrote:

Asias He as...@redhat.com writes:


On 07/18/2012 03:10 AM, Jeff Moyer wrote:

Asias He as...@redhat.com writes:


vhost-blk is an in-kernel virtio-blk device accelerator.

This patch is based on Liu Yuan's implementation with various
improvements and bug fixes. Notably, this patch makes guest notify and
host completion processing in parallel which gives about 60% performance
improvement compared to Liu Yuan's implementation.


So, first off, some basic questions.  Is it correct to assume that you
tested this with buffered I/O (files opened *without* O_DIRECT)?
  I'm pretty sure that if you used O_DIRECT, you'd run into problems (which
are solved by the patch set posted by Shaggy, based on Zach Brown's work
of many moons ago).  Note that, with buffered I/O, the submission path
is NOT asynchronous.  So, any speedups you've reported are extremely
suspect.  ;-)


I always used O_DIRECT to test this patchset. And I mostly used raw
block device as guest image. Is this the reason why I did not hit the
problem you mentioned. Btw, I do have run this patchset on image based
file. I still do not see problems like IO hangs.


Hmm, so do the iovec's passed in point to buffers in userspace?  I
thought they were kernel buffers, which would have blown up in
get_user_pages_fast.


Yes. The iovec's passed in point to userspace buffers. ;-)

--
Asias


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Alex Williamson
On Wed, 2012-07-18 at 15:07 +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 18, 2012 at 02:48:44PM +0300, Gleb Natapov wrote:
  On Wed, Jul 18, 2012 at 02:39:10PM +0300, Michael S. Tsirkin wrote:
   On Wed, Jul 18, 2012 at 02:22:19PM +0300, Michael S. Tsirkin wrote:
  So as was discussed kvm_set_irq under spinlock is bad for 
  scalability
  with multiple VCPUs.  Why do we need a spinlock simply to 
  protect
  level_asserted?  Let's use an atomic test and set/test and 
  clear and the
  problem goes away.
  
 That sad reality is that for level interrupt we already scan 
 all vcpus
 under spinlock.

Where?

   ioapic
  
  $ grep kvm_for_each_vcpu virt/kvm/ioapic.c
  $
  
  ?
  
 
 Come on Michael. You can do better than grep and actually look at what
 code does. The code that loops over all vcpus while delivering an irq 
 is
 in kvm_irq_delivery_to_apic(). Now grep for that.

Hmm, I see, it's actually done for edge if injected from ioapic too,
right?

So set_irq does a linear scan, and for each matching CPU it calls
kvm_irq_delivery_to_apic which is another scan?
So it's actually N^2 worst case for a broadcast?
   
   No it isn't, I misread the code.
   
   
   Anyway, maybe not trivially but this looks fixable to me: we could drop
   the ioapic lock before calling kvm_irq_delivery_to_apic.
   
  May be, may be not. Just saying lets drop lock whenever we don't feel
  like holding one does not cut it.
 
 One thing we do is set remote_irr if interrupt was injected.
 I agree these things are tricky.
 
 One other question:
 
 static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
 {
 union kvm_ioapic_redirect_entry *pent;
 int injected = -1;
 
 pent = &ioapic->redirtbl[idx];
 
 if (!pent->fields.mask) {
 injected = ioapic_deliver(ioapic, idx);
 if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
 pent->fields.remote_irr = 1;
 }
 
 return injected;
 }
 
 
 This if (injected) looks a bit strange since ioapic_deliver returns
 -1 if no matching destinations. Should be if (injected > 0)?
 
 
 
  Back to original point though current
  situation is that calling kvm_set_irq() under spinlock is not worse for
  scalability than calling it not under one.
 
 Yes. Still the specific use can just use an atomic flag,
 lock+bool is not needed, and we won't need to undo it later.


Actually, no, replacing it with an atomic is racy.

CPU0 (inject)   CPU1 (EOI)
atomic_cmpxchg(asserted, 0, 1)
atomic_cmpxchg(asserted, 1, 0)
kvm_set_irq(0)
kvm_set_irq(1)
eventfd_signal

The interrupt is now stuck on until another interrupt is injected.



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 00/27] virtio: virtio-blk data plane

2012-07-18 Thread Stefan Hajnoczi
This series implements a dedicated thread for virtio-blk processing using Linux
AIO for raw image files only.  It is based on qemu-kvm.git a0bc8c3 and somewhat
old but I wanted to share it on the list since it has been mentioned on mailing
lists and IRC recently.

These patches can be used for benchmarking and discussion about how to improve
block performance.  Paolo Bonzini has also worked in this area and might want
to share his patches.

The basic approach is:
1. Each virtio-blk device has a thread dedicated to handling ioeventfd
   signalling when the guest kicks the virtqueue.
2. Requests are processed without going through the QEMU block layer using
   Linux AIO directly.
3. Completion interrupts are injected via ioctl from the dedicated thread.

The series also contains request merging as a bdrv_aio_multiwrite() equivalent.
This was only to get a comparison against the QEMU block layer and I would drop
it for other types of analysis.

The effect of this series is that O_DIRECT Linux AIO on raw files can bypass
the QEMU global mutex and block layer.  This means higher performance.

A cleaned up version of this approach could be added to QEMU as a raw O_DIRECT
Linux AIO fast path.  Image file formats, protocols, and other block layer
features are not supported by virtio-blk-data-plane.

Git repo:
http://repo.or.cz/w/qemu-kvm/stefanha.git/shortlog/refs/heads/virtio-blk-data-plane

Stefan Hajnoczi (27):
  virtio-blk: Remove virtqueue request handling code
  virtio-blk: Set up host notifier for data plane
  virtio-blk: Data plane thread event loop
  virtio-blk: Map vring
  virtio-blk: Do cheapest possible memory mapping
  virtio-blk: Take PCI memory range into account
  virtio-blk: Put dataplane code into its own directory
  virtio-blk: Read requests from the vring
  virtio-blk: Add Linux AIO queue
  virtio-blk: Stop data plane thread cleanly
  virtio-blk: Indirect vring and flush support
  virtio-blk: Add workaround for BUG_ON() dependency in virtio_ring.h
  virtio-blk: Increase max requests for indirect vring
  virtio-blk: Use pthreads instead of qemu-thread
  notifier: Add a function to set the notifier
  virtio-blk: Kick data plane thread using event notifier set
  virtio-blk: Use guest notifier to raise interrupts
  virtio-blk: Call ioctl() directly instead of irqfd
  virtio-blk: Disable guest-host notifies while processing vring
  virtio-blk: Add ioscheduler to detect mergable requests
  virtio-blk: Add basic request merging
  virtio-blk: Fix request merging
  virtio-blk: Stub out SCSI commands
  virtio-blk: fix incorrect length
  msix: fix irqchip breakage in msix_try_notify_from_thread()
  msix: use upstream kvm_irqchip_set_irq()
  virtio-blk: add EVENT_IDX support to dataplane

 event_notifier.c  |7 +
 event_notifier.h  |1 +
 hw/dataplane/event-poll.h |  116 +++
 hw/dataplane/ioq.h|  128 
 hw/dataplane/iosched.h|   97 ++
 hw/dataplane/vring.h  |  334 
 hw/msix.c |   15 +
 hw/msix.h |1 +
 hw/virtio-blk.c   |  753 +
 hw/virtio-pci.c   |8 +
 hw/virtio.c   |9 +
 hw/virtio.h   |3 +
 12 files changed, 1074 insertions(+), 398 deletions(-)
 create mode 100644 hw/dataplane/event-poll.h
 create mode 100644 hw/dataplane/ioq.h
 create mode 100644 hw/dataplane/iosched.h
 create mode 100644 hw/dataplane/vring.h

-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 02/27] virtio-blk: Set up host notifier for data plane

2012-07-18 Thread Stefan Hajnoczi
Set up the virtqueue notify ioeventfd that the data plane will monitor.

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
 hw/virtio-blk.c |   37 +
 1 file changed, 37 insertions(+)

diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index a627427..0389294 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -26,6 +26,8 @@ typedef struct VirtIOBlock
 char *serial;
 unsigned short sector_mask;
 DeviceState *qdev;
+
+bool data_plane_started;
 } VirtIOBlock;
 
 static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
@@ -33,6 +35,39 @@ static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
 return (VirtIOBlock *)vdev;
 }
 
+static void virtio_blk_data_plane_start(VirtIOBlock *s)
+{
+if (s->vdev.binding->set_host_notifier(s->vdev.binding_opaque, 0, true) != 
0) {
+fprintf(stderr, "virtio-blk failed to set host notifier\n");
+return;
+}
+
+s->data_plane_started = true;
+}
+
+static void virtio_blk_data_plane_stop(VirtIOBlock *s)
+{
+s->data_plane_started = false;
+
+s->vdev.binding->set_host_notifier(s->vdev.binding_opaque, 0, false);
+}
+
+static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t val)
+{
+VirtIOBlock *s = to_virtio_blk(vdev);
+
+/* Toggle host notifier only on status change */
+if (s->data_plane_started == !!(val & VIRTIO_CONFIG_S_DRIVER_OK)) {
+return;
+}
+
+if (val & VIRTIO_CONFIG_S_DRIVER_OK) {
+virtio_blk_data_plane_start(s);
+} else {
+virtio_blk_data_plane_stop(s);
+}
+}
+
 static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 {
 fprintf(stderr, virtio_blk_handle_output: should never get here,
@@ -115,6 +150,7 @@ VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf 
*conf,
 
 s-vdev.get_config = virtio_blk_update_config;
 s-vdev.get_features = virtio_blk_get_features;
+s-vdev.set_status = virtio_blk_set_status;
 s-bs = conf-bs;
 s-conf = conf;
 s-serial = *serial;
@@ -122,6 +158,7 @@ VirtIODevice *virtio_blk_init(DeviceState *dev, BlockConf 
*conf,
 bdrv_guess_geometry(s-bs, cylinders, heads, secs);
 
 s-vq = virtio_add_queue(s-vdev, 128, virtio_blk_handle_output);
+s-data_plane_started = false;
 
 s-qdev = dev;
 bdrv_set_buffer_alignment(s-bs, conf-logical_block_size);
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 03/27] virtio-blk: Data plane thread event loop

2012-07-18 Thread Stefan Hajnoczi
Add a simple event handling loop based on epoll(2).  The data plane
thread now receives virtqueue notify and Linux AIO completion events.

The data plane thread currently does not shut down.  Either it needs to
be a detached thread or have clean shutdown support.

Most of the data plane start/stop code can be done once on virtio-blk
init/cleanup instead of each time the virtio device is brought up/down
by the driver.  Only the vring address and the notify pio address
change.

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
 hw/virtio-blk.c |  125 +++
 1 file changed, 116 insertions(+), 9 deletions(-)

diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 0389294..f6043bc 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -11,12 +11,25 @@
  *
  */
 
+#include sys/epoll.h
+#include sys/eventfd.h
+#include libaio.h
 #include qemu-common.h
+#include qemu-thread.h
 #include qemu-error.h
-#include trace.h
 #include blockdev.h
 #include virtio-blk.h
 
+enum {
+SEG_MAX = 126, /* maximum number of I/O segments */
+};
+
+typedef struct
+{
+EventNotifier *notifier;/* eventfd */
+void (*handler)(void);  /* handler function */
+} EventHandler;
+
 typedef struct VirtIOBlock
 {
 VirtIODevice vdev;
@@ -28,6 +41,13 @@ typedef struct VirtIOBlock
 DeviceState *qdev;
 
 bool data_plane_started;
+QemuThread data_plane_thread;
+
+int epoll_fd;   /* epoll(2) file descriptor */
+io_context_t io_ctx;/* Linux AIO context */
+EventNotifier io_notifier;  /* Linux AIO eventfd */
+EventHandler io_handler;/* Linux AIO completion handler */
+EventHandler notify_handler;/* virtqueue notify handler */
 } VirtIOBlock;
 
 static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
@@ -35,21 +55,108 @@ static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
 return (VirtIOBlock *)vdev;
 }
 
-static void virtio_blk_data_plane_start(VirtIOBlock *s)
+static void handle_io(void)
+{
+fprintf(stderr, io completion happened\n);
+}
+
+static void handle_notify(void)
+{
+fprintf(stderr, virtqueue notify happened\n);
+}
+
+static void *data_plane_thread(void *opaque)
 {
+VirtIOBlock *s = opaque;
+struct epoll_event event;
+int nevents;
+EventHandler *event_handler;
+
+/* Signals are masked, EINTR should never happen */
+
+for (;;) {
+/* Wait for the next event.  Only do one event per call to keep the
+ * function simple, this could be changed later. */
+nevents = epoll_wait(s-epoll_fd, event, 1, -1);
+if (unlikely(nevents != 1)) {
+fprintf(stderr, epoll_wait failed: %m\n);
+continue; /* should never happen */
+}
+
+/* Find out which event handler has become active */
+event_handler = event.data.ptr;
+
+/* Clear the eventfd */
+event_notifier_test_and_clear(event_handler-notifier);
+
+/* Handle the event */
+event_handler-handler();
+}
+return NULL;
+}
+
+static void add_event_handler(int epoll_fd, EventHandler *event_handler)
+{
+struct epoll_event event = {
+.events = EPOLLIN,
+.data.ptr = event_handler,
+};
+if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, 
event_notifier_get_fd(event_handler-notifier), event) != 0) {
+fprintf(stderr, virtio-blk failed to add event handler to epoll: 
%m\n);
+exit(1);
+}
+}
+
+static void data_plane_start(VirtIOBlock *s)
+{
+/* Create epoll file descriptor */
+s-epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+if (s-epoll_fd  0) {
+fprintf(stderr, epoll_create1 failed: %m\n);
+return; /* TODO error handling */
+}
+
 if (s-vdev.binding-set_host_notifier(s-vdev.binding_opaque, 0, true) != 
0) {
 fprintf(stderr, virtio-blk failed to set host notifier\n);
-return;
+return; /* TODO error handling */
+}
+
+s-notify_handler.notifier = virtio_queue_get_host_notifier(s-vq),
+s-notify_handler.handler = handle_notify;
+add_event_handler(s-epoll_fd, s-notify_handler);
+
+/* Create aio context */
+if (io_setup(SEG_MAX, s-io_ctx) != 0) {
+fprintf(stderr, virtio-blk io_setup failed\n);
+return; /* TODO error handling */
 }
 
+if (event_notifier_init(s-io_notifier, 0) != 0) {
+fprintf(stderr, virtio-blk io event notifier creation failed\n);
+return; /* TODO error handling */
+}
+
+s-io_handler.notifier = s-io_notifier;
+s-io_handler.handler = handle_io;
+add_event_handler(s-epoll_fd, s-io_handler);
+
+qemu_thread_create(s-data_plane_thread, data_plane_thread, s, 
QEMU_THREAD_JOINABLE);
+
 s-data_plane_started = true;
 }
 
-static void virtio_blk_data_plane_stop(VirtIOBlock *s)
+static void data_plane_stop(VirtIOBlock *s)
 {
 s-data_plane_started = false;
 
+/* TODO stop data plane thread */
+
+

[RFC v9 05/27] virtio-blk: Do cheapest possible memory mapping

2012-07-18 Thread Stefan Hajnoczi
Instead of using QEMU memory access functions, grab the host address of
guest physical address zero and simply add to this base address.

This not only simplifies vring mapping but will also make virtqueue
element access cheap by avoiding QEMU memory access functions in the I/O
code path.

Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/virtio-blk.c |   58 ---
 1 file changed, 30 insertions(+), 28 deletions(-)

diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 4c790a3..abd9386 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -51,6 +51,8 @@ typedef struct VirtIOBlock
 EventNotifier io_notifier;  /* Linux AIO eventfd */
 EventHandler io_handler;/* Linux AIO completion handler */
 EventHandler notify_handler;/* virtqueue notify handler */
+
+void *phys_mem_zero_host_ptr;   /* host pointer to guest RAM */
 } VirtIOBlock;
 
 static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
@@ -58,43 +60,44 @@ static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
 return (VirtIOBlock *)vdev;
 }
 
+/* Map target physical address to host address
+ */
+static inline void *phys_to_host(VirtIOBlock *s, target_phys_addr_t phys)
+{
+return s-phys_mem_zero_host_ptr + phys;
+}
+
+/* Setup for cheap target physical to host address conversion
+ *
+ * This is a hack for direct access to guest memory, we're not really allowed
+ * to do this.
+ */
+static void setup_phys_to_host(VirtIOBlock *s)
+{
+target_phys_addr_t len = 4096; /* RAM is really much larger but we cheat */
+s-phys_mem_zero_host_ptr = cpu_physical_memory_map(0, len, 0);
+if (!s-phys_mem_zero_host_ptr) {
+fprintf(stderr, setup_phys_to_host failed\n);
+exit(1);
+}
+}
+
 /* Map the guest's vring to host memory
  *
  * This is not allowed but we know the ring won't move.
  */
-static void map_vring(struct vring *vring, VirtIODevice *vdev, int n)
+static void map_vring(struct vring *vring, VirtIOBlock *s, VirtIODevice *vdev, 
int n)
 {
-target_phys_addr_t physaddr, len;
-
 vring-num = virtio_queue_get_num(vdev, n);
-
-physaddr = virtio_queue_get_desc_addr(vdev, n);
-len = virtio_queue_get_desc_size(vdev, n);
-vring-desc = cpu_physical_memory_map(physaddr, len, 0);
-
-physaddr = virtio_queue_get_avail_addr(vdev, n);
-len = virtio_queue_get_avail_size(vdev, n);
-vring-avail = cpu_physical_memory_map(physaddr, len, 0);
-
-physaddr = virtio_queue_get_used_addr(vdev, n);
-len = virtio_queue_get_used_size(vdev, n);
-vring-used = cpu_physical_memory_map(physaddr, len, 0);
-
-if (!vring-desc || !vring-avail || !vring-used) {
-fprintf(stderr, virtio-blk failed to map vring\n);
-exit(1);
-}
+vring-desc = phys_to_host(s, virtio_queue_get_desc_addr(vdev, n));
+vring-avail = phys_to_host(s, virtio_queue_get_avail_addr(vdev, n));
+vring-used = phys_to_host(s, virtio_queue_get_used_addr(vdev, n));
 
 fprintf(stderr, virtio-blk vring physical=%#lx desc=%p avail=%p 
used=%p\n,
 virtio_queue_get_ring_addr(vdev, n),
 vring-desc, vring-avail, vring-used);
 }
 
-static void unmap_vring(struct vring *vring, VirtIODevice *vdev, int n)
-{
-cpu_physical_memory_unmap(vring-desc, virtio_queue_get_ring_size(vdev, 
n), 0, 0);
-}
-
 static void handle_io(void)
 {
 fprintf(stderr, io completion happened\n);
@@ -149,7 +152,8 @@ static void add_event_handler(int epoll_fd, EventHandler 
*event_handler)
 
 static void data_plane_start(VirtIOBlock *s)
 {
-map_vring(s-vring, s-vdev, 0);
+setup_phys_to_host(s);
+map_vring(s-vring, s, s-vdev, 0);
 
 /* Create epoll file descriptor */
 s-epoll_fd = epoll_create1(EPOLL_CLOEXEC);
@@ -199,8 +203,6 @@ static void data_plane_stop(VirtIOBlock *s)
 s-vdev.binding-set_host_notifier(s-vdev.binding_opaque, 0, false);
 
 close(s-epoll_fd);
-
-unmap_vring(s-vring, s-vdev, 0);
 }
 
 static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t val)
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 04/27] virtio-blk: Map vring

2012-07-18 Thread Stefan Hajnoczi
Map the vring to host memory so it can be accessed without the overhead
of the QEMU memory functions.

Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/virtio-blk.c |   44 
 1 file changed, 44 insertions(+)

diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index f6043bc..4c790a3 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -14,6 +14,7 @@
 #include sys/epoll.h
 #include sys/eventfd.h
 #include libaio.h
+#include linux/virtio_ring.h
 #include qemu-common.h
 #include qemu-thread.h
 #include qemu-error.h
@@ -43,6 +44,8 @@ typedef struct VirtIOBlock
 bool data_plane_started;
 QemuThread data_plane_thread;
 
+struct vring vring;
+
 int epoll_fd;   /* epoll(2) file descriptor */
 io_context_t io_ctx;/* Linux AIO context */
 EventNotifier io_notifier;  /* Linux AIO eventfd */
@@ -55,6 +58,43 @@ static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
 return (VirtIOBlock *)vdev;
 }
 
+/* Map the guest's vring to host memory
+ *
+ * This is not allowed but we know the ring won't move.
+ */
+static void map_vring(struct vring *vring, VirtIODevice *vdev, int n)
+{
+target_phys_addr_t physaddr, len;
+
+vring-num = virtio_queue_get_num(vdev, n);
+
+physaddr = virtio_queue_get_desc_addr(vdev, n);
+len = virtio_queue_get_desc_size(vdev, n);
+vring-desc = cpu_physical_memory_map(physaddr, len, 0);
+
+physaddr = virtio_queue_get_avail_addr(vdev, n);
+len = virtio_queue_get_avail_size(vdev, n);
+vring-avail = cpu_physical_memory_map(physaddr, len, 0);
+
+physaddr = virtio_queue_get_used_addr(vdev, n);
+len = virtio_queue_get_used_size(vdev, n);
+vring-used = cpu_physical_memory_map(physaddr, len, 0);
+
+if (!vring-desc || !vring-avail || !vring-used) {
+fprintf(stderr, virtio-blk failed to map vring\n);
+exit(1);
+}
+
+fprintf(stderr, virtio-blk vring physical=%#lx desc=%p avail=%p 
used=%p\n,
+virtio_queue_get_ring_addr(vdev, n),
+vring-desc, vring-avail, vring-used);
+}
+
+static void unmap_vring(struct vring *vring, VirtIODevice *vdev, int n)
+{
+cpu_physical_memory_unmap(vring-desc, virtio_queue_get_ring_size(vdev, 
n), 0, 0);
+}
+
 static void handle_io(void)
 {
 fprintf(stderr, io completion happened\n);
@@ -109,6 +149,8 @@ static void add_event_handler(int epoll_fd, EventHandler 
*event_handler)
 
 static void data_plane_start(VirtIOBlock *s)
 {
+map_vring(s-vring, s-vdev, 0);
+
 /* Create epoll file descriptor */
 s-epoll_fd = epoll_create1(EPOLL_CLOEXEC);
 if (s-epoll_fd  0) {
@@ -157,6 +199,8 @@ static void data_plane_stop(VirtIOBlock *s)
 s-vdev.binding-set_host_notifier(s-vdev.binding_opaque, 0, false);
 
 close(s-epoll_fd);
+
+unmap_vring(s-vring, s-vdev, 0);
 }
 
 static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t val)
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 01/27] virtio-blk: Remove virtqueue request handling code

2012-07-18 Thread Stefan Hajnoczi
Start with a clean slate, a virtio-blk device that supports virtio
lifecycle operations and configuration but doesn't do any actual I/O.
The I/O is going to happen in a separate optimized data plane thread.

Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/virtio-blk.c |  496 +--
 1 file changed, 3 insertions(+), 493 deletions(-)

diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 49990f8..a627427 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -16,18 +16,12 @@
 #include trace.h
 #include blockdev.h
 #include virtio-blk.h
-#include scsi-defs.h
-#ifdef __linux__
-# include scsi/sg.h
-#endif
 
 typedef struct VirtIOBlock
 {
 VirtIODevice vdev;
 BlockDriverState *bs;
 VirtQueue *vq;
-void *rq;
-QEMUBH *bh;
 BlockConf *conf;
 char *serial;
 unsigned short sector_mask;
@@ -39,439 +33,11 @@ static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
 return (VirtIOBlock *)vdev;
 }
 
-typedef struct VirtIOBlockReq
-{
-VirtIOBlock *dev;
-VirtQueueElement elem;
-struct virtio_blk_inhdr *in;
-struct virtio_blk_outhdr *out;
-struct virtio_scsi_inhdr *scsi;
-QEMUIOVector qiov;
-struct VirtIOBlockReq *next;
-BlockAcctCookie acct;
-} VirtIOBlockReq;
-
-static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
-{
-VirtIOBlock *s = req-dev;
-
-trace_virtio_blk_req_complete(req, status);
-
-stb_p(req-in-status, status);
-virtqueue_push(s-vq, req-elem, req-qiov.size + sizeof(*req-in));
-virtio_notify(s-vdev, s-vq);
-}
-
-static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
-int is_read)
-{
-BlockErrorAction action = bdrv_get_on_error(req-dev-bs, is_read);
-VirtIOBlock *s = req-dev;
-
-if (action == BLOCK_ERR_IGNORE) {
-bdrv_emit_qmp_error_event(s-bs, BDRV_ACTION_IGNORE, is_read);
-return 0;
-}
-
-if ((error == ENOSPC  action == BLOCK_ERR_STOP_ENOSPC)
-|| action == BLOCK_ERR_STOP_ANY) {
-req-next = s-rq;
-s-rq = req;
-bdrv_emit_qmp_error_event(s-bs, BDRV_ACTION_STOP, is_read);
-vm_stop(RUN_STATE_IO_ERROR);
-bdrv_iostatus_set_err(s-bs, error);
-} else {
-virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
-bdrv_acct_done(s-bs, req-acct);
-g_free(req);
-bdrv_emit_qmp_error_event(s-bs, BDRV_ACTION_REPORT, is_read);
-}
-
-return 1;
-}
-
-static void virtio_blk_rw_complete(void *opaque, int ret)
-{
-VirtIOBlockReq *req = opaque;
-
-trace_virtio_blk_rw_complete(req, ret);
-
-if (ret) {
-int is_read = !(ldl_p(req-out-type)  VIRTIO_BLK_T_OUT);
-if (virtio_blk_handle_rw_error(req, -ret, is_read))
-return;
-}
-
-virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
-bdrv_acct_done(req-dev-bs, req-acct);
-g_free(req);
-}
-
-static void virtio_blk_flush_complete(void *opaque, int ret)
-{
-VirtIOBlockReq *req = opaque;
-
-if (ret) {
-if (virtio_blk_handle_rw_error(req, -ret, 0)) {
-return;
-}
-}
-
-virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
-bdrv_acct_done(req-dev-bs, req-acct);
-g_free(req);
-}
-
-static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
-{
-VirtIOBlockReq *req = g_malloc(sizeof(*req));
-req-dev = s;
-req-qiov.size = 0;
-req-next = NULL;
-return req;
-}
-
-static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s)
-{
-VirtIOBlockReq *req = virtio_blk_alloc_request(s);
-
-if (req != NULL) {
-if (!virtqueue_pop(s-vq, req-elem)) {
-g_free(req);
-return NULL;
-}
-}
-
-return req;
-}
-
-#ifdef __linux__
-static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
-{
-struct sg_io_hdr hdr;
-int ret;
-int status;
-int i;
-
-if ((req-dev-vdev.guest_features  (1  VIRTIO_BLK_F_SCSI)) == 0) {
-virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
-g_free(req);
-return;
-}
-
-/*
- * We require at least one output segment each for the virtio_blk_outhdr
- * and the SCSI command block.
- *
- * We also at least require the virtio_blk_inhdr, the virtio_scsi_inhdr
- * and the sense buffer pointer in the input segments.
- */
-if (req-elem.out_num  2 || req-elem.in_num  3) {
-virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
-g_free(req);
-return;
-}
-
-/*
- * No support for bidirection commands yet.
- */
-if (req-elem.out_num  2  req-elem.in_num  3) {
-virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
-g_free(req);
-return;
-}
-
-/*
- * The scsi inhdr is placed in the second-to-last input segment, just
- * before the regular inhdr.
- */
-req-scsi = (void *)req-elem.in_sg[req-elem.in_num - 2].iov_base;
-
-memset(hdr, 0, sizeof(struct sg_io_hdr));
-hdr.interface_id = 

[RFC v9 08/27] virtio-blk: Read requests from the vring

2012-07-18 Thread Stefan Hajnoczi
Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/dataplane/vring.h |8 +--
 hw/virtio-blk.c  |   62 ++
 2 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h
index 7099a99..b07d4f6 100644
--- a/hw/dataplane/vring.h
+++ b/hw/dataplane/vring.h
@@ -76,7 +76,7 @@ static void vring_setup(Vring *vring, VirtIODevice *vdev, int 
n)
  * Stolen from linux-2.6/drivers/vhost/vhost.c.
  */
 static unsigned int vring_pop(Vring *vring,
- struct iovec iov[], unsigned int iov_size,
+ struct iovec iov[], struct iovec *iov_end,
  unsigned int *out_num, unsigned int *in_num)
 {
struct vring_desc desc;
@@ -138,10 +138,14 @@ static unsigned int vring_pop(Vring *vring,
return ret;
}
continue; */
-fprintf(stderr, virtio-blk indirect vring not supported\n);
+fprintf(stderr, Indirect vring not supported\n);
 exit(1);
}
 
+if (iov = iov_end) {
+fprintf(stderr, Not enough vring iovecs\n);
+exit(1);
+}
 iov-iov_base = phys_to_host(vring, desc.addr);
 iov-iov_len  = desc.len;
 iov++;
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 2c1cce8..91f1bab 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -24,6 +24,7 @@
 enum {
 SEG_MAX = 126,  /* maximum number of I/O segments */
 VRING_MAX = SEG_MAX + 2,/* maximum number of vring descriptors */
+REQ_MAX = VRING_MAX / 2,/* maximum number of requests in the vring 
*/
 };
 
 typedef struct VirtIOBlock
@@ -58,20 +59,63 @@ static void handle_io(EventHandler *handler)
 fprintf(stderr, io completion happened\n);
 }
 
+static void process_request(struct iovec iov[], unsigned int out_num, unsigned 
int in_num)
+{
+/* Virtio block requests look like this: */
+struct virtio_blk_outhdr *outhdr; /* iov[0] */
+/* data[]... */
+struct virtio_blk_inhdr *inhdr;   /* iov[out_num + in_num - 1] */
+
+if (unlikely(out_num == 0 || in_num == 0 ||
+iov[0].iov_len != sizeof *outhdr ||
+iov[out_num + in_num - 1].iov_len != sizeof *inhdr)) {
+fprintf(stderr, virtio-blk invalid request\n);
+exit(1);
+}
+
+outhdr = iov[0].iov_base;
+inhdr = iov[out_num + in_num - 1].iov_base;
+
+fprintf(stderr, virtio-blk request type=%#x sector=%#lx\n,
+outhdr-type, outhdr-sector);
+}
+
 static void handle_notify(EventHandler *handler)
 {
 VirtIOBlock *s = container_of(handler, VirtIOBlock, notify_handler);
-struct iovec iov[VRING_MAX];
-unsigned int out_num, in_num;
-int head;
 
-head = vring_pop(s-vring, iov, ARRAY_SIZE(iov), out_num, in_num);
-if (unlikely(head = vring_get_num(s-vring))) {
-fprintf(stderr, false alarm, nothing on vring\n);
-return;
-}
+/* There is one array of iovecs into which all new requests are extracted
+ * from the vring.  Requests are read from the vring and the translated
+ * descriptors are written to the iovecs array.  The iovecs do not have to
+ * persist across handle_notify() calls because the kernel copies the
+ * iovecs on io_submit().
+ *
+ * Handling io_submit() EAGAIN may require storing the requests across
+ * handle_notify() calls until the kernel has sufficient resources to
+ * accept more I/O.  This is not implemented yet.
+ */
+struct iovec iovec[VRING_MAX];
+struct iovec *iov, *end = iovec[VRING_MAX];
+
+/* When a request is read from the vring, the index of the first descriptor
+ * (aka head) is returned so that the completed request can be pushed onto
+ * the vring later.
+ *
+ * The number of hypervisor read-only iovecs is out_num.  The number of
+ * hypervisor write-only iovecs is in_num.
+ */
+unsigned int head, out_num = 0, in_num = 0;
+
+for (iov = iovec; ; iov += out_num + in_num) {
+head = vring_pop(s-vring, iov, end, out_num, in_num);
+if (head = vring_get_num(s-vring)) {
+break; /* no more requests */
+}
+
+fprintf(stderr, head=%u out_num=%u in_num=%u\n, head, out_num, 
in_num);
 
-fprintf(stderr, head=%u out_num=%u in_num=%u\n, head, out_num, in_num);
+process_request(iov, out_num, in_num);
+}
 }
 
 static void *data_plane_thread(void *opaque)
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 09/27] virtio-blk: Add Linux AIO queue

2012-07-18 Thread Stefan Hajnoczi
Requests read from the vring will be placed in a queue where they can be
merged as necessary.  Once all requests have been read from the vring,
the queue can be submitted.

Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/dataplane/ioq.h |  104 
 hw/virtio-blk.c|   33 -
 2 files changed, 120 insertions(+), 17 deletions(-)
 create mode 100644 hw/dataplane/ioq.h

diff --git a/hw/dataplane/ioq.h b/hw/dataplane/ioq.h
new file mode 100644
index 000..26ca307
--- /dev/null
+++ b/hw/dataplane/ioq.h
@@ -0,0 +1,104 @@
+#ifndef IO_QUEUE_H
+#define IO_QUEUE_H
+
+typedef struct {
+int fd; /* file descriptor */
+unsigned int maxreqs;   /* max length of freelist and queue */
+
+io_context_t io_ctx;/* Linux AIO context */
+EventNotifier notifier; /* Linux AIO eventfd */
+
+/* Requests can complete in any order so a free list is necessary to manage
+ * available iocbs.
+ */
+struct iocb **freelist; /* free iocbs */
+unsigned int freelist_idx;
+
+/* Multiple requests are queued up before submitting them all in one go */
+struct iocb **queue;/* queued iocbs */
+unsigned int queue_idx;
+} IOQueue;
+
+static void ioq_init(IOQueue *ioq, int fd, unsigned int maxreqs)
+{
+ioq-fd = fd;
+ioq-maxreqs = maxreqs;
+
+if (io_setup(maxreqs, ioq-io_ctx) != 0) {
+fprintf(stderr, ioq io_setup failed\n);
+exit(1);
+}
+
+if (event_notifier_init(ioq-notifier, 0) != 0) {
+fprintf(stderr, ioq io event notifier creation failed\n);
+exit(1);
+}
+
+ioq-freelist = g_malloc0(sizeof ioq-freelist[0] * maxreqs);
+ioq-freelist_idx = 0;
+
+ioq-queue = g_malloc0(sizeof ioq-queue[0] * maxreqs);
+ioq-queue_idx = 0;
+}
+
+static void ioq_cleanup(IOQueue *ioq)
+{
+g_free(ioq-freelist);
+g_free(ioq-queue);
+
+event_notifier_cleanup(ioq-notifier);
+io_destroy(ioq-io_ctx);
+}
+
+static EventNotifier *ioq_get_notifier(IOQueue *ioq)
+{
+return ioq-notifier;
+}
+
+static struct iocb *ioq_get_iocb(IOQueue *ioq)
+{
+if (unlikely(ioq-freelist_idx == 0)) {
+fprintf(stderr, ioq underflow\n);
+exit(1);
+}
+struct iocb *iocb = ioq-freelist[--ioq-freelist_idx];
+ioq-queue[ioq-queue_idx++] = iocb;
+}
+
+static __attribute__((unused)) void ioq_put_iocb(IOQueue *ioq, struct iocb 
*iocb)
+{
+if (unlikely(ioq-freelist_idx == ioq-maxreqs)) {
+fprintf(stderr, ioq overflow\n);
+exit(1);
+}
+ioq-freelist[ioq-freelist_idx++] = iocb;
+}
+
+static __attribute__((unused)) void ioq_rdwr(IOQueue *ioq, bool read, struct 
iovec *iov, unsigned int count, long long offset)
+{
+struct iocb *iocb = ioq_get_iocb(ioq);
+
+if (read) {
+io_prep_preadv(iocb, ioq-fd, iov, count, offset);
+} else {
+io_prep_pwritev(iocb, ioq-fd, iov, count, offset);
+}
+io_set_eventfd(iocb, event_notifier_get_fd(ioq-notifier));
+}
+
+static __attribute__((unused)) void ioq_fdsync(IOQueue *ioq)
+{
+struct iocb *iocb = ioq_get_iocb(ioq);
+
+io_prep_fdsync(iocb, ioq-fd);
+io_set_eventfd(iocb, event_notifier_get_fd(ioq-notifier));
+}
+
+static __attribute__((unused)) int ioq_submit(IOQueue *ioq)
+{
+int rc = io_submit(ioq-io_ctx, ioq-queue_idx, ioq-queue);
+ioq-queue_idx = 0; /* reset */
+return rc;
+}
+
+#endif /* IO_QUEUE_H */
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 91f1bab..5e1ed79 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -13,12 +13,14 @@
 
 #include libaio.h
 #include qemu-common.h
+#include block_int.h
 #include qemu-thread.h
 #include qemu-error.h
 #include blockdev.h
 #include virtio-blk.h
 #include hw/dataplane/event-poll.h
 #include hw/dataplane/vring.h
+#include hw/dataplane/ioq.h
 #include kvm.h
 
 enum {
@@ -42,9 +44,9 @@ typedef struct VirtIOBlock
 
 Vring vring;/* virtqueue vring */
 
+IOQueue ioqueue;/* Linux AIO queue (should really be per 
dataplane thread) */
+
 EventPoll event_poll;   /* event poller */
-io_context_t io_ctx;/* Linux AIO context */
-EventNotifier io_notifier;  /* Linux AIO eventfd */
 EventHandler io_handler;/* Linux AIO completion handler */
 EventHandler notify_handler;/* virtqueue notify handler */
 } VirtIOBlock;
@@ -128,6 +130,14 @@ static void *data_plane_thread(void *opaque)
 return NULL;
 }
 
+/* Normally the block driver passes down the fd, there's no way to get it from
+ * above.
+ */
+static int get_raw_posix_fd_hack(VirtIOBlock *s)
+{
+return *(int*)s-bs-file-opaque;
+}
+
 static void data_plane_start(VirtIOBlock *s)
 {
 vring_setup(s-vring, s-vdev, 0);
@@ -138,23 +148,13 @@ static void data_plane_start(VirtIOBlock *s)
 fprintf(stderr, virtio-blk failed to set host notifier, ensure 
-enable-kvm is set\n);
 

[RFC v9 12/27] virtio-blk: Add workaround for BUG_ON() dependency in virtio_ring.h

2012-07-18 Thread Stefan Hajnoczi
Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/dataplane/vring.h |5 +
 1 file changed, 5 insertions(+)

diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h
index 3eab4b4..44ef4a9 100644
--- a/hw/dataplane/vring.h
+++ b/hw/dataplane/vring.h
@@ -1,6 +1,11 @@
 #ifndef VRING_H
 #define VRING_H
 
+/* Some virtio_ring.h files use BUG_ON() */
+#ifndef BUG_ON
+#define BUG_ON(x)
+#endif
+
 #include linux/virtio_ring.h
 #include qemu-common.h
 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 11/27] virtio-blk: Indirect vring and flush support

2012-07-18 Thread Stefan Hajnoczi
RHEL6 and other new guest kernels use indirect vring descriptors to
increase the number of requests that can be batched.  This fundamentally
changes vring from a scheme that requires fixed resources to something
more dynamic (although there is still an absolute maximum number of
descriptors).  Cope with indirect vrings by taking on as many requests
as we can in one go and then postponing the remaining requests until the
first batch completes.

It would be possible to switch to dynamic resource management so iovec
and iocb structs are malloced.  This would allow the entire ring to be
processed even with indirect descriptors, but would probably hit a
bottleneck when io_submit refuses to queue more requests.  Therefore,
stick with the simpler scheme for now.

Unfortunately Linux AIO does not support asynchronous fsync/fdatasync on
all files.  In particular, an O_DIRECT opened file on ext4 does not
support Linux AIO fdsync.  Work around this by performing fdatasync()
synchronously for now.

Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/dataplane/ioq.h   |   18 -
 hw/dataplane/vring.h |  103 +++---
 hw/virtio-blk.c  |   75 ++--
 3 files changed, 144 insertions(+), 52 deletions(-)

diff --git a/hw/dataplane/ioq.h b/hw/dataplane/ioq.h
index 7200e87..d1545d6 100644
--- a/hw/dataplane/ioq.h
+++ b/hw/dataplane/ioq.h
@@ -3,7 +3,7 @@
 
 typedef struct {
 int fd; /* file descriptor */
-unsigned int max_reqs;   /* max length of freelist and queue */
+unsigned int max_reqs;  /* max length of freelist and queue */
 
 io_context_t io_ctx;/* Linux AIO context */
 EventNotifier io_notifier;  /* Linux AIO eventfd */
@@ -91,18 +91,16 @@ static struct iocb *ioq_rdwr(IOQueue *ioq, bool read, 
struct iovec *iov, unsigne
 return iocb;
 }
 
-static struct iocb *ioq_fdsync(IOQueue *ioq)
-{
-struct iocb *iocb = ioq_get_iocb(ioq);
-
-io_prep_fdsync(iocb, ioq-fd);
-io_set_eventfd(iocb, event_notifier_get_fd(ioq-io_notifier));
-return iocb;
-}
-
 static int ioq_submit(IOQueue *ioq)
 {
 int rc = io_submit(ioq-io_ctx, ioq-queue_idx, ioq-queue);
+if (unlikely(rc  0)) {
+unsigned int i;
+fprintf(stderr, io_submit io_ctx=%#lx nr=%d iovecs=%p\n, 
(uint64_t)ioq-io_ctx, ioq-queue_idx, ioq-queue);
+for (i = 0; i  ioq-queue_idx; i++) {
+fprintf(stderr, [%u] type=%#x fd=%d\n, i, 
ioq-queue[i]-aio_lio_opcode, ioq-queue[i]-aio_fildes);
+}
+}
 ioq-queue_idx = 0; /* reset */
 return rc;
 }
diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h
index 70675e5..3eab4b4 100644
--- a/hw/dataplane/vring.h
+++ b/hw/dataplane/vring.h
@@ -64,6 +64,86 @@ static void vring_setup(Vring *vring, VirtIODevice *vdev, 
int n)
 vring-vr.desc, vring-vr.avail, vring-vr.used);
 }
 
+static bool vring_more_avail(Vring *vring)
+{
+   return vring-vr.avail-idx != vring-last_avail_idx;
+}
+
+/* This is stolen from linux-2.6/drivers/vhost/vhost.c. */
+static bool get_indirect(Vring *vring,
+   struct iovec iov[], struct iovec *iov_end,
+   unsigned int *out_num, unsigned int *in_num,
+   struct vring_desc *indirect)
+{
+   struct vring_desc desc;
+   unsigned int i = 0, count, found = 0;
+
+   /* Sanity check */
+   if (unlikely(indirect-len % sizeof desc)) {
+   fprintf(stderr, Invalid length in indirect descriptor: 
+  len 0x%llx not multiple of 0x%zx\n,
+  (unsigned long long)indirect-len,
+  sizeof desc);
+   exit(1);
+   }
+
+   count = indirect-len / sizeof desc;
+   /* Buffers are chained via a 16 bit next field, so
+* we can have at most 2^16 of these. */
+   if (unlikely(count  USHRT_MAX + 1)) {
+   fprintf(stderr, Indirect buffer length too big: %d\n,
+  indirect-len);
+exit(1);
+   }
+
+/* Point to translate indirect desc chain */
+indirect = phys_to_host(vring, indirect-addr);
+
+   /* We will use the result as an address to read from, so most
+* architectures only need a compiler barrier here. */
+   __sync_synchronize(); /* read_barrier_depends(); */
+
+   do {
+   if (unlikely(++found  count)) {
+   fprintf(stderr, Loop detected: last one at %u 
+  indirect size %u\n,
+  i, count);
+   exit(1);
+   }
+
+desc = *indirect++;
+   if (unlikely(desc.flags  VRING_DESC_F_INDIRECT)) {
+   fprintf(stderr, Nested indirect descriptor\n);
+exit(1);
+   }
+
+/* Stop for now if there are not enough iovecs available. */
+if (iov = iov_end) {
+ 

[RFC v9 16/27] virtio-blk: Kick data plane thread using event notifier set

2012-07-18 Thread Stefan Hajnoczi
---
 hw/virtio-blk.c |3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 1616be5..d75c187 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -339,8 +339,7 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, 
VirtQueue *vq)
 virtio_blk_set_status(vdev, VIRTIO_CONFIG_S_DRIVER_OK); /* start the 
thread */
 
 /* Now kick the thread */
-uint64_t dummy = 1;
-ssize_t unused __attribute__((unused)) = 
write(event_notifier_get_fd(virtio_queue_get_host_notifier(s-vq)), dummy, 
sizeof dummy);
+event_notifier_set(virtio_queue_get_host_notifier(s-vq));
 }
 
 /* coalesce internal state, copy to pci i/o region 0
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 06/27] virtio-blk: Take PCI memory range into account

2012-07-18 Thread Stefan Hajnoczi
Support >4 GB physical memory accesses.

Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/virtio-blk.c |7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index abd9386..99654f1 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -64,6 +64,13 @@ static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
  */
 static inline void *phys_to_host(VirtIOBlock *s, target_phys_addr_t phys)
 {
+/* Adjust for 3.6-4 GB PCI memory range */
+if (phys = 0x1) {
+phys -= 0x1 - 0xe000;
+} else if (phys = 0xe000) {
+fprintf(stderr, phys_to_host bad physical address in PCI range 
%#lx\n, phys);
+exit(1);
+}
 return s-phys_mem_zero_host_ptr + phys;
 }
 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 15/27] notifier: Add a function to set the notifier

2012-07-18 Thread Stefan Hajnoczi
Although past users only needed to test and clear event notifiers, it is
useful to be able to set them too.

Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 event_notifier.c |7 +++
 event_notifier.h |1 +
 2 files changed, 8 insertions(+)

diff --git a/event_notifier.c b/event_notifier.c
index 0b82981..006adc5 100644
--- a/event_notifier.c
+++ b/event_notifier.c
@@ -59,3 +59,10 @@ int event_notifier_test(EventNotifier *e)
 }
 return r == sizeof(value);
 }
+
+int event_notifier_set(EventNotifier *e)
+{
+uint64_t value = 1;
+int r = write(e-fd, value, sizeof(value));
+return r == sizeof(value);
+}
diff --git a/event_notifier.h b/event_notifier.h
index 886222c..46a22f8 100644
--- a/event_notifier.h
+++ b/event_notifier.h
@@ -24,5 +24,6 @@ void event_notifier_cleanup(EventNotifier *);
 int event_notifier_get_fd(EventNotifier *);
 int event_notifier_test_and_clear(EventNotifier *);
 int event_notifier_test(EventNotifier *);
+int event_notifier_set(EventNotifier *);
 
 #endif
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 23/27] virtio-blk: Stub out SCSI commands

2012-07-18 Thread Stefan Hajnoczi
Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/virtio-blk.c |   25 +
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 51807b5..8734029 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -215,14 +215,8 @@ static void process_request(IOQueue *ioq, struct iovec 
iov[], unsigned int out_n
 
 /* TODO Linux sets the barrier bit even when not advertised! */
 uint32_t type = outhdr-type  ~VIRTIO_BLK_T_BARRIER;
-
-if (unlikely(type  ~(VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_FLUSH))) {
-fprintf(stderr, virtio-blk unsupported request type %#x\n, 
outhdr-type);
-exit(1);
-}
-
 struct iocb *iocb;
-switch (type  (VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_FLUSH)) {
+switch (type  (VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_SCSI_CMD | 
VIRTIO_BLK_T_FLUSH)) {
 case VIRTIO_BLK_T_IN:
 if (unlikely(out_num != 1)) {
 fprintf(stderr, virtio-blk invalid read request\n);
@@ -239,6 +233,21 @@ static void process_request(IOQueue *ioq, struct iovec 
iov[], unsigned int out_n
 iocb = ioq_rdwr(ioq, false, iov[1], out_num - 1, outhdr-sector * 
512UL); /* TODO is it always 512? */
 break;
 
+case VIRTIO_BLK_T_SCSI_CMD:
+if (unlikely(in_num == 0)) {
+fprintf(stderr, virtio-blk invalid SCSI command request\n);
+exit(1);
+}
+
+/* TODO support SCSI commands */
+{
+VirtIOBlock *s = container_of(ioq, VirtIOBlock, ioqueue);
+inhdr-status = VIRTIO_BLK_S_UNSUPP;
+vring_push(s-vring, head, sizeof *inhdr);
+virtio_blk_notify_guest(s);
+}
+return;
+
 case VIRTIO_BLK_T_FLUSH:
 if (unlikely(in_num != 1 || out_num != 1)) {
 fprintf(stderr, virtio-blk invalid flush request\n);
@@ -256,7 +265,7 @@ static void process_request(IOQueue *ioq, struct iovec 
iov[], unsigned int out_n
 return;
 
 default:
-fprintf(stderr, virtio-blk multiple request type bits set\n);
+fprintf(stderr, virtio-blk unsupported request type %#x\n, 
outhdr-type);
 exit(1);
 }
 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 13/27] virtio-blk: Increase max requests for indirect vring

2012-07-18 Thread Stefan Hajnoczi
With indirect vring descriptors, one can no longer assume that the
maximum number of requests is VRING_MAX / 2 (outhdr and inhdr).  Now a
single indirect descriptor can contain the outhdr and inhdr so max
requests becomes VRING_MAX.

Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/virtio-blk.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 591eace..7ae3c56 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -26,7 +26,9 @@
 enum {
 SEG_MAX = 126,  /* maximum number of I/O segments */
 VRING_MAX = SEG_MAX + 2,/* maximum number of vring descriptors */
-REQ_MAX = VRING_MAX / 2,/* maximum number of requests in the vring 
*/
+REQ_MAX = VRING_MAX,/* maximum number of requests in the vring,
+ * is VRING_MAX / 2 with traditional and
+ * VRING_MAX with indirect descriptors */
 };
 
 typedef struct {
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 20/27] virtio-blk: Add ioscheduler to detect mergable requests

2012-07-18 Thread Stefan Hajnoczi
---
 hw/dataplane/iosched.h |   78 
 hw/virtio-blk.c|5 
 2 files changed, 83 insertions(+)
 create mode 100644 hw/dataplane/iosched.h

diff --git a/hw/dataplane/iosched.h b/hw/dataplane/iosched.h
new file mode 100644
index 000..12ebccc
--- /dev/null
+++ b/hw/dataplane/iosched.h
@@ -0,0 +1,78 @@
+#ifndef IOSCHED_H
+#define IOSCHED_H
+
+#include hw/dataplane/ioq.h
+
+typedef struct {
+unsigned long iocbs;
+unsigned long merges;
+unsigned long sched_calls;
+} IOSched;
+
+static int iocb_cmp(const void *a, const void *b)
+{
+const struct iocb *iocb_a = a;
+const struct iocb *iocb_b = b;
+
+/*
+ * Note that we can't simply subtract req2-sector from req1-sector
+ * here as that could overflow the return value.
+ */
+if (iocb_a-u.c.offset  iocb_b-u.c.offset) {
+return 1;
+} else if (iocb_a-u.c.offset  iocb_b-u.c.offset) {
+return -1;
+} else {
+return 0;
+}
+}
+
+static size_t iocb_nbytes(struct iocb *iocb)
+{
+struct iovec *iov = iocb-u.c.buf;
+size_t nbytes = 0;
+size_t i;
+for (i = 0; i  iocb-u.c.nbytes; i++) {
+nbytes += iov-iov_len;
+iov++;
+}
+return nbytes;
+}
+
+static void iosched_init(IOSched *iosched)
+{
+memset(iosched, 0, sizeof *iosched);
+}
+
+static void iosched_print_stats(IOSched *iosched)
+{
+fprintf(stderr, iocbs = %lu merges = %lu sched_calls = %lu\n,
+iosched-iocbs, iosched-merges, iosched-sched_calls);
+memset(iosched, 0, sizeof *iosched);
+}
+
+static void iosched(IOSched *iosched, struct iocb *unsorted[], unsigned int 
count)
+{
+struct iocb *sorted[count];
+struct iocb *last;
+unsigned int i;
+
+if ((++iosched-sched_calls % 1000) == 0) {
+iosched_print_stats(iosched);
+}
+
+memcpy(sorted, unsorted, sizeof sorted);
+qsort(sorted, count, sizeof sorted[0], iocb_cmp);
+
+iosched-iocbs += count;
+last = sorted[0];
+for (i = 1; i  count; i++) {
+if (last-aio_lio_opcode == sorted[i]-aio_lio_opcode 
+last-u.c.offset + iocb_nbytes(last) == sorted[i]-u.c.offset) {
+iosched-merges++;
+}
+last = sorted[i];
+}
+}
+
+#endif /* IOSCHED_H */
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index f67fdb7..75cb0f2 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -22,6 +22,7 @@
 #include hw/dataplane/event-poll.h
 #include hw/dataplane/vring.h
 #include hw/dataplane/ioq.h
+#include hw/dataplane/iosched.h
 #include kvm.h
 
 enum {
@@ -57,6 +58,7 @@ typedef struct {
 EventHandler notify_handler;/* virtqueue notify handler */
 
 IOQueue ioqueue;/* Linux AIO queue (should really be per 
dataplane thread) */
+IOSched iosched;/* I/O scheduler */
 VirtIOBlockRequest requests[REQ_MAX]; /* pool of requests, managed by the 
queue */
 } VirtIOBlock;
 
@@ -249,6 +251,8 @@ static bool handle_notify(EventHandler *handler)
 }
 }
 
+iosched(s-iosched, s-ioqueue.queue, s-ioqueue.queue_idx);
+
 /* Submit requests, if any */
 int rc = ioq_submit(s-ioqueue);
 if (unlikely(rc  0)) {
@@ -289,6 +293,7 @@ static void data_plane_start(VirtIOBlock *s)
 {
 int i;
 
+iosched_init(s-iosched);
 vring_setup(s-vring, s-vdev, 0);
 
 /* Set up guest notifier (irq) */
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 25/27] msix: fix irqchip breakage in msix_try_notify_from_thread()

2012-07-18 Thread Stefan Hajnoczi
Commit bd8b215bce453706c3951460cc7e6627ccb90314 removed #ifdef
KVM_CAP_IRQCHIP from hw/msix.c after it turned out linux/kvm.h is not
included since msix.o is built in libhw64/.  Do the same for
msix_try_notify_from_thread() since we do not have access to
linux/kvm.h here and hence KVM_CAP_IRQCHIP is not defined.

Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/msix.c |2 --
 1 file changed, 2 deletions(-)

diff --git a/hw/msix.c b/hw/msix.c
index 3308604..0ed1013 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -511,12 +511,10 @@ bool msix_try_notify_from_thread(PCIDevice *dev, unsigned 
vector)
 if (unlikely(msix_is_masked(dev, vector))) {
 return false;
 }
-#ifdef KVM_CAP_IRQCHIP
 if (likely(kvm_enabled()  kvm_irqchip_in_kernel())) {
 kvm_set_irq(dev-msix_irq_entries[vector].gsi, 1, NULL);
 return true;
 }
-#endif
 return false;
 }
 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 27/27] virtio-blk: add EVENT_IDX support to dataplane

2012-07-18 Thread Stefan Hajnoczi
This patch adds support for the VIRTIO_RING_F_EVENT_IDX feature for
interrupt mitigation.  virtio-blk doesn't do anything fancy with it so
we may not see a performance improvement.  This patch will allow newer
guest kernels to run successfully.

Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/dataplane/vring.h |   65 --
 hw/virtio-blk.c  |   16 ++---
 2 files changed, 60 insertions(+), 21 deletions(-)

diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h
index bbf8c86..d939a22 100644
--- a/hw/dataplane/vring.h
+++ b/hw/dataplane/vring.h
@@ -14,6 +14,8 @@ typedef struct {
 struct vring vr;/* virtqueue vring mapped to host memory */
 __u16 last_avail_idx;   /* last processed avail ring index */
 __u16 last_used_idx;/* last processed used ring index */
+uint16_t signalled_used;/* EVENT_IDX state */
+bool signalled_used_valid;
 } Vring;
 
 static inline unsigned int vring_get_num(Vring *vring)
@@ -63,6 +65,8 @@ static void vring_setup(Vring *vring, VirtIODevice *vdev, int 
n)
 
 vring-last_avail_idx = 0;
 vring-last_used_idx = 0;
+vring-signalled_used = 0;
+vring-signalled_used_valid = false;
 
 fprintf(stderr, vring physical=%#lx desc=%p avail=%p used=%p\n,
 (unsigned long)virtio_queue_get_ring_addr(vdev, n),
@@ -75,21 +79,48 @@ static bool vring_more_avail(Vring *vring)
return vring-vr.avail-idx != vring-last_avail_idx;
 }
 
-/* Hint to disable guest-host notifies */
-static void vring_disable_cb(Vring *vring)
+/* Toggle guest-host notifies */
+static void vring_set_notification(VirtIODevice *vdev, Vring *vring, bool 
enable)
 {
-vring-vr.used-flags |= VRING_USED_F_NO_NOTIFY;
+if (vdev-guest_features  (1  VIRTIO_RING_F_EVENT_IDX)) {
+if (enable) {
+vring_avail_event(vring-vr) = vring-vr.avail-idx;
+}
+} else if (enable) {
+vring-vr.used-flags = ~VRING_USED_F_NO_NOTIFY;
+} else {
+vring-vr.used-flags |= VRING_USED_F_NO_NOTIFY;
+}
 }
 
-/* Re-enable guest-host notifies
- *
- * Returns false if there are more descriptors in the ring.
- */
-static bool vring_enable_cb(Vring *vring)
+/* This is stolen from linux/drivers/vhost/vhost.c:vhost_notify() */
+static bool vring_should_notify(VirtIODevice *vdev, Vring *vring)
 {
-vring-vr.used-flags = ~VRING_USED_F_NO_NOTIFY;
-__sync_synchronize(); /* mb() */
-return !vring_more_avail(vring);
+uint16_t old, new;
+bool v;
+/* Flush out used index updates. This is paired
+ * with the barrier that the Guest executes when enabling
+ * interrupts. */
+__sync_synchronize(); /* smp_mb() */
+
+if ((vdev-guest_features  VIRTIO_F_NOTIFY_ON_EMPTY) 
+unlikely(vring-vr.avail-idx == vring-last_avail_idx)) {
+return true;
+}
+
+if (!(vdev-guest_features  VIRTIO_RING_F_EVENT_IDX)) {
+return !(vring-vr.avail-flags  VRING_AVAIL_F_NO_INTERRUPT);
+}
+old = vring-signalled_used;
+v = vring-signalled_used_valid;
+new = vring-signalled_used = vring-last_used_idx;
+vring-signalled_used_valid = true;
+
+if (unlikely(!v)) {
+return true;
+}
+
+return vring_need_event(vring_used_event(vring-vr), new, old);
 }
 
 /* This is stolen from linux-2.6/drivers/vhost/vhost.c. */
@@ -178,7 +209,7 @@ static bool get_indirect(Vring *vring,
  *
  * Stolen from linux-2.6/drivers/vhost/vhost.c.
  */
-static int vring_pop(Vring *vring,
+static int vring_pop(VirtIODevice *vdev, Vring *vring,
  struct iovec iov[], struct iovec *iov_end,
  unsigned int *out_num, unsigned int *in_num)
 {
@@ -214,6 +245,10 @@ static int vring_pop(Vring *vring,
exit(1);
}
 
+   if (vdev-guest_features  (1  VIRTIO_RING_F_EVENT_IDX)) {
+   vring_avail_event(vring-vr) = vring-vr.avail-idx;
+   }
+
/* When we start there are none of either input nor output. */
*out_num = *in_num = 0;
 
@@ -279,6 +314,7 @@ static int vring_pop(Vring *vring,
 static void vring_push(Vring *vring, unsigned int head, int len)
 {
struct vring_used_elem *used;
+   uint16_t new;
 
/* The virtqueue contains a ring of used buffers.  Get a pointer to the
 * next entry in that used ring. */
@@ -289,7 +325,10 @@ static void vring_push(Vring *vring, unsigned int head, 
int len)
/* Make sure buffer is written before we update index. */
__sync_synchronize(); /* smp_wmb() */
 
-vring-vr.used-idx = ++vring-last_used_idx;
+   new = vring-vr.used-idx = ++vring-last_used_idx;
+   if (unlikely((int16_t)(new - vring-signalled_used)  (uint16_t)1)) {
+   vring-signalled_used_valid = false;
+   }
 }
 
 #endif /* VRING_H */
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index cff2298..a3e3d8c 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -96,11 +96,9 

[RFC v9 24/27] virtio-blk: fix incorrect length

2012-07-18 Thread Stefan Hajnoczi
Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/virtio-blk.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 8734029..cff2298 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -131,7 +131,7 @@ static void complete_one_request(VirtIOBlockRequest *req, 
VirtIOBlock *s, ssize_
  * written to, but for virtio-blk it seems to be the number of bytes
  * transferred plus the status bytes.
  */
-vring_push(s-vring, req-head, len + sizeof req-status);
+vring_push(s-vring, req-head, len + sizeof(*req-status));
 }
 
 static bool is_request_merged(VirtIOBlockRequest *req)
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 26/27] msix: use upstream kvm_irqchip_set_irq()

2012-07-18 Thread Stefan Hajnoczi
Commit 9507e305ec54062fccc88fcf6fccf1898a7e7141 changed the
kvm_set_irq() function to kvm_irqchip_set_irq().

Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/msix.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/msix.c b/hw/msix.c
index 0ed1013..373017a 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -512,7 +512,7 @@ bool msix_try_notify_from_thread(PCIDevice *dev, unsigned 
vector)
 return false;
 }
 if (likely(kvm_enabled()  kvm_irqchip_in_kernel())) {
-kvm_set_irq(dev-msix_irq_entries[vector].gsi, 1, NULL);
+kvm_irqchip_set_irq(kvm_state, dev-msix_irq_entries[vector].gsi, 1);
 return true;
 }
 return false;
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 21/27] virtio-blk: Add basic request merging

2012-07-18 Thread Stefan Hajnoczi
This commit adds an I/O scheduler that sorts requests and merges
adjacent requests if they have the same operation type (read/write).
The code is ugly and not very well factored but it does merge
successfully.
---
 hw/dataplane/ioq.h |3 +-
 hw/dataplane/iosched.h |   51 +-
 hw/dataplane/vring.h   |4 +--
 hw/virtio-blk.c|   93 +++-
 4 files changed, 122 insertions(+), 29 deletions(-)

diff --git a/hw/dataplane/ioq.h b/hw/dataplane/ioq.h
index d1545d6..72e5fd6 100644
--- a/hw/dataplane/ioq.h
+++ b/hw/dataplane/ioq.h
@@ -96,7 +96,7 @@ static int ioq_submit(IOQueue *ioq)
 int rc = io_submit(ioq-io_ctx, ioq-queue_idx, ioq-queue);
 if (unlikely(rc  0)) {
 unsigned int i;
-fprintf(stderr, io_submit io_ctx=%#lx nr=%d iovecs=%p\n, 
(uint64_t)ioq-io_ctx, ioq-queue_idx, ioq-queue);
+fprintf(stderr, io_submit failed io_ctx=%#lx nr=%d iovecs=%p 
rc=%d\n, (uint64_t)ioq-io_ctx, ioq-queue_idx, ioq-queue, rc);
 for (i = 0; i  ioq-queue_idx; i++) {
 fprintf(stderr, [%u] type=%#x fd=%d\n, i, 
ioq-queue[i]-aio_lio_opcode, ioq-queue[i]-aio_fildes);
 }
@@ -121,7 +121,6 @@ static int ioq_run_completion(IOQueue *ioq, 
IOQueueCompletion *completion, void
 ssize_t ret = ((uint64_t)events[i].res2  32) | events[i].res;
 
 completion(events[i].obj, ret, opaque);
-ioq_put_iocb(ioq, events[i].obj);
 }
 return nevents;
 }
diff --git a/hw/dataplane/iosched.h b/hw/dataplane/iosched.h
index 12ebccc..39da73c 100644
--- a/hw/dataplane/iosched.h
+++ b/hw/dataplane/iosched.h
@@ -9,6 +9,8 @@ typedef struct {
 unsigned long sched_calls;
 } IOSched;
 
+typedef void MergeFunc(struct iocb *a, struct iocb *b);
+
 static int iocb_cmp(const void *a, const void *b)
 {
 const struct iocb *iocb_a = a;
@@ -29,10 +31,10 @@ static int iocb_cmp(const void *a, const void *b)
 
 static size_t iocb_nbytes(struct iocb *iocb)
 {
-struct iovec *iov = iocb-u.c.buf;
+const struct iovec *iov = iocb-u.v.vec;
 size_t nbytes = 0;
 size_t i;
-for (i = 0; i  iocb-u.c.nbytes; i++) {
+for (i = 0; i  iocb-u.v.nr; i++) {
 nbytes += iov-iov_len;
 iov++;
 }
@@ -44,35 +46,52 @@ static void iosched_init(IOSched *iosched)
 memset(iosched, 0, sizeof *iosched);
 }
 
-static void iosched_print_stats(IOSched *iosched)
+static __attribute__((unused)) void iosched_print_stats(IOSched *iosched)
 {
 fprintf(stderr, iocbs = %lu merges = %lu sched_calls = %lu\n,
 iosched-iocbs, iosched-merges, iosched-sched_calls);
 memset(iosched, 0, sizeof *iosched);
 }
 
-static void iosched(IOSched *iosched, struct iocb *unsorted[], unsigned int 
count)
+static void iosched(IOSched *iosched, struct iocb *unsorted[], unsigned int 
*count, MergeFunc merge_func)
 {
-struct iocb *sorted[count];
-struct iocb *last;
-unsigned int i;
+struct iocb *sorted[*count];
+unsigned int merges = 0;
+unsigned int i, j;
 
+/*
 if ((++iosched-sched_calls % 1000) == 0) {
 iosched_print_stats(iosched);
 }
+*/
+
+if (!*count) {
+return;
+}
 
 memcpy(sorted, unsorted, sizeof sorted);
-qsort(sorted, count, sizeof sorted[0], iocb_cmp);
-
-iosched-iocbs += count;
-last = sorted[0];
-for (i = 1; i  count; i++) {
-if (last-aio_lio_opcode == sorted[i]-aio_lio_opcode 
-last-u.c.offset + iocb_nbytes(last) == sorted[i]-u.c.offset) {
-iosched-merges++;
+qsort(sorted, *count, sizeof sorted[0], iocb_cmp);
+
+unsorted[0] = sorted[0];
+j = 1;
+for (i = 1; i  *count; i++) {
+struct iocb *last = sorted[i - 1];
+struct iocb *cur = sorted[i];
+
+if (last-aio_lio_opcode == cur-aio_lio_opcode 
+last-u.c.offset + iocb_nbytes(last) == cur-u.c.offset) {
+merge_func(last, cur);
+merges++;
+
+unsorted[j - 1] = cur;
+} else {
+unsorted[j++] = cur;
 }
-last = sorted[i];
 }
+
+iosched-merges += merges;
+iosched-iocbs += *count;
+*count = j;
 }
 
 #endif /* IOSCHED_H */
diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h
index cdd4d4a..bbf8c86 100644
--- a/hw/dataplane/vring.h
+++ b/hw/dataplane/vring.h
@@ -29,7 +29,7 @@ static inline void *phys_to_host(Vring *vring, 
target_phys_addr_t phys)
 if (phys = 0x1) {
 phys -= 0x1 - 0xe000;
 } else if (phys = 0xe000) {
-fprintf(stderr, phys_to_host bad physical address in PCI range 
%#lx\n, phys);
+fprintf(stderr, phys_to_host bad physical address in PCI range 
%#lx\n, (unsigned long)phys);
 exit(1);
 }
 return vring-phys_mem_zero_host_ptr + phys;
@@ -65,7 +65,7 @@ static void vring_setup(Vring *vring, VirtIODevice *vdev, int 
n)
 vring-last_used_idx = 0;
 
 fprintf(stderr, vring physical=%#lx desc=%p avail=%p used=%p\n,
- 

[RFC v9 22/27] virtio-blk: Fix request merging

2012-07-18 Thread Stefan Hajnoczi
Khoa Huynh <k...@us.ibm.com> discovered that request merging is broken.
The merged iocb is not updated to reflect the total number of iovecs and
the offset is also outdated.

This patch fixes request merging.

Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/virtio-blk.c |   10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 9131a7a..51807b5 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -178,13 +178,17 @@ static void merge_request(struct iocb *iocb_a, struct 
iocb *iocb_b)
 req_a-len = iocb_nbytes(iocb_a);
 }
 
-iocb_b-u.v.vec = iovec;
-req_b-len = iocb_nbytes(iocb_b);
-req_b-next_merged = req_a;
 /*
 fprintf(stderr, merged %p (%u) and %p (%u), %u iovecs in total\n,
 req_a, iocb_a-u.v.nr, req_b, iocb_b-u.v.nr, iocb_a-u.v.nr + 
iocb_b-u.v.nr);
 */
+
+iocb_b-u.v.vec = iovec;
+iocb_b-u.v.nr += iocb_a-u.v.nr;
+iocb_b-u.v.offset = iocb_a-u.v.offset;
+
+req_b-len = iocb_nbytes(iocb_b);
+req_b-next_merged = req_a;
 }
 
 static void process_request(IOQueue *ioq, struct iovec iov[], unsigned int 
out_num, unsigned int in_num, unsigned int head)
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v9 19/27] virtio-blk: Disable guest-host notifies while processing vring

2012-07-18 Thread Stefan Hajnoczi
---
 hw/dataplane/vring.h |   28 +++-
 hw/virtio-blk.c  |   47 +++
 2 files changed, 58 insertions(+), 17 deletions(-)

diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h
index 44ef4a9..cdd4d4a 100644
--- a/hw/dataplane/vring.h
+++ b/hw/dataplane/vring.h
@@ -69,11 +69,29 @@ static void vring_setup(Vring *vring, VirtIODevice *vdev, 
int n)
 vring-vr.desc, vring-vr.avail, vring-vr.used);
 }
 
+/* Are there more descriptors available? */
 static bool vring_more_avail(Vring *vring)
 {
return vring-vr.avail-idx != vring-last_avail_idx;
 }
 
+/* Hint to disable guest-host notifies */
+static void vring_disable_cb(Vring *vring)
+{
+vring-vr.used-flags |= VRING_USED_F_NO_NOTIFY;
+}
+
+/* Re-enable guest-host notifies
+ *
+ * Returns false if there are more descriptors in the ring.
+ */
+static bool vring_enable_cb(Vring *vring)
+{
+vring-vr.used-flags = ~VRING_USED_F_NO_NOTIFY;
+__sync_synchronize(); /* mb() */
+return !vring_more_avail(vring);
+}
+
 /* This is stolen from linux-2.6/drivers/vhost/vhost.c. */
 static bool get_indirect(Vring *vring,
struct iovec iov[], struct iovec *iov_end,
@@ -160,7 +178,7 @@ static bool get_indirect(Vring *vring,
  *
  * Stolen from linux-2.6/drivers/vhost/vhost.c.
  */
-static unsigned int vring_pop(Vring *vring,
+static int vring_pop(Vring *vring,
  struct iovec iov[], struct iovec *iov_end,
  unsigned int *out_num, unsigned int *in_num)
 {
@@ -178,9 +196,9 @@ static unsigned int vring_pop(Vring *vring,
exit(1);
}
 
-   /* If there's nothing new since last we looked, return invalid. */
+   /* If there's nothing new since last we looked. */
if (avail_idx == last_avail_idx)
-   return num;
+   return -EAGAIN;
 
/* Only get avail ring entries after they have been exposed by guest. */
__sync_synchronize(); /* smp_rmb() */
@@ -215,7 +233,7 @@ static unsigned int vring_pop(Vring *vring,
 desc = vring-vr.desc[i];
if (desc.flags  VRING_DESC_F_INDIRECT) {
if (!get_indirect(vring, iov, iov_end, out_num, in_num, 
desc)) {
-return num; /* not enough iovecs, stop for now */
+return -ENOBUFS; /* not enough iovecs, stop for now */
 }
 continue;
}
@@ -225,7 +243,7 @@ static unsigned int vring_pop(Vring *vring,
  * with the current set.
  */
 if (iov = iov_end) {
-return num;
+return -ENOBUFS;
 }
 
 iov-iov_base = phys_to_host(vring, desc.addr);
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index efeffa0..f67fdb7 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -202,7 +202,8 @@ static bool handle_notify(EventHandler *handler)
  * accept more I/O.  This is not implemented yet.
  */
 struct iovec iovec[VRING_MAX];
-struct iovec *iov, *end = iovec[VRING_MAX];
+struct iovec *end = iovec[VRING_MAX];
+struct iovec *iov = iovec;
 
 /* When a request is read from the vring, the index of the first descriptor
  * (aka head) is returned so that the completed request can be pushed onto
@@ -211,19 +212,41 @@ static bool handle_notify(EventHandler *handler)
  * The number of hypervisor read-only iovecs is out_num.  The number of
  * hypervisor write-only iovecs is in_num.
  */
-unsigned int head, out_num = 0, in_num = 0;
+int head;
+unsigned int out_num = 0, in_num = 0;
 
-for (iov = iovec; ; iov += out_num + in_num) {
-head = vring_pop(s-vring, iov, end, out_num, in_num);
-if (head = vring_get_num(s-vring)) {
-break; /* no more requests */
-}
+for (;;) {
+/* Disable guest-host notifies to avoid unnecessary vmexits */
+vring_disable_cb(s-vring);
+
+for (;;) {
+head = vring_pop(s-vring, iov, end, out_num, in_num);
+if (head  0) {
+break; /* no more requests */
+}
 
-/*
-fprintf(stderr, out_num=%u in_num=%u head=%u\n, out_num, in_num, 
head);
-*/
+/*
+fprintf(stderr, out_num=%u in_num=%u head=%d\n, out_num, in_num, 
head);
+*/
 
-process_request(s-ioqueue, iov, out_num, in_num, head);
+process_request(s-ioqueue, iov, out_num, in_num, head);
+iov += out_num + in_num;
+}
+
+if (likely(head == -EAGAIN)) { /* vring emptied */
+/* Re-enable guest-host notifies and stop processing the vring.
+ * But if the guest has snuck in more descriptors, keep processing.
+ */
+if (likely(vring_enable_cb(s-vring))) {
+break;
+}
+} else { /* head == -ENOBUFS, cannot continue since iovecs[] is 
depleted */
+/* Since 

[RFC v9 18/27] virtio-blk: Call ioctl() directly instead of irqfd

2012-07-18 Thread Stefan Hajnoczi
Optimize for the MSI-X enabled and vector unmasked case where it is
possible to issue the KVM ioctl() directly instead of using irqfd.

This patch introduces a new virtio binding function which tries to
notify in a thread-safe way.  If this is not possible, the function
returns false.  Virtio block then knows to use irqfd as a fallback.
---
 hw/msix.c   |   17 +
 hw/msix.h   |1 +
 hw/virtio-blk.c |   10 --
 hw/virtio-pci.c |8 
 hw/virtio.c |9 +
 hw/virtio.h |3 +++
 6 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/hw/msix.c b/hw/msix.c
index 7955221..3308604 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -503,6 +503,23 @@ void msix_notify(PCIDevice *dev, unsigned vector)
 stl_le_phys(address, data);
 }
 
+bool msix_try_notify_from_thread(PCIDevice *dev, unsigned vector)
+{
+if (unlikely(vector = dev-msix_entries_nr || 
!dev-msix_entry_used[vector])) {
+return false;
+}
+if (unlikely(msix_is_masked(dev, vector))) {
+return false;
+}
+#ifdef KVM_CAP_IRQCHIP
+if (likely(kvm_enabled()  kvm_irqchip_in_kernel())) {
+kvm_set_irq(dev-msix_irq_entries[vector].gsi, 1, NULL);
+return true;
+}
+#endif
+return false;
+}
+
 void msix_reset(PCIDevice *dev)
 {
 if (!(dev-cap_present  QEMU_PCI_CAP_MSIX))
diff --git a/hw/msix.h b/hw/msix.h
index a8661e1..99fb08f 100644
--- a/hw/msix.h
+++ b/hw/msix.h
@@ -26,6 +26,7 @@ void msix_vector_unuse(PCIDevice *dev, unsigned vector);
 void msix_unuse_all_vectors(PCIDevice *dev);
 
 void msix_notify(PCIDevice *dev, unsigned vector);
+bool msix_try_notify_from_thread(PCIDevice *dev, unsigned vector);
 
 void msix_reset(PCIDevice *dev);
 
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index bdff68a..efeffa0 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -82,6 +82,12 @@ static void virtio_blk_notify_guest(VirtIOBlock *s)
 !(s-vdev.guest_features  (1  VIRTIO_F_NOTIFY_ON_EMPTY
return;
 
+/* Try to issue the ioctl() directly for speed */
+if (likely(virtio_queue_try_notify_from_thread(s-vq))) {
+return;
+}
+
+/* If the fast path didn't work, use irqfd */
 event_notifier_set(virtio_queue_get_guest_notifier(s-vq));
 }
 
@@ -263,7 +269,7 @@ static void data_plane_start(VirtIOBlock *s)
 vring_setup(s-vring, s-vdev, 0);
 
 /* Set up guest notifier (irq) */
-if (s-vdev.binding-set_guest_notifier(s-vdev.binding_opaque, 0, true) 
!= 0) {
+if (s-vdev.binding-set_guest_notifiers(s-vdev.binding_opaque, true) != 
0) {
 fprintf(stderr, virtio-blk failed to set guest notifier, ensure 
-enable-kvm is set\n);
 exit(1);
 }
@@ -315,7 +321,7 @@ static void data_plane_stop(VirtIOBlock *s)
 event_poll_cleanup(s-event_poll);
 
 /* Clean up guest notifier (irq) */
-s-vdev.binding-set_guest_notifier(s-vdev.binding_opaque, 0, false);
+s-vdev.binding-set_guest_notifiers(s-vdev.binding_opaque, false);
 }
 
 static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t val)
diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index f1e13af..03512b3 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -106,6 +106,13 @@ static void virtio_pci_notify(void *opaque, uint16_t 
vector)
 qemu_set_irq(proxy-pci_dev.irq[0], proxy-vdev-isr  1);
 }
 
+static bool virtio_pci_try_notify_from_thread(void *opaque, uint16_t vector)
+{
+VirtIOPCIProxy *proxy = opaque;
+return msix_enabled(proxy-pci_dev) 
+   msix_try_notify_from_thread(proxy-pci_dev, vector);
+}
+
 static void virtio_pci_save_config(void * opaque, QEMUFile *f)
 {
 VirtIOPCIProxy *proxy = opaque;
@@ -707,6 +714,7 @@ static void virtio_pci_vmstate_change(void *opaque, bool 
running)
 
 static const VirtIOBindings virtio_pci_bindings = {
 .notify = virtio_pci_notify,
+.try_notify_from_thread = virtio_pci_try_notify_from_thread,
 .save_config = virtio_pci_save_config,
 .load_config = virtio_pci_load_config,
 .save_queue = virtio_pci_save_queue,
diff --git a/hw/virtio.c b/hw/virtio.c
index 064aecf..a1d1a8a 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -689,6 +689,15 @@ static inline int vring_need_event(uint16_t event, 
uint16_t new, uint16_t old)
return (uint16_t)(new - event - 1)  (uint16_t)(new - old);
 }
 
+bool virtio_queue_try_notify_from_thread(VirtQueue *vq)
+{
+VirtIODevice *vdev = vq-vdev;
+if (likely(vdev-binding-try_notify_from_thread)) {
+return vdev-binding-try_notify_from_thread(vdev-binding_opaque, 
vq-vector);
+}
+return false;
+}
+
 static bool vring_notify(VirtIODevice *vdev, VirtQueue *vq)
 {
 uint16_t old, new;
diff --git a/hw/virtio.h b/hw/virtio.h
index 400c092..2cdf2be 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -93,6 +93,7 @@ typedef struct VirtQueueElement
 
 typedef struct {
 void (*notify)(void * opaque, uint16_t vector);
+bool (*try_notify_from_thread)(void * opaque, uint16_t vector);
 void 

[RFC v9 10/27] virtio-blk: Stop data plane thread cleanly

2012-07-18 Thread Stefan Hajnoczi
Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
---
 hw/dataplane/event-poll.h |   79 ---
 hw/dataplane/ioq.h|   65 +--
 hw/dataplane/vring.h  |6 +-
 hw/virtio-blk.c   |  154 +++--
 4 files changed, 243 insertions(+), 61 deletions(-)

diff --git a/hw/dataplane/event-poll.h b/hw/dataplane/event-poll.h
index f38e969..acd85e1 100644
--- a/hw/dataplane/event-poll.h
+++ b/hw/dataplane/event-poll.h
@@ -5,17 +5,40 @@
 #include event_notifier.h
 
 typedef struct EventHandler EventHandler;
-typedef void EventCallback(EventHandler *handler);
+typedef bool EventCallback(EventHandler *handler);
 struct EventHandler
 {
-EventNotifier *notifier;/* eventfd */
-EventCallback *callback;/* callback function */
+EventNotifier *notifier;/* eventfd */
+EventCallback *callback;/* callback function */
 };
 
 typedef struct {
-int epoll_fd;   /* epoll(2) file descriptor */
+int epoll_fd;   /* epoll(2) file descriptor */
+EventNotifier stop_notifier;/* stop poll notifier */
+EventHandler stop_handler;  /* stop poll handler */
 } EventPoll;
 
+/* Add an event notifier and its callback for polling */
+static void event_poll_add(EventPoll *poll, EventHandler *handler, 
EventNotifier *notifier, EventCallback *callback)
+{
+struct epoll_event event = {
+.events = EPOLLIN,
+.data.ptr = handler,
+};
+handler-notifier = notifier;
+handler-callback = callback;
+if (epoll_ctl(poll-epoll_fd, EPOLL_CTL_ADD, 
event_notifier_get_fd(notifier), event) != 0) {
+fprintf(stderr, failed to add event handler to epoll: %m\n);
+exit(1);
+}
+}
+
+/* Event callback for stopping the event_poll_run() loop */
+static bool handle_stop(EventHandler *handler)
+{
+return false; /* stop event loop */
+}
+
 static void event_poll_init(EventPoll *poll)
 {
 /* Create epoll file descriptor */
@@ -24,35 +47,29 @@ static void event_poll_init(EventPoll *poll)
 fprintf(stderr, epoll_create1 failed: %m\n);
 exit(1);
 }
+
+/* Set up stop notifier */
+if (event_notifier_init(poll-stop_notifier, 0)  0) {
+fprintf(stderr, failed to init stop notifier\n);
+exit(1);
+}
+event_poll_add(poll, poll-stop_handler,
+   poll-stop_notifier, handle_stop);
 }
 
 static void event_poll_cleanup(EventPoll *poll)
 {
+event_notifier_cleanup(poll-stop_notifier);
 close(poll-epoll_fd);
 poll-epoll_fd = -1;
 }
 
-/* Add an event notifier and its callback for polling */
-static void event_poll_add(EventPoll *poll, EventHandler *handler, 
EventNotifier *notifier, EventCallback *callback)
-{
-struct epoll_event event = {
-.events = EPOLLIN,
-.data.ptr = handler,
-};
-handler-notifier = notifier;
-handler-callback = callback;
-if (epoll_ctl(poll-epoll_fd, EPOLL_CTL_ADD, 
event_notifier_get_fd(notifier), event) != 0) {
-fprintf(stderr, failed to add event handler to epoll: %m\n);
-exit(1);
-}
-}
-
 /* Block until the next event and invoke its callback
  *
  * Signals must be masked, EINTR should never happen.  This is true for QEMU
  * threads.
  */
-static void event_poll(EventPoll *poll)
+static bool event_poll(EventPoll *poll)
 {
 EventHandler *handler;
 struct epoll_event event;
@@ -73,7 +90,27 @@ static void event_poll(EventPoll *poll)
 event_notifier_test_and_clear(handler-notifier);
 
 /* Handle the event */
-handler-callback(handler);
+return handler-callback(handler);
+}
+
+static void event_poll_run(EventPoll *poll)
+{
+while (event_poll(poll)) {
+/* do nothing */
+}
+}
+
+/* Stop the event_poll_run() loop
+ *
+ * This function can be used from another thread.
+ */
+static void event_poll_stop(EventPoll *poll)
+{
+uint64_t dummy = 1;
+int eventfd = event_notifier_get_fd(poll-stop_notifier);
+ssize_t unused __attribute__((unused));
+
+unused = write(eventfd, dummy, sizeof dummy);
 }
 
 #endif /* EVENT_POLL_H */
diff --git a/hw/dataplane/ioq.h b/hw/dataplane/ioq.h
index 26ca307..7200e87 100644
--- a/hw/dataplane/ioq.h
+++ b/hw/dataplane/ioq.h
@@ -3,10 +3,10 @@
 
 typedef struct {
 int fd; /* file descriptor */
-unsigned int maxreqs;   /* max length of freelist and queue */
+unsigned int max_reqs;   /* max length of freelist and queue */
 
 io_context_t io_ctx;/* Linux AIO context */
-EventNotifier notifier; /* Linux AIO eventfd */
+EventNotifier io_notifier;  /* Linux AIO eventfd */
 
 /* Requests can complete in any order so a free list is necessary to manage
  * available iocbs.
@@ -19,25 +19,28 @@ typedef struct {
 unsigned int queue_idx;
 } IOQueue;
 
-static void ioq_init(IOQueue *ioq, int fd, unsigned int maxreqs)
+static void 

[RFC v9 14/27] virtio-blk: Use pthreads instead of qemu-thread

2012-07-18 Thread Stefan Hajnoczi
Using qemu-thread.h seemed like a nice idea but it has two limitations:

1. QEMU needs to be built with --enable-io-thread
2. qemu-kvm doesn't build with --enable-io-thread

For now just copy the pthread_create() code straight into virtio-blk.c.

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
 hw/virtio-blk.c |   16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 7ae3c56..1616be5 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -11,6 +11,7 @@
  *
  */
 
+#include pthread.h
 #include libaio.h
 #include qemu-common.h
 #include block_int.h
@@ -47,7 +48,7 @@ typedef struct {
 DeviceState *qdev;
 
 bool data_plane_started;
-QemuThread data_plane_thread;
+pthread_t data_plane_thread;
 
 Vring vring;/* virtqueue vring */
 
@@ -268,7 +269,16 @@ static void data_plane_start(VirtIOBlock *s)
 }
 event_poll_add(s-event_poll, s-io_handler, 
ioq_get_notifier(s-ioqueue), handle_io);
 
-qemu_thread_create(s-data_plane_thread, data_plane_thread, s, 
QEMU_THREAD_JOINABLE);
+/* Create data plane thread */
+sigset_t set, oldset;
+sigfillset(set);
+pthread_sigmask(SIG_SETMASK, set, oldset);
+if (pthread_create(s-data_plane_thread, NULL, data_plane_thread, s) != 0)
+{
+fprintf(stderr, pthread create failed: %m\n);
+exit(1);
+}
+pthread_sigmask(SIG_SETMASK, oldset, NULL);
 
 s-data_plane_started = true;
 }
@@ -279,7 +289,7 @@ static void data_plane_stop(VirtIOBlock *s)
 
 /* Tell data plane thread to stop and then wait for it to return */
 event_poll_stop(s-event_poll);
-pthread_join(s-data_plane_thread.thread, NULL);
+pthread_join(s-data_plane_thread, NULL);
 
 ioq_cleanup(s-ioqueue);
 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v2 0/4] tcm_vhost+cmwq fabric driver code for-3.6

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 08:42:21AM -0500, Anthony Liguori wrote:
 On 07/17/2012 04:50 PM, Nicholas A. Bellinger wrote:
 On Tue, 2012-07-17 at 13:55 -0500, Anthony Liguori wrote:
 On 07/17/2012 10:05 AM, Michael S. Tsirkin wrote:
 On Wed, Jul 11, 2012 at 09:15:00PM +, Nicholas A. Bellinger wrote:
 
 SNIP
 
 
 It still seems not 100% clear whether this driver will have major
 userspace using it. And if not, it would be very hard to support a driver
 when recent userspace does not use it in the end.
 
 I don't think this is a good reason to exclude something from the kernel.
 However, there are good reasons why this doesn't make sense for something 
 like
 QEMU--specifically because we have a large number of features in our block 
 layer
 that tcm_vhost would bypass.
 
 
 I can definitely appreciate your concern here as the QEMU maintainer.
 
 But perhaps it makes sense for something like native kvm tool.  And if it 
 did go
 into the kernel, we would certainly support it in QEMU.
 
 
 ...
 
 But I do think the kernel should carefully consider whether it wants to 
 support
 an interface like this.  This is an extremely complicated ABI with a lot of 
 subtle
 details around state and compatibility.
 
 Are you absolutely confident that you can support a userspace application 
 that
 expects to get exactly the same response from all possible commands in 20 
 kernel
 versions from now?  Virtualization requires absolutely precise 
 compatibility in
 terms of bugs and features.  This is probably not something the TCM stack 
 has
 had to consider yet.
 
 
 We most certainly have thought about long term userspace compatibility
 with TCM.  Our userspace code (that's now available in all major
 distros) is completely forward-compatible with new fabric modules such
 as tcm_vhost.  No update required.
 
 I'm not sure we're talking about the same thing when we say compatibility.
 
 I'm not talking about the API.  I'm talking about the behavior of
 the commands that tcm_vhost supports.
 
 If you add support for a new command, you need to provide userspace
 a way to disable this command.  If you change what gets reported for
 VPD, you need to provide userspace a way to make VPD look like what
 it did in a previous version.
 
 Basically, you need to be able to make a TCM device behave 100% the
 same as it did in an older version of the kernel.
 
 This is unique to virtualization due to live migration.  If you
 migrate from a 3.6 kernel to a 3.8 kernel, you need to make sure
 that the 3.8 kernel's TCM device behaves exactly like the 3.6 kernel
 because the guest that is interacting with it does not realize that
 live migration happened.
 
 Yes, you can add knobs via configfs to control this behavior, but I
 think the question is, what's the plan for this?
 
 BTW, I think this is a good thing to cover in
 Documentation/vhost/tcm_vhost.txt.  I think that's probably the only
 change that's needed here.
 
 Regards,
 
 Anthony Liguori

I agree it's needed but it's not a requirement for merging IMHO.
As a first step we can disable live migration.

 
 Also, by virtue of the fact that we are using configfs + rtslib (python
 object library) on top, it's very easy to keep any type of compatibility
 logic around in python code.  With rtslib, we are able to hide configfs
 ABI changes from higher level apps.
 
 So far we've had a track record of 100% userspace ABI compatibility in
 mainline since .38, and I don't intend to merge a patch that breaks this
 any time soon.  But if that ever happens, apps using rtslib are not
 going to be affected.
 
 I think a good idea for 3.6 would be to make it depend on CONFIG_STAGING.
 Then we don't commit to an ABI.
 
 I think this is a good idea.  Even if it goes in, a really clear policy 
 would be
 needed wrt the userspace ABI.
 
 While tcm_vhost is probably more useful than vhost_blk, it's a much more 
 complex
 ABI to maintain.
 
 
 As far as I am concerned, the kernel API (eg: configfs directory layout)
 as it is now in sys/kernel/config/target/vhost/ is not going to change.
 It's based on the same drivers/target/target_core_fabric_configfs.c
 generic layout that we've had since .38.
 
 The basic functional fabric layout in configfs is identical (with fabric
 dependent WWPN naming of course) regardless of fabric driver, and by
 virtue of being generic it means we can add things like fabric dependent
 attributes + parameters in the future for existing fabrics without
 breaking userspace.
 
 So while I agree the ABI is more complex than vhost-blk, the logic in
 target_core_fabric_configfs.c is a basic ABI fabric definition that we
 are enforcing across all fabric modules in mainline for long term
 compatibility.
 
 --nab
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 08:47:23AM -0600, Alex Williamson wrote:
 On Wed, 2012-07-18 at 15:07 +0300, Michael S. Tsirkin wrote:
  On Wed, Jul 18, 2012 at 02:48:44PM +0300, Gleb Natapov wrote:
   On Wed, Jul 18, 2012 at 02:39:10PM +0300, Michael S. Tsirkin wrote:
On Wed, Jul 18, 2012 at 02:22:19PM +0300, Michael S. Tsirkin wrote:
   So as was discussed kvm_set_irq under spinlock is bad for 
   scalability
   with multiple VCPUs.  Why do we need a spinlock simply to 
   protect
   level_asserted?  Let's use an atomic test and set/test 
   and clear and the
   problem goes away.
   
  The sad reality is that for level interrupt we already 
  scan all vcpus
  under spinlock.
 
 Where?
 
ioapic
   
   $ grep kvm_for_each_vcpu virt/kvm/ioapic.c
   $
   
   ?
   
  
  Come on Michael. You can do better than grep and actually look at 
  what
  code does. The code that loops over all vcpus while delivering an 
  irq is
  in kvm_irq_delivery_to_apic(). Now grep for that.
 
 Hmm, I see, it's actually done for edge if injected from ioapic too,
 right?
 
 So set_irq does a linear scan, and for each matching CPU it calls
 kvm_irq_delivery_to_apic which is another scan?
 So it's actually N^2 worst case for a broadcast?

No it isn't, I misread the code.


Anyway, maybe not trivially but this looks fixable to me: we could drop
the ioapic lock before calling kvm_irq_delivery_to_apic.

   Maybe, maybe not. Just saying "let's drop the lock whenever we don't feel
   like holding one" does not cut it.
  
  One thing we do is set remote_irr if interrupt was injected.
  I agree these things are tricky.
  
  One other question:
  
  static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
  {
      union kvm_ioapic_redirect_entry *pent;
      int injected = -1;
  
      pent = &ioapic->redirtbl[idx];
  
      if (!pent->fields.mask) {
          injected = ioapic_deliver(ioapic, idx);
          if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
              pent->fields.remote_irr = 1;
      }
  
      return injected;
  }
  
  
  This if (injected) looks a bit strange since ioapic_deliver returns
  -1 if no matching destinations. Should be if (injected > 0)?
  
  
  
   Back to original point though current
   situation is that calling kvm_set_irq() under spinlock is not worse for
   scalability than calling it not under one.
  
  Yes. Still the specific use can just use an atomic flag,
  lock+bool is not needed, and we won't need to undo it later.
 
 
 Actually, no, replacing it with an atomic is racy.
 
 CPU0 (inject)                       CPU1 (EOI)
 atomic_cmpxchg(&asserted, 0, 1)
                                     atomic_cmpxchg(&asserted, 1, 0)
                                     kvm_set_irq(0)
 kvm_set_irq(1)
                                     eventfd_signal
 
 The interrupt is now stuck on until another interrupt is injected.
 

Well EOI somehow happened here before interrupt so it's a bug somewhere
else?


-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC v9 18/27] virtio-blk: Call ioctl() directly instead of irqfd

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 04:07:45PM +0100, Stefan Hajnoczi wrote:
 Optimize for the MSI-X enabled and vector unmasked case where it is
 possible to issue the KVM ioctl() directly instead of using irqfd.

Why? Is an ioctl faster?

 This patch introduces a new virtio binding function which tries to
 notify in a thread-safe way.  If this is not possible, the function
 returns false.  Virtio block then knows to use irqfd as a fallback.
 ---
  hw/msix.c   |   17 +
  hw/msix.h   |1 +
  hw/virtio-blk.c |   10 --
  hw/virtio-pci.c |8 
  hw/virtio.c |9 +
  hw/virtio.h |3 +++
  6 files changed, 46 insertions(+), 2 deletions(-)
 
 diff --git a/hw/msix.c b/hw/msix.c
 index 7955221..3308604 100644
 --- a/hw/msix.c
 +++ b/hw/msix.c
 @@ -503,6 +503,23 @@ void msix_notify(PCIDevice *dev, unsigned vector)
  stl_le_phys(address, data);
  }
  
 +bool msix_try_notify_from_thread(PCIDevice *dev, unsigned vector)
 +{
 +if (unlikely(vector = dev-msix_entries_nr || 
 !dev-msix_entry_used[vector])) {
 +return false;
 +}
 +if (unlikely(msix_is_masked(dev, vector))) {
 +return false;
 +}
 +#ifdef KVM_CAP_IRQCHIP
 +if (likely(kvm_enabled()  kvm_irqchip_in_kernel())) {
 +kvm_set_irq(dev-msix_irq_entries[vector].gsi, 1, NULL);
 +return true;
 +}
 +#endif
 +return false;
 +}
 +
  void msix_reset(PCIDevice *dev)
  {
  if (!(dev-cap_present  QEMU_PCI_CAP_MSIX))
 diff --git a/hw/msix.h b/hw/msix.h
 index a8661e1..99fb08f 100644
 --- a/hw/msix.h
 +++ b/hw/msix.h
 @@ -26,6 +26,7 @@ void msix_vector_unuse(PCIDevice *dev, unsigned vector);
  void msix_unuse_all_vectors(PCIDevice *dev);
  
  void msix_notify(PCIDevice *dev, unsigned vector);
 +bool msix_try_notify_from_thread(PCIDevice *dev, unsigned vector);
  
  void msix_reset(PCIDevice *dev);
  
 diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
 index bdff68a..efeffa0 100644
 --- a/hw/virtio-blk.c
 +++ b/hw/virtio-blk.c
 @@ -82,6 +82,12 @@ static void virtio_blk_notify_guest(VirtIOBlock *s)
  !(s-vdev.guest_features  (1  VIRTIO_F_NOTIFY_ON_EMPTY
   return;
  
 +/* Try to issue the ioctl() directly for speed */
 +if (likely(virtio_queue_try_notify_from_thread(s-vq))) {
 +return;
 +}
 +
 +/* If the fast path didn't work, use irqfd */
  event_notifier_set(virtio_queue_get_guest_notifier(s-vq));
  }
  
 @@ -263,7 +269,7 @@ static void data_plane_start(VirtIOBlock *s)
  vring_setup(s-vring, s-vdev, 0);
  
  /* Set up guest notifier (irq) */
 -if (s-vdev.binding-set_guest_notifier(s-vdev.binding_opaque, 0, true) 
 != 0) {
 +if (s-vdev.binding-set_guest_notifiers(s-vdev.binding_opaque, true) 
 != 0) {
  fprintf(stderr, virtio-blk failed to set guest notifier, ensure 
 -enable-kvm is set\n);
  exit(1);
  }
 @@ -315,7 +321,7 @@ static void data_plane_stop(VirtIOBlock *s)
  event_poll_cleanup(s-event_poll);
  
  /* Clean up guest notifier (irq) */
 -s-vdev.binding-set_guest_notifier(s-vdev.binding_opaque, 0, false);
 +s-vdev.binding-set_guest_notifiers(s-vdev.binding_opaque, false);
  }
  
  static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t val)
 diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
 index f1e13af..03512b3 100644
 --- a/hw/virtio-pci.c
 +++ b/hw/virtio-pci.c
 @@ -106,6 +106,13 @@ static void virtio_pci_notify(void *opaque, uint16_t 
 vector)
  qemu_set_irq(proxy-pci_dev.irq[0], proxy-vdev-isr  1);
  }
  
 +static bool virtio_pci_try_notify_from_thread(void *opaque, uint16_t vector)
 +{
 +VirtIOPCIProxy *proxy = opaque;
 +return msix_enabled(proxy-pci_dev) 
 +   msix_try_notify_from_thread(proxy-pci_dev, vector);
 +}
 +
  static void virtio_pci_save_config(void * opaque, QEMUFile *f)
  {
  VirtIOPCIProxy *proxy = opaque;
 @@ -707,6 +714,7 @@ static void virtio_pci_vmstate_change(void *opaque, bool 
 running)
  
  static const VirtIOBindings virtio_pci_bindings = {
  .notify = virtio_pci_notify,
 +.try_notify_from_thread = virtio_pci_try_notify_from_thread,
  .save_config = virtio_pci_save_config,
  .load_config = virtio_pci_load_config,
  .save_queue = virtio_pci_save_queue,
 diff --git a/hw/virtio.c b/hw/virtio.c
 index 064aecf..a1d1a8a 100644
 --- a/hw/virtio.c
 +++ b/hw/virtio.c
 @@ -689,6 +689,15 @@ static inline int vring_need_event(uint16_t event, 
 uint16_t new, uint16_t old)
   return (uint16_t)(new - event - 1)  (uint16_t)(new - old);
  }
  
 +bool virtio_queue_try_notify_from_thread(VirtQueue *vq)
 +{
 +VirtIODevice *vdev = vq-vdev;
 +if (likely(vdev-binding-try_notify_from_thread)) {
 +return vdev-binding-try_notify_from_thread(vdev-binding_opaque, 
 vq-vector);
 +}
 +return false;
 +}
 +
  static bool vring_notify(VirtIODevice *vdev, VirtQueue *vq)
  {
  uint16_t old, new;
 diff --git a/hw/virtio.h b/hw/virtio.h
 index 400c092..2cdf2be 100644
 --- 

Re: [RFC v9 00/27] virtio: virtio-blk data plane

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 04:07:27PM +0100, Stefan Hajnoczi wrote:
 This series implements a dedicated thread for virtio-blk processing using 
 Linux
 AIO for raw image files only.  It is based on qemu-kvm.git a0bc8c3 and 
 somewhat
 old but I wanted to share it on the list since it has been mentioned on 
 mailing
 lists and IRC recently.
 
 These patches can be used for benchmarking and discussion about how to improve
 block performance.  Paolo Bonzini has also worked in this area and might want
 to share his patches.
 
 The basic approach is:
 1. Each virtio-blk device has a thread dedicated to handling ioeventfd
signalling when the guest kicks the virtqueue.
 2. Requests are processed without going through the QEMU block layer using
Linux AIO directly.
 3. Completion interrupts are injected via ioctl from the dedicated thread.
 
 The series also contains request merging as a bdrv_aio_multiwrite() 
 equivalent.
 This was only to get a comparison against the QEMU block layer and I would 
 drop
 it for other types of analysis.
 
 The effect of this series is that O_DIRECT Linux AIO on raw files can bypass
 the QEMU global mutex and block layer.  This means higher performance.

Do you have any numbers at all?

 A cleaned up version of this approach could be added to QEMU as a raw O_DIRECT
 Linux AIO fast path.  Image file formats, protocols, and other block layer
 features are not supported by virtio-blk-data-plane.
 
 Git repo:
 http://repo.or.cz/w/qemu-kvm/stefanha.git/shortlog/refs/heads/virtio-blk-data-plane
 
 Stefan Hajnoczi (27):
   virtio-blk: Remove virtqueue request handling code
   virtio-blk: Set up host notifier for data plane
   virtio-blk: Data plane thread event loop
   virtio-blk: Map vring
   virtio-blk: Do cheapest possible memory mapping
   virtio-blk: Take PCI memory range into account
   virtio-blk: Put dataplane code into its own directory
   virtio-blk: Read requests from the vring
   virtio-blk: Add Linux AIO queue
   virtio-blk: Stop data plane thread cleanly
   virtio-blk: Indirect vring and flush support
   virtio-blk: Add workaround for BUG_ON() dependency in virtio_ring.h
   virtio-blk: Increase max requests for indirect vring
   virtio-blk: Use pthreads instead of qemu-thread
   notifier: Add a function to set the notifier
   virtio-blk: Kick data plane thread using event notifier set
   virtio-blk: Use guest notifier to raise interrupts
   virtio-blk: Call ioctl() directly instead of irqfd
   virtio-blk: Disable guest-host notifies while processing vring
   virtio-blk: Add ioscheduler to detect mergable requests
   virtio-blk: Add basic request merging
   virtio-blk: Fix request merging
   virtio-blk: Stub out SCSI commands
   virtio-blk: fix incorrect length
   msix: fix irqchip breakage in msix_try_notify_from_thread()
   msix: use upstream kvm_irqchip_set_irq()
   virtio-blk: add EVENT_IDX support to dataplane
 
  event_notifier.c  |7 +
  event_notifier.h  |1 +
  hw/dataplane/event-poll.h |  116 +++
  hw/dataplane/ioq.h|  128 
  hw/dataplane/iosched.h|   97 ++
  hw/dataplane/vring.h  |  334 
  hw/msix.c |   15 +
  hw/msix.h |1 +
  hw/virtio-blk.c   |  753 
 +
  hw/virtio-pci.c   |8 +
  hw/virtio.c   |9 +
  hw/virtio.h   |3 +
  12 files changed, 1074 insertions(+), 398 deletions(-)
  create mode 100644 hw/dataplane/event-poll.h
  create mode 100644 hw/dataplane/ioq.h
  create mode 100644 hw/dataplane/iosched.h
  create mode 100644 hw/dataplane/vring.h
 
 -- 
 1.7.10.4
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: buildbot failure in kvm on next-i386

2012-07-18 Thread Avi Kivity
On 07/18/2012 08:05 AM, k...@buildbot.b1-systems.de wrote:
 The Buildbot has detected a new failure on builder next-i386 while building 
 kvm.
 Full details are available at:
  http://buildbot.b1-systems.de/kvm/builders/next-i386/builds/590
 

It seems like the config does not include CONFIG_KVM_GUEST.  While it
luckily caught this build bug, I'd like to have it enabled in the future
to trap more kvm code build failures.


-- 
error compiling committee.c: too many arguments to function


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Alex Williamson
On Wed, 2012-07-18 at 18:38 +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 18, 2012 at 08:47:23AM -0600, Alex Williamson wrote:
  On Wed, 2012-07-18 at 15:07 +0300, Michael S. Tsirkin wrote:
   On Wed, Jul 18, 2012 at 02:48:44PM +0300, Gleb Natapov wrote:
On Wed, Jul 18, 2012 at 02:39:10PM +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 18, 2012 at 02:22:19PM +0300, Michael S. Tsirkin wrote:
So as was discussed kvm_set_irq under spinlock is bad 
for scalability
with multiple VCPUs.  Why do we need a spinlock simply 
to protect
level_asserted?  Let's use an atomic test and set/test 
and clear and the
problem goes away.

   The sad reality is that for level interrupt we already 
   scan all vcpus
   under spinlock.
  
  Where?
  
 ioapic

$ grep kvm_for_each_vcpu virt/kvm/ioapic.c
$

?

   
   Come on Michael. You can do better than grep and actually look at 
   what
   code does. The code that loops over all vcpus while delivering an 
   irq is
   in kvm_irq_delivery_to_apic(). Now grep for that.
  
  Hmm, I see, it's actually done for edge if injected from ioapic too,
  right?
  
  So set_irq does a linear scan, and for each matching CPU it calls
  kvm_irq_delivery_to_apic which is another scan?
  So it's actually N^2 worst case for a broadcast?
 
 No it isn't, I misread the code.
 
 
 Anyway, maybe not trivially but this looks fixable to me: we could 
 drop
 the ioapic lock before calling kvm_irq_delivery_to_apic.
 
Maybe, maybe not. Just saying "let's drop the lock whenever we don't feel
like holding one" does not cut it.
   
   One thing we do is set remote_irr if interrupt was injected.
   I agree these things are tricky.
   
   One other question:
   
   static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
   {
       union kvm_ioapic_redirect_entry *pent;
       int injected = -1;
   
       pent = &ioapic->redirtbl[idx];
   
       if (!pent->fields.mask) {
           injected = ioapic_deliver(ioapic, idx);
           if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
               pent->fields.remote_irr = 1;
       }
   
       return injected;
   }
   
   
   This if (injected) looks a bit strange since ioapic_deliver returns
   -1 if no matching destinations. Should be if (injected > 0)?
   
   
   
Back to original point though current
situation is that calling kvm_set_irq() under spinlock is not worse for
scalability than calling it not under one.
   
   Yes. Still the specific use can just use an atomic flag,
   lock+bool is not needed, and we won't need to undo it later.
  
  
  Actually, no, replacing it with an atomic is racy.
  
  CPU0 (inject)                       CPU1 (EOI)
  atomic_cmpxchg(&asserted, 0, 1)
                                      atomic_cmpxchg(&asserted, 1, 0)
                                      kvm_set_irq(0)
  kvm_set_irq(1)
                                      eventfd_signal
  
  The interrupt is now stuck on until another interrupt is injected.
  
 
 Well EOI somehow happened here before interrupt so it's a bug somewhere
 else?

Interrupts can be shared.  We also can't guarantee that the guest won't
write a bogus EOI to the ioapic.  The irq ack notifier doesn't filter on
irq source id... I'm not sure it can.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC v9 00/27] virtio: virtio-blk data plane

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 04:07:27PM +0100, Stefan Hajnoczi wrote:
 This series implements a dedicated thread for virtio-blk processing using 
 Linux
 AIO for raw image files only.  It is based on qemu-kvm.git a0bc8c3 and 
 somewhat
 old but I wanted to share it on the list since it has been mentioned on 
 mailing
 lists and IRC recently.

BTW are there any bugfixes here that upstream needs?
I could not tell.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v2 0/4] tcm_vhost+cmwq fabric driver code for-3.6

2012-07-18 Thread Christoph Hellwig
On Wed, Jul 18, 2012 at 08:42:21AM -0500, Anthony Liguori wrote:
 
 If you add support for a new command, you need to provide userspace
 a way to disable this command.  If you change what gets reported for
 VPD, you need to provide userspace a way to make VPD look like what
 it did in a previous version.
 
 Basically, you need to be able to make a TCM device behave 100% the
 same as it did in an older version of the kernel.
 
 This is unique to virtualization due to live migration.  If you
 migrate from a 3.6 kernel to a 3.8 kernel, you need to make sure
 that the 3.8 kernel's TCM device behaves exactly like the 3.6 kernel
 because the guest that is interacting with it does not realize that
 live migration happened.

I don't think these strict live migration rules apply to SCSI targets.

Real life storage systems get new features and different behaviour with
firmware upgrades all the time, and SCSI initiators deal with that just
fine.  I don't see any reason to be more picky just because we're
virtualized.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/4] kvm: Extend irqfd to support level interrupts

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 09:48:01AM -0600, Alex Williamson wrote:
 On Wed, 2012-07-18 at 18:38 +0300, Michael S. Tsirkin wrote:
  On Wed, Jul 18, 2012 at 08:47:23AM -0600, Alex Williamson wrote:
   On Wed, 2012-07-18 at 15:07 +0300, Michael S. Tsirkin wrote:
On Wed, Jul 18, 2012 at 02:48:44PM +0300, Gleb Natapov wrote:
 On Wed, Jul 18, 2012 at 02:39:10PM +0300, Michael S. Tsirkin wrote:
  On Wed, Jul 18, 2012 at 02:22:19PM +0300, Michael S. Tsirkin wrote:
 So as was discussed kvm_set_irq under spinlock is bad 
 for scalability
 with multiple VCPUs.  Why do we need a spinlock 
 simply to protect
 level_asserted?  Let's use an atomic test and 
 set/test and clear and the
 problem goes away.
 
The sad reality is that for level interrupt we already 
scan all vcpus
under spinlock.
   
   Where?
   
  ioapic
 
 $ grep kvm_for_each_vcpu virt/kvm/ioapic.c
 $
 
 ?
 

Come on Michael. You can do better than grep and actually look 
at what
code does. The code that loops over all vcpus while delivering 
an irq is
in kvm_irq_delivery_to_apic(). Now grep for that.
   
   Hmm, I see, it's actually done for edge if injected from ioapic 
   too,
   right?
   
   So set_irq does a linear scan, and for each matching CPU it calls
   kvm_irq_delivery_to_apic which is another scan?
   So it's actually N^2 worst case for a broadcast?
  
  No it isn't, I misread the code.
  
  
  Anyway, maybe not trivially but this looks fixable to me: we could 
  drop
  the ioapic lock before calling kvm_irq_delivery_to_apic.
  
 Maybe, maybe not. Just saying "let's drop the lock whenever we don't feel
 like holding one" does not cut it.

One thing we do is set remote_irr if interrupt was injected.
I agree these things are tricky.

One other question:

static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
{
union kvm_ioapic_redirect_entry *pent;
int injected = -1;

pent = &ioapic->redirtbl[idx];

if (!pent->fields.mask) {
injected = ioapic_deliver(ioapic, idx);
if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
pent->fields.remote_irr = 1;
}
}

return injected;
}


This if (injected) looks a bit strange since ioapic_deliver returns
-1 if no matching destinations. Should be if (injected > 0)?



 Back to original point though current
 situation is that calling kvm_set_irq() under spinlock is not worse 
 for
 scalability than calling it not under one.

Yes. Still the specific use can just use an atomic flag,
lock+bool is not needed, and we won't need to undo it later.
   
   
   Actually, no, replacing it with an atomic is racy.
   
   CPU0 (inject)                    CPU1 (EOI)
   atomic_cmpxchg(asserted, 0, 1)
                                    atomic_cmpxchg(asserted, 1, 0)
                                    kvm_set_irq(0)
   kvm_set_irq(1)
                                    eventfd_signal
   
   The interrupt is now stuck on until another interrupt is injected.
   
  
  Well EOI somehow happened here before interrupt so it's a bug somewhere
  else?
 
 Interrupts can be shared.  We also can't guarantee that the guest won't
 write a bogus EOI to the ioapic.  The irq ack notifier doesn't filter on
 irq source id... I'm not sure it can.

I guess if Avi OKs adding another kvm_set_irq under spinlock that's
the best we can do for now.

If not, maybe we can teach kvm_set_irq to return an indication
of the previous status. Specifically kvm_irq_line_state
could do test_and_set/test_and_clear and if already set/clear
we return 0 immediately.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v2 0/4] tcm_vhost+cmwq fabric driver code for-3.6

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 11:53:38AM -0400, Christoph Hellwig wrote:
 On Wed, Jul 18, 2012 at 08:42:21AM -0500, Anthony Liguori wrote:
  
  If you add support for a new command, you need to provide userspace
  a way to disable this command.  If you change what gets reported for
  VPD, you need to provide userspace a way to make VPD look like what
  it did in a previous version.
  
  Basically, you need to be able to make a TCM device behave 100% the
  same as it did in an older version of the kernel.
  
  This is unique to virtualization due to live migration.  If you
  migrate from a 3.6 kernel to a 3.8 kernel, you need to make sure
  that the 3.8 kernel's TCM device behaves exactly like the 3.6 kernel
  because the guest that is interacting with it does not realize that
  live migration happened.
 
 I don't think these strict live migration rules apply to SCSI targets.
 
 Real life storage systems get new features and different behaviour with
 firmware upgrades all the time, and SCSI initiators deal with that just
 fine.
  I don't see any reason to be more picky just because we're
 virtualized.


Presumably initiators are shut down for target firmware upgrades?
With virtualization your host can change without guest shutdown.
You can also *lose* commands when migrating to an older host.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v3 4/4] tcm_vhost: Initial merge for vhost level target fabric driver

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 12:59:32AM +, Nicholas A. Bellinger wrote:
 From: Nicholas Bellinger n...@linux-iscsi.org
 
 This patch adds the initial code for tcm_vhost, a Vhost level TCM
 fabric driver for virtio SCSI initiators into KVM guest.
 
 This code is currently up and running on v3.5-rc2 host+guest along
 with the virtio-scsi vdev-scan() patch to allow a proper
 scsi_scan_host() to occur once the tcm_vhost nexus has been established
 by the paravirtualized virtio-scsi client here:
 
 virtio-scsi: Add vdrv-scan for post VIRTIO_CONFIG_S_DRIVER_OK LUN scanning
 http://marc.info/?l=linux-scsi&m=134160609212542&w=2
 
 Using tcm_vhost requires Zhi's - Stefan's qemu vhost-scsi tree here:
 
 https://github.com/wuzhy/qemu/tree/vhost-scsi
 
 along with the recent QEMU patch to hw/virtio-scsi.c to set max_target=0
 during vhost-scsi operation.
 
 Changelog v2 - v3:
 
   Unlock on error in tcm_vhost_drop_nexus() (DanC)
   Fix strlen() doesn't count the terminator (DanC)
   Call kfree() on an error path (DanC)
   Convert tcm_vhost_write_pending to use target_execute_cmd (hch + nab)
   Fix another strlen() off by one in tcm_vhost_make_tport (DanC)
   Add option under drivers/staging/Kconfig, and move to drivers/vhost/tcm/
   as requested by MST (nab)
 
 Changelog v1 - v2:
 
   Fix tv_cmd completion - release SGL memory leak (nab)
   Fix sparse warnings for static variable usage ((Fengguang Wu)
   Fix sparse warnings for min() typing + printk format specs (Fengguang Wu)
   Convert to cmwq submission for I/O dispatch (nab + hch)
 
 Changelog v0 - v1:
 
   Merge into single source + header file, and move to drivers/vhost/
 
 Cc: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
 Cc: Zhi Yong Wu wu...@cn.ibm.com
 Cc: Michael S. Tsirkin m...@redhat.com
 Cc: Paolo Bonzini pbonz...@redhat.com
 Cc: Christoph Hellwig h...@lst.de
 Cc: Hannes Reinecke h...@suse.de
 Cc: Jens Axboe ax...@kernel.dk
 Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
 ---
  drivers/staging/Kconfig   |2 +
  drivers/vhost/Makefile|2 +
  drivers/vhost/tcm/Kconfig |6 +
  drivers/vhost/tcm/Makefile|1 +
  drivers/vhost/tcm/tcm_vhost.c | 1611 
 +
  drivers/vhost/tcm/tcm_vhost.h |   74 ++
  6 files changed, 1696 insertions(+), 0 deletions(-)
  create mode 100644 drivers/vhost/tcm/Kconfig
  create mode 100644 drivers/vhost/tcm/Makefile
  create mode 100644 drivers/vhost/tcm/tcm_vhost.c
  create mode 100644 drivers/vhost/tcm/tcm_vhost.h
 

Really sorry about making you run around like that,
I did not mean moving all of tcm to a directory,
just adding tcm/Kconfig or adding drivers/vhost/Kconfig.tcm
because eventually it's easier to keep it all together
in one place.

 diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
 index 05e33c7..8d1a627 100644
 --- a/drivers/staging/Kconfig
 +++ b/drivers/staging/Kconfig
 @@ -132,4 +132,6 @@ source drivers/staging/ipack/Kconfig
  
  source drivers/staging/gdm72xx/Kconfig
  
 +source drivers/vhost/tcm/Kconfig
 +
  endif # STAGING
 diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
 index 72dd020..3408bea 100644
 --- a/drivers/vhost/Makefile
 +++ b/drivers/vhost/Makefile
 @@ -1,2 +1,4 @@
  obj-$(CONFIG_VHOST_NET) += vhost_net.o
  vhost_net-y := vhost.o net.o
 +
 +obj-$(CONFIG_TCM_VHOST) += tcm/
 diff --git a/drivers/vhost/tcm/Kconfig b/drivers/vhost/tcm/Kconfig
 new file mode 100644
 index 000..a9c6f76
 --- /dev/null
 +++ b/drivers/vhost/tcm/Kconfig
 @@ -0,0 +1,6 @@
 +config TCM_VHOST
 + tristate TCM_VHOST fabric module (EXPERIMENTAL)
 + depends on TARGET_CORE && EVENTFD && EXPERIMENTAL && m
 + default n
 + ---help---
 + Say M here to enable the TCM_VHOST fabric module for use with 
 virtio-scsi guests
 diff --git a/drivers/vhost/tcm/Makefile b/drivers/vhost/tcm/Makefile
 new file mode 100644
 index 000..54b0ea6
 --- /dev/null
 +++ b/drivers/vhost/tcm/Makefile
 @@ -0,0 +1 @@
 +obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
 diff --git a/drivers/vhost/tcm/tcm_vhost.c b/drivers/vhost/tcm/tcm_vhost.c
 new file mode 100644
 index 000..0ee4046
 --- /dev/null
 +++ b/drivers/vhost/tcm/tcm_vhost.c
 @@ -0,0 +1,1611 @@
 +/***
 + * Vhost kernel TCM fabric driver for virtio SCSI initiators
 + *
 + * (C) Copyright 2010-2012 RisingTide Systems LLC.
 + * (C) Copyright 2010-2012 IBM Corp.
 + *
 + * Licensed to the Linux Foundation under the General Public License (GPL) 
 version 2.
 + *
 + * Authors: Nicholas A. Bellinger n...@risingtidesystems.com
 + *  Stefan Hajnoczi stefa...@linux.vnet.ibm.com
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation; either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but 

Re: [RFC-v3 0/4] tcm_vhost+cmwq fabric driver code for-3.6

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 12:59:28AM +, Nicholas A. Bellinger wrote:
 From: Nicholas Bellinger n...@linux-iscsi.org
 
 Hi folks,
 
 The following is the RFC-v3 series of tcm_vhost target fabric driver code
 currently in-flight for-3.6 mainline code.

So I sent some comments. I think it's in an OK state for a staging
driver.  I'd suggest doing some iterations while there is still time
and the rest we can fix up in tree.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v2 0/4] tcm_vhost+cmwq fabric driver code for-3.6

2012-07-18 Thread Anthony Liguori

On 07/18/2012 10:53 AM, Christoph Hellwig wrote:

On Wed, Jul 18, 2012 at 08:42:21AM -0500, Anthony Liguori wrote:


If you add support for a new command, you need to provide userspace
a way to disable this command.  If you change what gets reported for
VPD, you need to provide userspace a way to make VPD look like what
it did in a previous version.

Basically, you need to be able to make a TCM device behave 100% the
same as it did in an older version of the kernel.

This is unique to virtualization due to live migration.  If you
migrate from a 3.6 kernel to a 3.8 kernel, you need to make sure
that the 3.8 kernel's TCM device behaves exactly like the 3.6 kernel
because the guest that is interacting with it does not realize that
live migration happened.


I don't think these strict live migration rules apply to SCSI targets.

Real life storage systems get new features and different behaviour with
firmware upgrades all the time, and SCSI initiators deal with that just
fine.  I don't see any reason to be more picky just because we're
virtualized.


But would this happen while a system is running live?

I agree that in general, SCSI targets don't need this, but I'm pretty sure that 
if a guest probes for a command, you migrate to an old version, and that command 
is no longer there, badness will ensue.


It's different when you're talking about a reboot happening or a 
disconnect/reconnect due to firmware upgrade.  The OS would naturally be 
reprobing in this case.


Regards,

Anthony Liguori





--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v2 0/4] tcm_vhost+cmwq fabric driver code for-3.6

2012-07-18 Thread Rustad, Mark D
On Jul 18, 2012, at 9:00 AM, Michael S. Tsirkin wrote:

 On Wed, Jul 18, 2012 at 11:53:38AM -0400, Christoph Hellwig wrote:
 On Wed, Jul 18, 2012 at 08:42:21AM -0500, Anthony Liguori wrote:
 
 If you add support for a new command, you need to provide userspace
 a way to disable this command.  If you change what gets reported for
 VPD, you need to provide userspace a way to make VPD look like what
 it did in a previous version.
 
 Basically, you need to be able to make a TCM device behave 100% the
 same as it did in an older version of the kernel.
 
 This is unique to virtualization due to live migration.  If you
 migrate from a 3.6 kernel to a 3.8 kernel, you need to make sure
 that the 3.8 kernel's TCM device behaves exactly like the 3.6 kernel
 because the guest that is interacting with it does not realize that
 live migration happened.
 
 I don't think these strict live migration rules apply to SCSI targets.
 
 Real life storage systems get new features and different behaviour with
 firmware upgrades all the time, and SCSI initiators deal with that just
 fine.
 I don't see any reason to be more picky just because we're
 virtualized.
 
 Presumably initiators are shut down for target firmware upgrades?
 With virtualization your host can change without guest shutdown.
 You can also *lose* commands when migrating to an older host.


Actually no. Storage vendors do not want to impose a need to take initiators 
down for any reason. I have worked for a storage system vendor that routinely 
did firmware upgrades on-the-fly. This is done by multi-pathing and taking one 
path down, upgrade, bring up, repeat. There was even one non-redundant system 
that I am aware of that could upgrade firmware and reboot fast enough that the 
initiators would not notice.

You do have to pay very close attention to some things however. Don't change 
the device identity in any way - even version information, otherwise a Windows 
initiator will blue-screen. I made that mistake myself, so I remember it well. 
It seemed like such an innocent change. I don't recall there being any issue 
with adding commands and we did do that on occasion.

-- 
Mark Rustad, LAN Access Division, Intel Corporation

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v2 0/4] tcm_vhost+cmwq fabric driver code for-3.6

2012-07-18 Thread James Bottomley
On Wed, 2012-07-18 at 11:00 -0500, Anthony Liguori wrote:
 On 07/18/2012 10:53 AM, Christoph Hellwig wrote:
  On Wed, Jul 18, 2012 at 08:42:21AM -0500, Anthony Liguori wrote:
 
  If you add support for a new command, you need to provide userspace
  a way to disable this command.  If you change what gets reported for
  VPD, you need to provide userspace a way to make VPD look like what
  it did in a previous version.
 
  Basically, you need to be able to make a TCM device behave 100% the
  same as it did in an older version of the kernel.
 
  This is unique to virtualization due to live migration.  If you
  migrate from a 3.6 kernel to a 3.8 kernel, you need to make sure
  that the 3.8 kernel's TCM device behaves exactly like the 3.6 kernel
  because the guest that is interacting with it does not realize that
  live migration happened.
 
  I don't think these strict live migration rules apply to SCSI targets.
 
  Real life storage systems get new features and different behaviour with
  firmware upgrades all the time, and SCSI initiators deal with that just
  fine.  I don't see any reason to be more picky just because we're
  virtualized.
 
 But would this happen while a system is running live?

Of course: Think about the consequences: you want to upgrade one array
on your SAN.  You definitely don't want to shut down your entire data
centre to achieve it.  In place upgrades on running SANs have been
common in enterprise environments for a while.

 I agree that in general, SCSI targets don't need this, but I'm pretty sure 
 that 
 if a guest probes for a command, you migrate to an old version, and that 
 command 
 is no longer there, badness will ensue.

What command are we talking about?  Operation of initiators is usually
just READ and WRITE.  So perhaps we might have inline UNMAP ... but the
world wouldn't come to an end even if the latter stopped working.

Most of the complex SCSI stuff is done at start of day; it's actually
only then we'd notice things like changes in INQUIRY strings or mode
pages.

Failover, which is what you're talking about, requires reinstatement of
all the operating parameters of the source/target system, but that's not
wholly the responsibility of the storage system ...

James

 It's different when you're talking about a reboot happening or a 
 disconnect/reconnect due to firmware upgrade.  The OS would naturally be 
 reprobing in this case.



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v4] Fixes related to processing of qemu's -numa option

2012-07-18 Thread Eduardo Habkost
On Mon, Jul 16, 2012 at 09:31:30PM -0700, Chegu Vinod wrote:
 Changes since v3:
- using bitmap_set() instead of set_bit() in numa_add() routine.
- removed call to bitmap_zero() since bitmap_new() also zeroes the bitmap.
- Rebased to the latest qemu.

Tested-by: Eduardo Habkost ehabk...@redhat.com
Reviewed-by: Eduardo Habkost ehabk...@redhat.com


 
 Changes since v2:
- Using unsigned long * for the node_cpumask[].
- Use bitmap_new() instead of g_malloc0() for allocation.
- Don't rely on max_cpus since it may not be initialized
  before the numa related qemu options are parsed & processed.
 
 Note: Continuing to use a new constant for allocation of
   the mask (This constant is currently set to 255 since
   with an 8bit APIC ID VCPUs can range from 0-254 in a
   guest. The APIC ID 255 (0xFF) is reserved for broadcast).
 
 Changes since v1:
 
- Use bitmap functions that are already in qemu (instead
  of cpu_set_t macro's from sched.h)
- Added a check for endvalue >= max_cpus.
- Fix to address the round-robin assignment when
  cpu's are not explicitly specified.
 ---
 
 v1:
 --
 
 The -numa option to qemu is used to create [fake] numa nodes
 and expose them to the guest OS instance.
 
 There are a couple of issues with the -numa option:
 
 a) Max VCPU's that can be specified for a guest while using
the qemu's -numa option is 64. Due to a typecasting issue
when the number of VCPUs is > 32 the VCPUs don't show up
under the specified [fake] numa nodes.
 
 b) KVM currently has support for 160VCPUs per guest. The
qemu's -numa option has only support for up to 64 VCPUs
per guest.
 This patch addresses these two issues.
 
 Below are examples of (a) and (b)
 
 a) 32 VCPUs are specified with the -numa option:
 
 /usr/local/bin/qemu-system-x86_64 \
 -enable-kvm \
 71:01:01 \
 -net tap,ifname=tap0,script=no,downscript=no \
 -vnc :4
 
 ...
 Upstream qemu :
 --
 
 QEMU 1.1.50 monitor - type 'help' for more information
 (qemu) info numa
 6 nodes
 node 0 cpus: 0 1 2 3 4 5 6 7 8 9 32 33 34 35 36 37 38 39 40 41
 node 0 size: 131072 MB
 node 1 cpus: 10 11 12 13 14 15 16 17 18 19 42 43 44 45 46 47 48 49 50 51
 node 1 size: 131072 MB
 node 2 cpus: 20 21 22 23 24 25 26 27 28 29 52 53 54 55 56 57 58 59
 node 2 size: 131072 MB
 node 3 cpus: 30
 node 3 size: 131072 MB
 node 4 cpus:
 node 4 size: 131072 MB
 node 5 cpus: 31
 node 5 size: 131072 MB
 
 With the patch applied :
 ---
 
 QEMU 1.1.50 monitor - type 'help' for more information
 (qemu) info numa
 6 nodes
 node 0 cpus: 0 1 2 3 4 5 6 7 8 9
 node 0 size: 131072 MB
 node 1 cpus: 10 11 12 13 14 15 16 17 18 19
 node 1 size: 131072 MB
 node 2 cpus: 20 21 22 23 24 25 26 27 28 29
 node 2 size: 131072 MB
 node 3 cpus: 30 31 32 33 34 35 36 37 38 39
 node 3 size: 131072 MB
 node 4 cpus: 40 41 42 43 44 45 46 47 48 49
 node 4 size: 131072 MB
 node 5 cpus: 50 51 52 53 54 55 56 57 58 59
 node 5 size: 131072 MB
 
 b) 64 VCPUs specified with -numa option:
 
 /usr/local/bin/qemu-system-x86_64 \
 -enable-kvm \
 -cpu 
 Westmere,+rdtscp,+pdpe1gb,+dca,+pdcm,+xtpr,+tm2,+est,+smx,+vmx,+ds_cpl,+monitor,+dtes64,+pclmuldq,+pbe,+tm,+ht,+ss,+acpi,+d-vnc
  :4
 
 ...
 
 Upstream qemu :
 --
 
 only 63 CPUs in NUMA mode supported.
 only 64 CPUs in NUMA mode supported.
 QEMU 1.1.50 monitor - type 'help' for more information
 (qemu) info numa
 8 nodes
 node 0 cpus: 6 7 8 9 38 39 40 41 70 71 72 73
 node 0 size: 65536 MB
 node 1 cpus: 10 11 12 13 14 15 16 17 18 19 42 43 44 45 46 47 48 49 50 51 74 
 75 76 77 78 79
 node 1 size: 65536 MB
 node 2 cpus: 20 21 22 23 24 25 26 27 28 29 52 53 54 55 56 57 58 59 60 61
 node 2 size: 65536 MB
 node 3 cpus: 30 62
 node 3 size: 65536 MB
 node 4 cpus:
 node 4 size: 65536 MB
 node 5 cpus:
 node 5 size: 65536 MB
 node 6 cpus: 31 63
 node 6 size: 65536 MB
 node 7 cpus: 0 1 2 3 4 5 32 33 34 35 36 37 64 65 66 67 68 69
 node 7 size: 65536 MB
 
 With the patch applied :
 ---
 
 QEMU 1.1.50 monitor - type 'help' for more information
 (qemu) info numa
 8 nodes
 node 0 cpus: 0 1 2 3 4 5 6 7 8 9
 node 0 size: 65536 MB
 node 1 cpus: 10 11 12 13 14 15 16 17 18 19
 node 1 size: 65536 MB
 node 2 cpus: 20 21 22 23 24 25 26 27 28 29
 node 2 size: 65536 MB
 node 3 cpus: 30 31 32 33 34 35 36 37 38 39
 node 3 size: 65536 MB
 node 4 cpus: 40 41 42 43 44 45 46 47 48 49
 node 4 size: 65536 MB
 node 5 cpus: 50 51 52 53 54 55 56 57 58 59
 node 5 size: 65536 MB
 node 6 cpus: 60 61 62 63 64 65 66 67 68 69
 node 6 size: 65536 MB
 node 7 cpus: 70 71 72 73 74 75 76 77 78 79
 
 Signed-off-by: Chegu Vinod chegu_vi...@hp.com, Jim Hull jim.h...@hp.com, 
 Craig Hada craig.h...@hp.com
 ---
  cpus.c   |3 ++-
  hw/pc.c  |3 ++-
  sysemu.h |3 ++-
  vl.c |   43 +--
  4 files changed, 27 insertions(+), 25 deletions(-)
 
 diff --git a/cpus.c b/cpus.c
 index b182b3d..acccd08 100644
 

Re: [RFC-v2 0/4] tcm_vhost+cmwq fabric driver code for-3.6

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 04:42:33PM +, Rustad, Mark D wrote:
 On Jul 18, 2012, at 9:00 AM, Michael S. Tsirkin wrote:
 
  On Wed, Jul 18, 2012 at 11:53:38AM -0400, Christoph Hellwig wrote:
  On Wed, Jul 18, 2012 at 08:42:21AM -0500, Anthony Liguori wrote:
  
  If you add support for a new command, you need to provide userspace
  a way to disable this command.  If you change what gets reported for
  VPD, you need to provide userspace a way to make VPD look like what
  it did in a previous version.
  
  Basically, you need to be able to make a TCM device behave 100% the
  same as it did in an older version of the kernel.
  
  This is unique to virtualization due to live migration.  If you
  migrate from a 3.6 kernel to a 3.8 kernel, you need to make sure
  that the 3.8 kernel's TCM device behaves exactly like the 3.6 kernel
  because the guest that is interacting with it does not realize that
  live migration happened.
  
  I don't think these strict live migration rules apply to SCSI targets.
  
  Real life storage systems get new features and different behaviour with
  firmware upgrades all the time, and SCSI initiators deal with that just
  fine.
  I don't see any reason to be more picky just because we're
  virtualized.
  
  Presumably initiators are shut down for target firmware upgrades?
  With virtualization your host can change without guest shutdown.
  You can also *lose* commands when migrating to an older host.
 
 
 Actually no. Storage vendors do not want to impose a need to take initiators 
 down for any reason. I have worked for a storage system vendor that routinely 
 did firmware upgrades on-the-fly. This is done by multi-pathing and taking 
 one path down, upgrade, bring up, repeat.

With live migration even that does not happen.

 There was even one non-redundant system that I am aware of that could upgrade 
 firmware and reboot fast enough that the initiators would not notice.
 
 You do have to pay very close attention to some things however. Don't change 
 the device identity in any way - even version information, otherwise a 
 Windows initiator will blue-screen. I made that mistake myself, so I remember 
 it well. It seemed like such an innocent change. I don't recall there being 
 any issue with adding commands and we did do that on occasion.

How about removing commands?

 -- 
 Mark Rustad, LAN Access Division, Intel Corporation
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC v9 11/27] virtio-blk: Indirect vring and flush support

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 04:07:38PM +0100, Stefan Hajnoczi wrote:
 RHEL6 and other new guest kernels use indirect vring descriptors to
 increase the number of requests that can be batched.  This fundamentally
 changes vring from a scheme that requires fixed resources to something
 more dynamic (although there is still an absolute maximum number of
 descriptors).  Cope with indirect vrings by taking on as many requests
 as we can in one go and then postponing the remaining requests until the
 first batch completes.
 
 It would be possible to switch to dynamic resource management so iovec
 and iocb structs are malloced.  This would allow the entire ring to be
 processed even with indirect descriptors, but would probably hit a
 bottleneck when io_submit refuses to queue more requests.  Therefore,
 stick with the simpler scheme for now.
 
 Unfortunately Linux AIO does not support asynchronous fsync/fdatasync on
 all files.  In particular, an O_DIRECT opened file on ext4 does not
 support Linux AIO fdsync.  Work around this by performing fdatasync()
 synchronously for now.
 
 Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
 ---
  hw/dataplane/ioq.h   |   18 -
  hw/dataplane/vring.h |  103 
 +++---
  hw/virtio-blk.c  |   75 ++--
  3 files changed, 144 insertions(+), 52 deletions(-)
 
 diff --git a/hw/dataplane/ioq.h b/hw/dataplane/ioq.h
 index 7200e87..d1545d6 100644
 --- a/hw/dataplane/ioq.h
 +++ b/hw/dataplane/ioq.h
 @@ -3,7 +3,7 @@
  
  typedef struct {
  int fd; /* file descriptor */
 -unsigned int max_reqs;   /* max length of freelist and queue */
 +unsigned int max_reqs;  /* max length of freelist and queue */
  
  io_context_t io_ctx;/* Linux AIO context */
  EventNotifier io_notifier;  /* Linux AIO eventfd */
 @@ -91,18 +91,16 @@ static struct iocb *ioq_rdwr(IOQueue *ioq, bool read, 
 struct iovec *iov, unsigne
  return iocb;
  }
  
 -static struct iocb *ioq_fdsync(IOQueue *ioq)
 -{
 -struct iocb *iocb = ioq_get_iocb(ioq);
 -
 -io_prep_fdsync(iocb, ioq-fd);
 -io_set_eventfd(iocb, event_notifier_get_fd(ioq-io_notifier));
 -return iocb;
 -}
 -
  static int ioq_submit(IOQueue *ioq)
  {
  int rc = io_submit(ioq-io_ctx, ioq-queue_idx, ioq-queue);
 +if (unlikely(rc  0)) {
 +unsigned int i;
 +fprintf(stderr, io_submit io_ctx=%#lx nr=%d iovecs=%p\n, 
 (uint64_t)ioq-io_ctx, ioq-queue_idx, ioq-queue);
 +for (i = 0; i  ioq-queue_idx; i++) {
 +fprintf(stderr, [%u] type=%#x fd=%d\n, i, 
 ioq-queue[i]-aio_lio_opcode, ioq-queue[i]-aio_fildes);
 +}
 +}
  ioq-queue_idx = 0; /* reset */
  return rc;
  }
 diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h
 index 70675e5..3eab4b4 100644
 --- a/hw/dataplane/vring.h
 +++ b/hw/dataplane/vring.h
 @@ -64,6 +64,86 @@ static void vring_setup(Vring *vring, VirtIODevice *vdev, 
 int n)
  vring-vr.desc, vring-vr.avail, vring-vr.used);
  }
  
 +static bool vring_more_avail(Vring *vring)
 +{
 + return vring-vr.avail-idx != vring-last_avail_idx;
 +}
 +
 +/* This is stolen from linux-2.6/drivers/vhost/vhost.c. */
 +static bool get_indirect(Vring *vring,
 + struct iovec iov[], struct iovec *iov_end,
 + unsigned int *out_num, unsigned int *in_num,
 + struct vring_desc *indirect)
 +{
 + struct vring_desc desc;
 + unsigned int i = 0, count, found = 0;
 +
 + /* Sanity check */
 + if (unlikely(indirect-len % sizeof desc)) {
 + fprintf(stderr, Invalid length in indirect descriptor: 
 +len 0x%llx not multiple of 0x%zx\n,
 +(unsigned long long)indirect-len,
 +sizeof desc);
 + exit(1);
 + }
 +
 + count = indirect-len / sizeof desc;
 + /* Buffers are chained via a 16 bit next field, so
 +  * we can have at most 2^16 of these. */
 + if (unlikely(count  USHRT_MAX + 1)) {
 + fprintf(stderr, Indirect buffer length too big: %d\n,
 +indirect-len);
 +exit(1);
 + }
 +
 +/* Point to translate indirect desc chain */
 +indirect = phys_to_host(vring, indirect-addr);
 +
 + /* We will use the result as an address to read from, so most
 +  * architectures only need a compiler barrier here. */
 + __sync_synchronize(); /* read_barrier_depends(); */


qemu has its own barriers now, pls use them.

 +
 + do {
 + if (unlikely(++found  count)) {
 + fprintf(stderr, Loop detected: last one at %u 
 +indirect size %u\n,
 +i, count);
 + exit(1);
 + }
 +
 +desc = *indirect++;
 + if (unlikely(desc.flags  VRING_DESC_F_INDIRECT)) {
 + fprintf(stderr, 

Re: [RFC v9 06/27] virtio-blk: Take PCI memory range into account

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 04:07:33PM +0100, Stefan Hajnoczi wrote:
 Support >4 GB physical memory accesses.
 
 Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com

Need some sane APIs, this is just too scary.

 ---
  hw/virtio-blk.c |7 +++
  1 file changed, 7 insertions(+)
 
 diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
 index abd9386..99654f1 100644
 --- a/hw/virtio-blk.c
 +++ b/hw/virtio-blk.c
 @@ -64,6 +64,13 @@ static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
   */
  static inline void *phys_to_host(VirtIOBlock *s, target_phys_addr_t phys)
  {
 +/* Adjust for 3.6-4 GB PCI memory range */
 +if (phys = 0x1) {
 +phys -= 0x1 - 0xe000;
 +} else if (phys = 0xe000) {
 +fprintf(stderr, phys_to_host bad physical address in PCI range 
 %#lx\n, phys);
 +exit(1);
 +}
  return s-phys_mem_zero_host_ptr + phys;
  }
  
 -- 
 1.7.10.4
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   >