[PATCH 00/14] Fix/enhance NMI support for KVM - v4

2008-09-26 Thread jan . kiszka
Version 4 of this series includes the following fixes:

 - Handle NMI task gates (Gleb Natapov)
 - Clear internal NMI states on VCPU reset (Gleb Natapov)
 - Typo fixes and minor cleanups (Sheng Yang and /me)

Hope we are now reaching a mergeable state.

Jan

--
Siemens AG, Corporate Technology, CT SE 2
Corporate Competence Center Embedded Linux



[PATCH 13/14] KVM: VMX: work around lacking VNMI support

2008-09-26 Thread jan . kiszka
Older VMX-capable CPUs do not provide the Virtual NMI feature for
tracking the NMI-blocked state after injecting such events. For now,
KVM is unable to inject NMIs on those CPUs.

Derived from Sheng Yang's suggestion to use the IRQ window notification
for detecting the end of NMI handlers, this patch implements virtual
NMI support without impacting the host's ability to receive real NMIs.
The downside is that the given approach requires some heuristics that
can cause NMI nesting in very rare corner cases.

The approach works as follows:
 - inject NMI and set a software-based NMI-blocked flag
 - arm the IRQ window start notification whenever an NMI window is
   requested
 - if the guest exits due to an opening IRQ window, clear the emulated
   NMI-blocked flag
 - if the guest's net execution time with NMIs blocked but without an
   open IRQ window exceeds 1 second, force a reset of the NMI-blocked
   state and inject anyway (see the sketch below)
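
A minimal sketch of that 1-second override, using the vcpu_vmx fields
added by this patch (where exactly this runs in the VM-exit path is an
assumption here, not the literal patch):

	/* Account guest time spent with NMIs soft-blocked; after 1s,
	 * assume the NMI handler has finished and force the window open. */
	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
		vmx->vnmi_blocked_time +=
			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
		if (vmx->vnmi_blocked_time > 1000000000LL) {	/* 1 second */
			vmx->soft_vnmi_blocked = 0;
			vmx->vnmi_blocked_time = 0;
		}
	}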

This approach covers most practical scenarios:
 - successive NMIs are separated by at least one open IRQ window
 - the guest may spin with IRQs disabled (e.g. due to a bug), but
   leaving the NMI handler takes much less time than one second
 - the guest does not rely on strict ordering or timing of NMIs
   (would be problematic in virtualized environments anyway)

Successfully tested with the 'nmi n' monitor command, the kgdbts
test suite on SMP guests (additional patches are required to add debug
register support to KVM), the kernel's nmi_watchdog=1, and a Siemens-
specific board emulation (+ guest) that comes with its own NMI
watchdog mechanism.

Signed-off-by: Jan Kiszka [EMAIL PROTECTED]
---
 arch/x86/kvm/vmx.c |  170 +++--
 1 file changed, 113 insertions(+), 57 deletions(-)

Index: b/arch/x86/kvm/vmx.c
===
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -90,6 +90,11 @@ struct vcpu_vmx {
} rmode;
int vpid;
bool emulation_required;
+
+   /* Support for vnmi-less CPUs */
+   int soft_vnmi_blocked;
+   ktime_t entry_time;
+   s64 vnmi_blocked_time;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -2226,6 +2231,8 @@ static int vmx_vcpu_reset(struct kvm_vcp
 
vmx->vcpu.arch.rmode.active = 0;
 
+   vmx->soft_vnmi_blocked = 0;
+
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(&vmx->vcpu, 0);
msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
@@ -2331,6 +2338,29 @@ out:
return ret;
 }
 
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+   u32 cpu_based_vm_exec_control;
+
+   cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+   cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+   vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+   u32 cpu_based_vm_exec_control;
+
+   if (!cpu_has_virtual_nmis()) {
+   enable_irq_window(vcpu);
+   return;
+   }
+
+   cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+   cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
+   vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
 static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2356,6 +2386,19 @@ static void vmx_inject_nmi(struct kvm_vc
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+   if (!cpu_has_virtual_nmis()) {
+   /*
+* Tracking the NMI-blocked state in software is built upon
+* finding the next open IRQ window. This, in turn, depends on
+* well-behaving guests: They have to keep IRQs disabled at
+* least as long as the NMI handler runs. Otherwise we may
+* cause NMI nesting, maybe breaking the guest. But as this is
+* highly unlikely, we can live with the residual risk.
+*/
+   vmx->soft_vnmi_blocked = 1;
+   vmx->vnmi_blocked_time = 0;
+   }
+
++vcpu->stat.nmi_injections;
if (vcpu->arch.rmode.active) {
vmx->rmode.irq.pending = true;
@@ -2380,6 +2423,8 @@ static void vmx_update_window_states(str
!(guest_intr & (GUEST_INTR_STATE_STI |
GUEST_INTR_STATE_MOV_SS |
GUEST_INTR_STATE_NMI));
+   if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
+   vcpu->arch.nmi_window_open = 0;
 
vcpu->arch.interrupt_window_open =
((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -2399,55 +2444,31 @@ static void kvm_do_inject_irq(struct kvm
kvm_queue_interrupt(vcpu, irq);
 }
 
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
-   u32 cpu_based_vm_exec_control;
-
-   cpu_based_vm_exec_control = 

[PATCH 14/14] kvm-userspace: Enable NMI support for user space irqchip

2008-09-26 Thread jan . kiszka
Make use of the new KVM_NMI IOCTL to push NMIs into the KVM guest if the
user space APIC emulation or some other source raised them.

In order to use the 'nmi' monitor command, which asynchronously injects
NMIs for the given CPU, a new service called kvm_inject_interrupt is
required. It invokes cpu_interrupt on the target VCPU, working
around the fact that the QEMU service is not thread-safe.
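
A rough sketch of that wrapper (the locking and kick helpers named here
are assumptions for illustration, not necessarily what the patch uses):

    /* Serialize cpu_interrupt() against the vcpu threads via the
     * global qemu mutex, then kick the target vcpu out of guest
     * mode so it notices the pending request. */
    void kvm_inject_interrupt(CPUState *env, int mask)
    {
        pthread_mutex_lock(&qemu_mutex);
        cpu_interrupt(env, mask);
        kvm_vcpu_kick(env);     /* assumed kick helper */
        pthread_mutex_unlock(&qemu_mutex);
    }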

Signed-off-by: Jan Kiszka [EMAIL PROTECTED]
---
 libkvm/libkvm.c |   31 +++
 libkvm/libkvm.h |   23 +++
 qemu/monitor.c  |5 -
 qemu/qemu-kvm-x86.c |   26 +++---
 qemu/qemu-kvm.c |   18 +-
 qemu/qemu-kvm.h |2 ++
 6 files changed, 100 insertions(+), 5 deletions(-)

Index: b/libkvm/libkvm.c
===
--- a/libkvm/libkvm.c
+++ b/libkvm/libkvm.c
@@ -814,6 +814,11 @@ int try_push_interrupts(kvm_context_t kv
return kvm->callbacks->try_push_interrupts(kvm->opaque);
 }
 
+int try_push_nmi(kvm_context_t kvm)
+{
+   return kvm->callbacks->try_push_nmi(kvm->opaque);
+}
+
 void post_kvm_run(kvm_context_t kvm, int vcpu)
 {
kvm->callbacks->post_kvm_run(kvm->opaque, vcpu);
@@ -838,6 +843,17 @@ int kvm_is_ready_for_interrupt_injection
return run->ready_for_interrupt_injection;
 }
 
+int kvm_is_ready_for_nmi_injection(kvm_context_t kvm, int vcpu)
+{
+#ifdef KVM_CAP_NMI
+   struct kvm_run *run = kvm->run[vcpu];
+
+   return run->ready_for_nmi_injection;
+#else
+   return 0;
+#endif
+}
+
 int kvm_run(kvm_context_t kvm, int vcpu)
 {
int r;
@@ -845,6 +861,9 @@ int kvm_run(kvm_context_t kvm, int vcpu)
struct kvm_run *run = kvm->run[vcpu];
 
 again:
+#ifdef KVM_CAP_NMI
+   run->request_nmi_window = try_push_nmi(kvm);
+#endif
 #if !defined(__s390__)
if (!kvm->irqchip_in_kernel)
run->request_interrupt_window = try_push_interrupts(kvm);
@@ -920,6 +939,9 @@ again:
r = handle_halt(kvm, vcpu);
break;
case KVM_EXIT_IRQ_WINDOW_OPEN:
+#ifdef KVM_CAP_NMI
+   case KVM_EXIT_NMI_WINDOW_OPEN:
+#endif
break;
case KVM_EXIT_SHUTDOWN:
r = handle_shutdown(kvm, vcpu);
@@ -1004,6 +1026,15 @@ int kvm_has_sync_mmu(kvm_context_t kvm)
 return r;
 }
 
+int kvm_inject_nmi(kvm_context_t kvm, int vcpu)
+{
+#ifdef KVM_CAP_NMI
+   return ioctl(kvm->vcpu_fd[vcpu], KVM_NMI);
+#else
+   return -ENOSYS;
+#endif
+}
+
 int kvm_init_coalesced_mmio(kvm_context_t kvm)
 {
int r = 0;
Index: b/libkvm/libkvm.h
===
--- a/libkvm/libkvm.h
+++ b/libkvm/libkvm.h
@@ -66,6 +66,7 @@ struct kvm_callbacks {
 int (*shutdown)(void *opaque, int vcpu);
 int (*io_window)(void *opaque);
 int (*try_push_interrupts)(void *opaque);
+int (*try_push_nmi)(void *opaque);
 void (*post_kvm_run)(void *opaque, int vcpu);
 int (*pre_kvm_run)(void *opaque, int vcpu);
 int (*tpr_access)(void *opaque, int vcpu, uint64_t rip, int is_write);
@@ -216,6 +217,17 @@ uint64_t kvm_get_apic_base(kvm_context_t
 int kvm_is_ready_for_interrupt_injection(kvm_context_t kvm, int vcpu);
 
 /*!
+ * \brief Check if a vcpu is ready for NMI injection
+ *
+ * This checks that the vcpu is not already running in NMI context.
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param vcpu Which virtual CPU should be checked
+ * \return boolean indicating NMI injection readiness
+ */
+int kvm_is_ready_for_nmi_injection(kvm_context_t kvm, int vcpu);
+
+/*!
  * \brief Read VCPU registers
  *
  * This gets the GP registers from the VCPU and outputs them
@@ -579,6 +591,17 @@ int kvm_set_lapic(kvm_context_t kvm, int
 
 #endif
 
+/*!
+ * \brief Simulate an NMI
+ *
+ * This allows you to simulate a non-maskable interrupt.
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param vcpu Which virtual CPU should receive the NMI
+ * \return 0 on success
+ */
+int kvm_inject_nmi(kvm_context_t kvm, int vcpu);
+
 #endif
 
 /*!
Index: b/qemu/qemu-kvm-x86.c
===
--- a/qemu/qemu-kvm-x86.c
+++ b/qemu/qemu-kvm-x86.c
@@ -598,7 +598,8 @@ int kvm_arch_halt(void *opaque, int vcpu
 CPUState *env = cpu_single_env;
 
 if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
- (env->eflags & IF_MASK))) {
+ (env->eflags & IF_MASK)) &&
+   !(env->interrupt_request & CPU_INTERRUPT_NMI)) {
 env->halted = 1;
env->exception_index = EXCP_HLT;
 }
@@ -627,8 +628,9 @@ void kvm_arch_post_kvm_run(void *opaque,
 
 int kvm_arch_has_work(CPUState *env)
 {
-if ((env->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_EXIT)) &&
-   (env->eflags & IF_MASK))
+if (((env->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_EXIT)) &&
+    (env->eflags & IF_MASK)) ||
+   

[PATCH 09/14] KVM: x86: VCPU with pending NMI is runnable

2008-09-26 Thread jan . kiszka
Ensure that a VCPU with pending NMIs is considered runnable.

Signed-off-by: Jan Kiszka [EMAIL PROTECTED]
---
 arch/x86/kvm/x86.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

Index: b/arch/x86/kvm/x86.c
===
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4375,7 +4375,8 @@ void kvm_arch_flush_shadow(struct kvm *k
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
-  || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
+  || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+  || vcpu->arch.nmi_pending;
 }
 
 static void vcpu_kick_intr(void *info)



[PATCH 02/14] KVM: VMX: Use INTR_TYPE_NMI_INTR instead of magic value

2008-09-26 Thread jan . kiszka
Signed-off-by: Jan Kiszka [EMAIL PROTECTED]
---
 arch/x86/kvm/vmx.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: b/arch/x86/kvm/vmx.c
===
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2488,7 +2488,7 @@ static int handle_exception(struct kvm_v
set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
}
 
-   if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+   if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
return 1;  /* already handled by vmx_vcpu_run() */
 
if (is_no_device(intr_info)) {
@@ -3321,7 +3321,7 @@ static void vmx_vcpu_run(struct kvm_vcpu
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
/* We need to handle NMIs before interrupts are enabled */
-   if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
+   if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
(intr_info & INTR_INFO_VALID_MASK)) {
KVMTRACE_0D(NMI, vcpu, handler);
asm("int $2");



Re: [PATCH 9/9] x86/iommu: use dma_ops_list in get_dma_ops

2008-09-26 Thread Amit Shah
* On Monday 22 Sep 2008 23:51:21 Joerg Roedel wrote:
 This patch enables stackable dma_ops on x86. To do this, it also enables
 the per-device dma_ops on i386.

 Signed-off-by: Joerg Roedel [EMAIL PROTECTED]
 ---
  arch/x86/kernel/pci-dma.c |   26 ++
  include/asm-x86/device.h  |6 +++---
  include/asm-x86/dma-mapping.h |   14 +++---
  3 files changed, 36 insertions(+), 10 deletions(-)

 diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
 index b990fb6..2e517c2 100644
 --- a/arch/x86/kernel/pci-dma.c
 +++ b/arch/x86/kernel/pci-dma.c
 @@ -82,6 +82,32 @@ void x86_register_dma_ops(struct dma_mapping_ops *ops,
  write_unlock_irqrestore(&dma_ops_list_lock, flags);
  }

 +struct dma_mapping_ops *find_dma_ops_for_device(struct device *dev)
 +{
 + int i;
 + unsigned long flags;
 + struct dma_mapping_ops *entry, *ops = NULL;
 +
 + read_lock_irqsave(&dma_ops_list_lock, flags);
 +
 + for (i = 0; i < DMA_OPS_TYPE_MAX; ++i)
 + list_for_each_entry(entry, &dma_ops_list[i], list) {
 + if (!entry->device_supported)
 + continue;
 + if (entry->device_supported(dev)) {
 + ops = entry;
 + goto out;
 + }
 + }
 +out:
 + read_unlock_irqrestore(&dma_ops_list_lock, flags);

For PVDMA, we want the native dma_ops to run first, e.g., nommu, and then 
do our PV DMA, which just translates gpa to hpa and then programs the 
hardware. This isn't being done here. It can be done by extending the 
return type:

DMA_DEV_NOT_SUPPORTED
DMA_DEV_HANDLED
DMA_DEV_PASS

Where NOT_SUPPORTED means we should look for the next one in the chain 
(current return value 0), DEV_HANDLED means the DMA operation has been 
handled successfully (current return value 1), and DEV_PASS means fall 
through to the next layer and then return back (see the sketch below).
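
One way the lookup loop could consume such a tri-state (my reading of
the proposal; the names and chaining policy are illustrative, and
locking is elided for brevity):

	enum dma_dev_support {
		DMA_DEV_NOT_SUPPORTED,	/* try the next entry in the chain */
		DMA_DEV_HANDLED,	/* this layer fully owns the device */
		DMA_DEV_PASS,		/* handle, then fall through to the next layer */
	};

	struct dma_mapping_ops *find_dma_ops_for_device(struct device *dev)
	{
		struct dma_mapping_ops *entry, *ops = NULL;
		int i;

		for (i = 0; i < DMA_OPS_TYPE_MAX; ++i)
			list_for_each_entry(entry, &dma_ops_list[i], list) {
				if (!entry->device_supported)
					continue;
				switch (entry->device_supported(dev)) {
				case DMA_DEV_NOT_SUPPORTED:
					break;		/* keep scanning */
				case DMA_DEV_HANDLED:
					return entry;
				case DMA_DEV_PASS:
					ops = entry;	/* later layers still run */
					break;
				}
			}

		return ops;
	}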



Re: writes to a virtio block device hang

2008-09-26 Thread Michael Tokarev

Marcelo Tosatti wrote:

On Tue, Sep 23, 2008 at 11:06:11AM +0400, Michael Tokarev wrote:

(both host and guests are linux machines), I placed
one virtual machine into production use, and almost
immediately issues came up.  Here's how it looks
from the guest:

Sep 21 10:35:52 hobbit kernel: INFO: task cleanup:20535 blocked for more than 
120 seconds.
Sep 21 10:35:52 hobbit kernel: echo 0 > 
/proc/sys/kernel/hung_task_timeout_secs disables this message.
Sep 21 10:35:52 hobbit kernel: cleanup   D  0 20535   1570
Sep 21 10:35:52 hobbit kernel:f73b39c0 00200086   
c3a2ba48  f7022e00 
Sep 21 10:35:52 hobbit kernel:dbc48ed4 f789c000 c0399080 c0157e48 
000e  d05e1b80 d05e1ce4
Sep 21 10:35:52 hobbit kernel:0002 00200286 c01322f7 d05e1ce4 
c0131ef0 dbc48ec8 00200286 c0132486
Sep 21 10:35:52 hobbit kernel: Call Trace:
Sep 21 10:35:52 hobbit kernel:  [c0157e48] find_get_pages_tag+0x38/0x80
Sep 21 10:35:52 hobbit kernel:  [c01322f7] lock_timer_base+0x27/0x60
Sep 21 10:35:52 hobbit kernel:  [c0131ef0] process_timeout+0x0/0x10
Sep 21 10:35:52 hobbit kernel:  [c0132486] __mod_timer+0x86/0xa0
Sep 21 10:35:52 hobbit kernel:  [c02c6408] schedule_timeout+0x58/0xb0
Sep 21 10:35:52 hobbit kernel:  [c0131ef0] process_timeout+0x0/0x10
Sep 21 10:35:52 hobbit kernel:  [f882db04] journal_stop+0xa4/0x1b0 [jbd]
Sep 21 10:35:52 hobbit kernel:  [f882ece8] journal_start+0x88/0xc0 [jbd]
Sep 21 10:35:52 hobbit kernel:  [f8860f20] ext3_write_inode+0x0/0x40 [ext3]
Sep 21 10:35:52 hobbit kernel:  [f8860f20] ext3_write_inode+0x0/0x40 [ext3]
Sep 21 10:35:52 hobbit kernel:  [c019d002] 
__writeback_single_inode+0x282/0x390
Sep 21 10:35:52 hobbit kernel:  [c015f3c0] generic_writepages+0x20/0x30
Sep 21 10:35:52 hobbit kernel:  [c015f419] do_writepages+0x49/0x50
Sep 21 10:35:52 hobbit kernel:  [c0159151] 
__filemap_fdatawrite_range+0x71/0x90
Sep 21 10:35:52 hobbit kernel:  [c019d131] sync_inode+0x21/0x40
Sep 21 10:35:52 hobbit kernel:  [f885f88e] ext3_sync_file+0x9e/0xc0 [ext3]
Sep 21 10:35:52 hobbit kernel:  [c01a065e] do_fsync+0x6e/0xb0
Sep 21 10:35:52 hobbit kernel:  [c01a06c7] __do_fsync+0x27/0x50
Sep 21 10:35:52 hobbit kernel:  [c01032f3] sysenter_past_esp+0x78/0xb1
Sep 21 10:35:52 hobbit kernel:  ===

It's almost always after fsync, but I guess that's because the cleanup 
process (from Postfix) is the one that does that most often.


I'm waiting for an opportunity to install a new kernel with a new kvm...
still hoping.


Meanwhile I installed kvm-75, which did NOT change anything -- the system
still hangs.  What really changed things was switching the guest to a
single processor (it was 2 before, from a 4-core Phenom).


Are you using ext3 in the host as the filesystem to back the guest
image? If so, try writeback instead of ordered mode:


On the host there's an MD device (raid1) that holds the complete raw disk
image for the guest.  It was in my email:

 The device in question is a virtio block device (vda), which is on top
 op a raid1 device on the host (/dev/md_d5, partitioned).  [...]

I'm trying to set up a test system to debug the case further,
because it's impossible to do that on production machine.

/mjt


Re: [PATCH 9/9] x86/iommu: use dma_ops_list in get_dma_ops

2008-09-26 Thread Joerg Roedel
On Fri, Sep 26, 2008 at 01:26:19PM +0530, Amit Shah wrote:
 * On Monday 22 Sep 2008 23:51:21 Joerg Roedel wrote:
  This patch enables stackable dma_ops on x86. To do this, it also enables
  the per-device dma_ops on i386.
 
  Signed-off-by: Joerg Roedel [EMAIL PROTECTED]
  ---
   arch/x86/kernel/pci-dma.c |   26 ++
   include/asm-x86/device.h  |6 +++---
   include/asm-x86/dma-mapping.h |   14 +++---
   3 files changed, 36 insertions(+), 10 deletions(-)
 
  diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
  index b990fb6..2e517c2 100644
  --- a/arch/x86/kernel/pci-dma.c
  +++ b/arch/x86/kernel/pci-dma.c
  @@ -82,6 +82,32 @@ void x86_register_dma_ops(struct dma_mapping_ops *ops,
  write_unlock_irqrestore(&dma_ops_list_lock, flags);
   }
 
  +struct dma_mapping_ops *find_dma_ops_for_device(struct device *dev)
  +{
  +   int i;
  +   unsigned long flags;
  +   struct dma_mapping_ops *entry, *ops = NULL;
  +
  +   read_lock_irqsave(&dma_ops_list_lock, flags);
  +
  +   for (i = 0; i < DMA_OPS_TYPE_MAX; ++i)
  +   list_for_each_entry(entry, &dma_ops_list[i], list) {
  +   if (!entry->device_supported)
  +   continue;
  +   if (entry->device_supported(dev)) {
  +   ops = entry;
  +   goto out;
  +   }
  +   }
  +out:
  +   read_unlock_irqrestore(&dma_ops_list_lock, flags);
 
 For PVDMA, we want the native dma_ops to run first, e.g., nommu, and then 
 do our PV DMA, which just translates gpa to hpa and then programs the 
 hardware. This isn't being done here. It can be done by extending the 
 return type:
 
 DMA_DEV_NOT_SUPPORTED
 DMA_DEV_HANDLED
 DMA_DEV_PASS
 
 Where NOT_SUPPORTED means we should look for the next one in the chain 
 (current return value 0), DEV_HANDLED means the DMA operation has been 
 handled successfully (current return value 1), and DEV_PASS means fall 
 through to the next layer and then return back.

I am not sure I fully understand what you mean. Why do we need to call
the nommu handlers first for PVDMA devices?
I think that PVDMA devices must always be handled by a pv-dma_ops
implementation. So it makes more sense to me to assign the dma_ops
of this implementation to the per-device dma_ops structure when we do
the first call to the DMA API. That way we pay the overhead of finding
out who is responsible only once, and not at every call to the DMA API
(roughly as sketched below).
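
A minimal sketch of that caching, assuming the per-device dma_ops field
this series adds (exact field and helper names may differ):

	static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
	{
		/* resolve once, on the first DMA API call for this device */
		if (unlikely(!dev->archdata.dma_ops))
			dev->archdata.dma_ops = find_dma_ops_for_device(dev);

		return dev->archdata.dma_ops;
	}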

Joerg

-- 
   |   AMD Saxony Limited Liability Company  Co. KG
 Operating | Wilschdorfer Landstr. 101, 01109 Dresden, Germany
 System|  Register Court Dresden: HRA 4896
 Research  |  General Partner authorized to represent:
 Center| AMD Saxony LLC (Wilmington, Delaware, US)
   | General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy



Re: [PATCH 4/9] Implement GIF, clgi and stgi v3

2008-09-26 Thread Alexander Graf


On 25.09.2008, at 23:27, Joerg Roedel wrote:


On Thu, Sep 25, 2008 at 09:55:27PM +0200, Alexander Graf wrote:


On 25.09.2008, at 20:47, Joerg Roedel wrote:

I had another possible idea for performance improvement here. Since we
only inject normal interrupts and exceptions (and not NMI and such) we
can patch clgi to cli and stgi to sti to save these two intercepts in
the guest's vmrun path.
Any objections/problems with this?


How do we know if we're allowed to inject interrupts with V_INTR set?
Usually IF is on and GIF is off when entering the VM in KVM, so we
allow interrupts to arrive even when IF is clear in the guest...


Hmm yes, this is a problem. So this optimization will not work. We need
other ways to optimize :)


Well it would work for the KVM-in-KVM case, where we know that VMRUN
is always triggered with IF=1 and V_INTR=1. The only case where that
hack fails is when we have IF=0 and V_INTR=1. Everything else should
work just fine. And in this case we would simply issue some VMEXITs
0x60, so no big deal IMHO. It should be worth the tradeoff of making
most VMMs a lot faster.


There should be a compile-option to enable the correct behavior though.
If we join that with the VMLOAD and VMSAVE hack there would be only the
VMRUN and DR exits left. That sounds like a really good improvement
where I wouldn't mind breaking some specs :-).


Alex


Re: [PATCH 9/9] x86/iommu: use dma_ops_list in get_dma_ops

2008-09-26 Thread Joerg Roedel
On Fri, Sep 26, 2008 at 01:26:19PM +0530, Amit Shah wrote:
 * On Monday 22 Sep 2008 23:51:21 Joerg Roedel wrote:
  This patch enables stackable dma_ops on x86. To do this, it also enables
  the per-device dma_ops on i386.
 
  Signed-off-by: Joerg Roedel [EMAIL PROTECTED]
  ---
   arch/x86/kernel/pci-dma.c |   26 ++
   include/asm-x86/device.h  |6 +++---
   include/asm-x86/dma-mapping.h |   14 +++---
   3 files changed, 36 insertions(+), 10 deletions(-)
 
  diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
  index b990fb6..2e517c2 100644
  --- a/arch/x86/kernel/pci-dma.c
  +++ b/arch/x86/kernel/pci-dma.c
  @@ -82,6 +82,32 @@ void x86_register_dma_ops(struct dma_mapping_ops *ops,
   write_unlock_irqrestore(&dma_ops_list_lock, flags);
   }
 
  +struct dma_mapping_ops *find_dma_ops_for_device(struct device *dev)
  +{
  +   int i;
  +   unsigned long flags;
  +   struct dma_mapping_ops *entry, *ops = NULL;
  +
   +   read_lock_irqsave(&dma_ops_list_lock, flags);
   +
   +   for (i = 0; i < DMA_OPS_TYPE_MAX; ++i)
   +   list_for_each_entry(entry, &dma_ops_list[i], list) {
   +   if (!entry->device_supported)
   +   continue;
   +   if (entry->device_supported(dev)) {
   +   ops = entry;
   +   goto out;
   +   }
   +   }
   +out:
   +   read_unlock_irqrestore(&dma_ops_list_lock, flags);
 
  For PVDMA, we want the native dma_ops to run first, e.g., nommu, and then 
  do our PV DMA, which just translates gpa to hpa and then programs the 

Btw, how will dma_masks be handled when PV DMA just translates gpa to
hpa?

Joerg

-- 
   |   AMD Saxony Limited Liability Company  Co. KG
 Operating | Wilschdorfer Landstr. 101, 01109 Dresden, Germany
 System|  Register Court Dresden: HRA 4896
 Research  |  General Partner authorized to represent:
 Center| AMD Saxony LLC (Wilmington, Delaware, US)
   | General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy



[patch] stop passing in global variable as argument to cmos_init()

2008-09-26 Thread Jes Sorensen

Hi,

Looking through the ia64 code I came across this little gem.

At some point someone added a new argument to hw/pc.c:cmos_init() named
'smp_cpus', and then passed in the global variable 'smp_cpus' as the
argument. This propagated through to the ia64 code as well.

I checked, this isn't present in the upstream QEMU code, so let's kill
it in the KVM branch. One small step to get closer to upstream :-)

Cheers,
Jes

There is no reason to pass in global variable smp_cpus to cmos_init()
which then references it as a local variable of the same name 'smp_cpus'.

Signed-off-by: Jes Sorensen [EMAIL PROTECTED]

---
 qemu/hw/ipf.c |5 ++---
 qemu/hw/pc.c  |6 ++
 2 files changed, 4 insertions(+), 7 deletions(-)

Index: kvm-userspace.git/qemu/hw/ipf.c
===
--- kvm-userspace.git.orig/qemu/hw/ipf.c
+++ kvm-userspace.git/qemu/hw/ipf.c
@@ -179,8 +179,7 @@
 
 /* hd_table must contain 4 block drivers */
 static void cmos_init(ram_addr_t ram_size, ram_addr_t above_4g_mem_size,
-  const char *boot_device, BlockDriverState **hd_table,
-  int smp_cpus)
+  const char *boot_device, BlockDriverState **hd_table)
 {
 RTCState *s = rtc_state;
 int nbds, bds[3] = { 0, };
@@ -591,7 +590,7 @@
 }
 floppy_controller = fdctrl_init(i8259[6], 2, 0, 0x3f0, fd);
 
-cmos_init(ram_size, above_4g_mem_size, boot_device, hd, smp_cpus);
+cmos_init(ram_size, above_4g_mem_size, boot_device, hd);
 
 if (pci_enabled  usb_enabled) {
 usb_uhci_piix3_init(pci_bus, piix3_devfn + 2);
Index: kvm-userspace.git/qemu/hw/pc.c
===
--- kvm-userspace.git.orig/qemu/hw/pc.c
+++ kvm-userspace.git/qemu/hw/pc.c
@@ -228,8 +228,7 @@
 
 /* hd_table must contain 4 block drivers */
 static void cmos_init(ram_addr_t ram_size, ram_addr_t above_4g_mem_size,
-  const char *boot_device, BlockDriverState **hd_table,
-  int smp_cpus)
+  const char *boot_device, BlockDriverState **hd_table)
 {
 RTCState *s = rtc_state;
 int nbds, bds[3] = { 0, };
@@ -1093,8 +1092,7 @@
 }
 floppy_controller = fdctrl_init(i8259[6], 2, 0, 0x3f0, fd);
 
-cmos_init(below_4g_mem_size, above_4g_mem_size, boot_device, hd,
- smp_cpus);
+cmos_init(below_4g_mem_size, above_4g_mem_size, boot_device, hd);
 
 if (pci_enabled  usb_enabled) {
 usb_uhci_piix3_init(pci_bus, piix3_devfn + 2);


change in 'smp_cpus' at runtime vs vcpu_info?

2008-09-26 Thread Jes Sorensen

Hi,

Looking through the KVM qemu code, I see nothing that indicates that
smp_cpus can be increased after qemu is initially launched?

Does anyone see any reason why we couldn't allocate the vcpu_info
array at launch time based on the value of smp_cpus? Right now
vcpu_info is a static array and I would love to get rid of this
limitation if possible.
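
Something along these lines should do, assuming smp_cpus is final once
the machine is created (sketch only; qemu_mallocz is the allocator qemu
already uses):

    /* instead of: static struct vcpu_info vcpu_info[MAX_VCPUS]; */
    static struct vcpu_info *vcpu_info;

    static void alloc_vcpu_info(void)
    {
        /* smp_cpus is fixed at launch; allocate exactly that many */
        vcpu_info = qemu_mallocz(smp_cpus * sizeof(*vcpu_info));
    }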

Cheers,
Jes


Re: change in 'smp_cpus' at runtime vs vcpu_info?

2008-09-26 Thread Jes Sorensen

Glauber Costa wrote:

On Fri, Sep 26, 2008 at 11:17 AM, Jes Sorensen [EMAIL PROTECTED] wrote:

Hi,

Looking through the KVM qemu code, I see nothing that indicates that
smp_cpus can be increased after qemu is initially launched?

Does anyone see any reason why we couldn't allocate the vcpu_info
array at launch time based on the value of smp_cpus? Right now
vcpu_info is a static array and I would love to get rid of this
limitation if possible.


it can increase in hw/apic.c. The real limitation is then in the bios.
The APIC tables
used to be populated based on the smp_cpus value, and now are populated based on
MAX_CPUS.


Hmmm. I don't see any reference to MAX_CPUS in hw/apic.c, any chance you
can give me a pointer?

Thanks,
Jes


Re: change in 'smp_cpus' at runtime vs vcpu_info?

2008-09-26 Thread Jes Sorensen

Glauber Costa wrote:

On Fri, Sep 26, 2008 at 11:31 AM, Jes Sorensen [EMAIL PROTECTED] wrote:

Hmmm. I don't see any reference to MAX_CPUS in hw/apic.c, any chance you
can give me a pointer?


Sorry, that's not APIC (damn acronyms), it's ACPI.
But the reference to MAX_CPUS is in bios/rombios32.c when building the
tables; the ACPI code just fires them out.


Oh, that seems to be x86 gibberish though :-)

Well the point is that qemu-kvm.c seems to loop over the vcpu_info array
based on smp_cpus, so if we hotplug a CPU and smp_cpus isn't updated
(I cannot find any place that does update it), then we'll end up looping
over only a subset of the CPUs?

Or am I missing something here?

Cheers,
Jes


[patch] reindent ia64 code to match qemu code style

2008-09-26 Thread Jes Sorensen

Hi,

Xiantao and I have agreed to reformat the ia64 related code so it
better matches the QEMU formatting style.

This patch has zero code change, it is solely reformatting.

It goes on top of the cmos_init() tidyup patch I sent out earlier today.

Thanks,
Jes
Reindent a bunch of ia64 code to better match the QEMU coding style.

Signed-off-by: Jes Sorensen [EMAIL PROTECTED]
Signed-off-by: Xiantao Zhang [EMAIL PROTECTED]

---
 qemu/hw/ipf.c|  168 ++-
 qemu/qemu-kvm-ia64.c |7 -
 qemu/target-ia64/cpu.h   |   12 +--
 qemu/target-ia64/firmware.c  |   39 -
 qemu/target-ia64/firmware.h  |8 --
 qemu/target-ia64/op_helper.c |   26 ++
 6 files changed, 125 insertions(+), 135 deletions(-)

Index: kvm-userspace.git/qemu/hw/ipf.c
===
--- kvm-userspace.git.orig/qemu/hw/ipf.c
+++ kvm-userspace.git/qemu/hw/ipf.c
@@ -54,48 +54,49 @@
 
 static uint32_t ipf_to_legacy_io(target_phys_addr_t addr)
 {
-   return (uint32_t)(((addr0x3ff)  12  2)|((addr)  0x3));
+return (uint32_t)(((addr0x3ff)  12  2)|((addr)  0x3));
 }
 
 static void ipf_legacy_io_writeb(void *opaque, target_phys_addr_t addr,
 uint32_t val) {
-   uint32_t port = ipf_to_legacy_io(addr);
-   cpu_outb(0, port, val);
+uint32_t port = ipf_to_legacy_io(addr);
+
+cpu_outb(0, port, val);
 }
 
 static void ipf_legacy_io_writew(void *opaque, target_phys_addr_t addr,
 uint32_t val) {
-   uint32_t port = ipf_to_legacy_io(addr);
+uint32_t port = ipf_to_legacy_io(addr);
 
-   cpu_outw(0, port, val);
+cpu_outw(0, port, val);
 }
 
 static void ipf_legacy_io_writel(void *opaque, target_phys_addr_t addr,
 uint32_t val) {
-   uint32_t port = ipf_to_legacy_io(addr);
+uint32_t port = ipf_to_legacy_io(addr);
 
-   cpu_outl(0, port, val);
+cpu_outl(0, port, val);
 }
 
 static uint32_t ipf_legacy_io_readb(void *opaque, target_phys_addr_t addr)
 {
-   uint32_t port = ipf_to_legacy_io(addr);
+uint32_t port = ipf_to_legacy_io(addr);
 
-   return cpu_inb(0, port);
+return cpu_inb(0, port);
 }
 
 static uint32_t ipf_legacy_io_readw(void *opaque, target_phys_addr_t addr)
 {
-   uint32_t port = ipf_to_legacy_io(addr);
+uint32_t port = ipf_to_legacy_io(addr);
 
-   return cpu_inw(0, port);
+return cpu_inw(0, port);
 }
 
 static uint32_t ipf_legacy_io_readl(void *opaque, target_phys_addr_t addr)
 {
-   uint32_t port = ipf_to_legacy_io(addr);
+uint32_t port = ipf_to_legacy_io(addr);
 
-   return cpu_inl(0, port);
+return cpu_inl(0, port);
 }
 
 static CPUReadMemoryFunc *ipf_legacy_io_read[3] = {
@@ -112,7 +113,7 @@
 
 static void pic_irq_request(void *opaque, int irq, int level)
 {
-   fprintf(stderr, "pic_irq_request called!\n");
+    fprintf(stderr, "pic_irq_request called!\n");
 }
 
 /* PC cmos mappings */
@@ -147,6 +148,7 @@
 {
 RTCState *s = rtc_state;
 int cylinders, heads, sectors;
+
bdrv_get_geometry_hint(hd, &cylinders, &heads, &sectors);
 rtc_set_memory(s, type_ofs, 47);
 rtc_set_memory(s, info_ofs, cylinders);
@@ -221,10 +223,12 @@
 /* set boot devices, and disable floppy signature check if requested */
 #define PC_MAX_BOOT_DEVICES 3
 nbds = strlen(boot_device);
+
 if (nbds > PC_MAX_BOOT_DEVICES) {
 fprintf(stderr, "Too many boot devices for PC\n");
 exit(1);
 }
+
 for (i = 0; i < nbds; i++) {
 bds[i] = boot_device2nibble(boot_device[i]);
 if (bds[i] == 0) {
@@ -233,6 +237,7 @@
 exit(1);
 }
 }
+
 rtc_set_memory(s, 0x3d, (bds[1] << 4) | bds[0]);
 rtc_set_memory(s, 0x38, (bds[2] << 4) | (fd_bootchk ? 0x0 : 0x1));
 
@@ -250,6 +255,7 @@
 nb++;
if (fd1 < 3)
 nb++;
+
 switch (nb) {
 case 0:
 break;
@@ -260,6 +266,7 @@
 val |= 0x41; /* 2 drives, ready for boot */
 break;
 }
+
 val |= 0x02; /* FPU is there */
 val |= 0x04; /* PS/2 mouse installed */
 rtc_set_memory(s, REG_EQUIPMENT_BYTE, val);
@@ -277,12 +284,13 @@
 if (hd_table[i]) {
 int cylinders, heads, sectors, translation;
 /* NOTE: bdrv_get_geometry_hint() returns the physical
-geometry.  It is always such that: 1 <= sects <= 63, 1
-<= heads <= 16, 1 <= cylinders <= 16383. The BIOS
-geometry can be different if a translation is done. */
+   geometry.  It is always such that: 1 <= sects <= 63, 1
+   <= heads <= 16, 1 <= cylinders <= 16383. The BIOS
+   geometry can be different if a translation is done. */
 translation = bdrv_get_translation_hint(hd_table[i]);
 if (translation == BIOS_ATA_TRANSLATION_AUTO) {
-bdrv_get_geometry_hint(hd_table[i], &cylinders, &heads, &sectors);
+  

Re: [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio

2008-09-26 Thread Ryan Harper
* Anthony Liguori [EMAIL PROTECTED] [2008-09-26 11:03]:
 Revision: 5323
  http://svn.sv.gnu.org/viewvc/?view=rev&root=qemu&revision=5323
 Author:   aliguori
 Date: 2008-09-26 15:59:29 + (Fri, 26 Sep 2008)
 
 Log Message:
 ---
 Implement an fd pool to get real AIO with posix-aio
 
 This patch implements a simple fd pool to allow many AIO requests with
 posix-aio.  The result is significantly improved performance (identical to 
 that
 reported for linux-aio) for both cache=on and cache=off.
 
 The fundamental problem with posix-aio is that it limits itself to one thread
 per-file descriptor.  I don't know why this is, but this patch provides a 
 simple
 mechanism to work around this (duplicating the file descriptor).
 
 This isn't a great solution, but it seems like a reasonable intermediate step
 between posix-aio and a custom thread-pool to replace it.
 
 Ryan Harper will be posting some performance analysis he did comparing 
 posix-aio
 with fd pooling against linux-aio.  The size of the posix-aio thread pool and
 the fd pool were largely determined by him based on this analysis.
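
The core idea is small enough to sketch (pool size and names below are
illustrative, not the actual qemu commit):

    #include <unistd.h>

    #define FD_POOL_SIZE 64

    /* Duplicate the backing fd so posix-aio's one-thread-per-fd
     * behavior no longer serializes our requests. */
    static int fd_pool[FD_POOL_SIZE];

    static void fd_pool_init(int fd)
    {
        int i;

        for (i = 0; i < FD_POOL_SIZE; i++)
            fd_pool[i] = dup(fd);
    }

    /* Hand out pool slots round-robin so in-flight requests spread
     * across the duplicated descriptors. */
    static int fd_pool_get(void)
    {
        static int next;

        return fd_pool[next++ % FD_POOL_SIZE];
    }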

I'll have some more data to post in a bit, but for now, bumping the fd
pool up to 64 and ensuring we init aio to support a thread per fd, we
mostly match linux aio performance with a simpler implementation.  For
randomwrites, fd_pool lags a bit, but I've got other data that shows in
most scenarios, fd_pool matches linux aio performance and does so with
less CPU consumption.
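
For reference, the glibc pool itself can be sized via aio_init(); the
values below just mirror the 64-fd experiment (illustrative, not the
exact change tested):

    #include <aio.h>
    #include <string.h>

    static void posix_aio_tune(void)
    {
        struct aioinit ai;

        memset(&ai, 0, sizeof(ai));
        ai.aio_threads = 64;    /* one worker thread per pooled fd */
        ai.aio_num = 64;        /* expected in-flight requests */
        aio_init(&ai);
    }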

Results:

16k randwrite 1 thread, 74 iodepth | MB/s | avg sub lat (us) | avg comp lat (ms)
---+--+--+--
baremetal (O_DIRECT, aka cache=off)| 61.2 |   13.07  |  19.59
kvm: cache=off posix-aio w/o patch |  4.7 | 3467.44  | 254.08
kvm: cache=off linux-aio   | 61.1 |   75.35  |  19.57
kvm: cache=on  posix-aio w/o patch |127.0 |  115.78  |   9.19
kvm: cache=on  posix-aio w/ patch  |126.0 |   67.35  |   9.30
 new results --+--+--+--
kvm:cache=off posix-aio fd_pool[16]| 33.5 |   14.28  |  49.19
kvm:cache=off posix-aio fd_pool[64]| 51.1 |   14.86  |  23.66


16k write 1 thread, 74 iodepth | MB/s | avg sub lat (us) | avg comp lat (ms)
---+--+--+--
baremetal (O_DIRECT, aka cache=off)|128.1 |   10.90  |   9.45
kvm: cache=off posix-aio w/o patch |  5.1 | 3152.00  | 231.06 
kvm: cache=off linux-aio   |130.0 |   83.83  |   8.99
kvm: cache=on  posix-aio w/o patch |184.0 |   80.46  |   6.35
kvm: cache=on  posix-aio w/ patch  |165.0 |   70.90  |   7.09
 new results --+--+--+--
kvm:cache=off posix-aio fd_pool[16]| 78.2 |   58.24  |  15.43
kvm:cache=off posix-aio fd_pool[64]|129.0 |   71.62  |   9.11


-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
[EMAIL PROTECTED]


Re: [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio

2008-09-26 Thread Anthony Liguori

Ryan Harper wrote:

* Anthony Liguori [EMAIL PROTECTED] [2008-09-26 11:03]:
  

Revision: 5323
  http://svn.sv.gnu.org/viewvc/?view=rev&root=qemu&revision=5323
Author:   aliguori
Date: 2008-09-26 15:59:29 + (Fri, 26 Sep 2008)

Log Message:
---
Implement an fd pool to get real AIO with posix-aio

This patch implements a simple fd pool to allow many AIO requests with
posix-aio.  The result is significantly improved performance (identical to that
reported for linux-aio) for both cache=on and cache=off.

The fundamental problem with posix-aio is that it limits itself to one thread
per-file descriptor.  I don't know why this is, but this patch provides a simple
mechanism to work around this (duplicating the file descriptor).

This isn't a great solution, but it seems like a reasonable intermediate step
between posix-aio and a custom thread-pool to replace it.

Ryan Harper will be posting some performance analysis he did comparing posix-aio
with fd pooling against linux-aio.  The size of the posix-aio thread pool and
the fd pool were largely determined by him based on this analysis.



I'll have some more data to post in a bit, but for now, bumping the fd
pool up to 64 and ensuring we init aio to support a thread per fd, we
mostly match linux aio performance with a simpler implementation.  For
randomwrites, fd_pool lags a bit, but I've got other data that shows in
most scenarios, fd_pool matches linux aio performance and does so with
less CPU consumption.

Results:

16k randwrite 1 thread, 74 iodepth | MB/s | avg sub lat (us) | avg comp lat (ms)
---+--+--+--
baremetal (O_DIRECT, aka cache=off)| 61.2 |   13.07  |  19.59
kvm: cache=off posix-aio w/o patch |  4.7 | 3467.44  | 254.08
  


So with posix-aio, once we have many requests, each request is going to 
block until the request completes.  I don't fully understand why the 
average completion latency is so high because in theory, there should be 
no delay between completion and submission.  Maybe it has to do with the 
fact that we spend so much time blocking during submission, that the 
io-thread doesn't get a chance to run.  I bet if we dropped the 
qemu_mutex during submission, the completion latency would drop to a 
very small number.  Not worth actually testing.



kvm: cache=off linux-aio   | 61.1 |   75.35  |  19.57
  


The fact that the submission latency is so high confirms what I've been 
saying about linux-aio submissions being very unoptimal. That is really 
quite high.



kvm: cache=on  posix-aio w/o patch |127.0 |  115.78  |   9.19
kvm: cache=on  posix-aio w/ patch  |126.0 |   67.35  |   9.30
  


It looks like 127mb/s is pretty close to the optimal cached write time.  
When using caching, writes can complete almost immediately so it's not 
surprising that submission latency is so low (even though it's blocking 
during submission).


I am surprised that w/patch has a latency that's so high.  I think that 
suggests that requests are queuing up.  I bet increasing the aio_num 
field would reduce this number.



 new results --+--+--+--
kvm:cache=off posix-aio fd_pool[16]| 33.5 |   14.28  |  49.19
kvm:cache=off posix-aio fd_pool[64]| 51.1 |   14.86  |  23.66
  


I assume you tried to bump from 64 to something higher and couldn't make 
up the lost bandwidth?



16k write 1 thread, 74 iodepth | MB/s | avg sub lat (us) | avg comp lat (ms)
---+--+--+--
baremetal (O_DIRECT, aka cache=off)|128.1 |   10.90  |   9.45
kvm: cache=off posix-aio w/o patch |  5.1 | 3152.00  | 231.06 
kvm: cache=off linux-aio   |130.0 |   83.83  |   8.99

kvm: cache=on  posix-aio w/o patch |184.0 |   80.46  |   6.35
kvm: cache=on  posix-aio w/ patch  |165.0 |   70.90  |   7.09
 new results --+--+--+--
kvm:cache=off posix-aio fd_pool[16]| 78.2 |   58.24  |  15.43
kvm:cache=off posix-aio fd_pool[64]|129.0 |   71.62  |   9.11
  


That's a nice result.  We could probably improve the latency by tweaking 
the queue sizes.


Very nice work!  Thanks for doing the thorough analysis.

Regards,

Anthony Liguori


  




Re: [Qemu-devel] [5323] Implement an fd pool to get real AIO with posix-aio

2008-09-26 Thread Ryan Harper
* Anthony Liguori [EMAIL PROTECTED] [2008-09-26 13:37]:
 Ryan Harper wrote:
 * Anthony Liguori [EMAIL PROTECTED] [2008-09-26 11:03]:

 kvm: cache=on  posix-aio w/o patch |127.0 |  115.78  |   9.19
 kvm: cache=on  posix-aio w/ patch  |126.0 |   67.35  |   9.30
   
 
 It looks like 127mb/s is pretty close to the optimal cached write time.  
 When using caching, writes can complete almost immediately so it's not 
 surprising that submission latency is so low (even though it's blocking 
 during submission).
 
 I am surprised that w/patch has a latency that's so high.  I think that 
 suggests that requests are queuing up.  I bet increasing the aio_num 
 field would reduce this number.

Yeah, there is plenty of room to twiddle with the threads and number of
outstanding ios, but that'll take quite a bit of time to generate the
data and compare.

  new results 
 --+--+--+--
 kvm:cache=off posix-aio fd_pool[16]| 33.5 |   14.28  |  49.19
 kvm:cache=off posix-aio fd_pool[64]| 51.1 |   14.86  |  23.66
   
 
 I assume you tried to bump from 64 to something higher and couldn't make 
 up the lost bandwidth?

Very slightly, switching to 128 threads/fds gave another 1MB/s. 

 16k write 1 thread, 74 iodepth | MB/s | avg sub lat (us) | avg comp 
 lat (ms)
 ---+--+--+--
 baremetal (O_DIRECT, aka cache=off)|128.1 |   10.90  |   9.45
 kvm: cache=off posix-aio w/o patch |  5.1 | 3152.00  | 231.06 
 kvm: cache=off linux-aio   |130.0 |   83.83  |   8.99
 kvm: cache=on  posix-aio w/o patch |184.0 |   80.46  |   6.35
 kvm: cache=on  posix-aio w/ patch  |165.0 |   70.90  |   7.09
  new results 
 --+--+--+--
 kvm:cache=off posix-aio fd_pool[16]| 78.2 |   58.24  |  15.43
 kvm:cache=off posix-aio fd_pool[64]|129.0 |   71.62  |   9.11
   
 
 That's a nice result.  We could probably improve the latency by tweaking 
 the queue sizes.

Yeah, I was quite pleased to see a simpler solution perform so well.
 
 Very nice work!  Thanks for doing the thorough analysis.

Thanks, very happy to see a significant improvement in IO here.

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
[EMAIL PROTECTED]


Re: [PATCH] VT-d: Fix iommu map page for mmio pages

2008-09-26 Thread Muli Ben-Yehuda
On Thu, Sep 25, 2008 at 04:51:24PM -0500, Anthony Liguori wrote:
 Muli Ben-Yehuda wrote:
 On Thu, Sep 25, 2008 at 05:45:30PM +0300, Avi Kivity wrote:
   
 Han, Weidong wrote:
 
 Is it possible to DMA into an mmio page?
 I don't see why not.
 

 Two reasons. First it makes no sense. MMIO pages don't have RAM
 backing them, they have another device's register window. So the
 effect of DMA'ing into an MMIO page would be for one device to DMA
 into the register window of another device, which sounds to me insane.
   

 MMIO isn't just a register window.  It may be an on-device buffer.

Unlikely, but ok.

 For instance, all packets are stored in a buffer on the ne2k that's
 mapped via mmio.  It would seem entirely reasonable to me to program
 an IDE driver to DMA directly into the device's packet buffer.

It would be insane to me. Have you tried this on real hardware and
seen it work?

 Second, and more importantly, I've seen systems where doing the
 above caused a nice, immediate, reboot. So I think that unless
 someone comes with a valid scenario where we need to support it or
 something breaks, we'd better err on the side of caution and not
 map pages that should not be DMA targets.
   

 Xen maps the MMIO pages into the VT-d table.  The system you were using 
 could have just been busted.  I think the burden is to prove that this is 
 illegal (via the architecture specification).

I strongly disagree. You are advocating something that is potentially
unsafe---for the sake of code simplicity?! I am advocating caution in
what we let an *untrusted* guest do.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
  xxx
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/


Re: [PATCH 9/9] x86/iommu: use dma_ops_list in get_dma_ops

2008-09-26 Thread Muli Ben-Yehuda
On Fri, Sep 26, 2008 at 02:32:43PM +0200, Joerg Roedel wrote:

 Ok, the allocation only matters for dma_alloc_coherent. Fujita
 introduced a generic software-based dma_alloc_coherent recently
 which you can use for that. I think implementing PVDMA into an own
 dma_ops backend and multiplex it using my patches introduces less
 overhead than an additional layer over the current dma_ops
 implementation.

I'm not sure what you have in mind, but I agree with Amit that
conceptually pvdma should be called after the guest's native dma_ops
have done their thing. This is not just for nommu, consider a guest
that is using an (emulated) hardware IOMMU, or that wants to use
swiotlb. We can't replicate their functionality in the pv_dma_ops
layer, we have to let them run first and then pass deal with whatever
we get back.

 Another two questions to your approach: What happens if a
 dma_alloc_coherent allocation crosses page boundarys and the gpa's
 are not contiguous in host memory? How will dma masks be handled?

That's a very good question. The host will need to be aware of a
device's DMA capabilities in order to return I/O addresses (which
could be hpa's if you don't have an IOMMU) that satisfy them. That's
quite a pain.

Cheers,
Muli
-- 
The First Workshop on I/O Virtualization (WIOV '08)
Dec 2008, San Diego, CA, http://www.usenix.org/wiov08/
  xxx
SYSTOR 2009---The Israeli Experimental Systems Conference
http://www.haifa.il.ibm.com/conferences/systor2009/


Status of pci passthrough work?

2008-09-26 Thread Thomas Fjellstrom
I'm very interested in being able to pass a few devices through to kvm guests. 
I'm wondering what exactly is working now, and how I can start testing it?

The latest kvm release doesn't seem to include any support for it in 
userspace, so I can't test it with that...

Basically what I want to do is assign two or three physical NICs (100Mbit 
and gigabit) to one VM, and some TV tuner cards to another.

Also, I'm wondering if AMD's IOMMU in the SB750 southbridge is supported 
yet? Or if anyone is working on it?

-- 
Thomas Fjellstrom
[EMAIL PROTECTED]


Re: [PATCH 4/4] kvm: bios: switch MTRRs to cover only the PCI range and default to WB

2008-09-26 Thread Yang, Sheng
On Friday 26 September 2008 01:52:29 Alex Williamson wrote:
 kvm: bios: switch MTRRs to cover only the PCI range and default to WB

 This matches how some bare metal machines report MTRRs and avoids
 the problem of running out of MTRRs to cover all of RAM.

 Signed-off-by: Alex Williamson [EMAIL PROTECTED]
 ---

  bios/rombios32.c |   24 
  1 files changed, 4 insertions(+), 20 deletions(-)

 diff --git a/bios/rombios32.c b/bios/rombios32.c
 index f8edf18..592abf9 100755
 --- a/bios/rombios32.c
 +++ b/bios/rombios32.c
 @@ -494,7 +494,6 @@ void setup_mtrr(void)
  uint8_t valb[8];
  uint64_t val;
  } u;
 -uint64_t vbase, vmask;

  mtrr_cap = rdmsr(MSR_MTRRcap);
  vcnt = mtrr_cap & 0xff;
 @@ -521,25 +520,10 @@ void setup_mtrr(void)
  wrmsr_smp(MSR_MTRRfix4K_E8000, 0);
  wrmsr_smp(MSR_MTRRfix4K_F0000, 0);
  wrmsr_smp(MSR_MTRRfix4K_F8000, 0);
 -vbase = 0;
 ---vcnt; /* leave one mtrr for VRAM */
 -for (i = 0; i < vcnt && vbase < ram_size; ++i) {
 -vmask = (1ull << 40) - 1;
 -while (vbase + vmask + 1 > ram_size)
 -vmask >>= 1;
 -wrmsr_smp(MTRRphysBase_MSR(i), vbase | 6);
 -wrmsr_smp(MTRRphysMask_MSR(i), (~vmask & 0xfffffff000ull) | 0x800);
 -vbase += vmask + 1;
 -}
 -for (vbase = 1ull << 32; i < vcnt && vbase < ram_end; ++i) {
 -vmask = (1ull << 40) - 1;
 -while (vbase + vmask + 1 > ram_end)
 -vmask >>= 1;
 -wrmsr_smp(MTRRphysBase_MSR(i), vbase | 6);
 -wrmsr_smp(MTRRphysMask_MSR(i), (~vmask & 0xfffffff000ull) | 0x800);
 -vbase += vmask + 1;
 -}
 -wrmsr_smp(MSR_MTRRdefType, 0xc00);
 +/* Mark 3.5-4GB as UC, anything not specified defaults to WB */
 +wrmsr_smp(MTRRphysBase_MSR(0), 0xe0000000ull | 0);
 +wrmsr_smp(MTRRphysMask_MSR(0), ~(0x20000000ull - 1) | 0x800);
 +wrmsr_smp(MSR_MTRRdefType, 0xc06);
  }


I think we should do a little more than just write the MSRs to update the
MTRRs.

Intel SDM 10.11.8 "MTRR Considerations in MP Systems" defines the procedure
for modifying MTRR MSRs on MP systems. In particular, step 4 enters no-fill
cache mode (set the CR0.CD bit and clear the NW bit), and step 12 re-enables
caching (clear these two bits).

We rely on these behaviors to detect MTRR updates.
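
For reference, that sequence looks roughly like this (pseudo-C; the
CR0/wbinvd helpers are placeholders, and several SDM steps are elided):

    uint32_t cr0 = read_cr0();

    write_cr0((cr0 | CR0_CD) & ~CR0_NW);   /* step 4: no-fill cache mode */
    wbinvd();                              /* flush caches */
    wrmsr(MSR_MTRRdefType, 0);             /* disable MTRRs */
    /* ... write the new fixed/variable-range MTRRs here ... */
    wrmsr(MSR_MTRRdefType, deftype);       /* re-enable MTRRs */
    wbinvd();
    write_cr0(cr0);                        /* step 12: restore caching */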

(Forgot to raise the bug to Avi, recalled it now...)
--
regards
Yang, Sheng