[COMMIT master] KVM: fix build error: add missing semi-colon in longmode cpuid

2009-05-12 Thread Avi Kivity
From: Randy Dunlap randy.dun...@oracle.com

Add missing ; to fix build error:

arch/x86/kvm/x86.c:1259: error: expected',' or ';' before 'const'

Signed-off-by: Randy Dunlap randy.dun...@oracle.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c6d3ff3..fd0a571 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1252,7 +1252,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, 
u32 function,
 #ifdef CONFIG_X86_64
unsigned f_lm = F(LM);
 #else
-   unsigned f_lm = 0
+   unsigned f_lm = 0;
 #endif
 
/* cpuid 1.edx */
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] Move source sync code into new file sync.mak

2009-05-12 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

This makes it possible to call it without running ./configure.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/Makefile b/Makefile
index 0413212..ff6945e 100644
--- a/Makefile
+++ b/Makefile
@@ -19,21 +19,6 @@ rpmrelease = devel
 
 LINUX = ./linux-2.6
 
-version = $(KVM_VERSION)
-
-_hack = mv $1 $1.orig  \
-   gawk -v version=$(version) -f $(ARCH_DIR)/hack-module.awk $1.orig \
-   | sed '/\#include/! s/\blapic\b/l_apic/g'  $1  rm $1.orig
-
-unifdef = mv $1 $1.orig  cat unifdef.h $1.orig  $1  rm $1.orig
-
-hack = $(call _hack,$T/$(strip $1))
-
-hack-files-x86 = kvm_main.c mmu.c vmx.c svm.c x86.c irq.h lapic.c i8254.c 
kvm_trace.c timer.c
-hack-files-ia64 = kvm_main.c kvm_fw.c kvm_lib.c kvm-ia64.c
-
-hack-files = $(hack-files-$(ARCH_DIR))
-
 ifeq ($(EXT_CONFIG_KVM_TRACE),y)
 module_defines += -DEXT_CONFIG_KVM_TRACE=y
 endif
@@ -48,54 +33,10 @@ all:: prerequisite
-include `pwd`/$(ARCH_DIR)/external-module-compat.h 
$(module_defines)
$$@
 
-sync: header-sync source-sync
-
-T = $(subst -sync,,$@)-tmp
-
-headers-old = $(LINUX)/./include/asm-$(ARCH_DIR)/kvm*.h
-headers-new = $(LINUX)/arch/$(ARCH_DIR)/include/asm/./kvm*.h \
-   $(LINUX)/arch/$(ARCH_DIR)/include/asm/./vmx*.h \
-   $(LINUX)/arch/$(ARCH_DIR)/include/asm/./svm*.h \
-   $(LINUX)/arch/$(ARCH_DIR)/include/asm/./virtext*.h
-
-header-sync:
-   rm -rf $T
-   rsync -R -L \
-$(LINUX)/./include/linux/kvm*.h \
-$(if $(wildcard $(headers-old)), $(headers-old)) \
- $T/
-   $(if $(wildcard $(headers-new)), \
-   rsync -R -L \
-$(wildcard $(headers-new)) \
- $T/include/asm-$(ARCH_DIR)/)
-
-   for i in $$(find $T -name '*.h'); do \
-   $(call unifdef,$$i); done
-   $(call hack, include/linux/kvm.h)
-   $(call hack, include/asm-$(ARCH_DIR)/kvm.h)
-   set -e  for i in $$(find $T -type f -printf '%P '); \
-   do mkdir -p $$(dirname $$i); cmp -s $$i $T/$$i || cp $T/$$i 
$$i; done
-   rm -rf $T
-
-source-sync:
-   rm -rf $T
-   rsync --exclude='*.mod.c' -R \
-$(LINUX)/arch/$(ARCH_DIR)/kvm/./*.[cSh] \
-$(LINUX)/virt/kvm/./*.[cSh] \
-$T/
-
-   for i in $$(find $T -name '*.c'); do \
-   $(call unifdef,$$i); done
-
-   for i in $(hack-files); \
-   do $(call hack, $$i); done
-
-   for i in $$(find $T -type f -printf '%P '); \
-   do cmp -s $(ARCH_DIR)/$$i $T/$$i || cp $T/$$i $(ARCH_DIR)/$$i; 
done
-   rm -rf $T
-
 include $(MAKEFILE_PRE)
 
+include sync.mak
+
 install:
mkdir -p $(DESTDIR)/$(INSTALLDIR)
cp $(ARCH_DIR)/*.ko $(DESTDIR)/$(INSTALLDIR)
diff --git a/sync.mak b/sync.mak
new file mode 100644
index 000..801772f
--- /dev/null
+++ b/sync.mak
@@ -0,0 +1,63 @@
+LINUX = ./linux-2.6
+
+version = $(KVM_VERSION)
+
+_hack = mv $1 $1.orig  \
+   gawk -v version=$(version) -f $(ARCH_DIR)/hack-module.awk $1.orig \
+   | sed '/\#include/! s/\blapic\b/l_apic/g'  $1  rm $1.orig
+
+unifdef = mv $1 $1.orig  cat unifdef.h $1.orig  $1  rm $1.orig
+
+hack = $(call _hack,$T/$(strip $1))
+
+hack-files-x86 = kvm_main.c mmu.c vmx.c svm.c x86.c irq.h lapic.c i8254.c 
kvm_trace.c timer.c
+hack-files-ia64 = kvm_main.c kvm_fw.c kvm_lib.c kvm-ia64.c
+
+hack-files = $(hack-files-$(ARCH_DIR))
+
+sync: header-sync source-sync
+
+T = $(subst -sync,,$@)-tmp
+
+headers-old = $(LINUX)/./include/asm-$(ARCH_DIR)/kvm*.h
+headers-new = $(LINUX)/arch/$(ARCH_DIR)/include/asm/./kvm*.h \
+   $(LINUX)/arch/$(ARCH_DIR)/include/asm/./vmx*.h \
+   $(LINUX)/arch/$(ARCH_DIR)/include/asm/./svm*.h \
+   $(LINUX)/arch/$(ARCH_DIR)/include/asm/./virtext*.h
+
+header-sync:
+   rm -rf $T
+   rsync -R -L \
+$(LINUX)/./include/linux/kvm*.h \
+$(if $(wildcard $(headers-old)), $(headers-old)) \
+ $T/
+   $(if $(wildcard $(headers-new)), \
+   rsync -R -L \
+$(wildcard $(headers-new)) \
+ $T/include/asm-$(ARCH_DIR)/)
+
+   for i in $$(find $T -name '*.h'); do \
+   $(call unifdef,$$i); done
+   $(call hack, include/linux/kvm.h)
+   $(call hack, include/asm-$(ARCH_DIR)/kvm.h)
+   set -e  for i in $$(find $T -type f -printf '%P '); \
+   do mkdir -p $$(dirname $$i); cmp -s $$i $T/$$i || cp $T/$$i 
$$i; done
+   rm -rf $T
+
+source-sync:
+   rm -rf $T
+   rsync --exclude='*.mod.c' -R \
+$(LINUX)/arch/$(ARCH_DIR)/kvm/./*.[cSh] \
+$(LINUX)/virt/kvm/./*.[cSh] \
+$T/
+
+   for i in $$(find $T -name '*.c'); do \
+   $(call unifdef,$$i); done
+
+   for i in $(hack-files); \
+   do $(call hack, $$i); done
+
+   for i in $$(find $T -type f -printf '%P '); \
+   do cmp -s $(ARCH_DIR)/$$i $T/$$i || cp $T/$$i $(ARCH_DIR)/$$i; 
done
+   rm -rf $T
+
--
To unsubscribe from this 

[COMMIT master] Make vmx-debug.c to compile again

2009-05-12 Thread Avi Kivity
From: Gleb Natapov g...@redhat.com

Add TPR value checking.

Signed-off-by: Gleb Natapov g...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/x86/vmx-debug.c b/x86/vmx-debug.c
index 29316a0..d466f03 100644
--- a/x86/vmx-debug.c
+++ b/x86/vmx-debug.c
@@ -17,17 +17,14 @@
 #include linux/highmem.h
 
 #include linux/kvm_host.h
+#include asm/vmx.h
+#include asm/kvm_host.h
+#include mmu.h
+#include lapic.h
 #include debug.h
 
 #ifdef KVM_DEBUG
 
-static const char *vmx_msr_name[] = {
-   MSR_EFER, MSR_STAR, MSR_CSTAR,
-   MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR
-};
-
-#define NR_VMX_MSR (sizeof(vmx_msr_name) / sizeof(char*))
-
 static unsigned long vmcs_readl(unsigned long field)
 {
unsigned long value;
@@ -56,29 +53,21 @@ static u64 vmcs_read64(unsigned long field)
 #endif
 }
 
-void show_msrs(struct kvm_vcpu *vcpu)
-{
-   int i;
-
-   for (i = 0; i  NR_VMX_MSR; ++i) {
-   vcpu_printf(vcpu, %s: %s=0x%llx\n,
-  __FUNCTION__,
-  vmx_msr_name[i],
-  vcpu-guest_msrs[i].data);
-   }
-}
-
 void show_code(struct kvm_vcpu *vcpu)
 {
gva_t rip = vmcs_readl(GUEST_RIP);
u8 code[50];
char buf[30 + 3 * sizeof code];
int i;
+   gpa_t gpa;
 
if (!is_long_mode(vcpu))
rip += vmcs_readl(GUEST_CS_BASE);
 
-   kvm_read_guest(vcpu, rip, sizeof code, code);
+   gpa = vcpu-arch.mmu.gva_to_gpa(vcpu, rip);
+   if (gpa == UNMAPPED_GVA)
+   return;
+   kvm_read_guest(vcpu-kvm, gpa, code, sizeof code);
for (i = 0; i  sizeof code; ++i)
sprintf(buf + i * 3,  %02x, code[i]);
vcpu_printf(vcpu, code: %lx%s\n, rip, buf);
@@ -98,6 +87,7 @@ void show_irq(struct kvm_vcpu *vcpu,  int irq)
unsigned long idt_base = vmcs_readl(GUEST_IDTR_BASE);
unsigned long idt_limit = vmcs_readl(GUEST_IDTR_LIMIT);
struct gate_struct gate;
+   gpa_t gpa;
 
if (!is_long_mode(vcpu))
vcpu_printf(vcpu, %s: not in long mode\n, __FUNCTION__);
@@ -109,7 +99,11 @@ void show_irq(struct kvm_vcpu *vcpu,  int irq)
return;
}
 
-   if (kvm_read_guest(vcpu, idt_base + irq * sizeof(gate), sizeof(gate), 
gate) != sizeof(gate)) {
+   gpa = vcpu-arch.mmu.gva_to_gpa(vcpu, idt_base + irq * sizeof(gate));
+   if (gpa == UNMAPPED_GVA)
+   return;
+
+   if (kvm_read_guest(vcpu-kvm, gpa, gate, sizeof(gate)) != 
sizeof(gate)) {
vcpu_printf(vcpu, %s: 0x%x read_guest err\n,
   __FUNCTION__,
   irq);
@@ -127,12 +121,16 @@ void show_page(struct kvm_vcpu *vcpu,
 gva_t addr)
 {
u64 *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+   gpa_t gpa;
 
if (!buf)
return;
 
addr = PAGE_MASK;
-   if (kvm_read_guest(vcpu, addr, PAGE_SIZE, buf)) {
+   gpa = vcpu-arch.mmu.gva_to_gpa(vcpu, addr);
+   if (gpa == UNMAPPED_GVA)
+   return;
+   if (kvm_read_guest(vcpu-kvm, gpa, buf, PAGE_SIZE)) {
int i;
for (i = 0; i   PAGE_SIZE / sizeof(u64) ; i++) {
u8 *ptr = (u8*)buf[i];
@@ -150,8 +148,12 @@ void show_page(struct kvm_vcpu *vcpu,
 void show_u64(struct kvm_vcpu *vcpu, gva_t addr)
 {
u64 buf;
+   gpa_t gpa;
 
-   if (kvm_read_guest(vcpu, addr, sizeof(u64), buf) == sizeof(u64)) {
+   gpa = vcpu-arch.mmu.gva_to_gpa(vcpu, addr);
+   if (gpa == UNMAPPED_GVA)
+   return;
+   if (kvm_read_guest(vcpu-kvm, gpa, buf, sizeof(u64)) == sizeof(u64)) {
u8 *ptr = (u8*)buf;
int j;
vcpu_printf(vcpu,  0x%16.16lx:, addr);
@@ -178,6 +180,8 @@ int vm_entry_test_guest(struct kvm_vcpu *vcpu)
unsigned long sysenter_esp;
unsigned long sysenter_eip;
unsigned long rflags;
+   unsigned long cpu_exec_ctrl, cpu_secondary_exec_ctrl;
+   unsigned long tpr_threshold;
 
int long_mode;
int virtual8086;
@@ -219,38 +223,38 @@ int vm_entry_test_guest(struct kvm_vcpu *vcpu)
 
cr0 = vmcs_readl(GUEST_CR0);
 
-   if (!(cr0  CR0_PG_MASK)) {
+   if (!(cr0  X86_CR0_PG)) {
vcpu_printf(vcpu, %s: cr0 0x%lx, PG is not set\n,
   __FUNCTION__, cr0);
return 0;
}
 
-   if (!(cr0  CR0_PE_MASK)) {
+   if (!(cr0  X86_CR0_PE)) {
vcpu_printf(vcpu, %s: cr0 0x%lx, PE is not set\n,
   __FUNCTION__, cr0);
return 0;
}
 
-   if (!(cr0  CR0_NE_MASK)) {
+   if (!(cr0  X86_CR0_NE)) {
vcpu_printf(vcpu, %s: cr0 0x%lx, NE is not set\n,
   __FUNCTION__, cr0);
return 0;
}
 
-   if (!(cr0  CR0_WP_MASK)) {
+   if (!(cr0  X86_CR0_WP)) {
vcpu_printf(vcpu, %s: cr0 

[COMMIT stable-0.10] Merge branch 'stable-0.10' of git://git.sv.gnu.org/qemu into stable-0.10

2009-05-12 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

* 'stable-0.10' of git://git.sv.gnu.org/qemu:
  Improve block range checks
  e1000: Do not reinit pci config space to 0
  AIO deletion race fix
  reset state for load_linux
  register reset handler for option_roms
  Fix cluster freeing in qcow2
  Enable power button even generation.

Conflicts:
pc-bios/bios.bin

Signed-off-by: Avi Kivity a...@redhat.com
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] Compile and cache regular expressions

2009-05-12 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Speeds up hacking the source.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/sync b/sync
index cb4a9db..4a89296 100755
--- a/sync
+++ b/sync
@@ -14,6 +14,14 @@ if len(sys.argv) = 2:
 
 linux = 'linux-2.6'
 
+_re_cache = {}
+
+def re_cache(regexp):
+global _re_cache
+if regexp not in _re_cache:
+_re_cache[regexp] = re.compile(regexp)
+return _re_cache[regexp]
+
 def __hack(data):
 compat_apis = str.split(
 'INIT_WORK desc_struct ldttss_desc64 desc_ptr '
@@ -24,11 +32,11 @@ def __hack(data):
 anon_inodes = anon_inodes_exit = False
 result = []
 def sub(regexp, repl, str):
-return re.sub(regexp, repl, str)
+return re_cache(regexp).sub(repl, str)
 for line in data.splitlines():
 orig = line
 def match(regexp):
-return re.search(regexp, line)
+return re_cache(regexp).search(line)
 def w(line, result = result):
 result.append(line)
 f = line.split()
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM on Via Nano (Isaiah) CPUs? Virus checked

2009-05-12 Thread Avi Kivity

Andreas Tanz wrote:

Craig Metz wrote:


In message 49d396ab.6090...@redhat.com, you write:
  
  
Via engineers have contacted me and confirmed that this is a problem in 
the processor.



  Is there a known-fixed CPU revision?

  Is there a way to identify working vs. non-working chips, either from IC
stamp or from /proc/cpuinfo? (Bonus: is it possible to put a check and an error
into the kvm-intel kernel model?)
  
  

I have no idea.  Please contact Via for this information.




Hi,

I've read an EMail from VIA, telling that the Nano must be at least stepping 3.
Prior steppings have a buggy vt implementation...
(Damn! I have stepping 2 :-I )
  


Can you send your /proc/cpuinfo, I'll try to blacklist it.


--
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH v4 2/2] kvm: add support for irqfd via eventfd-notification interface

2009-05-12 Thread Davide Libenzi
On Mon, 11 May 2009, Gregory Haskins wrote:

 Davide Libenzi wrote:
  On Wed, 6 May 2009, Gregory Haskins wrote:
 

  If there isn't any more feedback on the series from Al, Avi,
  etc...please formally submit your eventfd patch so this series is
  available for Avi to pull in for inclusion when/if he deems it fit.
  
 
  Did you decide you will be using those bits?

 
 Hi Davide,
   It appears that we will not end up needing them since the new version
 I am about to push goes back to creating the eventfd in userspace to
 begin with, per Avi's request.

That looks like a good idea.


- Davide


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Implement generic double fault generation mechanism

2009-05-12 Thread Gleb Natapov
On Tue, May 12, 2009 at 01:35:31PM +0800, Dong, Eddie wrote:
 Gleb Natapov wrote:
  On Mon, May 11, 2009 at 09:04:52AM +0800, Dong, Eddie wrote:
  
  There is not point referring to current code. Current code does not
  handle serial exceptions properly. So fix it in your patch
  otherwise I propose to use my patch that fixes current code
  (http://patchwork.kernel.org/patch/21829/).
  
  
  I would like Avi to decide. As comments to the difference of 2
  patches, my undrestanding is that I am addressing the problem base
  on SDM 5-4 with the answer to serial injection as first in first
  service. Your patch doesn;t solve generic double fault case for
  example exception 11 on 11, or GP on GP which needs to be converted
  to #DF per SDM, rather you only handle the case the secondary
  exception is PF,  and servicing PF.  
  
  There is nothing to decide really. I prefer your patch with serial
  exception handling fixed. If you'll not do it I'll do it.
 
 OK, an additional patch will be constructive but my position is neutral. The 
 reason (mentioned) is:
 
 1: Current KVM just WARN_ON for those case (and never be hit), so the this 
 patch won't introduce 
 additional issues. Either printk or WARN_ON to notify us in case we met the 
 problem in future is safer way for me.
 
But current KVM also replace pending exception with a newer one after
WARN_ON. I agree that real OSes (at least common ones) never hit this
case. But it is possible to hit it from a guest and I have a test case.
 
 2: In case of real serial ecception happens, from architectural point of 
 view, I think we'd better consult Table 5-2 to prioritize them, which is 
 neither reserving former exception nor overwritting. But as you mentioned, 
 the list is not completed. My point is that this is another complicated 
 scenario that we should spend time in future, but not related to current 
 patch.
 
If you can get more complete info about what real CPU does in case of
simultaneous exceptions it would be nice. I think CPU is smart enough
to understand when second exception happened while trying to handle the
first one and handle the second one first in this case. Otherwise I
don't see how it could work.

 3: This function will soon needs to be extended to cover IRQ case too, which 
 needs to push back the overwritten IRQ. We need a total solution for this, so 
 I prefer to do that some time later.
 
I don't think that IRQ should be handled by this function. At leas it
should still be stored in its own queue.

 4: I prefer to split issue if possible. 
 
 
That is OK, You can send two patches. The first one will WARN_ON and
overwrite exception like the current code does. And the second one will
remove WARN_ON explaining that this case is actually possible to trigger
from a guest.

  
  I can check with internal architecture to see what does handle
  exceptions serially mean in really. For me serial means first in
  first out, and thus we should remain 1st exception.  
  
  There is a table 5.2 that defines an order between some events.  The
  table is not complete, I don't see #DE there for instance.  But
  consider 
  this case: #DE (or #NP) happens while exception stack is paged out so
  #PF happens next. #PF is handled by TSS gate so it uses its own stack
  and it fixes exception stack in its handler. If we drop #PF because
  #DE is already waiting we will keep trying to inject #DE
  indefinitely. The result is hanging QEMU process eating 100% cpu
  time. If we replace #DE with #PF on the other hand then #PF handler
  will fix exception stack instruction that caused #DE will be
  re-executed, #DE regenerated and handled properly. So which scenario
  do you prefer? 
 
 See above.
 
 Thx, eddie
--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM on Via Nano (Isaiah) CPUs? Virus checked

2009-05-12 Thread Andreas Tanz
 Craig Metz wrote:
  In message 49d396ab.6090...@redhat.com, you write:

  Via engineers have contacted me and confirmed that this is a problem in 
  the processor.
  
 
Is there a known-fixed CPU revision?
 
Is there a way to identify working vs. non-working chips, either from IC
  stamp or from /proc/cpuinfo? (Bonus: is it possible to put a check and an 
  error
  into the kvm-intel kernel model?)

 
 I have no idea.  Please contact Via for this information.
 

Hi,

I've read an EMail from VIA, telling that the Nano must be at least stepping 3.
Prior steppings have a buggy vt implementation...
(Damn! I have stepping 2 :-I )

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH, RFC] virtio_blk: add cache flush command

2009-05-12 Thread Christoph Hellwig
On Mon, May 11, 2009 at 07:49:37PM +0300, Avi Kivity wrote:
 Maybe we should add a fourth cache= mode then.  But 
 cache=writeback+fsync doesn't correspond to any real world drive; in the 
 real world you're limited to power failures and a few megabytes of cache 
 (typically less), cache=writeback+fsync can lose hundreds of megabytes 
 due to power loss or software failure.

cache=writeback+fsync is exactly the same model as a normal writeback
cache disk drive.  (Well, almost as we currently don't use tag ordering
but drain flushes as a Linux implementation detail, but the disks also
support TCQ-based ordering).

The cache size on disks is constantly growing, and if you lose cache
it doesn't really matter how much you lose but what you lose.

 Oh, and cache=writeback+fsync doesn't work on qcow2, unless we add fsync 
 after metadata updates.

If you care about data integrity in case of crashes qcow2 doesn't work
at all.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH, RFC] virtio_blk: add cache flush command

2009-05-12 Thread Christoph Hellwig
On Mon, May 11, 2009 at 12:47:58PM -0500, Anthony Liguori wrote:
 But how do we define the data integrity guarantees to the user of 
 cache=writeback+fsync?  It seems to require a rather detailed knowledge 
 of Linux's use of T_FLUSH operations.

It does work the same as for disks with writeback caches.  If a barrier
request completes the filesystem can assume all previous I/O on this
disk has finished before the barrier request, and the barrier request
has finished after these without any later request being finished before
it.  If a cache flush is issued the cache as of that point in time is
flushed (used for fsync).

 Right now, it's fairly easy to understand.  cache=none and 
 cache=writethrough guarantee that all write operations that the guest 
 thinks have completed are completed.  cache=writeback provides no such 
 guarantee.

As said above with barriers it indeed does.

 cache=writeback+fsync would guarantee that only operations that include 
 a T_FLUSH are present on disk which currently includes fsyncs but does 
 not include O_DIRECT writes.  I guess whether O_SYNC does a T_FLUSH also 
 has to be determined.

O_SYNC and O_DIRECT do it at least on XFS due to updating metadata after
the write.  I can't vouch for implementation details on other
filesystems.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH, RFC] virtio_blk: add cache flush command

2009-05-12 Thread Christoph Hellwig
On Mon, May 11, 2009 at 11:38:09AM -0500, Anthony Liguori wrote:
 Right now it doesn't, but it probably should.
   
 
 So then with cache=writeback, fsync behaves itself but O_DIRECT writes 
 do not.

Right now O_DIRECT does not do an explicit cache flush, but due to the
way barriers are implemented in Linux we do get the cache flush as part
of the metadata updates after I/O completion.

 fsync is pretty crappy on ext3 default configs.  I'm concerned that this 
 could be considered a DoS by a malicious guest.  If it sat in a T_FLUSH 
 loop, it would potentially bring your system to a crawl, no?

It's exactly the same effect as a regular user doing fsync in a loop,
whatever that causes on ext3.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] kvm: x86: Allow PIT emulation without speaker port

2009-05-12 Thread Jan Kiszka
The in-kernel speaker emulation is only a dummy and also unneeded from
the performance point of view. Rather, it takes user space support to
generate sound output on the host, e.g. console beeps.

To allow this, introduce KVM_CREATE_PIT2 which controls in-kernel
speaker port emulation via a flag passed along the new IOCTL. It also
leaves room for future extensions of the PIT configuration interface.

Changes in v2:
 - Use extensible KVM_CREATE_PIT2

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---

 arch/x86/kvm/i8254.c |   14 --
 arch/x86/kvm/i8254.h |2 +-
 arch/x86/kvm/x86.c   |   12 +++-
 include/linux/kvm.h  |   10 ++
 4 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4d6f0d2..584e3d3 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -560,7 +560,7 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier 
*kimn, bool mask)
}
 }
 
-struct kvm_pit *kvm_create_pit(struct kvm *kvm)
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 {
struct kvm_pit *pit;
struct kvm_kpit_state *pit_state;
@@ -586,11 +586,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
pit-dev.private = pit;
kvm_io_bus_register_dev(kvm-pio_bus, pit-dev);
 
-   pit-speaker_dev.read = speaker_ioport_read;
-   pit-speaker_dev.write = speaker_ioport_write;
-   pit-speaker_dev.in_range = speaker_in_range;
-   pit-speaker_dev.private = pit;
-   kvm_io_bus_register_dev(kvm-pio_bus, pit-speaker_dev);
+   if (flags  KVM_PIT_SPEAKER_DUMMY) {
+   pit-speaker_dev.read = speaker_ioport_read;
+   pit-speaker_dev.write = speaker_ioport_write;
+   pit-speaker_dev.in_range = speaker_in_range;
+   pit-speaker_dev.private = pit;
+   kvm_io_bus_register_dev(kvm-pio_bus, pit-speaker_dev);
+   }
 
kvm-arch.vpit = pit;
pit-kvm = kvm;
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index bbd863f..b267018 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -50,7 +50,7 @@ struct kvm_pit {
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
-struct kvm_pit *kvm_create_pit(struct kvm *kvm);
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
 void kvm_free_pit(struct kvm *kvm);
 void kvm_pit_reset(struct kvm_pit *pit);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c6d3ff3..f9adc2b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1026,6 +1026,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_ASSIGN_DEV_IRQ:
+   case KVM_CAP_PIT2:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1826,6 +1827,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
union {
struct kvm_pit_state ps;
struct kvm_memory_alias alias;
+   struct kvm_pit_config pit_config;
} u;
 
switch (ioctl) {
@@ -1886,12 +1888,20 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
break;
case KVM_CREATE_PIT:
+   u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+   goto create_pit;
+   case KVM_CREATE_PIT2:
+   r = -EFAULT;
+   if (copy_from_user(u.pit_config, argp,
+  sizeof(struct kvm_pit_config)))
+   goto out;
+   create_pit:
mutex_lock(kvm-lock);
r = -EEXIST;
if (kvm-arch.vpit)
goto create_pit_unlock;
r = -ENOMEM;
-   kvm-arch.vpit = kvm_create_pit(kvm);
+   kvm-arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
if (kvm-arch.vpit)
r = 0;
create_pit_unlock:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 3db5d8d..607d88a 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -70,6 +70,14 @@ struct kvm_irqchip {
} chip;
 };
 
+/* for KVM_CREATE_PIT2 */
+struct kvm_pit_config {
+   __u32 flags;
+   __u32 pad;
+};
+
+#define KVM_PIT_SPEAKER_DUMMY 1
+
 #define KVM_EXIT_UNKNOWN  0
 #define KVM_EXIT_EXCEPTION1
 #define KVM_EXIT_IO   2
@@ -415,6 +423,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_ASSIGN_DEV_IRQ 29
 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
+#define KVM_CAP_PIT2 31
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -498,6 +507,7 @@ struct kvm_irq_routing {
 #define KVM_ASSIGN_SET_MSIX_ENTRY \
_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
 #define KVM_DEASSIGN_DEV_IRQ   _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
+#define KVM_CREATE_PIT2   _IOW(KVMIO, 0x76, 

[PATCH v2] qemu-kvm: Make PC speaker emulation aware of in-kernel PIT

2009-05-12 Thread Jan Kiszka
When using the in-kernel PIT the speaker emulation has to synchronize
the PIT state with KVM. Enhance the existing speaker sound device and
allow it to take over port 0x61 by using KVM_CREATE_PIT2 where
available. This unbreaks -soundhw pcspk in KVM mode.

Changes in v2:
 - rebased over qemu-kvm and KVM_CREATE_PIT2
 - refactored hooks in pcspk

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
---

 hw/pcspk.c |   41 
 kvm/kernel/include/linux/kvm.h |   10 ++
 kvm/libkvm/libkvm-x86.c|   26 ++---
 3 files changed, 69 insertions(+), 8 deletions(-)

diff --git a/hw/pcspk.c b/hw/pcspk.c
index ec1d0c6..14dd540 100644
--- a/hw/pcspk.c
+++ b/hw/pcspk.c
@@ -27,6 +27,8 @@
 #include isa.h
 #include audio/audio.h
 #include qemu-timer.h
+#include i8254.h
+#include qemu-kvm.h
 
 #define PCSPK_BUF_LEN 1792
 #define PCSPK_SAMPLE_RATE 32000
@@ -48,6 +50,37 @@ typedef struct {
 static const char *s_spk = pcspk;
 static PCSpkState pcspk_state;
 
+#ifdef USE_KVM_PIT
+static void kvm_get_pit_ch2(PITState *pit)
+{
+struct kvm_pit_state pit_state;
+
+if (qemu_kvm_pit_in_kernel()) {
+kvm_get_pit(kvm_context, pit_state);
+pit-channels[2].mode = pit_state.channels[2].mode;
+pit-channels[2].count = pit_state.channels[2].count;
+pit-channels[2].count_load_time = 
pit_state.channels[2].count_load_time;
+pit-channels[2].gate = pit_state.channels[2].gate;
+}
+}
+
+static void kvm_set_pit_ch2(PITState *pit)
+{
+struct kvm_pit_state pit_state;
+
+if (qemu_kvm_pit_in_kernel()) {
+pit_state.channels[2].mode = pit-channels[2].mode;
+pit_state.channels[2].count = pit-channels[2].count;
+pit_state.channels[2].count_load_time = 
pit-channels[2].count_load_time;
+pit_state.channels[2].gate = pit-channels[2].gate;
+kvm_set_pit(kvm_context, pit_state);
+}
+}
+#else
+static inline void kvm_get_pit_ch2(PITState *pit) { }
+static inline void kvm_set_pit_ch2(PITState *pit) { }
+#endif
+
 static inline void generate_samples(PCSpkState *s)
 {
 unsigned int i;
@@ -72,6 +105,8 @@ static void pcspk_callback(void *opaque, int free)
 PCSpkState *s = opaque;
 unsigned int n;
 
+kvm_get_pit_ch2(s-pit);
+
 if (pit_get_mode(s-pit, 2) != 3)
 return;
 
@@ -121,6 +156,8 @@ static uint32_t pcspk_ioport_read(void *opaque, uint32_t 
addr)
 PCSpkState *s = opaque;
 int out;
 
+kvm_get_pit_ch2(s-pit);
+
 s-dummy_refresh_clock ^= (1  4);
 out = pit_get_out(s-pit, 2, qemu_get_clock(vm_clock))  5;
 
@@ -132,6 +169,8 @@ static void pcspk_ioport_write(void *opaque, uint32_t addr, 
uint32_t val)
 PCSpkState *s = opaque;
 const int gate = val  1;
 
+kvm_get_pit_ch2(s-pit);
+
 s-data_on = (val  1)  1;
 pit_set_gate(s-pit, 2, gate);
 if (s-voice) {
@@ -139,6 +178,8 @@ static void pcspk_ioport_write(void *opaque, uint32_t addr, 
uint32_t val)
 s-play_pos = 0;
 AUD_set_active_out(s-voice, gate  s-data_on);
 }
+
+kvm_set_pit_ch2(s-pit);
 }
 
 void pcspk_init(PITState *pit)
diff --git a/kvm/kernel/include/linux/kvm.h b/kvm/kernel/include/linux/kvm.h
index f5e9d66..9ed4892 100644
--- a/kvm/kernel/include/linux/kvm.h
+++ b/kvm/kernel/include/linux/kvm.h
@@ -110,6 +110,14 @@ struct kvm_irqchip {
} chip;
 };
 
+/* for KVM_CREATE_PIT2 */
+struct kvm_pit_config {
+   __u32 flags;
+   __u32 pad;
+};
+
+#define KVM_PIT_SPEAKER_DUMMY 1
+
 #define KVM_EXIT_UNKNOWN  0
 #define KVM_EXIT_EXCEPTION1
 #define KVM_EXIT_IO   2
@@ -455,6 +463,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_ASSIGN_DEV_IRQ 29
 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
+#define KVM_CAP_PIT2 31
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -538,6 +547,7 @@ struct kvm_irq_routing {
 #define KVM_ASSIGN_SET_MSIX_ENTRY \
_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
 #define KVM_DEASSIGN_DEV_IRQ   _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
+#define KVM_CREATE_PIT2   _IOW(KVMIO, 0x76, struct 
kvm_pit_config)
 
 /*
  * ioctls for vcpu fds
diff --git a/kvm/libkvm/libkvm-x86.c b/kvm/libkvm/libkvm-x86.c
index a2f6320..9c0d967 100644
--- a/kvm/libkvm/libkvm-x86.c
+++ b/kvm/libkvm/libkvm-x86.c
@@ -59,16 +59,26 @@ static int kvm_create_pit(kvm_context_t kvm)
 
kvm-pit_in_kernel = 0;
if (!kvm-no_pit_creation) {
-   r = ioctl(kvm-fd, KVM_CHECK_EXTENSION, KVM_CAP_PIT);
-   if (r  0) {
+#ifdef KVM_CAP_PIT2
+   struct kvm_pit_config config = { .flags = 0 };
+
+   r = ioctl(kvm-fd, KVM_CHECK_EXTENSION, KVM_CAP_PIT2);
+   if (r  0)
+   r = ioctl(kvm-vm_fd, KVM_CREATE_PIT2, config);
+   else
+#endif
+   {
+   r = ioctl(kvm-fd, KVM_CHECK_EXTENSION, KVM_CAP_PIT);
+

Re: [Qemu-devel] Re: Question about KVM and PC speaker

2009-05-12 Thread Jan Kiszka
Sebastian Herbszt wrote:
 Jan Kiszka wrote:
 Sebastian Herbszt wrote:
 Anyway, using timer ticks since midnight should be possible (INT 08h
 handler is set up
 before vga bios is called).
 
 I have implemented the beep for vgabios-6b and it works on bochs but
 fails on qemu.
 With -soundhw pcspk i should hear it thru the hosts pc speaker, right?

Yep, that's how it should work. Mind to share your code and tell us
which test case you used?

Jan



signature.asc
Description: OpenPGP digital signature


event injection MACROs

2009-05-12 Thread Dong, Eddie
I noticed the MACRO for SVM vmcb-control.event_inj and VMX VM_EXIT_INTR_INFO 
are almost same, I have a need to query the event injection situation in common 
code so plan to expose this register read/write to x86.c.  Should we define a 
new format for evtinj/VM_EXIT_INTR_INFO as common KVM format, or just move 
those original MACRO to kvm_host.h?

Thx, eddie--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -next] kvm: fix build error: add missing semi-colon

2009-05-12 Thread Avi Kivity

Randy Dunlap wrote:

From: Randy Dunlap randy.dun...@oracle.com

Add missing ; to fix build error:

arch/x86/kvm/x86.c:1259: error: expected',' or ';' before 'const'

  


Applied, thanks.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH, RFC] virtio_blk: add cache flush command

2009-05-12 Thread Avi Kivity

Christoph Hellwig wrote:

On Mon, May 11, 2009 at 07:49:37PM +0300, Avi Kivity wrote:
  
Maybe we should add a fourth cache= mode then.  But 
cache=writeback+fsync doesn't correspond to any real world drive; in the 
real world you're limited to power failures and a few megabytes of cache 
(typically less), cache=writeback+fsync can lose hundreds of megabytes 
due to power loss or software failure.



cache=writeback+fsync is exactly the same model as a normal writeback
cache disk drive.  (Well, almost as we currently don't use tag ordering
but drain flushes as a Linux implementation detail, but the disks also
support TCQ-based ordering).

The cache size on disks is constantly growing, and if you lose cache
it doesn't really matter how much you lose but what you lose.
  


Software errors won't cause data loss on a real disk (firmware bugs 
will, but the firmware is less likely to crash than the host OS).


Oh, and cache=writeback+fsync doesn't work on qcow2, unless we add fsync 
after metadata updates.



If you care about data integrity in case of crashes qcow2 doesn't work
at all.
  


Do you known of any known corruptors in qcow2 with cache=writethrough?

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: event injection MACROs

2009-05-12 Thread Gleb Natapov
On Tue, May 12, 2009 at 03:38:59PM +0800, Dong, Eddie wrote:
 I noticed the MACRO for SVM vmcb-control.event_inj and VMX VM_EXIT_INTR_INFO 
 are almost same, I have a need to query the event injection situation in 
 common code so plan to expose this register read/write to x86.c.  Should we 
 define a new format for evtinj/VM_EXIT_INTR_INFO as common KVM format, or 
 just move those original MACRO to kvm_host.h?
 
I haven't seen your code, so I don't know what you are trying to do, but why
querying interrupt/nmi/exception injection queues is not enough? What
info is missing there?

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] virtio: find_vqs/del_vqs virtio operations

2009-05-12 Thread Christian Borntraeger
Am Tuesday 12 May 2009 00:19:32 schrieb Michael S. Tsirkin:
 This replaces find_vq/del_vq with find_vqs/del_vqs virtio operations,
 and updates all drivers. This is needed for MSI support, because MSI
 needs to know the total number of vectors upfront.
[...]
 diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c

This file contains several copy/paste breakages and needs at least the
patch below to compile.

Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com

---
 drivers/s390/kvm/kvm_virtio.c |   28 +++-
 1 file changed, 7 insertions(+), 21 deletions(-)

Index: kvm/drivers/s390/kvm/kvm_virtio.c
===
--- kvm.orig/drivers/s390/kvm/kvm_virtio.c
+++ kvm/drivers/s390/kvm/kvm_virtio.c
@@ -237,23 +237,23 @@ static void kvm_del_vqs(struct virtio_de
int i;
if (!kdev-vqs)
return;
-   for (i = 0; i  ldev-nvqs; ++i)
+   for (i = 0; i  kdev-nvqs; ++i)
kvm_del_vq(kdev-vqs[i]);
-   kfree(ldev-vqs);
-   ldev-vqs = NULL;
-   ldev-nvqs = 0;
+   kfree(kdev-vqs);
+   kdev-vqs = NULL;
+   kdev-nvqs = 0;
 }
 
 static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
-   struct virtqueue *vqs[]
-   void (*callbacks)[](struct virtqueue *))
+   struct virtqueue *vqs[],
+   virtqueue_callback *callbacks[])
 {
struct kvm_device *kdev = to_kvmdev(vdev);
int i;
 
/* We must have this many virtqueues. */
if (nvqs  kdev-desc-num_vq)
-   return ERR_PTR(-ENOENT);
+   return -ENOENT;
 
kdev-vqs = kmalloc(GFP_KERNEL, nvqs * sizeof *kdev-vqs);
if (!kdev-vqs)
@@ -272,20 +272,6 @@ error:
return PTR_ERR(vqs[i]);
 }
 
-static void kvm_del_vqs(struct virtio_device *vdev)
-{
-   struct lguest_device *ldev = to_lgdev(vdev);
-   int i;
-
-   if (!ldev-vqs)
-   return;
-   for (i = 0; i  ldev-nvqs; ++i)
-   lg_del_vq(ldev-vqs[i]);
-   kfree(ldev-vqs);
-   ldev-vqs = NULL;
-   ldev-nvqs = 0;
-}
-
 /*
  * The config ops structure as defined by virtio config
  */
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] Userspace changes for KVM HPET (v3)

2009-05-12 Thread Avi Kivity

Beth Kon wrote:

Signed-off-by: Beth Kon e...@us.ibm.com


diff --git a/hw/hpet.c b/hw/hpet.c
index c7945ec..100abf5 100644
--- a/hw/hpet.c
+++ b/hw/hpet.c
@@ -30,6 +30,7 @@
 #include console.h
 #include qemu-timer.h
 #include hpet_emul.h
+#include qemu-kvm.h
 
 //#define HPET_DEBUG

 #ifdef HPET_DEBUG
@@ -48,6 +49,28 @@ uint32_t hpet_in_legacy_mode(void)
 return 0;
 }
 
+static void hpet_legacy_enable(void)

+{
+if (qemu_kvm_pit_in_kernel()) {
+kvm_kpit_disable();
+dprintf(qemu: hpet disabled kernel pit\n);
+} else {
+hpet_pit_disable();
+dprintf(qemu: hpet disabled userspace pit\n);
+}
+}
+
+static void hpet_legacy_disable(void)
+{
+if (qemu_kvm_pit_in_kernel()) {
+kvm_kpit_enable();
+dprintf(qemu: hpet enabled kernel pit\n);
+} else {
+hpet_pit_enable();
+dprintf(qemu: hpet enabled userspace pit\n);
+}
+}
  
I think it's better to move these into hpet_pit_enable() and 
hpet_pit_enable().  This avoids changing the calls below, and puts pit 
stuff in i8254.c instead of hpet.c.


Might also need to be called from hpet_load(); probably a problem in 
upstream as well.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: device-assignment deadlock

2009-05-12 Thread Yang, Sheng
[I was kicked off from the mailing list by periodic unknown reason last 
Friday... Sorry]

 Hi Sheng,

 I think I'm running into the following deadlock in the kvm kernel module
 when trying to use device assignment:

 CPU A   CPU B
 kvm_vm_ioctl_deassign_dev_irq()
  mutex_lock(kvm-lock);   worker_thread()
  - kvm_deassign_irq()   - 
kvm_assigned_dev_interrupt_work_handler()
- deassign_host_irq()  mutex_lock(kvm-lock);
  - cancel_work_sync() [blocked]

 I wonder if we need finer granularity locking to avoid this.
 Suggestions?  Thanks,

This part again...

I think simply move kvm_deassign_irq() out of critical region is OK, and I 
also add the lock which seems missing in deassign_guest_irq(). Would post a 
patch soon.

-- 
regards
Yang, Sheng

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/6] kvm-s390: Fix memory slot versus run

2009-05-12 Thread Christian Ehrhardt

Avi Kivity wrote:

Christian Ehrhardt wrote:


The bad thing on vcpu-request in that case is that I don't want the 
async behaviour of vcpu-requests in that case, I want the memory 
slot updated in all vcpu's when the ioctl is returning.


You mean, the hardware can access the vcpu control block even when the 
vcpu is not running? 
No, hardware only uses it with a running vcpu, but I realised my own 
fault while changing the code to vcpu-request style.
For s390 I need to update the KVM-arch and *all* 
vcpu-arch-sie_block... data synchronously.

That makes the per vcpu resync on next entry approach not feasible.

On the other hand I realized at the same moment that the livelock should 
be no issue for us, because as I mentioned:

a) only one memslot
b) a vcpu can't run without memslot
So I don't even need to kick out vcpu's, they just should not be running.
Until we ever support multiple slots, or updates of the existing single 
slot this should be ok, so is the bugfix patch this should be.
To avoid a theoretical deadlock in case other code is changing (badly) 
it should be fair to aquire the lock with mutex_trylock and return 
-EINVAL if we did not get all locks.


[...]

If I can change it that way it will definitely require some testing.
... to be continued :-)


I definitely recommend it -- would bring s390 more in line with the 
other ports (I know it's a backward step for you :)


Note our plan is to change slots_lock to RCU, so it's even better if 
you use memslots.
As long as we have the special conditions mentioned above I think its ok 
to implement it the way I do it now.
I agree that if we ever support multiple memslots we should strive for a 
common solution.


p.s. the second patch in the series ensures that a vcpu really never 
runs without a memslot being set as this was another bug we had.


--

Grüsse / regards, 
Christian Ehrhardt

IBM Linux Technology Center, Open Virtualization

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/4] Userspace changes for configuring irq0-inti2 override (v3)

2009-05-12 Thread Gleb Natapov
On Mon, May 11, 2009 at 01:29:44PM -0400, Beth Kon wrote:
 Signed-off-by: Beth Kon e...@us.ibm.com
 
 diff --git a/hw/fw_cfg.c b/hw/fw_cfg.c
 index e1b19d7..bb74f38 100644
 --- a/hw/fw_cfg.c
 +++ b/hw/fw_cfg.c
 @@ -279,6 +279,7 @@ void *fw_cfg_init(uint32_t ctl_port, uint32_t data_port,
  fw_cfg_add_bytes(s, FW_CFG_UUID, qemu_uuid, 16);
  fw_cfg_add_i16(s, FW_CFG_NOGRAPHIC, (uint16_t)nographic);
  fw_cfg_add_i16(s, FW_CFG_NB_CPUS, (uint16_t)smp_cpus);
 +fw_cfg_add_i16(s, FW_CFG_IRQ0_OVERRIDE, (uint16_t)irq0override);
  
It is read as 1 byte by the BIOS, but it is 2 bytes here. And arch
specific config should be registered in arch specific place (hw/pc.c)

  register_savevm(fw_cfg, -1, 1, fw_cfg_save, fw_cfg_load, s);
  qemu_register_reset(fw_cfg_reset, s);
 diff --git a/hw/fw_cfg.h b/hw/fw_cfg.h
 index f616ed2..1de7360 100644
 --- a/hw/fw_cfg.h
 +++ b/hw/fw_cfg.h
 @@ -19,6 +19,7 @@
  
  #define FW_CFG_WRITE_CHANNEL0x4000
  #define FW_CFG_ARCH_LOCAL   0x8000
 +#define FW_CFG_IRQ0_OVERRIDE(FW_CFG_ARCH_LOCAL + 2)
This should go to hw/pc.c
 
  #define FW_CFG_ENTRY_MASK   ~(FW_CFG_WRITE_CHANNEL | FW_CFG_ARCH_LOCAL)
  
  #define FW_CFG_INVALID  0x
 diff --git a/hw/ioapic.c b/hw/ioapic.c
 index 0b70cf6..2d77a2c 100644
 --- a/hw/ioapic.c
 +++ b/hw/ioapic.c
 @@ -23,6 +23,7 @@
  
  #include hw.h
  #include pc.h
 +#include sysemu.h
  #include qemu-timer.h
  #include host-utils.h
  
 @@ -95,14 +96,13 @@ void ioapic_set_irq(void *opaque, int vector, int level)
  {
  IOAPICState *s = opaque;
  
 -#if 0
  /* ISA IRQs map to GSI 1-1 except for IRQ0 which maps
   * to GSI 2.  GSI maps to ioapic 1-1.  This is not
   * the cleanest way of doing it but it should work. */
  
 -if (vector == 0)
 +if (vector == 0  irq0override) {
  vector = 2;
 -#endif
 +}
  
  if (vector = 0  vector  IOAPIC_NUM_PINS) {
  uint32_t mask = 1  vector;
 diff --git a/qemu-kvm-x86.c b/qemu-kvm-x86.c
 index 8cb6faa..2e52c87 100644
 --- a/qemu-kvm-x86.c
 +++ b/qemu-kvm-x86.c
 @@ -879,7 +879,11 @@ int kvm_arch_init_irq_routing(void)
  return r;
  }
  for (i = 0; i  24; ++i) {
 -r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, i);
 +if (i == 0) {
 +r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, 2);
 +} else if (i != 2) {
 +r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, i);
 +}
There is no entry for IRQ2, is this OK? What happens if IRQ2 triggers?

  if (r  0)
  return r;
  }
 diff --git a/qemu-kvm.h b/qemu-kvm.h
 index dd045dd..6a1968a 100644
 --- a/qemu-kvm.h
 +++ b/qemu-kvm.h
 @@ -165,6 +165,7 @@ void qemu_kvm_cpu_stop(CPUState *env);
  #define kvm_enabled() (kvm_allowed)
  #define qemu_kvm_irqchip_in_kernel() kvm_irqchip_in_kernel(kvm_context)
  #define qemu_kvm_pit_in_kernel() kvm_pit_in_kernel(kvm_context)
 +#define qemu_kvm_has_gsi_routing() kvm_has_gsi_routing(kvm_context)
  #define kvm_has_sync_mmu() qemu_kvm_has_sync_mmu()
  void kvm_init_vcpu(CPUState *env);
  void kvm_load_tsc(CPUState *env);
 @@ -173,6 +174,7 @@ void kvm_load_tsc(CPUState *env);
  #define kvm_nested 0
  #define qemu_kvm_irqchip_in_kernel() (0)
  #define qemu_kvm_pit_in_kernel() (0)
 +#define qemu_kvm_has_gsi_routing() (0)
  #define kvm_has_sync_mmu() (0)
  #define kvm_load_registers(env) do {} while(0)
  #define kvm_save_registers(env) do {} while(0)
 diff --git a/sysemu.h b/sysemu.h
 index 1f45fd6..292bbc3 100644
 --- a/sysemu.h
 +++ b/sysemu.h
 @@ -93,6 +93,7 @@ extern int graphic_width;
  extern int graphic_height;
  extern int graphic_depth;
  extern int nographic;
 +extern int irq0override;
  extern const char *keyboard_layout;
  extern int win2k_install_hack;
  extern int rtc_td_hack;
 diff --git a/vl.c b/vl.c
 index d9f0607..0bffc82 100644
 --- a/vl.c
 +++ b/vl.c
 @@ -207,6 +207,7 @@ static int vga_ram_size;
  enum vga_retrace_method vga_retrace_method = VGA_RETRACE_DUMB;
  static DisplayState *display_state;
  int nographic;
 +int irq0override;
  static int curses;
  static int sdl;
  const char* keyboard_layout = NULL;
 @@ -5035,6 +5036,7 @@ int main(int argc, char **argv, char **envp)
  vga_ram_size = VGA_RAM_SIZE;
  snapshot = 0;
  nographic = 0;
 +irq0override = 1;
Why not do that when defining the variable? Yeah I realize this is how
it is done for other variables too, but why?

  curses = 0;
  kernel_filename = NULL;
  kernel_cmdline = ;
 @@ -6129,8 +6131,14 @@ int main(int argc, char **argv, char **envp)
  }
  }
  
 -if (kvm_enabled())
 - kvm_init_ap();
 +if (kvm_enabled()) {
 +   kvm_init_ap();
 +#ifdef USE_KVM
 +if (kvm_irqchip  !qemu_kvm_has_gsi_routing()) {
 +irq0override = 0;
 +}
 +#endif
 +}
  
  machine-init(ram_size, vga_ram_size, boot_devices,
kernel_filename, 

Re: [PATCH 1/4] BIOS changes for configuring irq0-inti2 override (v3)

2009-05-12 Thread Gleb Natapov
On Mon, May 11, 2009 at 01:29:43PM -0400, Beth Kon wrote:
 Signed-off-by: Beth Kon e...@us.ibm.com
 
 diff --git a/kvm/bios/rombios32.c b/kvm/bios/rombios32.c
 index cbd5f15..53359b8 100755
 --- a/kvm/bios/rombios32.c
 +++ b/kvm/bios/rombios32.c
 @@ -444,6 +444,9 @@ uint32_t cpuid_features;
  uint32_t cpuid_ext_features;
  unsigned long ram_size;
  uint64_t ram_end;
 +#ifdef BX_QEMU
 +uint8_t irq0_override;
 +#endif
  #ifdef BX_USE_EBDA_TABLES
  unsigned long ebda_cur_addr;
  #endif
 @@ -485,6 +488,7 @@ void wrmsr_smp(uint32_t index, uint64_t val)
  #define QEMU_CFG_ARCH_LOCAL 0x8000
  #define QEMU_CFG_ACPI_TABLES  (QEMU_CFG_ARCH_LOCAL + 0)
  #define QEMU_CFG_SMBIOS_ENTRIES  (QEMU_CFG_ARCH_LOCAL + 1)
 +#define QEMU_CFG_IRQ0_OVERRIDE   (QEMU_CFG_ARCH_LOCAL + 2)
  
  int qemu_cfg_port;
  
 @@ -553,6 +557,18 @@ uint64_t qemu_cfg_get64 (void)
  }
  #endif
  
 +#ifdef BX_QEMU
 +void irq0_override_probe(void)
 +{
 +if(qemu_cfg_port) {
 +qemu_cfg_select(QEMU_CFG_IRQ0_OVERRIDE);
 +qemu_cfg_read(irq0_override, 1);
 +return;
 +}
 +memset(irq0_override, 0, 1);
 +}
Why memset and not irq0_override = 0, actually it should zero already.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: boot=on option fails on win 2k/xp double boot

2009-05-12 Thread Federico Fissore

Gleb Natapov, il 11/05/2009 11:26, ha scritto:

May be you can test it not in production? On another machine. Or
alternatively you can compile only userspace of kvm-84 and run it
with kernel kvm modules available in lenny. Another option would be 
to run kvm-84 with -no-kvm flag.




i've tried on my gentoo but I had some problems. I'll try asap with a fedora

btw the pure kvm support on debian lenny seems good, while libvirt  C. 
suck: I had a Centos 5.3 guest (asterisknow) and network performance 
degraded with time: after 12 hours the guest was unable to connect to 
VoIP phones. Running KVM directly (with plain old rc.local) solved the 
problem.


As said, the whole kvm stack on lenny is old and that maybe the problem.

Thanks for your help and time

Federico


smime.p7s
Description: S/MIME Cryptographic Signature


Re: [PATCH 2/4] Userspace changes for configuring irq0-inti2 override (v3)

2009-05-12 Thread Avi Kivity

Gleb Natapov wrote:

 for (i = 0; i  24; ++i) {
-r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, i);
+if (i == 0) {
+r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, 2);
+} else if (i != 2) {
+r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, i);
+}


There is no entry for IRQ2, is this OK? What happens if IRQ2 triggers?
  


irq 2 is the PIC cascade interrupt.  If it is somehow triggered, the 
kernel will ignore it.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/4] Userspace changes for configuring irq0-inti2 override (v3)

2009-05-12 Thread Gleb Natapov
On Tue, May 12, 2009 at 01:22:06PM +0300, Avi Kivity wrote:
 Gleb Natapov wrote:
  for (i = 0; i  24; ++i) {
 -r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, i);
 +if (i == 0) {
 +r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, 
 2);
 +} else if (i != 2) {
 +r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, 
 i);
 +}
 
 There is no entry for IRQ2, is this OK? What happens if IRQ2 triggers?
   

 irq 2 is the PIC cascade interrupt.  If it is somehow triggered, the  
 kernel will ignore it.

But here we configure IOAPIC routing. What if IOAPIC is used for
interrupt delivery and something triggers irq2. There is no entry
describing it in IOAPIC routing table, so what gsi it will be mapped
to?

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/6] kvm-s390: Fix memory slot versus run

2009-05-12 Thread Avi Kivity

Christian Ehrhardt wrote:

Avi Kivity wrote:

Christian Ehrhardt wrote:


The bad thing on vcpu-request in that case is that I don't want the 
async behaviour of vcpu-requests in that case, I want the memory 
slot updated in all vcpu's when the ioctl is returning.


You mean, the hardware can access the vcpu control block even when 
the vcpu is not running? 
No, hardware only uses it with a running vcpu, but I realised my own 
fault while changing the code to vcpu-request style.
For s390 I need to update the KVM-arch and *all* 
vcpu-arch-sie_block... data synchronously.


Out of interest, can you explain why?


That makes the per vcpu resync on next entry approach not feasible.

On the other hand I realized at the same moment that the livelock 
should be no issue for us, because as I mentioned:

a) only one memslot
b) a vcpu can't run without memslot
So I don't even need to kick out vcpu's, they just should not be running.
Until we ever support multiple slots, or updates of the existing 
single slot this should be ok, so is the bugfix patch this should be.
To avoid a theoretical deadlock in case other code is changing (badly) 
it should be fair to aquire the lock with mutex_trylock and return 
-EINVAL if we did not get all locks.


OK.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Work around supported cpuid ioctl() brokenness

2009-05-12 Thread Mark McLoughlin
KVM_GET_SUPPORTED_CPUID has been known to fail to return -E2BIG
when it runs out of entries. Detect this by always trying again
with a bigger table if the ioctl() fills the table.

Signed-off-by: Mark McLoughlin mar...@redhat.com
---
 kvm/libkvm/libkvm-x86.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/kvm/libkvm/libkvm-x86.c b/kvm/libkvm/libkvm-x86.c
index a2f6320..4f9539a 100644
--- a/kvm/libkvm/libkvm-x86.c
+++ b/kvm/libkvm/libkvm-x86.c
@@ -575,6 +575,8 @@ static struct kvm_cpuid2 *try_get_cpuid(kvm_context_t kvm, 
int max)
r = ioctl(kvm-fd, KVM_GET_SUPPORTED_CPUID, cpuid);
if (r == -1)
r = -errno;
+   else if (r == 0  cpuid-nent = max)
+   r = -E2BIG;
if (r  0) {
if (r == -E2BIG) {
free(cpuid);
-- 
1.6.0.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 4/4] kvm: Trim cpu features not supported by kvm

2009-05-12 Thread Mark McLoughlin
On Sun, 2009-05-03 at 17:04 +0300, Avi Kivity wrote:
 Remove cpu features that are not supported by kvm from the cpuid features
 reported to the guest.
 
 Signed-off-by: Avi Kivity a...@redhat.com

 @@ -1699,5 +1714,20 @@ CPUX86State *cpu_x86_init(const char *cpu_model)
  
  qemu_init_vcpu(env);
  
 +if (kvm_enabled()) {
 +kvm_trim_features(env-cpuid_features,
 +  kvm_arch_get_supported_cpuid(env, 1, R_EDX),
 +  feature_name);

This isn't work in qemu.git because the features are only queried from
qemu_init_vcpu() (see kvm_arch_init_vcpu())

The obvious fix is to move qemu_init_vcpu() after the feature trimming,
but that requires us to split env-kvm_state initialization out of
kvm_init_vcpu()

Also, it works in qemu-kvm.git, but only because actually call
kvm_qemu_init_env() twice - once before feature trimming and once after.

Cheers,
Mark.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/1] KVM: Fix potentially recursively get kvm lock

2009-05-12 Thread Marcelo Tosatti
On Tue, May 12, 2009 at 05:32:09PM +0800, Sheng Yang wrote:
 kvm_vm_ioctl_deassign_dev_irq() would potentially recursively get kvm-lock,
 because it called kvm_deassigned_irq() which implicit hold kvm-lock by 
 calling
 deassign_host_irq().
 
 Fix it by move kvm_deassign_irq() out of critial region. And add the missing
 lock for deassign_guest_irq().
 
 Reported-by: Alex Williamson alex.william...@hp.com
 Signed-off-by: Sheng Yang sh...@linux.intel.com
 ---
  virt/kvm/kvm_main.c |   14 +++---
  1 files changed, 7 insertions(+), 7 deletions(-)
 
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index 4d00942..3c69655 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -215,6 +215,8 @@ static void kvm_assigned_dev_ack_irq(struct 
 kvm_irq_ack_notifier *kian)
  static void deassign_guest_irq(struct kvm *kvm,
  struct kvm_assigned_dev_kernel *assigned_dev)
  {
 + mutex_lock(kvm-lock);
 +
   kvm_unregister_irq_ack_notifier(assigned_dev-ack_notifier);
   assigned_dev-ack_notifier.gsi = -1;
  
 @@ -222,6 +224,8 @@ static void deassign_guest_irq(struct kvm *kvm,
   kvm_free_irq_source_id(kvm, assigned_dev-irq_source_id);
   assigned_dev-irq_source_id = -1;
   assigned_dev-irq_requested_type = ~(KVM_DEV_IRQ_GUEST_MASK);
 +
 + mutex_unlock(kvm-lock);
  }
  
  /* The function implicit hold kvm-lock mutex due to cancel_work_sync() */
 @@ -558,20 +562,16 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm 
 *kvm,
struct kvm_assigned_irq
*assigned_irq)
  {
 - int r = -ENODEV;
   struct kvm_assigned_dev_kernel *match;
  
   mutex_lock(kvm-lock);
 -
   match = kvm_find_assigned_dev(kvm-arch.assigned_dev_head,
 assigned_irq-assigned_dev_id);
 + mutex_unlock(kvm-lock);

assigned_dev list is protected by kvm-lock. So you could have another
ioctl adding to it at the same time you're searching.

Could either have a separate kvm-assigned_devs_lock, to protect
kvm-arch.assigned_dev_head (users are ioctls that manipulate it), or
change the IRQ injection to use a separate spinlock, kill the workqueue
and call kvm_set_irq from the assigned device interrupt handler.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] virtio: find_vqs/del_vqs virtio operations

2009-05-12 Thread Rusty Russell
On Sun, 10 May 2009 04:55:38 pm Michael S. Tsirkin wrote:
 On Sun, May 10, 2009 at 01:37:06PM +0930, Rusty Russell wrote:
  Yes, and in fact a rough look at your patch reveals that we don't
  actually need del_vq: now we track them, we can just do that as part of
  vdev destruction, right?

 Let's assume that a driver encounters an error in probe
 after it calls find_vq. It would need a way to revert
 find_vq, won't it?

 It seems to me that bus-remove does not get called
 on probe failure. Isn't that right?

Yep, I looked too fast.

Thanks,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/4] Userspace changes for configuring irq0-inti2override (v3)

2009-05-12 Thread Beth Kon

Gleb Natapov wrote:

On Tue, May 12, 2009 at 01:22:06PM +0300, Avi Kivity wrote:
  

Gleb Natapov wrote:


 for (i = 0; i  24; ++i) {
-r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, i);
+if (i == 0) {
+r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, 2);
+} else if (i != 2) {
+r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, i);
+}



There is no entry for IRQ2, is this OK? What happens if IRQ2 triggers?
  
  
irq 2 is the PIC cascade interrupt.  If it is somehow triggered, the  
kernel will ignore it.




But here we configure IOAPIC routing. What if IOAPIC is used for
interrupt delivery and something triggers irq2. There is no entry
describing it in IOAPIC routing table, so what gsi it will be mapped to?

--
  
The ACPI spec states that systems that support both APIC and dual-8259 
interrupt models must map system interrupt vectors 0-15 to 8259 IRQs 
0-15, except where interrupt source overrides are provided. We provide 
an irq0-inti2 override, and no irq2 override, so irq2 must be unused.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/4] Userspace changes for configuring irq0-inti2override (v3)

2009-05-12 Thread Beth Kon

Gleb Natapov wrote:

On Mon, May 11, 2009 at 01:29:44PM -0400, Beth Kon wrote:
  

Signed-off-by: Beth Kon e...@us.ibm.com

diff --git a/hw/fw_cfg.c b/hw/fw_cfg.c
index e1b19d7..bb74f38 100644
--- a/hw/fw_cfg.c
+++ b/hw/fw_cfg.c
@@ -279,6 +279,7 @@ void *fw_cfg_init(uint32_t ctl_port, uint32_t data_port,
 fw_cfg_add_bytes(s, FW_CFG_UUID, qemu_uuid, 16);
 fw_cfg_add_i16(s, FW_CFG_NOGRAPHIC, (uint16_t)nographic);
 fw_cfg_add_i16(s, FW_CFG_NB_CPUS, (uint16_t)smp_cpus);
+fw_cfg_add_i16(s, FW_CFG_IRQ0_OVERRIDE, (uint16_t)irq0override);
 


It is read as 1 byte by the BIOS, but it is 2 bytes here. And arch
specific config should be registered in arch specific place (hw/pc.c)
  

ok.
  

 register_savevm(fw_cfg, -1, 1, fw_cfg_save, fw_cfg_load, s);
 qemu_register_reset(fw_cfg_reset, s);
diff --git a/hw/fw_cfg.h b/hw/fw_cfg.h
index f616ed2..1de7360 100644
--- a/hw/fw_cfg.h
+++ b/hw/fw_cfg.h
@@ -19,6 +19,7 @@
 
 #define FW_CFG_WRITE_CHANNEL0x4000

 #define FW_CFG_ARCH_LOCAL   0x8000
+#define FW_CFG_IRQ0_OVERRIDE(FW_CFG_ARCH_LOCAL + 2)


This should go to hw/pc.c
  

ok.
  

 #define FW_CFG_ENTRY_MASK   ~(FW_CFG_WRITE_CHANNEL | FW_CFG_ARCH_LOCAL)
 
 #define FW_CFG_INVALID  0x

diff --git a/hw/ioapic.c b/hw/ioapic.c
index 0b70cf6..2d77a2c 100644
--- a/hw/ioapic.c
+++ b/hw/ioapic.c
@@ -23,6 +23,7 @@
 
 #include hw.h

 #include pc.h
+#include sysemu.h
 #include qemu-timer.h
 #include host-utils.h
 
@@ -95,14 +96,13 @@ void ioapic_set_irq(void *opaque, int vector, int level)

 {
 IOAPICState *s = opaque;
 
-#if 0

 /* ISA IRQs map to GSI 1-1 except for IRQ0 which maps
  * to GSI 2.  GSI maps to ioapic 1-1.  This is not
  * the cleanest way of doing it but it should work. */
 
-if (vector == 0)

+if (vector == 0  irq0override) {
 vector = 2;
-#endif
+}
 
 if (vector = 0  vector  IOAPIC_NUM_PINS) {

 uint32_t mask = 1  vector;
diff --git a/qemu-kvm-x86.c b/qemu-kvm-x86.c
index 8cb6faa..2e52c87 100644
--- a/qemu-kvm-x86.c
+++ b/qemu-kvm-x86.c
@@ -879,7 +879,11 @@ int kvm_arch_init_irq_routing(void)
 return r;
 }
 for (i = 0; i  24; ++i) {
-r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, i);
+if (i == 0) {
+r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, 2);
+} else if (i != 2) {
+r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, i);
+}


There is no entry for IRQ2, is this OK? What happens if IRQ2 triggers?

  

Answered in separate email.

 if (r  0)
 return r;
 }
diff --git a/qemu-kvm.h b/qemu-kvm.h
index dd045dd..6a1968a 100644
--- a/qemu-kvm.h
+++ b/qemu-kvm.h
@@ -165,6 +165,7 @@ void qemu_kvm_cpu_stop(CPUState *env);
 #define kvm_enabled() (kvm_allowed)
 #define qemu_kvm_irqchip_in_kernel() kvm_irqchip_in_kernel(kvm_context)
 #define qemu_kvm_pit_in_kernel() kvm_pit_in_kernel(kvm_context)
+#define qemu_kvm_has_gsi_routing() kvm_has_gsi_routing(kvm_context)
 #define kvm_has_sync_mmu() qemu_kvm_has_sync_mmu()
 void kvm_init_vcpu(CPUState *env);
 void kvm_load_tsc(CPUState *env);
@@ -173,6 +174,7 @@ void kvm_load_tsc(CPUState *env);
 #define kvm_nested 0
 #define qemu_kvm_irqchip_in_kernel() (0)
 #define qemu_kvm_pit_in_kernel() (0)
+#define qemu_kvm_has_gsi_routing() (0)
 #define kvm_has_sync_mmu() (0)
 #define kvm_load_registers(env) do {} while(0)
 #define kvm_save_registers(env) do {} while(0)
diff --git a/sysemu.h b/sysemu.h
index 1f45fd6..292bbc3 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -93,6 +93,7 @@ extern int graphic_width;
 extern int graphic_height;
 extern int graphic_depth;
 extern int nographic;
+extern int irq0override;
 extern const char *keyboard_layout;
 extern int win2k_install_hack;
 extern int rtc_td_hack;
diff --git a/vl.c b/vl.c
index d9f0607..0bffc82 100644
--- a/vl.c
+++ b/vl.c
@@ -207,6 +207,7 @@ static int vga_ram_size;
 enum vga_retrace_method vga_retrace_method = VGA_RETRACE_DUMB;
 static DisplayState *display_state;
 int nographic;
+int irq0override;
 static int curses;
 static int sdl;
 const char* keyboard_layout = NULL;
@@ -5035,6 +5036,7 @@ int main(int argc, char **argv, char **envp)
 vga_ram_size = VGA_RAM_SIZE;
 snapshot = 0;
 nographic = 0;
+irq0override = 1;


Why not do that when defining the variable? Yeah I realize this is how
it is done for other variables too, but why?

  
Good question. I don't think there is any good reason. I was conforming 
to the existing style.

 curses = 0;
 kernel_filename = NULL;
 kernel_cmdline = ;
@@ -6129,8 +6131,14 @@ int main(int argc, char **argv, char **envp)
 }
 }
 
-if (kvm_enabled())

-   kvm_init_ap();
+if (kvm_enabled()) {
+   kvm_init_ap();
+#ifdef USE_KVM
+if (kvm_irqchip  !qemu_kvm_has_gsi_routing()) {
+irq0override = 0;
+}
+#endif
+}
 
 machine-init(ram_size, 

Re: [PATCH 1/6] kvm-s390: Fix memory slot versus run

2009-05-12 Thread Christian Ehrhardt

Avi Kivity wrote:

Christian Ehrhardt wrote:

Avi Kivity wrote:

Christian Ehrhardt wrote:


The bad thing on vcpu-request in that case is that I don't want 
the async behaviour of vcpu-requests in that case, I want the 
memory slot updated in all vcpu's when the ioctl is returning.


You mean, the hardware can access the vcpu control block even when 
the vcpu is not running? 
No, hardware only uses it with a running vcpu, but I realised my own 
fault while changing the code to vcpu-request style.
For s390 I need to update the KVM-arch and *all* 
vcpu-arch-sie_block... data synchronously.


Out of interest, can you explain why?

Sure I'll try to give an example.

a) The whole guest has one memory slot representing all it's memory. 
Therefore some important values like guest_origin and guest_memsize (one 
slot so it's just addr+size) are kept at VM level in kvm-arch.
b) We fortunately have cool hardware support for nearly everything(tm) 
:-) In this case for example we set in vcpu-arch.sie_block the values 
for origin and size translated into a limit to get memory management 
virtualization support.
c) we have other code e.g. all our copy_from/to_guest stuff that uses 
the kvm-arch values


If we would allow e.g. updates of a memslot (or as the patch supposes to 
harden the set_memory_region code against inconsiderate code changes in 
other sections) it might happen that we set the kvm-arch information 
but the vcpu-arch-sie_block stuff not until next reentry. Now 
concurrently the running vcpu could cause some kind of fault that 
involves a copy_from/to_guest. That way we could end up with potentially 
invalid handling of that fault (fault handling and running guest would 
use different userspace adresses until it is synced on next vcpu 
reentry) - it's theoretical I know, but it might cause some issues that 
would be hard to find.


On the other hand for the long term I wanted to note that all our 
copy_from/to_guest functions is per vcpu, so when we some day implement 
updateable memslots, multiple memslots or even just fill free time(tm) 
and streamline our code we could redesign that origin/size storage. This 
could be done multiple ways, either just store it per vcpu or with a 
lock for the kvm-arch level variables - both ways and maybe more could 
then use the vcpu-request based approach, but unfortunately it's 
neither part of that patch nor of the current effort to do that.


The really good thing is, because of our discussion about that I now 
have a really detailed idea how I can improve that code aside from this 
bugfix patch (lets hope not too far in the future).



That makes the per vcpu resync on next entry approach not feasible.

On the other hand I realized at the same moment that the livelock 
should be no issue for us, because as I mentioned:

a) only one memslot
b) a vcpu can't run without memslot
So I don't even need to kick out vcpu's, they just should not be 
running.
Until we ever support multiple slots, or updates of the existing 
single slot this should be ok, so is the bugfix patch this should be.
To avoid a theoretical deadlock in case other code is changing 
(badly) it should be fair to aquire the lock with mutex_trylock and 
return -EINVAL if we did not get all locks.


OK.





--

Grüsse / regards, 
Christian Ehrhardt

IBM Linux Technology Center, Open Virtualization

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/4] Userspace changes for configuring irq0-inti2override (v3)

2009-05-12 Thread Gleb Natapov
On Tue, May 12, 2009 at 09:20:36AM -0400, Beth Kon wrote:
 Gleb Natapov wrote:
 On Tue, May 12, 2009 at 01:22:06PM +0300, Avi Kivity wrote:
   
 Gleb Natapov wrote:
 
  for (i = 0; i  24; ++i) {
 -r = kvm_add_irq_route(kvm_context, i, KVM_IRQCHIP_IOAPIC, i);
 +if (i == 0) {
 +r = kvm_add_irq_route(kvm_context, i, 
 KVM_IRQCHIP_IOAPIC, 2);
 +} else if (i != 2) {
 +r = kvm_add_irq_route(kvm_context, i, 
 KVM_IRQCHIP_IOAPIC, i);
 +}
 
 There is no entry for IRQ2, is this OK? What happens if IRQ2 triggers?
 
 irq 2 is the PIC cascade interrupt.  If it is somehow triggered, the  
 kernel will ignore it.

 
 But here we configure IOAPIC routing. What if IOAPIC is used for
 interrupt delivery and something triggers irq2. There is no entry
 describing it in IOAPIC routing table, so what gsi it will be mapped to?

 --
   
 The ACPI spec states that systems that support both APIC and dual-8259  
 interrupt models must map system interrupt vectors 0-15 to 8259 IRQs  
 0-15, except where interrupt source overrides are provided. We provide  
 an irq0-inti2 override, and no irq2 override, so irq2 must be unused.
OK. I hope we do what ACPI spec states and irq2 never reaches IOAPIC.


--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 0/2] qemu-kvm: MSI-X support

2009-05-12 Thread Michael S. Tsirkin
On Mon, May 11, 2009 at 05:24:25PM -0500, Anthony Liguori wrote:
 Michael S. Tsirkin wrote:
 Here's a draft MSI-X support patch. Among missing features:
 save/load support, and command-line flag to control the
 feature. This is on top of qemu-kvm: msi-x is disabled
 without kvm interrupt injection support for now.
   

 What's your impression of how much work would be to get this going on  
 top of upstream QEMU?

 I'm willing to borrow a few cycles to help out here.  I'd really like to  
 see this series go in via QEMU if possible.

It seems that if I just call apic_deliver_irq each time
I want to send MSI, things will work.

However, large part of the msix code is managing IRQs versus kernel,
and I'm not sure it's a wise investment of effort to rip it all out. So
IMHO, what's missing is API that abstracts managing irq routes in kvm,
specifically abstract this stuff in some way:
kvm_get_irq_route_gsi
kvm_add_routing_entry
kvm_del_routing_entry
kvm_commit_irq_routes
kvm_set_irq
How hard is that?

For now, this API could be a stub that just stores the routes somewhere,
and set_irq would call the local apic emulation, along the lines of:

uint8_t dest = (addr_lo  MSI_ADDR_DEST_ID_MASK)
 MSI_ADDR_DEST_ID_SHIFT;
uint8_t vector = (addr_hi  MSI_DATA_VECTOR_MASK)
 MSI_DATA_VECTOR_SHIFT;
uint8_t dest_mode = (addr_lo  MSI_ADDR_DEST_MODE_SHIFT)  0x1;
uint8_t trigger_mode = (data  MSI_DATA_TRIGGER_SHIFT)  0x1;
uint8_t delivery_mode = (data  MSI_DATA_DELIVERY_MODE_SHIFT) 
0x7;
apic_deliver_irq(dest, dest_mode, delivery_mode, vector, 0,
 trigger_mode);

I would be happy to port my msix code to work on top of this API.
Willing to help?


-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH, RFC] virtio_blk: add cache flush command

2009-05-12 Thread Rusty Russell
On Mon, 11 May 2009 06:09:08 pm Christoph Hellwig wrote:
 Do we need a new feature flag for this command or can we expect that
 all previous barrier support was buggy enough anyway?

You mean reuse the VIRTIO_BLK_F_BARRIER for this as well?  Seems fine.

AFAIK only lguest offered that, and lguest has no ABI.  Best would be to 
implement VIRTIO_BLK_T_FLUSH as well; it's supposed to be demo code, and it 
should be easy).

Thanks,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] BIOS changes for configuring irq0-inti2 override(v3)

2009-05-12 Thread Beth Kon

Gleb Natapov wrote:

On Mon, May 11, 2009 at 01:29:43PM -0400, Beth Kon wrote:
  

Signed-off-by: Beth Kon e...@us.ibm.com

diff --git a/kvm/bios/rombios32.c b/kvm/bios/rombios32.c
index cbd5f15..53359b8 100755
--- a/kvm/bios/rombios32.c
+++ b/kvm/bios/rombios32.c
@@ -444,6 +444,9 @@ uint32_t cpuid_features;
 uint32_t cpuid_ext_features;
 unsigned long ram_size;
 uint64_t ram_end;
+#ifdef BX_QEMU
+uint8_t irq0_override;
+#endif
 #ifdef BX_USE_EBDA_TABLES
 unsigned long ebda_cur_addr;
 #endif
@@ -485,6 +488,7 @@ void wrmsr_smp(uint32_t index, uint64_t val)
 #define QEMU_CFG_ARCH_LOCAL 0x8000
 #define QEMU_CFG_ACPI_TABLES  (QEMU_CFG_ARCH_LOCAL + 0)
 #define QEMU_CFG_SMBIOS_ENTRIES  (QEMU_CFG_ARCH_LOCAL + 1)
+#define QEMU_CFG_IRQ0_OVERRIDE   (QEMU_CFG_ARCH_LOCAL + 2)
 
 int qemu_cfg_port;
 
@@ -553,6 +557,18 @@ uint64_t qemu_cfg_get64 (void)

 }
 #endif
 
+#ifdef BX_QEMU

+void irq0_override_probe(void)
+{
+if(qemu_cfg_port) {
+qemu_cfg_select(QEMU_CFG_IRQ0_OVERRIDE);
+qemu_cfg_read(irq0_override, 1);
+return;
+}
+memset(irq0_override, 0, 1);
+}


Why memset and not irq0_override = 0, actually it should zero already.

  
This was an oversight, left over from some early cut-and-paste coding I 
was doing. You're right - not necessary. Thanks.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
  


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/1] KVM: Fix potentially recursively get kvm lock

2009-05-12 Thread Yang, Sheng
On Tuesday 12 May 2009 19:55:24 Marcelo Tosatti wrote:
 On Tue, May 12, 2009 at 05:32:09PM +0800, Sheng Yang wrote:
  kvm_vm_ioctl_deassign_dev_irq() would potentially recursively get
  kvm-lock, because it called kvm_deassigned_irq() which implicit hold
  kvm-lock by calling deassign_host_irq().
 
  Fix it by move kvm_deassign_irq() out of critial region. And add the
  missing lock for deassign_guest_irq().
 
  Reported-by: Alex Williamson alex.william...@hp.com
  Signed-off-by: Sheng Yang sh...@linux.intel.com
  ---
   virt/kvm/kvm_main.c |   14 +++---
   1 files changed, 7 insertions(+), 7 deletions(-)
 
  diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
  index 4d00942..3c69655 100644
  --- a/virt/kvm/kvm_main.c
  +++ b/virt/kvm/kvm_main.c
  @@ -215,6 +215,8 @@ static void kvm_assigned_dev_ack_irq(struct
  kvm_irq_ack_notifier *kian) static void deassign_guest_irq(struct kvm
  *kvm,
 struct kvm_assigned_dev_kernel *assigned_dev)
   {
  +   mutex_lock(kvm-lock);
  +
  kvm_unregister_irq_ack_notifier(assigned_dev-ack_notifier);
  assigned_dev-ack_notifier.gsi = -1;
 
  @@ -222,6 +224,8 @@ static void deassign_guest_irq(struct kvm *kvm,
  kvm_free_irq_source_id(kvm, assigned_dev-irq_source_id);
  assigned_dev-irq_source_id = -1;
  assigned_dev-irq_requested_type = ~(KVM_DEV_IRQ_GUEST_MASK);
  +
  +   mutex_unlock(kvm-lock);
   }
 
   /* The function implicit hold kvm-lock mutex due to cancel_work_sync()
  */ @@ -558,20 +562,16 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct
  kvm *kvm, struct kvm_assigned_irq
   *assigned_irq)
   {
  -   int r = -ENODEV;
  struct kvm_assigned_dev_kernel *match;
 
  mutex_lock(kvm-lock);
  -
  match = kvm_find_assigned_dev(kvm-arch.assigned_dev_head,
assigned_irq-assigned_dev_id);
  +   mutex_unlock(kvm-lock);

 assigned_dev list is protected by kvm-lock. So you could have another
 ioctl adding to it at the same time you're searching.

Oh, yes... My fault... 

 Could either have a separate kvm-assigned_devs_lock, to protect
 kvm-arch.assigned_dev_head (users are ioctls that manipulate it), or
 change the IRQ injection to use a separate spinlock, kill the workqueue
 and call kvm_set_irq from the assigned device interrupt handler.

Peferred the latter, though needs more work. But the only reason for put a 
workqueue here is because kvm-lock is a mutex? I can't believe... If so, I 
think we had made a big mistake - we have to fix all kinds of racy problem 
caused by this, but finally find it's unnecessary... 

Maybe another reason is kvm_kick_vcpu(), but have already fixed by you.

Continue to check the code...

-- 
regards
Yang, Sheng
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] Userspace changes for KVM HPET (v3)

2009-05-12 Thread Beth Kon

Avi Kivity wrote:

Beth Kon wrote:

Signed-off-by: Beth Kon e...@us.ibm.com


diff --git a/hw/hpet.c b/hw/hpet.c
index c7945ec..100abf5 100644
--- a/hw/hpet.c
+++ b/hw/hpet.c
@@ -30,6 +30,7 @@
 #include console.h
 #include qemu-timer.h
 #include hpet_emul.h
+#include qemu-kvm.h
 
 //#define HPET_DEBUG

 #ifdef HPET_DEBUG
@@ -48,6 +49,28 @@ uint32_t hpet_in_legacy_mode(void)
 return 0;
 }
 
+static void hpet_legacy_enable(void)

+{
+if (qemu_kvm_pit_in_kernel()) {
+kvm_kpit_disable();
+dprintf(qemu: hpet disabled kernel pit\n);
+} else {
+hpet_pit_disable();
+dprintf(qemu: hpet disabled userspace pit\n);
+}
+}
+
+static void hpet_legacy_disable(void)
+{
+if (qemu_kvm_pit_in_kernel()) {
+kvm_kpit_enable();
+dprintf(qemu: hpet enabled kernel pit\n);
+} else {
+hpet_pit_enable();
+dprintf(qemu: hpet enabled userspace pit\n);
+}
+}
  
I think it's better to move these into hpet_pit_enable() and 
hpet_pit_enable().  This avoids changing the calls below, and puts pit 
stuff in i8254.c instead of hpet.c.


Might also need to be called from hpet_load(); probably a problem in 
upstream as well.


My assumption about hpet_load was that the correct pit state would be 
established via pit_load (since all saves/loads are done together).  But 
when I wrote this, I was thinking only about the userspace pit (for 
qemu). I'm not sure how the load concept applies to kernel state.  Do 
I need to explicitly re-enable or disable the kernel pit during load?

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] virtio: find_vqs/del_vqs virtio operations

2009-05-12 Thread Rusty Russell
On Tue, 12 May 2009 07:49:32 am Michael S. Tsirkin wrote:
 This replaces find_vq/del_vq with find_vqs/del_vqs virtio operations,
 and updates all drivers. This is needed for MSI support, because MSI
 needs to know the total number of vectors upfront.

Sorry, is this not on top of my virtio_device vq linked list patch?

Two other things: prefer vq_callback_t as the type name, and perhaps consider 
varargs for the callbacks (or would that be too horrible at the implementation 
end?)

Thanks,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/1] KVM: Fix potentially recursively get kvm lock

2009-05-12 Thread Marcelo Tosatti
On Tue, May 12, 2009 at 10:13:36PM +0800, Yang, Sheng wrote:
   + mutex_unlock(kvm-lock);
 
  assigned_dev list is protected by kvm-lock. So you could have another
  ioctl adding to it at the same time you're searching.
 
 Oh, yes... My fault... 
 
  Could either have a separate kvm-assigned_devs_lock, to protect
  kvm-arch.assigned_dev_head (users are ioctls that manipulate it), or
  change the IRQ injection to use a separate spinlock, kill the workqueue
  and call kvm_set_irq from the assigned device interrupt handler.
 
 Peferred the latter, though needs more work. But the only reason for put a 
 workqueue here is because kvm-lock is a mutex? I can't believe... If so, I 
 think we had made a big mistake - we have to fix all kinds of racy problem 
 caused by this, but finally find it's unnecessary... 

One issue is that kvm_set_irq can take too long while interrupts are
blocked, and you'd have to disable interrupts in other contexes that
inject interrupts (say qemu-ioctl(SET_INTERRUPT)-...-), so all i can
see is a tradeoff.

guess mode on

But the interrupt injection path seems to be pretty short and efficient
to happen in host interrupt context.

guess mode off

Avi, Gleb?

 Maybe another reason is kvm_kick_vcpu(), but have already fixed by you.

Note you tested the spinlock_irq patch with GigE and there was no
significant performance regression right?

 
 Continue to check the code...
 
 -- 
 regards
 Yang, Sheng
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: PowerPC page faults

2009-05-12 Thread Hollis Blanchard
On Monday 11 May 2009 17:17:53 Anthony Liguori wrote:
 Hollis Blanchard wrote:
  On Mon, 2009-05-11 at 12:54 -0500, Anthony Liguori wrote:

  For future ppcemb's, do you know if there is an equivalent of a PF exit 
  type?  Does the hardware squirrel away the faulting address somewhere 
  and set PC to the start of the instruction?  If so, no guest memory load 
  should be required.
  
 
  Ahhh... you're saying that the address itself (or offset within a page)
  is the hypercall token, totally separate from IO emulation, and so we
  could ignore the access size.
 
 No, I'm not being nearly that clever.

I started thinking you weren't, but then I realized you must be working 
several levels above me and just forgot to explain... :)

 I was suggesting that hardware virtualization support in future PPC 
 systems might contain a mechanism to intercept a guest-mode TLB miss.  
 If it did, it would be useful if that guest-mode TLB miss exit 
 contained extra information somewhere that included the PC of the 
 faulting instruction, the address response for the fault, and enough 
 information to handle the fault without instruction decoding.
 
 I assume all MMIO comes from the same set of instructions in PPC?  
 Something like ld/st instructions?  Presumably all you need to know from 
 instruction decoding is the destination register and whether it was a 
 read or write?

In addition to register source/target, we also need to know the access size of 
the memory reference. That information isn't stuffed into registers for us by 
hardware, and it's not in published specifications for future hardware.

Now, if you wanted to define a hypercall as a byte access within a particular 
4K page, where the register source/target is ignored, that could be 
interesting, but I don't know if that's relevant to this hypercall vs MMIO 
discussion.

-- 
Hollis Blanchard
IBM Linux Technology Center
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Enable IRQ windows after exception injection if there are pending virq

2009-05-12 Thread Dong, Eddie

I didn't take many test since our PTS system stop working now due to KVM 
userspace
build changes. But since the logic is pretty simple, so I want to post here to 
see comments.
Thx, eddie




If there is pending irq after an virtual exception is injected,
KVM needs to enable IRQ window to trap back earlier once 
the exception is handled.

Signed-off-by: Eddie Dong eddie.d...@intel.com

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 308d8e9..f8ceaea 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3154,15 +3154,18 @@ static void inject_irq(struct kvm_vcpu *vcpu)
}
 }
 
-static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void inject_pending_irq(struct kvm_vcpu *vcpu)
 {
-   bool req_int_win = !irqchip_in_kernel(vcpu-kvm) 
-   kvm_run-request_interrupt_window;
-
if (vcpu-guest_debug  KVM_GUESTDBG_SINGLESTEP)
kvm_x86_ops-drop_interrupt_shadow(vcpu);
 
inject_irq(vcpu);
+}
+
+static void set_pending_virq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+   bool req_int_win = !irqchip_in_kernel(vcpu-kvm) 
+   kvm_run-request_interrupt_window;
 
/* enable NMI/IRQ window open exits if needed */
if (vcpu-arch.nmi_pending)
@@ -3229,7 +3232,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
if (vcpu-arch.exception.pending)
__queue_exception(vcpu);
else
-   inject_pending_irq(vcpu, kvm_run);
+   inject_pending_irq(vcpu);
+
+   set_pending_virq(vcpu, kvm_run);
 
if (kvm_lapic_enabled(vcpu)) {
if (!vcpu-arch.apic-vapic_addr)

irq_windows.patch
Description: irq_windows.patch


[PATCH 0/2] Deal with shadow interrupts after emulated instructions

2009-05-12 Thread Glauber Costa
Same as before, addressing avi's comments


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] deal with interrupt shadow state for emulated instruction

2009-05-12 Thread Glauber Costa
we currently unblock shadow interrupt state when we skip an instruction,
but failing to do so when we actually emulate one. This blocks interrupts
in key instruction blocks, in particular sti; hlt; sequences

If the instruction emulated is an sti, we have to block shadow interrupts.
The same goes for mov ss. pop ss also needs it, but we don't currently
emulate it.

Without this patch, I cannot boot gpxe option roms at vmx machines.
This is described at https://bugzilla.redhat.com/show_bug.cgi?id=494469

Signed-off-by: Glauber Costa glom...@redhat.com
CC: H. Peter Anvin h...@zytor.com
CC: Avi Kivity a...@redhat.com
CC: Gleb Natapov g...@redhat.com
---
 arch/x86/include/asm/kvm_x86_emulate.h |3 +++
 arch/x86/kvm/x86.c |6 +-
 arch/x86/kvm/x86_emulate.c |   26 +-
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_x86_emulate.h 
b/arch/x86/include/asm/kvm_x86_emulate.h
index be40d6e..b7ed2c4 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -155,6 +155,9 @@ struct x86_emulate_ctxt {
int mode;
u32 cs_base;
 
+   /* interruptibility state, as a result of execution of STI or MOV SS */
+   int interruptibility;
+
/* decode cache */
struct decode_cache decode;
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3d8fcc5..b45baff 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2362,7 +2362,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
u16 error_code,
int emulation_type)
 {
-   int r;
+   int r, shadow_mask;
struct decode_cache *c;
 
kvm_clear_exception_queue(vcpu);
@@ -2416,6 +2416,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
}
 
r = x86_emulate_insn(vcpu-arch.emulate_ctxt, emulate_ops);
+   shadow_mask = vcpu-arch.emulate_ctxt.interruptibility;
+
+   if (r == 0)
+   kvm_x86_ops-set_interrupt_shadow(vcpu, shadow_mask);
 
if (vcpu-arch.pio.string)
return EMULATE_DO_MMIO;
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index d2664fc..b847523 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1372,6 +1372,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct 
x86_emulate_ops *ops)
int io_dir_in;
int rc = 0;
 
+   ctxt-interruptibility = 0;
+
/* Shadow copy of register state. Committed on successful emulation.
 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
 * modify them.
@@ -1618,6 +1620,15 @@ special_insn:
int err;
 
sel = c-src.val;
+   if (c-modrm_reg == VCPU_SREG_SS) {
+   u32 int_shadow =
+   kvm_x86_ops-get_interrupt_shadow(ctxt-vcpu,
+ 
X86_SHADOW_INT_MOV_SS);
+   /* See sti emulation for an explanation of this */
+   if (!(int_shadow  X86_SHADOW_INT_MOV_SS))
+   ctxt-interruptibility = X86_SHADOW_INT_MOV_SS;
+   }
+
if (c-modrm_reg = 5) {
type_bits = (c-modrm_reg == 1) ? 9 : 1;
err = kvm_load_segment_descriptor(ctxt-vcpu, sel,
@@ -1846,10 +1857,23 @@ special_insn:
ctxt-eflags = ~X86_EFLAGS_IF;
c-dst.type = OP_NONE;  /* Disable writeback. */
break;
-   case 0xfb: /* sti */
+   case 0xfb: { /* sti */
+   u32 int_shadow =
+   kvm_x86_ops-get_interrupt_shadow(ctxt-vcpu,
+ X86_SHADOW_INT_STI);
+   /*
+* an sti; sti; sequence only disable interrupts for the first
+* instruction. So, if the last instruction, be it emulated or
+* not, left the system with the INT_STI flag enabled, it
+* means that the last instruction is an sti. We should not
+* leave the flag on in this case
+*/
+   if (!(int_shadow  X86_SHADOW_INT_STI))
+   ctxt-interruptibility = X86_SHADOW_INT_STI;
ctxt-eflags |= X86_EFLAGS_IF;
c-dst.type = OP_NONE;  /* Disable writeback. */
break;
+   }
case 0xfc: /* cld */
ctxt-eflags = ~EFLG_DF;
c-dst.type = OP_NONE;  /* Disable writeback. */
-- 
1.5.6.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] replace drop_interrupt_shadow by set_interrupt_shadow

2009-05-12 Thread Glauber Costa
This patch replaces drop_interrupt_shadow with the more
general set_interrupt_shadow, that can either drop or raise
it, depending on its parameter.

Signed-off-by: Glauber Costa glom...@redhat.com
CC: H. Peter Anvin h...@zytor.com
CC: Avi Kivity a...@redhat.com
CC: Gleb Natapov g...@redhat.com
---
 arch/x86/include/asm/kvm_host.h|3 +-
 arch/x86/include/asm/kvm_x86_emulate.h |3 ++
 arch/x86/kvm/svm.c |   32 +++-
 arch/x86/kvm/vmx.c |   49 +--
 arch/x86/kvm/x86.c |2 +-
 5 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8e680c3..3d933cf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -510,6 +510,8 @@ struct kvm_x86_ops {
void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+   void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+   u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
unsigned char *hypercall_addr);
void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
@@ -521,7 +523,6 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
-   void (*drop_interrupt_shadow)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h 
b/arch/x86/include/asm/kvm_x86_emulate.h
index 6a15973..be40d6e 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -143,6 +143,9 @@ struct decode_cache {
struct fetch_cache fetch;
 };
 
+#define X86_SHADOW_INT_MOV_SS  1
+#define X86_SHADOW_INT_STI 2
+
 struct x86_emulate_ctxt {
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ef43a18..ae29a95 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -202,6 +202,27 @@ static int is_external_interrupt(u32 info)
return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 }
 
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+   struct vcpu_svm *svm = to_svm(vcpu);
+   u32 ret = 0;
+
+   if (svm-vmcb-control.int_state  SVM_INTERRUPT_SHADOW_MASK)
+   ret |= X86_SHADOW_INT_STI  X86_SHADOW_INT_MOV_SS;
+   return ret  mask;
+}
+
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+   struct vcpu_svm *svm = to_svm(vcpu);
+
+   if (mask == 0)
+   svm-vmcb-control.int_state = ~SVM_INTERRUPT_SHADOW_MASK;
+   else
+   svm-vmcb-control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+
+}
+
 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
@@ -215,7 +236,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
   __func__, kvm_rip_read(vcpu), svm-next_rip);
 
kvm_rip_write(vcpu, svm-next_rip);
-   svm-vmcb-control.int_state = ~SVM_INTERRUPT_SHADOW_MASK;
+   svm_set_interrupt_shadow(vcpu, 0);
 }
 
 static int has_svm(void)
@@ -2229,12 +2250,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
new_asid(svm, svm_data);
 }
 
-static void svm_drop_interrupt_shadow(struct kvm_vcpu *vcpu)
-{
-   struct vcpu_svm *svm = to_svm(vcpu);
-   svm-vmcb-control.int_state = ~SVM_INTERRUPT_SHADOW_MASK;
-}
-
 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2637,6 +2652,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.run = svm_vcpu_run,
.handle_exit = handle_exit,
.skip_emulated_instruction = skip_emulated_instruction,
+   .set_interrupt_shadow = svm_set_interrupt_shadow,
+   .get_interrupt_shadow = svm_get_interrupt_shadow,
.patch_hypercall = svm_patch_hypercall,
.set_irq = svm_set_irq,
.set_nmi = svm_inject_nmi,
@@ -2646,7 +2663,6 @@ static struct kvm_x86_ops svm_x86_ops = {
.enable_nmi_window = enable_nmi_window,
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
-   .drop_interrupt_shadow = svm_drop_interrupt_shadow,
 
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e8a5649..c28baf8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -736,23 

Problem doing pci passthrough of the network card without VT-d

2009-05-12 Thread Passera, Pablo R
Hi List,
I am having problems to do pci passthrough to a network card without 
using VT-d. The card is present in the guest but with a different model (Intel 
Corporation 82801I Gigabit Ethernet Controller (rev 2)) and it does not work. 
The qemu line that I used is:

./devel/bin/qemu-system-x86_64 -hda ./dm.img -m 256 -pcidevice 
host=00:19.0,dma=none -net none

Before running qemu I did

echo 8086 294c  /sys/bus/pci/drivers/pci-stub/new_id
echo :00:19.0  /sys/bus/pci/drivers/e1000e/unbind
echo :00:19.0  /sys/bus/pci/drivers/pci-stub/bind

This is the lspci -tv output

-[:00]-+-00.0  Intel Corporation 82X38/X48 Express DRAM Controller
   +-01.0-[:01]00.0  nVidia Corporation G80 [GeForce 8800 GTX]
   +-19.0  Intel Corporation 82566DC-2 Gigabit Network Connection
   +-1a.0  Intel Corporation 82801I (ICH9 Family) USB UHCI Controller #4
   +-1a.1  Intel Corporation 82801I (ICH9 Family) USB UHCI Controller #5
   +-1a.2  Intel Corporation 82801I (ICH9 Family) USB UHCI Controller #6
   +-1a.7  Intel Corporation 82801I (ICH9 Family) USB2 EHCI Controller 
#2
   +-1b.0  Intel Corporation 82801I (ICH9 Family) HD Audio Controller
   +-1c.0-[:02]--
   +-1c.4-[:03]00.0  Marvell Technology Group Ltd. 88SE6121 
SATA II Controller
   +-1d.0  Intel Corporation 82801I (ICH9 Family) USB UHCI Controller #1
   +-1d.1  Intel Corporation 82801I (ICH9 Family) USB UHCI Controller #2
   +-1d.2  Intel Corporation 82801I (ICH9 Family) USB UHCI Controller #3
   +-1d.7  Intel Corporation 82801I (ICH9 Family) USB2 EHCI Controller 
#1
   +-1e.0-[:04]03.0  Texas Instruments TSB43AB22/A 
IEEE-1394a-2000 Controller (PHY/Link)
   +-1f.0  Intel Corporation 82801IR (ICH9R) LPC Interface Controller
   +-1f.2  Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 4 port SATA 
IDE Controller
   +-1f.3  Intel Corporation 82801I (ICH9 Family) SMBus Controller
   \-1f.5  Intel Corporation 82801I (ICH9 Family) 2 port SATA IDE 
Controller


I am getting the following error in host dmesg

e1000e :00:19.0: PCI INT A disabled
pci-stub :00:19.0: PCI INT A - GSI 20 (level, low) - IRQ 20
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
DMAR:[DMA Read] Request device [00:19.0] fault addr baee000
DMAR:[fault reason 02] Present bit in context entry is clear
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
DMAR:[DMA Read] Request device [00:19.0] fault addr b90c000
DMAR:[fault reason 02] Present bit in context entry is clear

And in a second run I got

e1000e :00:19.0: PCI INT A disabled
pci-stub :00:19.0: PCI INT A - GSI 20 (level, low) - IRQ 20
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
DMAR:[DMA Read] Request device [00:19.0] fault addr ba8
DMAR:[fault reason 06] PTE Read access is not set
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
DMAR:[DMA Read] Request device [00:19.0] fault addr baa4000
DMAR:[fault reason 06] PTE Read access is not set

It does work if I enable VT-d, even though the network card is recognized with 
a different model. However, I need this to work without VT-d. Which could be 
the problem here? Is this supported?

Thanks a lot,
Pablo


More information and logs...

Kernel, kvm and network driver versions:

Host Kernel 2.6.29.2
Guest Kernel 2.6.23.1-42.fc8
KVM-85
Network card driver (guest and host) e1000e 0.5.18.3

Device assignment log
init_assigned_device: Registering real physical device 00:19.0 (bus=0 dev=19 
func=0)
get_real_device: region 0 size 131072 start 0x9320 type 512 resource_fd 12
get_real_device: region 1 size 4096 start 0x93224000 type 512 resource_fd 13
get_real_device: region 2 size 32 start 0x30e0 type 256 resource_fd 0
assigned_dev_pci_read_config: (4.0): address= val=0x8086 len=2
assigned_dev_pci_read_config: (4.0): address=0002 val=0x294c len=2
assigned_dev_pci_read_config: (4.0): address= val=0x8086 len=2
assigned_dev_pci_read_config: (4.0): address=0002 val=0x294c len=2
assigned_dev_pci_read_config: (4.0): address= val=0x8086 len=2
assigned_dev_pci_read_config: (4.0): address=0002 val=0x294c len=2
assigned_dev_pci_read_config: (4.0): address=000a val=0x0200 len=2

[PATCH 4/6]: kvm-s390: Unlink vcpu on destroy - v2

2009-05-12 Thread ehrhardt
From: Carsten Otte co...@de.ibm.com

This patch makes sure we do unlink a vcpu's sie control block
from the system control area in kvm_arch_vcpu_destroy. This
prevents illegal accesses to the sie control block from other
virtual cpus after free.

Reported-by: Mijo Safradin m...@linux.vnet.ibm.com
Signed-off-by: Carsten Otte co...@de.ibm.com
Signed-off-by: Christian Ehrhardt ehrha...@de.ibm.com
---
 kvm-s390.c |   10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

Index: kvm/arch/s390/kvm/kvm-s390.c
===
--- kvm.orig/arch/s390/kvm/kvm-s390.c
+++ kvm/arch/s390/kvm/kvm-s390.c
@@ -195,6 +195,10 @@ out_nokvm:
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
VCPU_EVENT(vcpu, 3, %s, free cpu);
+   if (vcpu-kvm-arch.sca-cpu[vcpu-vcpu_id].sda ==
+   (__u64) vcpu-arch.sie_block)
+   vcpu-kvm-arch.sca-cpu[vcpu-vcpu_id].sda = 0;
+   smp_mb();
free_page((unsigned long)(vcpu-arch.sie_block));
kvm_vcpu_uninit(vcpu);
kfree(vcpu);
@@ -307,8 +311,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(st
 
vcpu-arch.sie_block-icpua = id;
BUG_ON(!kvm-arch.sca);
-   BUG_ON(kvm-arch.sca-cpu[id].sda);
-   kvm-arch.sca-cpu[id].sda = (__u64) vcpu-arch.sie_block;
+   if (!kvm-arch.sca-cpu[id].sda)
+   kvm-arch.sca-cpu[id].sda = (__u64) vcpu-arch.sie_block;
+   else
+   BUG_ON(!kvm-vcpus[id]); /* vcpu does already exist */
vcpu-arch.sie_block-scaoh = (__u32)(((__u64)kvm-arch.sca)  32);
vcpu-arch.sie_block-scaol = (__u32)(__u64)kvm-arch.sca;
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/6] kvm-s390: Fix memory slot versus run - v3

2009-05-12 Thread ehrhardt
From: Carsten Otte co...@de.ibm.com

This patch fixes an incorrectness in the kvm backend for s390.
In case virtual cpus are being created before the corresponding
memory slot is being registered, we need to update the sie
control blocks for the virtual cpus.

*updates in v3*
In consideration of the s390 memslot constraints locking was changed
to trylock. These locks should never be held, as vcpu's can't run without
the single memslot we just assign when running this code. To ensure this
never deadlocks in case other code changes the code uses trylocks and bail
out if it can't get all locks.

Additionally most of the discussed special conditions for s390 like
only one memslot and no user_alloc are now checked for validity in
kvm_arch_set_memory_region.

Reported-by: Mijo Safradin m...@linux.vnet.ibm.com
Signed-off-by: Carsten Otte co...@de.ibm.com
Signed-off-by: Christian Ehrhardt ehrha...@de.ibm.com
---
 kvm-s390.c |   36 +++-
 1 file changed, 31 insertions(+), 5 deletions(-)

Index: kvm/arch/s390/kvm/kvm-s390.c
===
--- kvm.orig/arch/s390/kvm/kvm-s390.c
+++ kvm/arch/s390/kvm/kvm-s390.c
@@ -657,6 +657,8 @@ int kvm_arch_set_memory_region(struct kv
struct kvm_memory_slot old,
int user_alloc)
 {
+   int i;
+
/* A few sanity checks. We can have exactly one memory slot which has
   to start at guest virtual zero and which has to be located at a
   page boundary in userland and which has to end at a page boundary.
@@ -664,7 +666,7 @@ int kvm_arch_set_memory_region(struct kv
   vmas. It is okay to mmap() and munmap() stuff in this slot after
   doing this call at any time */
 
-   if (mem-slot)
+   if (mem-slot || kvm-arch.guest_memsize)
return -EINVAL;
 
if (mem-guest_phys_addr)
@@ -676,15 +678,39 @@ int kvm_arch_set_memory_region(struct kv
if (mem-memory_size  (PAGE_SIZE - 1))
return -EINVAL;
 
+   if (!user_alloc)
+   return -EINVAL;
+
+   /* lock all vcpus */
+   for (i = 0; i  KVM_MAX_VCPUS; ++i) {
+   if (!kvm-vcpus[i])
+   continue;
+   if (!mutex_trylock(kvm-vcpus[i]-mutex))
+   goto fail_out;
+   }
+
kvm-arch.guest_origin = mem-userspace_addr;
kvm-arch.guest_memsize = mem-memory_size;
 
-   /* FIXME: we do want to interrupt running CPUs and update their memory
-  configuration now to avoid race conditions. But hey, changing the
-  memory layout while virtual CPUs are running is usually bad
-  programming practice. */
+   /* update sie control blocks, and unlock all vcpus */
+   for (i = 0; i  KVM_MAX_VCPUS; ++i) {
+   if (kvm-vcpus[i]) {
+   kvm-vcpus[i]-arch.sie_block-gmsor =
+   kvm-arch.guest_origin;
+   kvm-vcpus[i]-arch.sie_block-gmslm =
+   kvm-arch.guest_memsize +
+   kvm-arch.guest_origin +
+   VIRTIODESCSPACE - 1ul;
+   mutex_unlock(kvm-vcpus[i]-mutex);
+   }
+   }
 
return 0;
+
+fail_out:
+   for (; i = 0; i--)
+   mutex_unlock(kvm-vcpus[i]-mutex);
+   return -EINVAL;
 }
 
 void kvm_arch_flush_shadow(struct kvm *kvm)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/6] kvm-s390: collection of kvm-s390 fixes - v3

2009-05-12 Thread ehrhardt
From: Christian Ehrhardt ehrha...@de.ibm.com

*updates in v3*
- fix memory slot vs. run uses trylock to avoid a potential livelock
- fix memory slot vs. run checks if it is the first and only memslot registered

*updates in v2*
- hrtimer wakeup use a more accurate calculation
- unlink vcpu uses smb_mb so the pointer is really zero when the page is freed

This is a collection of fixes for kvm-s390 that originate from several tests
made in the last few months. They are now tested a while and should be ready
to be merged.

Patches included:
[PATCH 1/6] kvm-s390: Fix memory slot versus run' - v3
[PATCH 2/6] kvm-s390: use hrtimer for clock wakeup from idle' - v2
[PATCH 3/6] kvm-s390: optimize float int lock: spin_lock_bh -- spin_lock'
[PATCH 4/6] kvm-s390: Unlink vcpu on destroy' - v2
[PATCH 5/6] kvm-s390: Sanity check on validity intercept'
[PATCH 6/6] kvm-s390: Verify memory in kvm run'

Overall-Diffstat:
 arch/s390/include/asm/kvm_host.h |5 ++-
 arch/s390/kvm/intercept.c|   28 +++--
 arch/s390/kvm/interrupt.c|   53 
 arch/s390/kvm/kvm-s390.c |   63 ---
 arch/s390/kvm/kvm-s390.h |4 +-
 arch/s390/kvm/priv.c |4 +-
 arch/s390/kvm/sigp.c |   16 -
 7 files changed, 120 insertions(+), 53 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/6] kvm-s390: Sanity check on validity intercept

2009-05-12 Thread ehrhardt
From: Carsten Otte co...@de.ibm.com

This patch adds a sanity check for the content of the guest
prefix register content before faulting in the cpu lowcore
that it refers to. The guest might end up in an endless loop
where SIE complains about missing lowcore with incorrect
content of the prefix register without this fix.

Reported-by: Mijo Safradin m...@linux.vnet.ibm.com
Signed-off-by: Carsten Otte co...@de.ibm.com
Signed-off-by: Christian Ehrhardt ehrha...@de.ibm.com
---
 arch/s390/kvm/intercept.c |   28 ++--
 1 file changed, 18 insertions(+), 10 deletions(-)

Index: kvm/arch/s390/kvm/intercept.c
===
--- kvm.orig/arch/s390/kvm/intercept.c
+++ kvm/arch/s390/kvm/intercept.c
@@ -154,17 +154,25 @@ static int handle_stop(struct kvm_vcpu *
 static int handle_validity(struct kvm_vcpu *vcpu)
 {
int viwhy = vcpu-arch.sie_block-ipb  16;
+   int rc;
+
vcpu-stat.exit_validity++;
-   if (viwhy == 0x37) {
-   fault_in_pages_writeable((char __user *)
-vcpu-kvm-arch.guest_origin +
-vcpu-arch.sie_block-prefix,
-PAGE_SIZE);
-   return 0;
-   }
-   VCPU_EVENT(vcpu, 2, unhandled validity intercept code %d,
-  viwhy);
-   return -ENOTSUPP;
+   if ((viwhy == 0x37)  (vcpu-arch.sie_block-prefix
+   = vcpu-kvm-arch.guest_memsize - 2*PAGE_SIZE)){
+   rc = fault_in_pages_writeable((char __user *)
+vcpu-kvm-arch.guest_origin +
+vcpu-arch.sie_block-prefix,
+2*PAGE_SIZE);
+   if (rc)
+   /* user will receive sigsegv, exit to user */
+   rc = -ENOTSUPP;
+   } else
+   rc = -ENOTSUPP;
+
+   if (rc)
+   VCPU_EVENT(vcpu, 2, unhandled validity intercept code %d,
+  viwhy);
+   return rc;
 }
 
 static int handle_instruction(struct kvm_vcpu *vcpu)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/6] kvm-s390: Verify memory in kvm run

2009-05-12 Thread ehrhardt
From: Carsten Otte co...@de.ibm.com

This check verifies that the guest we're trying to run in KVM_RUN
has some memory assigned to it. It enters an endless exception
loop if this is not the case.

Reported-by: Mijo Safradin m...@linux.vnet.ibm.com
Signed-off-by: Carsten Otte co...@de.ibm.com
Signed-off-by: Christian Ehrhardt ehrha...@de.ibm.com
---
 arch/s390/kvm/kvm-s390.c |6 ++
 1 file changed, 6 insertions(+)

Index: kvm/arch/s390/kvm/kvm-s390.c
===
--- kvm.orig/arch/s390/kvm/kvm-s390.c
+++ kvm/arch/s390/kvm/kvm-s390.c
@@ -478,6 +478,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_v
 
vcpu_load(vcpu);
 
+   /* verify, that memory has been registered */
+   if (!vcpu-kvm-arch.guest_memsize) {
+   vcpu_put(vcpu);
+   return -EINVAL;
+   }
+
if (vcpu-sigset_active)
sigprocmask(SIG_SETMASK, vcpu-sigset, sigsaved);
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Enable IRQ windows after exception injection if there are pending virq

2009-05-12 Thread Gleb Natapov
On Tue, May 12, 2009 at 11:06:39PM +0800, Dong, Eddie wrote:
 
 I didn't take many test since our PTS system stop working now due to KVM 
 userspace
 build changes. But since the logic is pretty simple, so I want to post here 
 to see comments.
 Thx, eddie
 
 
 
 
 If there is pending irq after an virtual exception is injected,
 KVM needs to enable IRQ window to trap back earlier once 
 the exception is handled.
 
I already posted patch to do that http://patchwork.kernel.org/patch/21830/
Is you patch different?

 Signed-off-by: Eddie Dong eddie.d...@intel.com
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 308d8e9..f8ceaea 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -3154,15 +3154,18 @@ static void inject_irq(struct kvm_vcpu *vcpu)
   }
  }
  
 -static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run 
 *kvm_run)
 +static void inject_pending_irq(struct kvm_vcpu *vcpu)
  {
 - bool req_int_win = !irqchip_in_kernel(vcpu-kvm) 
 - kvm_run-request_interrupt_window;
 -
   if (vcpu-guest_debug  KVM_GUESTDBG_SINGLESTEP)
   kvm_x86_ops-drop_interrupt_shadow(vcpu);
  
   inject_irq(vcpu);
 +}
 +
 +static void set_pending_virq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 +{
 + bool req_int_win = !irqchip_in_kernel(vcpu-kvm) 
 + kvm_run-request_interrupt_window;
  
   /* enable NMI/IRQ window open exits if needed */
   if (vcpu-arch.nmi_pending)
 @@ -3229,7 +3232,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, 
 struct kvm_run *kvm_run)
   if (vcpu-arch.exception.pending)
   __queue_exception(vcpu);
   else
 - inject_pending_irq(vcpu, kvm_run);
 + inject_pending_irq(vcpu);
 +
 + set_pending_virq(vcpu, kvm_run);
  
   if (kvm_lapic_enabled(vcpu)) {
   if (!vcpu-arch.apic-vapic_addr)


--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH][KVM-AUTOTEST] Add custom install option for kvm_install

2009-05-12 Thread Mike Burns
From: Michael Burns mbu...@redhat.com


Signed-off-by: Michael Burns mbu...@redhat.com
---
 client/tests/kvm_runtest_2/control|   18 +-
 client/tests/kvm_runtest_2/kvm_install.py |   15 +++
 2 files changed, 32 insertions(+), 1 deletions(-)

diff --git a/client/tests/kvm_runtest_2/control 
b/client/tests/kvm_runtest_2/control
index fd68e94..d6e26bc 100644
--- a/client/tests/kvm_runtest_2/control
+++ b/client/tests/kvm_runtest_2/control
@@ -41,6 +41,19 @@ link_if_not_exist(pwd, qemu_img, 'qemu-img')
 
 # -
 # Build and install kvm
+#
+# Details of Install options
+#   Mode: custom
+#   Description:  install from custom install script
+#   Parameters needed:
+# install_script: 
+#   location of script relative to the kvm-runtest_2 directory.
+#   Script will be executed from test.bindir (generally kvm_runtest_2)
+#   parameters for the script can be passed either as environment variables
+#   in the params array below or in the definition of install_script.
+#   If they are passed as part of params, then they will be accessible as
+#   KVM_INSTALL_s in the OS Environment when your script runs.  
+#
 # -
 params = {
 name: kvm_install,
@@ -57,7 +70,10 @@ params = {
 
 ## Install from git
 git_repo: 'git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm.git',
-user_git_repo: 
'git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm-userspace.git'
+user_git_repo: 
'git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm-userspace.git',
+
+## Custom install
+install_script: 'custom_kvm_install.sh param1'
 }
 
 # Comment the job.run_test line if you do not want to install kvm on the host.
diff --git a/client/tests/kvm_runtest_2/kvm_install.py 
b/client/tests/kvm_runtest_2/kvm_install.py
index 8be5a93..234c77a 100755
--- a/client/tests/kvm_runtest_2/kvm_install.py
+++ b/client/tests/kvm_runtest_2/kvm_install.py
@@ -77,6 +77,21 @@ def run_kvm_install(test, params, env):
 elif install_mode == localsrc:
 __install_kvm(test, srcdir)
 
+# install from custom script
+elif install_mode == custom:
+install_script = params.get(install_script)
+script = os.path.join(test.bindir,install_script)
+if not install_script:
+message = Custom script filename not specified
+kvm_log.error(message)
+raise error.TestError, message
+for k in params.keys():
+ kvm_log.info(Adding KVM_INSTALL_%s to Environment % (k))
+  os.putenv(KVM_INSTALL_%s % (k), str(params[k]))
+   kvm_log.info(Running  + script +  to install kvm)
+os.system(cd %s; %s % (test.bindir, script))
+   kvm_log.info(Completed %s % (script))
+
 # invalid installation mode
 else:
 message = Invalid installation mode: '%s' % install_mode
-- 
1.6.0.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] virtio: find_vqs/del_vqs virtio operations

2009-05-12 Thread Michael S. Tsirkin
On Wed, May 13, 2009 at 12:00:02AM +0930, Rusty Russell wrote:
 On Tue, 12 May 2009 07:49:32 am Michael S. Tsirkin wrote:
  This replaces find_vq/del_vq with find_vqs/del_vqs virtio operations,
  and updates all drivers. This is needed for MSI support, because MSI
  needs to know the total number of vectors upfront.
 
 Sorry, is this not on top of my virtio_device vq linked list patch?

Not yet, working on that.

 Two other things: prefer vq_callback_t as the type name,

ok

 and perhaps consider 
 varargs for the callbacks (or would that be too horrible at the 
 implementation 
 end?)
 
 Thanks,
 Rusty.

Ugh ... I think it will be. And AFAIK gcc generates a lot of code
for varargs - not something we want to do in each interrupt handler.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] Userspace changes for KVM HPET (v3)

2009-05-12 Thread Beth Kon

Beth Kon wrote:

Avi Kivity wrote:

Beth Kon wrote:

Signed-off-by: Beth Kon e...@us.ibm.com


diff --git a/hw/hpet.c b/hw/hpet.c
index c7945ec..100abf5 100644
--- a/hw/hpet.c
+++ b/hw/hpet.c
@@ -30,6 +30,7 @@
 #include console.h
 #include qemu-timer.h
 #include hpet_emul.h
+#include qemu-kvm.h
 
 //#define HPET_DEBUG

 #ifdef HPET_DEBUG
@@ -48,6 +49,28 @@ uint32_t hpet_in_legacy_mode(void)
 return 0;
 }
 
+static void hpet_legacy_enable(void)

+{
+if (qemu_kvm_pit_in_kernel()) {
+kvm_kpit_disable();
+dprintf(qemu: hpet disabled kernel pit\n);
+} else {
+hpet_pit_disable();
+dprintf(qemu: hpet disabled userspace pit\n);
+}
+}
+
+static void hpet_legacy_disable(void)
+{
+if (qemu_kvm_pit_in_kernel()) {
+kvm_kpit_enable();
+dprintf(qemu: hpet enabled kernel pit\n);
+} else {
+hpet_pit_enable();
+dprintf(qemu: hpet enabled userspace pit\n);
+}
+}
  
I think it's better to move these into hpet_pit_enable() and 
hpet_pit_enable().  This avoids changing the calls below, and puts 
pit stuff in i8254.c instead of hpet.c.


Might also need to be called from hpet_load(); probably a problem in 
upstream as well.


My assumption about hpet_load was that the correct pit state would be 
established via pit_load (since all saves/loads are done together).  
But when I wrote this, I was thinking only about the userspace pit 
(for qemu). I'm not sure how the load concept applies to kernel 
state.  Do I need to explicitly re-enable or disable the kernel pit 
during load?
Looking further at the code, it looks like kvm_pit_load should take care 
of this. Agree?





--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH v6] kvm: add support for irqfd

2009-05-12 Thread Avi Kivity

Gregory Haskins wrote:

(Applies to kvm.git:833367b57)

KVM provides a complete virtual system environment for guests, including
support for injecting interrupts modeled after the real exception/interrupt
facilities present on the native platform (such as the IDT on x86).
Virtual interrupts can come from a variety of sources (emulated devices,
pass-through devices, etc) but all must be injected to the guest via
the KVM infrastructure.  This patch adds a new mechanism to inject a specific
interrupt to a guest using a decoupled eventfd mechnanism:  Any legal signal
on the irqfd (using eventfd semantics from either userspace or kernel) will
translate into an injected interrupt in the guest at the next available
interrupt window.

I am reasonably satisfied with this series, so Avi please consider for
inclusion.
  


Looks good.  Please add a way to disassociate an irqfd (for hotunplug) 
and I'll merge.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] replace drop_interrupt_shadow by set_interrupt_shadow

2009-05-12 Thread Avi Kivity

Glauber Costa wrote:

This patch replaces drop_interrupt_shadow with the more
general set_interrupt_shadow, that can either drop or raise
it, depending on its parameter.

 }
 
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)

+{
+   struct vcpu_svm *svm = to_svm(vcpu);
+   u32 ret = 0;
+
+   if (svm-vmcb-control.int_state  SVM_INTERRUPT_SHADOW_MASK)
+   ret |= X86_SHADOW_INT_STI  X86_SHADOW_INT_MOV_SS;
+   return ret  mask;
+}
  


 - |.


+
+static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+   u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+   u32 interruptibility = interruptibility_old;
+
+   interruptibility = ~((GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
  


May drop one layer of parentheses.



--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] deal with interrupt shadow state for emulated instruction

2009-05-12 Thread Avi Kivity

Glauber Costa wrote:

we currently unblock shadow interrupt state when we skip an instruction,
but failing to do so when we actually emulate one. This blocks interrupts
in key instruction blocks, in particular sti; hlt; sequences

If the instruction emulated is an sti, we have to block shadow interrupts.
The same goes for mov ss. pop ss also needs it, but we don't currently
emulate it.

Without this patch, I cannot boot gpxe option roms at vmx machines.
This is described at https://bugzilla.redhat.com/show_bug.cgi?id=494469

@@ -1618,6 +1620,15 @@ special_insn:
int err;
 
 		sel = c-src.val;

+   if (c-modrm_reg == VCPU_SREG_SS) {
+   u32 int_shadow =
+   kvm_x86_ops-get_interrupt_shadow(ctxt-vcpu,
+ 
X86_SHADOW_INT_MOV_SS);
+   /* See sti emulation for an explanation of this */
+   if (!(int_shadow  X86_SHADOW_INT_MOV_SS))
+   ctxt-interruptibility = X86_SHADOW_INT_MOV_SS;
+   }
  


The indentation of the first statement here is annoying.  Suggest a 
function toggle_interruptibility(ctxt, mask).  Would eliminate the need 
for the comment forward reference as well.



--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: Problem doing pci passthrough of the network card without VT-d

2009-05-12 Thread Passera, Pablo R
One update on this. I disabled VT-d from the BIOS and now I am not getting the 
DMAR error messages in dmesg, but the board still does not work on the guest. 
Any help is welcomed.

e1000e :00:19.0: PCI INT A disabled
pci-stub :00:19.0: PCI INT A - GSI 20 (level, low) - IRQ 20
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X

Regards,
Pablo

-Original Message-
From: Passera, Pablo R
Sent: Tuesday, May 12, 2009 12:14 PM
To: kvm@vger.kernel.org
Subject: Problem doing pci passthrough of the network card without VT-d

Hi List,
   I am having problems to do pci passthrough to a network card
without using VT-d. The card is present in the guest but with a
different model (Intel Corporation 82801I Gigabit Ethernet Controller
(rev 2)) and it does not work. The qemu line that I used is:

./devel/bin/qemu-system-x86_64 -hda ./dm.img -m 256 -pcidevice
host=00:19.0,dma=none -net none

Before running qemu I did

echo 8086 294c  /sys/bus/pci/drivers/pci-stub/new_id
echo :00:19.0  /sys/bus/pci/drivers/e1000e/unbind
echo :00:19.0  /sys/bus/pci/drivers/pci-stub/bind

This is the lspci -tv output

-[:00]-+-00.0  Intel Corporation 82X38/X48 Express DRAM Controller
   +-01.0-[:01]00.0  nVidia Corporation G80 [GeForce
8800 GTX]
   +-19.0  Intel Corporation 82566DC-2 Gigabit Network
Connection
   +-1a.0  Intel Corporation 82801I (ICH9 Family) USB UHCI
Controller #4
   +-1a.1  Intel Corporation 82801I (ICH9 Family) USB UHCI
Controller #5
   +-1a.2  Intel Corporation 82801I (ICH9 Family) USB UHCI
Controller #6
   +-1a.7  Intel Corporation 82801I (ICH9 Family) USB2 EHCI
Controller #2
   +-1b.0  Intel Corporation 82801I (ICH9 Family) HD Audio
Controller
   +-1c.0-[:02]--
   +-1c.4-[:03]00.0  Marvell Technology Group Ltd.
88SE6121 SATA II Controller
   +-1d.0  Intel Corporation 82801I (ICH9 Family) USB UHCI
Controller #1
   +-1d.1  Intel Corporation 82801I (ICH9 Family) USB UHCI
Controller #2
   +-1d.2  Intel Corporation 82801I (ICH9 Family) USB UHCI
Controller #3
   +-1d.7  Intel Corporation 82801I (ICH9 Family) USB2 EHCI
Controller #1
   +-1e.0-[:04]03.0  Texas Instruments TSB43AB22/A IEEE-
1394a-2000 Controller (PHY/Link)
   +-1f.0  Intel Corporation 82801IR (ICH9R) LPC Interface
Controller
   +-1f.2  Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 4 port
SATA IDE Controller
   +-1f.3  Intel Corporation 82801I (ICH9 Family) SMBus
Controller
   \-1f.5  Intel Corporation 82801I (ICH9 Family) 2 port SATA
IDE Controller


I am getting the following error in host dmesg

e1000e :00:19.0: PCI INT A disabled
pci-stub :00:19.0: PCI INT A - GSI 20 (level, low) - IRQ 20
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
DMAR:[DMA Read] Request device [00:19.0] fault addr baee000
DMAR:[fault reason 02] Present bit in context entry is clear
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
DMAR:[DMA Read] Request device [00:19.0] fault addr b90c000
DMAR:[fault reason 02] Present bit in context entry is clear

And in a second run I got

e1000e :00:19.0: PCI INT A disabled
pci-stub :00:19.0: PCI INT A - GSI 20 (level, low) - IRQ 20
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
DMAR:[DMA Read] Request device [00:19.0] fault addr ba8
DMAR:[fault reason 06] PTE Read access is not set
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
pci-stub :00:19.0: irq 29 for MSI/MSI-X
DMAR:[DMA Read] Request device [00:19.0] fault addr baa4000
DMAR:[fault reason 06] PTE Read access is not set

It does work if I enable VT-d, even though the network card is
recognized with a different model. However, I need this to work without
VT-d. Which could be the problem here? Is this supported?

Thanks a lot,
Pablo


More information and logs...

Kernel, kvm and network driver versions:

Host Kernel 2.6.29.2
Guest Kernel 2.6.23.1-42.fc8

[KVM PATCH v7 0/3] kvm: eventfd interfaces (formerly irqfd)

2009-05-12 Thread Gregory Haskins
(Applies to kvm.git:b5e725fa)

This is v7 of the series.  We have generalized the name of the series (as well
as some of the hunks in the series) to reflect the fact that we have multiple
eventfd based components.

This series has been tested and appears to be working as intended.  You can
download the unit-test used to verify this here:

ftp://ftp.novell.com/dev/ghaskins/kvm-eventfd.tar.bz

This will also serve as an example on how to use the new interfaces.

[ Changelog:

   v7:
*) Added iofd to allow PIO/MMIO writes to generate an eventfd
   signal.  This was previously discussed as hypercallfd, but
   since explicit hypercalls are not looking to be very popular,
   and based on the fact that they were not going to carry payload
   anyway, I named them iofd.
*) Generalized some of the code so that irqfd and iofd could be
   logically grouped together.  For instance
   s/KVM_CAP_IRQFD/KVM_CAP_EVENTFD and
   virt/kvm/irqfd.c becomes virt/kvm/eventfd.c
*) Added support for deassign operations to ensure we can properly
   support hot-unplug.
*) Reinstated the eventfd EXPORT_SYMBOL patch since we need it again
   for supporting iofd.
*) Rebased to kvm.git:b5e725fa 

   v6:
*) Moved eventfd creation back to userspace, per Avi's request
*) Dropped no longer necessary supporting patches from series
*) Rebased to kvm.git:833367b57

   v5:
*) Added padding to the ioctl structure
*) Added proper ref-count increment to the file before returning
   success. (Needs review by Al Viro, Davide Libenzi)
*) Cleaned up error-handling path to make sure we remove ourself
   from the waitq if necessary.
*) Make sure we only add ourselves to kvm-irqfds if successful
   creating the irqfd in the first place.
*) Rebased to kvm.git:66b0aed4 

   v4:
*) Changed allocation model to create the new fd last, after
   we get past the last potential error point by using Davide's
   new eventfd_file_create interface (Al Viro, Davide Libenzi)
*) We no longer export sys_eventfd2() since it is replaced
   functionally with eventfd_file_create();
*) Rebased to kvm.git:7da2e3ba

   v3:
*) The kernel now allocates the eventfd (need to export sys_eventfd2)
*) Added a flags field for future expansion to kvm_irqfd()
*) We properly toggle the irq level 1+0.
*) We re-use the USERSPACE_SRC_ID instead of creating our own
*) Properly check for failures establishing a poll-table with eventfd
*) Fixed fd/file leaks on failure
*) Rebased to lateste kvm.git::41b76d8d04

   v2:
*) Dropped notifier_chain based callbacks in favor of
   wait_queue_t::func and file::poll based callbacks (Thanks to
   Davide for the suggestion)

   v1:
*) Initial release

]

[ Todo:
*) Implement the bus_io_unregister() function so the iofd hot-unplug
   path may be completed
*) Test the hot-unplug path
]


---

Gregory Haskins (3):
  kvm: add iofd support
  kvm: add support for irqfd via eventfd-notification interface
  eventfd: export eventfd interfaces for module use


 arch/x86/kvm/Makefile|2 
 arch/x86/kvm/x86.c   |1 
 fs/eventfd.c |3 
 include/linux/kvm.h  |   22 +++
 include/linux/kvm_host.h |7 +
 virt/kvm/eventfd.c   |  294 ++
 virt/kvm/kvm_main.c  |   33 +
 7 files changed, 361 insertions(+), 1 deletions(-)
 create mode 100644 virt/kvm/eventfd.c

-- 
Signature
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[KVM PATCH v7 1/3] eventfd: export eventfd interfaces for module use

2009-05-12 Thread Gregory Haskins
We want to use eventfd from KVM which can be compiled as a module, so
export the interfaces.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 fs/eventfd.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 2a701d5..3f0e197 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -16,6 +16,7 @@
 #include linux/anon_inodes.h
 #include linux/eventfd.h
 #include linux/syscalls.h
+#include linux/module.h
 
 struct eventfd_ctx {
wait_queue_head_t wqh;
@@ -56,6 +57,7 @@ int eventfd_signal(struct file *file, int n)
 
return n;
 }
+EXPORT_SYMBOL_GPL(eventfd_signal);
 
 static int eventfd_release(struct inode *inode, struct file *file)
 {
@@ -197,6 +199,7 @@ struct file *eventfd_fget(int fd)
 
return file;
 }
+EXPORT_SYMBOL_GPL(eventfd_fget);
 
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[KVM PATCH v7 2/3] kvm: add support for irqfd via eventfd-notification interface

2009-05-12 Thread Gregory Haskins
KVM provides a complete virtual system environment for guests, including
support for injecting interrupts modeled after the real exception/interrupt
facilities present on the native platform (such as the IDT on x86).
Virtual interrupts can come from a variety of sources (emulated devices,
pass-through devices, etc) but all must be injected to the guest via
the KVM infrastructure.  This patch adds a new mechanism to inject a specific
interrupt to a guest using a decoupled eventfd mechnanism:  Any legal signal
on the irqfd (using eventfd semantics from either userspace or kernel) will
translate into an injected interrupt in the guest at the next available
interrupt window.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 arch/x86/kvm/Makefile|2 
 arch/x86/kvm/x86.c   |1 
 include/linux/kvm.h  |   10 ++
 include/linux/kvm_host.h |5 +
 virt/kvm/eventfd.c   |  187 ++
 virt/kvm/kvm_main.c  |   20 +
 6 files changed, 224 insertions(+), 1 deletions(-)
 create mode 100644 virt/kvm/eventfd.c

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b43c4ef..4d50904 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,7 +3,7 @@
 #
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-coalesced_mmio.o irq_comm.o)
+coalesced_mmio.o irq_comm.o eventfd.o)
 ifeq ($(CONFIG_KVM_TRACE),y)
 common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fd0a571..ba541f6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1026,6 +1026,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_ASSIGN_DEV_IRQ:
+   case KVM_CAP_EVENTFD:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 3db5d8d..dfc4bcc 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -415,6 +415,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_ASSIGN_DEV_IRQ 29
 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
+#define KVM_CAP_EVENTFD 31
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -454,6 +455,13 @@ struct kvm_irq_routing {
 
 #endif
 
+struct kvm_irqfd {
+   __u32 fd;
+   __u32 gsi;
+   __u32 flags;
+   __u8  pad[20];
+};
+
 /*
  * ioctls for VM fds
  */
@@ -498,6 +506,8 @@ struct kvm_irq_routing {
 #define KVM_ASSIGN_SET_MSIX_ENTRY \
_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
 #define KVM_DEASSIGN_DEV_IRQ   _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
+#define KVM_ASSIGN_IRQFD   _IOW(KVMIO, 0x76, struct kvm_irqfd)
+#define KVM_DEASSIGN_IRQFD _IOW(KVMIO, 0x77, __u32)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2b8df0c..1acc528 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -134,6 +134,7 @@ struct kvm {
struct list_head vm_list;
struct kvm_io_bus mmio_bus;
struct kvm_io_bus pio_bus;
+   struct list_head irqfds;
struct kvm_vm_stat stat;
struct kvm_arch arch;
atomic_t users_count;
@@ -525,4 +526,8 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 
 #endif
 
+int kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
+int kvm_deassign_irqfd(struct kvm *kvm, int fd);
+void kvm_irqfd_release(struct kvm *kvm);
+
 #endif
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
new file mode 100644
index 000..71afd62
--- /dev/null
+++ b/virt/kvm/eventfd.c
@@ -0,0 +1,187 @@
+/*
+ * kvm eventfd support - use eventfd objects to signal various KVM events
+ *
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * Author:
+ * Gregory Haskins ghask...@novell.com
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include linux/kvm_host.h
+#include linux/workqueue.h
+#include linux/syscalls.h
+#include linux/wait.h
+#include linux/poll.h
+#include linux/file.h
+#include linux/list.h
+
+/*
+ * 
+ * irqfd: Allows an fd to be used to inject an interrupt to the guest
+ *
+ * Credit goes to Avi Kivity for the 

[KVM PATCH v7 3/3] kvm: add iofd support

2009-05-12 Thread Gregory Haskins
iofd is a mechanism to register PIO/MMIO regions to trigger an eventfd
signal when written to.  Userspace can register any arbitrary address
with a corresponding eventfd.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 include/linux/kvm.h  |   12 +
 include/linux/kvm_host.h |2 +
 virt/kvm/eventfd.c   |  107 ++
 virt/kvm/kvm_main.c  |   13 ++
 4 files changed, 134 insertions(+), 0 deletions(-)

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index dfc4bcc..99b6e45 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -292,6 +292,17 @@ struct kvm_guest_debug {
struct kvm_guest_debug_arch arch;
 };
 
+#define KVM_IOFD_FLAG_DEASSIGN  (1  0)
+#define KVM_IOFD_FLAG_PIO   (1  1)
+
+struct kvm_iofd {
+   __u64 addr;
+   __u32 len;
+   __u32 fd;
+   __u32 flags;
+   __u8  pad[12];
+};
+
 #define KVM_TRC_SHIFT   16
 /*
  * kvm trace categories
@@ -508,6 +519,7 @@ struct kvm_irqfd {
 #define KVM_DEASSIGN_DEV_IRQ   _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
 #define KVM_ASSIGN_IRQFD   _IOW(KVMIO, 0x76, struct kvm_irqfd)
 #define KVM_DEASSIGN_IRQFD _IOW(KVMIO, 0x77, __u32)
+#define KVM_IOFD   _IOW(KVMIO, 0x78, struct kvm_iofd)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1acc528..d53cb70 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -529,5 +529,7 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 int kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
 int kvm_deassign_irqfd(struct kvm *kvm, int fd);
 void kvm_irqfd_release(struct kvm *kvm);
+int kvm_iofd(struct kvm *kvm, unsigned long addr, size_t len,
+int fd, int flags);
 
 #endif
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 71afd62..8b23317 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -21,12 +21,16 @@
  */
 
 #include linux/kvm_host.h
+#include linux/kvm.h
 #include linux/workqueue.h
 #include linux/syscalls.h
 #include linux/wait.h
 #include linux/poll.h
 #include linux/file.h
 #include linux/list.h
+#include linux/eventfd.h
+
+#include iodev.h
 
 /*
  * 
@@ -185,3 +189,106 @@ kvm_irqfd_release(struct kvm *kvm)
list_for_each_entry_safe(irqfd, tmp, kvm-irqfds, list)
irqfd_release(irqfd);
 }
+
+/*
+ * 
+ * iofd: translate a PIO/MMIO memory write to an eventfd signal.
+ *
+ * userspace can register a PIO/MMIO address with an eventfd for recieving
+ * notification when the memory has been touched.
+ * 
+ */
+
+struct _iofd {
+   u64  addr;
+   size_t   length;
+   struct file *file;
+   struct kvm_io_device dev;
+};
+
+static int
+iofd_in_range(struct kvm_io_device *this, gpa_t addr, int len, int is_write)
+{
+   struct _iofd *iofd = (struct _iofd *)this-private;
+
+   return ((addr = iofd-addr  (addr  iofd-addr + iofd-length)));
+}
+
+/* writes trigger an event */
+static void
+iofd_write(struct kvm_io_device *this, gpa_t addr, int len, const void *val)
+{
+   struct _iofd *iofd = (struct _iofd *)this-private;
+
+   eventfd_signal(iofd-file, 1);
+}
+
+/* reads return all zeros */
+static void
+iofd_read(struct kvm_io_device *this, gpa_t addr, int len, void *val)
+{
+   memset(val, 0, len);
+}
+
+static void
+iofd_destructor(struct kvm_io_device *this)
+{
+   struct _iofd *iofd = (struct _iofd *)this-private;
+
+   fput(iofd-file);
+   kfree(iofd);
+}
+
+static int
+kvm_assign_iofd(struct kvm *kvm, unsigned long addr, size_t len,
+   int fd, int flags)
+{
+   intpio = flags  KVM_IOFD_FLAG_PIO;
+   struct kvm_io_bus *bus = pio ? kvm-pio_bus : kvm-mmio_bus;
+   struct _iofd  *iofd;
+   struct file   *file;
+
+   file = eventfd_fget(fd);
+   if (IS_ERR(file))
+   return PTR_ERR(file);
+
+   iofd = kzalloc(sizeof(*iofd), GFP_KERNEL);
+   if (!iofd) {
+   fput(file);
+   return -ENOMEM;
+   }
+
+   iofd-dev.read   = iofd_read;
+   iofd-dev.write  = iofd_write;
+   iofd-dev.in_range   = iofd_in_range;
+   iofd-dev.destructor = iofd_destructor;
+   iofd-dev.private= iofd;
+
+   iofd-addr   = addr;
+   iofd-length = len;
+   iofd-file   = file;
+
+   kvm_io_bus_register_dev(bus, iofd-dev);
+
+   printk(KERN_DEBUG registering %s iofd at %lx of size %d\n,
+  pio  ? PIO : MMIO, addr, (int)len);
+
+   return 0;
+}
+
+static int
+kvm_deassign_iofd(struct kvm *kvm, unsigned long addr, size_t len,
+ int fd, int flags)
+{
+   /* FIXME: We need an 

[PATCH v7 0/2] eventfd support for KVM userspace

2009-05-12 Thread Gregory Haskins
This is the userspace support for the irqfd/iofd interfaces published here:

http://lkml.org/lkml/2009/5/12/372

---

Gregory Haskins (2):
  qemu-kvm: add iofd support
  qemu-kvm: add irqfd support


 kvm/libkvm/libkvm.c |  118 +++
 kvm/libkvm/libkvm.h |   56 
 2 files changed, 174 insertions(+), 0 deletions(-)

-- 
Signature
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 1/2] qemu-kvm: add irqfd support

2009-05-12 Thread Gregory Haskins
irqfd lets you create an eventfd based file-desriptor to inject interrupts
to a kvm guest.  We associate one gsi per fd for fine-grained routing.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 kvm/libkvm/libkvm.c |   66 +++
 kvm/libkvm/libkvm.h |   25 +++
 2 files changed, 91 insertions(+), 0 deletions(-)

diff --git a/kvm/libkvm/libkvm.c b/kvm/libkvm/libkvm.c
index ba0a5d1..74a21a2 100644
--- a/kvm/libkvm/libkvm.c
+++ b/kvm/libkvm/libkvm.c
@@ -34,6 +34,7 @@
 #include string.h
 #include errno.h
 #include sys/ioctl.h
+#include sys/eventfd.h
 #include inttypes.h
 #include libkvm.h
 
@@ -1444,3 +1445,68 @@ int kvm_assign_set_msix_entry(kvm_context_t kvm,
 return ret;
 }
 #endif
+
+#ifdef KVM_CAP_EVENTFD
+static int _assign_irqfd(kvm_context_t kvm, int fd, int gsi, int flags)
+{
+   int r;
+   struct kvm_irqfd data = {
+   .fd= fd,
+   .gsi   = gsi,
+   .flags = flags,
+   };
+
+   r = ioctl(kvm-vm_fd, KVM_ASSIGN_IRQFD, data);
+   if (r == -1)
+   r = -errno;
+   return r;
+}
+
+int kvm_create_irqfd(kvm_context_t kvm, int gsi, int flags)
+{
+   int r;
+   int fd;
+
+   if (!kvm_check_extension(kvm, KVM_CAP_EVENTFD))
+   return -ENOENT;
+
+   fd = eventfd(0, 0);
+   if (fd  0)
+   return -errno;
+
+   r = _assign_irqfd(kvm, fd, gsi, flags);
+   if (r  0) {
+   close(fd);
+   return -errno;
+   }
+
+   return fd;
+}
+
+int kvm_destroy_irqfd(kvm_context_t kvm, int fd, int flags)
+{
+   int r;
+   __u32 data = fd;
+
+   r = ioctl(kvm-vm_fd, KVM_DEASSIGN_IRQFD, data);
+   if (r == -1)
+   r = -errno;
+
+   close(fd);
+
+   return r;
+}
+
+#else /* KVM_CAP_EVENTFD */
+
+int kvm_create_irqfd(kvm_context_t kvm, int gsi, int flags)
+{
+   return -ENOENT;
+}
+
+int kvm_destroy_irqfd(kvm_context_t kvm, int fd, int flags)
+{
+   return -ENOENT;
+}
+
+#endif /* KVM_CAP_EVENTFD */
diff --git a/kvm/libkvm/libkvm.h b/kvm/libkvm/libkvm.h
index 4821a1e..322b4cd 100644
--- a/kvm/libkvm/libkvm.h
+++ b/kvm/libkvm/libkvm.h
@@ -856,6 +856,31 @@ int kvm_commit_irq_routes(kvm_context_t kvm);
  */
 int kvm_get_irq_route_gsi(kvm_context_t kvm);
 
+/*!
+ * \brief Create a file descriptor for injecting interrupts
+ *
+ * Creates an eventfd based file-descriptor that maps to a specific GSI
+ * in the guest.  eventfd compliant signaling (write() from userspace, or
+ * eventfd_signal() from kernelspace) will cause the GSI to inject
+ * itself into the guest at the next available window.
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param gsi GSI to assign to this fd
+ * \param flags reserved, must be zero
+ */
+int kvm_create_irqfd(kvm_context_t kvm, int gsi, int flags);
+
+/*!
+ * \brief Destroy an irqfd file descriptor
+ *
+ * Destroys a file descriptor previously opened with kvm_create_irqfd()
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param fd fd to close
+ * \param flags reserved, must be zero
+ */
+int kvm_destroy_irqfd(kvm_context_t kvm, int fd, int flags);
+
 #ifdef KVM_CAP_DEVICE_MSIX
 int kvm_assign_set_msix_nr(kvm_context_t kvm,
   struct kvm_assigned_msix_nr *msix_nr);

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 2/2] qemu-kvm: add iofd support

2009-05-12 Thread Gregory Haskins
An iofd allows an eventfd to attach to a specific PIO/MMIO region in the
guest.  Any guest-writes to that region will trigger an eventfd signal.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 kvm/libkvm/libkvm.c |   52 +++
 kvm/libkvm/libkvm.h |   31 ++
 2 files changed, 83 insertions(+), 0 deletions(-)

diff --git a/kvm/libkvm/libkvm.c b/kvm/libkvm/libkvm.c
index 74a21a2..0139abd 100644
--- a/kvm/libkvm/libkvm.c
+++ b/kvm/libkvm/libkvm.c
@@ -1497,6 +1497,45 @@ int kvm_destroy_irqfd(kvm_context_t kvm, int fd, int 
flags)
return r;
 }
 
+int kvm_assign_iofd(kvm_context_t kvm, unsigned long addr, size_t len,
+   int fd, int type, int flags)
+{
+   int r;
+   struct kvm_iofd data = {
+   .addr  = addr,
+   .len   = len,
+   .fd= fd,
+   .flags = type ? KVM_IOFD_FLAG_PIO : 0,
+   };
+
+   if (!kvm_check_extension(kvm, KVM_CAP_EVENTFD))
+   return -ENOENT;
+
+   r = ioctl(kvm-vm_fd, KVM_IOFD, data);
+   if (r == -1)
+   r = -errno;
+   return r;
+}
+
+int kvm_deassign_iofd(kvm_context_t kvm, unsigned long addr, size_t len,
+ int type, int flags)
+{
+   int r;
+   struct kvm_iofd data = {
+   .addr  = addr,
+   .len   = len,
+   .flags = KVM_IOFD_FLAG_DEASSIGN | (type ? KVM_IOFD_FLAG_PIO : 
0),
+   };
+
+   if (!kvm_check_extension(kvm, KVM_CAP_EVENTFD))
+   return -ENOENT;
+
+   r = ioctl(kvm-vm_fd, KVM_IOFD, data);
+   if (r == -1)
+   r = -errno;
+   return r;
+}
+
 #else /* KVM_CAP_EVENTFD */
 
 int kvm_create_irqfd(kvm_context_t kvm, int gsi, int flags)
@@ -1509,4 +1548,17 @@ int kvm_destroy_irqfd(kvm_context_t kvm, int fd, int 
flags)
return -ENOENT;
 }
 
+int kvm_assign_iofd(kvm_context_t kvm, unsigned long addr, size_t len,
+   int fd, int type, int flags)
+{
+   return -ENOENT;
+}
+
+int kvm_deassign_iofd(kvm_context_t kvm, unsigned long addr, size_t len,
+ int type, int flags)
+{
+   return -ENOENT;
+}
+
 #endif /* KVM_CAP_EVENTFD */
+
diff --git a/kvm/libkvm/libkvm.h b/kvm/libkvm/libkvm.h
index 322b4cd..89c558a 100644
--- a/kvm/libkvm/libkvm.h
+++ b/kvm/libkvm/libkvm.h
@@ -881,6 +881,37 @@ int kvm_create_irqfd(kvm_context_t kvm, int gsi, int 
flags);
  */
 int kvm_destroy_irqfd(kvm_context_t kvm, int fd, int flags);
 
+/*!
+ * \brief Assign an eventfd to an IO port (PIO or MMIO)
+ *
+ * Assigns an eventfd based file-descriptor to a specific PIO or MMIO
+ * address range.  Any guest writes to the specified range will generate
+ * an eventfd signal.
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param addr The IO address
+ * \param len The length of the IO region at the address
+ * \param fd The eventfd file-descriptor
+ * \param type MMIO=0, PIO=1
+ * \param flags reserved, must be zero
+ */
+int kvm_assign_iofd(kvm_context_t kvm, unsigned long addr, size_t len,
+   int fd, int type, int flags);
+
+/*!
+ * \brief Deassign an iofd from a previously registered IO port
+ *
+ * Deassigns an iofd previously registered with kvm_assign_iofd()
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param addr The IO address
+ * \param len The length of the IO region at the address
+ * \param type MMIO=0, PIO=1
+ * \param flags reserved, must be zero
+ */
+int kvm_deassign_iofd(kvm_context_t kvm, unsigned long addr, size_t len,
+ int type, int flags);
+
 #ifdef KVM_CAP_DEVICE_MSIX
 int kvm_assign_set_msix_nr(kvm_context_t kvm,
   struct kvm_assigned_msix_nr *msix_nr);

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: Use a bitmap for tracking used GSIs

2009-05-12 Thread Alex Williamson
On Mon, 2009-05-11 at 12:00 +, Yang, Sheng wrote:
 On Friday 08 May 2009 06:22:20 Alex Williamson wrote:
  +   /* Round the number of GSIs supported to a 4 byte
  +* value so we can search it using ints and ffs */
  +   i = kvm_get_gsi_count(kvm)  ~0x1f;
  +   kvm-used_gsi_bitmap = malloc(i  3);
 
 3 or 5?

3, ie. /8 (bits to bytes)

 I am a little confused by these magic numbers, including 0x1f...

The 5 shift gives us the index into the array of ints, the 0x1f gives us
the bit index into a specific int.  This is very similar to the code in
hw/acpi.c.

 I think there are something can indicate the length of unsigned long in 
 QEmu(sorry, can't find it now...), so how about using ffsl() and get other 
 constants based on it?

We'd probably want to use ffsll() so we can ignore 32b vs 64b longs.
There's HOST_LONG_BITS, but that doesn't actually help defining a shift
value.  Thanks,

Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH v6] kvm: add support for irqfd

2009-05-12 Thread Gregory Haskins
Avi Kivity wrote:
 Gregory Haskins wrote:
 (Applies to kvm.git:833367b57)

 KVM provides a complete virtual system environment for guests, including
 support for injecting interrupts modeled after the real
 exception/interrupt
 facilities present on the native platform (such as the IDT on x86).
 Virtual interrupts can come from a variety of sources (emulated devices,
 pass-through devices, etc) but all must be injected to the guest via
 the KVM infrastructure.  This patch adds a new mechanism to inject a
 specific
 interrupt to a guest using a decoupled eventfd mechnanism:  Any legal
 signal
 on the irqfd (using eventfd semantics from either userspace or
 kernel) will
 translate into an injected interrupt in the guest at the next available
 interrupt window.

 I am reasonably satisfied with this series, so Avi please consider for
 inclusion.
   

 Looks good.  Please add a way to disassociate an irqfd (for hotunplug)
 and I'll merge.

Done, and v7 sent out.  Since I completed the testing on the
hypercall-fd (I call them iofd for reasons described in the patch
headers) I included those as well for review.  Since the irqfd stuff is
heavily reviewed and the iofd patches haven't seen the light of day
until today, feel free to merge them independently.  I included them
together because I wanted to avoid forking the CAP bits, etc, for two
closely related subsystems.  But they technically don't need to go in
together if you prefer.

Thanks Avi,
-Greg



signature.asc
Description: OpenPGP digital signature


Re: [KVM PATCH v7 3/3] kvm: add iofd support

2009-05-12 Thread Gregory Haskins
Gregory Haskins wrote:
 iofd is a mechanism to register PIO/MMIO regions to trigger an eventfd
 signal when written to.  Userspace can register any arbitrary address
 with a corresponding eventfd.
   

Ugg..this patch header sucks, especially given all the talk around how
we need to do them better lately :)

I will add this text as well for future versions:

--

Traditional MMIO/PIO exit paths are expensive because they are done
within the same context as the VCPU thread and therefore cause a VMX/SVM
heavy-weight exit, a transition back to userspace, and overhead with
the qemu processing of the operation.  An eventfd mechanism, on the
other hand, allows the VCPU to take a very brief lightweight exit only
long enough to trigger the eventfd_signal.  This means that clients of
the eventfd (supporting both userspace or kernel end-points) can
potentially get notified much more efficiently than if we were to
register through the traditional mechanism via qemu MMIO/PIO notification.

---
 Signed-off-by: Gregory Haskins ghask...@novell.com
 ---

  include/linux/kvm.h  |   12 +
  include/linux/kvm_host.h |2 +
  virt/kvm/eventfd.c   |  107 
 ++
  virt/kvm/kvm_main.c  |   13 ++
  4 files changed, 134 insertions(+), 0 deletions(-)

 diff --git a/include/linux/kvm.h b/include/linux/kvm.h
 index dfc4bcc..99b6e45 100644
 --- a/include/linux/kvm.h
 +++ b/include/linux/kvm.h
 @@ -292,6 +292,17 @@ struct kvm_guest_debug {
   struct kvm_guest_debug_arch arch;
  };
  
 +#define KVM_IOFD_FLAG_DEASSIGN  (1  0)
 +#define KVM_IOFD_FLAG_PIO   (1  1)
 +
 +struct kvm_iofd {
 + __u64 addr;
 + __u32 len;
 + __u32 fd;
 + __u32 flags;
 + __u8  pad[12];
 +};
 +
  #define KVM_TRC_SHIFT   16
  /*
   * kvm trace categories
 @@ -508,6 +519,7 @@ struct kvm_irqfd {
  #define KVM_DEASSIGN_DEV_IRQ   _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
  #define KVM_ASSIGN_IRQFD   _IOW(KVMIO, 0x76, struct kvm_irqfd)
  #define KVM_DEASSIGN_IRQFD _IOW(KVMIO, 0x77, __u32)
 +#define KVM_IOFD   _IOW(KVMIO, 0x78, struct kvm_iofd)
  
  /*
   * ioctls for vcpu fds
 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
 index 1acc528..d53cb70 100644
 --- a/include/linux/kvm_host.h
 +++ b/include/linux/kvm_host.h
 @@ -529,5 +529,7 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) 
 {}
  int kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
  int kvm_deassign_irqfd(struct kvm *kvm, int fd);
  void kvm_irqfd_release(struct kvm *kvm);
 +int kvm_iofd(struct kvm *kvm, unsigned long addr, size_t len,
 +  int fd, int flags);
  
  #endif
 diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
 index 71afd62..8b23317 100644
 --- a/virt/kvm/eventfd.c
 +++ b/virt/kvm/eventfd.c
 @@ -21,12 +21,16 @@
   */
  
  #include linux/kvm_host.h
 +#include linux/kvm.h
  #include linux/workqueue.h
  #include linux/syscalls.h
  #include linux/wait.h
  #include linux/poll.h
  #include linux/file.h
  #include linux/list.h
 +#include linux/eventfd.h
 +
 +#include iodev.h
  
  /*
   * 
 @@ -185,3 +189,106 @@ kvm_irqfd_release(struct kvm *kvm)
   list_for_each_entry_safe(irqfd, tmp, kvm-irqfds, list)
   irqfd_release(irqfd);
  }
 +
 +/*
 + * 
 + * iofd: translate a PIO/MMIO memory write to an eventfd signal.
 + *
 + * userspace can register a PIO/MMIO address with an eventfd for recieving
 + * notification when the memory has been touched.
 + * 
 + */
 +
 +struct _iofd {
 + u64  addr;
 + size_t   length;
 + struct file *file;
 + struct kvm_io_device dev;
 +};
 +
 +static int
 +iofd_in_range(struct kvm_io_device *this, gpa_t addr, int len, int is_write)
 +{
 + struct _iofd *iofd = (struct _iofd *)this-private;
 +
 + return ((addr = iofd-addr  (addr  iofd-addr + iofd-length)));
 +}
 +
 +/* writes trigger an event */
 +static void
 +iofd_write(struct kvm_io_device *this, gpa_t addr, int len, const void *val)
 +{
 + struct _iofd *iofd = (struct _iofd *)this-private;
 +
 + eventfd_signal(iofd-file, 1);
 +}
 +
 +/* reads return all zeros */
 +static void
 +iofd_read(struct kvm_io_device *this, gpa_t addr, int len, void *val)
 +{
 + memset(val, 0, len);
 +}
 +
 +static void
 +iofd_destructor(struct kvm_io_device *this)
 +{
 + struct _iofd *iofd = (struct _iofd *)this-private;
 +
 + fput(iofd-file);
 + kfree(iofd);
 +}
 +
 +static int
 +kvm_assign_iofd(struct kvm *kvm, unsigned long addr, size_t len,
 + int fd, int flags)
 +{
 + intpio = flags  KVM_IOFD_FLAG_PIO;
 + struct kvm_io_bus *bus = pio ? kvm-pio_bus : kvm-mmio_bus;
 + struct _iofd  *iofd;
 + struct 

Re: [PATCH 1/1] KVM: Fix potentially recursively get kvm lock

2009-05-12 Thread Marcelo Tosatti
On Tue, May 12, 2009 at 11:30:21AM -0300, Marcelo Tosatti wrote:
 On Tue, May 12, 2009 at 10:13:36PM +0800, Yang, Sheng wrote:
+   mutex_unlock(kvm-lock);
  
   assigned_dev list is protected by kvm-lock. So you could have another
   ioctl adding to it at the same time you're searching.
  
  Oh, yes... My fault... 
  
   Could either have a separate kvm-assigned_devs_lock, to protect
   kvm-arch.assigned_dev_head (users are ioctls that manipulate it), or
   change the IRQ injection to use a separate spinlock, kill the workqueue
   and call kvm_set_irq from the assigned device interrupt handler.
  
  Peferred the latter, though needs more work. But the only reason for put a 
  workqueue here is because kvm-lock is a mutex? I can't believe... If so, I 
  think we had made a big mistake - we have to fix all kinds of racy problem 
  caused by this, but finally find it's unnecessary... 
 
 One issue is that kvm_set_irq can take too long while interrupts are
 blocked, and you'd have to disable interrupts in other contexes that
 inject interrupts (say qemu-ioctl(SET_INTERRUPT)-...-), so all i can
 see is a tradeoff.

Or multiple kvm_set_irq calls for MSI.

 
 guess mode on
 
 But the interrupt injection path seems to be pretty short and efficient
 to happen in host interrupt context.
 
 guess mode off
 
 Avi, Gleb?
 
  Maybe another reason is kvm_kick_vcpu(), but have already fixed by you.
 
 Note you tested the spinlock_irq patch with GigE and there was no
 significant performance regression right?
 
  
  Continue to check the code...
  
  -- 
  regards
  Yang, Sheng
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvm] Re: [PATCH] kvm: Use a bitmap for tracking used GSIs

2009-05-12 Thread Alex Williamson
On Tue, 2009-05-12 at 12:45 -0600, Alex Williamson wrote:
 On Mon, 2009-05-11 at 12:00 +, Yang, Sheng wrote:
  On Friday 08 May 2009 06:22:20 Alex Williamson wrote:
   + /* Round the number of GSIs supported to a 4 byte
   +  * value so we can search it using ints and ffs */
   + i = kvm_get_gsi_count(kvm)  ~0x1f;
   + kvm-used_gsi_bitmap = malloc(i  3);
  
  3 or 5?
 
 3, ie. /8 (bits to bytes)
 
  I am a little confused by these magic numbers, including 0x1f...
 
 The 5 shift gives us the index into the array of ints, the 0x1f gives us
 the bit index into a specific int.  This is very similar to the code in
 hw/acpi.c.
 
  I think there are something can indicate the length of unsigned long in 
  QEmu(sorry, can't find it now...), so how about using ffsl() and get other 
  constants based on it?
 
 We'd probably want to use ffsll() so we can ignore 32b vs 64b longs.
 There's HOST_LONG_BITS, but that doesn't actually help defining a shift
 value.  Thanks,

Hmm, neither ffsl() or ffsll() are standard.  I'm inclined to stick with
32bits.  We'll likely only be using the first index on x86_64 anyway
(2nd on ia64).  Thanks,

Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvm] Re: [PATCH] kvm: Use a bitmap for tracking used GSIs

2009-05-12 Thread Michael S. Tsirkin
On Tue, May 12, 2009 at 01:06:54PM -0600, Alex Williamson wrote:
 On Tue, 2009-05-12 at 12:45 -0600, Alex Williamson wrote:
  On Mon, 2009-05-11 at 12:00 +, Yang, Sheng wrote:
   On Friday 08 May 2009 06:22:20 Alex Williamson wrote:
+   /* Round the number of GSIs supported to a 4 byte
+* value so we can search it using ints and ffs */
+   i = kvm_get_gsi_count(kvm)  ~0x1f;
+   kvm-used_gsi_bitmap = malloc(i  3);
   
   3 or 5?
  
  3, ie. /8 (bits to bytes)
  
   I am a little confused by these magic numbers, including 0x1f...
  
  The 5 shift gives us the index into the array of ints, the 0x1f gives us
  the bit index into a specific int.  This is very similar to the code in
  hw/acpi.c.
  
   I think there are something can indicate the length of unsigned long in 
   QEmu(sorry, can't find it now...), so how about using ffsl() and get 
   other 
   constants based on it?
  
  We'd probably want to use ffsll() so we can ignore 32b vs 64b longs.
  There's HOST_LONG_BITS, but that doesn't actually help defining a shift
  value.  Thanks,
 
 Hmm, neither ffsl() or ffsll() are standard.  I'm inclined to stick with
 32bits.  We'll likely only be using the first index on x86_64 anyway
 (2nd on ia64).  Thanks,
 
 Alex

With MSI, we start with GSI 24, so we'll be using more than just the
first index.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[KVM PATCH v7.1] kvm: add iofd support

2009-05-12 Thread Gregory Haskins
[ here is the updated header ]

iofd is a mechanism to register PIO/MMIO regions to trigger an eventfd
signal when written to.  Userspace can register any arbitrary address
with a corresponding eventfd.

Normal IO requires a synchronous/blocking round-trip since the operation
may cause side-effects in the emulated model or may return data to the caller.
However, there is a subclass of IO which acts purely as a trigger for other IO
(such as to kick off an out-of-band DMA request, etc).  For these patterns,
the synchronous call is particularly expensive because they are done
within the same context as the VCPU thread and therefore cause a VMX/SVM
heavy-weight exit, a transition back to userspace, and overhead with
the qemu locking and decoding operations.

Therefore providing the registration of an in-kernel trigger point allows the
VCPU to take a very brief lightweight exit only long enough to signal the
eventfd.  This also means that any clients compatible with the eventfd
interface (which includes userspace and kernelspace equally well) can now
register for asynchronous notification when one of these signals are generated
in the guest.  The end result should be a more flexible and higher performance
notification API for the backend KVM hypervisor and perhipheral components.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 include/linux/kvm.h  |   12 +
 include/linux/kvm_host.h |2 +
 virt/kvm/eventfd.c   |  107 ++
 virt/kvm/kvm_main.c  |   13 ++
 4 files changed, 134 insertions(+), 0 deletions(-)

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index dfc4bcc..99b6e45 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -292,6 +292,17 @@ struct kvm_guest_debug {
struct kvm_guest_debug_arch arch;
 };
 
+#define KVM_IOFD_FLAG_DEASSIGN  (1  0)
+#define KVM_IOFD_FLAG_PIO   (1  1)
+
+struct kvm_iofd {
+   __u64 addr;
+   __u32 len;
+   __u32 fd;
+   __u32 flags;
+   __u8  pad[12];
+};
+
 #define KVM_TRC_SHIFT   16
 /*
  * kvm trace categories
@@ -508,6 +519,7 @@ struct kvm_irqfd {
 #define KVM_DEASSIGN_DEV_IRQ   _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
 #define KVM_ASSIGN_IRQFD   _IOW(KVMIO, 0x76, struct kvm_irqfd)
 #define KVM_DEASSIGN_IRQFD _IOW(KVMIO, 0x77, __u32)
+#define KVM_IOFD   _IOW(KVMIO, 0x78, struct kvm_iofd)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1acc528..d53cb70 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -529,5 +529,7 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 int kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
 int kvm_deassign_irqfd(struct kvm *kvm, int fd);
 void kvm_irqfd_release(struct kvm *kvm);
+int kvm_iofd(struct kvm *kvm, unsigned long addr, size_t len,
+int fd, int flags);
 
 #endif
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 71afd62..8b23317 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -21,12 +21,16 @@
  */
 
 #include linux/kvm_host.h
+#include linux/kvm.h
 #include linux/workqueue.h
 #include linux/syscalls.h
 #include linux/wait.h
 #include linux/poll.h
 #include linux/file.h
 #include linux/list.h
+#include linux/eventfd.h
+
+#include iodev.h
 
 /*
  * 
@@ -185,3 +189,106 @@ kvm_irqfd_release(struct kvm *kvm)
list_for_each_entry_safe(irqfd, tmp, kvm-irqfds, list)
irqfd_release(irqfd);
 }
+
+/*
+ * 
+ * iofd: translate a PIO/MMIO memory write to an eventfd signal.
+ *
+ * userspace can register a PIO/MMIO address with an eventfd for recieving
+ * notification when the memory has been touched.
+ * 
+ */
+
+struct _iofd {
+   u64  addr;
+   size_t   length;
+   struct file *file;
+   struct kvm_io_device dev;
+};
+
+static int
+iofd_in_range(struct kvm_io_device *this, gpa_t addr, int len, int is_write)
+{
+   struct _iofd *iofd = (struct _iofd *)this-private;
+
+   return ((addr = iofd-addr  (addr  iofd-addr + iofd-length)));
+}
+
+/* writes trigger an event */
+static void
+iofd_write(struct kvm_io_device *this, gpa_t addr, int len, const void *val)
+{
+   struct _iofd *iofd = (struct _iofd *)this-private;
+
+   eventfd_signal(iofd-file, 1);
+}
+
+/* reads return all zeros */
+static void
+iofd_read(struct kvm_io_device *this, gpa_t addr, int len, void *val)
+{
+   memset(val, 0, len);
+}
+
+static void
+iofd_destructor(struct kvm_io_device *this)
+{
+   struct _iofd *iofd = (struct _iofd *)this-private;
+
+   fput(iofd-file);
+   kfree(iofd);
+}
+
+static int
+kvm_assign_iofd(struct kvm *kvm, unsigned long addr, size_t len,
+   

Re: [PATCH v2] kvm: Use a bitmap for tracking used GSIs

2009-05-12 Thread Michael S. Tsirkin

On Fri, May 08, 2009 at 04:31:21PM -0600, Alex Williamson wrote:
 +#ifdef KVM_CAP_IRQ_ROUTING

We don't need these anymore.

 +static inline void set_bit(unsigned int *buf, int bit)
 +{
 + buf[bit  5] |= (1U  (bit  0x1f));
 +}

external () not needed here. bit  5 might be clearer as bit / 32 IMO.

 +
 +static inline void clear_bit(unsigned int *buf, int bit)
 +{
 + buf[bit  5] = ~(1U  (bit  0x1f));
 +}

Make bit unsigned. And then bit  0x1f can be written as bit % 32.
IMO that's easier to parse.

 +
 +static int kvm_find_free_gsi(kvm_context_t kvm)
 +{
 + int i, bit, gsi;
 + unsigned int *buf = kvm-used_gsi_bitmap;
 +
 + for (i = 0; i  (kvm-max_gsi  5); i++) {

may be clearer as kvm-max_gsi / 32

 + if (buf[i] != ~0U)
 + break;
 + }

{} around single statement isn't needed

 +
 + if (i == kvm-max_gsi  5)
 + return -ENOSPC;

May be clearer as kvm-max_gsi / 32
By the way, this math means we can't use all gsi's
if the number is not a multiple of 32.
Round up instead? It's not hard: (kvm-max_gsi + 31) / 32

 +
 + bit = ffs(~buf[i]);
 + if (!bit)
 + return -EAGAIN;

We know it won't be 0, right?
Instead of checking twice, move the ffs call within the loop above?
E.g. like this:
for (i = 0; i  kvm-max_gsi / 32; ++i)
if ((bit = ffs(~buf[i])) {
gsi = bit - 1 + i * 32;
set_bit(buf, gsi);
return gsi;
}

return -ENOSPC;

 +
 + gsi = (bit - 1) | (i  5);

clearer as bit - 1 + i * 32

 + set_bit(buf, gsi);
 + return gsi;
 +}
 +#endif
 +
  int kvm_get_irq_route_gsi(kvm_context_t kvm)
  {
  #ifdef KVM_CAP_IRQ_ROUTING
 - if (kvm-max_used_gsi = KVM_IOAPIC_NUM_PINS)  {
 - if (kvm-max_used_gsi + 1  kvm_get_gsi_count(kvm))
 -return kvm-max_used_gsi + 1;
 -else
 -return -ENOSPC;
 -} else
 -return KVM_IOAPIC_NUM_PINS;
 + int gsi;
 +
 + pthread_mutex_lock(kvm-gsi_mutex);
 +
 + if (!kvm-max_gsi) {

Why is this lazy allocation required?
Let's do the below in some init function,
and keep code simple?

 + int i;
 +
 + /* Round the number of GSIs supported to a 4 byte
 +  * value so we can search it using ints and ffs */
 + i = kvm_get_gsi_count(kvm)  ~0x1f;

Should not we round up? Why not?
Again, we can count = kvm_get_gsi_count(kvm) / 32, and use count * 4 below.

 + kvm-used_gsi_bitmap = malloc(i  3);

why not qemu_malloc?

 + if (!kvm-used_gsi_bitmap) {
 + pthread_mutex_unlock(kvm-gsi_mutex);
 + return -ENOMEM;
 + }
 + memset(kvm-used_gsi_bitmap, 0, i  3);
 + kvm-max_gsi = i;
 +
 + /* Mark all the IOAPIC pin GSIs as already used */
 + for (i = 0; i = KVM_IOAPIC_NUM_PINS; i++)

Is this really =? Not ?

 + set_bit(kvm-used_gsi_bitmap, i);
 + }
 +
 + gsi = kvm_find_free_gsi(kvm);
 + pthread_mutex_unlock(kvm-gsi_mutex);
 + return gsi;
  #else
   return -ENOSYS;
  #endif
  }
 + 
 +void kvm_free_irq_route_gsi(kvm_context_t kvm, int gsi)
 +{
 +#ifdef KVM_CAP_IRQ_ROUTING
 + pthread_mutex_lock(kvm-gsi_mutex);
 + clear_bit(kvm-used_gsi_bitmap, gsi);
 + pthread_mutex_unlock(kvm-gsi_mutex);
 +#endif
 +}
  
  #ifdef KVM_CAP_DEVICE_MSIX
  int kvm_assign_set_msix_nr(kvm_context_t kvm,
 diff --git a/kvm/libkvm/libkvm.h b/kvm/libkvm/libkvm.h
 index c23d37b..4e9344c 100644
 --- a/kvm/libkvm/libkvm.h
 +++ b/kvm/libkvm/libkvm.h
 @@ -856,6 +856,16 @@ int kvm_commit_irq_routes(kvm_context_t kvm);
   */
  int kvm_get_irq_route_gsi(kvm_context_t kvm);
  
 +/*!
 + * \brief Free used GSI number
 + *
 + * Free used GSI number acquired from kvm_get_irq_route_gsi()
 + *
 + * \param kvm Pointer to the current kvm_context
 + * \param gsi GSI number to free
 + */
 +void kvm_free_irq_route_gsi(kvm_context_t kvm, int gsi);
 +
  #ifdef KVM_CAP_DEVICE_MSIX
  int kvm_assign_set_msix_nr(kvm_context_t kvm,
  struct kvm_assigned_msix_nr *msix_nr);
 

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/1] KVM: Fix potentially recursively get kvm lock

2009-05-12 Thread Marcelo Tosatti
On Tue, May 12, 2009 at 11:30:21AM -0300, Marcelo Tosatti wrote:
 On Tue, May 12, 2009 at 10:13:36PM +0800, Yang, Sheng wrote:
+   mutex_unlock(kvm-lock);
  
   assigned_dev list is protected by kvm-lock. So you could have another
   ioctl adding to it at the same time you're searching.
  
  Oh, yes... My fault... 
  
   Could either have a separate kvm-assigned_devs_lock, to protect
   kvm-arch.assigned_dev_head (users are ioctls that manipulate it), or
   change the IRQ injection to use a separate spinlock, kill the workqueue
   and call kvm_set_irq from the assigned device interrupt handler.
  
  Peferred the latter, though needs more work. But the only reason for put a 
  workqueue here is because kvm-lock is a mutex? I can't believe... If so, I 
  think we had made a big mistake - we have to fix all kinds of racy problem 
  caused by this, but finally find it's unnecessary... 
 
 One issue is that kvm_set_irq can take too long while interrupts are
 blocked, and you'd have to disable interrupts in other contexes that
 inject interrupts (say qemu-ioctl(SET_INTERRUPT)-...-), so all i can
 see is a tradeoff.
 
 guess mode on
 
 But the interrupt injection path seems to be pretty short and efficient
 to happen in host interrupt context.
 
 guess mode off
 
 Avi, Gleb?
 
  Maybe another reason is kvm_kick_vcpu(), but have already fixed by you.
 
 Note you tested the spinlock_irq patch with GigE and there was no
 significant performance regression right?
 
  
  Continue to check the code...

OK, it might take some time for bigger changes to happen. I've changed
your patch to drop the lock only around cancel_work_sync. Can deadlock
if someone else tries to mess with the assigned device at the same time,
but the VM won't go away under it because of the vmfd reference.

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4d00942..ba067db 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -250,7 +250,15 @@ static void deassign_host_irq(struct kvm *kvm,
disable_irq_nosync(assigned_dev-
   host_msix_entries[i].vector);
 
+   /*
+* FIXME: kvm_assigned_dev_interrupt_work_handler can deadlock
+* with cancel_work_sync, since it requires kvm-lock for irq
+* injection. This is a hack, the irq code must use
+* a separate lock.
+*/
+   mutex_unlock(kvm-lock);
cancel_work_sync(assigned_dev-interrupt_work);
+   mutex_lock(kvm-lock);
 
for (i = 0; i  assigned_dev-entries_nr; i++)
free_irq(assigned_dev-host_msix_entries[i].vector,
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] replace drop_interrupt_shadow by set_interrupt_shadow

2009-05-12 Thread Glauber Costa
This patch replaces drop_interrupt_shadow with the more
general set_interrupt_shadow, that can either drop or raise
it, depending on its parameter.

Signed-off-by: Glauber Costa glom...@redhat.com
CC: H. Peter Anvin h...@zytor.com
CC: Avi Kivity a...@redhat.com
CC: Gleb Natapov g...@redhat.com
---
 arch/x86/include/asm/kvm_host.h|3 +-
 arch/x86/include/asm/kvm_x86_emulate.h |3 ++
 arch/x86/kvm/svm.c |   32 +++-
 arch/x86/kvm/vmx.c |   49 +--
 arch/x86/kvm/x86.c |2 +-
 5 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8e680c3..3d933cf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -510,6 +510,8 @@ struct kvm_x86_ops {
void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+   void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+   u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
unsigned char *hypercall_addr);
void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
@@ -521,7 +523,6 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
-   void (*drop_interrupt_shadow)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h 
b/arch/x86/include/asm/kvm_x86_emulate.h
index 6a15973..be40d6e 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -143,6 +143,9 @@ struct decode_cache {
struct fetch_cache fetch;
 };
 
+#define X86_SHADOW_INT_MOV_SS  1
+#define X86_SHADOW_INT_STI 2
+
 struct x86_emulate_ctxt {
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ef43a18..f4d1bd9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -202,6 +202,27 @@ static int is_external_interrupt(u32 info)
return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 }
 
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+   struct vcpu_svm *svm = to_svm(vcpu);
+   u32 ret = 0;
+
+   if (svm-vmcb-control.int_state  SVM_INTERRUPT_SHADOW_MASK)
+   ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
+   return ret  mask;
+}
+
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+   struct vcpu_svm *svm = to_svm(vcpu);
+
+   if (mask == 0)
+   svm-vmcb-control.int_state = ~SVM_INTERRUPT_SHADOW_MASK;
+   else
+   svm-vmcb-control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+
+}
+
 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
@@ -215,7 +236,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
   __func__, kvm_rip_read(vcpu), svm-next_rip);
 
kvm_rip_write(vcpu, svm-next_rip);
-   svm-vmcb-control.int_state = ~SVM_INTERRUPT_SHADOW_MASK;
+   svm_set_interrupt_shadow(vcpu, 0);
 }
 
 static int has_svm(void)
@@ -2229,12 +2250,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
new_asid(svm, svm_data);
 }
 
-static void svm_drop_interrupt_shadow(struct kvm_vcpu *vcpu)
-{
-   struct vcpu_svm *svm = to_svm(vcpu);
-   svm-vmcb-control.int_state = ~SVM_INTERRUPT_SHADOW_MASK;
-}
-
 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2637,6 +2652,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.run = svm_vcpu_run,
.handle_exit = handle_exit,
.skip_emulated_instruction = skip_emulated_instruction,
+   .set_interrupt_shadow = svm_set_interrupt_shadow,
+   .get_interrupt_shadow = svm_get_interrupt_shadow,
.patch_hypercall = svm_patch_hypercall,
.set_irq = svm_set_irq,
.set_nmi = svm_inject_nmi,
@@ -2646,7 +2663,6 @@ static struct kvm_x86_ops svm_x86_ops = {
.enable_nmi_window = enable_nmi_window,
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
-   .drop_interrupt_shadow = svm_drop_interrupt_shadow,
 
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e8a5649..f3ab27b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -736,23 

[PATCH 2/2] deal with interrupt shadow state for emulated instruction

2009-05-12 Thread Glauber Costa
we currently unblock shadow interrupt state when we skip an instruction,
but failing to do so when we actually emulate one. This blocks interrupts
in key instruction blocks, in particular sti; hlt; sequences

If the instruction emulated is an sti, we have to block shadow interrupts.
The same goes for mov ss. pop ss also needs it, but we don't currently
emulate it.

Without this patch, I cannot boot gpxe option roms at vmx machines.
This is described at https://bugzilla.redhat.com/show_bug.cgi?id=494469

Signed-off-by: Glauber Costa glom...@redhat.com
CC: H. Peter Anvin h...@zytor.com
CC: Avi Kivity a...@redhat.com
CC: Gleb Natapov g...@redhat.com
---
 arch/x86/include/asm/kvm_x86_emulate.h |3 +++
 arch/x86/kvm/x86.c |6 +-
 arch/x86/kvm/x86_emulate.c |   20 
 3 files changed, 28 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/kvm_x86_emulate.h 
b/arch/x86/include/asm/kvm_x86_emulate.h
index be40d6e..b7ed2c4 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -155,6 +155,9 @@ struct x86_emulate_ctxt {
int mode;
u32 cs_base;
 
+   /* interruptibility state, as a result of execution of STI or MOV SS */
+   int interruptibility;
+
/* decode cache */
struct decode_cache decode;
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3d8fcc5..b45baff 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2362,7 +2362,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
u16 error_code,
int emulation_type)
 {
-   int r;
+   int r, shadow_mask;
struct decode_cache *c;
 
kvm_clear_exception_queue(vcpu);
@@ -2416,6 +2416,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
}
 
r = x86_emulate_insn(vcpu-arch.emulate_ctxt, emulate_ops);
+   shadow_mask = vcpu-arch.emulate_ctxt.interruptibility;
+
+   if (r == 0)
+   kvm_x86_ops-set_interrupt_shadow(vcpu, shadow_mask);
 
if (vcpu-arch.pio.string)
return EMULATE_DO_MMIO;
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index d2664fc..c1b6c23 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1361,6 +1361,20 @@ static inline int writeback(struct x86_emulate_ctxt 
*ctxt,
return 0;
 }
 
+void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
+{
+   u32 int_shadow = kvm_x86_ops-get_interrupt_shadow(ctxt-vcpu, mask);
+   /*
+* an sti; sti; sequence only disable interrupts for the first
+* instruction. So, if the last instruction, be it emulated or
+* not, left the system with the INT_STI flag enabled, it
+* means that the last instruction is an sti. We should not
+* leave the flag on in this case. The same goes for mov ss
+*/
+   if (!(int_shadow  mask))
+   ctxt-interruptibility = mask;
+}
+
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
@@ -1372,6 +1386,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct 
x86_emulate_ops *ops)
int io_dir_in;
int rc = 0;
 
+   ctxt-interruptibility = 0;
+
/* Shadow copy of register state. Committed on successful emulation.
 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
 * modify them.
@@ -1618,6 +1634,9 @@ special_insn:
int err;
 
sel = c-src.val;
+   if (c-modrm_reg == VCPU_SREG_SS)
+   toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+
if (c-modrm_reg = 5) {
type_bits = (c-modrm_reg == 1) ? 9 : 1;
err = kvm_load_segment_descriptor(ctxt-vcpu, sel,
@@ -1847,6 +1866,7 @@ special_insn:
c-dst.type = OP_NONE;  /* Disable writeback. */
break;
case 0xfb: /* sti */
+   toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
ctxt-eflags |= X86_EFLAGS_IF;
c-dst.type = OP_NONE;  /* Disable writeback. */
break;
-- 
1.5.6.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] msi-x: let drivers retry when not enough vectors

2009-05-12 Thread Michael S. Tsirkin
On Fri, May 08, 2009 at 09:25:00AM +0930, Rusty Russell wrote:
 On Thu, 7 May 2009 07:49:53 pm Sheng Yang wrote:
  On Thursday 07 May 2009 17:53:02 Matthew Wilcox wrote:
   Here's a good example.  Let's suppose you have a driver which supports
   two different models of cards, one has 16 MSI-X interrupts, the other
   has 10.  You can call pci_enable_msix() asking for 16 vectors.  If your
   card is model A, you get 16 interrupts.  If your card is model B, it says
   you can have 10.
 
 Sheng is absolutely right, that's a horrid API.
 
 If it actually enabled that number and returned it, it might make sense (cf. 
 write() returning less bytes than you give it).  But overloading the return 
 value to save an explicit call is just ugly; it's not worth saving a few 
 lines 
 of code at cost of making all the drivers subtle and tricksy.
 
 Fail with -ENOSPC or something.
 
 Rusty.

I do agree that returning a positive value from pci_enable_msix
it an ugly API (but note that this is the API that linux currently has).

Here's a wrapper that I ended up with in my driver:

static int enable_msix(struct pci_dev *dev, struct msix_entry *entries,
   int *options, int noptions)
{
int i;
for (i = 0; i  noptions; ++i)
if (!pci_enable_msix(dev, entries, options[i]))
return options[i];
return -EBUSY;
}

This gets an array of options for # of vectors and tries them one after
the other until an option that the system can support is found.
On success, we get the # of vectors actually enabled, and
driver can then use them as it sees fit.

Is there interest in moving something like this to pci.h?

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 3/3] KVM: x86: check for cr3 validity in mmu_alloc_roots

2009-05-12 Thread mtosatti
Verify the cr3 address stored in vcpu-arch.cr3 points to an existant
memslot. If not, inject a triple fault.

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

Index: kvm/arch/x86/kvm/mmu.c
===
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -1912,7 +1912,19 @@ static void mmu_free_roots(struct kvm_vc
vcpu-arch.mmu.root_hpa = INVALID_PAGE;
 }
 
-static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
+{
+   int ret = 0;
+
+   if (!kvm_is_visible_gfn(vcpu-kvm, root_gfn)) {
+   set_bit(KVM_REQ_TRIPLE_FAULT, vcpu-requests);
+   ret = 1;
+   }
+
+   return ret;
+}
+
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 {
int i;
gfn_t root_gfn;
@@ -1927,13 +1939,15 @@ static void mmu_alloc_roots(struct kvm_v
ASSERT(!VALID_PAGE(root));
if (tdp_enabled)
direct = 1;
+   if (mmu_check_root(vcpu, root_gfn))
+   return 1;
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
  PT64_ROOT_LEVEL, direct,
  ACC_ALL, NULL);
root = __pa(sp-spt);
++sp-root_count;
vcpu-arch.mmu.root_hpa = root;
-   return;
+   return 0;
}
direct = !is_paging(vcpu);
if (tdp_enabled)
@@ -1950,6 +1964,8 @@ static void mmu_alloc_roots(struct kvm_v
root_gfn = vcpu-arch.pdptrs[i]  PAGE_SHIFT;
} else if (vcpu-arch.mmu.root_level == 0)
root_gfn = 0;
+   if (mmu_check_root(vcpu, root_gfn))
+   return 1;
sp = kvm_mmu_get_page(vcpu, root_gfn, i  30,
  PT32_ROOT_LEVEL, direct,
  ACC_ALL, NULL);
@@ -1958,6 +1974,7 @@ static void mmu_alloc_roots(struct kvm_v
vcpu-arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
}
vcpu-arch.mmu.root_hpa = __pa(vcpu-arch.mmu.pae_root);
+   return 0;
 }
 
 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -1976,7 +1993,7 @@ static void mmu_sync_roots(struct kvm_vc
for (i = 0; i  4; ++i) {
hpa_t root = vcpu-arch.mmu.pae_root[i];
 
-   if (root) {
+   if (root  VALID_PAGE(root)) {
root = PT64_BASE_ADDR_MASK;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
@@ -2311,9 +2328,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
goto out;
spin_lock(vcpu-kvm-mmu_lock);
kvm_mmu_free_some_pages(vcpu);
-   mmu_alloc_roots(vcpu);
+   r = mmu_alloc_roots(vcpu);
mmu_sync_roots(vcpu);
spin_unlock(vcpu-kvm-mmu_lock);
+   if (r)
+   goto out;
kvm_x86_ops-set_cr3(vcpu, vcpu-arch.mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu);
 out:
Index: kvm/arch/x86/kvm/x86.c
===
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -4554,6 +4554,7 @@ int kvm_arch_set_memory_region(struct kv
 void kvm_arch_flush_shadow(struct kvm *kvm)
 {
kvm_mmu_zap_all(kvm);
+   kvm_reload_remote_mmus(kvm);
 }
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/1] KVM: Fix potentially recursively get kvm lock

2009-05-12 Thread Marcelo Tosatti
On Tue, May 12, 2009 at 03:36:27PM -0600, Alex Williamson wrote:
 On Tue, 2009-05-12 at 16:44 -0300, Marcelo Tosatti wrote:
  diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
  index 4d00942..ba067db 100644
  --- a/virt/kvm/kvm_main.c
  +++ b/virt/kvm/kvm_main.c
  @@ -250,7 +250,15 @@ static void deassign_host_irq(struct kvm *kvm,
  disable_irq_nosync(assigned_dev-
 host_msix_entries[i].vector);
   
  +   /*
  +* FIXME: kvm_assigned_dev_interrupt_work_handler can deadlock
  +* with cancel_work_sync, since it requires kvm-lock for irq
  +* injection. This is a hack, the irq code must use
  +* a separate lock.
  +*/
  +   mutex_unlock(kvm-lock);
  cancel_work_sync(assigned_dev-interrupt_work);
  +   mutex_lock(kvm-lock);
 
 Seems to work, I assume you've got a similar unlock/lock for the
 MSI/INTx block.  Thanks,

KVM: workaround workqueue / deassign_host_irq deadlock

I think I'm running into the following deadlock in the kvm kernel module
when trying to use device assignment:

CPU A   CPU B
kvm_vm_ioctl_deassign_dev_irq()
  mutex_lock(kvm-lock);   worker_thread()
  - kvm_deassign_irq()   -
kvm_assigned_dev_interrupt_work_handler()
- deassign_host_irq()  mutex_lock(kvm-lock);
  - cancel_work_sync() [blocked]

Workaround the issue by dropping kvm-lock for cancel_work_sync().

Reported-by: Alex Williamson alex.william...@hp.com
From: Sheng Yang sheng.y...@intel.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com


diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4d00942..d4af719 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -250,7 +250,15 @@ static void deassign_host_irq(struct kvm *kvm,
disable_irq_nosync(assigned_dev-
   host_msix_entries[i].vector);
 
+   /*
+* FIXME: kvm_assigned_dev_interrupt_work_handler can deadlock
+* with cancel_work_sync, since it requires kvm-lock for irq
+* injection. This is a hack, the irq code must use
+* a separate lock. Same below for MSI.
+*/
+   mutex_unlock(kvm-lock);
cancel_work_sync(assigned_dev-interrupt_work);
+   mutex_lock(kvm-lock);
 
for (i = 0; i  assigned_dev-entries_nr; i++)
free_irq(assigned_dev-host_msix_entries[i].vector,
@@ -263,7 +271,9 @@ static void deassign_host_irq(struct kvm *kvm,
} else {
/* Deal with MSI and INTx */
disable_irq_nosync(assigned_dev-host_irq);
+   mutex_unlock(kvm-lock);
cancel_work_sync(assigned_dev-interrupt_work);
+   mutex_lock(kvm-lock);
 
free_irq(assigned_dev-host_irq, (void *)assigned_dev);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4d00942..d4af719 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -250,7 +250,15 @@ static void deassign_host_irq(struct kvm *kvm,
disable_irq_nosync(assigned_dev-
   host_msix_entries[i].vector);
 
+   /*
+* FIXME: kvm_assigned_dev_interrupt_work_handler can deadlock
+* with cancel_work_sync, since it requires kvm-lock for irq
+* injection. This is a hack, the irq code must use
+* a separate lock. Same below for MSI.
+*/
+   mutex_unlock(kvm-lock);
cancel_work_sync(assigned_dev-interrupt_work);
+   mutex_lock(kvm-lock);
 
for (i = 0; i  assigned_dev-entries_nr; i++)
free_irq(assigned_dev-host_msix_entries[i].vector,
@@ -263,7 +271,9 @@ static void deassign_host_irq(struct kvm *kvm,
} else {
/* Deal with MSI and INTx */
disable_irq_nosync(assigned_dev-host_irq);
+   mutex_unlock(kvm-lock);
cancel_work_sync(assigned_dev-interrupt_work);
+   mutex_lock(kvm-lock);
 
free_irq(assigned_dev-host_irq, (void *)assigned_dev);
 


[KVM PATCH v7.2] kvm: add iofd support

2009-05-12 Thread Gregory Haskins
[ updated with figures, graphs, performance-test-harness info ]

iofd is a mechanism to register PIO/MMIO regions to trigger an eventfd
signal when written to by a guest.  Userspace can register any arbitrary
address with a corresponding eventfd and then pass the eventfd to a specific
end-point of interest for handling.

Normal IO requires a blocking round-trip since the operation may cause
side-effects in the emulated model or may return data to the caller.
Therefore, an IO in KVM traps from the guest to the host, causes a VMX/SVM
heavy-weight exit back to userspace, and is ultimately serviced by qemu's
device model synchronously before returning control back to the vcpu.

However, there is a subclass of IO which acts purely as a trigger for
other IO (such as to kick off an out-of-band DMA request, etc).  For these
patterns, the synchronous call is particularly expensive since we really
only want to simply get our notification transmitted asychronously and
return as quickly as possible.  All the sychronous infrastructure to ensure
proper data-dependencies are met in the normal IO case are just unecessary
overhead for signalling.  This adds additional computational load on the
system, as well as latency to the signalling path.

Therefore, we provide a mechanism for registration of an in-kernel trigger
point that allows the VCPU to only require a very brief, lightweight
exit just long enough to signal an eventfd.  This also means that any
clients compatible with the eventfd interface (which includes userspace
and kernelspace equally well) can now register to be notified. The end
result should be a more flexible and higher performance notification API
for the backend KVM hypervisor and perhipheral components.

To test this theory, we built a test-harness called doorbell.  This
module has a function called doorbell_ring() which simply increments a
counter for each time the doorbell is signaled.  It supports signalling
from either an eventfd, or an ioctl().

We then wired up two paths to the doorbell: One via QEMU via a registered
io region and through the doorbell ioctl().  The other is direct via iofd.

You can download this test harness here:

ftp://ftp.novell.com/dev/ghaskins/doorbell.tar.bz2

The measured results are as follows:

qemu-mmio: 11 iops, 9.09us rtt
iofd-mmio: 200100 iops, 5.00us rtt
iofd-pio:  367300 iops, 2.72us rtt

I didn't measure qemu-pio, because I have to figure out how to register a
PIO region, and I got lazy.  However, for now we can extrapolate based on
the data from the NULLIO runs of +2.56us for MMIO, and -350ns for HC, we
get:

qemu-pio: 153139 iops, 6.53us rtt
iofd-hc: 412585 iops, 2.37us rtt

these are just for fun, for now, until I can gather more data.

Here is a graph for your convenience:

http://developer.novell.com/wiki/images/7/76/Iofd-chart.png

The conclusion to draw is that we save about 4us by skipping the userspace
hop.



Signed-off-by: Gregory Haskins ghask...@novell.com
---

 include/linux/kvm.h  |   12 +
 include/linux/kvm_host.h |2 +
 virt/kvm/eventfd.c   |  107 ++
 virt/kvm/kvm_main.c  |   13 ++
 4 files changed, 134 insertions(+), 0 deletions(-)

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index dfc4bcc..99b6e45 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -292,6 +292,17 @@ struct kvm_guest_debug {
struct kvm_guest_debug_arch arch;
 };
 
+#define KVM_IOFD_FLAG_DEASSIGN  (1  0)
+#define KVM_IOFD_FLAG_PIO   (1  1)
+
+struct kvm_iofd {
+   __u64 addr;
+   __u32 len;
+   __u32 fd;
+   __u32 flags;
+   __u8  pad[12];
+};
+
 #define KVM_TRC_SHIFT   16
 /*
  * kvm trace categories
@@ -508,6 +519,7 @@ struct kvm_irqfd {
 #define KVM_DEASSIGN_DEV_IRQ   _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
 #define KVM_ASSIGN_IRQFD   _IOW(KVMIO, 0x76, struct kvm_irqfd)
 #define KVM_DEASSIGN_IRQFD _IOW(KVMIO, 0x77, __u32)
+#define KVM_IOFD   _IOW(KVMIO, 0x78, struct kvm_iofd)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1acc528..d53cb70 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -529,5 +529,7 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 int kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
 int kvm_deassign_irqfd(struct kvm *kvm, int fd);
 void kvm_irqfd_release(struct kvm *kvm);
+int kvm_iofd(struct kvm *kvm, unsigned long addr, size_t len,
+int fd, int flags);
 
 #endif
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 71afd62..8b23317 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -21,12 +21,16 @@
  */
 
 #include linux/kvm_host.h
+#include linux/kvm.h
 #include linux/workqueue.h
 #include linux/syscalls.h
 #include linux/wait.h
 #include linux/poll.h
 #include linux/file.h
 #include linux/list.h
+#include linux/eventfd.h
+
+#include 

Re: [PATCH 1/1] KVM: Fix potentially recursively get kvm lock

2009-05-12 Thread Alex Williamson
On Tue, 2009-05-12 at 19:09 -0300, Marcelo Tosatti wrote:
 KVM: workaround workqueue / deassign_host_irq deadlock
 
 I think I'm running into the following deadlock in the kvm kernel module
 when trying to use device assignment:
 
 CPU A   CPU B
 kvm_vm_ioctl_deassign_dev_irq()
   mutex_lock(kvm-lock);   worker_thread()
   - kvm_deassign_irq()   -
 kvm_assigned_dev_interrupt_work_handler()
 - deassign_host_irq()  mutex_lock(kvm-lock);
   - cancel_work_sync() [blocked]
 
 Workaround the issue by dropping kvm-lock for cancel_work_sync().
 
 Reported-by: Alex Williamson alex.william...@hp.com
 From: Sheng Yang sheng.y...@intel.com
 Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

Perfect, thanks.

Acked-by: Alex Williamson alex.william...@hp.com

 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index 4d00942..d4af719 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -250,7 +250,15 @@ static void deassign_host_irq(struct kvm *kvm,
   disable_irq_nosync(assigned_dev-
  host_msix_entries[i].vector);
  
 + /*
 +  * FIXME: kvm_assigned_dev_interrupt_work_handler can deadlock
 +  * with cancel_work_sync, since it requires kvm-lock for irq
 +  * injection. This is a hack, the irq code must use
 +  * a separate lock. Same below for MSI.
 +  */
 + mutex_unlock(kvm-lock);
   cancel_work_sync(assigned_dev-interrupt_work);
 + mutex_lock(kvm-lock);
  
   for (i = 0; i  assigned_dev-entries_nr; i++)
   free_irq(assigned_dev-host_msix_entries[i].vector,
 @@ -263,7 +271,9 @@ static void deassign_host_irq(struct kvm *kvm,
   } else {
   /* Deal with MSI and INTx */
   disable_irq_nosync(assigned_dev-host_irq);
 + mutex_unlock(kvm-lock);
   cancel_work_sync(assigned_dev-interrupt_work);
 + mutex_lock(kvm-lock);
  
   free_irq(assigned_dev-host_irq, (void *)assigned_dev);
  


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Re: Question about KVM and PC speaker

2009-05-12 Thread malc
On Wed, 13 May 2009, Sebastian Herbszt wrote:

 Jan Kiszka wrote:
  Sebastian Herbszt wrote:
   With the modified vgabios (see below) it does beep on bochs on vista,
   but not on qemu.
  
  Works fine for me!
  
  Did you check twice that your modified vgabios is actually loaded by
  qemu (e.g. via strace)?
 
 Yes, it uses the right rom.
 
  Moreover, does sound work at all with your qemu?
  The image I tried [1] issues two beeps after loading (obviously via
  direct hw access) - a good way to check general support. Note that one
  reason for broken host sound with qemu can be OSS. For that reason I
  always configure my qemu with --audio-drv-list=alsa.
 
 Thats a good hint :)
 Seems i used to compile qemu without --audio-drv-list. Since dsound and
 fmod drivers don't compile here (i likely miss some libs in my mingw), i
 used sdl.

Don't do that. Here's a nice tutorial Kazu made that will probably help 
you: http://www.h7.dion.ne.jp/~qemu-win/Audio-en.html

 Now i can hear those two beeps with the image you suggested. Tho those are
 coming
 thru my sound card and not the hosts pc speaker (even with -soundhw pcspk,
 but maybe
 that option means something different).

And it will always come through your soundcard. pcspk is not a passthrough
thing.

 With INT 10h AH=0Eh i now can hear a beep too, but it doesn't stop and qemu
 somewhat freezes.

Huh?

  Jan
  
  PS: Your patch was mangled by your mail client. Fortunately, it was
  small enough for manual fixing.
 
 Unfortunatelly Windows Mail tends to do this and i failed to find an option to
 fix it.
 
 - Sebastian
 
 
 

-- 
mailto:av1...@comtv.ru
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH] bios: Use the correct mask to size the PCI option ROM BAR

2009-05-12 Thread Alex Williamson
On Tue, 2009-05-12 at 23:41 +0100, Paul Brook wrote:
 On Tuesday 12 May 2009, Alex Williamson wrote:
  Bit 0 is the enable bit, which we not only don't want to set, but
  it will stick and make us think it's an I/O port resource.
 
 Why is the ROM slot special? Doesn't the same apply to all BARs?

The PCI option (or expansion) ROM is assumed to be in MMIO space, so bit
0 becomes the enable bit rather than the memory space flag.  The option
ROM is also only a 4 byte register.  The regular 6 base address
registers can support MMIO or I/O port addresses and for PCI 2.0 (iirc),
2 regular base address registers can be combined to describe an 8 byte
address.  You can look at drivers/pci/probe.c:__pci_read_base() in the
Linux source code and note that it makes the same special case for
sizing the ROM BAR (~PCI_ROM_ADDRESS_ENABLE vs ~0).

Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


XP smp using a lot of CPU

2009-05-12 Thread Ross Boylan
I just installed XP into a new VM, specifying -smp 2 for the machine.
According to top, it's using nearly 200% of a cpu even when I'm not
doing anything.

Is this real CPU useage, or just a reporting problem (just as my disk
image is big according to ls, but isn't really)?

If it's real, is there anything I can do about it?

kvm 0.7.2 on Debian Lenny (but 2.6.29 kernel), amd64.  Xeon chips; 32
bit version of XP pro installed, now fully patched (including the
Windows Genuine Advantage stuff, though I cancelled it when it wanted to
run).  

Task manager in XP shows virtually no CPU useage.

Please cc me on responses.

Thanks for any assistance.
-- 
Ross Boylan  wk:  (415) 514-8146
185 Berry St #5700   r...@biostat.ucsf.edu
Dept of Epidemiology and Biostatistics   fax: (415) 514-8150
University of California, San Francisco
San Francisco, CA 94107-1739 hm:  (415) 550-1062

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Best choice for copy/clone/snapshot

2009-05-12 Thread Ross Boylan
First, I have a feeling this might be a question I could ask on a qemu
list.  Is there a way for me to tell which questions should go where?
Is it OK to ask here?

As I install software onto a system I want to preserve its state--just
the disk state---at various points so I can go back.  What is the best
way to do this?

First, I think I could just make a copy of the virtual disk, although I
haven't seen this suggested anywhere.  I assume this will work if the VM
is off; are there other circumstances in which it is safe?  Since my
original virtual disk file isn't really occupying its nominal space, I
assume this will be true of the copy too.

Second, kvm-img could create a copy on write image.  There are several
things I don't understand about this.  Suppose I go
kvm-img -b A.img  B.img

If I then go on and use A.img as I did before, changing what is on disk,
have I screwed up B.img?

Do A.img or B.img have to be qcow2 format?  I created a raw image for
portability.

Suppose I work for awhile installing new stuff on B.img, and then want
to preserve the state.  Is
kvm-img -b B.img C.img
sensible, or is this kind of recursive operation (B.img is already the
copy on write version of A.img) not OK?

Does ‘commit [-f fmt] filename’, documented as
Commit the changes recorded in filename in its base image.
mean commit the recorded changes TO its base image?

Here are some other things I think I don't want to do.  Please let me
know if I'm mistaken.

-snapshot on the kvm command line: nothing persistent comes of this
(maybe if you commit you update the original image, but you don't get
2).

snapshot in the monitor: this snapshots the non-disk state of the VM;
further, that state is not guaranteed to work if you later change what
is on the disk.  I think kvm-img snapshot also accesses these
facilities.

Yours in confusion :)
Ross

P.S. Please cc me.
-- 
Ross Boylan  wk:  (415) 514-8146
185 Berry St #5700   r...@biostat.ucsf.edu
Dept of Epidemiology and Biostatistics   fax: (415) 514-8150
University of California, San Francisco
San Francisco, CA 94107-1739 hm:  (415) 550-1062

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: XP smp using a lot of CPU

2009-05-12 Thread Elias Probst
Hi,

I'm facing the same problem here.

A Windows XP (SP3 + all current Updates) which uses constantly ~95% CPU on 
each core.

I've already tried:
- Using only 1 core
- Replacing the HAL in Windows
- Using another KVM version
- Using different timer options for KVM

I'm currently on KVM-85 where the problem still exists.

Regards, Elias P.


On Wednesday 13 May 2009 02:41:38 Ross Boylan wrote:
 I just installed XP into a new VM, specifying -smp 2 for the machine.
 According to top, it's using nearly 200% of a cpu even when I'm not
 doing anything.

 Is this real CPU useage, or just a reporting problem (just as my disk
 image is big according to ls, but isn't really)?

 If it's real, is there anything I can do about it?

 kvm 0.7.2 on Debian Lenny (but 2.6.29 kernel), amd64.  Xeon chips; 32
 bit version of XP pro installed, now fully patched (including the
 Windows Genuine Advantage stuff, though I cancelled it when it wanted to
 run).

 Task manager in XP shows virtually no CPU useage.

 Please cc me on responses.

 Thanks for any assistance.



signature.asc
Description: This is a digitally signed message part.


Re: [PATCH 1/3] virtio: find_vqs/del_vqs virtio operations

2009-05-12 Thread Rusty Russell
On Wed, 13 May 2009 01:03:30 am Michael S. Tsirkin wrote:
 On Wed, May 13, 2009 at 12:00:02AM +0930, Rusty Russell wrote
  and perhaps consider
  varargs for the callbacks (or would that be too horrible at the
  implementation end?)
 
  Thanks,
  Rusty.

 Ugh ... I think it will be. And AFAIK gcc generates a lot of code
 for varargs - not something we want to do in each interrupt handler.

Err, no I mean for find_vqs:  eg.
(block device)
err = vdev-config-find_vqs(vdev, 1, vblk-vq, blk_done);

(net device)
err = vdev-config-find_vqs(vdev, 3, vqs, skb_recv_done, 
skb_xmit_done, NULL);

A bit neater for for the single-queue case.

Cheers,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH, RFC] virtio_blk: add cache flush command

2009-05-12 Thread Rusty Russell
On Tue, 12 May 2009 11:48:36 pm Christian Borntraeger wrote:
 Am Tuesday 12 May 2009 15:54:14 schrieb Rusty Russell:
  On Mon, 11 May 2009 06:09:08 pm Christoph Hellwig wrote:
   Do we need a new feature flag for this command or can we expect that
   all previous barrier support was buggy enough anyway?
 
  You mean reuse the VIRTIO_BLK_F_BARRIER for this as well?  Seems fine.

 It is also used by kuli
 (http://www.ibm.com/developerworks/linux/linux390/kuli.html) and kuli used
 fdatasync.

OK, that's sufficient for me.  It's not like block is going to catch up with 
net 
for feature flags any time soon.

Christoph, please make a new feature flag.

Oh and FYI Christian, you might not have seen this patch:

lguest: barrier me harder

Impact: barrier correctness in example launcher

I doubt either lguest user will complain about performance.

Reported-by: Christoph Hellwig h...@infradead.org
Cc: Jens Axboe jens.ax...@oracle.com
Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 Documentation/lguest/lguest.c |7 +++
 1 file changed, 7 insertions(+)

diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1630,6 +1630,13 @@ static bool service_io(struct device *de
}
}
 
+   /* OK, so we noted that it was pretty poor to use an fdatasync as a
+* barrier.  But Christoph Hellwig points out that we need a sync
+* *afterwards* as well: Barriers specify no reordering to the front
+* or the back.  And Jens Axboe confirmed it, so here we are: */
+   if (out-type  VIRTIO_BLK_T_BARRIER)
+   fdatasync(vblk-fd);
+
/* We can't trigger an IRQ, because we're not the Launcher.  It does
 * that when we tell it we're done. */
add_used(dev-vq, head, wlen);

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[ kvm-Bugs-2791009 ] -vga vmware + X-windows: display errors, jumpy mouse, hangs

2009-05-12 Thread SourceForge.net
Bugs item #2791009, was opened at 2009-05-12 20:58
Message generated for change (Tracker Item Submitted) made by jiggly
You can respond by visiting: 
https://sourceforge.net/tracker/?func=detailatid=893831aid=2791009group_id=180599

Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: intel
Group: None
Status: Open
Resolution: None
Priority: 5
Private: No
Submitted By: Balzac von Jigglypuff (jiggly)
Assigned to: Nobody/Anonymous (nobody)
Summary: -vga vmware + X-windows: display errors, jumpy mouse, hangs

Initial Comment:
With -vga vmware, Xorg (in the guest) doesn't display properly. The display is 
vaguely legible, but it's scrambled. With -vga vmware -smp 2, I get the same 
display problem, plus the mouse jumps erratically and kvm hangs.

CPU: Core 2 Duo P8600
KVM: kvm-85 (Gentoo ebuild kvm-85-r1, ncurses enabled, sdl enabled, all other 
USE flags disabled)
Host kernel: 2.6.29 (linux-2.6.29-gentoo-r3)
Host kernel arch: x86_64
Host distro: Gentoo AMD64
Guest kernel: 2.6.29 (linux-2.6.29-gentoo-r3), no virtio, no KVM_GUEST
Guest kernel arch: x86_64
Guest distro: Gentoo AMD64
Command line: kvm -kernel vmlinuz -append root=/dev/hda -hda 
/dev/mapper/vg0-linroot -vga vmware
Command line (alternate): kvm -smp 2 -kernel vmlinuz -append root=/dev/hda -hda 
/dev/mapper/vg0-linroot -vga vmware

Here's a more detailed description of the problem:

I'm trying to run Xorg on a Gentoo guest. With kvm -vga std and the vesa X 
driver, everything works fine, even with -smp 2. With kvm -vga vmware and the 
vmware X driver, though, the X display is scrambled. I can barely see some 
garbled twm xterm windows (see attached screenshot), and I can type in those 
xterms. I tried different resolutions and HorizSync/VertRefresh's, but it's 
always scrambled.

If I use kvm -vga vmware -smp 2, the display is similarly scrambled, and 
additionally the mouse is unusable. If I move the mouse, the guest pointer 
starts jumping all over erratically. Sometimes the pointer settles down and 
stops (until I touch the mouse again), sometimes it keeps jumping around. Xorg 
usually outputs a bunch of [mi] EQ overflowing. The server is probably stuck 
in an infinite loop. errors while this is happening.

If I keep moving the mouse, kvm eventually hangs: I can't ungrab the pointer 
from the kvm window, so I have to switch to VT1; top shows that kvm is using 
max CPU; kvm doesn't respond to SIGINT; and I have to kill -9 kvm. The amount 
of mouse waving it takes to hang kvm varies (~5s-120s), and it's only 
reproducible ~20% of the time. If the guest kernel has any paravirtualization, 
though, the hang happens much faster and is 100% reproducible.

--

You can respond by visiting: 
https://sourceforge.net/tracker/?func=detailatid=893831aid=2791009group_id=180599
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3] kvm: Use a bitmap for tracking used GSIs

2009-05-12 Thread Alex Williamson
On Tue, 2009-05-12 at 21:42 -0600, Alex Williamson wrote:
 On Wed, 2009-05-13 at 11:30 +0800, Yang, Sheng wrote:
   + kvm-used_gsi_bitmap = malloc(gsi_bytes);
   + if (!kvm-used_gsi_bitmap) {
   + pthread_mutex_unlock(kvm-gsi_mutex);
   + goto out_close;
   + }
   + memset(kvm-used_gsi_bitmap, 0, gsi_bytes);
   + kvm-max_gsi = gsi_bytes * 8;
  
  So max_gsi = gsi_count / 4?

kvm-max_gsi actually becomes the number of GSIs available in the
bitmap, which may be more than gsi_count if we rounded up.  We
preallocate GSIs between gsi_count and max_gsi to avoid using them.
This just lets us not need to special case testing whether a bit in the
last index is  gsi_count.  Am I overlooking anything here?  Thanks,

Alex


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3] kvm: Use a bitmap for tracking used GSIs

2009-05-12 Thread Yang, Sheng
On Wednesday 13 May 2009 12:10:34 Alex Williamson wrote:
 On Tue, 2009-05-12 at 21:42 -0600, Alex Williamson wrote:
  On Wed, 2009-05-13 at 11:30 +0800, Yang, Sheng wrote:
+   kvm-used_gsi_bitmap = malloc(gsi_bytes);
+   if (!kvm-used_gsi_bitmap) {
+   pthread_mutex_unlock(kvm-gsi_mutex);
+   goto out_close;
+   }
+   memset(kvm-used_gsi_bitmap, 0, gsi_bytes);
+   kvm-max_gsi = gsi_bytes * 8;
  
   So max_gsi = gsi_count / 4?

 kvm-max_gsi actually becomes the number of GSIs available in the
 bitmap, which may be more than gsi_count if we rounded up.  We
 preallocate GSIs between gsi_count and max_gsi to avoid using them.
 This just lets us not need to special case testing whether a bit in the
 last index is  gsi_count.  Am I overlooking anything here?  Thanks,

Oh, I understand that, and I just follow the logic of last comment here(which 
gsi_bytes = (gsi_count + 31) / 32 )... Sorry for confusion...

-- 
regards
Yang, Sheng
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html