Re: KVM guest 100% cpu lock up

2013-04-01 Thread Phil Daws
Froze again :( Would it be worth upgrading to a newer kernel on the host instead 
of using the stock one from CentOS 6.4? I am not overcommitting memory on the 
machine, so I believe that is ruled out. I also installed watchdog on the guest 
to reboot it if this happened again, but that did not work :( Any thoughts on how 
to resolve the problem? Here is how the guest is being started:

qemu 15773 22.1 26.2 2588540 2111252 ? Sl   09:28   5:16 
/usr/libexec/qemu-kvm -name vs2 -S -M rhel6.4.0 -enable-kvm -m 2048 -smp 
1,sockets=1,cores=1,threads=1 -uuid 38fb2902-06a5-0781-f14a-f17bf96668f1 
-nodefconfig -nodefaults -chardev 
socket,id=charmonitor,path=/var/lib/libvirt/qemu/vs2.monitor,server,nowait -mon 
chardev=charmonitor,id=monitor,mode=control -rtc base=utc -no-shutdown -device 
piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive 
file=/images/vs2.img,if=none,id=drive-virtio-disk0,format=raw,cache=none 
-device 
virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1
 -drive if=none,media=cdrom,id=drive-ide0-1-0,readonly=on,format=raw -device 
ide-drive,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0 -netdev 
tap,fd=23,id=hostnet0,vhost=on,vhostfd=25 -device 
virtio-net-pci,netdev=hostnet0,id=net0,mac=54:52:00:02:01:04,bus=pci.0,addr=0x3 
-chardev pty,id=charserial0 -device isa-serial,chardev=charserial0,id=serial0 
-device usb-tablet,id=input0 -vnc 127.0.0.1:2 -vga cirrus -device 
virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x6

Thanks.


- Original Message -
From: Phil Daws ux...@splatnix.net
To: kvm@vger.kernel.org
Cc: Andrew Jones drjo...@redhat.com
Sent: Wednesday, 27 March, 2013 9:03:52 AM
Subject: Re: KVM guest 100% cpu lock up


This problem still persists :( I have read a few articles about setting the 
clocksource to acpi_pm, and the guest was stable for a couple of days; then, lo 
and behold, the CPU on the guest started to climb to 100% and it completely froze. 
The only way to recover it was a hard reset of the guest machine :( Any thoughts?

- Original Message -
From: Phil Daws ux...@splatnix.net
To: Andrew Jones drjo...@redhat.com
Cc: kvm@vger.kernel.org
Sent: Monday, 18 March, 2013 4:10:11 PM
Subject: Re: KVM guest 100% cpu lock up


- Original Message -
From: Andrew Jones drjo...@redhat.com
To: Phil Daws ux...@splatnix.net
Cc: kvm@vger.kernel.org
Sent: Monday, 18 March, 2013 4:08:48 PM
Subject: Re: KVM guest 100% cpu lock up



- Original Message -
 Hello,
 
 I am having an intermittent issue where one of my KVM guests is
 locking up, and when checking the process it's running at 100% CPU.
  The host is CentOS 6.4 running the kernel
 kernel-2.6.32-358.0.1.el6.x86_64.
 
 Would it be worth attempting to compile a 3.2 kernel and use the
 latest qemu-kvm package instead ?
 

Can you please supply the full qemu command line?

thanks,
drew


 Thanks.
 --

qemu  5919 22.5 22.3 2705424 1798484 ? Sl   15:23   4:44 
/usr/libexec/qemu-kvm -name vs2 -S -M rhel6.4.0 -enable-kvm -m 2048 -smp 
2,sockets=2,cores=1,threads=1 -uuid 38fb2902-06a5-0781-f14a-f17bf96668f1 
-nodefconfig -nodefaults -chardev 
socket,id=charmonitor,path=/var/lib/libvirt/qemu/vs2.monitor,server,nowait -mon 
chardev=charmonitor,id=monitor,mode=control -rtc base=utc -no-shutdown -device 
piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 -drive 
file=/images/vs2.img,if=none,id=drive-virtio-disk0,format=raw,cache=none 
-device 
virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1
 -drive if=none,media=cdrom,id=drive-ide0-1-0,readonly=on,format=raw -device 
ide-drive,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0 -netdev 
tap,fd=22,id=hostnet0,vhost=on,vhostfd=23 -device 
virtio-net-pci,netdev=hostnet0,id=net0,mac=54:52:00:02:01:04,bus=pci.0,addr=0x3 
-chardev pty,id=charserial0 -device isa-serial,chardev=charserial0,id=serial0 
-device usb-tablet,id=input0 -vnc 127.0.0.1:2 -vga cirrus -device 
intel-hda,id=sound0,bus=pci.0,addr=0x4 -device 
hda-duplex,id=sound0-codec0,bus=sound0.0,cad=0 -device 
virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x6
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/6] KVM: MMU: retain more available bits on mmio spte

2013-04-01 Thread Xiao Guangrong
Let mmio spte only use bit62 and bit63 on upper 32 bits, then bit 52 ~ bit 61
can be used for other purposes

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/vmx.c |4 ++--
 arch/x86/kvm/x86.c |8 +++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 03f5746..915ef56 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3983,10 +3983,10 @@ static void ept_set_mmio_spte_mask(void)
/*
 * EPT Misconfigurations can be generated if the value of bits 2:0
 * of an EPT paging-structure entry is 110b (write/execute).
-* Also, magic bits (0xffull << 49) is set to quickly identify mmio
+* Also, magic bits (0x3ull << 62) is set to quickly identify mmio
 * spte.
 */
-   kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
+   kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
 }
 
 /*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9ce99ed..12ad5b5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5194,7 +5194,13 @@ static void kvm_set_mmio_spte_mask(void)
 * Set the reserved bits and the present bit of an paging-structure
 * entry to generate page fault with PFER.RSV = 1.
 */
-   mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
+/* Mask the reserved physical address bits. */
+   mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
+
+   /* Bit 62 is always reserved for 32bit host. */
+   mask |= 0x3ull << 62;
+
+   /* Set the present bit. */
mask |= 1ull;
 
 #ifdef CONFIG_X86_64
-- 
1.7.7.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/6] KVM: MMU: store generation-number into mmio spte

2013-04-01 Thread Xiao Guangrong
Store the generation-number into bit3 ~ bit11 and bit52 ~ bit61; in total,
19 bits can be used, which should be enough for nearly all common cases.

In this patch, the generation-number is always 0; it will be changed in
a later patch.

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/mmu.c |   58 +++
 arch/x86/kvm/mmutrace.h|   10 ---
 arch/x86/kvm/paging_tmpl.h |3 +-
 3 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1ebca53..be4f733 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,15 +197,50 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
-static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+/*
+ * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
+ * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
+ * number.
+ */
+#define MMIO_SPTE_GEN_LOW_SHIFT 3
+#define MMIO_SPTE_GEN_HIGH_SHIFT   52
+
+#define MMIO_GEN_LOW_SHIFT 9
+#define MMIO_GEN_LOW_MASK  ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_MAX_GEN   ((1 << 19) - 1)
+
+static u64 generation_mmio_spte_mask(unsigned int gen)
 {
-   struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+   u64 mask;
+
+   WARN_ON(gen > MMIO_MAX_GEN);
+
+   mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
+   mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+   return mask;
+}
+
+static unsigned int get_mmio_spte_generation(u64 spte)
+{
+   unsigned int gen;
+
+   spte &= ~shadow_mmio_mask;
+
+   gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
+   gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+   return gen;
+}
+
+static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+  unsigned access)
+{
+   u64 mask = generation_mmio_spte_mask(0);
 
access = ACC_WRITE_MASK | ACC_USER_MASK;
+   mask |= shadow_mmio_mask | access | gfn  PAGE_SHIFT;
 
-   sp-mmio_cached = true;
-   trace_mark_mmio_spte(sptep, gfn, access);
-   mmu_spte_set(sptep, shadow_mmio_mask | access | gfn  PAGE_SHIFT);
+   trace_mark_mmio_spte(sptep, gfn, access, 0);
+   mmu_spte_set(sptep, mask);
 }
 
 static bool is_mmio_spte(u64 spte)
@@ -223,10 +258,11 @@ static unsigned get_mmio_spte_access(u64 spte)
return (spte  ~shadow_mmio_mask)  ~PAGE_MASK;
 }
 
-static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+ pfn_t pfn, unsigned access)
 {
if (unlikely(is_noslot_pfn(pfn))) {
-   mark_mmio_spte(sptep, gfn, access);
+   mark_mmio_spte(kvm, sptep, gfn, access);
return true;
}
 
@@ -2335,7 +2371,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
u64 spte;
int ret = 0;
 
-   if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+   if (set_mmio_spte(vcpu-kvm, sptep, gfn, pfn, pte_access))
return 0;
 
spte = PT_PRESENT_MASK;
@@ -3388,8 +3424,8 @@ static inline void protect_clean_gpte(unsigned *access, 
unsigned gpte)
*access = mask;
 }
 
-static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
-  int *nr_present)
+static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+  unsigned access, int *nr_present)
 {
if (unlikely(is_mmio_spte(*sptep))) {
if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -3398,7 +3434,7 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, 
unsigned access,
}
 
(*nr_present)++;
-   mark_mmio_spte(sptep, gfn, access);
+   mark_mmio_spte(kvm, sptep, gfn, access);
return true;
}
 
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b8f6172..f5b62a7 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -197,23 +197,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
 
 TRACE_EVENT(
mark_mmio_spte,
-   TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
-   TP_ARGS(sptep, gfn, access),
+   TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen),
+   TP_ARGS(sptep, gfn, access, gen),
 
TP_STRUCT__entry(
__field(void *, sptep)
__field(gfn_t, gfn)
__field(unsigned, access)
+   __field(unsigned int, gen)
),
 
TP_fast_assign(
__entry-sptep = sptep;
__entry-gfn = gfn;
__entry-access = access;
+   __entry-gen = gen;
),
 
-   TP_printk(sptep:%p gfn %llx access %x, __entry-sptep, __entry-gfn,
-

[PATCH v2 6/6] KVM: MMU: init kvm generation close to mmio wrap-around value

2013-04-01 Thread Xiao Guangrong
Then it has a chance to trigger mmio generation-number wrap-around

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kvm/mmu.c  |8 
 virt/kvm/kvm_main.c |6 ++
 3 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6c1e642..4e1f7cb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -767,6 +767,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 struct kvm_memory_slot *slot,
 gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
+void kvm_arch_init_generation(struct kvm *kvm);
 void kvm_mmu_invalid_mmio_sptes(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d314e21..dcc059c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4279,6 +4279,14 @@ restart:
spin_unlock(kvm-mmu_lock);
 }
 
+void kvm_arch_init_generation(struct kvm *kvm)
+{
+   mutex_lock(kvm-slots_lock);
+   /* It is easier to trigger mmio generation-number wrap-around. */
+   kvm_memslots(kvm)-generation = MMIO_MAX_GEN - 13;
+   mutex_unlock(kvm-slots_lock);
+}
+
 void kvm_mmu_invalid_mmio_sptes(struct kvm *kvm)
 {
/*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ff71541..d21694a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -459,6 +459,10 @@ static void kvm_init_memslots_id(struct kvm *kvm)
slots-id_to_index[i] = slots-memslots[i].id = i;
 }
 
+void __attribute__((weak)) kvm_arch_init_generation(struct kvm *kvm)
+{
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
int r, i;
@@ -505,6 +509,8 @@ static struct kvm *kvm_create_vm(unsigned long type)
mutex_init(kvm-slots_lock);
atomic_set(kvm-users_count, 1);
 
+   kvm_arch_init_generation(kvm);
+
r = kvm_init_mmu_notifier(kvm);
if (r)
goto out_err;
-- 
1.7.7.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 5/6] KVM: MMU: add tracepoint for check_mmio_spte

2013-04-01 Thread Xiao Guangrong
It is useful for debugging mmio spte invalidation

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/mmu.c  |9 +++--
 arch/x86/kvm/mmutrace.h |   24 
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1020152..d314e21 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -279,8 +279,13 @@ static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, 
gfn_t gfn,
 
 static bool check_mmio_spte(struct kvm *kvm, u64 spte)
 {
-   return get_mmio_spte_generation(spte) ==
- kvm_current_mmio_generation(kvm);
+   unsigned int kvm_gen, spte_gen;
+
+   kvm_gen = kvm_current_mmio_generation(kvm);
+   spte_gen = get_mmio_spte_generation(spte);
+
+   trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+   return kvm_gen == spte_gen;
 }
 
 static inline u64 rsvd_bits(int s, int e)
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index f5b62a7..dac44ab 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -276,6 +276,30 @@ TRACE_EVENT(
  __spte_satisfied(old_spte), __spte_satisfied(new_spte)
)
 );
+
+
+TRACE_EVENT(
+   check_mmio_spte,
+   TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
+   TP_ARGS(spte, kvm_gen, spte_gen),
+
+   TP_STRUCT__entry(
+   __field(unsigned int, kvm_gen)
+   __field(unsigned int, spte_gen)
+   __field(u64, spte)
+   ),
+
+   TP_fast_assign(
+   __entry-kvm_gen = kvm_gen;
+   __entry-spte_gen = spte_gen;
+   __entry-spte = spte;
+   ),
+
+   TP_printk(spte %llx kvm_gen %x spte-gen %x valid %d, __entry-spte,
+ __entry-kvm_gen, __entry-spte_gen,
+ __entry-kvm_gen == __entry-spte_gen
+   )
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
-- 
1.7.7.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/6] KVM: MMU: fast invalid all mmio sptes

2013-04-01 Thread Xiao Guangrong
Changelog in v2:
  - rename kvm_mmu_invalid_mmio_spte to kvm_mmu_invalid_mmio_sptes
  - use kvm-memslots-generation as kvm global generation-number
  - fix comment and codestyle
  - init kvm generation close to mmio wrap-around value
  - keep kvm_mmu_zap_mmio_sptes

The current approach holds the hot mmu-lock and walks all shadow pages, which
does not scale. This patchset introduces a very simple and scalable way
to quickly invalidate all mmio sptes - it need not walk any shadow pages or hold
any locks.

The idea is simple:
KVM maintains a global mmio invalidation generation-number, which is stored in
kvm->memslots.generation, and every mmio spte stores the current global
generation-number in its available bits when it is created.

When KVM needs to zap all mmio sptes, it simply increases the global
generation-number. When a guest does an mmio access, KVM intercepts an MMIO #PF,
then walks the shadow page table and gets the mmio spte. If the
generation-number on the spte does not equal the global generation-number,
it goes to the normal #PF handler to update the mmio spte.

Since 19 bits are used to store the generation-number in an mmio spte, we zap all
mmio sptes when the number wraps around.

Xiao Guangrong (6):
  KVM: MMU: retain more available bits on mmio spte
  KVM: MMU: store generation-number into mmio spte
  KVM: MMU: make return value of mmio page fault handler more readable
  KVM: MMU: fast invalid all mmio sptes
  KVM: MMU: add tracepoint for check_mmio_spte
  KVM: MMU: init kvm generation close to mmio wrap-around value

 arch/x86/include/asm/kvm_host.h |3 +-
 arch/x86/kvm/mmu.c  |  134 +++
 arch/x86/kvm/mmu.h  |   17 +
 arch/x86/kvm/mmutrace.h |   34 +-
 arch/x86/kvm/paging_tmpl.h  |   10 ++-
 arch/x86/kvm/vmx.c  |   12 +++-
 arch/x86/kvm/x86.c  |   11 +++-
 virt/kvm/kvm_main.c |6 ++
 8 files changed, 186 insertions(+), 41 deletions(-)

-- 
1.7.7.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/6] KVM: MMU: make return value of mmio page fault handler more readable

2013-04-01 Thread Xiao Guangrong
Define some meaningful names instead of raw code

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/kvm/mmu.c |   15 +--
 arch/x86/kvm/mmu.h |   14 ++
 arch/x86/kvm/vmx.c |4 ++--
 3 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index be4f733..31c5586 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3182,17 +3182,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct 
kvm_vcpu *vcpu, u64 addr)
return spte;
 }
 
-/*
- * If it is a real mmio page fault, return 1 and emulat the instruction
- * directly, return 0 to let CPU fault again on the address, -1 is
- * returned if bug is detected.
- */
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 {
u64 spte;
 
if (quickly_check_mmio_pf(vcpu, addr, direct))
-   return 1;
+   return RET_MMIO_PF_EMU;
 
spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
 
@@ -3205,7 +3200,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, 
u64 addr, bool direct)
 
trace_handle_mmio_page_fault(addr, gfn, access);
vcpu_cache_mmio_info(vcpu, addr, gfn, access);
-   return 1;
+   return RET_MMIO_PF_EMU;
}
 
/*
@@ -3213,13 +3208,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu 
*vcpu, u64 addr, bool direct)
 * it's a BUG if the gfn is not a mmio page.
 */
if (direct  !check_direct_spte_mmio_pf(spte))
-   return -1;
+   return RET_MMIO_PF_BUG;
 
/*
 * If the page table is zapped by other cpus, let CPU fault again on
 * the address.
 */
-   return 0;
+   return RET_MMIO_PF_RETRY;
 }
 EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
 
@@ -3229,7 +3224,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, 
u64 addr,
int ret;
 
ret = handle_mmio_page_fault_common(vcpu, addr, direct);
-   WARN_ON(ret  0);
+   WARN_ON(ret == RET_MMIO_PF_BUG);
return ret;
 }
 
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2adcbc2..6b4ba1e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,6 +52,20 @@
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+
+/*
+ * Return values of handle_mmio_page_fault_common:
+ * RET_MMIO_PF_EMU: it is a real mmio page fault, emulate the instruction
+ * directly.
+ * RET_MMIO_PF_RETRY: let CPU fault again on the address.
+ * RET_MMIO_PF_BUG: bug is detected.
+ */
+enum {
+   RET_MMIO_PF_EMU = 1,
+   RET_MMIO_PF_RETRY = 0,
+   RET_MMIO_PF_BUG = -1
+};
+
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool 
direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 915ef56..d0f2790 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5135,10 +5135,10 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 
ret = handle_mmio_page_fault_common(vcpu, gpa, true);
-   if (likely(ret == 1))
+   if (likely(ret == RET_MMIO_PF_EMU))
return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
  EMULATE_DONE;
-   if (unlikely(!ret))
+   if (unlikely(ret == RET_MMIO_PF_RETRY))
return 1;
 
/* It is the real ept misconfig */
-- 
1.7.7.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 4/6] KVM: MMU: fast invalid all mmio sptes

2013-04-01 Thread Xiao Guangrong
This patch introduces a very simple and scalable way to invalidate all
mmio sptes - it need not walk any shadow pages or hold the mmu-lock.

KVM maintains a global mmio invalid generation-number which is stored in
kvm-memslots.generation and every mmio spte stores the current global
generation-number into his available bits when it is created

When KVM need zap all mmio sptes, it just simply increase the global
generation-number. When guests do mmio access, KVM intercepts a MMIO #PF
then it walks the shadow page table and get the mmio spte. If the
generation-number on the spte does not equal the global generation-number,
it will go to the normal #PF handler to update the mmio spte

Since 19 bits are used to store the generation-number in an mmio spte, we zap all
mmio sptes when the number wraps around.

Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
---
 arch/x86/include/asm/kvm_host.h |2 +-
 arch/x86/kvm/mmu.c  |   54 +--
 arch/x86/kvm/mmu.h  |3 ++
 arch/x86/kvm/paging_tmpl.h  |7 +++-
 arch/x86/kvm/vmx.c  |4 +++
 arch/x86/kvm/x86.c  |3 +-
 6 files changed, 60 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b5a6462..6c1e642 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -767,7 +767,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 struct kvm_memory_slot *slot,
 gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
-void kvm_mmu_zap_mmio_sptes(struct kvm *kvm);
+void kvm_mmu_invalid_mmio_sptes(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 31c5586..1020152 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -205,9 +205,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 #define MMIO_SPTE_GEN_LOW_SHIFT3
 #define MMIO_SPTE_GEN_HIGH_SHIFT   52
 
+#define MMIO_GEN_SHIFT 19
 #define MMIO_GEN_LOW_SHIFT 9
 #define MMIO_GEN_LOW_MASK  ((1  MMIO_GEN_LOW_SHIFT) - 1)
-#define MMIO_MAX_GEN   ((1  19) - 1)
+#define MMIO_GEN_MASK  ((1  MMIO_GEN_SHIFT) - 1)
+#define MMIO_MAX_GEN   ((1  MMIO_GEN_SHIFT) - 1)
 
 static u64 generation_mmio_spte_mask(unsigned int gen)
 {
@@ -231,15 +233,21 @@ static unsigned int get_mmio_spte_generation(u64 spte)
return gen;
 }
 
+static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+   return kvm_memslots(kvm)-generation  MMIO_GEN_MASK;
+}
+
 static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
   unsigned access)
 {
-   u64 mask = generation_mmio_spte_mask(0);
+   unsigned int gen = kvm_current_mmio_generation(kvm);
+   u64 mask = generation_mmio_spte_mask(gen);
 
access = ACC_WRITE_MASK | ACC_USER_MASK;
mask |= shadow_mmio_mask | access | gfn  PAGE_SHIFT;
 
-   trace_mark_mmio_spte(sptep, gfn, access, 0);
+   trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
 }
 
@@ -269,6 +277,12 @@ static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, 
gfn_t gfn,
return false;
 }
 
+static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+{
+   return get_mmio_spte_generation(spte) ==
+ kvm_current_mmio_generation(kvm);
+}
+
 static inline u64 rsvd_bits(int s, int e)
 {
return ((1ULL  (e - s + 1)) - 1)  s;
@@ -3195,6 +3209,9 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, 
u64 addr, bool direct)
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned access = get_mmio_spte_access(spte);
 
+   if (unlikely(!check_mmio_spte(vcpu-kvm, spte)))
+   return RET_MMIO_PF_INVALID;
+
if (direct)
addr = 0;
 
@@ -3236,8 +3253,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, 
gva_t gva,
 
pgprintk(%s: gva %lx error %x\n, __func__, gva, error_code);
 
-   if (unlikely(error_code  PFERR_RSVD_MASK))
-   return handle_mmio_page_fault(vcpu, gva, error_code, true);
+   if (unlikely(error_code  PFERR_RSVD_MASK)) {
+   r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+
+   if (likely(r != RET_MMIO_PF_INVALID))
+   return r;
+   }
 
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3313,8 +3334,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t 
gpa, u32 error_code,
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu-arch.mmu.root_hpa));
 
-   if (unlikely(error_code  PFERR_RSVD_MASK))
-   return 

KVM call agenda for 2013-04-02

2013-04-01 Thread Juan Quintela


Hi

Please send in any agenda topics you are interested in.

Later, Juan.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [virt-test][PATCH 5/7] virt: Update Cartesian config unittest named variants

2013-04-01 Thread Eduardo Habkost
I will start by reviewing the test code, so we can agree on expected
syntax/behavior, before reviewing the implementation:

On Fri, Mar 29, 2013 at 06:14:08PM +0100, Jiří Župka wrote:
 Signed-off-by: Jiří Župka jzu...@redhat.com
 ---
  virttest/cartesian_config_unittest.py | 79 
 +++
  1 file changed, 79 insertions(+)

What about squashing this into the previous patch? I don't see a reason
to put unit-tests and implementation in separate commits.

 
 diff --git a/virttest/cartesian_config_unittest.py 
 b/virttest/cartesian_config_unittest.py
 index afc1b14..98c1efc 100755
 --- a/virttest/cartesian_config_unittest.py
 +++ b/virttest/cartesian_config_unittest.py
 @@ -86,6 +86,85 @@ class CartesianConfigTest(unittest.TestCase):
  )
  
  
 +def testNameVariant(self):
 +self._checkStringDump(
 +variants name=tests: # All tests in configuration

I like how you tried to make the syntax extensible, but:

1) I dislike the fact that all variants-block metadata will have to be
stuck in a single line if we extend this.

2) I find the model required to understand what name=tests means
confusing. With the syntax above, name (the left-hand side of the =
sign) is a kind of variable/option name, but the right-hand side of the
= sign (tests) is _also_ a variable/option name.

I mean: on all programming languages I know, variables are declared like
this:

  var i;
or:
  int i;

not like this:

  var name=i;
or:
  var type=int name=i;

That said, I would love to have extensibility to allow other
variants-block metadata in the future, but I think the variants-block
_name_ is special and doesn't need to be prepended with name=. IMO,
the name= prefix makes the semantics more confusing, not clearer.

 +  - wait:
 +   run = wait
 +   variants:
 + - long:
 +time = short_time
 + - short: long
 +time = logn_time
 +  - test2:
 +   run = test1
 +
 +variants name=virt_system:
 +  - @linux:
 +  - windows:
 +
 +variants name=host_os:
 +  - linux:
 +   image = linux
 +  - windows:
 +   image = windows
 +
 +only host_oslinux

What does this ">" syntax mean? Is it something new?

Is the ">" operator whitespace-sensitive? Would "host_os > linux" work?

 +,
 +[
 +{'dep': [],
 + 'host_os': 'linux',
 + 'image': 'linux',
 + 'name': 'host_oslinux.virt_systemlinux.testswait.long',
 + 'run': 'wait',
 + 'shortname': 'host_oslinux.testswait.long',
 + 'tests': 'wait',
 + 'time': 'short_time',
 + 'virt_system': 'linux'},
 +{'dep': ['host_oslinux.virt_systemlinux.testswait.long'],
 + 'host_os': 'linux',
 + 'image': 'linux',
 + 'name': 'host_oslinux.virt_systemlinux.testswait.short',
 + 'run': 'wait',
 + 'shortname': 'host_oslinux.testswait.short',
 + 'tests': 'wait',
 + 'time': 'logn_time',
 + 'virt_system': 'linux'},
 +{'dep': [],
 + 'host_os': 'linux',
 + 'image': 'linux',
 + 'name': 'host_oslinux.virt_systemlinux.teststest2',
 + 'run': 'test1',
 + 'shortname': 'host_oslinux.teststest2',
 + 'tests': 'test2',
 + 'virt_system': 'linux'},
 +{'dep': [],
 + 'host_os': 'linux',
 + 'image': 'linux',
 + 'name': 'host_oslinux.virt_systemwindows.testswait.long',
 + 'run': 'wait',
 + 'shortname': 
 'host_oslinux.virt_systemwindows.testswait.long',
 + 'tests': 'wait',
 + 'time': 'short_time',
 + 'virt_system': 'windows'},
 +{'dep': 
 ['host_oslinux.virt_systemwindows.testswait.long'],
 + 'host_os': 'linux',
 + 'image': 'linux',
 + 'name': 
 'host_oslinux.virt_systemwindows.testswait.short',
 + 'run': 'wait',
 + 'shortname': 
 'host_oslinux.virt_systemwindows.testswait.short',
 + 'tests': 'wait',
 + 'time': 'logn_time',
 + 'virt_system': 'windows'},
 +{'dep': [],
 + 'host_os': 'linux',
 + 'image': 'linux',
 + 'name': 'host_oslinux.virt_systemwindows.teststest2',
 + 'run': 'test1',
 + 'shortname': 
 'host_oslinux.virt_systemwindows.teststest2',
 + 'tests': 'test2',
 +

Re: [virt-test][PATCH 4/7] virt: Adds named variants to Cartesian config.

2013-04-01 Thread Eduardo Habkost
Sorry for not reading the commit message before my previous reply. Now I
see the origin of the ">" syntax.

On Fri, Mar 29, 2013 at 06:14:07PM +0100, Jiří Župka wrote:
[...]
 
 For filtering of named variants the character ">" is used, because there was
 a conflict with "=" in the expression key = value. The character ">"
 could be changed to something better, but it should be different from "="
 for the sake of speed optimization.

IMO we need really strong reasons to use anything different from "="
because it is the most obvious choice we have. Using ">" doesn't make
any sense to me.

What kind of speed optimization are you talking about, exactly? We need
to keep algorithm time/space complexity under control, but making two or
three additional regexp matches per line won't make the code much
slower, will it?

Also: whatever symbol we use, I would really like to make it
whitespace-insensitive.

I mean: if "foo>x" or "foo=x" works, "foo > x" or "foo = x" should work,
too. I am absolutely sure people _will_ eventually try to put whitespace
around the operator symbol, and this shouldn't cause unpleasant
surprises.


 
 Additionally named variant adds keys to final dictionary in case of
 example is it (virt_system = linux). It should reduce size of config file.
 Keys defined in config and keys defined by named variants are in same
 name space.

This is the part I like the most. Thanks!


 
 Signed-off-by: Jiří Župka jzu...@redhat.com
 ---
  virttest/cartesian_config.py | 138 
 ++-
  1 file changed, 124 insertions(+), 14 deletions(-)
 
 diff --git a/virttest/cartesian_config.py b/virttest/cartesian_config.py
 index ef91051..04ed2b5 100755
 --- a/virttest/cartesian_config.py
 +++ b/virttest/cartesian_config.py
 @@ -145,6 +145,74 @@ class MissingIncludeError:
  num_failed_cases = 5
  
  
 +class Label(object):
 +__slots__ = [name, var_name, long_name, hash_val, hash_var]
 +
 +def __init__(self, name, next_name=None):
 +if next_name is None:
 +self.name = name
 +self.var_name = None
 +else:
 +self.name = next_name
 +self.var_name = name
 +
 +if self.var_name is None:
 +self.long_name = %s % (self.name)
 +else:
 +self.long_name = %s%s % (self.var_name, self.name)
 +
 +self.hash_val = self.hash_name()
 +self.hash_var = None
 +if self.var_name:
 +self.hash_var = self.hash_variant()
 +
 +
 +def __str__(self):
 +return self.long_name
 +
 +
 +def __repr__(self):
 +return self.long_name
 +
 +
 +def __eq__(self, o):
 +
 +The comparison is asymmetric due to optimization.
 +
 +if o.var_name:
 +if self.long_name == o.long_name:
 +return True
 +else:
 +if self.name == o.name:
 +return True
 +return False
 +
 +
 +def __ne__(self, o):
 +
 +The comparison is asymmetric due to optimization.
 +
 +if o.var_name:
 +if self.long_name != o.long_name:
 +return True
 +else:
 +if self.name != o.name:
 +return True
 +return False
 +
 +
 +def __hash__(self):
 +return self.hash_val
 +
 +
 +def hash_name(self):
 +return sum([i + 1 * ord(x) for i, x in enumerate(self.name)])
 +
 +
 +def hash_variant(self):
 +return sum([i + 1 * ord(x) for i, x in enumerate(str(self))])
 +
 +
  class Node(object):
  __slots__ = [name, dep, content, children, labels,
   append_to_shortname, failed_cases, default]
 @@ -212,18 +280,19 @@ class Filter(object):
  def __init__(self, s):
  self.filter = []
  for char in s:
 -if not (char.isalnum() or char.isspace() or char in .,_-):
 +if not (char.isalnum() or char.isspace() or char in .,_-):
  raise ParserError(Illegal characters in filter)
  for word in s.replace(,,  ).split(): # OR
  word = [block.split(.) for block in word.split(..)]  # AND
 -for word in s.replace(,,  ).split():
 -word = [block.split(.) for block in word.split(..)]
 -for block in word:
 +words = []
  for block in word:   # .
 +b = []
  for elem in block:
  if not elem:
  raise ParserError(Syntax error)
 -self.filter += [word]
 +b.append(Label(*elem.split()))
 +words.append(b)
 +self.filter += [words]
  
  
  def match(self, ctx, ctx_set):
 @@ -506,15 +575,16 @@ class Parser(object):
  #node.dump(0)
  # Update dep
  for d in node.dep:
 -dep = dep + [..join(ctx + [d])]
 +dep = dep + [..join([str(label) for label in ctx + [d]])]

Re: [virt-test][PATCH 6/7] virt: Adds possibility filter defaults variant from variants

2013-04-01 Thread Eduardo Habkost
On Fri, Mar 29, 2013 at 06:14:09PM +0100, Jiří Župka wrote:
 If default variant is not filtered by only or no filters then
 only default variant is chosen. This behavior was used for optimizing
 of speed of Cartesian config.
 If variants don't have default variant then everything works as usual.
 Default variant must be in variants with with_default exactly one times.
 The default variant could be filtered by only, no filter. If default
 variant is filtered from variants then variants works same as usual variants 
 with
 default variant.
 
 For calling Cartesian config from command line is used option -d/--defaults:
../virttest/cartesian_config.py -d cfg/cc.cfg
 
 For calling Cartesian config from python:
c = Parser(args[0], defaults=options.defaults, debug=options.debug)

I believe we have to allow that to be per-variants-block, not
all-or-nothing.

For example: the default ./run behavior could be to automatically choose
defaults for every variants-block (guest OS, block format, etc), except
for the subtests variants-block.

I would like the API to look like this:

   # this would be the current behavior:
   c = Parser(..., expand_all_variants=True)
   # will expand only the subtest and guest_os variants-block, and
   # automatically choose defaults for every other variants-block:
   c = Parser(..., expand_variants=['subtest', 'guest_os'])
   # For somebody who wants to run the default tests with all Windows
   # versions:
   c = Parser(..., expand_variants=['subtest', 'windows_version'])
   # For somebody who wants to run the default tests with all CPU models,
   # and all guest OSes:
   c = Parser(..., expand_variants=['subtest', 'guest_os', 'cpu_model'])
   # (additional nice-to-have: to allow something like guest_os.* to
   # expand a variants-block and all its sub-variants (guest OS
   # version, guest OS architecture, etc)

We could also find a way to encode the expand_variants instruction
inside the config file syntax, so people could put that information in
their config file. But that can be done later, after we test if the
concept is really working in the Python API and command-line.

 
 *  example:
 variants name=tests:
   - wait:
run = wait
variants:
  - long:
 time = short_time
  - short: long
 time = logn_time
   - test2:
run = test1
 
 variants name=virt_system, with_default:
   - @linux:
   - windows:
 
 variants name=host_os, with_default:
   - linux:
image = linux
variants with_default:
 - ubuntu:
 - @fedora:
   - windows:
image = windows
variants:
 - @XP:
 - WIN7:
 
 only host_oswindows
 
 In this case is chosen from host_os variants windows variant.
 host_oswindows was chosen because default variant linux was filtered.
 Next step is select one variant from guest_os. There will be chosen only
 default variant linux because not filtered and virt_system variant is
 with with_default. There is no default variant in tests variants because
 that all of tests will be chosen.
 
   output:
 dict1:  host_oswindows.testswait.long
 dep = []
 host_os = windows
 image = windows
 name = host_oswindows.XP.virt_systemlinux.testswait.long
 run = wait
 shortname = host_oswindows.testswait.long
 tests = wait
 time = short_time
 virt_system = linux
 dict2:  host_oswindows.testswait.short
 dep = ['host_oswindows.XP.virt_systemlinux.testswait.long']
 host_os = windows
 image = windows
 name = host_oswindows.XP.virt_systemlinux.testswait.short
 run = wait
 shortname = host_oswindows.testswait.short
 tests = wait
 time = logn_time
 virt_system = linux
 dict3:  host_oswindows.teststest2
 dep = []
 host_os = windows
 image = windows
 name = host_oswindows.XP.virt_systemlinux.teststest2
 run = test1
 shortname = host_oswindows.teststest2
 tests = test2
 virt_system = linux
 dict4:  host_oswindows.WIN7.testswait.long
 dep = []
 host_os = windows
 image = windows
 name = host_oswindows.WIN7.virt_systemlinux.testswait.long
 run = wait
 shortname = host_oswindows.WIN7.testswait.long
 tests = wait
 time = short_time
 virt_system = linux
 dict5:  host_oswindows.WIN7.testswait.short
 dep = ['host_oswindows.WIN7.virt_systemlinux.testswait.long']
 host_os = windows
 image = windows
 name = host_oswindows.WIN7.virt_systemlinux.testswait.short
 run = wait
 shortname = host_oswindows.WIN7.testswait.short
 tests = wait
 time = logn_time
 virt_system = linux
 dict6:  host_oswindows.WIN7.teststest2
 dep = []
 host_os = windows
 image = windows
 name = host_oswindows.WIN7.virt_systemlinux.teststest2
 run = test1
 shortname = host_oswindows.WIN7.teststest2
 tests = test2
 virt_system = linux
 
 Signed-off-by: Jiří Župka jzu...@redhat.com
 

Re: Best way to busy-wait for a virtio queue?

2013-04-01 Thread Eric Northup
On Fri, Mar 29, 2013 at 4:12 PM, H. Peter Anvin h...@zytor.com wrote:

 Is there any preferred way to busy-wait on a virtio event?  As in: the
 guest doesn't have anything useful to do until something is plopped down
 on the virtio queue, but would like to proceed as quickly as possible
 after that.  Passing through an interrupt handler seems like unnecessary
 overhead.

How much information do you have about the host?  It is possible that
leaving the vCPU running is displacing execution from whatever host
thread(s) would be involved in making progress towards the event you
want delivered - in that case, the interrupt overhead might be
balanced out by lower latency of the event delivery.

 Right now I have a poll loop looking like (pseudocode):

 outw(0, trigger);
 while (readl(ring-output pointer) != final output pointer)
 cpu_relax();/* x86 PAUSE instruction */

 ... but I have no idea how much sense that makes.

The cleanest expression of the desired semantic I can think of would
be MONITOR/MWAIT, except that KVM doesn't allow those instructions in
the guest.  For the case of a 100% non-overcommitted host (including
host i/o processing), there's no reason not to allow the guest to run
those instructions.

Lacking that, I think the above busy-loop w/PAUSE in it will end up
causing a pause-loop exit - so it has largely the same effect but also
works on current hosts.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/9] linux-headers: Update to v3.9-rc2

2013-04-01 Thread Alex Williamson
Unedited scripts/update-linux-headers.sh run against v3.9-rc2 tag

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 linux-headers/linux/vfio.h |9 +
 1 file changed, 9 insertions(+)

diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index f787b72..e094121 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -303,6 +303,15 @@ enum {
VFIO_PCI_BAR5_REGION_INDEX,
VFIO_PCI_ROM_REGION_INDEX,
VFIO_PCI_CONFIG_REGION_INDEX,
+   /*
+* Expose VGA regions defined for PCI base class 03, subclass 00.
+* This includes I/O port ranges 0x3b0 to 0x3bb and 0x3c0 to 0x3df
+* as well as the MMIO range 0xa to 0xb.  Each implemented
+* range is found at it's identity mapped offset from the region
+* offset, for example 0x3b0 is region_info.offset + 0x3b0.  Areas
+* between described ranges are unimplemented.
+*/
+   VFIO_PCI_VGA_REGION_INDEX,
VFIO_PCI_NUM_REGIONS
 };
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/9] vfio-pci: Generalize PCI config mangling

2013-04-01 Thread Alex Williamson
Kernel-side vfio virtualizes all of config space, but some parts are
unique to Qemu.  For instance we may or may not expose the ROM BAR,
Qemu manages MSI/MSIX, and Qemu manages the multi-function bit so that
single function devices can appear as multi-function and vice versa.
Generalize this into a bitmap of Qemu emulated bits.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 hw/vfio_pci.c |   80 ++---
 1 file changed, 42 insertions(+), 38 deletions(-)

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index 288361d..a3bae7b 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -117,6 +117,7 @@ typedef struct VFIODevice {
 int fd;
 VFIOINTx intx;
 unsigned int config_size;
+uint8_t *emulated_config_bits; /* QEMU emulated bits, little-endian */
 off_t config_offset; /* Offset of config space region within device fd */
 unsigned int rom_size;
 off_t rom_offset; /* Offset of ROM region within device fd */
@@ -963,44 +964,29 @@ static const MemoryRegionOps vfio_bar_ops = {
 static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
 {
 VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
-uint32_t val = 0;
+uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val;
 
-/*
- * We only need QEMU PCI config support for the ROM BAR, the MSI and MSIX
- * capabilities, and the multifunction bit below.  We let VFIO handle
- * virtualizing everything else.  Performance is not a concern here.
- */
-if (ranges_overlap(addr, len, PCI_ROM_ADDRESS, 4) ||
-(pdev-cap_present  QEMU_PCI_CAP_MSIX 
- ranges_overlap(addr, len, pdev-msix_cap, MSIX_CAP_LENGTH)) ||
-(pdev-cap_present  QEMU_PCI_CAP_MSI 
- ranges_overlap(addr, len, pdev-msi_cap, vdev-msi_cap_size))) {
+memcpy(emu_bits, vdev-emulated_config_bits + addr, len);
+emu_bits = le32_to_cpu(emu_bits);
 
-val = pci_default_read_config(pdev, addr, len);
-} else {
-if (pread(vdev-fd, val, len, vdev-config_offset + addr) != len) {
+if (emu_bits) {
+emu_val = pci_default_read_config(pdev, addr, len);
+}
+
+if (~emu_bits  (0xU  (32 - len * 8))) {
+ssize_t ret;
+
+ret = pread(vdev-fd, phys_val, len, vdev-config_offset + addr);
+if (ret != len) {
 error_report(%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m,
  __func__, vdev-host.domain, vdev-host.bus,
  vdev-host.slot, vdev-host.function, addr, len);
 return -errno;
 }
-val = le32_to_cpu(val);
+phys_val = le32_to_cpu(phys_val);
 }
 
-/* Multifunction bit is virualized in QEMU */
-if (unlikely(ranges_overlap(addr, len, PCI_HEADER_TYPE, 1))) {
-uint32_t mask = PCI_HEADER_TYPE_MULTI_FUNCTION;
-
-if (len == 4) {
-mask = 16;
-}
-
-if (pdev-cap_present  QEMU_PCI_CAP_MULTIFUNCTION) {
-val |= mask;
-} else {
-val = ~mask;
-}
-}
+val = (emu_val  emu_bits) | (phys_val  ~emu_bits);
 
 DPRINTF(%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n, __func__,
 vdev-host.domain, vdev-host.bus, vdev-host.slot,
@@ -1026,12 +1012,6 @@ static void vfio_pci_write_config(PCIDevice *pdev, 
uint32_t addr,
  vdev-host.slot, vdev-host.function, addr, val, len);
 }
 
-/* Write standard header bits to emulation */
-if (addr  PCI_CONFIG_HEADER_SIZE) {
-pci_default_write_config(pdev, addr, val, len);
-return;
-}
-
 /* MSI/MSI-X Enabling/Disabling */
 if (pdev-cap_present  QEMU_PCI_CAP_MSI 
 ranges_overlap(addr, len, pdev-msi_cap, vdev-msi_cap_size)) {
@@ -1046,9 +1026,7 @@ static void vfio_pci_write_config(PCIDevice *pdev, 
uint32_t addr,
 } else if (was_enabled  !is_enabled) {
 vfio_disable_msi(vdev);
 }
-}
-
-if (pdev-cap_present  QEMU_PCI_CAP_MSIX 
+} else if (pdev-cap_present  QEMU_PCI_CAP_MSIX 
 ranges_overlap(addr, len, pdev-msix_cap, MSIX_CAP_LENGTH)) {
 int is_enabled, was_enabled = msix_enabled(pdev);
 
@@ -1061,6 +1039,9 @@ static void vfio_pci_write_config(PCIDevice *pdev, 
uint32_t addr,
 } else if (was_enabled  !is_enabled) {
 vfio_disable_msix(vdev);
 }
+} else {
+/* Write everything to QEMU to keep emulated bits correct */
+pci_default_write_config(pdev, addr, val, len);
 }
 }
 
@@ -2003,6 +1984,16 @@ static int vfio_initfn(PCIDevice *pdev)
 goto out_put;
 }
 
+/* vfio emulates a lot for us, but some bits need extra love */
+vdev-emulated_config_bits = g_malloc0(vdev-config_size);
+
+/* QEMU can choose to expose the ROM or not */
+memset(vdev-emulated_config_bits + PCI_ROM_ADDRESS, 0xff, 4);
+
+/* QEMU can change multi-function devices to single function, or reverse */
+

[PATCH 3/9] vfio-pci: Add PCIe capability mangling based on bus type

2013-04-01 Thread Alex Williamson
Windows seems to pay particular interest to the PCIe header type of
devices and will fail to load drivers if we attach Endpoint devices or
Legacy Endpoint devices to the Root Complex.  We can use
pci_bus_is_express and pci_bus_is_root to determine the bus type and
mangle the type appropriately:

* Legacy PCI
  * No change, capability is unmodified for compatibility.
* PCI Express
  * Integrated Root Complex Endpoint - Endpoint
* PCI Express Root Complex
  * Endpoint - Integrated Root Complex Endpoint
  * Legacy Endpoint - none, capability hidden

We also take this opportunity to explicitly limit supported devices
to Endpoints, Legacy Endpoints, and Root Complex Integrated Endpoints.
We don't currently have support for other types and users often cause
themselves problems by assigning them.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 hw/vfio_pci.c |  129 +
 1 file changed, 128 insertions(+), 1 deletion(-)

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index a3bae7b..0f74dbb 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -1506,6 +1506,124 @@ static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, 
uint8_t pos)
 return next - pos;
 }
 
+static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask)
+{
+pci_set_word(buf, (pci_get_word(buf)  ~mask) | val);
+}
+
+static void vfio_add_emulated_word(VFIODevice *vdev, int pos,
+   uint16_t val, uint16_t mask)
+{
+vfio_set_word_bits(vdev-pdev.config + pos, val, mask);
+vfio_set_word_bits(vdev-pdev.wmask + pos, ~mask, mask);
+vfio_set_word_bits(vdev-emulated_config_bits + pos, mask, mask);
+}
+
+static void vfio_set_long_bits(uint8_t *buf, uint32_t val, uint32_t mask)
+{
+pci_set_long(buf, (pci_get_long(buf)  ~mask) | val);
+}
+
+static void vfio_add_emulated_long(VFIODevice *vdev, int pos,
+   uint32_t val, uint32_t mask)
+{
+vfio_set_long_bits(vdev-pdev.config + pos, val, mask);
+vfio_set_long_bits(vdev-pdev.wmask + pos, ~mask, mask);
+vfio_set_long_bits(vdev-emulated_config_bits + pos, mask, mask);
+}
+
+static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size)
+{
+uint16_t flags;
+uint8_t type;
+
+flags = pci_get_word(vdev-pdev.config + pos + PCI_CAP_FLAGS);
+type = (flags  PCI_EXP_FLAGS_TYPE)  4;
+
+if (type != PCI_EXP_TYPE_ENDPOINT 
+type != PCI_EXP_TYPE_LEG_END 
+type != PCI_EXP_TYPE_RC_END) {
+
+error_report(vfio: Assignment of PCIe type 0x%x 
+ devices is not currently supported, type);
+return -EINVAL;
+}
+
+if (!pci_bus_is_express(vdev-pdev.bus)) {
+/*
+ * Use express capability as-is on PCI bus.  It doesn't make much
+ * sense to even expose, but some drivers (ex. tg3) depend on it
+ * and guests don't seem to be particular about it.  We'll need
+ * to revist this or force express devices to express buses if we
+ * ever expose an IOMMU to the guest.
+ */
+} else if (pci_bus_is_root(vdev-pdev.bus)) {
+/*
+ * On a Root Complex bus Endpoints become Root Complex Integrated
+ * Endpoints, which changes the type and clears the LNK  LNK2 fields.
+ */
+if (type == PCI_EXP_TYPE_ENDPOINT) {
+vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
+   PCI_EXP_TYPE_RC_END  4,
+   PCI_EXP_FLAGS_TYPE);
+
+/* Link Capabilities, Status, and Control goes away */
+if (size  PCI_EXP_LNKCTL) {
+vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP, 0, ~0);
+vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
+vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA, 0, ~0);
+
+#ifndef PCI_EXP_LNKCAP2
+#define PCI_EXP_LNKCAP2 44
+#endif
+#ifndef PCI_EXP_LNKSTA2
+#define PCI_EXP_LNKSTA2 50
+#endif
+/* Link 2 Capabilities, Status, and Control goes away */
+if (size  PCI_EXP_LNKCAP2) {
+vfio_add_emulated_long(vdev, pos + PCI_EXP_LNKCAP2, 0, ~0);
+vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL2, 0, ~0);
+vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKSTA2, 0, ~0);
+}
+}
+
+} else if (type == PCI_EXP_TYPE_LEG_END) {
+/*
+ * Legacy endpoints don't belong on the root complex.  Windows
+ * seems to be happier with devices if we skip the capability.
+ */
+return 0;
+}
+
+} else {
+/*
+ * Convert Root Complex Integrated Endpoints to regular endpoints.
+ * These devices don't support LNK/LNK2 capabilities, so make them up.
+ */
+if (type == PCI_EXP_TYPE_RC_END) {
+vfio_add_emulated_word(vdev, pos + PCI_CAP_FLAGS,
+

[PATCH 5/9] qemu vfio-pci: Graphics device quirks

2013-04-01 Thread Alex Williamson
Graphics cards have a number of different backdoors.  Some of these
are alternative ways to get PCI BAR addresses, some of them are
complete mirrors of PCI config space available through MMIO and
I/O port access.  These quirks cover a number of ATI Radeon and
Nvidia devices.  On the ATI/AMD side, this should enable HD5450
and HD7850 and hopefully a host of devices around those generations.
For Nvidia, my card selection is much more dated.  A 8400gs works
well with both the Windows shipped driver and the Nvidia downloaded
driver.  A 7300le works as well, with the caveat that generating
the Windows Experience Index with the Nvidia driver causes the card
to reset several times before generating a BSOD.  An NVS 290 card
seems to run well with the shipped Windows driver, but generates
a BSOD with the Nvidia driver.  All of the Nvidia devices work with
the Linux Nvidia proprietary driver and nouveau, the HD5450 works
with either radeon or fglrx, HD7850 works with vesa and fglrx (not
supported by radeon).  Extremely limited 3D testing.

Device reset is also an issue with graphics.  It's unfortunately
very common that the devices offer no means to reset the card, or the
reset doesn't seem effective.  Nvidia devices are pretty good about being
able to get the device to a working state through the VGA BIOS init,
Radeon devices less so, and often require a host reboot.  Work
remains to be done here.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 hw/vfio_pci.c |  720 +
 1 file changed, 718 insertions(+), 2 deletions(-)

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index ff415a6..e96f141 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -48,6 +48,16 @@
 do { } while (0)
 #endif
 
+struct VFIODevice;
+
+typedef struct VFIOQuirk {
+MemoryRegion mem;
+struct VFIODevice *vdev;
+QLIST_ENTRY(VFIOQuirk) next;
+uint32_t data;
+uint32_t data2;
+} VFIOQuirk;
+
 typedef struct VFIOBAR {
 off_t fd_offset; /* offset of BAR within device fd */
 int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
@@ -57,12 +67,14 @@ typedef struct VFIOBAR {
 size_t size;
 uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
 uint8_t nr; /* cache the BAR number for debug */
+QLIST_HEAD(, VFIOQuirk) quirks;
 } VFIOBAR;
 
 typedef struct VFIOVGARegion {
 MemoryRegion mem;
 off_t offset;
 int nr;
+QLIST_HEAD(, VFIOQuirk) quirks;
 } VFIOVGARegion;
 
 typedef struct VFIOVGA {
@@ -82,8 +94,6 @@ typedef struct VFIOINTx {
 QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
 } VFIOINTx;
 
-struct VFIODevice;
-
 typedef struct VFIOMSIVector {
 EventNotifier interrupt; /* eventfd triggered on interrupt */
 struct VFIODevice *vdev; /* back pointer to device */
@@ -169,6 +179,8 @@ static QLIST_HEAD(, VFIOGroup)
 
 static void vfio_disable_interrupts(VFIODevice *vdev);
 static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
+static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
+  uint32_t val, int len);
 static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
 
 /*
@@ -1059,6 +1071,700 @@ static const MemoryRegionOps vfio_vga_ops = {
 };
 
 /*
+ * Device specific quirks
+ */
+
+#define PCI_VENDOR_ID_ATI   0x1002
+
+/*
+ * Device 1002:68f9 (Advanced Micro Devices [AMD] nee ATI Cedar PRO [Radeon
+ * HD 5450/6350]) reports the upper byte of the physical address of the
+ * I/O port BAR4 through VGA register 0x3c3.  The BAR is 256 bytes, so the
+ * lower byte is known to be zero.  Probing for this quirk reads 0xff from
+ * port 0x3c3 on some devices so we store the physical address and replace
+ * reads with the virtual address any time it matches.  XXX Research when
+ * to enable quirk.
+ */
+static uint64_t vfio_ati_3c3_quirk_read(void *opaque,
+hwaddr addr, unsigned size)
+{
+VFIOQuirk *quirk = opaque;
+VFIODevice *vdev = quirk-vdev;
+PCIDevice *pdev = vdev-pdev;
+uint64_t data = vfio_vga_read(vdev-vga.region[QEMU_PCI_VGA_IO_HI],
+  addr + 0x3, size);
+
+if (data == quirk-data) {
+data = pci_get_byte(pdev-config + PCI_BASE_ADDRESS_4 + 1);
+DPRINTF(%s(0x3c3, 1) = 0x%PRIx64\n, __func__, data);
+}
+
+return data;
+}
+
+static const MemoryRegionOps vfio_ati_3c3_quirk = {
+.read = vfio_ati_3c3_quirk_read,
+.endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void vfio_vga_probe_ati_3c3_quirk(VFIODevice *vdev)
+{
+PCIDevice *pdev = vdev-pdev;
+off_t physoffset = vdev-config_offset + PCI_BASE_ADDRESS_4;
+uint32_t physbar;
+VFIOQuirk *quirk;
+
+if (pci_get_word(pdev-config + PCI_VENDOR_ID) != PCI_VENDOR_ID_ATI ||
+vdev-bars[4].size  256) {
+return;
+}
+
+/* Get I/O port BAR physical address */
+if (pread(vdev-fd, physbar, 4, physoffset) != 4) {
+

[PATCH 6/9] vfio-pci: Add extra debugging

2013-04-01 Thread Alex Williamson
Often when debugging it's useful to be able to disable bypass paths
so no interactions with the device are missed.  Add some extra debug
options to do this.  Also add device info on read/write BAR accesses,
which is useful when debugging more than one assigned device.  A
couple DPRINTFs also had redundant vfio: prefixes.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 hw/vfio_pci.c |   40 ++--
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index e96f141..b913ec0 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -48,6 +48,10 @@
 do { } while (0)
 #endif
 
+/* Extra debugging, trap acceleration paths for more logging */
+#define VFIO_ALLOW_MMAP 1
+#define VFIO_ALLOW_KVM_INTX 1
+
 struct VFIODevice;
 
 typedef struct VFIOQuirk {
@@ -305,7 +309,7 @@ static void vfio_enable_intx_kvm(VFIODevice *vdev)
 int ret, argsz;
 int32_t *pfd;
 
-if (!kvm_irqfds_enabled() ||
+if (!VFIO_ALLOW_KVM_INTX || !kvm_irqfds_enabled() ||
 vdev-intx.route.mode != PCI_INTX_ENABLED ||
 !kvm_check_extension(kvm_state, KVM_CAP_IRQFD_RESAMPLE)) {
 return;
@@ -925,8 +929,16 @@ static void vfio_bar_write(void *opaque, hwaddr addr,
  __func__, addr, data, size);
 }
 
-DPRINTF(%s(BAR%d+0x%HWADDR_PRIx, 0x%PRIx64, %d)\n,
-__func__, bar-nr, addr, data, size);
+#ifdef DEBUG_VFIO
+{
+VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar-nr]);
+
+DPRINTF(%s(%04x:%02x:%02x.%x:BAR%d+0x%HWADDR_PRIx, 0x%PRIx64
+, %d)\n, __func__, vdev-host.domain, vdev-host.bus,
+vdev-host.slot, vdev-host.function, bar-nr, addr,
+data, size);
+}
+#endif
 
 /*
  * A read or write to a BAR always signals an INTx EOI.  This will
@@ -972,8 +984,16 @@ static uint64_t vfio_bar_read(void *opaque,
 break;
 }
 
-DPRINTF(%s(BAR%d+0x%HWADDR_PRIx, %d) = 0x%PRIx64\n,
-__func__, bar-nr, addr, size, data);
+#ifdef DEBUG_VFIO
+{
+VFIODevice *vdev = container_of(bar, VFIODevice, bars[bar-nr]);
+
+DPRINTF(%s(%04x:%02x:%02x.%x:BAR%d+0x%HWADDR_PRIx
+, %d) = 0x%PRIx64\n, __func__, vdev-host.domain,
+vdev-host.bus, vdev-host.slot, vdev-host.function,
+bar-nr, addr, size, data);
+}
+#endif
 
 /* Same as write above */
 vfio_eoi(container_of(bar, VFIODevice, bars[bar-nr]));
@@ -1917,7 +1937,7 @@ static void vfio_listener_region_add(MemoryListener 
*listener,
 int ret;
 
 if (vfio_listener_skipped_section(section)) {
-DPRINTF(vfio: SKIPPING region_add %HWADDR_PRIx - %PRIx64\n,
+DPRINTF(SKIPPING region_add %HWADDR_PRIx - %PRIx64\n,
 section-offset_within_address_space,
 section-offset_within_address_space + section-size - 1);
 return;
@@ -1941,7 +1961,7 @@ static void vfio_listener_region_add(MemoryListener 
*listener,
 section-offset_within_region +
 (iova - section-offset_within_address_space);
 
-DPRINTF(vfio: region_add %HWADDR_PRIx - %HWADDR_PRIx [%p]\n,
+DPRINTF(region_add %HWADDR_PRIx - %HWADDR_PRIx [%p]\n,
 iova, end - 1, vaddr);
 
 ret = vfio_dma_map(container, iova, end - iova, vaddr, section-readonly);
@@ -1961,7 +1981,7 @@ static void vfio_listener_region_del(MemoryListener 
*listener,
 int ret;
 
 if (vfio_listener_skipped_section(section)) {
-DPRINTF(vfio: SKIPPING region_del %HWADDR_PRIx - %PRIx64\n,
+DPRINTF(SKIPPING region_del %HWADDR_PRIx - %PRIx64\n,
 section-offset_within_address_space,
 section-offset_within_address_space + section-size - 1);
 return;
@@ -1981,7 +2001,7 @@ static void vfio_listener_region_del(MemoryListener 
*listener,
 return;
 }
 
-DPRINTF(vfio: region_del %HWADDR_PRIx - %HWADDR_PRIx\n,
+DPRINTF(region_del %HWADDR_PRIx - %HWADDR_PRIx\n,
 iova, end - 1);
 
 ret = vfio_dma_unmap(container, iova, end - iova);
@@ -2184,7 +2204,7 @@ static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, 
MemoryRegion *submem,
 {
 int ret = 0;
 
-if (size  bar-flags  VFIO_REGION_INFO_FLAG_MMAP) {
+if (VFIO_ALLOW_MMAP  size  bar-flags  VFIO_REGION_INFO_FLAG_MMAP) {
 int prot = 0;
 
 if (bar-flags  VFIO_REGION_INFO_FLAG_READ) {

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/9] vfio-pci: Move devices to D0 on reset

2013-04-01 Thread Alex Williamson
Guests may leave devices in a low power state at reboot, but we expect
devices to be woken up for the next boot.  Make this happen.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 hw/vfio_pci.c |   23 +++
 1 file changed, 23 insertions(+)

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index b913ec0..d310730 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -160,6 +160,7 @@ typedef struct VFIODevice {
 uint32_t features;
 #define VFIO_FEATURE_ENABLE_VGA_BIT 0
 #define VFIO_FEATURE_ENABLE_VGA (1  VFIO_FEATURE_ENABLE_VGA_BIT)
+uint8_t pm_cap;
 bool reset_works;
 bool has_vga;
 } VFIODevice;
@@ -2534,6 +2535,8 @@ static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
 case PCI_CAP_ID_MSIX:
 ret = vfio_setup_msix(vdev, pos);
 break;
+case PCI_CAP_ID_PM:
+vdev-pm_cap = pos;
 default:
 ret = pci_add_capability(pdev, cap_id, pos, size);
 break;
@@ -3108,6 +3111,26 @@ static void vfio_pci_reset(DeviceState *dev)
 
 vfio_disable_interrupts(vdev);
 
+/* Make sure the device is in D0 */
+if (vdev-pm_cap) {
+uint16_t pmcsr;
+uint8_t state;
+
+pmcsr = vfio_pci_read_config(pdev, vdev-pm_cap + PCI_PM_CTRL, 2);
+state = pmcsr  PCI_PM_CTRL_STATE_MASK;
+if (state) {
+pmcsr = ~PCI_PM_CTRL_STATE_MASK;
+vfio_pci_write_config(pdev, vdev-pm_cap + PCI_PM_CTRL, pmcsr, 2);
+/* vfio handles the necessary delay here */
+pmcsr = vfio_pci_read_config(pdev, vdev-pm_cap + PCI_PM_CTRL, 2);
+state = pmcsr  PCI_PM_CTRL_STATE_MASK;
+if (state) {
+error_report(vfio: Unable to power on device, stuck in D%d\n,
+ state);
+}
+}
+}
+
 /*
  * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master.
  * Also put INTx Disable in known state.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[GIT PULL 0/9] vfio-pci updates VGA support

2013-04-01 Thread Alex Williamson
Hi Anthony,

The following changes since commit 174d4d215fb49b4d43196e62f22c2533431b260e:

  tcg/mips: Implement muls2_i32 (2013-04-01 18:49:17 +0200)

are available in the git repository at:

  git://github.com/awilliam/qemu-vfio.git tags/vfio-pci-for-qemu-20130401.0

for you to fetch changes up to 6dcfdbad69aa510bc87b4a2585a597e028ca4eaa:

  vfio: cleanup includes (2013-04-01 13:35:40 -0600)



Linux kernel v3.9 adds a VGA interface to vfio allowing userspace
access to legacy VGA address space.  This series takes advantage
of that to enable assignment of graphics cards.  However, as found
in patch 5, graphics cards are not just regular PCI devices.  Both
ATI/AMD and Nvidia cards have their fair share of back doors to get
to PCI config space.  The quirks here handle redirecting backdoor
config space and methods to determine physical addresses through
emulated config space from the guest vCPU.  Still a concern is GPU
access to the same as we have no way to virtualize such accesses.
With this in-tree as an experimental feature we can hopefully get
some user reports on whether this is sufficient or if we'll need to
adopt an identity mapping solution for full support.  Note that only
secondary, non-IGD graphics have been tested.  For Q35 an updated
BIOS from SeaBIOS is recommended and the graphics card should ideally
be exposed under a PCIe root port for the guest.

Also included in this series is the previously posted v3.9-rc2 linux
headers update.  I include it here because the only modified file is
for vfio.  Also add trivial bootindex support for vfio-pci, add PCIe
device type mangling to better support Q35, restore devices to power
state D0 on reset, and cleanup debug prints.

Thanks,
Alex


Alex Williamson (9):
  linux-headers: Update to v3.9-rc2
  vfio-pci: Generalize PCI config mangling
  vfio-pci: Add PCIe capability mangling based on bus type
  qemu vfio-pci: Add support for VGA MMIO and I/O port access
  qemu vfio-pci: Graphics device quirks
  vfio-pci: Add extra debugging
  vfio-pci: Move devices to D0 on reset
  vfio: Add bootindex support
  vfio: cleanup includes


 hw/vfio_pci.c  | 1174 ++--
 linux-headers/linux/vfio.h |9 
 2 files changed, 1128 insertions(+), 55 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 8/9] vfio: Add bootindex support

2013-04-01 Thread Alex Williamson
Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 hw/vfio_pci.c |5 +
 1 file changed, 5 insertions(+)

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index d310730..0d2a445 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -30,6 +30,7 @@
 #include qemu/event_notifier.h
 #include exec/address-spaces.h
 #include sysemu/kvm.h
+#include sysemu/sysemu.h
 #include exec/memory.h
 #include hw/pci/msi.h
 #include hw/pci/msix.h
@@ -160,6 +161,7 @@ typedef struct VFIODevice {
 uint32_t features;
 #define VFIO_FEATURE_ENABLE_VGA_BIT 0
 #define VFIO_FEATURE_ENABLE_VGA (1  VFIO_FEATURE_ENABLE_VGA_BIT)
+int32_t bootindex;
 uint8_t pm_cap;
 bool reset_works;
 bool has_vga;
@@ -3070,6 +3072,8 @@ static int vfio_initfn(PCIDevice *pdev)
 }
 }
 
+add_boot_device_path(vdev-bootindex, pdev-qdev, NULL);
+
 return 0;
 
 out_teardown:
@@ -3157,6 +3161,7 @@ static Property vfio_pci_dev_properties[] = {
intx.mmap_timeout, 1100),
 DEFINE_PROP_BIT(x-vga, VFIODevice, features,
 VFIO_FEATURE_ENABLE_VGA_BIT, false),
+DEFINE_PROP_INT32(bootindex, VFIODevice, bootindex, -1),
 /*
  * TODO - support passed fds... is this necessary?
  * DEFINE_PROP_STRING(vfiofd, VFIODevice, vfiofd_name),

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 9/9] vfio: cleanup includes

2013-04-01 Thread Alex Williamson
Starting to get messy, put them back in alphabetical order.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 hw/vfio_pci.c |   10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index 0d2a445..693a9ff 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -19,26 +19,26 @@
  */
 
 #include dirent.h
-#include unistd.h
+#include linux/vfio.h
 #include sys/ioctl.h
 #include sys/mman.h
 #include sys/stat.h
 #include sys/types.h
-#include linux/vfio.h
+#include unistd.h
 
 #include config.h
-#include qemu/event_notifier.h
 #include exec/address-spaces.h
-#include sysemu/kvm.h
-#include sysemu/sysemu.h
 #include exec/memory.h
 #include hw/pci/msi.h
 #include hw/pci/msix.h
 #include hw/pci/pci.h
 #include qemu-common.h
 #include qemu/error-report.h
+#include qemu/event_notifier.h
 #include qemu/queue.h
 #include qemu/range.h
+#include sysemu/kvm.h
+#include sysemu/sysemu.h
 
 /* #define DEBUG_VFIO */
 #ifdef DEBUG_VFIO

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/9] qemu vfio-pci: Add support for VGA MMIO and I/O port access

2013-04-01 Thread Alex Williamson
Most VGA cards need some kind of quirk to fully operate since they
hide backdoors to get to other registers outside of PCI config space
within the registers, but this provides the base infrastructure.  If
we could identity map PCI resources for assigned devices we would need
a lot fewer quirks.

To enable this, use a kernel side vfio-pci driver that incorporates
VGA support (v3.9), and use the -vga none option and add the x-vga=on
option for the vfio-pci device.  The x- denotes this as an
experimental feature.  You may also need to use a cached copy of the
VGA BIOS for your device, passing it to vfio-pci using the romfile=
option.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 hw/vfio_pci.c |  169 +
 1 file changed, 169 insertions(+)

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index 0f74dbb..ff415a6 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -59,6 +59,18 @@ typedef struct VFIOBAR {
 uint8_t nr; /* cache the BAR number for debug */
 } VFIOBAR;
 
+typedef struct VFIOVGARegion {
+MemoryRegion mem;
+off_t offset;
+int nr;
+} VFIOVGARegion;
+
+typedef struct VFIOVGA {
+off_t fd_offset;
+int fd;
+VFIOVGARegion region[QEMU_PCI_VGA_NUM_REGIONS];
+} VFIOVGA;
+
 typedef struct VFIOINTx {
 bool pending; /* interrupt pending */
 bool kvm_accel; /* set when QEMU bypass through KVM enabled */
@@ -127,10 +139,15 @@ typedef struct VFIODevice {
 int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
 int interrupt; /* Current interrupt type */
 VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
+VFIOVGA vga; /* 0xa, 0x3b0, 0x3c0 */
 PCIHostDeviceAddress host;
 QLIST_ENTRY(VFIODevice) next;
 struct VFIOGroup *group;
+uint32_t features;
+#define VFIO_FEATURE_ENABLE_VGA_BIT 0
+#define VFIO_FEATURE_ENABLE_VGA (1  VFIO_FEATURE_ENABLE_VGA_BIT)
 bool reset_works;
+bool has_vga;
 } VFIODevice;
 
 typedef struct VFIOGroup {
@@ -958,6 +975,89 @@ static const MemoryRegionOps vfio_bar_ops = {
 .endianness = DEVICE_LITTLE_ENDIAN,
 };
 
+static void vfio_vga_write(void *opaque, hwaddr addr,
+   uint64_t data, unsigned size)
+{
+VFIOVGARegion *region = opaque;
+VFIOVGA *vga = container_of(region, VFIOVGA, region[region-nr]);
+union {
+uint8_t byte;
+uint16_t word;
+uint32_t dword;
+uint64_t qword;
+} buf;
+off_t offset = vga-fd_offset + region-offset + addr;
+
+switch (size) {
+case 1:
+buf.byte = data;
+break;
+case 2:
+buf.word = cpu_to_le16(data);
+break;
+case 4:
+buf.dword = cpu_to_le32(data);
+break;
+default:
+hw_error(vfio: unsupported write size, %d bytes\n, size);
+break;
+}
+
+if (pwrite(vga-fd, buf, size, offset) != size) {
+error_report(%s(,0x%HWADDR_PRIx, 0x%PRIx64, %d) failed: %m,
+ __func__, region-offset + addr, data, size);
+}
+
+DPRINTF(%s(0x%HWADDR_PRIx, 0x%PRIx64, %d)\n,
+__func__, region-offset + addr, data, size);
+}
+
+static uint64_t vfio_vga_read(void *opaque, hwaddr addr, unsigned size)
+{
+VFIOVGARegion *region = opaque;
+VFIOVGA *vga = container_of(region, VFIOVGA, region[region-nr]);
+union {
+uint8_t byte;
+uint16_t word;
+uint32_t dword;
+uint64_t qword;
+} buf;
+uint64_t data = 0;
+off_t offset = vga-fd_offset + region-offset + addr;
+
+if (pread(vga-fd, buf, size, offset) != size) {
+error_report(%s(,0x%HWADDR_PRIx, %d) failed: %m,
+ __func__, region-offset + addr, size);
+return (uint64_t)-1;
+}
+
+switch (size) {
+case 1:
+data = buf.byte;
+break;
+case 2:
+data = le16_to_cpu(buf.word);
+break;
+case 4:
+data = le32_to_cpu(buf.dword);
+break;
+default:
+hw_error(vfio: unsupported read size, %d bytes\n, size);
+break;
+}
+
+DPRINTF(%s(0x%HWADDR_PRIx, %d) = 0x%PRIx64\n,
+__func__, region-offset + addr, size, data);
+
+return data;
+}
+
+static const MemoryRegionOps vfio_vga_ops = {
+.read = vfio_vga_read,
+.write = vfio_vga_write,
+.endianness = DEVICE_LITTLE_ENDIAN,
+};
+
 /*
  * PCI config space
  */
@@ -1478,6 +1578,28 @@ static void vfio_map_bars(VFIODevice *vdev)
 for (i = 0; i  PCI_ROM_SLOT; i++) {
 vfio_map_bar(vdev, i);
 }
+
+if (vdev-has_vga) {
+memory_region_init_io(vdev-vga.region[QEMU_PCI_VGA_MEM].mem,
+  vfio_vga_ops,
+  vdev-vga.region[QEMU_PCI_VGA_MEM],
+  vfio-vga-mmio@0xa,
+  QEMU_PCI_VGA_MEM_SIZE);
+memory_region_init_io(vdev-vga.region[QEMU_PCI_VGA_IO_LO].mem,
+  vfio_vga_ops,
+  

[RFC PATCH v2 0/6] device control and in-kernel MPIC

2013-04-01 Thread Scott Wood
v2 addresses some requested changes, such as the use of a file descriptor
instead of an ad-hoc handle array, and the use of an enableable
IRQ-type-specific capability to bind the vcpu to a particular MPIC device
(among other things, this allows the notifier patch to go away).

Some other requested improvements, such as support for the standard
KVM_IRQ_LINE interface and splitting up the in-kernel MPIC emulation
patch, will be addressed in a later revision.

Scott Wood (6):
  kvm: add device control API
  kvm/ppc/mpic: import hw/openpic.c from QEMU
  kvm/ppc/mpic: remove some obviously unneeded code
  kvm/ppc/mpic: adapt to kernel style and environment
  kvm/ppc/mpic: in-kernel MPIC emulation
  kvm/ppc/mpic: add KVM_CAP_IRQ_MPIC

 Documentation/virtual/kvm/api.txt  |   78 ++
 Documentation/virtual/kvm/devices/README   |1 +
 Documentation/virtual/kvm/devices/mpic.txt |   37 +
 arch/powerpc/include/asm/kvm_host.h|   16 +-
 arch/powerpc/include/asm/kvm_ppc.h |8 +
 arch/powerpc/kvm/Kconfig   |5 +
 arch/powerpc/kvm/Makefile  |2 +
 arch/powerpc/kvm/booke.c   |   12 +-
 arch/powerpc/kvm/mpic.c| 1786 
 arch/powerpc/kvm/powerpc.c |   38 +-
 include/linux/kvm_host.h   |2 +
 include/uapi/linux/kvm.h   |   37 +
 virt/kvm/kvm_main.c|   40 +
 13 files changed, 2052 insertions(+), 10 deletions(-)
 create mode 100644 Documentation/virtual/kvm/devices/README
 create mode 100644 Documentation/virtual/kvm/devices/mpic.txt
 create mode 100644 arch/powerpc/kvm/mpic.c

-- 
1.7.9.5


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH v2 3/6] kvm/ppc/mpic: remove some obviously unneeded code

2013-04-01 Thread Scott Wood
Remove some parts of the code that are obviously QEMU or Raven specific
before fixing style issues, to reduce the style issues that need to be
fixed.

Signed-off-by: Scott Wood scottw...@freescale.com
---
 arch/powerpc/kvm/mpic.c |  344 ---
 1 file changed, 344 deletions(-)

diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 57655b9..d6d70a4 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -22,39 +22,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-/*
- *
- * Based on OpenPic implementations:
- * - Intel GW80314 I/O companion chip developer's manual
- * - Motorola MPC8245  MPC8540 user manuals.
- * - Motorola MCP750 (aka Raven) programmer manual.
- * - Motorola Harrier programmer manuel
- *
- * Serial interrupts, as implemented in Raven chipset are not supported yet.
- *
- */
-#include hw.h
-#include ppc/mac.h
-#include pci/pci.h
-#include openpic.h
-#include sysbus.h
-#include pci/msi.h
-#include qemu/bitops.h
-#include ppc.h
-
-//#define DEBUG_OPENPIC
-
-#ifdef DEBUG_OPENPIC
-static const int debug_openpic = 1;
-#else
-static const int debug_openpic = 0;
-#endif
-
-#define DPRINTF(fmt, ...) do { \
-if (debug_openpic) { \
-printf(fmt , ## __VA_ARGS__); \
-} \
-} while (0)
 
 #define MAX_CPU 32
 #define MAX_SRC 256
@@ -82,21 +49,6 @@ static const int debug_openpic = 0;
 #define OPENPIC_CPU_REG_START0x2
 #define OPENPIC_CPU_REG_SIZE 0x100 + ((MAX_CPU - 1) * 0x1000)
 
-/* Raven */
-#define RAVEN_MAX_CPU  2
-#define RAVEN_MAX_EXT 48
-#define RAVEN_MAX_IRQ 64
-#define RAVEN_MAX_TMR  MAX_TMR
-#define RAVEN_MAX_IPI  MAX_IPI
-
-/* Interrupt definitions */
-#define RAVEN_FE_IRQ (RAVEN_MAX_EXT)   /* Internal functional IRQ */
-#define RAVEN_ERR_IRQ(RAVEN_MAX_EXT + 1)   /* Error IRQ */
-#define RAVEN_TMR_IRQ(RAVEN_MAX_EXT + 2)   /* First timer IRQ */
-#define RAVEN_IPI_IRQ(RAVEN_TMR_IRQ + RAVEN_MAX_TMR)   /* First IPI 
IRQ */
-/* First doorbell IRQ */
-#define RAVEN_DBL_IRQ(RAVEN_IPI_IRQ + (RAVEN_MAX_CPU * RAVEN_MAX_IPI))
-
 typedef struct FslMpicInfo {
int max_ext;
 } FslMpicInfo;
@@ -138,44 +90,6 @@ static FslMpicInfo fsl_mpic_42 = {
 #define ILR_INTTGT_CINT   0x01 /* critical */
 #define ILR_INTTGT_MCP0x02 /* machine check */
 
-/* The currently supported INTTGT values happen to be the same as QEMU's
- * openpic output codes, but don't depend on this.  The output codes
- * could change (unlikely, but...) or support could be added for
- * more INTTGT values.
- */
-static const int inttgt_output[][2] = {
-   {ILR_INTTGT_INT, OPENPIC_OUTPUT_INT},
-   {ILR_INTTGT_CINT, OPENPIC_OUTPUT_CINT},
-   {ILR_INTTGT_MCP, OPENPIC_OUTPUT_MCK},
-};
-
-static int inttgt_to_output(int inttgt)
-{
-   int i;
-
-   for (i = 0; i  ARRAY_SIZE(inttgt_output); i++) {
-   if (inttgt_output[i][0] == inttgt) {
-   return inttgt_output[i][1];
-   }
-   }
-
-   fprintf(stderr, %s: unsupported inttgt %d\n, __func__, inttgt);
-   return OPENPIC_OUTPUT_INT;
-}
-
-static int output_to_inttgt(int output)
-{
-   int i;
-
-   for (i = 0; i  ARRAY_SIZE(inttgt_output); i++) {
-   if (inttgt_output[i][1] == output) {
-   return inttgt_output[i][0];
-   }
-   }
-
-   abort();
-}
-
 #define MSIIR_OFFSET   0x140
 #define MSIIR_SRS_SHIFT29
 #define MSIIR_SRS_MASK (0x7  MSIIR_SRS_SHIFT)
@@ -1265,228 +1179,36 @@ static uint64_t openpic_cpu_read(void *opaque, hwaddr 
addr, unsigned len)
return openpic_cpu_read_internal(opaque, addr, (addr  0x1f000)  12);
 }
 
-static const MemoryRegionOps openpic_glb_ops_le = {
-   .write = openpic_gbl_write,
-   .read = openpic_gbl_read,
-   .endianness = DEVICE_LITTLE_ENDIAN,
-   .impl = {
-.min_access_size = 4,
-.max_access_size = 4,
-},
-};
-
 static const MemoryRegionOps openpic_glb_ops_be = {
.write = openpic_gbl_write,
.read = openpic_gbl_read,
-   .endianness = DEVICE_BIG_ENDIAN,
-   .impl = {
-.min_access_size = 4,
-.max_access_size = 4,
-},
-};
-
-static const MemoryRegionOps openpic_tmr_ops_le = {
-   .write = openpic_tmr_write,
-   .read = openpic_tmr_read,
-   .endianness = DEVICE_LITTLE_ENDIAN,
-   .impl = {
-.min_access_size = 4,
-.max_access_size = 4,
-},
 };
 
 static const MemoryRegionOps openpic_tmr_ops_be = {
.write = openpic_tmr_write,
.read = openpic_tmr_read,
-   .endianness = DEVICE_BIG_ENDIAN,
-   .impl = {
-.min_access_size = 4,
-.max_access_size = 4,
-},
-};
-
-static const MemoryRegionOps openpic_cpu_ops_le = {
-   

[RFC PATCH v2 5/6] kvm/ppc/mpic: in-kernel MPIC emulation

2013-04-01 Thread Scott Wood
Hook the MPIC code up to the KVM interfaces, add locking, etc.

TODO: irqfd support, split up into multiple patches, KVM_IRQ_LINE
support

Signed-off-by: Scott Wood scottw...@freescale.com
---
 Documentation/virtual/kvm/devices/mpic.txt |   37 ++
 arch/powerpc/include/asm/kvm_host.h|8 +-
 arch/powerpc/include/asm/kvm_ppc.h |4 +
 arch/powerpc/kvm/Kconfig   |5 +
 arch/powerpc/kvm/Makefile  |2 +
 arch/powerpc/kvm/booke.c   |   10 +-
 arch/powerpc/kvm/mpic.c|  816 +---
 arch/powerpc/kvm/powerpc.c |   12 +-
 include/linux/kvm_host.h   |2 +
 include/uapi/linux/kvm.h   |9 +
 virt/kvm/kvm_main.c|9 +
 11 files changed, 713 insertions(+), 201 deletions(-)
 create mode 100644 Documentation/virtual/kvm/devices/mpic.txt

diff --git a/Documentation/virtual/kvm/devices/mpic.txt 
b/Documentation/virtual/kvm/devices/mpic.txt
new file mode 100644
index 000..79e000a
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/mpic.txt
@@ -0,0 +1,37 @@
+MPIC interrupt controller
+=
+
+Device types supported:
+  KVM_DEV_TYPE_FSL_MPIC_20 Freescale MPIC v2.0
+  KVM_DEV_TYPE_FSL_MPIC_42 Freescale MPIC v4.2
+
+Only one MPIC instance, of any type, may be instantiated.  The created
+MPIC will act as the system interrupt controller, connecting to each
+vcpu's interrupt inputs.
+
+Groups:
+  KVM_DEV_MPIC_GRP_MISC
+  Attributes:
+KVM_DEV_MPIC_BASE_ADDR (rw, 64-bit)
+  Base address of the 256 KiB MPIC register space.  Must be
+  naturally aligned.  A value of zero disables the mapping.
+  Reset value is zero.
+
+  KVM_DEV_MPIC_GRP_REGISTER (rw, 32-bit)
+Access an MPIC register, as if the access were made from the guest. 
+attr is the byte offset into the MPIC register space.  Accesses
+must be 4-byte aligned.
+
+MSIs may be signaled by using this attribute group to write
+to the relevant MSIIR.
+
+  KVM_DEV_MPIC_GRP_IRQ_ACTIVE (rw, 32-bit)
+IRQ input line for each standard openpic source.  0 is inactive and 1
+is active, regardless of interrupt sense.
+
+For edge-triggered interrupts:  Writing 1 is considered an activating
+edge, and writing 0 is ignored.  Reading returns 1 if a previously
+signaled edge has not been acknowledged, and 0 otherwise.
+
+attr is the IRQ number.  IRQ numbers for standard sources are the
+byte offset of the relevant IVPR from EIVPR0, divided by 32.
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index e0caae2..6713327 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -359,6 +359,11 @@ struct kvmppc_slb {
 #define KVMPPC_BOOKE_MAX_IAC   4
 #define KVMPPC_BOOKE_MAX_DAC   2
 
+/* KVMPPC_EPR_USER takes precedence over KVMPPC_EPR_KERNEL */
+#define KVMPPC_EPR_NONE0 /* EPR not supported */
+#define KVMPPC_EPR_USER1 /* exit to userspace to fill EPR */
+#define KVMPPC_EPR_KERNEL  2 /* in-kernel irqchip */
+
 struct kvmppc_booke_debug_reg {
u32 dbcr0;
u32 dbcr1;
@@ -525,7 +530,7 @@ struct kvm_vcpu_arch {
u8 sane;
u8 cpu_type;
u8 hcall_needed;
-   u8 epr_enabled;
+   u8 epr_flags; /* KVMPPC_EPR_xxx */
u8 epr_needed;
 
u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
@@ -595,5 +600,6 @@ struct kvm_vcpu_arch {
 #define KVM_MMIO_REG_FQPR  0x0060
 
 #define __KVM_HAVE_ARCH_WQP
+#define __KVM_HAVE_CREATE_DEVICE
 
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index f44932c..20b2a5e 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -164,6 +164,8 @@ extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
 
 extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
 
+int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
@@ -270,6 +272,8 @@ static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, 
u32 epr)
 #endif
 }
 
+void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu);
+
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
  struct kvm_config_tlb *cfg);
 int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 63c67ec..a87139b 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -151,6 +151,11 @@ config KVM_E500MC
 
  If unsure, say N.
 
+config KVM_MPIC
+   bool KVM in-kernel MPIC emulation
+   depends on KVM
+
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile 

[RFC PATCH v2 2/6] kvm/ppc/mpic: import hw/openpic.c from QEMU

2013-04-01 Thread Scott Wood
This is QEMU's hw/openpic.c from commit
abd8d4a4d6dfea7ddea72f095f993e1de941614e (Update version for
1.4.0-rc0), run through Lindent with no other changes to ease merging
future changes between Linux and QEMU.  Remaining style issues
(including those introduced by Lindent) will be fixed in a later patch.

Signed-off-by: Scott Wood scottw...@freescale.com
---
 arch/powerpc/kvm/mpic.c | 1686 +++
 1 file changed, 1686 insertions(+)
 create mode 100644 arch/powerpc/kvm/mpic.c

diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
new file mode 100644
index 000..57655b9
--- /dev/null
+++ b/arch/powerpc/kvm/mpic.c
@@ -0,0 +1,1686 @@
+/*
+ * OpenPIC emulation
+ *
+ * Copyright (c) 2004 Jocelyn Mayer
+ *   2011 Alexander Graf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the Software), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+/*
+ *
+ * Based on OpenPic implementations:
+ * - Intel GW80314 I/O companion chip developer's manual
+ * - Motorola MPC8245  MPC8540 user manuals.
+ * - Motorola MCP750 (aka Raven) programmer manual.
+ * - Motorola Harrier programmer manuel
+ *
+ * Serial interrupts, as implemented in Raven chipset are not supported yet.
+ *
+ */
+#include hw.h
+#include ppc/mac.h
+#include pci/pci.h
+#include openpic.h
+#include sysbus.h
+#include pci/msi.h
+#include qemu/bitops.h
+#include ppc.h
+
+//#define DEBUG_OPENPIC
+
+#ifdef DEBUG_OPENPIC
+static const int debug_openpic = 1;
+#else
+static const int debug_openpic = 0;
+#endif
+
+#define DPRINTF(fmt, ...) do { \
+if (debug_openpic) { \
+printf(fmt , ## __VA_ARGS__); \
+} \
+} while (0)
+
+#define MAX_CPU 32
+#define MAX_SRC 256
+#define MAX_TMR 4
+#define MAX_IPI 4
+#define MAX_MSI 8
+#define MAX_IRQ (MAX_SRC + MAX_IPI + MAX_TMR)
+#define VID 0x03   /* MPIC version ID */
+
+/* OpenPIC capability flags */
+#define OPENPIC_FLAG_IDR_CRIT (1  0)
+#define OPENPIC_FLAG_ILR  (2  0)
+
+/* OpenPIC address map */
+#define OPENPIC_GLB_REG_START0x0
+#define OPENPIC_GLB_REG_SIZE 0x10F0
+#define OPENPIC_TMR_REG_START0x10F0
+#define OPENPIC_TMR_REG_SIZE 0x220
+#define OPENPIC_MSI_REG_START0x1600
+#define OPENPIC_MSI_REG_SIZE 0x200
+#define OPENPIC_SUMMARY_REG_START   0x3800
+#define OPENPIC_SUMMARY_REG_SIZE0x800
+#define OPENPIC_SRC_REG_START0x1
+#define OPENPIC_SRC_REG_SIZE (MAX_SRC * 0x20)
+#define OPENPIC_CPU_REG_START0x2
+#define OPENPIC_CPU_REG_SIZE 0x100 + ((MAX_CPU - 1) * 0x1000)
+
+/* Raven */
+#define RAVEN_MAX_CPU  2
+#define RAVEN_MAX_EXT 48
+#define RAVEN_MAX_IRQ 64
+#define RAVEN_MAX_TMR  MAX_TMR
+#define RAVEN_MAX_IPI  MAX_IPI
+
+/* Interrupt definitions */
+#define RAVEN_FE_IRQ (RAVEN_MAX_EXT)   /* Internal functional IRQ */
+#define RAVEN_ERR_IRQ(RAVEN_MAX_EXT + 1)   /* Error IRQ */
+#define RAVEN_TMR_IRQ(RAVEN_MAX_EXT + 2)   /* First timer IRQ */
+#define RAVEN_IPI_IRQ(RAVEN_TMR_IRQ + RAVEN_MAX_TMR)   /* First IPI 
IRQ */
+/* First doorbell IRQ */
+#define RAVEN_DBL_IRQ(RAVEN_IPI_IRQ + (RAVEN_MAX_CPU * RAVEN_MAX_IPI))
+
+typedef struct FslMpicInfo {
+   int max_ext;
+} FslMpicInfo;
+
+static FslMpicInfo fsl_mpic_20 = {
+   .max_ext = 12,
+};
+
+static FslMpicInfo fsl_mpic_42 = {
+   .max_ext = 12,
+};
+
+#define FRR_NIRQ_SHIFT16
+#define FRR_NCPU_SHIFT 8
+#define FRR_VID_SHIFT  0
+
+#define VID_REVISION_1_2   2
+#define VID_REVISION_1_3   3
+
+#define VIR_GENERIC  0x/* Generic Vendor ID */
+
+#define GCR_RESET0x8000
+#define GCR_MODE_PASS0x
+#define GCR_MODE_MIXED   0x2000
+#define GCR_MODE_PROXY   0x6000
+
+#define TBCR_CI   0x8000   /* count inhibit */
+#define TCCR_TOG  0x8000   /* toggles when decrement to zero */
+
+#define IDR_EP_SHIFT  31
+#define IDR_EP_MASK   (1  IDR_EP_SHIFT)

[RFC PATCH v2 4/6] kvm/ppc/mpic: adapt to kernel style and environment

2013-04-01 Thread Scott Wood
Remove braces that Linux style doesn't permit, remove space after
'*' that Lindent added, keep error/debug strings contiguous, etc.

Substitute type names, debug prints, etc.

Signed-off-by: Scott Wood scottw...@freescale.com
---
 arch/powerpc/kvm/mpic.c |  445 ++-
 1 file changed, 208 insertions(+), 237 deletions(-)

diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index d6d70a4..1df67ae 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -42,22 +42,22 @@
 #define OPENPIC_TMR_REG_SIZE 0x220
 #define OPENPIC_MSI_REG_START0x1600
 #define OPENPIC_MSI_REG_SIZE 0x200
-#define OPENPIC_SUMMARY_REG_START   0x3800
-#define OPENPIC_SUMMARY_REG_SIZE0x800
+#define OPENPIC_SUMMARY_REG_START0x3800
+#define OPENPIC_SUMMARY_REG_SIZE 0x800
 #define OPENPIC_SRC_REG_START0x1
 #define OPENPIC_SRC_REG_SIZE (MAX_SRC * 0x20)
 #define OPENPIC_CPU_REG_START0x2
-#define OPENPIC_CPU_REG_SIZE 0x100 + ((MAX_CPU - 1) * 0x1000)
+#define OPENPIC_CPU_REG_SIZE (0x100 + ((MAX_CPU - 1) * 0x1000))
 
-typedef struct FslMpicInfo {
+struct fsl_mpic_info {
int max_ext;
-} FslMpicInfo;
+};
 
-static FslMpicInfo fsl_mpic_20 = {
+static struct fsl_mpic_info fsl_mpic_20 = {
.max_ext = 12,
 };
 
-static FslMpicInfo fsl_mpic_42 = {
+static struct fsl_mpic_info fsl_mpic_42 = {
.max_ext = 12,
 };
 
@@ -100,44 +100,43 @@ static int get_current_cpu(void)
 {
CPUState *cpu_single_cpu;
 
-   if (!cpu_single_env) {
+   if (!cpu_single_env)
return -1;
-   }
 
cpu_single_cpu = ENV_GET_CPU(cpu_single_env);
return cpu_single_cpu-cpu_index;
 }
 
-static uint32_t openpic_cpu_read_internal(void *opaque, hwaddr addr, int idx);
-static void openpic_cpu_write_internal(void *opaque, hwaddr addr,
+static uint32_t openpic_cpu_read_internal(void *opaque, gpa_t addr, int idx);
+static void openpic_cpu_write_internal(void *opaque, gpa_t addr,
   uint32_t val, int idx);
 
-typedef enum IRQType {
+enum irq_type {
IRQ_TYPE_NORMAL = 0,
IRQ_TYPE_FSLINT,/* FSL internal interrupt -- level only */
IRQ_TYPE_FSLSPECIAL,/* FSL timer/IPI interrupt, edge, no polarity */
-} IRQType;
+};
 
-typedef struct IRQQueue {
+struct irq_queue {
/* Round up to the nearest 64 IRQs so that the queue length
 * won't change when moving between 32 and 64 bit hosts.
 */
unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63)  ~63)];
int next;
int priority;
-} IRQQueue;
+};
 
-typedef struct IRQSource {
+struct irq_source {
uint32_t ivpr;  /* IRQ vector/priority register */
uint32_t idr;   /* IRQ destination register */
uint32_t destmask;  /* bitmap of CPU destinations */
int last_cpu;
int output; /* IRQ level, e.g. OPENPIC_OUTPUT_INT */
int pending;/* TRUE if IRQ is pending */
-   IRQType type;
+   enum irq_type type;
bool level:1;   /* level-triggered */
-   bool nomask:1;  /* critical interrupts ignore mask on some FSL 
MPICs */
-} IRQSource;
+   bool nomask:1;  /* critical interrupts ignore mask on some FSL MPICs */
+};
 
 #define IVPR_MASK_SHIFT   31
 #define IVPR_MASK_MASK(1  IVPR_MASK_SHIFT)
@@ -158,22 +157,19 @@ typedef struct IRQSource {
 #define IDR_EP  0x8000 /* external pin */
 #define IDR_CI  0x4000 /* critical interrupt */
 
-typedef struct IRQDest {
+struct irq_dest {
int32_t ctpr;   /* CPU current task priority */
-   IRQQueue raised;
-   IRQQueue servicing;
+   struct irq_queue raised;
+   struct irq_queue servicing;
qemu_irq *irqs;
 
/* Count of IRQ sources asserting on non-INT outputs */
uint32_t outputs_active[OPENPIC_OUTPUT_NB];
-} IRQDest;
-
-typedef struct OpenPICState {
-   SysBusDevice busdev;
-   MemoryRegion mem;
+};
 
+struct openpic {
/* Behavior control */
-   FslMpicInfo *fsl;
+   struct fsl_mpic_info *fsl;
uint32_t model;
uint32_t flags;
uint32_t nb_irqs;
@@ -186,9 +182,6 @@ typedef struct OpenPICState {
uint32_t brr1;
uint32_t mpic_mode_mask;
 
-   /* Sub-regions */
-   MemoryRegion sub_io_mem[6];
-
/* Global registers */
uint32_t frr;   /* Feature reporting register */
uint32_t gcr;   /* Global configuration register  */
@@ -196,9 +189,9 @@ typedef struct OpenPICState {
uint32_t spve;  /* Spurious vector register */
uint32_t tfrr;  /* Timer frequency reporting register */
/* Source registers */
-   IRQSource src[MAX_IRQ];
+   struct irq_source src[MAX_IRQ];
/* Local registers per output pin */
-   IRQDest dst[MAX_CPU];
+   struct irq_dest 

[RFC PATCH v2 6/6] kvm/ppc/mpic: add KVM_CAP_IRQ_MPIC

2013-04-01 Thread Scott Wood
Enabling this capability connects the vcpu to the designated in-kernel
MPIC.  Using explicit connections between vcpus and irqchips allows
for flexibility, but the main benefit at the moment is that it
simplifies the code -- KVM doesn't need vm-global state to remember
which MPIC object is associated with this vm, and it doesn't need to
care about ordering between irqchip creation and vcpu creation.

Signed-off-by: Scott Wood scottw...@freescale.com
---
 Documentation/virtual/kvm/api.txt   |8 ++
 arch/powerpc/include/asm/kvm_host.h |   10 ---
 arch/powerpc/include/asm/kvm_ppc.h  |2 ++
 arch/powerpc/kvm/booke.c|4 ++-
 arch/powerpc/kvm/mpic.c |   49 +++
 arch/powerpc/kvm/powerpc.c  |   25 +++---
 include/uapi/linux/kvm.h|1 +
 7 files changed, 86 insertions(+), 13 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 77328aa..38f9b6d 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2728,3 +2728,11 @@ to receive the topmost interrupt vector.
 When disabled (args[0] == 0), behavior is as if this facility is unsupported.
 
 When this capability is enabled, KVM_EXIT_EPR can occur.
+
+6.6 KVM_CAP_IRQ_MPIC
+
+Architectures: ppc
+Parameters: args[0] is the MPIC device fd
+args[1] is the MPIC CPU number for this vcpu
+
+This capability connects the vcpu to an in-kernel MPIC device.
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 6713327..2a2e235 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -375,8 +375,10 @@ struct kvmppc_booke_debug_reg {
u64 dac[KVMPPC_BOOKE_MAX_DAC];
 };
 
-#define KVMPPC_IRQCHIP_NONE0
-#define KVMPPC_IRQCHIP_MPIC1
+#define KVMPPC_IRQ_DEFAULT 0
+#define KVMPPC_IRQ_MPIC1
+
+struct openpic;
 
 struct kvm_vcpu_arch {
ulong host_stack;
@@ -557,8 +559,8 @@ struct kvm_vcpu_arch {
unsigned long magic_page_pa; /* phys addr to map the magic page to */
unsigned long magic_page_ea; /* effect. addr to map the magic page to */
 
-   int irqchip_type;
-   void *irqchip_priv;
+   int irq_type;   /* one of KVM_IRQ_* */
+   struct openpic *mpic;   /* KVM_IRQ_MPIC */
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
struct kvm_vcpu_arch_shared shregs;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 20b2a5e..2cc18a4 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -273,6 +273,8 @@ static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, 
u32 epr)
 }
 
 void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu);
+int kvmppc_mpic_connect_vcpu(struct file *mpic_filp, struct kvm_vcpu *vcpu,
+u32 cpu);
 
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
  struct kvm_config_tlb *cfg);
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index cddc6b3..7d00222 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -430,8 +430,10 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu 
*vcpu,
if (update_epr == true) {
if (vcpu-arch.epr_flags  KVMPPC_EPR_USER)
kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
-   else if (vcpu-arch.epr_flags  KVMPPC_EPR_KERNEL)
+   else if (vcpu-arch.epr_flags  KVMPPC_EPR_KERNEL) {
+   BUG_ON(vcpu-arch.irq_type != KVMPPC_IRQ_MPIC);
kvmppc_mpic_set_epr(vcpu);
+   }
}
 
new_msr = msr_mask;
diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 9aace50..b790f47 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -1159,7 +1159,7 @@ static uint32_t openpic_iack(struct openpic *opp, struct 
irq_dest *dst,
 
 void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
 {
-   struct openpic *opp = vcpu-arch.irqchip_priv;
+   struct openpic *opp = vcpu-arch.mpic;
int cpu = vcpu-vcpu_id;
unsigned long flags;
 
@@ -1442,10 +1442,10 @@ static void map_mmio(struct openpic *opp)
 
 static void unmap_mmio(struct openpic *opp)
 {
-   BUG_ON(opp-mmio_mapped);
-   opp-mmio_mapped = false;
-
-   kvm_io_bus_unregister_dev(opp-kvm, KVM_MMIO_BUS, opp-mmio);
+   if (opp-mmio_mapped) {
+   opp-mmio_mapped = false;
+   kvm_io_bus_unregister_dev(opp-kvm, KVM_MMIO_BUS, opp-mmio);
+   }
 }
 
 static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr)
@@ -1683,6 +1683,45 @@ static const struct file_operations kvm_mpic_fops = {
.release = kvm_mpic_release,
 };
 
+int kvmppc_mpic_connect_vcpu(struct file *mpic_filp, struct kvm_vcpu *vcpu,
+

[RFC PATCH v2 1/6] kvm: add device control API

2013-04-01 Thread Scott Wood
Currently, devices that are emulated inside KVM are configured in a
hardcoded manner based on an assumption that any given architecture
only has one way to do it.  If there's any need to access device state,
it is done through inflexible one-purpose-only IOCTLs (e.g.
KVM_GET/SET_LAPIC).  Defining new IOCTLs for every little thing is
cumbersome and depletes a limited numberspace.

This API provides a mechanism to instantiate a device of a certain
type, returning an ID that can be used to set/get attributes of the
device.  Attributes may include configuration parameters (e.g.
register base address), device state, operational commands, etc.  It
is similar to the ONE_REG API, except that it acts on devices rather
than vcpus.

Both device types and individual attributes can be tested for support
without having to create the device or get/set the attribute, and without
the need to separately manage enumerated capabilities.

Signed-off-by: Scott Wood scottw...@freescale.com
---
 Documentation/virtual/kvm/api.txt|   70 ++
 Documentation/virtual/kvm/devices/README |1 +
 arch/powerpc/include/asm/kvm_host.h  |6 +++
 arch/powerpc/include/asm/kvm_ppc.h   |2 +
 arch/powerpc/kvm/powerpc.c   |7 +++
 include/uapi/linux/kvm.h |   27 
 virt/kvm/kvm_main.c  |   31 +
 7 files changed, 144 insertions(+)
 create mode 100644 Documentation/virtual/kvm/devices/README

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 976eb65..77328aa 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2173,6 +2173,76 @@ header; first `n_valid' valid entries with contents from 
the data
 written, then `n_invalid' invalid entries, invalidating any previously
 valid entries found.
 
+4.79 KVM_CREATE_DEVICE
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: vm ioctl
+Parameters: struct kvm_create_device (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  ENODEV: The device type is unknown or unsupported
+  EEXIST: Device already created, and this type of device may not
+  be instantiated multiple times
+  ENOSPC: Too many devices have been created
+
+  Other error conditions may be defined by individual device types.
+
+Creates an emulated device in the kernel.  The file descriptor returned
+in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR.
+
+If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the
+device type is supported (not necessarily whether it can be created
+in the current vm).
+
+Individual devices should not define flags.  Attributes should be used
+for specifying any behavior that is not implied by the device type
+number.
+
+struct kvm_create_device {
+   __u32   type;   /* in: KVM_DEV_TYPE_xxx */
+   __u32   fd; /* out: device handle */
+   __u32   flags;  /* in: KVM_CREATE_DEVICE_xxx */
+};
+
+4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: device ioctl
+Parameters: struct kvm_device_attr
+Returns: 0 on success, -1 on error
+Errors:
+  ENXIO:  The group or attribute is unknown/unsupported for this device
+  EPERM:  The attribute cannot (currently) be accessed this way
+  (e.g. read-only attribute, or attribute that only makes
+  sense when the device is in a different state)
+
+  Other error conditions may be defined by individual device types.
+
+Gets/sets a specified piece of device configuration and/or state.  The
+semantics are device-specific.  See individual device documentation in
+the devices directory.  As with ONE_REG, the size of the data
+transferred is defined by the particular attribute.
+
+struct kvm_device_attr {
+   __u32   flags;  /* no flags currently defined */
+   __u32   group;  /* device-defined */
+   __u64   attr;   /* group-defined */
+   __u64   addr;   /* userspace address of attr data */
+};
+
+4.81 KVM_HAS_DEVICE_ATTR
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: device ioctl
+Parameters: struct kvm_device_attr
+Returns: 0 on success, -1 on error
+Errors:
+  ENXIO:  The group or attribute is unknown/unsupported for this device
+
+Tests whether a device supports a particular attribute.  A successful
+return indicates the attribute is implemented.  It does not necessarily
+indicate that the attribute can be read or written in the device's
+current state.  addr is ignored.
 
 4.77 KVM_ARM_VCPU_INIT
 
diff --git a/Documentation/virtual/kvm/devices/README 
b/Documentation/virtual/kvm/devices/README
new file mode 100644
index 000..34a6983
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/README
@@ -0,0 +1 @@
+This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL.
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index e34f8fe..e0caae2 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ 

Re: [PATCH V2 2/2] tcm_vhost: Use vq-private_data to indicate if the endpoint is setup

2013-04-01 Thread Rusty Russell
Michael S. Tsirkin m...@redhat.com writes:
 Rusty's currently doing some reorgs of -net let's delay
 cleanups there to avoid stepping on each other's toys.
 Let's focus on scsi here.
 E.g. any chance framing assumptions can be fixed in 3.10?

I am waiting for your removal of the dma-complete ordering stuff in
vhost-net.

Cheers,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] virtio-pci: Add virtio_queue_valid checks ahead of virtio_queue_get_num

2013-04-01 Thread Nicholas A. Bellinger
On Sun, 2013-03-31 at 10:37 +0300, Michael S. Tsirkin wrote:
 On Fri, Mar 29, 2013 at 04:33:11AM +, Nicholas A. Bellinger wrote:
  From: Nicholas Bellinger n...@linux-iscsi.org
  
  This patch adds a number of virtio_queue_valid() checks to virtio-pci
  ahead of virtio_queue_get_num() usage in order to skip operation upon
  the detection of an uninitialized VQ.
  
  There is one exception in virtio_ioport_read():VIRTIO_PCI_QUEUE_NUM,
  where virtio_queue_get_num() may still be called without a valid
  vdev->vq[n].vring.desc physical address.
  
  Cc: Michael S. Tsirkin m...@redhat.com
  Cc: Asias He as...@redhat.com
  Cc: Paolo Bonzini pbonz...@redhat.com
  Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
 
 Makes sense. Minor nit: virtio_queue_valid calls virtio_queue_get_num
 internally, so we can drop it everywhere we know queue is valid.
 

Yes, of course.  This includes every location in virtio-pci.c below..

Including for patch-v2.

  ---
   hw/virtio-pci.c |   27 +++
   1 files changed, 27 insertions(+), 0 deletions(-)
  
  diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
  index 0d67b84..231ca0c 100644
  --- a/hw/virtio-pci.c
  +++ b/hw/virtio-pci.c
  @@ -211,6 +211,9 @@ static void virtio_pci_start_ioeventfd(VirtIOPCIProxy 
  *proxy)
   }
   
   for (n = 0; n  VIRTIO_PCI_QUEUE_MAX; n++) {
  +if (!virtio_queue_valid(proxy-vdev, n)) {
  +continue;
  +}
   if (!virtio_queue_get_num(proxy-vdev, n)) {
   continue;
   }
  @@ -225,6 +228,9 @@ static void virtio_pci_start_ioeventfd(VirtIOPCIProxy 
  *proxy)
   
   assign_error:
   while (--n = 0) {
  +if (!virtio_queue_valid(proxy-vdev, n)) {
  +continue;
  +}
   if (!virtio_queue_get_num(proxy-vdev, n)) {
   continue;
   }
  @@ -246,6 +252,9 @@ static void virtio_pci_stop_ioeventfd(VirtIOPCIProxy 
  *proxy)
   }
   
   for (n = 0; n  VIRTIO_PCI_QUEUE_MAX; n++) {
  +if (!virtio_queue_valid(proxy-vdev, n)) {
  +continue;
  +}
   if (!virtio_queue_get_num(proxy-vdev, n)) {
   continue;
   }
  @@ -546,6 +555,9 @@ static int kvm_virtio_pci_vector_use(VirtIOPCIProxy 
  *proxy, int nvqs)
   MSIMessage msg;
   
   for (queue_no = 0; queue_no  nvqs; queue_no++) {
  +if (!virtio_queue_valid(vdev, queue_no)) {
  +continue;
  +}
   if (!virtio_queue_get_num(vdev, queue_no)) {
   break;
   }
  @@ -593,6 +605,9 @@ static void 
  kvm_virtio_pci_vector_release(VirtIOPCIProxy *proxy, int nvqs)
   int queue_no;
   
   for (queue_no = 0; queue_no  nvqs; queue_no++) {
  +if (!virtio_queue_valid(vdev, queue_no)) {
  +continue;
  +}
   if (!virtio_queue_get_num(vdev, queue_no)) {
   break;
   }
  @@ -665,6 +680,9 @@ static int kvm_virtio_pci_vector_unmask(PCIDevice *dev, 
  unsigned vector,
   int ret, queue_no;
   
   for (queue_no = 0; queue_no  proxy-nvqs_with_notifiers; queue_no++) {
  +if (!virtio_queue_valid(vdev, queue_no)) {
  +continue;
  +}
   if (!virtio_queue_get_num(vdev, queue_no)) {
   break;
   }
  @@ -695,6 +713,9 @@ static void kvm_virtio_pci_vector_mask(PCIDevice *dev, 
  unsigned vector)
   int queue_no;
   
   for (queue_no = 0; queue_no  proxy-nvqs_with_notifiers; queue_no++) {
  +if (!virtio_queue_valid(vdev, queue_no)) {
  +continue;
  +}
   if (!virtio_queue_get_num(vdev, queue_no)) {
   break;
   }
  @@ -717,6 +738,9 @@ static void kvm_virtio_pci_vector_poll(PCIDevice *dev,
   VirtQueue *vq;
   
   for (queue_no = 0; queue_no  proxy-nvqs_with_notifiers; queue_no++) {
  +if (!virtio_queue_valid(vdev, queue_no)) {
  +continue;
  +}
   if (!virtio_queue_get_num(vdev, queue_no)) {
   break;
   }
  @@ -790,6 +814,9 @@ static int virtio_pci_set_guest_notifiers(DeviceState 
  *d, int nvqs, bool assign)
   }
   
   for (n = 0; n  nvqs; n++) {
  +if (!virtio_queue_valid(vdev, n)) {
  +continue;
  +}
   if (!virtio_queue_get_num(vdev, n)) {
   break;
   }
  -- 
  1.7.2.5
 --
 To unsubscribe from this list: send the line unsubscribe target-devel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] vhost: Check+skip uninitialized VQs in vhost_verify_ring_mappings

2013-04-01 Thread Nicholas A. Bellinger
On Sun, 2013-03-31 at 10:45 +0300, Michael S. Tsirkin wrote:
 On Fri, Mar 29, 2013 at 04:33:12AM +, Nicholas A. Bellinger wrote:
  From: Nicholas Bellinger n...@linux-iscsi.org
  
  With the virtio_queue_valid() checks in place to skip uninitialized VQs
  within virtio-pci code, go ahead and skip the same uninitialized VQs
  during vhost_verify_ring_mappings().
  
  Note this patch does not prevent vhost_virtqueue_start() from executing
  by checking virtio_queue_valid(), as other logic during seabios ->
  virtio-scsi LLD guest hand-off appears to depend upon this execution.
 
 Weird.
 cpu_physical_memory_map only succeeds for PA==0 by chance,
 we really should not depend on this.
 So the right thing really should be to skip vhost_virtqueue_start IMHO,
 maybe add an explicit valid flag in vhost_virtqueue
 so vhost_verify_ring_mappings can check it.
 What exactly does it do that is needed?
 

So the issue with virtio_queue_valid() preventing
vhost_virtqueue_start() execution in the original patch was that
vhost_virtqueue_stop() was missing a matching virtio_queue_valid() call,
which ended up triggering a bad ram pointer during subsequent
cpu_physical_memory_unmap() calls to non-existent virtio queue memory..

With the matching virtio_queue_valid() call in place preventing
vhost_virtqueue_stop() when vhost_virtqueue_start() is skipped for an
uninitialized VQ, an explicit valid flag should not be necessary.

--nab


  
  Cc: Michael S. Tsirkin m...@redhat.com
  Cc: Asias He as...@redhat.com
  Cc: Paolo Bonzini pbonz...@redhat.com
  Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
  ---
   hw/vhost.c |3 +++
   1 files changed, 3 insertions(+), 0 deletions(-)
  
  diff --git a/hw/vhost.c b/hw/vhost.c
  index 4d6aee3..3a71aee 100644
  --- a/hw/vhost.c
  +++ b/hw/vhost.c
  @@ -314,6 +314,9 @@ static int vhost_verify_ring_mappings(struct vhost_dev 
  *dev,
   hwaddr l;
   void *p;
   
  +if (!vq-ring_phys || !vq-ring_size) {
  +continue;
  +}
   if (!ranges_overlap(start_addr, size, vq-ring_phys, 
  vq-ring_size)) {
   continue;
   }
  -- 
  1.7.2.5
 --
 To unsubscribe from this list: send the line unsubscribe target-devel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/3] virtio/vhost: Add checks for uninitialized VQs

2013-04-01 Thread Nicholas A. Bellinger
On Sun, 2013-03-31 at 10:46 +0300, Michael S. Tsirkin wrote:
 On Fri, Mar 29, 2013 at 04:33:09AM +, Nicholas A. Bellinger wrote:
  From: Nicholas Bellinger n...@linux-iscsi.org
  
  Hi folks,
  
  This series adds a virtio_queue_valid() for use by virtio-pci code in
  order to prevent operations upon uninitialized VQs, that is currently
  expected to occur during seabios setup of virtio-scsi.
  
  This also includes a vhost specific check for uninitialized VQs in
  vhost_verify_ring_mappings() to avoid this same case.
  
  Please review.
  
  --nab
 
 Okay, and does this fix the failures in vhost_verify_ring_mappings
 that you've observed?
 

Unfortunately, no.  I've done some more digging and will follow up with
additional details on the original thread shortly..

--nab

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH-v2 0/3] virtio/vhost: Add checks for uninitialized VQs

2013-04-01 Thread Nicholas A. Bellinger
From: Nicholas Bellinger n...@linux-iscsi.org

Hi folks,

This series adds a virtio_queue_valid() for use by virtio-pci code in
order to prevent operations upon uninitialized VQs, which is currently
expected to occur during seabios setup of virtio-scsi with in-flight
vhost-scsi-pci device code.

On the vhost side, it also adds virtio_queue_valid() sanity checks in
vhost_virtqueue_[start,stop]() and vhost_verify_ring_mappings() in order
to skip the same uninitialized VQs.

Changes from v1:
  - Remove now unnecessary virtio_queue_get_num() calls in virtio-pci.c
  - Add virtio_queue_valid() calls in vhost_virtqueue_[start,stop]()

Please review.

--nab

Michael S. Tsirkin (1):
  virtio: add API to check that ring is setup

Nicholas Bellinger (2):
  virtio-pci: Add virtio_queue_valid checks ahead of
virtio_queue_get_num
  vhost: Skip uninitialized VQs in vhost_virtqueue_[start,stop]

 hw/vhost.c  |   12 
 hw/virtio-pci.c |   34 +++---
 hw/virtio.c |5 +
 hw/virtio.h |1 +
 4 files changed, 33 insertions(+), 19 deletions(-)

-- 
1.7.2.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH-v2 1/3] virtio: add API to check that ring is setup

2013-04-01 Thread Nicholas A. Bellinger
From: Michael S. Tsirkin m...@redhat.com

virtio scsi makes it legal to only setup a subset of rings.  The only
way to detect the ring is setup seems to be to check whether PA was
written to.  Add API to do this, and teach code to use it instead of
checking hardware queue size.

(nab: use .vring.desc instead of .vring.pa)

Signed-off-by: Michael S. Tsirkin m...@redhat.com
Cc: Asias He as...@redhat.com
Cc: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 hw/virtio.c |5 +
 hw/virtio.h |1 +
 2 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/hw/virtio.c b/hw/virtio.c
index 26fbc79..65ba253 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -651,6 +651,11 @@ int virtio_queue_get_num(VirtIODevice *vdev, int n)
 return vdev-vq[n].vring.num;
 }
 
+bool virtio_queue_valid(VirtIODevice *vdev, int n)
+{
+return vdev->vq[n].vring.num && vdev->vq[n].vring.desc;
+}
+
 int virtio_queue_get_id(VirtQueue *vq)
 {
 VirtIODevice *vdev = vq-vdev;
diff --git a/hw/virtio.h b/hw/virtio.h
index fdbe931..3086798 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -227,6 +227,7 @@ void virtio_config_writel(VirtIODevice *vdev, uint32_t 
addr, uint32_t data);
 void virtio_queue_set_addr(VirtIODevice *vdev, int n, hwaddr addr);
 hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n);
 int virtio_queue_get_num(VirtIODevice *vdev, int n);
+bool virtio_queue_valid(VirtIODevice *vdev, int n);
 void virtio_queue_notify(VirtIODevice *vdev, int n);
 uint16_t virtio_queue_vector(VirtIODevice *vdev, int n);
 void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector);
-- 
1.7.2.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH-v2 2/3] virtio-pci: Add virtio_queue_valid checks ahead of virtio_queue_get_num

2013-04-01 Thread Nicholas A. Bellinger
From: Nicholas Bellinger n...@linux-iscsi.org

This patch adds a number of virtio_queue_valid() checks to virtio-pci
ahead of virtio_queue_get_num() usage in order to skip operation upon
the detection of an uninitialized VQ.

There is one exception in virtio_ioport_read():VIRTIO_PCI_QUEUE_NUM,
where virtio_queue_get_num() may still be called without a valid
vdev->vq[n].vring.desc physical address.

v2: Drop now unnecessary virtio_queue_get_num calls (mst)

Cc: Michael S. Tsirkin m...@redhat.com
Cc: Asias He as...@redhat.com
Cc: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 hw/virtio-pci.c |   34 +++---
 1 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 0d67b84..1369d9a 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -211,10 +211,9 @@ static void virtio_pci_start_ioeventfd(VirtIOPCIProxy 
*proxy)
 }
 
 for (n = 0; n  VIRTIO_PCI_QUEUE_MAX; n++) {
-if (!virtio_queue_get_num(proxy-vdev, n)) {
+if (!virtio_queue_valid(proxy-vdev, n)) {
 continue;
 }
-
 r = virtio_pci_set_host_notifier_internal(proxy, n, true, true);
 if (r  0) {
 goto assign_error;
@@ -225,10 +224,9 @@ static void virtio_pci_start_ioeventfd(VirtIOPCIProxy 
*proxy)
 
 assign_error:
 while (--n = 0) {
-if (!virtio_queue_get_num(proxy-vdev, n)) {
+if (!virtio_queue_valid(proxy-vdev, n)) {
 continue;
 }
-
 r = virtio_pci_set_host_notifier_internal(proxy, n, false, false);
 assert(r = 0);
 }
@@ -246,10 +244,9 @@ static void virtio_pci_stop_ioeventfd(VirtIOPCIProxy 
*proxy)
 }
 
 for (n = 0; n  VIRTIO_PCI_QUEUE_MAX; n++) {
-if (!virtio_queue_get_num(proxy-vdev, n)) {
+if (!virtio_queue_valid(proxy-vdev, n)) {
 continue;
 }
-
 r = virtio_pci_set_host_notifier_internal(proxy, n, false, false);
 assert(r = 0);
 }
@@ -546,8 +543,8 @@ static int kvm_virtio_pci_vector_use(VirtIOPCIProxy *proxy, 
int nvqs)
 MSIMessage msg;
 
 for (queue_no = 0; queue_no  nvqs; queue_no++) {
-if (!virtio_queue_get_num(vdev, queue_no)) {
-break;
+if (!virtio_queue_valid(vdev, queue_no)) {
+continue;
 }
 vector = virtio_queue_vector(vdev, queue_no);
 if (vector = msix_nr_vectors_allocated(dev)) {
@@ -593,8 +590,8 @@ static void kvm_virtio_pci_vector_release(VirtIOPCIProxy 
*proxy, int nvqs)
 int queue_no;
 
 for (queue_no = 0; queue_no  nvqs; queue_no++) {
-if (!virtio_queue_get_num(vdev, queue_no)) {
-break;
+if (!virtio_queue_valid(vdev, queue_no)) {
+continue;
 }
 vector = virtio_queue_vector(vdev, queue_no);
 if (vector = msix_nr_vectors_allocated(dev)) {
@@ -665,8 +662,8 @@ static int kvm_virtio_pci_vector_unmask(PCIDevice *dev, 
unsigned vector,
 int ret, queue_no;
 
 for (queue_no = 0; queue_no  proxy-nvqs_with_notifiers; queue_no++) {
-if (!virtio_queue_get_num(vdev, queue_no)) {
-break;
+if (!virtio_queue_valid(vdev, queue_no)) {
+continue;
 }
 if (virtio_queue_vector(vdev, queue_no) != vector) {
 continue;
@@ -695,8 +692,8 @@ static void kvm_virtio_pci_vector_mask(PCIDevice *dev, 
unsigned vector)
 int queue_no;
 
 for (queue_no = 0; queue_no  proxy-nvqs_with_notifiers; queue_no++) {
-if (!virtio_queue_get_num(vdev, queue_no)) {
-break;
+if (!virtio_queue_valid(vdev, queue_no)) {
+continue;
 }
 if (virtio_queue_vector(vdev, queue_no) != vector) {
 continue;
@@ -717,8 +714,8 @@ static void kvm_virtio_pci_vector_poll(PCIDevice *dev,
 VirtQueue *vq;
 
 for (queue_no = 0; queue_no  proxy-nvqs_with_notifiers; queue_no++) {
-if (!virtio_queue_get_num(vdev, queue_no)) {
-break;
+if (!virtio_queue_valid(vdev, queue_no)) {
+continue;
 }
 vector = virtio_queue_vector(vdev, queue_no);
 if (vector  vector_start || vector = vector_end ||
@@ -790,10 +787,9 @@ static int virtio_pci_set_guest_notifiers(DeviceState *d, 
int nvqs, bool assign)
 }
 
 for (n = 0; n  nvqs; n++) {
-if (!virtio_queue_get_num(vdev, n)) {
-break;
+if (!virtio_queue_valid(vdev, n)) {
+continue;
 }
-
 r = virtio_pci_set_guest_notifier(d, n, assign,
   kvm_msi_via_irqfd_enabled());
 if (r  0) {
-- 
1.7.2.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH-v2 3/3] vhost: Skip uninitialized VQs in vhost_virtqueue_[start,stop]

2013-04-01 Thread Nicholas A. Bellinger
From: Nicholas Bellinger n...@linux-iscsi.org

This patch adds virtio_queue_valid() checks in vhost_virtqueue_start()
and vhost_virtqueue_stop() to avoid uninitialized VQs during vhost-scsi-pci
seabios operation, where we currently expect only the request VQ to have
been initialized before virtio-scsi LLD guest hand-off.

Also, go ahead and skip the same uninitialized VQs during sanity checks
within vhost_verify_ring_mappings() by checking vq->ring_[phys,size]
directly.

Cc: Michael S. Tsirkin m...@redhat.com
Cc: Asias He as...@redhat.com
Cc: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 hw/vhost.c |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/hw/vhost.c b/hw/vhost.c
index 4d6aee3..832cc89 100644
--- a/hw/vhost.c
+++ b/hw/vhost.c
@@ -314,6 +314,9 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev,
 hwaddr l;
 void *p;
 
+if (!vq-ring_phys || !vq-ring_size) {
+continue;
+}
 if (!ranges_overlap(start_addr, size, vq-ring_phys, vq-ring_size)) {
 continue;
 }
@@ -645,6 +648,10 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,
 
 assert(idx = dev-vq_index  idx  dev-vq_index + dev-nvqs);
 
+if (!virtio_queue_valid(vdev, idx)) {
+return 0;
+}
+
 vq-num = state.num = virtio_queue_get_num(vdev, idx);
 r = ioctl(dev-control, VHOST_SET_VRING_NUM, state);
 if (r) {
@@ -732,6 +739,11 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev,
 };
 int r;
 assert(idx = dev-vq_index  idx  dev-vq_index + dev-nvqs);
+
+if (!virtio_queue_valid(vdev, idx)) {
+return;
+}
+
 r = ioctl(dev-control, VHOST_GET_VRING_BASE, state);
 if (r  0) {
 fprintf(stderr, vhost VQ %d ring restore failed: %d\n, idx, r);
-- 
1.7.2.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V3 WIP 3/3] disable vhost_verify_ring_mappings check

2013-04-01 Thread Nicholas A. Bellinger
On Fri, 2013-03-29 at 09:14 +0100, Paolo Bonzini wrote: 
 Il 29/03/2013 03:53, Nicholas A. Bellinger ha scritto:
  On Thu, 2013-03-28 at 06:13 -0400, Paolo Bonzini wrote:
  I think it's the right thing to do, but maybe not the right place
  to do this, need to reset after all IO is done, before
  ring memory is write protected.
 
  Our emails are crossing each other unfortunately, but I want to
  reinforce this: ring memory is not write protected.
  
  Understood.  However, AFAICT the act of write protecting these ranges
  for ROM generates the offending callbacks to vhost_set_memory().
  
  The part that I'm missing is if ring memory is not being write protected
  by make_bios_readonly_intel(), why are the vhost_set_memory() calls
  being invoked..?
 
 Because mappings change for the region that contains the ring.  vhost
 doesn't know yet that the changes do not affect ring memory,
 vhost_set_memory() is called exactly to ascertain that.
 

Hi Paolo & Co,

Here's a bit more information on what is going on with the same
cpu_physical_memory_map() failure in vhost_verify_ring_mappings()..

So as before, at the point that seabios is marking memory as readonly
for ROM in src/shadow.c:make_bios_readonly_intel() with the following
call:

Calling pci_config_writeb(0x31): bdf: 0x pam: 0x005b

the memory API update hook triggers back into vhost_region_del() code,
and following occurs:

Entering vhost_region_del section: 0x7fd30a213b60 offset_within_region: 0xc 
size: 2146697216 readonly: 0
vhost_region_del: is_rom: 0, rom_device: 0
vhost_region_del: readable: 1
vhost_region_del: ram_addr 0x0, addr: 0x0 size: 2147483648
vhost_region_del: name: pc.ram
Entering vhost_set_memory, section: 0x7fd30a213b60 add: 0, dev-started: 1
Entering verify_ring_mappings: start_addr 0x000c size: 2146697216
verify_ring_mappings: ring_phys 0x0 ring_size: 0
verify_ring_mappings: ring_phys 0x0 ring_size: 0
verify_ring_mappings: ring_phys 0xed000 ring_size: 5124
verify_ring_mappings: calling cpu_physical_memory_map ring_phys: 0xed000 l: 5124
address_space_map: addr: 0xed000, plen: 5124
address_space_map: l: 4096, len: 5124
phys_page_find got PHYS_MAP_NODE_NIL ..
address_space_map: section: 0x7fd30fabaed0 memory_region_is_ram: 0 readonly: 0
address_space_map: section: 0x7fd30fabaed0 offset_within_region: 0x0 section 
size: 18446744073709551615
Unable to map ring buffer for ring 2, l: 4096

So the interesting part is that phys_page_find() is not able to locate
the corresponding page for vq->ring_phys: 0xed000 from the
vhost_region_del() callback with section->offset_within_region:
0xc..

Is there any case where this would not be considered a bug..? 

register_multipage : d: 0x7fd30f7d0ed0 section: 0x7fd30a2139b0
register_multipage : d: 0x7fd30f7d0ed0 section: 0x7fd30a2139b0
register_multipage : d: 0x7fd30f7d0ed0 section: 0x7fd30a2139b0
Entering vhost_region_add section: 0x7fd30a213aa0 offset_within_region: 0xc 
size: 32768 readonly: 1
vhost_region_add: is_rom: 0, rom_device: 0
vhost_region_add: readable: 1
vhost_region_add: ram_addr 0x, addr: 0x   0 size: 
2147483648
vhost_region_add: name: pc.ram
Entering vhost_set_memory, section: 0x7fd30a213aa0 add: 1, dev-started: 1
Entering verify_ring_mappings: start_addr 0x000c size: 32768
verify_ring_mappings: ring_phys 0x0 ring_size: 0
verify_ring_mappings: ring_phys 0x0 ring_size: 0
verify_ring_mappings: ring_phys 0xed000 ring_size: 5124
verify_ring_mappings: Got !ranges_overlap, skipping
register_multipage : d: 0x7fd30f7d0ed0 section: 0x7fd30a2139b0
Entering vhost_region_add section: 0x7fd30a213aa0 offset_within_region: 0xc8000 
size: 2146664448 readonly: 0
vhost_region_add: is_rom: 0, rom_device: 0
vhost_region_add: readable: 1
vhost_region_add: ram_addr 0x, addr: 0x   0 size: 
2147483648
vhost_region_add: name: pc.ram
Entering vhost_set_memory, section: 0x7fd30a213aa0 add: 1, dev-started: 1
Entering verify_ring_mappings: start_addr 0x000c8000 size: 2146664448
verify_ring_mappings: ring_phys 0x0 ring_size: 0
verify_ring_mappings: ring_phys 0x0 ring_size: 0
verify_ring_mappings: ring_phys 0xed000 ring_size: 5124
verify_ring_mappings: calling cpu_physical_memory_map ring_phys: 0xed000 l: 5124
address_space_map: addr: 0xed000, plen: 5124
address_space_map: l: 4096, len: 5124
address_space_map: section: 0x7fd30fabb020 memory_region_is_ram: 1 readonly: 0
address_space_map: section: 0x7fd30fabb020 offset_within_region: 0xc8000 
section size: 2146664448
address_space_map: l: 4096, len: 1028
address_space_map: section: 0x7fd30fabb020 memory_region_is_ram: 1 readonly: 0
address_space_map: section: 0x7fd30fabb020 offset_within_region: 0xc8000 
section size: 2146664448
address_space_map: Calling qemu_ram_ptr_length: raddr: 0x   ed000 rlen: 
5124
address_space_map: After qemu_ram_ptr_length: raddr: 0x   ed000 rlen: 
5124

So here the vhost_region_add() callback for

Re: Virtualbox svga card in KVM

2013-04-01 Thread Sriram Murthy
Hi,
    I am looking into hw/vga.c and hw/vga-pci.c as a starting point in my 
effort to get vbox svga card into kvm. 

    However, given my very basic understanding of VGA/SVGA standards, can 
somebody throw some light on the inner workings of the std vga card in kvm?
    Also, are there any good urls/blogs/standards that I must be aware of 
before starting on this porting effort?
    Any help here will be greatly appreciated.
Thanks and regards,
Sriram



- Original Message -
From: Alon Levy al...@redhat.com
To: Sriram Murthy srira...@yahoo.com
Cc: kvm@vger.kernel.org; qemu list qemu-de...@nongnu.org
Sent: Thursday, March 21, 2013 7:53 AM
Subject: Re: Virtualbox svga card in KVM

 
 
 Hi,
  I am planning on bringing in the virtualbox svga card into kvm
  as a new svga card type (vbox probably?) so that we can load
  the VirtualBox SVGA card drivers in the guest.
 
  Is this even feasible?. Any ideas on where I should start
  looking?

I don't see why it wouldn't, sounds like a great idea. You can look at 
hw/qxl.c, hw/cirrus_vga.c, hw/vmware_vga.c as existing pci vga cards. Also this 
should go on qemu-devel (cc'ed).

 
 Regards,Sriram
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 0/5] vhost-scsi: Add support for host virtualized target

2013-04-01 Thread Wenchao Xia
Hi, Nicholas
  Has this series been merged into qemu 1.4? If not, I am rebasing it to
upstream, hope no one else is working on that.

 From: Nicholas Bellinger n...@linux-iscsi.org
 
 Hello Anthony & Co,
 
 This is the fourth installment to add host virtualized target support for
 the mainline tcm_vhost fabric driver using Linux v3.6-rc into QEMU 1.3.0-rc.
 
 The series is available directly from the following git branch:
 
 git://git.kernel.org/pub/scm/virt/kvm/nab/qemu-kvm.git vhost-scsi-for-1.3
 
 Note the code is cut against yesterday's QEMU head, and despite the name
 of the tree is based upon mainline qemu.org git code + has thus far been
 running overnight with  100K IOPs small block 4k workloads using v3.6-rc2+
 based target code with RAMDISK_DR backstores.
 
 Other than some minor fuzz between jumping from QEMU 1.2.0 -> 1.2.50, this
 series is functionally identical to what's been posted for vhost-scsi RFC-v3
 to qemu-devel.
 
 Please consider applying these patches for an initial vhost-scsi merge into
 QEMU 1.3.0-rc code, or let us know what else you'd like to see addressed for
 this series in order to merge.
 
 Thank you!
 
 --nab
 
 Nicholas Bellinger (2):
monitor: Rename+move net_handle_fd_param -> monitor_handle_fd_param
virtio-scsi: Set max_target=0 during vhost-scsi operation
 
 Stefan Hajnoczi (3):
vhost: Pass device path to vhost_dev_init()
vhost-scsi: add -vhost-scsi host device for use with tcm-vhost
virtio-scsi: Add start/stop functionality for vhost-scsi
 
   configure|   10 +++
   hw/Makefile.objs |1 +
   hw/qdev-properties.c |   41 +++
   hw/vhost-scsi.c  |  190 
 ++
   hw/vhost-scsi.h  |   62 
   hw/vhost.c   |5 +-
   hw/vhost.h   |3 +-
   hw/vhost_net.c   |2 +-
   hw/virtio-pci.c  |2 +
   hw/virtio-scsi.c |   55 ++-
   hw/virtio-scsi.h |1 +
   monitor.c|   18 +
   monitor.h|1 +
   net.c|   18 -
   net.h|2 -
   net/socket.c |2 +-
   net/tap.c|4 +-
   qemu-common.h|1 +
   qemu-config.c|   19 +
   qemu-options.hx  |4 +
   vl.c |   18 +
   21 files changed, 431 insertions(+), 28 deletions(-)
   create mode 100644 hw/vhost-scsi.c
   create mode 100644 hw/vhost-scsi.h
 


-- 
Best Regards

Wenchao Xia

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH 0/5] vhost-scsi: Add support for host virtualized target

2013-04-01 Thread Nicholas A. Bellinger
Hello Wenchao,

On Tue, 2013-04-02 at 11:28 +0800, Wenchao Xia wrote:
 Hi, Nicholas
   Has this series been merged into qemu 1.4?

Not just yet.  Asias, MST, Paolo and myself have been working recently
on the series for qemu 1.4.

  If not, I am rebasing it to
 upstream, hope no one else is working on that.
 

The latest PATCH-v2 for vhost-scsi-pci against upstream qemu commit
dcadaa9b (from March 25th) is here:

http://www.spinics.net/lists/target-devel/msg04263.html

Note that you'll need this small work-around patch to avoid a bug that
we're currently tracking down:

http://www.spinics.net/lists/target-devel/msg04122.html

Also, you'll need a recent SeaBios build with the following commits:

commit 5a7730db57ab0715223421e65b54fb50d6fefe5c
Author: Asias He as...@redhat.com
Date:   Fri Mar 15 09:45:15 2013 +0800

virtio-scsi: Set _DRIVER_OK flag before scsi target scanning

commit b44a7be17bdd270ea029a8e2ec0c2e80c6cd0444
Author: Asias He as...@redhat.com
Date:   Fri Mar 15 09:45:16 2013 +0800

virtio-scsi: Pack struct virtio_scsi_{req_cmd,resp_cmd}

I'll be rebasing to qemu HEAD over the next few days and posting a
PATCH-v3, and would be happy to include you in the CC to follow along.

Thanks!

--nab


  From: Nicholas Bellinger n...@linux-iscsi.org
  
  Hello Anthony & Co,
  
  This is the fourth installment to add host virtualized target support for
  the mainline tcm_vhost fabric driver using Linux v3.6-rc into QEMU 1.3.0-rc.
  
  The series is available directly from the following git branch:
  
  git://git.kernel.org/pub/scm/virt/kvm/nab/qemu-kvm.git 
  vhost-scsi-for-1.3
  
  Note the code is cut against yesterday's QEMU head, and despite the name
  of the tree is based upon mainline qemu.org git code + has thus far been
  running overnight with > 100K IOPs small block 4k workloads using v3.6-rc2+
  based target code with RAMDISK_DR backstores.
  
  Other than some minor fuzz between jumping from QEMU 1.2.0 -> 1.2.50, this
  series is functionally identical to what's been posted for vhost-scsi RFC-v3
  to qemu-devel.
  
  Please consider applying these patches for an initial vhost-scsi merge into
  QEMU 1.3.0-rc code, or let us know what else you'd like to see addressed for
  this series in order to merge.
  
  Thank you!
  
  --nab
  
  Nicholas Bellinger (2):
 monitor: Rename+move net_handle_fd_param -> monitor_handle_fd_param
 virtio-scsi: Set max_target=0 during vhost-scsi operation
  
  Stefan Hajnoczi (3):
 vhost: Pass device path to vhost_dev_init()
 vhost-scsi: add -vhost-scsi host device for use with tcm-vhost
 virtio-scsi: Add start/stop functionality for vhost-scsi
  
configure|   10 +++
hw/Makefile.objs |1 +
hw/qdev-properties.c |   41 +++
hw/vhost-scsi.c  |  190 
  ++
hw/vhost-scsi.h  |   62 
hw/vhost.c   |5 +-
hw/vhost.h   |3 +-
hw/vhost_net.c   |2 +-
hw/virtio-pci.c  |2 +
hw/virtio-scsi.c |   55 ++-
hw/virtio-scsi.h |1 +
monitor.c|   18 +
monitor.h|1 +
net.c|   18 -
net.h|2 -
net/socket.c |2 +-
net/tap.c|4 +-
qemu-common.h|1 +
qemu-config.c|   19 +
qemu-options.hx  |4 +
vl.c |   18 +
21 files changed, 431 insertions(+), 28 deletions(-)
create mode 100644 hw/vhost-scsi.c
create mode 100644 hw/vhost-scsi.h
  
 
 


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH v2 0/6] device control and in-kernel MPIC

2013-04-01 Thread Scott Wood
v2 addresses some requested changes, such as the use of a file descriptor
instead of an ad-hoc handle array, and the use of an enableable
IRQ-type-specific capability to bind the vcpu to a particular MPIC device
(among other things, this allows the notifier patch to go away).

Some other requested improvements, such as support for the standard
KVM_IRQ_LINE interface and splitting up the in-kernel MPIC emulation
patch, will be addressed in a later revision.

Scott Wood (6):
  kvm: add device control API
  kvm/ppc/mpic: import hw/openpic.c from QEMU
  kvm/ppc/mpic: remove some obviously unneeded code
  kvm/ppc/mpic: adapt to kernel style and environment
  kvm/ppc/mpic: in-kernel MPIC emulation
  kvm/ppc/mpic: add KVM_CAP_IRQ_MPIC

 Documentation/virtual/kvm/api.txt  |   78 ++
 Documentation/virtual/kvm/devices/README   |1 +
 Documentation/virtual/kvm/devices/mpic.txt |   37 +
 arch/powerpc/include/asm/kvm_host.h|   16 +-
 arch/powerpc/include/asm/kvm_ppc.h |8 +
 arch/powerpc/kvm/Kconfig   |5 +
 arch/powerpc/kvm/Makefile  |2 +
 arch/powerpc/kvm/booke.c   |   12 +-
 arch/powerpc/kvm/mpic.c| 1786 
 arch/powerpc/kvm/powerpc.c |   38 +-
 include/linux/kvm_host.h   |2 +
 include/uapi/linux/kvm.h   |   37 +
 virt/kvm/kvm_main.c|   40 +
 13 files changed, 2052 insertions(+), 10 deletions(-)
 create mode 100644 Documentation/virtual/kvm/devices/README
 create mode 100644 Documentation/virtual/kvm/devices/mpic.txt
 create mode 100644 arch/powerpc/kvm/mpic.c

-- 
1.7.9.5


--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH v2 3/6] kvm/ppc/mpic: remove some obviously unneeded code

2013-04-01 Thread Scott Wood
Remove some parts of the code that are obviously QEMU or Raven specific
before fixing style issues, to reduce the style issues that need to be
fixed.

Signed-off-by: Scott Wood scottw...@freescale.com
---
 arch/powerpc/kvm/mpic.c |  344 ---
 1 file changed, 344 deletions(-)

diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 57655b9..d6d70a4 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -22,39 +22,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-/*
- *
- * Based on OpenPic implementations:
- * - Intel GW80314 I/O companion chip developer's manual
- * - Motorola MPC8245 & MPC8540 user manuals.
- * - Motorola MCP750 (aka Raven) programmer manual.
- * - Motorola Harrier programmer manuel
- *
- * Serial interrupts, as implemented in Raven chipset are not supported yet.
- *
- */
-#include hw.h
-#include ppc/mac.h
-#include pci/pci.h
-#include openpic.h
-#include sysbus.h
-#include pci/msi.h
-#include qemu/bitops.h
-#include ppc.h
-
-//#define DEBUG_OPENPIC
-
-#ifdef DEBUG_OPENPIC
-static const int debug_openpic = 1;
-#else
-static const int debug_openpic = 0;
-#endif
-
-#define DPRINTF(fmt, ...) do { \
-if (debug_openpic) { \
-printf(fmt , ## __VA_ARGS__); \
-} \
-} while (0)
 
 #define MAX_CPU 32
 #define MAX_SRC 256
@@ -82,21 +49,6 @@ static const int debug_openpic = 0;
 #define OPENPIC_CPU_REG_START0x2
 #define OPENPIC_CPU_REG_SIZE 0x100 + ((MAX_CPU - 1) * 0x1000)
 
-/* Raven */
-#define RAVEN_MAX_CPU  2
-#define RAVEN_MAX_EXT 48
-#define RAVEN_MAX_IRQ 64
-#define RAVEN_MAX_TMR  MAX_TMR
-#define RAVEN_MAX_IPI  MAX_IPI
-
-/* Interrupt definitions */
-#define RAVEN_FE_IRQ (RAVEN_MAX_EXT)   /* Internal functional IRQ */
-#define RAVEN_ERR_IRQ(RAVEN_MAX_EXT + 1)   /* Error IRQ */
-#define RAVEN_TMR_IRQ(RAVEN_MAX_EXT + 2)   /* First timer IRQ */
-#define RAVEN_IPI_IRQ(RAVEN_TMR_IRQ + RAVEN_MAX_TMR)   /* First IPI 
IRQ */
-/* First doorbell IRQ */
-#define RAVEN_DBL_IRQ(RAVEN_IPI_IRQ + (RAVEN_MAX_CPU * RAVEN_MAX_IPI))
-
 typedef struct FslMpicInfo {
int max_ext;
 } FslMpicInfo;
@@ -138,44 +90,6 @@ static FslMpicInfo fsl_mpic_42 = {
 #define ILR_INTTGT_CINT   0x01 /* critical */
 #define ILR_INTTGT_MCP0x02 /* machine check */
 
-/* The currently supported INTTGT values happen to be the same as QEMU's
- * openpic output codes, but don't depend on this.  The output codes
- * could change (unlikely, but...) or support could be added for
- * more INTTGT values.
- */
-static const int inttgt_output[][2] = {
-   {ILR_INTTGT_INT, OPENPIC_OUTPUT_INT},
-   {ILR_INTTGT_CINT, OPENPIC_OUTPUT_CINT},
-   {ILR_INTTGT_MCP, OPENPIC_OUTPUT_MCK},
-};
-
-static int inttgt_to_output(int inttgt)
-{
-   int i;
-
-   for (i = 0; i  ARRAY_SIZE(inttgt_output); i++) {
-   if (inttgt_output[i][0] == inttgt) {
-   return inttgt_output[i][1];
-   }
-   }
-
-   fprintf(stderr, %s: unsupported inttgt %d\n, __func__, inttgt);
-   return OPENPIC_OUTPUT_INT;
-}
-
-static int output_to_inttgt(int output)
-{
-   int i;
-
-   for (i = 0; i  ARRAY_SIZE(inttgt_output); i++) {
-   if (inttgt_output[i][1] == output) {
-   return inttgt_output[i][0];
-   }
-   }
-
-   abort();
-}
-
 #define MSIIR_OFFSET   0x140
 #define MSIIR_SRS_SHIFT29
 #define MSIIR_SRS_MASK (0x7  MSIIR_SRS_SHIFT)
@@ -1265,228 +1179,36 @@ static uint64_t openpic_cpu_read(void *opaque, hwaddr 
addr, unsigned len)
return openpic_cpu_read_internal(opaque, addr, (addr  0x1f000)  12);
 }
 
-static const MemoryRegionOps openpic_glb_ops_le = {
-   .write = openpic_gbl_write,
-   .read = openpic_gbl_read,
-   .endianness = DEVICE_LITTLE_ENDIAN,
-   .impl = {
-.min_access_size = 4,
-.max_access_size = 4,
-},
-};
-
 static const MemoryRegionOps openpic_glb_ops_be = {
.write = openpic_gbl_write,
.read = openpic_gbl_read,
-   .endianness = DEVICE_BIG_ENDIAN,
-   .impl = {
-.min_access_size = 4,
-.max_access_size = 4,
-},
-};
-
-static const MemoryRegionOps openpic_tmr_ops_le = {
-   .write = openpic_tmr_write,
-   .read = openpic_tmr_read,
-   .endianness = DEVICE_LITTLE_ENDIAN,
-   .impl = {
-.min_access_size = 4,
-.max_access_size = 4,
-},
 };
 
 static const MemoryRegionOps openpic_tmr_ops_be = {
.write = openpic_tmr_write,
.read = openpic_tmr_read,
-   .endianness = DEVICE_BIG_ENDIAN,
-   .impl = {
-.min_access_size = 4,
-.max_access_size = 4,
-},
-};
-
-static const MemoryRegionOps openpic_cpu_ops_le = {
-   

[RFC PATCH v2 4/6] kvm/ppc/mpic: adapt to kernel style and environment

2013-04-01 Thread Scott Wood
Remove braces that Linux style doesn't permit, remove space after
'*' that Lindent added, keep error/debug strings contiguous, etc.

Substitute type names, debug prints, etc.

Signed-off-by: Scott Wood scottw...@freescale.com
---
 arch/powerpc/kvm/mpic.c |  445 ++-
 1 file changed, 208 insertions(+), 237 deletions(-)

diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index d6d70a4..1df67ae 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -42,22 +42,22 @@
 #define OPENPIC_TMR_REG_SIZE 0x220
 #define OPENPIC_MSI_REG_START0x1600
 #define OPENPIC_MSI_REG_SIZE 0x200
-#define OPENPIC_SUMMARY_REG_START   0x3800
-#define OPENPIC_SUMMARY_REG_SIZE0x800
+#define OPENPIC_SUMMARY_REG_START0x3800
+#define OPENPIC_SUMMARY_REG_SIZE 0x800
 #define OPENPIC_SRC_REG_START0x1
 #define OPENPIC_SRC_REG_SIZE (MAX_SRC * 0x20)
 #define OPENPIC_CPU_REG_START0x2
-#define OPENPIC_CPU_REG_SIZE 0x100 + ((MAX_CPU - 1) * 0x1000)
+#define OPENPIC_CPU_REG_SIZE (0x100 + ((MAX_CPU - 1) * 0x1000))
 
-typedef struct FslMpicInfo {
+struct fsl_mpic_info {
int max_ext;
-} FslMpicInfo;
+};
 
-static FslMpicInfo fsl_mpic_20 = {
+static struct fsl_mpic_info fsl_mpic_20 = {
.max_ext = 12,
 };
 
-static FslMpicInfo fsl_mpic_42 = {
+static struct fsl_mpic_info fsl_mpic_42 = {
.max_ext = 12,
 };
 
@@ -100,44 +100,43 @@ static int get_current_cpu(void)
 {
CPUState *cpu_single_cpu;
 
-   if (!cpu_single_env) {
+   if (!cpu_single_env)
return -1;
-   }
 
cpu_single_cpu = ENV_GET_CPU(cpu_single_env);
return cpu_single_cpu-cpu_index;
 }
 
-static uint32_t openpic_cpu_read_internal(void *opaque, hwaddr addr, int idx);
-static void openpic_cpu_write_internal(void *opaque, hwaddr addr,
+static uint32_t openpic_cpu_read_internal(void *opaque, gpa_t addr, int idx);
+static void openpic_cpu_write_internal(void *opaque, gpa_t addr,
   uint32_t val, int idx);
 
-typedef enum IRQType {
+enum irq_type {
IRQ_TYPE_NORMAL = 0,
IRQ_TYPE_FSLINT,/* FSL internal interrupt -- level only */
IRQ_TYPE_FSLSPECIAL,/* FSL timer/IPI interrupt, edge, no polarity */
-} IRQType;
+};
 
-typedef struct IRQQueue {
+struct irq_queue {
/* Round up to the nearest 64 IRQs so that the queue length
 * won't change when moving between 32 and 64 bit hosts.
 */
unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63)  ~63)];
int next;
int priority;
-} IRQQueue;
+};
 
-typedef struct IRQSource {
+struct irq_source {
uint32_t ivpr;  /* IRQ vector/priority register */
uint32_t idr;   /* IRQ destination register */
uint32_t destmask;  /* bitmap of CPU destinations */
int last_cpu;
int output; /* IRQ level, e.g. OPENPIC_OUTPUT_INT */
int pending;/* TRUE if IRQ is pending */
-   IRQType type;
+   enum irq_type type;
bool level:1;   /* level-triggered */
-   bool nomask:1;  /* critical interrupts ignore mask on some FSL 
MPICs */
-} IRQSource;
+   bool nomask:1;  /* critical interrupts ignore mask on some FSL MPICs */
+};
 
 #define IVPR_MASK_SHIFT   31
 #define IVPR_MASK_MASK(1  IVPR_MASK_SHIFT)
@@ -158,22 +157,19 @@ typedef struct IRQSource {
 #define IDR_EP  0x8000 /* external pin */
 #define IDR_CI  0x4000 /* critical interrupt */
 
-typedef struct IRQDest {
+struct irq_dest {
int32_t ctpr;   /* CPU current task priority */
-   IRQQueue raised;
-   IRQQueue servicing;
+   struct irq_queue raised;
+   struct irq_queue servicing;
qemu_irq *irqs;
 
/* Count of IRQ sources asserting on non-INT outputs */
uint32_t outputs_active[OPENPIC_OUTPUT_NB];
-} IRQDest;
-
-typedef struct OpenPICState {
-   SysBusDevice busdev;
-   MemoryRegion mem;
+};
 
+struct openpic {
/* Behavior control */
-   FslMpicInfo *fsl;
+   struct fsl_mpic_info *fsl;
uint32_t model;
uint32_t flags;
uint32_t nb_irqs;
@@ -186,9 +182,6 @@ typedef struct OpenPICState {
uint32_t brr1;
uint32_t mpic_mode_mask;
 
-   /* Sub-regions */
-   MemoryRegion sub_io_mem[6];
-
/* Global registers */
uint32_t frr;   /* Feature reporting register */
uint32_t gcr;   /* Global configuration register  */
@@ -196,9 +189,9 @@ typedef struct OpenPICState {
uint32_t spve;  /* Spurious vector register */
uint32_t tfrr;  /* Timer frequency reporting register */
/* Source registers */
-   IRQSource src[MAX_IRQ];
+   struct irq_source src[MAX_IRQ];
/* Local registers per output pin */
-   IRQDest dst[MAX_CPU];
+   struct irq_dest 

[RFC PATCH v2 1/6] kvm: add device control API

2013-04-01 Thread Scott Wood
Currently, devices that are emulated inside KVM are configured in a
hardcoded manner based on an assumption that any given architecture
only has one way to do it.  If there's any need to access device state,
it is done through inflexible one-purpose-only IOCTLs (e.g.
KVM_GET/SET_LAPIC).  Defining new IOCTLs for every little thing is
cumbersome and depletes a limited numberspace.

This API provides a mechanism to instantiate a device of a certain
type, returning an ID that can be used to set/get attributes of the
device.  Attributes may include configuration parameters (e.g.
register base address), device state, operational commands, etc.  It
is similar to the ONE_REG API, except that it acts on devices rather
than vcpus.

Both device types and individual attributes can be tested without having
to create the device or get/set the attribute, without the need for
separately managing enumerated capabilities.

Signed-off-by: Scott Wood scottw...@freescale.com
---
 Documentation/virtual/kvm/api.txt|   70 ++
 Documentation/virtual/kvm/devices/README |1 +
 arch/powerpc/include/asm/kvm_host.h  |6 +++
 arch/powerpc/include/asm/kvm_ppc.h   |2 +
 arch/powerpc/kvm/powerpc.c   |7 +++
 include/uapi/linux/kvm.h |   27 
 virt/kvm/kvm_main.c  |   31 +
 7 files changed, 144 insertions(+)
 create mode 100644 Documentation/virtual/kvm/devices/README

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 976eb65..77328aa 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2173,6 +2173,76 @@ header; first `n_valid' valid entries with contents from 
the data
 written, then `n_invalid' invalid entries, invalidating any previously
 valid entries found.
 
+4.79 KVM_CREATE_DEVICE
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: vm ioctl
+Parameters: struct kvm_create_device (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  ENODEV: The device type is unknown or unsupported
+  EEXIST: Device already created, and this type of device may not
+  be instantiated multiple times
+  ENOSPC: Too many devices have been created
+
+  Other error conditions may be defined by individual device types.
+
+Creates an emulated device in the kernel.  The file descriptor returned
+in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR.
+
+If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the
+device type is supported (not necessarily whether it can be created
+in the current vm).
+
+Individual devices should not define flags.  Attributes should be used
+for specifying any behavior that is not implied by the device type
+number.
+
+struct kvm_create_device {
+   __u32   type;   /* in: KVM_DEV_TYPE_xxx */
+   __u32   fd; /* out: device handle */
+   __u32   flags;  /* in: KVM_CREATE_DEVICE_xxx */
+};
+
+4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: device ioctl
+Parameters: struct kvm_device_attr
+Returns: 0 on success, -1 on error
+Errors:
+  ENXIO:  The group or attribute is unknown/unsupported for this device
+  EPERM:  The attribute cannot (currently) be accessed this way
+  (e.g. read-only attribute, or attribute that only makes
+  sense when the device is in a different state)
+
+  Other error conditions may be defined by individual device types.
+
+Gets/sets a specified piece of device configuration and/or state.  The
+semantics are device-specific.  See individual device documentation in
+the devices directory.  As with ONE_REG, the size of the data
+transferred is defined by the particular attribute.
+
+struct kvm_device_attr {
+   __u32   flags;  /* no flags currently defined */
+   __u32   group;  /* device-defined */
+   __u64   attr;   /* group-defined */
+   __u64   addr;   /* userspace address of attr data */
+};
+
+4.81 KVM_HAS_DEVICE_ATTR
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: device ioctl
+Parameters: struct kvm_device_attr
+Returns: 0 on success, -1 on error
+Errors:
+  ENXIO:  The group or attribute is unknown/unsupported for this device
+
+Tests whether a device supports a particular attribute.  A successful
+return indicates the attribute is implemented.  It does not necessarily
+indicate that the attribute can be read or written in the device's
+current state.  addr is ignored.
 
 4.77 KVM_ARM_VCPU_INIT
 
diff --git a/Documentation/virtual/kvm/devices/README 
b/Documentation/virtual/kvm/devices/README
new file mode 100644
index 000..34a6983
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/README
@@ -0,0 +1 @@
+This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL.
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index e34f8fe..e0caae2 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ 

[RFC PATCH v2 6/6] kvm/ppc/mpic: add KVM_CAP_IRQ_MPIC

2013-04-01 Thread Scott Wood
Enabling this capability connects the vcpu to the designated in-kernel
MPIC.  Using explicit connections between vcpus and irqchips allows
for flexibility, but the main benefit at the moment is that it
simplifies the code -- KVM doesn't need vm-global state to remember
which MPIC object is associated with this vm, and it doesn't need to
care about ordering between irqchip creation and vcpu creation.

Signed-off-by: Scott Wood scottw...@freescale.com
---
 Documentation/virtual/kvm/api.txt   |8 ++
 arch/powerpc/include/asm/kvm_host.h |   10 ---
 arch/powerpc/include/asm/kvm_ppc.h  |2 ++
 arch/powerpc/kvm/booke.c|4 ++-
 arch/powerpc/kvm/mpic.c |   49 +++
 arch/powerpc/kvm/powerpc.c  |   25 +++---
 include/uapi/linux/kvm.h|1 +
 7 files changed, 86 insertions(+), 13 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 77328aa..38f9b6d 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2728,3 +2728,11 @@ to receive the topmost interrupt vector.
 When disabled (args[0] == 0), behavior is as if this facility is unsupported.
 
 When this capability is enabled, KVM_EXIT_EPR can occur.
+
+6.6 KVM_CAP_IRQ_MPIC
+
+Architectures: ppc
+Parameters: args[0] is the MPIC device fd
+args[1] is the MPIC CPU number for this vcpu
+
+This capability connects the vcpu to an in-kernel MPIC device.
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 6713327..2a2e235 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -375,8 +375,10 @@ struct kvmppc_booke_debug_reg {
u64 dac[KVMPPC_BOOKE_MAX_DAC];
 };
 
-#define KVMPPC_IRQCHIP_NONE0
-#define KVMPPC_IRQCHIP_MPIC1
+#define KVMPPC_IRQ_DEFAULT 0
+#define KVMPPC_IRQ_MPIC1
+
+struct openpic;
 
 struct kvm_vcpu_arch {
ulong host_stack;
@@ -557,8 +559,8 @@ struct kvm_vcpu_arch {
unsigned long magic_page_pa; /* phys addr to map the magic page to */
unsigned long magic_page_ea; /* effect. addr to map the magic page to */
 
-   int irqchip_type;
-   void *irqchip_priv;
+   int irq_type;   /* one of KVM_IRQ_* */
+   struct openpic *mpic;   /* KVM_IRQ_MPIC */
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
struct kvm_vcpu_arch_shared shregs;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 20b2a5e..2cc18a4 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -273,6 +273,8 @@ static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, 
u32 epr)
 }
 
 void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu);
+int kvmppc_mpic_connect_vcpu(struct file *mpic_filp, struct kvm_vcpu *vcpu,
+u32 cpu);
 
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
  struct kvm_config_tlb *cfg);
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index cddc6b3..7d00222 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -430,8 +430,10 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu 
*vcpu,
if (update_epr == true) {
if (vcpu-arch.epr_flags  KVMPPC_EPR_USER)
kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
-   else if (vcpu-arch.epr_flags  KVMPPC_EPR_KERNEL)
+   else if (vcpu-arch.epr_flags  KVMPPC_EPR_KERNEL) {
+   BUG_ON(vcpu-arch.irq_type != KVMPPC_IRQ_MPIC);
kvmppc_mpic_set_epr(vcpu);
+   }
}
 
new_msr = msr_mask;
diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 9aace50..b790f47 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -1159,7 +1159,7 @@ static uint32_t openpic_iack(struct openpic *opp, struct 
irq_dest *dst,
 
 void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
 {
-   struct openpic *opp = vcpu-arch.irqchip_priv;
+   struct openpic *opp = vcpu-arch.mpic;
int cpu = vcpu-vcpu_id;
unsigned long flags;
 
@@ -1442,10 +1442,10 @@ static void map_mmio(struct openpic *opp)
 
 static void unmap_mmio(struct openpic *opp)
 {
-   BUG_ON(opp-mmio_mapped);
-   opp-mmio_mapped = false;
-
-   kvm_io_bus_unregister_dev(opp-kvm, KVM_MMIO_BUS, opp-mmio);
+   if (opp-mmio_mapped) {
+   opp-mmio_mapped = false;
+   kvm_io_bus_unregister_dev(opp-kvm, KVM_MMIO_BUS, opp-mmio);
+   }
 }
 
 static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr)
@@ -1683,6 +1683,45 @@ static const struct file_operations kvm_mpic_fops = {
.release = kvm_mpic_release,
 };
 
+int kvmppc_mpic_connect_vcpu(struct file *mpic_filp, struct kvm_vcpu *vcpu,
+