[PATCH] vfio-pci: Quirk RTL8168 NIC

2014-05-10 Thread Alex Williamson
This device is ridiculous.  It has two MMIO BARs, BAR4 and BAR2.  BAR4
hosts the MSI-X table, so obviously it would be too easy to access it
directly; instead the device creates a window register in BAR2 that,
among other things, provides access to the MSI-X table.  This means
MSI-X doesn't work in the guest, because the driver actually manages to
program the physical table.  When interrupt remapping is present, the
device's MSIs will be blocked.  The Linux driver doesn't make use of
this window, so apparently it's not required in order to use MSI-X.
This quirk makes the device work with the Windows driver, which does
use the window for MSI-X, but I certainly cannot recommend this device
for assignment (the Windows 7 driver also constantly pokes PCI config
space).
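
For context, this is the window protocol the quirk below emulates,
reconstructed from the trace comment in the patch.  A driver-side sketch
only; the register offsets and bit positions come from that comment, but
the names are assumptions, not from a datasheet:

#include <stdint.h>

#define RTL_WIN_DATA    0x70         /* dword data register in BAR2 */
#define RTL_WIN_ADDR    0x74         /* dword address register in BAR2 */
#define RTL_TYPE_MSIX   (0x1U << 16) /* "type" field, bits 16:30 */
#define RTL_WRITE_LATCH 0x80000000U  /* bit 31: write indicator / latch */

/* Assumed BAR2 accessors; stand-ins for whatever MMIO the driver uses. */
extern uint32_t bar2_read(uint32_t off);
extern void bar2_write(uint32_t off, uint32_t val);

static uint32_t rtl8168_msix_read(uint32_t offset)
{
    bar2_write(RTL_WIN_ADDR, RTL_TYPE_MSIX | (offset & 0xfff));
    while (!(bar2_read(RTL_WIN_ADDR) & RTL_WRITE_LATCH))
        ;                            /* wait for the address to latch */
    return bar2_read(RTL_WIN_DATA);
}

static void rtl8168_msix_write(uint32_t offset, uint32_t val)
{
    bar2_write(RTL_WIN_DATA, val);
    bar2_write(RTL_WIN_ADDR,
               RTL_WRITE_LATCH | RTL_TYPE_MSIX | (offset & 0xfff));
    while (bar2_read(RTL_WIN_ADDR) & RTL_WRITE_LATCH)
        ;                            /* wait for the write to complete */
}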

Signed-off-by: Alex Williamson 
---
 hw/misc/vfio.c |  145 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 9cf5b84..c3d2f7a 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -1668,6 +1668,150 @@ static void vfio_probe_ati_bar4_window_quirk(VFIODevice *vdev, int nr)
 vdev->host.function);
 }
 
+#define PCI_VENDOR_ID_REALTEK 0x10ec
+
+/*
+ * RTL8168 devices have a backdoor that can access the MSI-X table.  At BAR2
+ * offset 0x70 there is a dword data register, offset 0x74 is a dword address
+ * register.  According to the Linux r8169 driver, the MSI-X table is addressed
+ * when the "type" portion of the address register is set to 0x1.  This appears
+ * to be bits 16:30.  Bit 31 is both a write indicator and some sort of
+ * "address latched" indicator.  Bits 12:15 is a mask field, which we're
+ * going to ignore because we don't really know what it means and the MSI-X
+ * area always seems to be accessed with a full mask.  Bits 0:11 is offset
+ * within the type.
+ *
+ * Example trace:
+ *
+ * Read from MSI-X table offset 0
+ * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr
+ * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch
+ * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data
+ *
+ * Write 0xfee00000 to MSI-X table offset 0
+ * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data
+ * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write
+ * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete
+ */
+
+static uint64_t vfio_rtl8168_window_quirk_read(void *opaque,
+                                               hwaddr addr, unsigned size)
+{
+    VFIOQuirk *quirk = opaque;
+    VFIODevice *vdev = quirk->vdev;
+
+    switch (addr) {
+    case 4: /* address */
+        if (quirk->data.flags) {
+            DPRINTF("%s fake read(%04x:%02x:%02x.%x)\n",
+                    memory_region_name(&quirk->mem), vdev->host.domain,
+                    vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+            /* toggle the bit 31 latch/complete indicator */
+            return quirk->data.address_match ^ 0x80000000U;
+        }
+        break;
+    case 0: /* data */
+        if (quirk->data.flags) {
+            uint64_t val;
+
+            DPRINTF("%s MSI-X table read(%04x:%02x:%02x.%x)\n",
+                    memory_region_name(&quirk->mem), vdev->host.domain,
+                    vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+            if (!(vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) {
+                return 0;
+            }
+
+            io_mem_read(&vdev->pdev.msix_table_mmio,
+                        (hwaddr)(quirk->data.address_match & 0xfff),
+                        &val, size);
+            return val;
+        }
+    }
+
+    DPRINTF("%s direct read(%04x:%02x:%02x.%x)\n",
+            memory_region_name(&quirk->mem), vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+    return vfio_bar_read(&vdev->bars[quirk->data.bar], addr + 0x70, size);
+}
+
+static void vfio_rtl8168_window_quirk_write(void *opaque, hwaddr addr,
+                                            uint64_t data, unsigned size)
+{
+    VFIOQuirk *quirk = opaque;
+    VFIODevice *vdev = quirk->vdev;
+
+    switch (addr) {
+    case 4: /* address */
+        if ((data & 0x7fff0000) == 0x10000) { /* type 0x1: MSI-X table */
+            if (data & 0x80000000U && /* bit 31 set: this is a write */
+                vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) {
+
+                DPRINTF("%s MSI-X table write(%04x:%02x:%02x.%x)\n",
+                        memory_region_name(&quirk->mem), vdev->host.domain,
+                        vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+                /* write the value stashed from the data register to the
+                 * offset encoded in the address being written */
+                io_mem_write(&vdev->pdev.msix_table_mmio,
+                             (hwaddr)(data & 0xfff),
+                             quirk->data.address_mask, size);
+            }
+
+            quirk->data.flags = 1;
+            quirk->data.address_match = data;
+
+            return;
+        }
+        quirk->data.flags = 0;
+        break;
+    case 0: /* data */
+        quirk->data.address_mask = data; /* stash data for a later write */
+        break;

Re: [PATCH v10 08/19] qspinlock: Make a new qnode structure to support virtualization

2014-05-10 Thread Peter Zijlstra
On Sat, May 10, 2014 at 04:14:17PM +0200, Peter Zijlstra wrote:
> On Fri, May 09, 2014 at 09:08:56PM -0400, Waiman Long wrote:
> > On 05/08/2014 03:04 PM, Peter Zijlstra wrote:
> > >On Wed, May 07, 2014 at 11:01:36AM -0400, Waiman Long wrote:
> > >>  /*
> > >>+ * To have additional features for better virtualization support, it is
> > >>+ * necessary to store additional data in the queue node structure. So
> > >>+ * a new queue node structure will have to be defined and used here.
> > >>+ */
> > >>+struct qnode {
> > >>+ struct mcs_spinlock mcs;
> > >>+};
> > >You can ditch this entire patch; it's pointless, just add a new
> > >DEFINE_PER_CPU for the para-virt muck.
> > 
> > Yes, I can certainly merge it into the next one in the series. I broke it
> > out to make each individual patch smaller, more single-purpose, and easier
> > to review.
> 
> No, don't merge it, _drop_ it. Wrapping things in a struct generates a
> ton of pointless change.
> 
> Put the new data in a new DEFINE_PER_CPU and leave the existing code as
> is.

So I had a look at the resulting code:

struct qnode {
struct mcs_spinlock mcs;
#ifdef CONFIG_PARAVIRT_UNFAIR_LOCKS
int lsteal_mask;	/* Lock stealing frequency mask */
u32 prev_tail;  /* Tail code of previous node   */
#ifndef CONFIG_PARAVIRT_SPINLOCKS
struct qnode   *qprev;  /* Previous queue node addr */
#endif
#endif
struct pv_qvars pv; /* For para-virtualization  */
};

With all the bells and whistles on (say an enterprise distro), that
single node will now fill an entire cacheline on its own.

That means that the normal case for normal people who stay the heck away
from virt shit will very often hit _3_ cachelines for their spin_lock().

1 - the cacheline that has the spinlock_t in,
2 - the cacheline that has node[0].count in to find which node to use
3 - the cacheline that has the actual right node in

That's of course complete and utter crap.

Not to mention that the final result of those 19 patches is going to
take me days to untangle :-(

Days I don't really have because I get to go hunt bugs in existing code
before thinking about adding shiny new stuff.




Re: [PATCH v10 08/19] qspinlock: Make a new qnode structure to support virtualization

2014-05-10 Thread Peter Zijlstra
On Fri, May 09, 2014 at 09:08:56PM -0400, Waiman Long wrote:
> On 05/08/2014 03:04 PM, Peter Zijlstra wrote:
> >On Wed, May 07, 2014 at 11:01:36AM -0400, Waiman Long wrote:
> >>  /*
> >>+ * To have additional features for better virtualization support, it is
> >>+ * necessary to store additional data in the queue node structure. So
> >>+ * a new queue node structure will have to be defined and used here.
> >>+ */
> >>+struct qnode {
> >>+   struct mcs_spinlock mcs;
> >>+};
> >You can ditch this entire patch; it's pointless, just add a new
> >DEFINE_PER_CPU for the para-virt muck.
> 
> Yes, I can certainly merge it into the next one in the series. I broke it
> out to make each individual patch smaller, more single-purpose, and easier
> to review.

No, don't merge it, _drop_ it. Wrapping things in a struct generates a
ton of pointless change.

Put the new data in a new DEFINE_PER_CPU and leave the existing code as
is.
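
For illustration, a minimal sketch of that shape.  The pv field names
are placeholders taken from the qnode above, not the series' actual
layout, and the array size of 4 follows the series' four nesting
contexts (task, softirq, hardirq, NMI):

#include <linux/percpu.h>
#include "mcs_spinlock.h"

/* Para-virt state kept out of the MCS node; looked up by the same
 * per-CPU node index, so struct mcs_spinlock stays untouched. */
struct pv_qnode_state {
	int	lsteal_mask;	/* placeholder, as in the qnode above */
	u32	prev_tail;	/* placeholder, as in the qnode above */
};

static DEFINE_PER_CPU(struct mcs_spinlock, mcs_nodes[4]);
static DEFINE_PER_CPU(struct pv_qnode_state, pv_nodes[4]);

static inline struct pv_qnode_state *pv_node(int idx)
{
	return this_cpu_ptr(&pv_nodes[idx]);
}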




Re: [PATCH v10 09/19] qspinlock: Prepare for unfair lock support

2014-05-10 Thread Peter Zijlstra
On Fri, May 09, 2014 at 09:19:32PM -0400, Waiman Long wrote:
> On 05/08/2014 03:06 PM, Peter Zijlstra wrote:
> >On Wed, May 07, 2014 at 11:01:37AM -0400, Waiman Long wrote:
> >>If unfair locking is supported, the lock acquisition loop at the end of
> >>the queue_spin_lock_slowpath() function may need to detect the fact that
> >>the lock can be stolen. Code is added for the stolen-lock detection.
> >>
> >>A new qhead macro is also defined as a shorthand for mcs.locked.
> >NAK, unfair should be a pure test-and-set lock.
> 
> I have performance data showing that a simple test-and-set lock does not
> scale well. That is the primary reason for ditching the test-and-set lock
> and using a more complicated scheme which scales better.

Nobody should give a fuck about scalability in this case anyway.

Also, as I explained/asked earlier:

  lkml.kernel.org/r/20140314083001.gn27...@twins.programming.kicks-ass.net

Lock holder preemption is _way_ worse with any kind of queueing. You've
not explained how the simple 3 cpu example in that email gets better
performance than a test-and-set lock.

> Also, it will be hard to
> make the unfair test-and-set lock code coexist nicely with the PV spinlock
> code.

That's just complete crap as the test-and-set lock is like 3 lines of
code.
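
For reference, a sketch of the pure test-and-set lock being asked for,
written against the qspinlock value for concreteness; a sketch under
that assumption, not the series' code:

static inline void queue_spin_lock_unfair(struct qspinlock *lock)
{
	/* spin until the lock word can be claimed atomically */
	while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0)
		cpu_relax();
}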




Re: [PATCH v10 06/19] qspinlock: prolong the stay in the pending bit path

2014-05-10 Thread Peter Zijlstra
On Fri, May 09, 2014 at 08:58:47PM -0400, Waiman Long wrote:
> On 05/08/2014 02:58 PM, Peter Zijlstra wrote:
> >On Wed, May 07, 2014 at 11:01:34AM -0400, Waiman Long wrote:
> >>@@ -221,11 +222,37 @@ static inline int trylock_pending(struct qspinlock *lock, u32 *pval)
> >> 	 */
> >> 	for (;;) {
> >> 		/*
> >>-		 * If we observe any contention; queue.
> >>+		 * If we observe that the queue is not empty,
> >>+		 * return and get queued.
> >> 		 */
> >>-		if (val & ~_Q_LOCKED_MASK)
> >>+		if (val & _Q_TAIL_MASK)
> >> 			return 0;
> >>
> >>+		if (val == (_Q_LOCKED_VAL|_Q_PENDING_VAL)) {
> >>+			/*
> >>+			 * If both the lock and pending bits are set, we wait
> >>+			 * a while to see if either bit will be cleared.
> >>+			 * If there is no change, we return and get queued.
> >>+			 */
> >>+			if (!retry)
> >>+				return 0;
> >>+			retry--;
> >>+			cpu_relax();
> >>+			cpu_relax();
> >>+			*pval = val = atomic_read(&lock->val);
> >>+			continue;
> >>+		} else if (val == _Q_PENDING_VAL) {
> >>+			/*
> >>+			 * Pending bit is set, but not the lock bit.
> >>+			 * Assuming that the pending bit holder is going to
> >>+			 * set the lock bit and clear the pending bit soon,
> >>+			 * it is better to wait than to exit at this point.
> >>+			 */
> >>+			cpu_relax();
> >>+			*pval = val = atomic_read(&lock->val);
> >>+			continue;
> >>+		}
> >Didn't I give a much saner alternative to this mess last time?
> 
> I don't recall you having any suggestion last time. Anyway, if you think the
> code is too messy, I think I can give up the first if statement, which is
> more an optimistic-spinning kind of code for short critical sections. The
> 2nd if statement is still needed to improve the chance of using this code
> path, for timing reasons. I will rerun my performance test to make sure it
> won't have too much performance impact.

lkml.kernel.org/r/20140417163640.gt11...@twins.programming.kicks-ass.net




differences between KVM_GET_API_VERSION and KVM_CREATE_VM which both call _IO

2014-05-10 Thread Francisc Simon
Hi,

I am trying to figure out how these 2 calls:
KVM_GET_API_VERSION
KVM_CREATE_VM
differ from each other, because here:
http://www.cs.fsu.edu/~baker/devices/lxr/http/source/linux/include/linux/kvm.h#L351
I see that both call _IO, and as far as I understood, the _IO macro only
shifts the given magic number 0xAE left by 8 bits.  In fact, if I open
/dev/kvm, KVM_GET_API_VERSION always returns 12.  I guess I missed
something or misunderstood it.  It would be nice if someone could clarify.

Thank you for your time!

Best Regards!
Frank :-)
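
The two requests differ in the command number, not the magic: _IO(type, nr)
packs the magic ("type", 0xAE for KVM) into bits 8:15 and the per-ioctl
number ("nr") into bits 0:7.  KVM_GET_API_VERSION is _IO(KVMIO, 0x00) and
KVM_CREATE_VM is _IO(KVMIO, 0x01), so they are the distinct request codes
0xAE00 and 0xAE01.  A minimal sketch, assuming a host with /dev/kvm and
the kvm headers installed:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);

	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}

	/* Same 0xAE magic, different low byte: 0xAE00 vs 0xAE01 */
	printf("KVM_GET_API_VERSION = %#lx\n",
	       (unsigned long)KVM_GET_API_VERSION);
	printf("KVM_CREATE_VM       = %#lx\n",
	       (unsigned long)KVM_CREATE_VM);

	/* The first returns the API version (12), the second a new VM fd */
	printf("API version: %d\n", ioctl(kvm, KVM_GET_API_VERSION, 0));
	printf("VM fd: %d\n", ioctl(kvm, KVM_CREATE_VM, 0UL));

	return 0;
}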




Re: [PATCH 02/13] KVM: PPC: Book3S: PR: Fix C/R bit setting

2014-05-10 Thread Paul Mackerras
On Thu, Apr 24, 2014 at 03:12:29PM +0200, Alexander Graf wrote:
> Commit 9308ab8e2d made C/R HTAB updates go byte-wise into the target HTAB.
> However, it didn't update the guest's copy of the HTAB, but instead the
> host local copy of it.
> 
> Write to the guest's HTAB instead.
> 
> Signed-off-by: Alexander Graf 
> CC: Paul Mackerras 

Acked-by: Paul Mackerras 


[PATCH] KVM: x86: Fix CR3 reserved bits check in long mode

2014-05-10 Thread Jan Kiszka
From: Jan Kiszka 

Regression of 346874c9: PAE is set in long mode, but that does not mean
we have valid PDPTRs.

Signed-off-by: Jan Kiszka 
---
 arch/x86/kvm/x86.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c5582c3..198aac8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -701,10 +701,11 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
return 0;
}
 
-   if (is_long_mode(vcpu) && (cr3 & CR3_L_MODE_RESERVED_BITS))
-   return 1;
-   if (is_pae(vcpu) && is_paging(vcpu) &&
-   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+   if (is_long_mode(vcpu)) {
+   if (cr3 & CR3_L_MODE_RESERVED_BITS)
+   return 1;
+   } else if (is_pae(vcpu) && is_paging(vcpu) &&
+  !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
 
vcpu->arch.cr3 = cr3;
-- 
1.8.1.1.298.ge7eed54





Re: [PATCH 2/5] KVM: x86: Fix CR3 reserved bits

2014-05-10 Thread Jan Kiszka
On 2014-04-18 02:35, Nadav Amit wrote:
> According to the Intel specification, PAE and non-PAE modes do not have any
> reserved bits.  In long mode, regardless of PCIDE, only the high bits (above
> the physical address width) are reserved.
> 
> Signed-off-by: Nadav Amit 
> ---
> :100644 100644 7de069af... e21aee9... M	arch/x86/include/asm/kvm_host.h
> :100644 100644 205b17e... 1d60374... M	arch/x86/kvm/emulate.c
> :100644 100644 8b8fc0b... f4d9839... M	arch/x86/kvm/x86.c
>  arch/x86/include/asm/kvm_host.h |    6 +-----
>  arch/x86/kvm/emulate.c          |    4 ----
>  arch/x86/kvm/x86.c              |   25 +++++--------------------
>  3 files changed, 6 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 7de069af..e21aee9 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -50,11 +50,7 @@
> | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
> | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
>  
> -#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
> -#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
> -#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL
> -#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
> -				  0xFFFFFF0000000000ULL)
> +#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
>  #define CR4_RESERVED_BITS   \
>   (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
> | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index 205b17e..1d60374 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -3386,10 +3386,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
>   ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
>   if (efer & EFER_LMA)
>   rsvd = CR3_L_MODE_RESERVED_BITS;
> - else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE)
> - rsvd = CR3_PAE_RESERVED_BITS;
> - else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG)
> - rsvd = CR3_NONPAE_RESERVED_BITS;
>  
>   if (new_val & rsvd)
>   return emulate_gp(ctxt, 0);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 8b8fc0b..f4d9839 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -701,26 +701,11 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>   return 0;
>   }
>  
> - if (is_long_mode(vcpu)) {
> - if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
> - if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
> - return 1;
> - } else
> - if (cr3 & CR3_L_MODE_RESERVED_BITS)
> - return 1;
> - } else {
> - if (is_pae(vcpu)) {
> - if (cr3 & CR3_PAE_RESERVED_BITS)
> - return 1;
> - if (is_paging(vcpu) &&
> - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
> - return 1;
> - }
> - /*
> -  * We don't check reserved bits in nonpae mode, because
> -  * this isn't enforced, and VMware depends on this.
> -  */
> - }
> + if (is_long_mode(vcpu) && (cr3 & CR3_L_MODE_RESERVED_BITS))
> + return 1;
> + if (is_pae(vcpu) && is_paging(vcpu) &&
> + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
> + return 1;

This is wrong: is_pae returns true in long mode (long mode requires
CR4.PAE to be set), but we don't have valid pdptrs then. It crashes my
Jailhouse guest.

I suppose we need a patch on top, as this is already in kvm.next, right?

Jan



