Re: [PATCH] kvm tools: Support virtio indirect buffers

2011-11-28 Thread Sasha Levin
On Tue, 2011-11-29 at 10:31 +0400, Cyrill Gorcunov wrote:
> On Mon, Nov 28, 2011 at 07:54:27PM +0200, Sasha Levin wrote:
> >  
> > +/*
> > + * Each buffer in the virtqueues is actually a chain of descriptors.  This
> > + * function returns the next descriptor in the chain, or vq->vring.num if we're
> > + * at the end.
> > + */
> > +static unsigned next_desc(struct vring_desc *desc,
> > + unsigned int i, unsigned int max)
> > +{
> > +   unsigned int next;
> > +
> > +   /* If this descriptor says it doesn't chain, we're done. */
> > +   if (!(desc[i].flags & VRING_DESC_F_NEXT))
> > +   return max;
> > +
> > +   /* Check they're not leading us off end of descriptors. */
> > +   next = desc[i].next;
> > +   /* Make sure compiler knows to grab that: we don't want it changing! */
> > +   wmb();
> > +
> > +   return next;
> > +}
> > +
> 
Hi Sasha, where is the rmb() then? Or maybe you wanted a plain barrier() here?

On the kernel side.
There's a mb() there which happens during the kick.
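
In other words, the pairing at issue is roughly the following (a conceptual
sketch, not the actual kvm tools or kernel code; the helper names are
illustrative):

	/* Guest (kernel) side: publish the buffers, then kick. */
	fill_descriptor_chain(desc, head);	/* illustrative helper */
	wmb();		/* descriptors visible before the index update */
	avail->idx++;	/* publish the new buffer to the host */
	mb();		/* index visible before the kick */
	kick_host();	/* e.g. a write to the virtio notify register */

	/* Host (kvm tools) side: the guest's mb() before the kick has
	 * already ordered the writes, so once the kick arrives the host
	 * mainly needs a compiler barrier to keep stale values from
	 * being re-read while it walks the descriptor chain. */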

-- 

Sasha.



[PATCH v5] kvm tools, qcow: Add the support for copy-on-write cluster

2011-11-28 Thread Lan Tianyu
When a write request targets a cluster whose copied flag is not set, allocate
a new cluster and write the original data, with the modification applied, to
the new cluster. This also adds write support for qcow2 compressed images.
After testing, the image file passes "qemu-img check". Performance still
needs to be improved.

Signed-off-by: Lan Tianyu 
---
 tools/kvm/disk/qcow.c|  454 ++
 tools/kvm/include/kvm/qcow.h |2 +
 2 files changed, 289 insertions(+), 167 deletions(-)

diff --git a/tools/kvm/disk/qcow.c b/tools/kvm/disk/qcow.c
index 680b37d..e139fa5 100644
--- a/tools/kvm/disk/qcow.c
+++ b/tools/kvm/disk/qcow.c
@@ -20,6 +20,16 @@
 #include 
 #include 
 
+
+static inline int qcow_pwrite_sync(int fd,
+   void *buf, size_t count, off_t offset)
+{
+   if (pwrite_in_full(fd, buf, count, offset) < 0)
+   return -1;
+
+   return fdatasync(fd);
+}
+
 static int l2_table_insert(struct rb_root *root, struct qcow_l2_table *new)
 {
struct rb_node **link = &(root->rb_node), *parent = NULL;
@@ -101,7 +111,8 @@ static int qcow_l2_cache_write(struct qcow *q, struct qcow_l2_table *c)
 
size = 1 << header->l2_bits;
 
-   if (pwrite_in_full(q->fd, c->table, size * sizeof(u64), c->offset) < 0)
+   if (qcow_pwrite_sync(q->fd, c->table,
+   size * sizeof(u64), c->offset) < 0)
return -1;
 
c->dirty = 0;
@@ -122,9 +133,6 @@ static int cache_table(struct qcow *q, struct qcow_l2_table *c)
 */
		lru = list_first_entry(&l1t->lru_list, struct qcow_l2_table, list);
 
-   if (qcow_l2_cache_write(q, lru) < 0)
-   goto error;
-
/* Remove the node from the cache */
rb_erase(&lru->node, r);
list_del_init(&lru->list);
@@ -508,47 +516,6 @@ static ssize_t qcow_read_sector(struct disk_image *disk, u64 sector,
return total;
 }
 
-static inline u64 file_size(int fd)
-{
-   struct stat st;
-
-   if (fstat(fd, &st) < 0)
-   return 0;
-
-   return st.st_size;
-}
-
-static inline int qcow_pwrite_sync(int fd, void *buf, size_t count, off_t offset)
-{
-   if (pwrite_in_full(fd, buf, count, offset) < 0)
-   return -1;
-
-   return fdatasync(fd);
-}
-
-/* Writes a level 2 table at the end of the file. */
-static u64 qcow_write_l2_table(struct qcow *q, u64 *table)
-{
-   struct qcow_header *header = q->header;
-   u64 clust_sz;
-   u64 f_sz;
-   u64 off;
-   u64 sz;
-
-   f_sz= file_size(q->fd);
-   if (!f_sz)
-   return 0;
-
-   sz  = 1 << header->l2_bits;
-   clust_sz= 1 << header->cluster_bits;
-   off = ALIGN(f_sz, clust_sz);
-
-   if (pwrite_in_full(q->fd, table, sz * sizeof(u64), off) < 0)
-   return 0;
-
-   return off;
-}
-
 static void refcount_table_free_cache(struct qcow_refcount_table *rft)
 {
struct rb_root *r = &rft->root;
@@ -601,7 +568,8 @@ static int write_refcount_block(struct qcow *q, struct qcow_refcount_block *rfb)
if (!rfb->dirty)
return 0;
 
-   if (pwrite_in_full(q->fd, rfb->entries, rfb->size * sizeof(u16), rfb->offset) < 0)
+   if (qcow_pwrite_sync(q->fd, rfb->entries,
+   rfb->size * sizeof(u16), rfb->offset) < 0)
return -1;
 
rfb->dirty = 0;
@@ -618,9 +586,6 @@ static int cache_refcount_block(struct qcow *q, struct qcow_refcount_block *c)
if (rft->nr_cached == MAX_CACHE_NODES) {
		lru = list_first_entry(&rft->lru_list, struct qcow_refcount_block, list);
 
-   if (write_refcount_block(q, lru) < 0)
-   goto error;
-
rb_erase(&lru->node, r);
list_del_init(&lru->list);
rft->nr_cached--;
@@ -706,6 +671,11 @@ static struct qcow_refcount_block *qcow_read_refcount_block(struct qcow *q, u64
 
rfb_offset = be64_to_cpu(rft->rf_table[rft_idx]);
 
+   if (!rfb_offset) {
+   pr_warning("Don't support to grow refcount table");
+   return NULL;
+   }
+
rfb = refcount_block_search(q, rfb_offset);
if (rfb)
return rfb;
@@ -728,35 +698,140 @@ error_free_rfb:
return NULL;
 }
 
+static u16 qcow_get_refcount(struct qcow *q, u64 clust_idx)
+{
+   struct qcow_refcount_block *rfb = NULL;
+   struct qcow_header *header = q->header;
+   u64 rfb_idx;
+
+   rfb = qcow_read_refcount_block(q, clust_idx);
+   if (!rfb) {
+   pr_warning("Error while reading refcount table");
+   return -1;
+   }
+
+   rfb_idx = clust_idx & (((1ULL <<
+   (header->cluster_bits - QCOW_REFCOUNT_BLOCK_SHIFT)) - 1));
+
+   if (rfb_idx >= rfb->size) {
+   pr_warning("L1: refcount block index out of bounds");
+   r
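
In outline, the copy-on-write path described in the changelog works like this
(a simplified sketch of the approach, not the literal patch code; the helper
and flag names are illustrative):

	/* Sketch: COW on a write to a cluster whose copied flag is clear. */
	static ssize_t cow_write_cluster(struct qcow *q, u64 clust_idx,
					 void *buf, size_t len, u64 offset)
	{
		u64 new_clust;

		if (l2_entry_is_copied(q, clust_idx))	/* illustrative */
			return write_in_place(q, clust_idx, buf, len, offset);

		/* Allocate a fresh cluster at the end of the image file. */
		new_clust = alloc_cluster(q);		/* illustrative */

		/* Read the original cluster (decompressing it if it is a
		 * compressed cluster), apply the modification, and write
		 * the merged result to the new cluster. */
		read_cluster(q, clust_idx, q->cluster_cache);
		memcpy(q->cluster_cache + offset, buf, len);
		if (pwrite_in_full(q->fd, q->cluster_cache,
				   q->cluster_size, new_clust) < 0)
			return -1;

		/* Point the L2 entry at the new cluster with the copied
		 * flag set; the L2 update goes through qcow_pwrite_sync()
		 * so the metadata hits the disk before it is relied upon. */
		return update_l2_entry(q, clust_idx,
				       new_clust | QCOW2_OFLAG_COPIED);
	}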

Re: [Qemu-devel] [PATCH v2] ivshmem: add a new PIO BAR3(Doorbell) besides MMIO BAR0 to reduce notification time

2011-11-28 Thread Zang Hongyong

On Tuesday, 2011/11/29 14:38, Cam Macdonell wrote:

On Thu, Nov 17, 2011 at 10:50 PM,  wrote:

From: Hongyong Zang

This patch adds a PIO BAR3 for the guest to notify qemu. We find that the new
PIO BAR3 notification path reduces notification time by 30% compared with the
original MMIO BAR0 path.
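
For context on why PIO is faster: the guest rings the doorbell with a single
register write, and with ioeventfd that write is matched in the kernel without
bouncing out to qemu. A PIO exit arrives with the port and data already
decoded by the CPU, whereas an MMIO exit must be decoded by emulating the
faulting instruction. Roughly, on the guest side (a sketch; DOORBELL is the
ivshmem doorbell register offset, the rest is illustrative):

	/* BAR0 (MMIO) path: */
	writel((peer_id << 16) | vector, mmio_base + DOORBELL);

	/* New PIO path: */
	outl((peer_id << 16) | vector, pio_base + DOORBELL);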

Come to think of it, should we bump the PIO to BAR4 so that the shared
memory region could be made a 64-bit BAR and therefore take up BAR2
and BAR3?

OK. The PIO BAR can be placed on BAR4. We'll finish the patch later.

Regards,
Hongyong

Signed-off-by: Hongyong Zang
---
  hw/ivshmem.c |   24 ++--
  kvm-all.c|   23 +++
  kvm.h|1 +
  3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/hw/ivshmem.c b/hw/ivshmem.c
index 242fbea..031cdd8 100644
--- a/hw/ivshmem.c
+++ b/hw/ivshmem.c
@@ -29,6 +29,7 @@
  #define IVSHMEM_MASTER  1

  #define IVSHMEM_REG_BAR_SIZE 0x100
+#define IVSHIO_REG_BAR_SIZE 0x10

  //#define DEBUG_IVSHMEM
  #ifdef DEBUG_IVSHMEM
@@ -57,8 +58,10 @@ typedef struct IVShmemState {
 CharDriverState **eventfd_chr;
 CharDriverState *server_chr;
 MemoryRegion ivshmem_mmio;
+MemoryRegion ivshmem_pio;

 pcibus_t mmio_addr;
+pcibus_t pio_addr;
 /* We might need to register the BAR before we actually have the memory.
  * So prepare a container MemoryRegion for the BAR immediately and
  * add a subregion when we have the memory.
@@ -234,7 +237,7 @@ static uint64_t ivshmem_io_read(void *opaque, 
target_phys_addr_t addr,
 return ret;
  }

-static const MemoryRegionOps ivshmem_mmio_ops = {
+static const MemoryRegionOps ivshmem_io_ops = {
 .read = ivshmem_io_read,
 .write = ivshmem_io_write,
 .endianness = DEVICE_NATIVE_ENDIAN,
@@ -348,6 +351,8 @@ static void close_guest_eventfds(IVShmemState *s, int posn)
 for (i = 0; i<  guest_curr_max; i++) {
 kvm_set_ioeventfd_mmio_long(s->peers[posn].eventfds[i],
 s->mmio_addr + DOORBELL, (posn<<  16) | i, 0);
+kvm_set_ioeventfd_pio_long(s->peers[posn].eventfds[i],
+s->pio_addr + DOORBELL, (posn<<  16) | i, 0);
 close(s->peers[posn].eventfds[i]);
 }

@@ -367,6 +372,12 @@ static void setup_ioeventfds(IVShmemState *s) {
   true,
   (i<<  16) | j,
   s->peers[i].eventfds[j]);
+memory_region_add_eventfd(&s->ivshmem_pio,
+  DOORBELL,
+  4,
+  true,
+  (i<<  16) | j,
+  s->peers[i].eventfds[j]);
 }
 }
  }
@@ -495,6 +506,10 @@ static void ivshmem_read(void *opaque, const uint8_t * 
buf, int flags)
 (incoming_posn<<  16) | guest_max_eventfd, 1)<  0) {
 fprintf(stderr, "ivshmem: ioeventfd not available\n");
 }
+if (kvm_set_ioeventfd_pio_long(incoming_fd, s->pio_addr + DOORBELL,
+(incoming_posn<<  16) | guest_max_eventfd, 1)<  0) {
+fprintf(stderr, "ivshmem: ioeventfd not available\n");
+}
 }

 return;
@@ -656,8 +671,10 @@ static int pci_ivshmem_init(PCIDevice *dev)

 s->shm_fd = 0;

-memory_region_init_io(&s->ivshmem_mmio,&ivshmem_mmio_ops, s,
+memory_region_init_io(&s->ivshmem_mmio,&ivshmem_io_ops, s,
   "ivshmem-mmio", IVSHMEM_REG_BAR_SIZE);
+memory_region_init_io(&s->ivshmem_pio,&ivshmem_io_ops, s,
+  "ivshmem-pio", IVSHIO_REG_BAR_SIZE);

 if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
 setup_ioeventfds(s);
@@ -666,6 +683,8 @@ static int pci_ivshmem_init(PCIDevice *dev)
 /* region for registers*/
 pci_register_bar(&s->dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
  &s->ivshmem_mmio);
+pci_register_bar(&s->dev, 3, PCI_BASE_ADDRESS_SPACE_IO,
+&s->ivshmem_pio);

 memory_region_init(&s->bar, "ivshmem-bar2-container", s->ivshmem_size);

@@ -742,6 +761,7 @@ static int pci_ivshmem_uninit(PCIDevice *dev)
 IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);

 memory_region_destroy(&s->ivshmem_mmio);
+memory_region_destroy(&s->ivshmem_pio);
 memory_region_del_subregion(&s->bar,&s->ivshmem);
 memory_region_destroy(&s->ivshmem);
 memory_region_destroy(&s->bar);
diff --git a/kvm-all.c b/kvm-all.c
index 5d500e1..737c2e2 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1396,6 +1396,29 @@ int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, 
uint32_t val, bool assign
 return 0;
  }

+int kvm_set_ioeventfd_pio_long(int fd, uint32_t addr, uint32_t val, bool 
assign)
+{
+struct kvm_ioeventfd kick = {
+.datamatch = val,
+.addr = addr,
+.len = 4,
+.flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
+.fd = fd,
+};
+int r;
+if (!kvm_e

Re: [Qemu-devel] [PATCH v2] ivshmem: add a new PIO BAR3(Doorbell) besides MMIO BAR0 to reduce notification time

2011-11-28 Thread Cam Macdonell
On Thu, Nov 17, 2011 at 10:50 PM,   wrote:
> From: Hongyong Zang 
>
> This patch adds a PIO BAR3 for the guest to notify qemu. We find that the new
> PIO BAR3 notification path reduces notification time by 30% compared with the
> original MMIO BAR0 path.

Come to think of it, should we bump the PIO to BAR4 so that the shared
memory region could be made a 64-bit BAR and therefore take up BAR2
and BAR3?
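
If we go that route, the registration might look something like this (a
sketch only, untested; the flag combination is my assumption):

    pci_register_bar(&s->dev, 2,
                     PCI_BASE_ADDRESS_SPACE_MEMORY |
                     PCI_BASE_ADDRESS_MEM_TYPE_64 |
                     PCI_BASE_ADDRESS_MEM_PREFETCH,
                     &s->bar);
    pci_register_bar(&s->dev, 4, PCI_BASE_ADDRESS_SPACE_IO,
                     &s->ivshmem_pio);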

>
> Signed-off-by: Hongyong Zang 
> ---
>  hw/ivshmem.c |   24 ++--
>  kvm-all.c    |   23 +++
>  kvm.h        |    1 +
>  3 files changed, 46 insertions(+), 2 deletions(-)
>
> diff --git a/hw/ivshmem.c b/hw/ivshmem.c
> index 242fbea..031cdd8 100644
> --- a/hw/ivshmem.c
> +++ b/hw/ivshmem.c
> @@ -29,6 +29,7 @@
>  #define IVSHMEM_MASTER  1
>
>  #define IVSHMEM_REG_BAR_SIZE 0x100
> +#define IVSHIO_REG_BAR_SIZE 0x10
>
>  //#define DEBUG_IVSHMEM
>  #ifdef DEBUG_IVSHMEM
> @@ -57,8 +58,10 @@ typedef struct IVShmemState {
>     CharDriverState **eventfd_chr;
>     CharDriverState *server_chr;
>     MemoryRegion ivshmem_mmio;
> +    MemoryRegion ivshmem_pio;
>
>     pcibus_t mmio_addr;
> +    pcibus_t pio_addr;
>     /* We might need to register the BAR before we actually have the memory.
>      * So prepare a container MemoryRegion for the BAR immediately and
>      * add a subregion when we have the memory.
> @@ -234,7 +237,7 @@ static uint64_t ivshmem_io_read(void *opaque, 
> target_phys_addr_t addr,
>     return ret;
>  }
>
> -static const MemoryRegionOps ivshmem_mmio_ops = {
> +static const MemoryRegionOps ivshmem_io_ops = {
>     .read = ivshmem_io_read,
>     .write = ivshmem_io_write,
>     .endianness = DEVICE_NATIVE_ENDIAN,
> @@ -348,6 +351,8 @@ static void close_guest_eventfds(IVShmemState *s, int 
> posn)
>     for (i = 0; i < guest_curr_max; i++) {
>         kvm_set_ioeventfd_mmio_long(s->peers[posn].eventfds[i],
>                     s->mmio_addr + DOORBELL, (posn << 16) | i, 0);
> +        kvm_set_ioeventfd_pio_long(s->peers[posn].eventfds[i],
> +                    s->pio_addr + DOORBELL, (posn << 16) | i, 0);
>         close(s->peers[posn].eventfds[i]);
>     }
>
> @@ -367,6 +372,12 @@ static void setup_ioeventfds(IVShmemState *s) {
>                                       true,
>                                       (i << 16) | j,
>                                       s->peers[i].eventfds[j]);
> +            memory_region_add_eventfd(&s->ivshmem_pio,
> +                                      DOORBELL,
> +                                      4,
> +                                      true,
> +                                      (i << 16) | j,
> +                                      s->peers[i].eventfds[j]);
>         }
>     }
>  }
> @@ -495,6 +506,10 @@ static void ivshmem_read(void *opaque, const uint8_t * 
> buf, int flags)
>                         (incoming_posn << 16) | guest_max_eventfd, 1) < 0) {
>             fprintf(stderr, "ivshmem: ioeventfd not available\n");
>         }
> +        if (kvm_set_ioeventfd_pio_long(incoming_fd, s->pio_addr + DOORBELL,
> +                        (incoming_posn << 16) | guest_max_eventfd, 1) < 0) {
> +            fprintf(stderr, "ivshmem: ioeventfd not available\n");
> +        }
>     }
>
>     return;
> @@ -656,8 +671,10 @@ static int pci_ivshmem_init(PCIDevice *dev)
>
>     s->shm_fd = 0;
>
> -    memory_region_init_io(&s->ivshmem_mmio, &ivshmem_mmio_ops, s,
> +    memory_region_init_io(&s->ivshmem_mmio, &ivshmem_io_ops, s,
>                           "ivshmem-mmio", IVSHMEM_REG_BAR_SIZE);
> +    memory_region_init_io(&s->ivshmem_pio, &ivshmem_io_ops, s,
> +                          "ivshmem-pio", IVSHIO_REG_BAR_SIZE);
>
>     if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
>         setup_ioeventfds(s);
> @@ -666,6 +683,8 @@ static int pci_ivshmem_init(PCIDevice *dev)
>     /* region for registers*/
>     pci_register_bar(&s->dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
>                      &s->ivshmem_mmio);
> +    pci_register_bar(&s->dev, 3, PCI_BASE_ADDRESS_SPACE_IO,
> +                     &s->ivshmem_pio);
>
>     memory_region_init(&s->bar, "ivshmem-bar2-container", s->ivshmem_size);
>
> @@ -742,6 +761,7 @@ static int pci_ivshmem_uninit(PCIDevice *dev)
>     IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
>
>     memory_region_destroy(&s->ivshmem_mmio);
> +    memory_region_destroy(&s->ivshmem_pio);
>     memory_region_del_subregion(&s->bar, &s->ivshmem);
>     memory_region_destroy(&s->ivshmem);
>     memory_region_destroy(&s->bar);
> diff --git a/kvm-all.c b/kvm-all.c
> index 5d500e1..737c2e2 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -1396,6 +1396,29 @@ int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, 
> uint32_t val, bool assign
>     return 0;
>  }
>
> +int kvm_set_ioeventfd_pio_long(int fd, uint32_t addr, uint32_t val, bool 
> assign)
> +{
> +    struct kvm_ioeventfd kick = {
> +        .datamatch = val,
> +        .addr = addr,
> +        .len = 4,
> +        .flags = KVM_IOEVENTFD_FLAG_DATAMAT

Re: [PATCH] kvm tools: Support virtio indirect buffers

2011-11-28 Thread Cyrill Gorcunov
On Mon, Nov 28, 2011 at 07:54:27PM +0200, Sasha Levin wrote:
>  
> +/*
> + * Each buffer in the virtqueues is actually a chain of descriptors.  This
> + * function returns the next descriptor in the chain, or vq->vring.num if we're
> + * at the end.
> + */
> +static unsigned next_desc(struct vring_desc *desc,
> +   unsigned int i, unsigned int max)
> +{
> + unsigned int next;
> +
> + /* If this descriptor says it doesn't chain, we're done. */
> + if (!(desc[i].flags & VRING_DESC_F_NEXT))
> + return max;
> +
> + /* Check they're not leading us off end of descriptors. */
> + next = desc[i].next;
> + /* Make sure compiler knows to grab that: we don't want it changing! */
> + wmb();
> +
> + return next;
> +}
> +

Hi Sasha, where is the rmb() then? Or maybe you wanted a plain barrier() here?


Re: [RFC PATCH] vfio: VFIO Driver core framework

2011-11-28 Thread Alex Williamson
On Tue, 2011-11-29 at 15:34 +1100, Alexey Kardashevskiy wrote:
> Hi!
> 
> On 29/11/11 14:46, Alex Williamson wrote:
> > On Tue, 2011-11-29 at 12:52 +1100, Alexey Kardashevskiy wrote:
> >> Hi!
> >>
> >> I tried (successfully) to run it on POWER and while doing that I found 
> >> some issues. I'll try to
> >> explain them in separate mails.
> > 
> > Great!
> > 
> >> IOMMU domain setup. On POWER, the linux drivers capable of DMA transfer 
> >> want to know
> >> a DMA window, i.e. its start and length in the PHB address space. This 
> >> comes from
> >> hardware. On X86 (correct me if I am wrong), every device driver in the guest 
> >> allocates
> >> memory from the same pool.
> > 
> > Yes, current VT-d/AMD-Vi provide independent IOVA spaces for each
> > device.
> > 
> >>  On POWER, device drivers get DMA window and allocate pages
> >> for DMA within this window. In the case of VFIO, that means that QEMU has 
> >> to
> >> preallocate this DMA window before running a guest, pass it to a guest (via
> >> device tree) and then a guest tells the host what pages are taken/released 
> >> by
> >> calling map/unmap callbacks of iommu_ops. Deallocation is made in a device 
> >> detach
> >> callback as I did not want to add more ioctls.
> >> So, there are 2 patches:
> >>
> >> - new VFIO_IOMMU_SETUP ioctl introduced which allocates a DMA window via 
> >> IOMMU API on
> >> POWER.
> >> btw do we need an additional capability bit for it?
> >>
> >> KERNEL PATCH:
> >>
> >> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> >> index 10615ad..a882e08 100644
> >> --- a/drivers/iommu/iommu.c
> >> +++ b/drivers/iommu/iommu.c
> >> @@ -247,3 +247,12 @@ int iommu_device_group(struct device *dev, unsigned 
> >> int *groupid)
> >>return -ENODEV;
> >>  }
> >>  EXPORT_SYMBOL_GPL(iommu_device_group);
> >> +
> >> +int iommu_setup(struct iommu_domain *domain,
> >> +  size_t requested_size, size_t *allocated_size,
> >> +  phys_addr_t *start_address)
> >> +{
> >> +  return domain->ops->setup(domain, requested_size, allocated_size,
> >> +  start_address);
> >> +}
> >> +EXPORT_SYMBOL_GPL(iommu_setup);
> > 
> > requested_size seems redundant both here and in struct vfio_setup.  We
> > can just pre-load size/start with desired values.  I assume x86 IOMMUs
> > would ignore requested values and return start = 0 and size = hardware
> > decoder address bits.  The IOMMU API currently allows:
> > 
> > iommu_domain_alloc
> > [iommu_attach_device]
> > [iommu_map]
> > [iommu_unmap]
> > [iommu_detach_device]
> > iommu_domain_free
> > 
> > where everything between alloc and free can be called in any order.  How
> > does setup fit into that model?
> 
> This is why I posted a QEMU patch :)

Right, but qemu/vfio is by no means the de facto standard of how one must
use the IOMMU API.  KVM currently orders the map vs attach differently.
When is it valid to call setup when factoring in hot attached/detached
devices?

> > For this it seems like we'd almost want
> > to combine alloc, setup, and the first attach into a single call (ie.
> > create a domain with this initial device and these parameters), then
> > subsequent attaches would only allow compatible devices.
> 
> 
> Not exactly. This setup is more likely to get combined with domain alloc only.

At domain_alloc we don't have any association to actual hardware other
than a bus_type, so how would you know which iommu is being set up?

> On POWER, we have iommu_table per DMA window which can be or can be not shared
> between devices. At the moment there is one window per PCIe _device_ (so 
> multiple
> functions of multiport network adapter share one DMA window) and one window 
> for
> all the devices behind PCIe-to-PCI bridge. It is more or less so.
> 
> 
> > I'm a little confused though, is the window determined by hardware or is
> > it configurable via requested_size?
> 
> 
> The window parameters are calculated by software and then written to hardware 
> so
> hardware does filtering and prevents bad devices from memory corruption.
> 
> 
> > David had suggested that we could
> > implement a VFIO_IOMMU_GET_INFO ioctl that returns something like:
> > 
> > struct vfio_iommu_info {
> > __u32   argsz;
> > __u32   flags;
> > __u64   iova_max;   /* Maximum IOVA address */
> > __u64   iova_min;   /* Minimum IOVA address */
> > __u64   pgsize_bitmap;  /* Bitmap of supported page sizes */
> > };
> > 
> > The thought being a TBD IOMMU API interface reports the hardware
> > determined IOVA range and we could fudge it on x86 for now reporting
> > 0/~0.  Maybe we should replace iova_max/iova_min with
> > iova_base/iova_size and allow the caller to request a size by setting
> > iova_size and matching bit in the flags.
> 
> 
> No, we need some sort of SET_INFO, not GET as we want QEMU to decide on a DMA
> window size.

Right, GET_INFO is no longer the right name if we're really passing in
requests.  Maybe it becomes VFIO_IOMMU_SETUP as you 

[PATCH 3/3] KVM: MMU: Make drop_large_spte() more generic

2011-11-28 Thread Takuya Yoshikawa
There are many places where we drop large spte and we are always doing
much the same as drop_large_spte().

To avoid these duplications, this patch makes drop_large_spte() more
generically usable: it now takes an argument to know if it must flush
the remote tlbs and returns true or false depending on is_large_pte()
check result.

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/mmu.c |   35 +--
 arch/x86/kvm/paging_tmpl.h |4 ++--
 2 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5e761ff..2db12b3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1023,6 +1023,19 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
 }
 
+static bool drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush_now)
+{
+   if (!is_large_pte(*sptep))
+   return false;
+
+   drop_spte(kvm, sptep);
+   --kvm->stat.lpages;
+
+   if (flush_now)
+   kvm_flush_remote_tlbs(kvm);
+   return true;
+}
+
 int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
   struct kvm_memory_slot *slot)
 {
@@ -1052,8 +1065,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
BUG_ON(!is_large_pte(*spte));
pgprintk("rmap_write_protect(large): spte %p %llx 
%lld\n", spte, *spte, gfn);
if (is_writable_pte(*spte)) {
-   drop_spte(kvm, spte);
-   --kvm->stat.lpages;
+   drop_large_spte(kvm, spte, false);
spte = NULL;
write_protected = 1;
}
@@ -1799,15 +1811,6 @@ static void link_shadow_page(u64 *sptep, struct 
kvm_mmu_page *sp)
mmu_spte_set(sptep, spte);
 }
 
-static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
-{
-   if (is_large_pte(*sptep)) {
-   drop_spte(vcpu->kvm, sptep);
-   --vcpu->kvm->stat.lpages;
-   kvm_flush_remote_tlbs(vcpu->kvm);
-   }
-}
-
 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
   unsigned direct_access)
 {
@@ -1839,9 +1842,8 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct 
kvm_mmu_page *sp,
pte = *spte;
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level)) {
-   drop_spte(kvm, spte);
-   if (is_large_pte(pte))
-   --kvm->stat.lpages;
+   if (!drop_large_spte(kvm, spte, false))
+   drop_spte(kvm, spte);
} else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
@@ -3859,11 +3861,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 
int slot)
  !is_last_spte(pt[i], sp->role.level))
continue;
 
-   if (is_large_pte(pt[i])) {
-   drop_spte(kvm, &pt[i]);
-   --kvm->stat.lpages;
+   if (drop_large_spte(kvm, &pt[i], false))
continue;
-   }
 
/* avoid RMW */
if (is_writable_pte(pt[i]))
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 52e9d58..c40f074 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -498,7 +498,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
gfn_t table_gfn;
 
clear_sp_write_flooding_count(it.sptep);
-   drop_large_spte(vcpu, it.sptep);
+   drop_large_spte(vcpu->kvm, it.sptep, true);
 
sp = NULL;
if (!is_shadow_present_pte(*it.sptep)) {
@@ -526,7 +526,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
clear_sp_write_flooding_count(it.sptep);
validate_direct_spte(vcpu, it.sptep, direct_access);
 
-   drop_large_spte(vcpu, it.sptep);
+   drop_large_spte(vcpu->kvm, it.sptep, true);
 
if (is_shadow_present_pte(*it.sptep))
continue;
-- 
1.7.5.4



[PATCH 2/3] KVM: MMU: Add missing large page accounting to drop_large_spte()

2011-11-28 Thread Takuya Yoshikawa
Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/mmu.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 09da963..5e761ff 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1803,6 +1803,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 
*sptep)
 {
if (is_large_pte(*sptep)) {
drop_spte(vcpu->kvm, sptep);
+   --vcpu->kvm->stat.lpages;
kvm_flush_remote_tlbs(vcpu->kvm);
}
 }
-- 
1.7.5.4



[PATCH 1/3] KVM: MMU: Remove for_each_unsync_children() macro

2011-11-28 Thread Takuya Yoshikawa
There is only one user of it and for_each_set_bit() does the same.

Signed-off-by: Takuya Yoshikawa 
---
 arch/x86/kvm/mmu.c |7 +--
 1 files changed, 1 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d737443..09da963 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1403,11 +1403,6 @@ struct kvm_mmu_pages {
unsigned int nr;
 };
 
-#define for_each_unsync_children(bitmap, idx)  \
-   for (idx = find_first_bit(bitmap, 512); \
-idx < 512; \
-idx = find_next_bit(bitmap, 512, idx+1))
-
 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
 int idx)
 {
@@ -1429,7 +1424,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 {
int i, ret, nr_unsync_leaf = 0;
 
-   for_each_unsync_children(sp->unsync_child_bitmap, i) {
+   for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
struct kvm_mmu_page *child;
u64 ent = sp->spt[i];
 
-- 
1.7.5.4



[PATCH 0/3] KVM: MMU: Trivial fix and cleanups

2011-11-28 Thread Takuya Yoshikawa
Made when I was reading mmu code.

Takuya

BTW, is there any good way to test large page functionality?


Re: [RFC PATCH] vfio: VFIO Driver core framework

2011-11-28 Thread Alexey Kardashevskiy
Hi!

On 29/11/11 14:46, Alex Williamson wrote:
> On Tue, 2011-11-29 at 12:52 +1100, Alexey Kardashevskiy wrote:
>> Hi!
>>
>> I tried (successfully) to run it on POWER and while doing that I found some 
>> issues. I'll try to
>> explain them in separate mails.
> 
> Great!
> 
>> IOMMU domain setup. On POWER, the linux drivers capable of DMA transfer want 
>> to know
>> a DMA window, i.e. its start and length in the PHB address space. This comes 
>> from
>> hardware. On X86 (correct me if I am wrong), every device driver in the guest 
>> allocates
>> memory from the same pool.
> 
> Yes, current VT-d/AMD-Vi provide independent IOVA spaces for each
> device.
> 
>>  On POWER, device drivers get DMA window and allocate pages
>> for DMA within this window. In the case of VFIO, that means that QEMU has to
>> preallocate this DMA window before running a guest, pass it to a guest (via
>> device tree) and then a guest tells the host what pages are taken/released by
>> calling map/unmap callbacks of iommu_ops. Deallocation is made in a device 
>> detach
>> callback as I did not want to add more ioctls.
>> So, there are 2 patches:
>>
>> - new VFIO_IOMMU_SETUP ioctl introduced which allocates a DMA window via 
>> IOMMU API on
>> POWER.
>> btw do we need an additional capability bit for it?
>>
>> KERNEL PATCH:
>>
>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index 10615ad..a882e08 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -247,3 +247,12 @@ int iommu_device_group(struct device *dev, unsigned int 
>> *groupid)
>>  return -ENODEV;
>>  }
>>  EXPORT_SYMBOL_GPL(iommu_device_group);
>> +
>> +int iommu_setup(struct iommu_domain *domain,
>> +size_t requested_size, size_t *allocated_size,
>> +phys_addr_t *start_address)
>> +{
>> +return domain->ops->setup(domain, requested_size, allocated_size,
>> +start_address);
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_setup);
> 
> requested_size seems redundant both here and in struct vfio_setup.  We
> can just pre-load size/start with desired values.  I assume x86 IOMMUs
> would ignore requested values and return start = 0 and size = hardware
> decoder address bits.  The IOMMU API currently allows:
> 
> iommu_domain_alloc
> [iommu_attach_device]
> [iommu_map]
> [iommu_unmap]
> [iommu_detach_device]
> iommu_domain_free
> 
> where everything between alloc and free can be called in any order.  How
> does setup fit into that model?

This is why I posted a QEMU patch :)

> For this it seems like we'd almost want
> to combine alloc, setup, and the first attach into a single call (ie.
> create a domain with this initial device and these parameters), then
> subsequent attaches would only allow compatible devices.


Not exactly. This setup is more likely to get combined with domain alloc only.
On POWER, we have iommu_table per DMA window which can be or can be not shared
between devices. At the moment there is one window per PCIe _device_ (so 
multiple
functions of multiport network adapter share one DMA window) and one window for
all the devices behind PCIe-to-PCI bridge. It is more or less so.


> I'm a little confused though, is the window determined by hardware or is
> it configurable via requested_size?


The window parameters are calculated by software and then written to hardware so
hardware does filtering and prevents bad devices from memory corruption.


> David had suggested that we could
> implement a VFIO_IOMMU_GET_INFO ioctl that returns something like:
> 
> struct vfio_iommu_info {
> __u32   argsz;
> __u32   flags;
> __u64   iova_max;   /* Maximum IOVA address */
> __u64   iova_min;   /* Minimum IOVA address */
> __u64   pgsize_bitmap;  /* Bitmap of supported page sizes */
> };
> 
> The thought being a TBD IOMMU API interface reports the hardware
> determined IOVA range and we could fudge it on x86 for now reporting
> 0/~0.  Maybe we should replace iova_max/iova_min with
> iova_base/iova_size and allow the caller to request a size by setting
> iova_size and matching bit in the flags.


No, we need some sort of SET_INFO, not GET as we want QEMU to decide on a DMA
window size.

Or simply add these parameters to the domain allocation callback.


>> diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
>> index 029dae3..57fb70d 100644
>> --- a/drivers/vfio/vfio_iommu.c
>> +++ b/drivers/vfio/vfio_iommu.c
>> @@ -507,6 +507,23 @@ static long vfio_iommu_unl_ioctl(struct file *filep,
>>
>>  if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
>>  ret = -EFAULT;
>> +
>> +} else if (cmd == VFIO_IOMMU_SETUP) {
>> +struct vfio_setup setup;
>> +size_t allocated_size = 0;
>> +phys_addr_t start_address = 0;
>> +
>> +if (copy_from_user(&setup, (void __user *)arg, sizeof setup))
>> +return -EFAULT;
>> +
>> +printk("udomain %p, priv=%p\n

Re: [PATCH 2/5] KVM: MMU: audit: replace mmu audit tracepoint with jump-label

2011-11-28 Thread Xiao Guangrong
On 11/29/2011 06:33 AM, Jason Baron wrote:

 
> hmmm.. this is always going to do a call to 'kvm_mmu_audit' and then return.
> I think you want to avoid the function call altogether. You could do
> something like:
> 
> #define kvm_mmu_audit()
>   if (static_branch((&mmu_audit_key))) {
>   __kvm_mmu_audit();
>   }
> 
> and s/kvm_mmu_audit/__kvm_mmu_audit
> 
> That should give you a single nop for the case where kvm_mmu_audit is
> disabled instead of a function call.


Good point, thanks Jason!

Avi, could you please apply the following patch instead?

Subject: [PATCH v2 2/5] KVM: MMU: audit: replace mmu audit tracepoint with jump-label

The tracepoint is only used to audit mmu code; it should not be exposed to
userspace, so let us replace it with a jump label.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   23 ---
 arch/x86/kvm/mmu_audit.c   |   17 +
 arch/x86/kvm/mmutrace.h|   19 ---
 arch/x86/kvm/paging_tmpl.h |4 ++--
 4 files changed, 23 insertions(+), 40 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d737443..34bc3fc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2840,6 +2840,13 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
return mmu_alloc_shadow_roots(vcpu);
 }

+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
+static void mmu_audit_disable(void) { }
+#endif
+
 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
int i;
@@ -2852,12 +2859,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
return;

vcpu_clear_mmio_info(vcpu, ~0ul);
-   trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+   kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
-   trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+   kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
return;
}
for (i = 0; i < 4; ++i) {
@@ -2869,7 +2876,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
mmu_sync_children(vcpu, sp);
}
}
-   trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+   kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 }

 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -3667,7 +3674,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,

spin_lock(&vcpu->kvm->mmu_lock);
++vcpu->kvm->stat.mmu_pte_write;
-   trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
+   kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);

mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
@@ -3700,7 +3707,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-   trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
+   kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
spin_unlock(&vcpu->kvm->mmu_lock);
 }

@@ -4030,12 +4037,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
mmu_free_memory_caches(vcpu);
 }

-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void mmu_audit_disable(void) { }
-#endif
-
 void kvm_mmu_module_exit(void)
 {
mmu_destroy_caches();
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 746ec25..967a535 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -224,7 +224,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
mmu_spte_walk(vcpu, audit_spte);
 }

-static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
+static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
 {
static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);

@@ -237,17 +237,19 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu 
*vcpu, int point)
 }

 static bool mmu_audit;
+static struct jump_label_key mmu_audit_key;
+
+#define kvm_mmu_audit(vcpu, point) \
+   if (static_branch((&mmu_audit_key))) {  \
+   __kvm_mmu_audit(vcpu, point);   \
+   }

 static void mmu_audit_enable(void)
 {
-   int ret;
-
if (mmu_audit)
return;

-   ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-   WARN_ON(ret);
-
+   jump_label_inc(&mmu_audit_key);
mmu_audit = true;
 }

@@ -256,8 +258,7 @@ static void mmu_audit_disable(void)
if (!mmu_audit)
return;

-   unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-   tracepoint_synchronize_unregister();
+   jump_label_dec(&mmu_audit_key);
mmu_audit = false;
 }

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index eed67f3..89fb0e

Re: [RFC PATCH] vfio: VFIO Driver core framework

2011-11-28 Thread Alex Williamson
On Tue, 2011-11-29 at 13:01 +1100, Alexey Kardashevskiy wrote:
> Hi all,
> 
> Another problem I hit on POWER - MSI interrupt allocation. The existing VFIO
> does not expect a PHB to support fewer interrupts than a device might request.
> In my case, the PHB's limit is 8 interrupts while my test card (10Gb ethernet
> CXGB3) wants 9. Below are the patches to demonstrate the idea.

Seems reasonable.  I assume we'd need similar for vfio_pci_setup_msi,
though I haven't seen anything use more than a single MSI interrupt.
Thanks,

Alex

> KERNEL patch:
> 
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> index 7d45c6b..d44b9bf 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -458,17 +458,32 @@ int vfio_pci_setup_msix(struct vfio_pci_device *vdev, 
> int nvec, int __user *inta
>   vdev->msix[i].entry = i;
>   vdev->ev_msix[i] = ctx;
>   }
> - if (!ret)
> + if (!ret) {
>   ret = pci_enable_msix(pdev, vdev->msix, nvec);
> + /*
> +The kernel is unable to allocate requested number of IRQs
> +and returned the available number.
> +  */
> + if (0 < ret) {
> + ret = pci_enable_msix(pdev, vdev->msix, ret);
> + }
> + }
>   vdev->msix_nvec = 0;
> - for (i = 0; i < nvec && !ret; i++) {
> - ret = request_irq(vdev->msix[i].vector, msihandler, 0,
> -   "vfio", vdev->ev_msix[i]);
> - if (ret)
> - break;
> - vdev->msix_nvec = i+1;
> + if (0 == ret) {
> + vdev->msix_nvec = 0;
> + ret = 0;
> + for (i = 0; i < nvec && !ret; i++) {
> + ret = request_irq(vdev->msix[i].vector, msihandler, 0,
> + "vfio", vdev->ev_msix[i]);
> + if (ret)
> + break;
> + vdev->msix_nvec = i+1;
> + }
> + if ((0 == vdev->msix_nvec) && (0 != ret))
> + vfio_pci_drop_msix(vdev);
> + else
> + ret = vdev->msix_nvec;
>   }
> - if (ret)
> - vfio_pci_drop_msix(vdev);
> +
>   return ret;
>  }
> 
> === end ===
> 
> 
> QEMU patch:
> 
> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> index 020961a..980eec7 100644
> --- a/hw/vfio_pci.c
> +++ b/hw/vfio_pci.c
> @@ -341,7 +341,8 @@ static void vfio_enable_msi(VFIODevice *vdev, bool msix)
>  }
>  }
> 
> -if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, fds)) {
> +ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, fds);
> +if (0 > ret) {
>  fprintf(stderr, "vfio: Error: Failed to setup MSI/X fds %s\n",
>  strerror(errno));
>  for (i = 0; i < vdev->nr_vectors; i++) {
> @@ -355,6 +356,8 @@ static void vfio_enable_msi(VFIODevice *vdev, bool msix)
>  qemu_free(vdev->msi_vectors);
>  vdev->nr_vectors = 0;
>  return;
> +} else if (0 < ret) {
> +vdev->nr_vectors = ret;
>  }
> 
>  vdev->interrupt = msix ? INT_MSIX : INT_MSI;
> 
> 
> === end ===




Re: [RFC PATCH] vfio: VFIO Driver core framework

2011-11-28 Thread Alex Williamson
On Tue, 2011-11-29 at 12:52 +1100, Alexey Kardashevskiy wrote:
> Hi!
> 
> I tried (successfully) to run it on POWER and while doing that I found some 
> issues. I'll try to
> explain them in separate mails.

Great!

> IOMMU domain setup. On POWER, the linux drivers capable of DMA transfer want 
> to know
> a DMA window, i.e. its start and length in the PHB address space. This comes 
> from
> hardware. On X86 (correct me if I am wrong), every device driver in the guest 
> allocates
> memory from the same pool.

Yes, current VT-d/AMD-Vi provide independent IOVA spaces for each
device.

>  On POWER, device drivers get DMA window and allocate pages
> for DMA within this window. In the case of VFIO, that means that QEMU has to
> preallocate this DMA window before running a guest, pass it to a guest (via
> device tree) and then a guest tells the host what pages are taken/released by
> calling map/unmap callbacks of iommu_ops. Deallocation is made in a device 
> detach
> callback as I did not want to add more ioctls.
> So, there are 2 patches:
> 
> - new VFIO_IOMMU_SETUP ioctl introduced which allocates a DMA window via 
> IOMMU API on
> POWER.
> btw do we need an additional capability bit for it?
> 
> KERNEL PATCH:
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 10615ad..a882e08 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -247,3 +247,12 @@ int iommu_device_group(struct device *dev, unsigned int 
> *groupid)
>   return -ENODEV;
>  }
>  EXPORT_SYMBOL_GPL(iommu_device_group);
> +
> +int iommu_setup(struct iommu_domain *domain,
> + size_t requested_size, size_t *allocated_size,
> + phys_addr_t *start_address)
> +{
> + return domain->ops->setup(domain, requested_size, allocated_size,
> + start_address);
> +}
> +EXPORT_SYMBOL_GPL(iommu_setup);

requested_size seems redundant both here and in struct vfio_setup.  We
can just pre-load size/start with desired values.  I assume x86 IOMMUs
would ignore requested values and return start = 0 and size = hardware
decoder address bits.  The IOMMU API currently allows:

iommu_domain_alloc
[iommu_attach_device]
[iommu_map]
[iommu_unmap]
[iommu_detach_device]
iommu_domain_free

where everything between alloc and free can be called in any order.  How
does setup fit into that model?  For this it seems like we'd almost want
to combine alloc, setup, and the first attach into a single call (ie.
create a domain with this initial device and these parameters), then
subsequent attaches would only allow compatible devices.

I'm a little confused though, is the window determined by hardware or is
it configurable via requested_size?  David had suggested that we could
implement a VFIO_IOMMU_GET_INFO ioctl that returns something like:

struct vfio_iommu_info {
__u32   argsz;
__u32   flags;
__u64   iova_max;   /* Maximum IOVA address */
__u64   iova_min;   /* Minimum IOVA address */
__u64   pgsize_bitmap;  /* Bitmap of supported page sizes */
};

The thought being a TBD IOMMU API interface reports the hardware
determined IOVA range and we could fudge it on x86 for now reporting
0/~0.  Maybe we should replace iova_max/iova_min with
iova_base/iova_size and allow the caller to request a size by setting
iova_size and matching bit in the flags.
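
Roughly, with a hypothetical request flag, the caller side could then look
like this (a sketch only; the flag name and its semantics are my assumptions,
and iova_base/iova_size are the proposed replacement fields):

	/* Userspace requesting a DMA window size (hypothetical flag). */
	struct vfio_iommu_info info = {
		.argsz     = sizeof(info),
		.flags     = VFIO_IOMMU_INFO_REQ_SIZE,	/* hypothetical */
		.iova_size = 1ULL << 30,	/* ask for a 1GB window */
	};

	if (ioctl(iommu_fd, VFIO_IOMMU_GET_INFO, &info) == 0)
		/* iova_base/iova_size now hold what the IOMMU granted. */
		use_dma_window(info.iova_base, info.iova_size);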

> diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
> index 029dae3..57fb70d 100644
> --- a/drivers/vfio/vfio_iommu.c
> +++ b/drivers/vfio/vfio_iommu.c
> @@ -507,6 +507,23 @@ static long vfio_iommu_unl_ioctl(struct file *filep,
> 
>   if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
>   ret = -EFAULT;
> +
> + } else if (cmd == VFIO_IOMMU_SETUP) {
> + struct vfio_setup setup;
> + size_t allocated_size = 0;
> + phys_addr_t start_address = 0;
> +
> + if (copy_from_user(&setup, (void __user *)arg, sizeof setup))
> + return -EFAULT;
> +
> + printk("udomain %p, priv=%p\n", iommu->domain, 
> iommu->domain->priv);
> + ret = iommu_setup(iommu->domain, setup.requested_size,
> + &allocated_size, &start_address);
> + setup.allocated_size = allocated_size;
> + setup.start_address = start_address;
> +
> + if (!ret && copy_to_user((void __user *)arg, &setup, sizeof 
> setup))
> + ret = -EFAULT;
>   }
>   return ret;
>  }
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 93617e7..355cf8b 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -45,6 +45,7 @@ struct iommu_domain {
> 
>  #define IOMMU_CAP_CACHE_COHERENCY0x1
>  #define IOMMU_CAP_INTR_REMAP 0x2 /* isolates device intrs */
> +#define IOMMU_CAP_SETUP_REQUIRED 0x3 /* requires setup to be called 
> */
> 
>  #ifdef CONFIG_IOMMU_API
> 
> @@ -62,6 +63,9 @@ struct iommu_op

Re: [RFC PATCH] vfio: VFIO Driver core framework

2011-11-28 Thread Alexey Kardashevskiy
Hi all again,

It was actually the very first problem - endianness :-)
I am still not sure what format is better for the cached config space, or
whether we should cache it at all.

Also, as Benh already mentioned, vfio_virt_init reads the config space into a
cache with pci_read_config_dword for the whole space, which some devices may
not like, as they might distinguish the length of PCI transactions.
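
The underlying endianness issue, in miniature (a sketch, not the vfio code):

	u32 val;

	/* pci_read_config_dword() returns a host-native value, but the
	 * cache is later consumed as if it were raw little-endian config
	 * space.  On a big-endian host the two differ, hence the
	 * cpu_to_le32() fixups in the patch below. */
	pci_read_config_dword(pdev, pos, &val);		/* native-endian */
	*(u32 *)&vdev->vconfig[pos] = cpu_to_le32(val);	/* stored as LE  */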



KERNEL patch:

diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index b3bab99..9d563b4 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -757,6 +757,16 @@ static int vfio_virt_init(struct vfio_pci_device *vdev)
vdev->rbar[5] = *(u32 *)&vdev->vconfig[PCI_BASE_ADDRESS_5];
vdev->rbar[6] = *(u32 *)&vdev->vconfig[PCI_ROM_ADDRESS];

+   /*
+* As pci_read_config_ returns data in native format,
+* and the cached copy is used in assumption that it is
+* native PCI format, fix endianness in the cached copy.
+*/
+   lp = (u32 *)vdev->vconfig;
+   for (i = 0; i < pdev->cfg_size/sizeof(u32); i++, lp++) {
+   *lp = cpu_to_le32(*lp);
+   }
+
/* for sr-iov devices */
vdev->vconfig[PCI_VENDOR_ID] = pdev->vendor & 0xFF;
vdev->vconfig[PCI_VENDOR_ID+1] = pdev->vendor >> 8;
@@ -807,18 +817,18 @@ static void vfio_bar_fixup(struct vfio_pci_device *vdev)
else
mask = 0;
lp = (u32 *)(vdev->vconfig + PCI_BASE_ADDRESS_0 + 4*bar);
-   *lp &= (u32)mask;
+   *lp &= cpu_to_le32((u32)mask);

if (pci_resource_flags(pdev, bar) & IORESOURCE_IO)
-   *lp |= PCI_BASE_ADDRESS_SPACE_IO;
+   *lp |= cpu_to_le32(PCI_BASE_ADDRESS_SPACE_IO);
else if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) {
-   *lp |= PCI_BASE_ADDRESS_SPACE_MEMORY;
+   *lp |= cpu_to_le32(PCI_BASE_ADDRESS_SPACE_MEMORY);
if (pci_resource_flags(pdev, bar) & IORESOURCE_PREFETCH)
-   *lp |= PCI_BASE_ADDRESS_MEM_PREFETCH;
+   *lp |= cpu_to_le32(PCI_BASE_ADDRESS_MEM_PREFETCH);
if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM_64) {
-   *lp |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+   *lp |= cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64);
lp++;
-   *lp &= (u32)(mask >> 32);
+   *lp &= cpu_to_le32((u32)(mask >> 32));
bar++;
}
}
@@ -830,7 +840,7 @@ static void vfio_bar_fixup(struct vfio_pci_device *vdev)
} else
mask = 0;
lp = (u32 *)(vdev->vconfig + PCI_ROM_ADDRESS);
-   *lp &= (u32)mask;
+   *lp &= cpu_to_le32((u32)mask);

vdev->bardirty = 0;
 }


=== end ===




QEMU patch:


diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index 980eec7..1c97c35 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -405,6 +405,8 @@ static void vfio_resource_write(void *opaque, 
target_phys_addr_t addr,
 {
 PCIResource *res = opaque;

+fprintf(stderr, "change endianness\n");
+
 if (pwrite(res->fd, &data, size, res->offset + addr) != size) {
 fprintf(stderr, "%s(,0x%"PRIx64", 0x%"PRIx64", %d) failed: %s\n",
 __FUNCTION__, addr, data, size, strerror(errno));
@@ -429,6 +431,9 @@ static uint64_t vfio_resource_read(void *opaque,
 DPRINTF("%s(BAR%d+0x%"PRIx64", %d) = 0x%"PRIx64"\n",
 __FUNCTION__, res->bar, addr, size, data);

+data = le32_to_cpu(data);
+DPRINTF("%s(BAR%d+0x%"PRIx64", %d) = 0x%"PRIx64" --- CPU\n",
+__FUNCTION__, res->bar, addr, size, data);
 return data;
 }

@@ -454,13 +459,25 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, 
uint32_t addr, int len)

 val = pci_default_read_config(pdev, addr, len);
 } else {
-if (pread(vdev->fd, &val, len, vdev->config_offset + addr) != len) {
+u8 buf[4] = {0};
+if (pread(vdev->fd, buf, len, vdev->config_offset + addr) != len) {
 fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %s\n",
 __FUNCTION__, vdev->host.seg, vdev->host.bus,
 vdev->host.dev, vdev->host.func, addr, len,
 strerror(errno));
 return -1;
 }
+   switch (len) {
+case 1: val = buf[0]; break;
+case 2: val = le16_to_cpupu((uint16_t*)buf); break;
+case 4: val = le32_to_cpupu((uint32_t*)buf); break;
+default:
+fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: 
%s\n",
+__FUNCTION__, vdev->host.seg, vdev->host.bus,
+vdev->host.dev, vdev->host.func,

Re: [RFC PATCH] vfio: VFIO Driver core framework

2011-11-28 Thread Alexey Kardashevskiy
Hi all,

Another problem I hit on POWER - MSI interrupt allocation. The existing VFIO
does not expect a PHB to support fewer interrupts than a device might request.
In my case, the PHB's limit is 8 interrupts while my test card (10Gb ethernet
CXGB3) wants 9. Below are the patches to demonstrate the idea.


KERNEL patch:

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 7d45c6b..d44b9bf 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -458,17 +458,32 @@ int vfio_pci_setup_msix(struct vfio_pci_device *vdev, int 
nvec, int __user *inta
vdev->msix[i].entry = i;
vdev->ev_msix[i] = ctx;
}
-   if (!ret)
+   if (!ret) {
ret = pci_enable_msix(pdev, vdev->msix, nvec);
+   /*
+  The kernel is unable to allocate requested number of IRQs
+  and returned the available number.
+*/
+   if (0 < ret) {
+   ret = pci_enable_msix(pdev, vdev->msix, ret);
+   }
+   }
vdev->msix_nvec = 0;
-   for (i = 0; i < nvec && !ret; i++) {
-   ret = request_irq(vdev->msix[i].vector, msihandler, 0,
- "vfio", vdev->ev_msix[i]);
-   if (ret)
-   break;
-   vdev->msix_nvec = i+1;
+   if (0 == ret) {
+   vdev->msix_nvec = 0;
+   ret = 0;
+   for (i = 0; i < nvec && !ret; i++) {
+   ret = request_irq(vdev->msix[i].vector, msihandler, 0,
+   "vfio", vdev->ev_msix[i]);
+   if (ret)
+   break;
+   vdev->msix_nvec = i+1;
+   }
+   if ((0 == vdev->msix_nvec) && (0 != ret))
+   vfio_pci_drop_msix(vdev);
+   else
+   ret = vdev->msix_nvec;
}
-   if (ret)
-   vfio_pci_drop_msix(vdev);
+
return ret;
 }

=== end ===


QEMU patch:

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index 020961a..980eec7 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -341,7 +341,8 @@ static void vfio_enable_msi(VFIODevice *vdev, bool msix)
 }
 }

-if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, fds)) {
+ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, fds);
+if (0 > ret) {
 fprintf(stderr, "vfio: Error: Failed to setup MSI/X fds %s\n",
 strerror(errno));
 for (i = 0; i < vdev->nr_vectors; i++) {
@@ -355,6 +356,8 @@ static void vfio_enable_msi(VFIODevice *vdev, bool msix)
 qemu_free(vdev->msi_vectors);
 vdev->nr_vectors = 0;
 return;
+} else if (0 < ret) {
+vdev->nr_vectors = ret;
 }

 vdev->interrupt = msix ? INT_MSIX : INT_MSI;


=== end ===




On 29/11/11 12:52, Alexey Kardashevskiy wrote:
> Hi!
> 
> I tried (successfully) to run it on POWER and while doing that I found some 
> issues. I'll try to
> explain them in separate mails.
> 
> 
> 
> On 04/11/11 07:12, Alex Williamson wrote:
>> VFIO provides a secure, IOMMU based interface for user space
>> drivers, including device assignment to virtual machines.
>> This provides the base management of IOMMU groups, devices,
>> and IOMMU objects.  See Documentation/vfio.txt included in
>> this patch for user and kernel API description.
>>
>> Note, this implements the new API discussed at KVM Forum
>> 2011, as represented by the driver version 0.2.  It's hoped
>> that this provides a modular enough interface to support PCI
>> and non-PCI userspace drivers across various architectures
>> and IOMMU implementations.
>>
>> Signed-off-by: Alex Williamson 
>> ---
>>
>> Fingers crossed, this is the last RFC for VFIO, but we need
>> the iommu group support before this can go upstream
>> (http://lkml.indiana.edu/hypermail/linux/kernel/1110.2/02303.html),
>> hoping this helps push that along.
>>
>> Since the last posting, this version completely modularizes
>> the device backends and better defines the APIs between the
>> core VFIO code and the device backends.  I expect that we
>> might also adopt a modular IOMMU interface as iommu_ops learns
>> about different types of hardware.  Also many, many cleanups.
>> Check the complete git history for details:
>>
>> git://github.com/awilliam/linux-vfio.git vfio-ng
>>
>> (matching qemu tree: git://github.com/awilliam/qemu-vfio.git)
>>
>> This version, along with the supporting VFIO PCI backend can
>> be found here:
>>
>> git://github.com/awilliam/linux-vfio.git vfio-next-2003
>>
>> I've held off on implementing a kernel->user signaling
>> mechanism for now since the previous netlink version produced
>> too many gag reflexes.  It's easy enough to set a bit in the
>> group flags to indicate such support in the future, so I
>> think we can move ahead without it.
>>
>> Appreciate any 

Re: [RFC PATCH] vfio: VFIO Driver core framework

2011-11-28 Thread Alexey Kardashevskiy
Hi!

I tried (successfully) to run it on POWER and while doing that I found some 
issues. I'll try to
explain them in separate mails.

IOMMU domain setup. On POWER, the linux drivers capable of DMA transfer want to 
know
a DMA window, i.e. its start and length in the PHB address space. This comes 
from
hardware. On X86 (correct me if I am wrong), every device driver in the guest 
allocates
memory from the same pool. On POWER, device drivers get DMA window and allocate 
pages
for DMA within this window. In the case of VFIO, that means that QEMU has to
preallocate this DMA window before running a guest, pass it to a guest (via
device tree) and then a guest tells the host what pages are taken/released by
calling map/unmap callbacks of iommu_ops. Deallocation is made in a device 
detach
callback as I did not want to add more ioctls.
So, there are 2 patches:

- new VFIO_IOMMU_SETUP ioctl introduced which allocates a DMA window via IOMMU 
API on
POWER.
btw do we need an additional capability bit for it?

KERNEL PATCH:

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 10615ad..a882e08 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -247,3 +247,12 @@ int iommu_device_group(struct device *dev, unsigned int 
*groupid)
return -ENODEV;
 }
 EXPORT_SYMBOL_GPL(iommu_device_group);
+
+int iommu_setup(struct iommu_domain *domain,
+   size_t requested_size, size_t *allocated_size,
+   phys_addr_t *start_address)
+{
+   return domain->ops->setup(domain, requested_size, allocated_size,
+   start_address);
+}
+EXPORT_SYMBOL_GPL(iommu_setup);
diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
index 029dae3..57fb70d 100644
--- a/drivers/vfio/vfio_iommu.c
+++ b/drivers/vfio/vfio_iommu.c
@@ -507,6 +507,23 @@ static long vfio_iommu_unl_ioctl(struct file *filep,

if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
ret = -EFAULT;
+
+   } else if (cmd == VFIO_IOMMU_SETUP) {
+   struct vfio_setup setup;
+   size_t allocated_size = 0;
+   phys_addr_t start_address = 0;
+
+   if (copy_from_user(&setup, (void __user *)arg, sizeof setup))
+   return -EFAULT;
+
+   printk("udomain %p, priv=%p\n", iommu->domain, 
iommu->domain->priv);
+   ret = iommu_setup(iommu->domain, setup.requested_size,
+   &allocated_size, &start_address);
+   setup.allocated_size = allocated_size;
+   setup.start_address = start_address;
+
+   if (!ret && copy_to_user((void __user *)arg, &setup, sizeof 
setup))
+   ret = -EFAULT;
}
return ret;
 }
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 93617e7..355cf8b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -45,6 +45,7 @@ struct iommu_domain {

 #define IOMMU_CAP_CACHE_COHERENCY  0x1
 #define IOMMU_CAP_INTR_REMAP   0x2 /* isolates device intrs */
+#define IOMMU_CAP_SETUP_REQUIRED   0x3 /* requires setup to be called 
*/

 #ifdef CONFIG_IOMMU_API

@@ -62,6 +63,9 @@ struct iommu_ops {
int (*domain_has_cap)(struct iommu_domain *domain,
  unsigned long cap);
int (*device_group)(struct device *dev, unsigned int *groupid);
+   int (*setup)(struct iommu_domain *domain,
+size_t requested_size, size_t *allocated_size,
+phys_addr_t *start_address);
 };

 extern int bus_set_iommu(struct bus_type *bus, struct iommu_ops *ops);
@@ -80,6 +84,9 @@ extern phys_addr_t iommu_iova_to_phys(struct iommu_domain 
*domain,
  unsigned long iova);
 extern int iommu_domain_has_cap(struct iommu_domain *domain,
unsigned long cap);
+extern int iommu_setup(struct iommu_domain *domain,
+  size_t requested_size, size_t *allocated_size,
+  phys_addr_t *start_address);
 extern void iommu_set_fault_handler(struct iommu_domain *domain,
iommu_fault_handler_t handler);
 extern int iommu_device_group(struct device *dev, unsigned int *groupid);
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 971e3b1..5e0ee75 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -26,6 +26,7 @@
  * Author: Michael S. Tsirkin 
  */
 #include 
+#include 

 #ifndef VFIO_H
 #define VFIO_H
@@ -172,4 +173,13 @@ enum {
VFIO_PCI_NUM_IRQS
 };

+/* Setup domain */
+#define VFIO_IOMMU_SETUP   _IOWR(';', 150, struct vfio_setup)
+
+struct vfio_setup {
+   __u64   requested_size;
+   __u64   allocated_size;
+   __u64   start_address;
+};
+
  #endif /* VFIO_H */

=== end ===


QEMU PATCH:

diff --git a/hw/linux-vfio.h b/hw/linux-vfio.h
index ac48d85..a2c719f 100644
--- a/hw/linux-vfio.h
+++ b/hw/linux-vf

[PATCH 2/3] KVM: booke: Add booke206 TLB trace

2011-11-28 Thread Scott Wood
From: Liu Yu 

Signed-off-by: Liu Yu 
[scottw...@freescale.com: made mas2 64-bit, and added mas8 init]
Signed-off-by: Scott Wood 
---
 arch/powerpc/kvm/e500_tlb.c |   10 ---
 arch/powerpc/kvm/trace.h|   57 +++
 2 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 5073768..d041f5e 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -294,6 +294,9 @@ static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32));
asm volatile("isync; tlbwe" : : : "memory");
local_irq_restore(flags);
+
+   trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1,
+ stlbe->mas2, stlbe->mas7_3);
 }
 
 /* esel is index into set, not whole array */
@@ -308,8 +311,6 @@ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
  MAS0_TLBSEL(1) |
  MAS0_ESEL(to_htlb1_esel(esel)));
}
-   trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2,
-(u32)stlbe->mas7_3, (u32)(stlbe->mas7_3 >> 32));
 }
 
 void kvmppc_map_magic(struct kvm_vcpu *vcpu)
@@ -331,6 +332,7 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu)
magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) |
   MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
+   magic.mas8 = 0;
 
__write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
preempt_enable();
@@ -946,8 +948,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
gtlbe->mas2 = vcpu->arch.shared->mas2;
gtlbe->mas7_3 = vcpu->arch.shared->mas7_3;
 
-   trace_kvm_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, gtlbe->mas2,
-(u32)gtlbe->mas7_3, (u32)(gtlbe->mas7_3 >> 32));
+   trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1,
+ gtlbe->mas2, gtlbe->mas7_3);
 
/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
if (tlbe_is_host_safe(vcpu, gtlbe)) {
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index b135d3d..f2ea44b 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -337,6 +337,63 @@ TRACE_EVENT(kvm_book3s_slbmte,
 
 #endif /* CONFIG_PPC_BOOK3S */
 
+
+/*************************************************************************
+ *                          Book3E trace points                          *
+ *************************************************************************/
+
+#ifdef CONFIG_BOOKE
+
+TRACE_EVENT(kvm_booke206_stlb_write,
+   TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3),
+   TP_ARGS(mas0, mas8, mas1, mas2, mas7_3),
+
+   TP_STRUCT__entry(
+   __field(__u32,  mas0)
+   __field(__u32,  mas8)
+   __field(__u32,  mas1)
+   __field(__u64,  mas2)
+   __field(__u64,  mas7_3  )
+   ),
+
+   TP_fast_assign(
+   __entry->mas0   = mas0;
+   __entry->mas8   = mas8;
+   __entry->mas1   = mas1;
+   __entry->mas2   = mas2;
+   __entry->mas7_3 = mas7_3;
+   ),
+
+   TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx",
+   __entry->mas0, __entry->mas8, __entry->mas1,
+   __entry->mas2, __entry->mas7_3)
+);
+
+TRACE_EVENT(kvm_booke206_gtlb_write,
+   TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3),
+   TP_ARGS(mas0, mas1, mas2, mas7_3),
+
+   TP_STRUCT__entry(
+   __field(__u32,  mas0)
+   __field(__u32,  mas1)
+   __field(__u64,  mas2)
+   __field(__u64,  mas7_3  )
+   ),
+
+   TP_fast_assign(
+   __entry->mas0   = mas0;
+   __entry->mas1   = mas1;
+   __entry->mas2   = mas2;
+   __entry->mas7_3 = mas7_3;
+   ),
+
+   TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx",
+   __entry->mas0, __entry->mas1,
+   __entry->mas2, __entry->mas7_3)
+);
+
+#endif
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */
-- 
1.7.7.rc3.4.g8d714




[PATCH 3/3] KVM: PPC: e500: use hardware hint when loading TLB0 entries

2011-11-28 Thread Scott Wood
The hardware maintains a per-set next victim hint.  Using this
reduces conflicts, especially on e500v2 where a single guest
TLB entry is mapped to two shadow TLB entries (user and kernel).
We want those two entries to go to different TLB ways.

sesel is now only used for TLB1.

Reported-by: Liu Yu 
Signed-off-by: Scott Wood 
---
 arch/powerpc/include/asm/mmu-book3e.h |1 +
 arch/powerpc/kvm/e500_tlb.c   |   68 ++--
 2 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index 4c30de3..3a7042d 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -44,6 +44,7 @@
 #define MAS0_ESEL(x)   (((x) << 16) & 0x0FFF0000)
 #define MAS0_NV(x) ((x) & 0x00000FFF)
 #define MAS0_ESEL_MASK 0x0FFF0000
+#define MAS0_ESEL_SHIFT    16
 #define MAS0_HES   0x00004000
 #define MAS0_WQ_ALLWAYS    0x00000000
 #define MAS0_WQ_COND   0x00001000
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index d041f5e..6be6917 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -299,17 +299,40 @@ static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
  stlbe->mas2, stlbe->mas7_3);
 }
 
-/* esel is index into set, not whole array */
+/*
+ * Acquire a mas0 with victim hint, as if we just took a TLB miss.
+ *
+ * We don't care about the address we're searching for, other than that
+ * it's right set and is not present in the TLB.  Using a zero PID and a
+ * userspace address means we don't have to set and then restore MAS5, or
+ * calculate a proper MAS6 value.
+ */
+static u32 get_host_mas0(unsigned long eaddr)
+{
+   unsigned long flags, mas0;
+
+   local_irq_save(flags);
+   mtspr(SPRN_MAS6, 0);
+   asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET));
+   mas0 = mfspr(SPRN_MAS0);
+   local_irq_restore(flags);
+
+   return mas0;
+}
+
+/* sesel is for tlb1 only */
 static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
-   int tlbsel, int esel, struct kvm_book3e_206_tlb_entry *stlbe)
+   int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe)
 {
+   unsigned long mas0;
+
if (tlbsel == 0) {
-   int way = esel & (vcpu_e500->gtlb_params[0].ways - 1);
-   __write_host_tlbe(stlbe, MAS0_TLBSEL(0) | MAS0_ESEL(way));
+   mas0 = get_host_mas0(stlbe->mas2);
+   __write_host_tlbe(stlbe, mas0);
} else {
__write_host_tlbe(stlbe,
  MAS0_TLBSEL(1) |
- MAS0_ESEL(to_htlb1_esel(esel)));
+ MAS0_ESEL(to_htlb1_esel(sesel)));
}
 }
 
@@ -424,12 +447,6 @@ static int gtlb0_set_base(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t addr)
 vcpu_e500->gtlb_params[0].ways);
 }
 
-static int htlb0_set_base(gva_t addr)
-{
-   return tlb0_set_base(addr, host_tlb_params[0].sets,
-host_tlb_params[0].ways);
-}
-
 static unsigned int get_tlb_esel(struct kvm_vcpu *vcpu, int tlbsel)
 {
struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
@@ -587,10 +604,9 @@ static inline void kvmppc_e500_setup_stlbe(
vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
 }
 
-/* sesel is an index into the entire array, not just the set */
 static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
-   int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe,
+   int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe,
struct tlbe_ref *ref)
 {
struct kvm_memory_slot *slot;
@@ -723,27 +739,19 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 }
 
 /* XXX only map the one-one case, for now use TLB0 */
-static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500,
-   int esel,
-   struct kvm_book3e_206_tlb_entry *stlbe)
+static void kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+int esel,
+struct kvm_book3e_206_tlb_entry *stlbe)
 {
struct kvm_book3e_206_tlb_entry *gtlbe;
struct tlbe_ref *ref;
-   int sesel = esel & (host_tlb_params[0].ways - 1);
-   int sesel_base;
-   gva_t ea;
 
gtlbe = get_entry(vcpu_e500, 0, esel);
ref = &vcpu_e500->gtlb_priv[0][esel].ref;
 
-   ea = get_tlb_eaddr(gtlbe);
-   sesel_base = htlb0_set_base(ea);
-
kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
-   gtl

[PATCH 1/3] KVM: PPC: e500: Fix TLBnCFG in KVM_CONFIG_TLB

2011-11-28 Thread Scott Wood
The associativity, not just total size, can differ from the host
hardware.

Signed-off-by: Scott Wood 
---
 arch/powerpc/kvm/e500_tlb.c |   19 ++-
 1 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 9cd124a..5073768 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -1227,12 +1227,14 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
vcpu_e500->gtlb_offset[0] = 0;
vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0];
 
-   vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL;
+   vcpu_e500->tlb0cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
if (params.tlb_sizes[0] <= 2048)
vcpu_e500->tlb0cfg |= params.tlb_sizes[0];
+   vcpu_e500->tlb0cfg |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
 
-   vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL;
+   vcpu_e500->tlb1cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
vcpu_e500->tlb1cfg |= params.tlb_sizes[1];
+   vcpu_e500->tlb1cfg |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
 
vcpu_e500->shared_tlb_pages = pages;
vcpu_e500->num_shared_tlb_pages = num_pages;
@@ -1348,10 +1350,17 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
goto err;
 
/* Init TLB configuration register */
-   vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL;
+   vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) &
+~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[0].entries;
-   vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL;
-   vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_params[1].entries;
+   vcpu_e500->tlb0cfg |=
+   vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT;
+
+   vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) &
+~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+   vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_params[1].entries;
+   vcpu_e500->tlb1cfg |=
+   vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT;
 
return 0;
 
-- 
1.7.7.rc3.4.g8d714




[PATCH 0/3] KVM: PPC: e500: misc MMU stuff

2011-11-28 Thread Scott Wood
Liu Yu (1):
  KVM: booke: Add booke206 TLB trace

Scott Wood (2):
  KVM: PPC: e500: Fix TLBnCFG in KVM_CONFIG_TLB
  KVM: PPC: e500: use hardware hint when loading TLB0 entries

 arch/powerpc/include/asm/mmu-book3e.h |1 +
 arch/powerpc/kvm/e500_tlb.c   |   97 -
 arch/powerpc/kvm/trace.h  |   57 +++
 3 files changed, 116 insertions(+), 39 deletions(-)

-- 
1.7.7.rc3.4.g8d714



Re: Improving RAID5 write performance in a KVM VM

2011-11-28 Thread Simon Wilson

- Message from Simon Wilson  -
   Date: Fri, 25 Nov 2011 20:34:47 +1000
   From: Simon Wilson 
Subject: Improving RAID5 write performance in a KVM VM
 To: kvm@vger.kernel.org



Good afternoon list.

I have a CentOS 6 server running six KVM VMs, and am struggling with  
poor disk write performance in the VMs.


Setup summary:

Physical server has 2 x 500GB SATA drives in a RAID1 mirror, and 3 x  
2TB SATA drives in a RAID5 stripeset. The 2 x 500GB drives host /,  
/boot, and the VM .img files.


The .img files are mounted as per the following example in virsh xml:

<disk ...>
  <driver .../>
  <source .../>
  <target .../>
  <address ... function='0x0'/>
</disk>


Each virtual server also has a slice of the RAID5 array for data,  
including a CentOS 5.7 Samba server that has a 3TB piece.


The Samba drive is mounted in virsh xml as follows:

<disk ...>
  <driver .../>
  <source .../>
  <target .../>
  <address ... function='0x0'/>
</disk>


Mount parameters in the VM are:

  /dev/vdb1  /mnt/data  ext3  suid,dev,exec,noatime  0  0

When I first built the server, performance in the VMs was terribly
bad with constant CPU IOWAIT, but after a lot of searching, the
addition of io=native significantly improved it.


It is copying file data to the Samba share that is causing me  
issues. Some files I use are many GBs in size. Copies start at  
110MBps across the network, but once the Samba server's network copy  
cache fills up (the server has 4GB RAM allocated, and the cache  
seems to fill after about 2.5 to 3GB copied from the network) they  
hit a wall and crawl to 35MBps.


Allocating the Samba server more RAM tricks Windows into thinking  
the copy is finished sooner (although it still hits a brick wall  
once the larger cache is full), but of course the underlying data  
write continues for quite some time after Windows is done.


I have done a fair bit of analysis with dstat and using dd copies,  
and file writes onto the VM's RAID5 slice sit between 35 and 40MBps  
consistently.


Using Samba on the physical server (not my normal practice, but for
troubleshooting) with a different partition on the same RAID5
array, I get a consistent 75MBps overall write speed.


Reads from the partitions are comparable speed from both physical  
and virtual servers (~100MBps).


Are there any hints from anyone please about how I can improve the  
write performance in the VM? I realise it will likely never achieve  
the bare metal speed, but closer than 50% would be good...


Thanks
Simon.
--
Simon Wilson
M: 0400 12 11 16



Hi all - there's been a lot of activity on the list over the last few  
days since I posted this, but no responses to my query.


The Mailing List web page indicates that user support is welcomed in  
the list, but if this is not an appropriate channel for getting  
assistance could someone please point me in the right direction?


Thanks
Simon.



Re: [PATCH 2/5] KVM: MMU: audit: replace mmu audit tracepoint with jump-label

2011-11-28 Thread Jason Baron
On Mon, Nov 28, 2011 at 08:41:00PM +0800, Xiao Guangrong wrote:
> +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
>  {
>   static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
> 
> - if (!__ratelimit(&ratelimit_state))
> - return;
> + if (static_branch((&mmu_audit_key))) {
> + if (!__ratelimit(&ratelimit_state))
> + return;
> 
> - vcpu->kvm->arch.audit_point = point;
> - audit_all_active_sps(vcpu->kvm);
> - audit_vcpu_spte(vcpu);
> + vcpu->kvm->arch.audit_point = point;
> + audit_all_active_sps(vcpu->kvm);
> + audit_vcpu_spte(vcpu);
> + }
>  }

hmmm.. this is always going to do a call to 'kvm_mmu_audit' and then return.
I think you want to avoid the function call altogether. You could do
something like:

#define kvm_mmu_audit(vcpu, point)			\
	do {						\
		if (static_branch(&mmu_audit_key))	\
			__kvm_mmu_audit(vcpu, point);	\
	} while (0)

and s/kvm_mmu_audit/__kvm_mmu_audit

That should give you a single nop for the case where kvm_mmu_audit is
disabled instead of a function call.

Thanks,

-Jason


Re: [PATCH v2] KVM: PPC: booke: Improve timer register emulation

2011-11-28 Thread Alexander Graf

On 28.11.2011, at 21:59, Stuart Yoder wrote:

> On Fri, Nov 18, 2011 at 7:54 AM, Alexander Graf  wrote:
>> 
>> On 17.11.2011, at 23:39, Scott Wood wrote:
>> 
>>> Decrementers are now properly driven by TCR/TSR, and the guest
>>> has full read/write access to these registers.
>>> 
>>> The decrementer keeps ticking (and setting the TSR bit) regardless of
>>> whether the interrupts are enabled with TCR.
>>> 
>>> The decrementer stops at zero, rather than going negative.
>>> 
>>> Decrementers (and FITs, once implemented) are delivered as
>>> level-triggered interrupts -- dequeued when the TSR bit is cleared, not
>>> on delivery.
>>> 
>>> Signed-off-by: Liu Yu 
>>> [scottw...@freescale.com: significant changes]
>>> Signed-off-by: Scott Wood 
>> 
>> Thanks, applied to kvm-ppc-next.
> 
> Alex, don't see this patch in your tree yet.

Sorry, pushed :)


Alex



Re: [PATCH] kvm tools: Support virtio indirect buffers

2011-11-28 Thread Sasha Levin
On Mon, 2011-11-28 at 22:17 +0200, Sasha Levin wrote:
> On Mon, 2011-11-28 at 20:49 +0200, Pekka Enberg wrote:
> > On Mon, Nov 28, 2011 at 7:54 PM, Sasha Levin  
> > wrote:
> > > Indirect buffers are ring descriptors which point to more (even more)
> > > descriptors.
> > >
> > > This can be used to increase the effective ring capacity, which helps the
> > > guest to batch large requests - very useful for blk devices.
> > >
> > > This patch also enables indirect buffers for virtio-net and virtio-blk.
> > >
> > > The patch is based on the lguest's code which does the same.
> > >
> > > Signed-off-by: Sasha Levin 
> > 
> > In what exact way is it useful? Improved throughput? Will this have
> > negative impact on virtio block or virtio net latency?
> 
> The total size of requests is limited by the size of the virtio ring.
> This makes it hard to squeeze large requests (like the ones you usually
> see with blk devices) into the ring. This patch simply makes each entry
> > in the virtio ring point to another descriptor list, thus it allows
> > squeezing many more requests into a single virtio ring.
> 
> It shouldn't hurt latency.
> 
> I tried getting benchmarks with it, but the results I get from fio are
> all over the place and I can't seem to get a steady result (bad sign for
> my poor spindle disk).

btw, on an unrelated subject, I think that with this patch we've fully
covered the virtio spec, and as far as I know it's the first userspace
implementation which covers the entire spec :)

-- 

Sasha.



Re: [PATCH v2] KVM: PPC: booke: Improve timer register emulation

2011-11-28 Thread Stuart Yoder
On Fri, Nov 18, 2011 at 7:54 AM, Alexander Graf  wrote:
>
> On 17.11.2011, at 23:39, Scott Wood wrote:
>
>> Decrementers are now properly driven by TCR/TSR, and the guest
>> has full read/write access to these registers.
>>
>> The decrementer keeps ticking (and setting the TSR bit) regardless of
>> whether the interrupts are enabled with TCR.
>>
>> The decrementer stops at zero, rather than going negative.
>>
>> Decrementers (and FITs, once implemented) are delivered as
>> level-triggered interrupts -- dequeued when the TSR bit is cleared, not
>> on delivery.
>>
>> Signed-off-by: Liu Yu 
>> [scottw...@freescale.com: significant changes]
>> Signed-off-by: Scott Wood 
>
> Thanks, applied to kvm-ppc-next.

Alex, don't see this patch in your tree yet.

Stuart


Re: [PATCH] kvm tools: Support virtio indirect buffers

2011-11-28 Thread Sasha Levin
On Mon, 2011-11-28 at 20:49 +0200, Pekka Enberg wrote:
> On Mon, Nov 28, 2011 at 7:54 PM, Sasha Levin  wrote:
> > Indirect buffers are ring descriptors which point to more (even more)
> > descriptors.
> >
> > This can be used to increase the effective ring capacity, which helps the
> > guest to batch large requests - very useful for blk devices.
> >
> > This patch also enables indirect buffers for virtio-net and virtio-blk.
> >
> > The patch is based on the lguest's code which does the same.
> >
> > Signed-off-by: Sasha Levin 
> 
> In what exact way is it useful? Improved throughput? Will this have
> negative impact on virtio block or virtio net latency?

The total size of requests is limited by the size of the virtio ring.
This makes it hard to squeeze large requests (like the ones you usually
see with blk devices) into the ring. This patch simply makes each entry
in the virtio ring point to another descriptor list, thus it allows
squeezing many more requests into a single virtio ring.

It shouldn't hurt latency.
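
To make the layout concrete, here is a rough guest-side sketch (not part of
the patch; the three-segment request and the gpa plumbing are made up) of how
a driver publishes an indirect table: a single ring descriptor flagged
VRING_DESC_F_INDIRECT points at an out-of-ring array of descriptors chained
via the next field.

#include <linux/types.h>
#include <linux/virtio_ring.h>

/* Sketch: describe a 3-segment request while consuming only one ring
 * slot.  "table" is a guest-physically-contiguous array living at guest
 * physical address table_gpa; seg_gpa/seg_len describe the data buffers
 * (all of these inputs are hypothetical). */
static void fill_indirect(struct vring_desc *ring_slot,
			  struct vring_desc *table, __u64 table_gpa,
			  const __u64 seg_gpa[3], const __u32 seg_len[3])
{
	int i;

	for (i = 0; i < 3; i++) {
		table[i].addr  = seg_gpa[i];
		table[i].len   = seg_len[i];
		table[i].flags = (i < 2) ? VRING_DESC_F_NEXT : 0;
		table[i].next  = i + 1;
	}

	/* A single in-ring descriptor points at the table; its len tells
	 * the host how many entries the table holds -- len / sizeof(desc)
	 * is exactly what the host side of this patch computes. */
	ring_slot->addr  = table_gpa;
	ring_slot->len   = 3 * sizeof(struct vring_desc);
	ring_slot->flags = VRING_DESC_F_INDIRECT;
}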

I tried getting benchmarks with it, but the results I get from fio are
all over the place and I can't seem to get a steady result (bad sign for
my poor spindle disk).

-- 

Sasha.



Re: [PATCH] kvm tools: Improve virtio blk request processing

2011-11-28 Thread Pekka Enberg
On Mon, Nov 28, 2011 at 7:34 AM, Asias He  wrote:
> There are at most bdev->reqs[VIRTIO_BLK_QUEUE_SIZE] outstanding requests
> at any time.  We can simply use the head of each request to fetch the
> right 'struct blk_dev_req' in bdev->reqs[].
>
> So, we can eliminate the list and lock operations which were introduced by
> virtio_blk_req_{pop, push}.
>
> Signed-off-by: Asias He 

Seems like a reasonable thing to do. Sasha?


Re: [PATCH] kvm tools: Support virtio indirect buffers

2011-11-28 Thread Pekka Enberg
On Mon, Nov 28, 2011 at 7:54 PM, Sasha Levin  wrote:
> Indirect buffers are ring descriptors which point to more (even more)
> descriptors.
>
> This can be used to increase the effective ring capacity, which helps the
> guest to batch large requests - very useful for blk devices.
>
> This patch also enables indirect buffers for virtio-net and virtio-blk.
>
> The patch is based on the lguest's code which does the same.
>
> Signed-off-by: Sasha Levin 

In what exact way is it useful? Improved throughput? Will this have
negative impact on virtio block or virtio net latency?

Pekka


[PATCH] kvm tools: Support virtio indirect buffers

2011-11-28 Thread Sasha Levin
Indirect buffers are ring descriptors which point to more (even more)
descriptors.

This can be used to increase the effective ring capacity, which helps the
guest to batch large requests - very useful for blk devices.

This patch also enables indirect buffers for virtio-net and virtio-blk.

The patch is based on the lguest's code which does the same.

Signed-off-by: Sasha Levin 
---
 tools/kvm/virtio/blk.c  |3 ++-
 tools/kvm/virtio/core.c |   47 ++-
 tools/kvm/virtio/net.c  |3 ++-
 3 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/tools/kvm/virtio/blk.c b/tools/kvm/virtio/blk.c
index 8c6f90b..d1a0197 100644
--- a/tools/kvm/virtio/blk.c
+++ b/tools/kvm/virtio/blk.c
@@ -148,7 +148,8 @@ static u32 get_host_features(struct kvm *kvm, void *dev)
 {
return  1UL << VIRTIO_BLK_F_SEG_MAX
| 1UL << VIRTIO_BLK_F_FLUSH
-   | 1UL << VIRTIO_RING_F_EVENT_IDX;
+   | 1UL << VIRTIO_RING_F_EVENT_IDX
+   | 1UL << VIRTIO_RING_F_INDIRECT_DESC;
 }
 
 static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
diff --git a/tools/kvm/virtio/core.c b/tools/kvm/virtio/core.c
index a6f180e..fe9d588 100644
--- a/tools/kvm/virtio/core.c
+++ b/tools/kvm/virtio/core.c
@@ -33,27 +33,56 @@ struct vring_used_elem *virt_queue__set_used_elem(struct virt_queue *queue, u32
return used_elem;
 }
 
+/*
+ * Each buffer in the virtqueues is actually a chain of descriptors.  This
+ * function returns the next descriptor in the chain, or vq->vring.num if we're
+ * at the end.
+ */
+static unsigned next_desc(struct vring_desc *desc,
+ unsigned int i, unsigned int max)
+{
+   unsigned int next;
+
+   /* If this descriptor says it doesn't chain, we're done. */
+   if (!(desc[i].flags & VRING_DESC_F_NEXT))
+   return max;
+
+   /* Check they're not leading us off end of descriptors. */
+   next = desc[i].next;
+   /* Make sure compiler knows to grab that: we don't want it changing! */
+   wmb();
+
+   return next;
+}
+
 u16 virt_queue__get_head_iov(struct virt_queue *vq, struct iovec iov[], u16 *out, u16 *in, u16 head, struct kvm *kvm)
 {
struct vring_desc *desc;
u16 idx;
+   u16 max;
 
idx = head;
*out = *in = 0;
+   max = vq->vring.num;
+   desc = vq->vring.desc;
+
+   if (desc[idx].flags & VRING_DESC_F_INDIRECT) {
+
+   max = desc[idx].len / sizeof(struct vring_desc);
+   desc = guest_flat_to_host(kvm, desc[idx].addr);
+   idx = 0;
+   }
 
do {
-   desc = virt_queue__get_desc(vq, idx);
-   iov[*out + *in].iov_base = guest_flat_to_host(kvm, desc->addr);
-   iov[*out + *in].iov_len  = desc->len;
-   if (desc->flags & VRING_DESC_F_WRITE)
+   /* Grab the first descriptor, and check it's OK. */
+   iov[*out + *in].iov_len = desc[idx].len;
+   iov[*out + *in].iov_base = guest_flat_to_host(kvm, desc[idx].addr);
+   /* If this is an input descriptor, increment that count. */
+   if (desc[idx].flags & VRING_DESC_F_WRITE)
(*in)++;
else
(*out)++;
-   if (desc->flags & VRING_DESC_F_NEXT)
-   idx = desc->next;
-   else
-   break;
-   } while (1);
+   } while ((idx = next_desc(desc, idx, max)) != max);
 
return head;
 }
diff --git a/tools/kvm/virtio/net.c b/tools/kvm/virtio/net.c
index ce34555..3fb5054 100644
--- a/tools/kvm/virtio/net.c
+++ b/tools/kvm/virtio/net.c
@@ -318,7 +318,8 @@ static u32 get_host_features(struct kvm *kvm, void *dev)
| 1UL << VIRTIO_NET_F_GUEST_UFO
| 1UL << VIRTIO_NET_F_GUEST_TSO4
| 1UL << VIRTIO_NET_F_GUEST_TSO6
-   | 1UL << VIRTIO_RING_F_EVENT_IDX;
+   | 1UL << VIRTIO_RING_F_EVENT_IDX
+   | 1UL << VIRTIO_RING_F_INDIRECT_DESC;
 }
 
 static void set_guest_features(struct kvm *kvm, void *dev, u32 features)
-- 
1.7.8.rc3



[PATCH] [kvm-autotest] tests.cgroup: Add CpuShare test

2011-11-28 Thread Lukas Doktor
 * Adds base _TestCpuShare
 * Adds TestCpuShare10 - cpu.shares ratio is 1:10
 * Adds TestCpuShare50 - cpu.shares ratio is 1:1
 * cgroup_common - incorrect lookup for mounted cgroups
 * kvm cgroup - sort the executed tests (per expression)
 * minor bugfixes

Tests the cpu.shares cgroup capability. It creates n cgroups according
to the self.speeds variable and sufficient VMs to symmetrically test three
different scenarios:

1) #threads == #CPUs
2) #threads + 1 == #CPUs, the +1 thread has the lowest priority (or equal)
3) #threads * #cgroups == #CPUs

The scheduler shouldn't slow down VMs on unoccupied CPUs. With thread
overcommit the scheduler should stabilize according to the speeds
value.

Signed-off-by: Lukas Doktor 
---
 client/tests/cgroup/cgroup_common.py |8 +-
 client/tests/kvm/tests/cgroup.py |  258 +-
 2 files changed, 260 insertions(+), 6 deletions(-)

diff --git a/client/tests/cgroup/cgroup_common.py b/client/tests/cgroup/cgroup_common.py
index 7d5daf2..95b0e61 100755
--- a/client/tests/cgroup/cgroup_common.py
+++ b/client/tests/cgroup/cgroup_common.py
@@ -201,6 +201,8 @@ class Cgroup(object):
 value = str(value)
 if pwd == None:
 pwd = self.root
+if isinstance(pwd, int):
+pwd = self.cgroups[pwd]
 try:
 open(pwd+prop, 'w').write(value)
 except Exception, inst:
@@ -280,8 +282,8 @@ class CgroupModules(object):
 try:
 utils.system('umount %s -l' % self.modules[1][i])
 except Exception, failure_detail:
-logging.warn("CGM: Couldn't unmount %s directory: %s"
- % (self.modules[1][i], failure_detail))
+logging.warn("CGM: Couldn't unmount %s directory: %s",
+ self.modules[1][i], failure_detail)
 try:
 shutil.rmtree(self.mountdir)
 except Exception:
@@ -308,7 +310,7 @@ class CgroupModules(object):
 # Is it already mounted?
 i = False
 for mount in mounts:
-if mount[3].find(module) != -1:
+if  module in mount[3].split(','):
 self.modules[0].append(module)
 self.modules[1].append(mount[1] + '/')
 self.modules[2].append(False)
diff --git a/client/tests/kvm/tests/cgroup.py b/client/tests/kvm/tests/cgroup.py
index 6880c2c..295ce04 100644
--- a/client/tests/kvm/tests/cgroup.py
+++ b/client/tests/kvm/tests/cgroup.py
@@ -8,6 +8,7 @@ from random import random
 from autotest_lib.client.common_lib import error
 from autotest_lib.client.bin import utils
 from autotest_lib.client.tests.cgroup.cgroup_common import Cgroup, CgroupModules
+from autotest_lib.client.virt import virt_utils, virt_env_process
 from autotest_lib.client.virt.aexpect import ExpectTimeoutError
 from autotest_lib.client.virt.aexpect import ExpectProcessTerminatedError
 
@@ -951,7 +952,7 @@ def run_cgroup(test, params, env):
 
 sessions[1].cmd('killall -SIGUSR1 dd')
 for i in range(10):
-logging.debug("Moving vm into cgroup %s." % (i%2))
+logging.debug("Moving vm into cgroup %s.", (i%2))
 assign_vm_into_cgroup(self.vm, self.cgroup, i%2)
 time.sleep(0.1)
 time.sleep(2)
@@ -1080,6 +1081,254 @@ def run_cgroup(test, params, env):
 return ("Limits were enforced successfully.")
 
 
+class _TestCpuShare:
+"""
+Tests the cpu.shares cgroup capability. It creates n cgroups according
+to the self.speeds variable and sufficient VMs to symmetrically test
+three different scenarios.
+1) #threads == #CPUs
+2) #threads + 1 == #CPUs, the +1 thread has the lowest priority (or equal)
+3) #threads * #cgroups == #CPUs
+The cgroup shouldn't slow down VMs on unoccupied CPUs. With thread
+overcommit the scheduler should stabilize according to the speeds
+value.
+"""
+def __init__(self, vms, modules):
+self.vms = vms[:]  # Copy of virt machines
+self.vms_count = len(vms) # Original number of vms
+self.modules = modules  # cgroup module handler
+self.cgroup = Cgroup('cpu', '')   # cgroup cpu handler
+self.speeds = None  # cpu.share values [cg1, cg2]
+self.sessions = []# ssh sessions
+self.serials = []   # serial consoles
+
+
+def cleanup(self):
+""" Cleanup """
+err = ""
+try:
+del(self.cgroup)
+except Exception, failure_detail:
+err += "\nCan't remove Cgroup: %s" % failure_detail
+
+# Stop all VMS in parallel, then check for success.
+for i in range(len(self.vms)):
+self.serials[i].sendline('rm -f /tmp/cgroup-cpu-lock')
+time.sleep(2)
+ 

[kvm-autotest] tests.cgroup: Add CpuShare test

2011-11-28 Thread Lukas Doktor
Hi guys,

I have a new kvm cgroup test for you.

This time it's in the cpu cgroup and it uses the cpu.shares capability. It
modifies the number of shares/time slices which each process gets from the
scheduler. As the verification is kind of tricky, I'm testing only three
different scenarios and only symmetrical solutions:

1) #threads == #CPUs
2) #threads + 1 == #CPUs, +1thread have the lowest priority (or equal)
3) #threads * #cgroups == #CPUs

ad1) When fewer threads are occupied, the utilisation should be equal.
ad2) When there is +1 thread with lower priority, it fights with the
     second thread from the lowest-priority group, thus the utilisations
     are still equal. If you try it the other way around (higher priority),
     this extra thread forces the utilisation into the cgroup ratios. This
     is not tested, as it would be trickier with multiple cgroups/threads.
ad3) When each cgroup has multiple threads, it can fight for its share,
     thus in the end they should all settle on the cgroup-ratio utilisations.

To minimize system impact I'm using 10-100 times bigger slices/priority
than other processes. Still, when you are running other processes on the
system the test might fail. Keep in mind it should run on a clean test
machine.
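
For reference, the knob itself boils down to something like this standalone
sketch (assumptions: the cpu controller is mounted at /cgroup/cpu, and the
group names "fast" and "slow" are hypothetical, pre-created directories):

#include <stdio.h>

/* Sketch: give "fast" 10x the scheduler weight of "slow".  With both
 * groups saturating the same CPUs, their utilisation should settle
 * near the 10:1 ratio -- the VM-based equivalent is what
 * TestCpuShare10 verifies. */
static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	if (write_str("/cgroup/cpu/slow/cpu.shares", "100") ||
	    write_str("/cgroup/cpu/fast/cpu.shares", "1000"))
		return 1;
	/* The competing tasks (here: the qemu processes) would then be
	 * placed via /cgroup/cpu/{fast,slow}/tasks. */
	return 0;
}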

Anyway, be my guests at github pull request:
https://github.com/autotest/autotest/pull/94

Regards,
Lukáš Doktor



Re: [Qemu-devel] KVM call agenda for November 29

2011-11-28 Thread Avi Kivity
On 11/28/2011 04:35 PM, Anthony Liguori wrote:
>> (somewhat related) memory API conversion queue merge plan
>
>
> No need to wait until tomorrow to discuss it, I guess.
>
> 1.1 will open up on Friday.  I was going to make the suggestion that
> if anyone has more than 50 non-trivial patches queued in their trees
> (I suspect a few people do), they split up the pull requests into ~50
> or so chunks and send them with a week or so spacing.
>
> 50 is just a random number.  Just split the requests into reasonable
> chunks so master doesn't totally fall apart all at once :-)
>
> I think memory API may be a special case since a lot of the changes
> are trivial.  So I would expect a larger pull request for the memory
> API bits.

Yes, for the memory API conversion I'd like to use 126-patch chunks.  While
there are probably a few regressions lurking in there, it's easily
bisectable and fixable.  Better in than out.

-- 
error compiling committee.c: too many arguments to function



Re: [PATCH 5/5] KVM: MMU: remove oos_shadow parameter

2011-11-28 Thread Avi Kivity
On 11/28/2011 02:43 PM, Xiao Guangrong wrote:
> The unsync code should be stable now; maybe it is time to remove this
> parameter to clean up the code a little bit.
>

Thanks, all applied.

-- 
error compiling committee.c: too many arguments to function



Re: [PATCH 3/5] KVM: x86: remove the dead code of KVM_EXIT_HYPERCALL

2011-11-28 Thread Avi Kivity
On 11/28/2011 02:56 PM, Gleb Natapov wrote:
> On Mon, Nov 28, 2011 at 08:41:38PM +0800, Xiao Guangrong wrote:
> > KVM_EXIT_HYPERCALL is not used anymore, so remove the code
> > 
> Why not remove the define as well?
>
>

It's in kvm.h, best not to touch it in case someone has

  case KVM_EXIT_HYPERCALL:
   abort();

-- 
error compiling committee.c: too many arguments to function



Re: [Qemu-devel] KVM call agenda for November 29

2011-11-28 Thread Anthony Liguori

On 11/28/2011 08:30 AM, Avi Kivity wrote:
> On 11/28/2011 04:28 PM, Anthony Liguori wrote:
>> On 11/28/2011 08:24 AM, Juan Quintela wrote:
>>> Hi
>>>
>>> Please send in any agenda items you are interested in covering.
>>
>> - 1.1 development window logistics
>
> (somewhat related) memory API conversion queue merge plan


No need to wait until tomorrow to discuss it, I guess.

1.1 will open up on Friday.  I was going to make the suggestion that if anyone 
has more than 50 non-trivial patches queued in their trees (I suspect a few 
people do), they split up the pull requests into ~50 or so chunks and send them 
with a week or so spacing.


50 is just a random number.  Just split the requests into reasonable chunks so 
master doesn't totally fall apart all at once :-)


I think memory API may be a special case since a lot of the changes are
trivial.  So I would expect a larger pull request for the memory API bits.


Regards,

Anthony Liguori







Re: [Qemu-devel] KVM call agenda for November 29

2011-11-28 Thread Avi Kivity
On 11/28/2011 04:28 PM, Anthony Liguori wrote:
> On 11/28/2011 08:24 AM, Juan Quintela wrote:
>>
>> Hi
>>
>> Please send in any agenda items you are interested in covering.
>
> - 1.1 development window logistics

(somewhat related) memory API conversion queue merge plan

-- 
error compiling committee.c: too many arguments to function



Re: [Qemu-devel] KVM call agenda for November 29

2011-11-28 Thread Anthony Liguori

On 11/28/2011 08:24 AM, Juan Quintela wrote:
> Hi
>
> Please send in any agenda items you are interested in covering.

- 1.1 development window logistics

Regards,

Anthony Liguori

> Thanks, Juan.





KVM call agenda for November 29

2011-11-28 Thread Juan Quintela

Hi

Please send in any agenda items you are interested in covering.

Thanks, Juan.


Biweekly KVM Test report, kernel 1db6efe3... qemu 9ac357fd...

2011-11-28 Thread Ren, Yongjie
Hi All,
This is the KVM test result against kvm.git 1db6efe35afcda0bd05e5f4449ea814a9071da7f,
based on 3.2.0-rc2, and qemu-kvm.git 9ac357fdfc51bac582a61f95b32565648b844878.

We found no new bug, and 1 bug was fixed during our test.
The fixed bug is the SR-IOV detachment issue (qemu BZ #875723). Alex (@Redhat)
fixed this issue.
The VT-D/SR-IOV issue (qemu BZ #877155) in windows guests still exists, but
there's a difference now: the VT-D/SR-IOV NIC now works fine in a win2k8 guest,
but not in a win7 or winXP guest. When I reported that bug, the VT-D/SR-IOV NIC
didn't work in win2k8, win7, or winXP guests.

New issue(0):

Fixed issue(1):
1. guest aborts when detaching a SR-IOV VF
  https://bugs.launchpad.net/qemu/+bug/875723

Old issue(1):
1. VT-D NIC doesn't work in windows guest
  https://bugs.launchpad.net/qemu/+bug/877155

Test environment:
================
  Platform      Westmere-EP
  CPU Cores     24
  Memory size   10G

Report summary of IA32E on Westmere-EP:
Summary Test Report of Last Session
=====================================================================
                            Total   Pass    Fail    NoResult    Crash
=====================================================================
control_panel_ept_vpid        12      12      0        0          0
control_panel_ept              4       4      0        0          0
control_panel_vpid             3       3      0        0          0
control_panel                  3       3      0        0          0
gtest_vpid                     1       1      0        0          0
gtest_ept                      1       1      0        0          0
gtest                          3       3      0        0          0
vtd_ept_vpid                   3       2      1        0          0
gtest_ept_vpid                12      12      0        0          0
sriov_ept_vpid                 6       6      0        0          0
=====================================================================
control_panel_ept_vpid        12      12      0        0          0
 :KVM_LM_Continuity_64_g3      1       1      0        0          0
 :KVM_four_dguest_64_g32e      1       1      0        0          0
 :KVM_SR_SMP_64_g32e           1       1      0        0          0
 :KVM_LM_SMP_64_g32e           1       1      0        0          0
 :KVM_1500M_guest_64_gPAE      1       1      0        0          0
 :KVM_linux_win_64_g32e        1       1      0        0          0
 :KVM_1500M_guest_64_g32e      1       1      0        0          0
 :KVM_two_winxp_64_g32e        1       1      0        0          0
 :KVM_256M_guest_64_gPAE       1       1      0        0          0
 :KVM_SR_Continuity_64_g3      1       1      0        0          0
 :KVM_256M_guest_64_g32e       1       1      0        0          0
 :KVM_four_sguest_64_g32e      1       1      0        0          0
control_panel_ept              4       4      0        0          0
 :KVM_linux_win_64_g32e        1       1      0        0          0
 :KVM_1500M_guest_64_g32e      1       1      0        0          0
 :KVM_LM_SMP_64_g32e           1       1      0        0          0
 :KVM_1500M_guest_64_gPAE      1       1      0        0          0
control_panel_vpid             3       3      0        0          0
 :KVM_linux_win_64_g32e        1       1      0        0          0
 :KVM_1500M_guest_64_g32e      1       1      0        0          0
 :KVM_1500M_guest_64_gPAE      1       1      0        0          0
control_panel                  3       3      0        0          0
 :KVM_1500M_guest_64_g32e      1       1      0        0          0
 :KVM_LM_SMP_64_g32e           1       1      0        0          0
 :KVM_1500M_guest_64_gPAE      1       1      0        0          0
gtest_vpid                     1       1      0        0          0
 :boot_smp_win7_ent_64_g3      1       1      0        0          0
gtest_ept                      1       1      0        0          0
 :boot_smp_win7_ent_64_g3      1       1      0        0          0
gtest                          3       3      0        0          0
 :boot_smp_win2008_64_g32      1       1      0        0          0
 :boot_smp_win7_ent_64_gP      1       1      0        0          0
 :boot_smp_vista_64_g32e       1       1      0        0          0
vtd_ept_vpid                   3       2      1        0          0
 :one_pcie_smp_xp_64_g32e      1       0      1        0          0
 :one_pcie_smp_64_g32e         1       1      0        0          0
 :two_dev_smp_64_g32e          1       1      0        0          0
gtest_ept_vpid                12      12      0        0          0
 :boot_up_acpi_64_g32e         1       1      0        0          0
 :boot_base_kernel_64_g32      1       1      0        0          0
 :boot_up_win2008_64_g32e      1       1      0        0          0
 :kb_nightly_64_g32e           1       1      0        0          0
 :boot_up_acpi_win2k3_64_      1       1      0        0          0
 :boot_up_vista_64_g32e        1       1      0        0          0
 :ltp_nig

[PATCH kvm-unit-tests] emulator: fix bsf/bsr tests

2011-11-28 Thread Avi Kivity
The inline assembly constraints did not specify ax as an output,
confusing the compiler.  Rewrite the test to properly specify all
outputs.

Signed-off-by: Avi Kivity 
---
 x86/emulator.c |   89 +--
 1 files changed, 34 insertions(+), 55 deletions(-)

diff --git a/x86/emulator.c b/x86/emulator.c
index 73079f8..b584122 100644
--- a/x86/emulator.c
+++ b/x86/emulator.c
@@ -475,65 +475,44 @@ void test_btc(void *mem)
 
 void test_bsfbsr(void *mem)
 {
-   unsigned long *memq = mem, rax;
-
-   asm volatile("movw $0xC000, (%[memq])\n\t"
-"bsfw (%[memq]), %%ax\n\t"
-::[memq]"r"(memq));
-   asm ("mov %%rax, %[rax]": [rax]"=m"(rax));
-   report("bsfw r/m, reg", rax == 14);
-
-   asm volatile("movl $0xC000, (%[memq])\n\t"
-"bsfl (%[memq]), %%eax\n\t"
-::[memq]"r"(memq));
-   asm ("mov %%rax, %[rax]": [rax]"=m"(rax));
-   report("bsfl r/m, reg", rax == 30);
-
-   asm volatile("movq $0xC000, %%rax\n\t"
-"movq %%rax, (%[memq])\n\t"
-"bsfq (%[memq]), %%rax\n\t"
-::[memq]"r"(memq));
-   asm ("mov %%rax, %[rax]": [rax]"=m"(rax));
+   unsigned long rax, *memq = mem;
+   unsigned eax, *meml = mem;
+   unsigned short ax, *memw = mem;
+   unsigned char z;
+
+   *memw = 0xc000;
+   asm("bsfw %[mem], %[a]" : [a]"=a"(ax) : [mem]"m"(*memw));
+   report("bsfw r/m, reg", ax == 14);
+
+   *meml = 0xc0000000;
+   asm("bsfl %[mem], %[a]" : [a]"=a"(eax) : [mem]"m"(*meml));
+   report("bsfl r/m, reg", eax == 30);
+
+   *memq = 0xc00000000000;
+   asm("bsfq %[mem], %[a]" : [a]"=a"(rax) : [mem]"m"(*memq));
report("bsfq r/m, reg", rax == 46);
 
-   asm volatile("movq $0, %%rax\n\t"
-"movq %%rax, (%[memq])\n\t"
-"bsfq (%[memq]), %%rax\n\t"
-"jnz 1f\n\t"
-"movl $1, %[rax]\n\t"
-"1:\n\t"
-:[rax]"=m"(rax)
-:[memq]"r"(memq));
-   report("bsfq r/m, reg", rax == 1);
-
-   asm volatile("movw $0xC000, (%[memq])\n\t"
-"bsrw (%[memq]), %%ax\n\t"
-::[memq]"r"(memq));
-   asm ("mov %%rax, %[rax]": [rax]"=m"(rax));
-   report("bsrw r/m, reg", rax == 15);
-
-   asm volatile("movl $0xC000, (%[memq])\n\t"
-"bsrl (%[memq]), %%eax\n\t"
-::[memq]"r"(memq));
-   asm ("mov %%rax, %[rax]": [rax]"=m"(rax));
-   report("bsrl r/m, reg", rax == 31);
-
-   asm volatile("movq $0xC000, %%rax\n\t"
-"movq %%rax, (%[memq])\n\t"
-"bsrq (%[memq]), %%rax\n\t"
-::[memq]"r"(memq));
-   asm ("mov %%rax, %[rax]": [rax]"=m"(rax));
+   *memq = 0;
+   asm("bsfq %[mem], %[a]; setz %[z]"
+   : [a]"=a"(rax), [z]"=rm"(z) : [mem]"m"(*memq));
+   report("bsfq r/m, reg", z == 1);
+
+   *memw = 0xc000;
+   asm("bsrw %[mem], %[a]" : [a]"=a"(ax) : [mem]"m"(*memw));
+   report("bsrw r/m, reg", ax == 15);
+
+   *meml = 0xc0000000;
+   asm("bsrl %[mem], %[a]" : [a]"=a"(eax) : [mem]"m"(*meml));
+   report("bsrl r/m, reg", eax == 31);
+
+   *memq = 0xc00000000000;
+   asm("bsrq %[mem], %[a]" : [a]"=a"(rax) : [mem]"m"(*memq));
report("bsrq r/m, reg", rax == 47);
 
-   asm volatile("movq $0, %%rax\n\t"
-"movq %%rax, (%[memq])\n\t"
-"bsrq (%[memq]), %%rax\n\t"
-"jnz 1f\n\t"
-"movl $1, %[rax]\n\t"
-"1:\n\t"
-:[rax]"=m"(rax)
-:[memq]"r"(memq));
-   report("bsrq r/m, reg", rax == 1);
+   *memq = 0;
+   asm("bsrq %[mem], %[a]; setz %[z]"
+   : [a]"=a"(rax), [z]"=rm"(z) : [mem]"m"(*memq));
+   report("bsrq r/m, reg", z == 1);
 }
 
 static void test_imul(ulong *mem)
-- 
1.7.7.1
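
The class of bug being fixed here reduces to a standalone illustration
(this snippet is not from the test itself):

#include <stdio.h>

int main(void)
{
	unsigned short out, val = 0xc000;

	/* Buggy pattern (the old test): %ax is written behind the
	 * compiler's back -- no output or clobber is declared, so any
	 * later C-level read of the result is unreliable. */
	asm volatile("bsfw %0, %%ax" : : "m"(val));

	/* Fixed pattern (the new test): the result is a declared output,
	 * so the compiler allocates and tracks the register itself. */
	asm("bsfw %1, %0" : "=r"(out) : "m"(val));
	printf("lowest set bit: %u\n", out);	/* 0xc000 -> 14 */

	return 0;
}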



Re: [PATCH 3/5] KVM: x86: remove the dead code of KVM_EXIT_HYPERCALL

2011-11-28 Thread Gleb Natapov
On Mon, Nov 28, 2011 at 08:41:38PM +0800, Xiao Guangrong wrote:
> KVM_EXIT_HYPERCALL is not used anymore, so remove the code
> 
Why not remove the define as well?

> Signed-off-by: Xiao Guangrong 
> ---
>  arch/x86/kvm/x86.c |4 
>  1 files changed, 0 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index d54746c..7b6fd72 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5386,10 +5386,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
>   if (r <= 0)
>   goto out;
> 
> - if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
> - kvm_register_write(vcpu, VCPU_REGS_RAX,
> -  kvm_run->hypercall.ret);
> -
>   r = __vcpu_run(vcpu);
> 
>  out:
> -- 
> 1.7.7.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

--
Gleb.


Re: [PATCH 08/62] x86: remove the second argument of k[un]map_atomic()

2011-11-28 Thread Herbert Xu
On Sun, Nov 27, 2011 at 01:26:48PM +0800, Cong Wang wrote:
> 
> Signed-off-by: Cong Wang 

Acked-by: Herbert Xu 

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH 5/5] KVM: MMU: remove oos_shadow parameter

2011-11-28 Thread Xiao Guangrong
The unsync code should be stable now; maybe it is time to remove this
parameter to clean up the code a little bit.

Signed-off-by: Xiao Guangrong 
---
 Documentation/kernel-parameters.txt |3 ---
 arch/x86/kvm/mmu.c  |5 -
 2 files changed, 0 insertions(+), 8 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a0c5c5f..7402a24 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1178,9 +1178,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
Default is 0 (don't ignore, but inject #GP)

-   kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging.
-   Default is 1 (enabled)
-
kvm.mmu_audit=  [KVM] This is a R/W parameter which allows audit
KVM MMU at runtime.
Default is 0 (off)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 262a3af..b1178d1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -93,9 +93,6 @@ static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif

-static int oos_shadow = 1;
-module_param(oos_shadow, bool, 0644);
-
 #ifndef MMU_DEBUG
 #define ASSERT(x) do { } while (0)
 #else
@@ -2196,8 +2193,6 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
return 1;

if (!need_unsync && !s->unsync) {
-   if (!oos_shadow)
-   return 1;
need_unsync = true;
}
}
-- 
1.7.7.3



[PATCH 4/5] KVM: MMU: move the relevant mmu code to mmu.c

2011-11-28 Thread Xiao Guangrong
Move the mmu code in kvm_arch_vcpu_init() to kvm_mmu_create()

Signed-off-by: Xiao Guangrong 
---
 arch/x86/include/asm/kvm_host.h |6 ++
 arch/x86/kvm/mmu.c  |6 +-
 arch/x86/kvm/x86.c  |   11 +--
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1769f3d..020413a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -752,6 +752,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
  struct x86_exception *exception);
 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
@@ -773,6 +774,11 @@ void kvm_disable_tdp(void);
 int complete_pio(struct kvm_vcpu *vcpu);
 bool kvm_check_iopl(struct kvm_vcpu *vcpu);

+static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+   return gpa;
+}
+
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 {
struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 62f69db..262a3af 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3839,7 +3839,11 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
ASSERT(vcpu);
-   ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+   vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+   vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+   vcpu->arch.mmu.translate_gpa = translate_gpa;
+   vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;

return alloc_mmu_pages(vcpu);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7b6fd72..45c8640 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3427,12 +3427,7 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
kvm_x86_ops->get_segment(vcpu, var, seg);
 }

-static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
-{
-   return gpa;
-}
-
-static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 {
gpa_t t_gpa;
struct x86_exception exception;
@@ -5912,10 +5907,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm = vcpu->kvm;

vcpu->arch.emulate_ctxt.ops = &emulate_ops;
-   vcpu->arch.walk_mmu = &vcpu->arch.mmu;
-   vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-   vcpu->arch.mmu.translate_gpa = translate_gpa;
-   vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
-- 
1.7.7.3



[PATCH 3/5] KVM: x86: remove the dead code of KVM_EXIT_HYPERCALL

2011-11-28 Thread Xiao Guangrong
KVM_EXIT_HYPERCALL is not used anymore, so remove the code

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/x86.c |4 
 1 files changed, 0 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d54746c..7b6fd72 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5386,10 +5386,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (r <= 0)
goto out;

-   if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
-   kvm_register_write(vcpu, VCPU_REGS_RAX,
-kvm_run->hypercall.ret);
-
r = __vcpu_run(vcpu);

 out:
-- 
1.7.7.3



[PATCH 2/5] KVM: MMU: audit: replace mmu audit tracepoint with jump-label

2011-11-28 Thread Xiao Guangrong
The tracepoint is only used to audit mmu code; it should not be exposed to
the user, so let us replace it with a jump label.

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |   16 +++-
 arch/x86/kvm/mmu_audit.c   |   28 +---
 arch/x86/kvm/mmutrace.h|   19 ---
 arch/x86/kvm/paging_tmpl.h |4 ++--
 4 files changed, 26 insertions(+), 41 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d737443..62f69db 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -68,6 +68,12 @@ char *audit_point_name[] = {
"post sync"
 };

+#ifdef CONFIG_KVM_MMU_AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
+#endif
+
 #undef MMU_DEBUG

 #ifdef MMU_DEBUG
@@ -2852,12 +2858,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
return;

vcpu_clear_mmio_info(vcpu, ~0ul);
-   trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+   kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
-   trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+   kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
return;
}
for (i = 0; i < 4; ++i) {
@@ -2869,7 +2875,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
mmu_sync_children(vcpu, sp);
}
}
-   trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+   kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 }

 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -3667,7 +3673,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,

spin_lock(&vcpu->kvm->mmu_lock);
++vcpu->kvm->stat.mmu_pte_write;
-   trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
+   kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);

mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
@@ -3700,7 +3706,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-   trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
+   kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
spin_unlock(&vcpu->kvm->mmu_lock);
 }

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 746ec25..5df6736 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -224,30 +224,29 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
mmu_spte_walk(vcpu, audit_spte);
 }

-static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
+static bool mmu_audit;
+static struct jump_label_key mmu_audit_key;
+
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
 {
static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);

-   if (!__ratelimit(&ratelimit_state))
-   return;
+   if (static_branch((&mmu_audit_key))) {
+   if (!__ratelimit(&ratelimit_state))
+   return;

-   vcpu->kvm->arch.audit_point = point;
-   audit_all_active_sps(vcpu->kvm);
-   audit_vcpu_spte(vcpu);
+   vcpu->kvm->arch.audit_point = point;
+   audit_all_active_sps(vcpu->kvm);
+   audit_vcpu_spte(vcpu);
+   }
 }

-static bool mmu_audit;
-
 static void mmu_audit_enable(void)
 {
-   int ret;
-
if (mmu_audit)
return;

-   ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-   WARN_ON(ret);
-
+   jump_label_inc(&mmu_audit_key);
mmu_audit = true;
 }

@@ -256,8 +255,7 @@ static void mmu_audit_disable(void)
if (!mmu_audit)
return;

-   unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-   tracepoint_synchronize_unregister();
+   jump_label_dec(&mmu_audit_key);
mmu_audit = false;
 }

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index eed67f3..89fb0e8 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -243,25 +243,6 @@ TRACE_EVENT(
TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
  __entry->access)
 );
-
-TRACE_EVENT(
-   kvm_mmu_audit,
-   TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
-   TP_ARGS(vcpu, audit_point),
-
-   TP_STRUCT__entry(
-   __field(struct kvm_vcpu *, vcpu)
-   __field(int, audit_point)
-   ),
-
-   TP_fast_assign(
-   __entry->vcpu = vcpu;
-   __entry->audit_point = audit_point;
-   ),
-
-   TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
- audit_point_name[__entry->audit_point])
-);
 #endif /* _TRACE_KVMMMU_H */

 #undef TRACE_INCLUDE_PATH
diff --git a/

[PATCH 1/5] jump-label: export jump_label_inc/jump_label_dec

2011-11-28 Thread Xiao Guangrong
Export these two symbols, they will be used by mmu audit

Signed-off-by: Xiao Guangrong 
---
 kernel/jump_label.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index bbdfe2a..7d3d452 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -70,6 +70,7 @@ void jump_label_inc(struct jump_label_key *key)
jump_label_update(key, JUMP_LABEL_ENABLE);
jump_label_unlock();
 }
+EXPORT_SYMBOL_GPL(jump_label_inc);

 void jump_label_dec(struct jump_label_key *key)
 {
@@ -79,6 +80,7 @@ void jump_label_dec(struct jump_label_key *key)
jump_label_update(key, JUMP_LABEL_DISABLE);
jump_label_unlock();
 }
+EXPORT_SYMBOL_GPL(jump_label_dec);

 static int addr_conflict(struct jump_entry *entry, void *start, void *end)
 {
-- 
1.7.7.3



Re: [PATCH 08/62] x86: remove the second argument of k[un]map_atomic()

2011-11-28 Thread Avi Kivity
On 11/27/2011 07:26 AM, Cong Wang wrote:
> Signed-off-by: Cong Wang 
> ---

>  arch/x86/kvm/lapic.c   |8 
>  arch/x86/kvm/paging_tmpl.h |4 ++--
>  arch/x86/kvm/x86.c |8 
>

The kvm parts are:

Acked-by: Avi Kivity 

-- 
error compiling committee.c: too many arguments to function



Re: [PATCH v2] KVM: Refactor and simplify kvm_dev_ioctl_get_supported_cpuid

2011-11-28 Thread Avi Kivity
On 11/28/2011 11:52 AM, Avi Kivity wrote:
> On 11/28/2011 11:20 AM, Sasha Levin wrote:
> > This patch cleans and simplifies kvm_dev_ioctl_get_supported_cpuid by
> > using a table instead of duplicating code, as Avi suggested.
> >
> > This patch also fixes a bug where kvm_dev_ioctl_get_supported_cpuid
> > would return -E2BIG when the number of entries passed was exactly right.
> >
> > do_cpuid_1_ent(entry, function, index);
> > ++*nent;
> >  
> > @@ -275,7 +282,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
> >  
> > entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
> > entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
> > -   for (t = 1; t < times && *nent < maxnent; ++t) {
> > +   for (t = 1; t < times; ++t) {
> > +   if (*nent >= maxnent)
> > +   goto out;
> > +
> > do_cpuid_1_ent(&entry[t], function, 0);
> > entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
> > ++*nent;
>
> Please move the check into do_cpuid_1_ent(); it's more consistent.
>
>

Given that do_cpuid_1_ent() doesn't receive nent/maxnent, I applied this;
thanks.

-- 
error compiling committee.c: too many arguments to function



Re: [PATCH v2] KVM: Refactor and simplify kvm_dev_ioctl_get_supported_cpuid

2011-11-28 Thread Avi Kivity
On 11/28/2011 11:20 AM, Sasha Levin wrote:
> This patch cleans and simplifies kvm_dev_ioctl_get_supported_cpuid by
> using a table instead of duplicating code, as Avi suggested.
>
> This patch also fixes a bug where kvm_dev_ioctl_get_supported_cpuid
> would return -E2BIG when the number of entries passed was exactly right.
>
>   do_cpuid_1_ent(entry, function, index);
>   ++*nent;
>  
> @@ -275,7 +282,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>  
>   entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
>   entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
> - for (t = 1; t < times && *nent < maxnent; ++t) {
> + for (t = 1; t < times; ++t) {
> + if (*nent >= maxnent)
> + goto out;
> +
>   do_cpuid_1_ent(&entry[t], function, 0);
>   entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
>   ++*nent;

Please move the check into do_cpuid_1_ent(); it's more consistent.

> @@ -296,6 +309,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>   entry[i].flags |=
>  KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
>   ++*nent;
> + 
>   }
>   break;

Spurious?

>   }
> @@ -335,7 +352,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>   int idx, i;
>  
>   entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
> - for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
> + for (idx = 1, i = 1; idx < 64; ++idx) {
> + if (*nent >= maxnent)
> + goto out;
> +
>   do_cpuid_1_ent(&entry[i], function, idx);
>   if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
>   continue;

This would go away too then.


-- 
error compiling committee.c: too many arguments to function



Re: [PATCH] X86: expose latest Intel cpu new features to guest

2011-11-28 Thread Avi Kivity
On 11/28/2011 07:10 AM, Liu, Jinsong wrote:
> From 8bb5d052825149c211afa92458912bc49a50ee2f Mon Sep 17 00:00:00 2001
> From: Liu, Jinsong 
> Date: Mon, 28 Nov 2011 03:55:19 -0800
> Subject: [PATCH] X86: expose latest Intel cpu new features to guest
>
> Intel's latest CPUs add 6 new features; refer to
> http://software.intel.com/file/36945
> The CPUID leaves for the new features are listed below:
> 1. FMA    CPUID.EAX=01H:ECX.FMA[bit 12]
> 2. MOVBE  CPUID.EAX=01H:ECX.MOVBE[bit 22]
> 3. BMI1   CPUID.EAX=07H,ECX=0H:EBX.BMI1[bit 3]
> 4. AVX2   CPUID.EAX=07H,ECX=0H:EBX.AVX2[bit 5]
> 5. BMI2   CPUID.EAX=07H,ECX=0H:EBX.BMI2[bit 8]
> 6. LZCNT  CPUID.EAX=80000001H:ECX.LZCNT[bit 5]
>
> This patch exposes these features to the guest.
> Among them, FMA/MOVBE/LZCNT have already been defined, and MOVBE/LZCNT
> have already been exposed.
> This patch defines BMI1/AVX2/BMI2, and exposes FMA/BMI1/AVX2/BMI2 to the
> guest.
>
>

Thanks, applied.
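
As a side note, a guest can double-check the newly exposed bits directly.
Here is a small user-space sketch using GCC's <cpuid.h>; the leaf and bit
positions are the ones from the commit message above, and for brevity it
assumes the CPU supports leaf 07H:

#include <stdio.h>
#include <cpuid.h>	/* GCC's __cpuid_count() wrapper */

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID.EAX=07H,ECX=0H:EBX carries BMI1/AVX2/BMI2 */
	__cpuid_count(7, 0, eax, ebx, ecx, edx);
	printf("BMI1:  %u\n", (ebx >> 3) & 1);
	printf("AVX2:  %u\n", (ebx >> 5) & 1);
	printf("BMI2:  %u\n", (ebx >> 8) & 1);

	/* CPUID.EAX=01H:ECX carries FMA and MOVBE */
	__cpuid_count(1, 0, eax, ebx, ecx, edx);
	printf("FMA:   %u\n", (ecx >> 12) & 1);
	printf("MOVBE: %u\n", (ecx >> 22) & 1);

	/* CPUID.EAX=80000001H:ECX carries LZCNT */
	__cpuid_count(0x80000001, 0, eax, ebx, ecx, edx);
	printf("LZCNT: %u\n", (ecx >> 5) & 1);
	return 0;
}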

-- 
error compiling committee.c: too many arguments to function



[PATCH v2] KVM: Refactor and simplify kvm_dev_ioctl_get_supported_cpuid

2011-11-28 Thread Sasha Levin
This patch cleans and simplifies kvm_dev_ioctl_get_supported_cpuid by using
a table instead of duplicating code, as Avi suggested.

This patch also fixes a bug where kvm_dev_ioctl_get_supported_cpuid would
return -E2BIG when the number of entries passed was exactly right.
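
To make the boundary bug concrete, here is a stand-alone sketch
(illustrative, not the kernel code) contrasting the old post-loop check,
which misreports an exact fit, with the check-before-write style this
patch adopts:

#include <errno.h>
#include <stdio.h>

/* Old shape: the loop guard doubles as the "ran out of room" signal,
 * so filling exactly max entries is indistinguishable from overflow. */
static int fill_old(int needed, int max)
{
	int nent = 0;

	while (nent < max && nent < needed)
		++nent;			/* emit one entry */
	if (nent >= max)		/* fires even when needed == max */
		return -E2BIG;
	return nent;
}

/* New shape: bounds-check before each write, as the patch does with
 * "if (*nent >= maxnent) goto out;", so an exact fit succeeds. */
static int fill_new(int needed, int max)
{
	int i, nent = 0;

	for (i = 0; i < needed; ++i) {
		if (nent >= max)
			return -E2BIG;
		++nent;			/* emit one entry */
	}
	return nent;
}

int main(void)
{
	printf("old, exact fit: %d\n", fill_old(4, 4));	/* -E2BIG, spurious */
	printf("new, exact fit: %d\n", fill_new(4, 4));	/* 4, succeeds */
	return 0;
}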

Cc: Avi Kivity 
Cc: Marcelo Tosatti 
Signed-off-by: Sasha Levin 
---
 arch/x86/kvm/cpuid.c |  114 --
 1 files changed, 64 insertions(+), 50 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index bbaa6d8..892f52d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -187,9 +187,10 @@ static bool supported_xcr0_bit(unsigned bit)
 
 #define F(x) bit(X86_FEATURE_##x)
 
-static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 u32 index, int *nent, int maxnent)
 {
+   int r;
unsigned f_nx = is_efer_nx() ? F(NX) : 0;
 #ifdef CONFIG_X86_64
unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
@@ -250,6 +251,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
+
+   r = -E2BIG;
+
+   if (*nent >= maxnent)
+   goto out;
+
do_cpuid_1_ent(entry, function, index);
++*nent;
 
@@ -275,7 +282,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
-   for (t = 1; t < times && *nent < maxnent; ++t) {
+   for (t = 1; t < times; ++t) {
+   if (*nent >= maxnent)
+   goto out;
+
do_cpuid_1_ent(&entry[t], function, 0);
entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
++*nent;
@@ -288,7 +298,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* read more entries until cache_type is zero */
-   for (i = 1; *nent < maxnent; ++i) {
+   for (i = 1; ; ++i) {
+   if (*nent >= maxnent)
+   goto out;
+
cache_type = entry[i - 1].eax & 0x1f;
if (!cache_type)
break;
@@ -296,6 +309,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry[i].flags |=
   KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
++*nent;
+   
}
break;
}
@@ -320,7 +334,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* read more entries until level_type is zero */
-   for (i = 1; *nent < maxnent; ++i) {
+   for (i = 1; ; ++i) {
+   if (*nent >= maxnent)
+   goto out;
+
level_type = entry[i - 1].ecx & 0xff00;
if (!level_type)
break;
@@ -335,7 +352,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
int idx, i;
 
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-   for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
+   for (idx = 1, i = 1; idx < 64; ++idx) {
+   if (*nent >= maxnent)
+   goto out;
+
do_cpuid_1_ent(&entry[i], function, idx);
if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
continue;
@@ -420,17 +440,41 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
kvm_x86_ops->set_supported_cpuid(function, entry);
 
+   r = 0;
+
+out:
put_cpu();
+
+   return r;
 }
 
 #undef F
 
+struct kvm_cpuid_param {
+   u32 func;
+   u32 idx;
+   bool has_leaf_count;
+   bool (*qualifier)(struct kvm_cpuid_param *param);
+};
+
+static bool is_centaur_cpu(struct kvm_cpuid_param *param)
+{
+   return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
+}
+
 int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
  struct kvm_cpuid_entry2 __user *entries)
 {
struct kvm_cpuid_entry2 *cpuid_entries;
-   int limit, nent = 0, r = -E2BIG;
+   int limit, nent = 0, r = -E2BIG, i;
u32 func;
+   static struct kvm_cpuid_param param[] = {
+   { .func = 0, .has_leaf_count = true },
+   { .func = 0x80000000, .has_leaf_count = true },
+ 

Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout

2011-11-28 Thread Sasha Levin
On Mon, 2011-11-28 at 11:25 +1030, Rusty Russell wrote:
> I'd like to see kvmtools remove support for legacy mode altogether,
> but they probably have existing users.

While we can't simply remove it right away, instead of mixing the
implementations of the legacy and new specs in the same code we can
split the virtio-pci implementation in two:

- virtio/virtio-pci-legacy.c
- virtio/virtio-pci.c

At that point we can #ifdef the entire virtio-pci-legacy.c for now and
remove it at the same time legacy virtio-pci is removed from the kernel.

I think this is something very similar to what you want done in the
kernel code, so an added plus is that the usermode code will be
mirroring the kernel code - which is something we try to have in the KVM
tool :)
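
Roughly the following shape, as a sketch (the file names are the ones
proposed above; the config macro and function names are made up for
illustration, not the actual kvm tool API):

struct kvm;
struct virtio_device;

/* virtio/virtio-pci-legacy.c, fenced off so it can be dropped together
 * with the kernel's legacy support */
#ifdef CONFIG_VIRTIO_PCI_LEGACY
int virtio_pci_legacy__init(struct kvm *kvm, struct virtio_device *vdev);
#endif
/* virtio/virtio-pci.c, the new-spec transport */
int virtio_pci__init(struct kvm *kvm, struct virtio_device *vdev);

static int virtio_pci_transport_init(struct kvm *kvm,
				     struct virtio_device *vdev, bool legacy)
{
#ifdef CONFIG_VIRTIO_PCI_LEGACY
	if (legacy)
		return virtio_pci_legacy__init(kvm, vdev);
#endif
	return virtio_pci__init(kvm, vdev);
}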

-- 

Sasha.



Re: [Qemu-devel] [PATCH 2/2] kvm: exit to userspace with reason KVM_EXIT_VCPU_DEAD

2011-11-28 Thread Gleb Natapov
On Mon, Nov 28, 2011 at 03:16:01PM +0800, Liu ping fan wrote:
> On Sun, Nov 27, 2011 at 6:50 PM, Gleb Natapov  wrote:
> > On Sun, Nov 27, 2011 at 12:36:55PM +0200, Avi Kivity wrote:
> >> On 11/27/2011 04:42 AM, Liu Ping Fan wrote:
> >> > From: Liu Ping Fan 
> >> >
> >> > The vcpu can be safely released when
> >> > --1.guest tells us that the vcpu is not needed any longer.
> >> > --2.vcpu hits the last instruction _halt_
> >> >
> >> > If both of the conditions are satisfied, kvm exits to userspace
> >> > with the reason vcpu dead. So the user thread can exit safely.
> >> >
> >> >
> >>
> >> Seems to be completely unnecessary.  If you want to exit from the vcpu
> >> thread, send it a signal.
> >>
> Hi Avi and Gleb,
> 
> First, I want to make sure my assumption is right, so I can grasp
> your meaning more clearly :-). Could you elaborate it for me? Thanks.
>
> I had thought that when a vcpu was being removed from the guest, kvm must
> satisfy the following conditions to safely remove the vcpu:
> --1. The tasks on the vcpu in the GUEST have already been migrated to
> other vcpus and ONLY the idle_task is left. CPU_DEAD is the checkpoint.
> --2. We must wait for the idle task to hit native_halt() in the GUEST; by
> that time, this vcpu is not needed even by the idle_task. In KVM, the vcpu
> thread will finally sit in "kvm_vcpu_block(vcpu);".
> We CANNOT assume an ordering of the two conditions, because they come
> from different threads. Am I right?
> 
No, KVM can remove a vcpu whenever it is told to do so (though maybe not in
the middle of emulated io). It is the guest's responsibility to eject a cpu
only when it is safe to do so from the guest's point of view.

> And here come my questions:
> --1. I think the signal will make vcpu_run exit to userspace, but is the
> vcpu thread then allowed to finally call "kernel/exit.c: void do_exit(long
> code)" in the current kvm or qemu code?
Yes. Why not?

> --2. If we get the CPU_DEAD event and then send a signal to the vcpu
> thread, can we ensure that it is already sitting in "kvm_vcpu_block(vcpu);"?
The CPU_DEAD event is internal to a guest (one of them). KVM does not care
about it, and to remove a vcpu the thread does not have to be sitting in
kvm_vcpu_block(). Actually, since a signal kicks the vcpu thread out of the
kernel into userspace, you can be sure it is not sitting in
kvm_vcpu_block().
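
In other words, the userspace side needs nothing more exotic than a signal
whose handler is installed without SA_RESTART, so that a blocked
ioctl(KVM_RUN) returns -EINTR. A rough sketch of the idea (the signal
choice and all names are illustrative, not qemu code):

#include <pthread.h>
#include <signal.h>
#include <stddef.h>

#define SIG_VCPU_KICK SIGUSR1

static void kick_handler(int sig)
{
	(void)sig;	/* empty on purpose: delivery alone makes a
			 * blocked ioctl(KVM_RUN) return -EINTR */
}

static void install_kick_handler(void)
{
	struct sigaction sa;

	sa.sa_handler = kick_handler;
	sa.sa_flags = 0;	/* no SA_RESTART: KVM_RUN must not be
				 * transparently restarted */
	sigemptyset(&sa.sa_mask);
	sigaction(SIG_VCPU_KICK, &sa, NULL);
}

/* called by whoever handles the vcpu-unplug event */
static void release_vcpu(pthread_t vcpu_thread)
{
	pthread_kill(vcpu_thread, SIG_VCPU_KICK); /* kick it out of KVM_RUN */
	pthread_join(vcpu_thread, NULL);	  /* thread then exits itself */
}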

> 
> Thanks and regards,
> ping fan
> 
> > Also if the guest "tells us that the vcpu is not needed any longer" (via
> > ACPI I presume) and the vcpu is actually doing something critical instead
> > of sitting in a 1: hlt; jmp 1b loop, then it is the guest's problem if it
> > stops working after vcpu destruction.
> >
> 
> 
> > --
> >                        Gleb.
> >

--
Gleb.


Re: [PATCHv3 RFC] virtio-pci: flexible configuration layout

2011-11-28 Thread Michael S. Tsirkin
On Mon, Nov 28, 2011 at 11:25:43AM +1030, Rusty Russell wrote:
> > > But I'm *terrified* of making the spec more complex;
> > 
> > All you do is move stuff around. Why do you think it simplifies the spec
> > so much?
> 
> No, but it reduces the yuk factor, which has been important to adoption.

Sorry if I'm dense. Could you please clarify: do you think we can live
with the slightly higher yuk factor assuming the spec moves the
legacy mode into an appendix, as you explain below, and the driver has a
single 'legacy' switch?

> And that's *not* all I do: reducing the number of options definitely
> simplifies the spec.  For example, the spec should currently say
> (looking at your implementation):
> 
>   Notifying the device
>   
>   If you find a valid VIRTIO_PCI_CAP_NOTIFY_CFG capability, and you can
>   map 2 bytes within it, those two bytes should be used to notify the
>   device of new descriptors in its virtqueues, by writing the index of the
>   virtqueue to that mapping.
> 
>   If the capability is missing or malformed or you cannot map it, the
>   virtqueue index should be written to the VIRTIO_PCI_QUEUE_NOTIFY offset
>   of the legacy bar.
> 
> Vs:
> 
>   Notifying the device
>   
>   The index of the virtqueue containing new descriptors should be written
>   to the location specified by the VIRTIO_PCI_CAP_NOTIFY_CFG capability.
>   (Unless the device is in legacy mode, see Appendix Y: Legacy Mode).

Yes, I agree, this is better.
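
For what it's worth, on the driver side the non-legacy path then boils
down to something like this (a sketch against the RFC's naming; the
capability walk and mapping during probe are elided):

#include <stdint.h>

/* assume the two bytes inside VIRTIO_PCI_CAP_NOTIFY_CFG were mapped
 * here during probe; illustrative, not actual driver code */
static volatile uint16_t *vp_notify_base;

static void vp_notify(uint16_t vq_index)
{
	/* writing the virtqueue index tells the device this queue
	 * has new descriptors */
	*vp_notify_base = vq_index;
}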

...

> Look, I try to be more inclusive and polite than Linus, but at some
> point more verbiage is wasted.
> We will have single Legacy Mode switch.

Sorry, I'm adding more verbiage :( 
When you say a single Legacy Mode switch, you mean that the driver will
assume either the legacy layout or the new one, correct?

> Accept it, or fork the standard.
>
> If you want to reuse the same structure, we're going to need to figure
> out how to specify the virtqueue address without a fixed alignment, and
> how to specify the alignment itself.

I think I see a way to do that in a relatively painless way.
Do you prefer seeing driver patches or spec? Or are you not interested
in reusing the same structure at all?

-- 
MST