date:20181122

Re: [Qemu-devel] [RFC v9 06/17] virtio-iommu: Endpoint and domains structs and helpers

2018-11-22 Thread Auger Eric

Hi Bharat,

On 11/23/18 7:38 AM, Bharat Bhushan wrote:
> Hi Eric,
> 
>> -Original Message-
>> From: Eric Auger 
>> Sent: Thursday, November 22, 2018 10:45 PM
>> To: eric.auger@gmail.com; eric.au...@redhat.com; qemu-
>> de...@nongnu.org; qemu-...@nongnu.org; peter.mayd...@linaro.org;
>> m...@redhat.com; jean-philippe.bruc...@arm.com
>> Cc: kevin.t...@intel.com; t...@semihalf.com; Bharat Bhushan
>> ; pet...@redhat.com
>> Subject: [RFC v9 06/17] virtio-iommu: Endpoint and domains structs and
>> helpers
>>
>> This patch introduce domain and endpoint internal datatypes. Both are
>> stored in RB trees. The domain owns a list of endpoints attached to it.
>>
>> Helpers to get/put end points and domains are introduced.
>> get() helpers will become static in subsequent patches.
>>
>> Signed-off-by: Eric Auger 
>>
>> ---
>>
>> v6 -> v7:
>> - on virtio_iommu_find_add_as the bus number computation may
>>   not be finalized yet so we cannot register the EPs at that time.
>>   Hence, let's remove the get_endpoint and also do not use the
>>   bus number for building the memory region name string (only
>>   used for debug though).
> 
> Endpoint registration from virtio_iommu_find_add_as to PROBE request.
> It is mentioned that " the bus number computation may not be finalized ". Can 
> you please give some more information.
> I am asking this because from vfio perspective translate/replay will be 
> called much before the PROBE request and endpoint needed to be registered by 
> that time.
When virtio_iommu_find_add() gets called, there are cases where the
BDF of the device is not yet computed, typically if the EP is plugged on
a secondary bus. That's why I postponed the registration. Do you have
an idea of when you would need the registration to happen?

Thanks

Eric
> 
> 
> Thanks
> -Bharat
> 
>>
>> v4 -> v5:
>> - initialize as->endpoint_list
>>
>> v3 -> v4:
>> - new separate patch
>> ---
>>  hw/virtio/trace-events   |   4 ++
>>  hw/virtio/virtio-iommu.c | 125
>> ++-
>>  2 files changed, 128 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index
>> 9270b0463e..4b15086872 100644
>> --- a/hw/virtio/trace-events
>> +++ b/hw/virtio/trace-events
>> @@ -61,3 +61,7 @@ virtio_iommu_map(uint32_t domain_id, uint64_t
>> virt_start, uint64_t virt_end, uin  virtio_iommu_unmap(uint32_t domain_id,
>> uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64"
>> virt_end=0x%"PRIx64  virtio_iommu_translate(const char *name, uint32_t
>> rid, uint64_t iova, int flag) "mr=%s rid=%d addr=0x%"PRIx64" flag=%d"
>>  virtio_iommu_init_iommu_mr(char *iommu_mr) "init %s"
>> +virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d"
>> +virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d"
>> +virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d"
>> +virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d"
>> diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index
>> dead062baf..1b9c3ba416 100644
>> --- a/hw/virtio/virtio-iommu.c
>> +++ b/hw/virtio/virtio-iommu.c
>> @@ -33,20 +33,124 @@
>>  #include "hw/virtio/virtio-bus.h"
>>  #include "hw/virtio/virtio-access.h"
>>  #include "hw/virtio/virtio-iommu.h"
>> +#include "hw/pci/pci_bus.h"
>> +#include "hw/pci/pci.h"
>>
>>  /* Max size */
>>  #define VIOMMU_DEFAULT_QUEUE_SIZE 256
>>
>> +typedef struct viommu_domain {
>> +uint32_t id;
>> +GTree *mappings;
>> +QLIST_HEAD(, viommu_endpoint) endpoint_list; } viommu_domain;
>> +
>> +typedef struct viommu_endpoint {
>> +uint32_t id;
>> +viommu_domain *domain;
>> +QLIST_ENTRY(viommu_endpoint) next;
>> +VirtIOIOMMU *viommu;
>> +} viommu_endpoint;
>> +
>> +typedef struct viommu_interval {
>> +uint64_t low;
>> +uint64_t high;
>> +} viommu_interval;
>> +
>>  static inline uint16_t virtio_iommu_get_sid(IOMMUDevice *dev)  {
>>  return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);  }
>>
>> +static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer
>> +user_data) {
>> +viommu_interval *inta = (viommu_interval *)a;
>> +viommu_interval *intb = (viommu_interval *)b;
>> +
>> +if (inta->high <= intb->low) {
>> +return -1;
>> +} else if (intb->high <= inta->low) {
>> +return 1;
>> +} else {
>> +return 0;
>> +}
>> +}
>> +
>> +static void
>> virtio_iommu_detach_endpoint_from_domain(viommu_endpoint
>> +*ep) {
>> +QLIST_REMOVE(ep, next);
>> +ep->domain = NULL;
>> +}
>> +
>> +viommu_endpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
>> uint32_t
>> +ep_id); viommu_endpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
>> +uint32_t ep_id) {
>> +viommu_endpoint *ep;
>> +
>> +ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id));
>> +if (ep) {
>> +return ep;
>> +}
>> +ep = g_malloc0(sizeof(*ep));
>> +ep->id = ep_id;
>> +ep->viommu = s;
>> +trace_virtio_iommu_get_endpoint(ep_id);
>> +

Re: [Qemu-devel] [Qemu-block] [PATCH 3/5] iotests: allow resume_drive by node name

2018-11-22 Thread Peter Krempa

On Thu, Nov 22, 2018 at 21:48:05 +0300, Andrey Shinkevich wrote:
> After node graph changes, we may not be able to resume_drive by device
> name (backing files are not recursively searched). So, lets allow to
> resume by node-name. Set constant name for breakpoints, to avoid
> introducing extra parameters.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---

This patch has a mismatch between the author name and the person signing
it off.


signature.asc
Description: PGP signature

Re: [Qemu-devel] [Qemu-block] [PATCH 0/5] Discrad blocks during block-stream operation

2018-11-22 Thread Peter Krempa

On Thu, Nov 22, 2018 at 21:48:02 +0300, Andrey Shinkevich wrote:
> Hello everyone!
> 
> The given feature discards blocks with copy-on-read operation while the
> streaming process runs. Adding the 'discard' argument to the QMP block-stream
> command allows dropping a block in the backing chain after it has been copied
> to the active layer. That will elude the block duplication in the intermediate
> backing file. It saves the disk space while external snapshots are being
> merged.

So you specifically want to merge the snapshot by pulling rather than
committing? Do you have any specific reasons for that? I'm curious
because I plan to finally finish external snapshots in libvirt.

Allowing to pull into intermediate layers will be (or is?) very welcome
by libvirt since I plan to do external snapshot deletion/merging and
that will be greatly simplified by pulling.

On the other hand libvirt will not be able to always use 'discard' as
libvirt's API allows creating alternate histories for a VM and in such
case when merging a snapshot at a branching point we'll need to pull it
into multiple images. The 'discard' optimization can then be used only
with the last branch.

Libvirt's reasons for using 'block-stream' are mostly as it corresponds
to the operations necessary for not messing up the relationship between
the snapshot and which files on disk belong to it.

signature.asc
Description: PGP signature

Re: [Qemu-devel] [PATCH v4] hw/arm: Add arm SBSA reference machine

2018-11-22 Thread Hongbo Zhang

On Fri, 23 Nov 2018 at 15:14, Hongbo Zhang  wrote:
>
> On Fri, 16 Nov 2018 at 00:05, Peter Maydell  wrote:
> >
> > On 19 October 2018 at 09:55, Hongbo Zhang  wrote:
> > > For the Aarch64, there is one machine 'virt', it is primarily meant to
> > > run on KVM and execute virtualization workloads, but we need an
> > > environment as faithful as possible to physical hardware, for supporting
> > > > firmware and OS development for physical Aarch64 machines.
> > >
> > > This patch introduces new machine type 'sbsa-ref' with main features:
> > >  - Based on 'virt' machine type.
> > >  - CPU type cortex-a57.
> > >  - EL2 and EL3 are enabled by default.
> > >  - GIC version 3 only.
> > >  - Re-designed memory map.
> > >  - AHCI controller attached to system bus.
> > >  - EHCI controller attached to system bus.
> > >  - CDROM and hard disc on AHCI bus.
> > >  - USB mouse and key board.
> > >  - E1000E ethernet card on PCIE bus.
> > >  - VGA display adaptor on PCIE bus.
> > > >  - No virtio devices.
> > >  - No paravirtualized fw_cfg device.
> > >  - No ACPI table supplied.
> > >  - Only minimal device tree nodes.
> > >
> > > Arm Trusted Firmware and UEFI porting to this are done accordingly.
> > >
> > > Signed-off-by: Hongbo Zhang 
> >
> > Hi; I've had a quick run through this patch. My comments
> > below are mostly about there still being a lot of code
> > here which has been copied from virt.c but which you do
> > not need.
> >
> > If after you've done that this patch is still more than
> > about 500 lines long, I would recommend that you split it
> > up into coherent pieces, to make it easier to review.
> >
> > > ---
> > >  hw/arm/Makefile.objs  |   2 +-
> > >  hw/arm/sbsa-ref.c | 937 
> > > ++
> > >  include/hw/arm/virt.h |   2 +
> > >  3 files changed, 940 insertions(+), 1 deletion(-)
> > >  create mode 100644 hw/arm/sbsa-ref.c
> > >
> > > diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs
> > > index d51fcec..a8895eb 100644
> > > --- a/hw/arm/Makefile.objs
> > > +++ b/hw/arm/Makefile.objs
> > > @@ -1,4 +1,4 @@
> > > -obj-y += boot.o virt.o sysbus-fdt.o
> > > +obj-y += boot.o virt.o sbsa-ref.o sysbus-fdt.o
> > >  obj-$(CONFIG_ACPI) += virt-acpi-build.o
> > >  obj-$(CONFIG_DIGIC) += digic_boards.o
> > >  obj-$(CONFIG_EXYNOS4) += exynos4_boards.o
> > > diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c
> > > new file mode 100644
> > > index 000..28ebb3a
> > > --- /dev/null
> > > +++ b/hw/arm/sbsa-ref.c
> > > @@ -0,0 +1,937 @@
> > > +/*
> > > + * ARM SBSA Reference Platform emulation
> > > + *
> > > + * Copyright (c) 2018 Linaro Limited
> > > + * Written by Hongbo Zhang 
> > > + *
> > > + * Based on hw/arm/virt.c
> > > + *
> > > + * This program is free software; you can redistribute it and/or modify 
> > > it
> > > + * under the terms and conditions of the GNU General Public License,
> > > + * version 2 or later, as published by the Free Software Foundation.
> > > + *
> > > + * This program is distributed in the hope it will be useful, but WITHOUT
> > > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> > > + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
> > > for
> > > + * more details.
> > > + *
> > > + * You should have received a copy of the GNU General Public License 
> > > along with
> > > > + * this program.  If not, see <http://www.gnu.org/licenses/>.
> > > + */
> > > +
> > > +#include "qemu/osdep.h"
> > > +#include "qapi/error.h"
> > > +#include "hw/arm/virt.h"
> > > +#include "hw/devices.h"
> > > +#include "net/net.h"
> > > +#include "sysemu/device_tree.h"
> > > +#include "sysemu/numa.h"
> > > +#include "hw/loader.h"
> > > +#include "qemu/error-report.h"
> > > +#include "hw/pci-host/gpex.h"
> > > +#include "hw/arm/sysbus-fdt.h"
> > > +#include "hw/arm/fdt.h"
> > > +#include "hw/intc/arm_gic.h"
> > > +#include "hw/intc/arm_gicv3_common.h"
> > > +#include "kvm_arm.h"
> > > +#include "hw/ide/internal.h"
> > > +#include "hw/ide/ahci_internal.h"
> > > +#include "hw/usb.h"
> > > +#include "qemu/units.h"
> > > +
> > > +#define NUM_IRQS 256
> > > +
> > > +#define SATA_NUM_PORTS 6
> > > +
> > > +#define RAMLIMIT_GB 255
> > > +#define RAMLIMIT_BYTES (RAMLIMIT_GB * GiB)
> >
> > You probably don't want to stick yourself with the same
> > ram limits that the virt board has, especially since you
> > don't need to care about AArch32. Strongly consider
> > > putting the RAM somewhere that lets you get up to a
> > maximum value that matches what we might expect in
> > server-class hardware.
> >
> > > +
> > > +static const MemMapEntry sbsa_ref_memmap[] = {
> > > +/* Space up to 0x800 is reserved for a boot ROM */
> > > +[VIRT_FLASH] =  {  0, 0x0800 },
> > > +[VIRT_CPUPERIPHS] = { 0x0800, 0x0002 },
> > > +/* GIC distributor and CPU interfaces sit inside the CPU peripheral 
> > > space */
> > > +[VIRT_GIC_DIST] =   { 0x0800, 0x0001 },
> > >

Re: [Qemu-devel] [PATCH v5 06/36] ppc/xive: add support for the END Event State buffers

2018-11-22 Thread Cédric Le Goater

On 11/23/18 5:36 AM, David Gibson wrote:
> On Thu, Nov 22, 2018 at 10:58:56PM +0100, Cédric Le Goater wrote:
>> On 11/22/18 6:13 AM, David Gibson wrote:
>>> On Fri, Nov 16, 2018 at 11:56:59AM +0100, Cédric Le Goater wrote:
 The Event Notification Descriptor also contains two Event State
 Buffers providing further coalescing of interrupts, one for the
 notification event (ESn) and one for the escalation events (ESe). A
 MMIO page is assigned for each to control the EOI through loads
 only. Stores are not allowed.

 The END ESBs are modeled through an object resembling the 'XiveSource'
 It is stateless as the END state bits are backed into the XiveEND
 structure under the XiveRouter and the MMIO accesses follow the same
 rules as for the standard source ESBs.

 END ESBs are not supported by the Linux drivers, neither on OPAL nor on
 sPAPR. Nevertheless, it provides a means to study the question in the
 future and validates a bit more the XIVE model.

 Signed-off-by: Cédric Le Goater 
 ---
  include/hw/ppc/xive.h |  20 ++
  hw/intc/xive.c| 160 +-
  2 files changed, 178 insertions(+), 2 deletions(-)

 diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
 index ce62aaf28343..24301bf2076d 100644
 --- a/include/hw/ppc/xive.h
 +++ b/include/hw/ppc/xive.h
 @@ -208,6 +208,26 @@ int xive_router_get_end(XiveRouter *xrtr, uint8_t 
 end_blk, uint32_t end_idx,
  int xive_router_set_end(XiveRouter *xrtr, uint8_t end_blk, uint32_t 
 end_idx,
  XiveEND *end);
  
 +/*
 + * XIVE END ESBs
 + */
 +
 +#define TYPE_XIVE_END_SOURCE "xive-end-source"
 +#define XIVE_END_SOURCE(obj) \
 +OBJECT_CHECK(XiveENDSource, (obj), TYPE_XIVE_END_SOURCE)
>>>
>>> Is there a particular reason to make this a full QOM object, rather
>>> than just embedding it in the XiveRouter?
>>
>> Yes, you are right, it should probably be under the XiveRouter, because
>> there is a direct link with the ENDT, which is in the XiveRouter.
>>
>> But if I remove the chip_id field from the XiveRouter, it becomes a QOM
>> interface. Something to ponder.
> 
> Huh?  I really don't understand what you're saying here.  What does
> chip_id have to do with anything?

I am quoting a comment of yours :

> +/*
> + * XIVE Router
> + */
> +
> +typedef struct XiveRouter {
> +SysBusDeviceparent;
> +
> +uint32_tchip_id;

I don't think this belongs in the base class.  The PowerNV specific
variants will need it, but it doesn't make sense for the PAPR version.


If we remove 'chip_id' from XiveRouter, it can become a QOM interface 
without state, like the XiveFabric is.

C.

Re: [Qemu-devel] [PATCH v4] hw/arm: Add arm SBSA reference machine

2018-11-22 Thread Hongbo Zhang

On Fri, 16 Nov 2018 at 00:05, Peter Maydell  wrote:
>
> On 19 October 2018 at 09:55, Hongbo Zhang  wrote:
> > For the Aarch64, there is one machine 'virt', it is primarily meant to
> > run on KVM and execute virtualization workloads, but we need an
> > environment as faithful as possible to physical hardware, for supporting
> > > firmware and OS development for physical Aarch64 machines.
> >
> > This patch introduces new machine type 'sbsa-ref' with main features:
> >  - Based on 'virt' machine type.
> >  - CPU type cortex-a57.
> >  - EL2 and EL3 are enabled by default.
> >  - GIC version 3 only.
> >  - Re-designed memory map.
> >  - AHCI controller attached to system bus.
> >  - EHCI controller attached to system bus.
> >  - CDROM and hard disc on AHCI bus.
> >  - USB mouse and key board.
> >  - E1000E ethernet card on PCIE bus.
> >  - VGA display adaptor on PCIE bus.
> > >  - No virtio devices.
> >  - No paravirtualized fw_cfg device.
> >  - No ACPI table supplied.
> >  - Only minimal device tree nodes.
> >
> > Arm Trusted Firmware and UEFI porting to this are done accordingly.
> >
> > Signed-off-by: Hongbo Zhang 
>
> Hi; I've had a quick run through this patch. My comments
> below are mostly about there still being a lot of code
> here which has been copied from virt.c but which you do
> not need.
>
> If after you've done that this patch is still more than
> about 500 lines long, I would recommend that you split it
> up into coherent pieces, to make it easier to review.
>
> > ---
> >  hw/arm/Makefile.objs  |   2 +-
> >  hw/arm/sbsa-ref.c | 937 
> > ++
> >  include/hw/arm/virt.h |   2 +
> >  3 files changed, 940 insertions(+), 1 deletion(-)
> >  create mode 100644 hw/arm/sbsa-ref.c
> >
> > diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs
> > index d51fcec..a8895eb 100644
> > --- a/hw/arm/Makefile.objs
> > +++ b/hw/arm/Makefile.objs
> > @@ -1,4 +1,4 @@
> > -obj-y += boot.o virt.o sysbus-fdt.o
> > +obj-y += boot.o virt.o sbsa-ref.o sysbus-fdt.o
> >  obj-$(CONFIG_ACPI) += virt-acpi-build.o
> >  obj-$(CONFIG_DIGIC) += digic_boards.o
> >  obj-$(CONFIG_EXYNOS4) += exynos4_boards.o
> > diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c
> > new file mode 100644
> > index 000..28ebb3a
> > --- /dev/null
> > +++ b/hw/arm/sbsa-ref.c
> > @@ -0,0 +1,937 @@
> > +/*
> > + * ARM SBSA Reference Platform emulation
> > + *
> > + * Copyright (c) 2018 Linaro Limited
> > + * Written by Hongbo Zhang 
> > + *
> > + * Based on hw/arm/virt.c
> > + *
> > + * This program is free software; you can redistribute it and/or modify it
> > + * under the terms and conditions of the GNU General Public License,
> > + * version 2 or later, as published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope it will be useful, but WITHOUT
> > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> > + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
> > for
> > + * more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along 
> > with
> > > + * this program.  If not, see <http://www.gnu.org/licenses/>.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "qapi/error.h"
> > +#include "hw/arm/virt.h"
> > +#include "hw/devices.h"
> > +#include "net/net.h"
> > +#include "sysemu/device_tree.h"
> > +#include "sysemu/numa.h"
> > +#include "hw/loader.h"
> > +#include "qemu/error-report.h"
> > +#include "hw/pci-host/gpex.h"
> > +#include "hw/arm/sysbus-fdt.h"
> > +#include "hw/arm/fdt.h"
> > +#include "hw/intc/arm_gic.h"
> > +#include "hw/intc/arm_gicv3_common.h"
> > +#include "kvm_arm.h"
> > +#include "hw/ide/internal.h"
> > +#include "hw/ide/ahci_internal.h"
> > +#include "hw/usb.h"
> > +#include "qemu/units.h"
> > +
> > +#define NUM_IRQS 256
> > +
> > +#define SATA_NUM_PORTS 6
> > +
> > +#define RAMLIMIT_GB 255
> > +#define RAMLIMIT_BYTES (RAMLIMIT_GB * GiB)
>
> You probably don't want to stick yourself with the same
> ram limits that the virt board has, especially since you
> don't need to care about AArch32. Strongly consider
> > putting the RAM somewhere that lets you get up to a
> maximum value that matches what we might expect in
> server-class hardware.
>
> > +
> > +static const MemMapEntry sbsa_ref_memmap[] = {
> > +/* Space up to 0x800 is reserved for a boot ROM */
> > +[VIRT_FLASH] =  {  0, 0x0800 },
> > +[VIRT_CPUPERIPHS] = { 0x0800, 0x0002 },
> > +/* GIC distributor and CPU interfaces sit inside the CPU peripheral 
> > space */
> > +[VIRT_GIC_DIST] =   { 0x0800, 0x0001 },
> > +[VIRT_GIC_CPU] ={ 0x0801, 0x0001 },
> > +/* The space in between here is reserved for GICv3 CPU/vCPU/HYP */
> > +/* This redistributor space allows up to 2*64kB*123 CPUs */
>
> You don't need to do the split-redistributor layout that
> "virt" does because you have no backwards

Re: [Qemu-devel] 答复: Can't see mouse cursor on VNC viewer

2018-11-22 Thread Heyi Guo


Thanks; I'll try it.

Heyi


On 2018/11/22 18:04, Gerd Hoffmann wrote:

On Thu, Nov 22, 2018 at 05:08:16PM +0800, Heyi Guo wrote:

Hi Gerd,

One more question: do you know any VNC client which supports rich
cursor extension? We'd like to try such client to see the real effect.

remote-viewer (comes with virt-viewer) supports it for sure.

cheers,
   Gerd

[Qemu-devel] [PATCH] cirrus_vga/migration: update the bank offset before use

2018-11-22 Thread linzhecheng

From: Wang Xin 

The cirrus bank0/1 offset should be updated before we update the vram's alias
offset.

Signed-off-by: Wang Xin 

diff --git a/hw/display/cirrus_vga.c b/hw/display/cirrus_vga.c
index d9b854d..a0e7146 100644
--- a/hw/display/cirrus_vga.c
+++ b/hw/display/cirrus_vga.c
@@ -2746,11 +2746,12 @@ static int cirrus_post_load(void *opaque, int 
version_id)
 s->vga.gr[0x00] = s->cirrus_shadow_gr0 & 0x0f;
 s->vga.gr[0x01] = s->cirrus_shadow_gr1 & 0x0f;
 
+cirrus_update_bank_ptr(s, 0);
+cirrus_update_bank_ptr(s, 1);
 cirrus_update_memory_access(s);
 /* force refresh */
 s->vga.graphic_mode = -1;
-cirrus_update_bank_ptr(s, 0);
-cirrus_update_bank_ptr(s, 1);
+
 return 0;
 }
 
-- 
2.8.1.windows.1

Re: [Qemu-devel] [PATCH v5 1/6] fsdev-throttle-qmp: factor out throttle code to reuse code

2018-11-22 Thread xiezhide



> -Original Message-
> From: Greg Kurz [mailto:gr...@kaod.org]
> Sent: Thursday, November 22, 2018 10:46 PM
> To: xiezhide 
> Cc: qemu-devel@nongnu.org; aneesh.ku...@linux.vnet.ibm.com;
> ebl...@redhat.com; arm...@redhat.com; be...@igalia.com; zengcanfu
> 00215970 ; Jinxuefeng ;
> Chenhui (Felix, Euler) 
> Subject: Re: [PATCH v5 1/6] fsdev-throttle-qmp: factor out throttle code to
> reuse code
> 
> On Fri, 16 Nov 2018 15:59:16 +0800
> xiezhide  wrote:
> 
> > Factor out throttle parameter parsing code to a new common function
> > which will be used by block and fsdev.
> > Rename function throttle_parse_options to throttle_parse_group to
> > resolve function name conflict
> >
> > Reviewed-by: Eric Blake 
> > Signed-off-by: xiezhide 
> > ---
> 
> Reviewed-by: Greg Kurz 
> 
> And, since I guess this will likely go through someone else's tree, for the 
> fsdev
> changes:

Yes, Pradeep Jagadeesh had done some work for
this, and I have taken it over now.

> 
> Acked-by: Greg Kurz 
> 
> >  block/throttle.c|  6 ++--
> >  blockdev.c  | 43 +-
> >  fsdev/qemu-fsdev-throttle.c | 44 ++
> >  include/qemu/throttle-options.h |  2 ++
> >  include/qemu/throttle.h |  4 +--
> >  include/qemu/typedefs.h |  1 +
> >  util/throttle.c | 68
> +
> >  7 files changed, 79 insertions(+), 89 deletions(-)
> >
> > diff --git a/block/throttle.c b/block/throttle.c index
> > 636c976..bd23c58 100644
> > --- a/block/throttle.c
> > +++ b/block/throttle.c
> > @@ -41,7 +41,7 @@ static QemuOptsList throttle_opts = {
> >   * @group and must be freed by the caller.
> >   * If there's an error then @group remains unmodified.
> >   */
> > -static int throttle_parse_options(QDict *options, char **group, Error
> > **errp)
> > +static int throttle_parse_group(QDict *options, char **group, Error
> > +**errp)
> >  {
> >  int ret;
> >  const char *group_name;
> > @@ -90,7 +90,7 @@ static int throttle_open(BlockDriverState *bs, QDict
> *options,
> >  bs->supported_zero_flags = bs->file->bs->supported_zero_flags |
> > BDRV_REQ_WRITE_UNCHANGED;
> >
> > > -ret = throttle_parse_options(options, &group, errp);
> > > +ret = throttle_parse_group(options, &group, errp);
> >  if (ret == 0) {
> >  /* Register membership to group with name group_name */
> >  throttle_group_register_tgm(tgm, group,
> > bdrv_get_aio_context(bs)); @@ -179,7 +179,7 @@ static int
> throttle_reopen_prepare(BDRVReopenState *reopen_state,
> >  assert(reopen_state != NULL);
> >  assert(reopen_state->bs != NULL);
> >
> > > -ret = throttle_parse_options(reopen_state->options, &group, errp);
> > > +ret = throttle_parse_group(reopen_state->options, &group, errp);
> >  reopen_state->opaque = group;
> >  return ret;
> >  }
> > diff --git a/blockdev.c b/blockdev.c
> > index 81f95d9..fce5d8f 100644
> > --- a/blockdev.c
> > +++ b/blockdev.c
> > @@ -400,48 +400,7 @@ static void
> extract_common_blockdev_options(QemuOpts *opts, int *bdrv_flags,
> >  }
> >
> >  if (throttle_cfg) {
> > -throttle_config_init(throttle_cfg);
> > -throttle_cfg->buckets[THROTTLE_BPS_TOTAL].avg =
> > -qemu_opt_get_number(opts, "throttling.bps-total", 0);
> > -throttle_cfg->buckets[THROTTLE_BPS_READ].avg  =
> > -qemu_opt_get_number(opts, "throttling.bps-read", 0);
> > -throttle_cfg->buckets[THROTTLE_BPS_WRITE].avg =
> > -qemu_opt_get_number(opts, "throttling.bps-write", 0);
> > -throttle_cfg->buckets[THROTTLE_OPS_TOTAL].avg =
> > -qemu_opt_get_number(opts, "throttling.iops-total", 0);
> > -throttle_cfg->buckets[THROTTLE_OPS_READ].avg =
> > -qemu_opt_get_number(opts, "throttling.iops-read", 0);
> > -throttle_cfg->buckets[THROTTLE_OPS_WRITE].avg =
> > -qemu_opt_get_number(opts, "throttling.iops-write", 0);
> > -
> > -throttle_cfg->buckets[THROTTLE_BPS_TOTAL].max =
> > -qemu_opt_get_number(opts, "throttling.bps-total-max", 0);
> > -throttle_cfg->buckets[THROTTLE_BPS_READ].max  =
> > -qemu_opt_get_number(opts, "throttling.bps-read-max", 0);
> > -throttle_cfg->buckets[THROTTLE_BPS_WRITE].max =
> > -qemu_opt_get_number(opts, "throttling.bps-write-max", 0);
> > -throttle_cfg->buckets[THROTTLE_OPS_TOTAL].max =
> > -qemu_opt_get_number(opts, "throttling.iops-total-max", 0);
> > -throttle_cfg->buckets[THROTTLE_OPS_READ].max =
> > -qemu_opt_get_number(opts, "throttling.iops-read-max", 0);
> > -throttle_cfg->buckets[THROTTLE_OPS_WRITE].max =
> > -qemu_opt_get_number(opts, "throttling.iops-write-max", 0);
> > -
> > -throttle_cfg->buckets[THROTTLE_BPS_TOTAL].burst_length =
> > -qemu_opt_get_number(opts,
> "throttling.bps-total-max-length", 1);
> > -

[Qemu-devel] [PATCH v2] audio/hda: fix guest triggerable assert

2018-11-22 Thread Gerd Hoffmann

Guest writes to a readonly register trigger the assert in
intel_hda_reg_write().  Add a check and just ignore them.

Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1628433
Signed-off-by: Gerd Hoffmann 
Reviewed-by: Dr. David Alan Gilbert 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/audio/intel-hda.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/hw/audio/intel-hda.c b/hw/audio/intel-hda.c
index 23a2cf6484..33e333cc26 100644
--- a/hw/audio/intel-hda.c
+++ b/hw/audio/intel-hda.c
@@ -23,6 +23,7 @@
 #include "hw/pci/msi.h"
 #include "qemu/timer.h"
 #include "qemu/bitops.h"
+#include "qemu/log.h"
 #include "hw/audio/soundhw.h"
 #include "intel-hda.h"
 #include "intel-hda-defs.h"
@@ -929,6 +930,11 @@ static void intel_hda_reg_write(IntelHDAState *d, const 
IntelHDAReg *reg, uint32
 if (!reg) {
 return;
 }
+if (!reg->wmask) {
+qemu_log_mask(LOG_GUEST_ERROR, "intel-hda: write to r/o reg %s\n",
+  reg->name);
+return;
+}
 
 if (d->debug) {
 time_t now = time(NULL);
-- 
2.9.3

Re: [Qemu-devel] [RFC v9 06/17] virtio-iommu: Endpoint and domains structs and helpers

2018-11-22 Thread Bharat Bhushan

Hi Eric,

> -Original Message-
> From: Eric Auger 
> Sent: Thursday, November 22, 2018 10:45 PM
> To: eric.auger@gmail.com; eric.au...@redhat.com; qemu-
> de...@nongnu.org; qemu-...@nongnu.org; peter.mayd...@linaro.org;
> m...@redhat.com; jean-philippe.bruc...@arm.com
> Cc: kevin.t...@intel.com; t...@semihalf.com; Bharat Bhushan
> ; pet...@redhat.com
> Subject: [RFC v9 06/17] virtio-iommu: Endpoint and domains structs and
> helpers
> 
> This patch introduce domain and endpoint internal datatypes. Both are
> stored in RB trees. The domain owns a list of endpoints attached to it.
> 
> Helpers to get/put end points and domains are introduced.
> get() helpers will become static in subsequent patches.
> 
> Signed-off-by: Eric Auger 
> 
> ---
> 
> v6 -> v7:
> - on virtio_iommu_find_add_as the bus number computation may
>   not be finalized yet so we cannot register the EPs at that time.
>   Hence, let's remove the get_endpoint and also do not use the
>   bus number for building the memory region name string (only
>   used for debug though).

Endpoint registration was moved from virtio_iommu_find_add_as to the PROBE request.
It is mentioned that "the bus number computation may not be finalized". Can 
you please give some more information?
I am asking this because, from the vfio perspective, translate/replay will be called 
much before the PROBE request, and the endpoint needs to be registered by that time.


Thanks
-Bharat

> 
> v4 -> v5:
> - initialize as->endpoint_list
> 
> v3 -> v4:
> - new separate patch
> ---
>  hw/virtio/trace-events   |   4 ++
>  hw/virtio/virtio-iommu.c | 125
> ++-
>  2 files changed, 128 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index
> 9270b0463e..4b15086872 100644
> --- a/hw/virtio/trace-events
> +++ b/hw/virtio/trace-events
> @@ -61,3 +61,7 @@ virtio_iommu_map(uint32_t domain_id, uint64_t
> virt_start, uint64_t virt_end, uin  virtio_iommu_unmap(uint32_t domain_id,
> uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64"
> virt_end=0x%"PRIx64  virtio_iommu_translate(const char *name, uint32_t
> rid, uint64_t iova, int flag) "mr=%s rid=%d addr=0x%"PRIx64" flag=%d"
>  virtio_iommu_init_iommu_mr(char *iommu_mr) "init %s"
> +virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d"
> +virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d"
> +virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d"
> +virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d"
> diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index
> dead062baf..1b9c3ba416 100644
> --- a/hw/virtio/virtio-iommu.c
> +++ b/hw/virtio/virtio-iommu.c
> @@ -33,20 +33,124 @@
>  #include "hw/virtio/virtio-bus.h"
>  #include "hw/virtio/virtio-access.h"
>  #include "hw/virtio/virtio-iommu.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci.h"
> 
>  /* Max size */
>  #define VIOMMU_DEFAULT_QUEUE_SIZE 256
> 
> +typedef struct viommu_domain {
> +uint32_t id;
> +GTree *mappings;
> +QLIST_HEAD(, viommu_endpoint) endpoint_list; } viommu_domain;
> +
> +typedef struct viommu_endpoint {
> +uint32_t id;
> +viommu_domain *domain;
> +QLIST_ENTRY(viommu_endpoint) next;
> +VirtIOIOMMU *viommu;
> +} viommu_endpoint;
> +
> +typedef struct viommu_interval {
> +uint64_t low;
> +uint64_t high;
> +} viommu_interval;
> +
>  static inline uint16_t virtio_iommu_get_sid(IOMMUDevice *dev)  {
>  return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);  }
> 
> +static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer
> +user_data) {
> +viommu_interval *inta = (viommu_interval *)a;
> +viommu_interval *intb = (viommu_interval *)b;
> +
> +if (inta->high <= intb->low) {
> +return -1;
> +} else if (intb->high <= inta->low) {
> +return 1;
> +} else {
> +return 0;
> +}
> +}
> +
> +static void
> virtio_iommu_detach_endpoint_from_domain(viommu_endpoint
> +*ep) {
> +QLIST_REMOVE(ep, next);
> +ep->domain = NULL;
> +}
> +
> +viommu_endpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
> uint32_t
> +ep_id); viommu_endpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
> +uint32_t ep_id) {
> +viommu_endpoint *ep;
> +
> +ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id));
> +if (ep) {
> +return ep;
> +}
> +ep = g_malloc0(sizeof(*ep));
> +ep->id = ep_id;
> +ep->viommu = s;
> +trace_virtio_iommu_get_endpoint(ep_id);
> +g_tree_insert(s->endpoints, GUINT_TO_POINTER(ep_id), ep);
> +return ep;
> +}
> +
> +static void virtio_iommu_put_endpoint(gpointer data) {
> +viommu_endpoint *ep = (viommu_endpoint *)data;
> +
> +if (ep->domain) {
> +virtio_iommu_detach_endpoint_from_domain(ep);
> +g_tree_unref(ep->domain->mappings);
> +}
> +
> +trace_virtio_iommu_put_endpoint(ep->id);
> +g_free(ep);
> +}
> +
> +viommu_domain *virtio_iommu_get_domain(VirtIOIOMMU *s,

Re: [Qemu-devel] [PATCH-for-3.1] [REGRESSION FIX] ps2kbd: default to scan enabled after reset

2018-11-22 Thread Gerd Hoffmann

On Thu, Nov 22, 2018 at 07:30:41PM +0100, Hervé Poussineau wrote:
> Ping again.

Queued up for 3.1

thanks,
  Gerd

Re: [Qemu-devel] [PATCH] audio/hda: fix guest triggerable assert

2018-11-22 Thread Gerd Hoffmann

On Thu, Nov 22, 2018 at 03:52:12PM +0000, Dr. David Alan Gilbert wrote:
> * Gerd Hoffmann (kra...@redhat.com) wrote:
> > Guest writes to a readonly register trigger the assert in
> > intel_hda_reg_write().  Add a check and just ignore them.
> > 
> > Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1628433
> > Signed-off-by: Gerd Hoffmann 
> 
> Does make you wonder:
>   a) Why the guest was writing to a read-only register
>   b) Wth it only did it in the weird combination of
>  devices tested.

Note that we also have pci hotplug involved.  Probably a bug in the
guest, maybe due to a register access landing at the wrong device while
reshuffling the bars or something like that.

cheers,
  Gerd

Re: [Qemu-devel] [PATCH v5 04/36] ppc/xive: introduce the XiveRouter model

2018-11-22 Thread David Gibson

On Thu, Nov 22, 2018 at 08:53:00AM +0100, Cédric Le Goater wrote:
> On 11/22/18 5:11 AM, David Gibson wrote:
> > On Fri, Nov 16, 2018 at 11:56:57AM +0100, Cédric Le Goater wrote:
> >> The XiveRouter models the second sub-engine of the overall XIVE
> >> architecture : the Interrupt Virtualization Routing Engine (IVRE).
> >>
> >> The IVRE handles event notifications of the IVSE through MMIO stores
> >> and performs the interrupt routing process. For this purpose, it uses
> >> a set of table stored in system memory, the first of which being the
> >> Event Assignment Structure (EAS) table.
> >>
> >> The EAT associates an interrupt source number with an Event Notification
> >> Descriptor (END) which will be used in a second phase of the routing
> >> process to identify a Notification Virtual Target.
> >>
> >> The XiveRouter is an abstract class which needs to be inherited from
> >> to define a storage for the EAT, and other upcoming tables. The
> >> 'chip-id' attribute is not strictly necessary for the sPAPR and
> >> PowerNV machines but it's a good way to test the routing algorithm.
> >> Without this attribute, the XiveRouter could be a simple QOM
> >> interface.
> >>
> >> Signed-off-by: Cédric Le Goater 
> >> ---
> >>  include/hw/ppc/xive.h  | 32 ++
> >>  include/hw/ppc/xive_regs.h | 31 ++
> >>  hw/intc/xive.c | 86 ++
> >>  3 files changed, 149 insertions(+)
> >>  create mode 100644 include/hw/ppc/xive_regs.h
> >>
> >> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> >> index be93fae6317b..5a0696366577 100644
> >> --- a/include/hw/ppc/xive.h
> >> +++ b/include/hw/ppc/xive.h
> >> @@ -11,6 +11,7 @@
> >>  #define PPC_XIVE_H
> >>  
> >>  #include "hw/sysbus.h"
> > 
> > Again, I don't think making this a SysBusDevice is quite right.
> > Even more so for the router than the source, because at least for PAPR
> > it might not have any MMIO presence at all.
> 
> The controller model inherits from the XiveRouter and manages the
> TIMA.

Um.. I'm not sure what you mean by the "controller model".  Surely the
presenter should own the TIMA, not the router?

> 
> >> +#include "hw/ppc/xive_regs.h"
> >>  
> >>  /*
> >>   * XIVE Fabric (Interface between Source and Router)
> >> @@ -168,4 +169,35 @@ static inline void xive_source_irq_set(XiveSource 
> >> *xsrc, uint32_t srcno,
> >>  }
> >>  }
> >>  
> >> +/*
> >> + * XIVE Router
> >> + */
> >> +
> >> +typedef struct XiveRouter {
> >> +SysBusDevice    parent;
> >> +
> >> +uint32_t        chip_id;
> > 
> > I don't think this belongs in the base class.  The PowerNV specific
> > variants will need it, but it doesn't make sense for the PAPR version.
> 
> yeah. I am using it as a END and NVT block identifier but it's not 
> required for sPAPR, it could just be zero. 
> 
> It was good to test the routing algo which should not assume that the 
> block id is zero. 
>  
> > 
> >> +} XiveRouter;
> >> +
> >> +#define TYPE_XIVE_ROUTER "xive-router"
> >> +#define XIVE_ROUTER(obj)\
> >> +OBJECT_CHECK(XiveRouter, (obj), TYPE_XIVE_ROUTER)
> >> +#define XIVE_ROUTER_CLASS(klass)\
> >> +OBJECT_CLASS_CHECK(XiveRouterClass, (klass), TYPE_XIVE_ROUTER)
> >> +#define XIVE_ROUTER_GET_CLASS(obj)  \
> >> +OBJECT_GET_CLASS(XiveRouterClass, (obj), TYPE_XIVE_ROUTER)
> >> +
> >> +typedef struct XiveRouterClass {
> >> +SysBusDeviceClass parent;
> >> +
> >> +/* XIVE table accessors */
> >> +int (*get_eas)(XiveRouter *xrtr, uint32_t lisn, XiveEAS *eas);
> >> +int (*set_eas)(XiveRouter *xrtr, uint32_t lisn, XiveEAS *eas);
> >> +} XiveRouterClass;
> >> +
> >> +void xive_eas_pic_print_info(XiveEAS *eas, uint32_t lisn, Monitor *mon);
> >> +
> >> +int xive_router_get_eas(XiveRouter *xrtr, uint32_t lisn, XiveEAS *eas);
> >> +int xive_router_set_eas(XiveRouter *xrtr, uint32_t lisn, XiveEAS *eas);
> >> +
> >>  #endif /* PPC_XIVE_H */
> >> diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h
> >> new file mode 100644
> >> index ..12499b33614c
> >> --- /dev/null
> >> +++ b/include/hw/ppc/xive_regs.h
> >> @@ -0,0 +1,31 @@
> >> +/*
> >> + * QEMU PowerPC XIVE interrupt controller model
> >> + *
> >> + * Copyright (c) 2016-2018, IBM Corporation.
> >> + *
> >> + * This code is licensed under the GPL version 2 or later. See the
> >> + * COPYING file in the top-level directory.
> >> + */
> >> +
> >> +#ifndef PPC_XIVE_REGS_H
> >> +#define PPC_XIVE_REGS_H
> >> +
> >> +/* EAS (Event Assignment Structure)
> >> + *
> >> + * One per interrupt source. Targets an interrupt to a given Event
> >> + * Notification Descriptor (END) and provides the corresponding
> >> + * logical interrupt number (END data)
> >> + */
> >> +typedef struct XiveEAS {
> >> +/* Use a single 64-bit definition to make it easier to
> >> + * perform atomic updates
> >> + */
> >> +

Re: [Qemu-devel] [PATCH v5 07/36] ppc/xive: introduce the XIVE interrupt thread context

2018-11-22 Thread David Gibson

On Fri, Nov 16, 2018 at 11:57:00AM +0100, Cédric Le Goater wrote:
> Each POWER9 processor chip has a XIVE presenter that can generate four
> different exceptions to its threads:
> 
>   - hypervisor exception,
>   - O/S exception
>   - Event-Based Branch (EBB)
>   - msgsnd (doorbell).
> 
> Each exception has a state independent from the others called a Thread
> Interrupt Management context. This context is a set of registers which
> lets the thread handle priority management and interrupt acknowledgment
> among other things. The most important ones being :
> 
>   - Interrupt Priority Register  (PIPR)
>   - Interrupt Pending Buffer (IPB)
>   - Current Processor Priority   (CPPR)
>   - Notification Source Register (NSR)
> 
> These registers are accessible through a specific MMIO region, called
> the Thread Interrupt Management Area (TIMA), four aligned pages, each
> exposing a different view of the registers. First page (page address
> ending in 0b00) gives access to the entire context and is reserved for
> the ring 0 security monitor. The second (page address ending in 0b01)
> is for the hypervisor, ring 1. The third (page address ending in 0b10)
> is for the operating system, ring 2. The fourth (page address ending
> in 0b11) is for user level, ring 3.
> 
> The thread interrupt context is modeled with a XiveTCTX object
> containing the values of the different exception registers. The TIMA
> region is mapped at the same address for each CPU.
> 
> Signed-off-by: Cédric Le Goater 
> ---
>  include/hw/ppc/xive.h  |  36 +++
>  include/hw/ppc/xive_regs.h |  82 +++
>  hw/intc/xive.c | 443 +
>  3 files changed, 561 insertions(+)
> 
> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> index 24301bf2076d..5987f26ddb98 100644
> --- a/include/hw/ppc/xive.h
> +++ b/include/hw/ppc/xive.h
> @@ -238,4 +238,40 @@ typedef struct XiveENDSource {
>  void xive_end_reset(XiveEND *end);
>  void xive_end_pic_print_info(XiveEND *end, uint32_t end_idx, Monitor *mon);
>  
> +/*
> + * XIVE Thread interrupt Management (TM) context
> + */
> +
> +#define TYPE_XIVE_TCTX "xive-tctx"
> +#define XIVE_TCTX(obj) OBJECT_CHECK(XiveTCTX, (obj), TYPE_XIVE_TCTX)
> +
> +/*
> + * XIVE Thread interrupt Management register rings :
> + *
> + *   QW-0  User   event-based exception state
> + *   QW-1  O/S    OS context for priority management, interrupt acks
> + *   QW-2  Pool   hypervisor context for virtual processor being 
> dispatched
> + *   QW-3  Physical   for the security monitor to manage the entire context

That last description is misleading, AIUI the hypervisor can and does
make use of the physical ring as well as the pool ring.

> + */
> +#define TM_RING_COUNT   4
> +#define TM_RING_SIZE    0x10
> +
> +typedef struct XiveTCTX {
> +DeviceState parent_obj;
> +
> +CPUState    *cs;
> +qemu_irq    output;
> +
> +uint8_t regs[TM_RING_COUNT * TM_RING_SIZE];

I'm a bit dubious about representing the state with a full buffer like
this.  Isn't a fair bit of this space reserved or derived values which
aren't backed by real state?

> +
> +XiveRouter  *xrtr;

What's this for?  AFAIK a TCTX isn't associated with a particular
routing unit.

> +} XiveTCTX;
> +
> +/*
> + * XIVE Thread Interrupt Management Aera (TIMA)

Typo s/Aera/Area/

> + */
> +extern const MemoryRegionOps xive_tm_ops;
> +
> +void xive_tctx_pic_print_info(XiveTCTX *tctx, Monitor *mon);
> +
>  #endif /* PPC_XIVE_H */
> diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h
> index f97fb2b90bee..2e3d6cb507da 100644
> --- a/include/hw/ppc/xive_regs.h
> +++ b/include/hw/ppc/xive_regs.h
> @@ -10,6 +10,88 @@
>  #ifndef PPC_XIVE_REGS_H
>  #define PPC_XIVE_REGS_H
>  
> +#define TM_SHIFT16
> +
> +/* TM register offsets */
> +#define TM_QW0_USER 0x000 /* All rings */
> +#define TM_QW1_OS   0x010 /* Ring 0..2 */
> +#define TM_QW2_HV_POOL  0x020 /* Ring 0..1 */
> +#define TM_QW3_HV_PHYS  0x030 /* Ring 0..1 */
> +
> +/* Byte offsets inside a QW QW0 QW1 QW2 QW3 */
> +#define TM_NSR  0x0  /*  +   +   -   +  */
> +#define TM_CPPR 0x1  /*  -   +   -   +  */
> +#define TM_IPB  0x2  /*  -   +   +   +  */
> +#define TM_LSMFB0x3  /*  -   +   +   +  */
> +#define TM_ACK_CNT  0x4  /*  -   +   -   -  */
> +#define TM_INC  0x5  /*  -   +   -   +  */
> +#define TM_AGE  0x6  /*  -   +   -   +  */
> +#define TM_PIPR 0x7  /*  -   +   -   +  */
> +
> +#define TM_WORD00x0
> +#define TM_WORD10x4
> +
> +/*
> + * QW word 2 contains the valid bit at the top and other fields
> + * depending on the QW.
> + */
> +#define TM_WORD20x8
> +#define   TM_QW0W2_VU   PPC_BIT32(0)
> +#define   TM_QW0W2_LOGIC_SERV   PPC_BITMASK32(1, 31) /* XX 2,31 ? */
>

Re: [Qemu-devel] [PATCH v5 04/36] ppc/xive: introduce the XiveRouter model

2018-11-22 Thread David Gibson

On Thu, Nov 22, 2018 at 05:50:07PM +1100, Benjamin Herrenschmidt wrote:
> On Thu, 2018-11-22 at 15:44 +1100, David Gibson wrote:
> > 
> > Sorry, didn't think of this in my first reply.
> > 
> > 1) Does the hardware ever actually write back to the EAS?  I know it
> > does for the END, but it's not clear why it would need to for the
> > EAS.  If not, we don't need the setter.
> 
> Nope, though the PAPR model will via hcalls

Right, but AIUI the set_eas hook is about abstracting PAPR vs bare
metal details.  Since the hcall knows it's PAPR it can just update the
backing information for the EAS directly, and no need for an
abstracted hook.

> 
> > 
> > 2) The signatures are a bit odd here.  For the setter, a value would
> > make more sense than a (XiveEAS *), since it's just a word.  For the getter
> > you could return the EAS value directly rather than using a pointer -
> > there's already a valid bit in the EAS so you can construct a value
> > with that cleared if the lisn is out of bounds.
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [Qemu-devel] [PATCH v5 05/36] ppc/xive: introduce the XIVE Event Notification Descriptors

2018-11-22 Thread David Gibson

On Thu, Nov 22, 2018 at 10:47:44PM +0100, Cédric Le Goater wrote:
> On 11/22/18 5:41 AM, David Gibson wrote:
> > On Fri, Nov 16, 2018 at 11:56:58AM +0100, Cédric Le Goater wrote:
> >> To complete the event routing, the IVRE sub-engine uses an internal
> >> table containing Event Notification Descriptor (END) structures.
> >>
> >> An END specifies on which Event Queue (EQ) the event notification
> >> data, defined in the associated EAS, should be posted when an
> >> exception occurs. It also defines which Notification Virtual Target
> >> (NVT) should be notified.
> >>
> >> The Event Queue is a memory page provided by the O/S defining a
> >> circular buffer, one per server and priority couple, containing Event
> >> Queue entries. These are 4 bytes long, the first bit being a
> >> 'generation' bit and the 31 following bits the END Data field. They
> >> are pulled by the O/S when the exception occurs.
> >>
> >> The END Data field is a way to set an invariant logical event source
> >> number for an IRQ. It is set with the H_INT_SET_SOURCE_CONFIG hcall
> >> when the EISN flag is used.
> >>
> >> Signed-off-by: Cédric Le Goater 
> >> ---
> >>  include/hw/ppc/xive.h  |  18 
> >>  include/hw/ppc/xive_regs.h |  48 ++
> >>  hw/intc/xive.c | 185 -
> >>  3 files changed, 248 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> >> index 5a0696366577..ce62aaf28343 100644
> >> --- a/include/hw/ppc/xive.h
> >> +++ b/include/hw/ppc/xive.h
> >> @@ -193,11 +193,29 @@ typedef struct XiveRouterClass {
> >>  /* XIVE table accessors */
> >>  int (*get_eas)(XiveRouter *xrtr, uint32_t lisn, XiveEAS *eas);
> >>  int (*set_eas)(XiveRouter *xrtr, uint32_t lisn, XiveEAS *eas);
> >> +int (*get_end)(XiveRouter *xrtr, uint8_t end_blk, uint32_t end_idx,
> >> +   XiveEND *end);
> >> +int (*set_end)(XiveRouter *xrtr, uint8_t end_blk, uint32_t end_idx,
> >> +   XiveEND *end);
> > 
> > Hrm.  So unlike the EAS, which is basically just a word, the END is a
> > pretty large structure.  
> 
> yes. and so will be the NVT.
> 
> > It's unclear here if get/set are expected to copy the whole thing out 
> > and in, 
> 
> That's the plan. 

Yeah, I don't think that's a good idea.  In some cases the updates are
on hot paths, so the extra copy isn't good, and more importantly it
makes it look like an atomic update, but it's not really.

Well... I guess it probably is because of the BQL, but I'd prefer not
to rely on that excessively.

> What I had in mind are memory accessors to the XIVE structures, which 
> are local to QEMU for sPAPR and in the guest RAM for PowerNV (Please
> take a look at the XIVE PowerNV model).
> 
> > or if get give you a pointer into a "live" structure 
> 
> no
> 
> > and set just does any necessary barriers after an update.
> that would be too complex for the PowerNV model I think. There is a cache
> in between the software running on the (QEMU) machine and the XIVE HW but
> it would be hard to handle. 
>  
> > Really, for a non-atomic value like this, I'm not sure get/set is the
> > right model.
> 
> ok. we need something to get them out and in.

I've thought about this a bit more.  What I think might work is
"end_read" and "end_write" callbacks, which take a word number in
addition to the parameters you have already

> > Also as I understand it nearly all the indices in XIVE are broken into
> > block/index.  Is there a reason those are folded together into lisn
> > for the EAS, but not for the END?
> 
> The indexing of the EAT is global to the system and the index defines
> which blk to use. The IRQ source numbers on the powerbus are architected 
> to be :
> 
> #define XIVE_SRCNO(blk, idx)  ((uint32_t)(blk) << 28 | (idx))
> 
> and XIVE can use different strategies to identify the XIVE IC in charge 
> of routing. It can be a one-to-one chip to block relation as skiboot does. 
> Using a block scope table is possible also. Our model only supports one 
> block per chip and some shortcuts are taken but not that much in fact.
>  
> Remote access to the XIVE structures of another chip are done through 
> MMIO (not modeled in PowerNV) and the blkid is used to partition the MMIO 
> regions. Being local is better for performance because the END and NVT 
> tables have a strong relation with the XIVE subengines using them 
> (VC and PC). 
> 
> Maybe Ben can clarify this if it is badly explained.

Right.. I think I understand what the blocks are all about.

But my question is, why encode the block and index together for the
EAS, but separately for the END?

> 
> >>  } XiveRouterClass;
> >>  
> >>  void xive_eas_pic_print_info(XiveEAS *eas, uint32_t lisn, Monitor *mon);
> >>  
> >>  int xive_router_get_eas(XiveRouter *xrtr, uint32_t lisn, XiveEAS *eas);
> >>  int xive_router_set_eas(XiveRouter *xrtr, uint32_t lisn, XiveEAS *eas);
> >> +int xive_router_get_end(XiveRouter

Re: [Qemu-devel] [PATCH v5 02/36] ppc/xive: add support for the LSI interrupt sources

2018-11-22 Thread David Gibson

On Thu, Nov 22, 2018 at 08:39:41AM +0100, Cédric Le Goater wrote:
> On 11/22/18 4:19 AM, David Gibson wrote:
> > On Fri, Nov 16, 2018 at 11:56:55AM +0100, Cédric Le Goater wrote:
> >> The 'sent' status of the LSI interrupt source is modeled with the 'P'
> >> bit of the ESB and the assertion status of the source is maintained in
> >> an array under the main sPAPRXive object. The type of the source is
> >> stored in the same array for practical reasons.
> >>
> >> Signed-off-by: Cédric Le Goater 
> > 
> > Looks good except for some minor details.
> > 
> >> ---
> >>  include/hw/ppc/xive.h | 20 -
> >>  hw/intc/xive.c| 68 +++
> >>  2 files changed, 81 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> >> index 5fec4b08705d..e118acd59f1e 100644
> >> --- a/include/hw/ppc/xive.h
> >> +++ b/include/hw/ppc/xive.h
> >> @@ -32,8 +32,10 @@ typedef struct XiveSource {
> >>  /* IRQs */
> >>  uint32_tnr_irqs;
> >>  qemu_irq*qirqs;
> >> +unsigned long   *lsi_map;
> >> +int32_t lsi_map_size; /* for VMSTATE_BITMAP */
> > 
> > At some point it's possible we'll want XiveSource subclasses that just
> > know which irqs are LSI and which aren't without an explicit map.  But
> > this detail isn't exposed in the migration stream or the user
> > > interface, so we can tweak it later as necessary.
> > 
> >> -/* PQ bits */
> >> +/* PQ bits and LSI assertion bit */
> >>  uint8_t *status;
> >>  
> >>  /* ESB memory region */
> >> @@ -89,6 +91,7 @@ static inline hwaddr xive_source_esb_mgmt(XiveSource 
> >> *xsrc, int srcno)
> >>   * When doing an EOI, the Q bit will indicate if the interrupt
> >>   * needs to be re-triggered.
> >>   */
> >> +#define XIVE_STATUS_ASSERTED  0x4  /* Extra bit for LSI */
> >>  #define XIVE_ESB_VAL_P0x2
> >>  #define XIVE_ESB_VAL_Q0x1
> >>  
> >> @@ -127,4 +130,19 @@ static inline qemu_irq xive_source_qirq(XiveSource 
> >> *xsrc, uint32_t srcno)
> >>  return xsrc->qirqs[srcno];
> >>  }
> >>  
> >> +static inline bool xive_source_irq_is_lsi(XiveSource *xsrc, uint32_t 
> >> srcno)
> >> +{
> >> +assert(srcno < xsrc->nr_irqs);
> >> +return test_bit(srcno, xsrc->lsi_map);
> >> +}
> >> +
> >> +static inline void xive_source_irq_set(XiveSource *xsrc, uint32_t srcno,
> >> +   bool lsi)
> > 
> > > The function name doesn't make it obvious that this controls LSI
> > configuration. '..._irq_set_lsi' maybe?
> 
> yes.
> 
> 
> >> +{
> >> +assert(srcno < xsrc->nr_irqs);
> >> +if (lsi) {
> >> +bitmap_set(xsrc->lsi_map, srcno, 1);
> >> +}
> >> +}
> >> +
> >>  #endif /* PPC_XIVE_H */
> >> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> >> index f7621f84828c..ac4605fee8b7 100644
> >> --- a/hw/intc/xive.c
> >> +++ b/hw/intc/xive.c
> >> @@ -88,14 +88,40 @@ uint8_t xive_source_esb_set(XiveSource *xsrc, uint32_t 
> >> srcno, uint8_t pq)
> >>  return xive_esb_set(&xsrc->status[srcno], pq);
> >>  }
> >>  
> >> +/*
> >> + * Returns whether the event notification should be forwarded.
> >> + */
> >> +static bool xive_source_lsi_trigger(XiveSource *xsrc, uint32_t
> >> srcno)
> > 
> > What exactly "trigger" means isn't entirely obvious for an LSI.  Might
> > be clearer to have "lsi_assert" and "lsi_deassert" helpers instead.
> 
> This is called only when the interrupt is asserted. So it is a 
> simplified LSI trigger depending only on the 'P' bit.

Yes, I see that.  But the result is that while the MSI logic is
encapsulated in the MSI trigger function, this leaves the LSI logic
split across the trigger function and set_irq() itself.  I think it
would be better to have assert and deassert helpers instead, which
handle both the trigger/notification and also the updating of the
ASSERTED bit.

> > 
> >> +{
> >> +uint8_t old_pq = xive_source_esb_get(xsrc, srcno);
> >> +
> >> +switch (old_pq) {
> >> +case XIVE_ESB_RESET:
> >> +xive_source_esb_set(xsrc, srcno, XIVE_ESB_PENDING);
> >> +return true;
> >> +default:
> >> +return false;
> >> +}
> >> +}
> >> +
> >>  /*
> >>   * Returns whether the event notification should be forwarded.
> >>   */
> >>  static bool xive_source_esb_trigger(XiveSource *xsrc, uint32_t srcno)
> >>  {
> >> +bool ret;
> >> +
> >>  assert(srcno < xsrc->nr_irqs);
> >>  
> >> -return xive_esb_trigger(&xsrc->status[srcno]);
> >> +ret = xive_esb_trigger(&xsrc->status[srcno]);
> >> +
> >> +if (xive_source_irq_is_lsi(xsrc, srcno) &&
> >> +xive_source_esb_get(xsrc, srcno) == XIVE_ESB_QUEUED) {
> >> +qemu_log_mask(LOG_GUEST_ERROR,
> >> +  "XIVE: queued an event on LSI IRQ %d\n", srcno);
> >> +}
> >> +
> >> +return ret;
> >>  }
> >>  
> >>  /*
> >> @@ -103,9 +129,22 @@ static bool xive_source_esb_trigger(XiveSource *xsrc, 
> >> uint32_t srcno)
> >>   */
> >>  static bool

Re: [Qemu-devel] [PATCH v3] qapi: add query-display-options command

2018-11-22 Thread Gerd Hoffmann

On Thu, Nov 22, 2018 at 03:58:02PM +0100, Erik Skultety wrote:
> On Thu, Nov 22, 2018 at 08:16:13AM +0100, Gerd Hoffmann wrote:
> > Add query-display-options command, which allows querying the qemu
> > display configuration, and -- as an intentional side effect -- makes
> > DisplayOptions discoverable via query-qmp-schema so libvirt can go
> > figure which display options are supported.
> >
> > Use case: commit d4dc4ab1 added rendernode parameter for egl-headless.
> >
> > Signed-off-by: Gerd Hoffmann 
> > Reviewed-by: Eric Blake 
> > Tested-by: Eric Blake 
> > Tested-by: Erik Skultety 
> > ---
> 
> FYI I have the first libvirt prototype patches [1] (need some polishing 
> though)
> ready and everything worked even with this v3 patch.
> 
> [1] https://github.com/eskultety/libvirt/commits/egl-headless

Good.  Queued up for 3.1

cheers,
  Gerd

Re: [Qemu-devel] [PATCH v5 06/36] ppc/xive: add support for the END Event State buffers

2018-11-22 Thread David Gibson

On Thu, Nov 22, 2018 at 10:58:56PM +0100, Cédric Le Goater wrote:
> On 11/22/18 6:13 AM, David Gibson wrote:
> > On Fri, Nov 16, 2018 at 11:56:59AM +0100, Cédric Le Goater wrote:
> >> The Event Notification Descriptor also contains two Event State
> >> Buffers providing further coalescing of interrupts, one for the
> >> notification event (ESn) and one for the escalation events (ESe). A
> >> MMIO page is assigned for each to control the EOI through loads
> >> only. Stores are not allowed.
> >>
> >> The END ESBs are modeled through an object resembling the 'XiveSource'
> >> It is stateless as the END state bits are backed into the XiveEND
> >> structure under the XiveRouter and the MMIO accesses follow the same
> >> rules as for the standard source ESBs.
> >>
> >> END ESBs are not supported by the Linux drivers neither on OPAL nor on
> >> sPAPR. Nevertheless, it provides a means to study the question in the
> >> future and validates a bit more the XIVE model.
> >>
> >> Signed-off-by: Cédric Le Goater 
> >> ---
> >>  include/hw/ppc/xive.h |  20 ++
> >>  hw/intc/xive.c| 160 +-
> >>  2 files changed, 178 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> >> index ce62aaf28343..24301bf2076d 100644
> >> --- a/include/hw/ppc/xive.h
> >> +++ b/include/hw/ppc/xive.h
> >> @@ -208,6 +208,26 @@ int xive_router_get_end(XiveRouter *xrtr, uint8_t 
> >> end_blk, uint32_t end_idx,
> >>  int xive_router_set_end(XiveRouter *xrtr, uint8_t end_blk, uint32_t 
> >> end_idx,
> >>  XiveEND *end);
> >>  
> >> +/*
> >> + * XIVE END ESBs
> >> + */
> >> +
> >> +#define TYPE_XIVE_END_SOURCE "xive-end-source"
> >> +#define XIVE_END_SOURCE(obj) \
> >> +OBJECT_CHECK(XiveENDSource, (obj), TYPE_XIVE_END_SOURCE)
> > 
> > Is there a particular reason to make this a full QOM object, rather
> > than just embedding it in the XiveRouter?
> 
> yes, it should probably be under the XiveRouter you are right because
> there is a direct link with the ENDT which is in the XiveRouter. 
> 
> But if I remove the chip_id field from the XiveRouter, it becomes a QOM
> interface. something to ponder.

Huh?  I really don't understand what you're saying here.  What does
chip_id have to do with anything?

>  
> >> +typedef struct XiveENDSource {
> >> +SysBusDevice parent;
> >> +
> >> +uint32_tnr_ends;
> >> +
> >> +/* ESB memory region */
> >> +uint32_tesb_shift;
> >> +MemoryRegionesb_mmio;
> >> +
> >> +XiveRouter  *xrtr;
> >> +} XiveENDSource;
> >> +
> >>  /*
> >>   * For legacy compatibility, the exceptions define up to 256 different
> >>   * priorities. P9 implements only 9 levels : 8 active levels [0 - 7]
> >> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
> >> index 9cb001e7b540..5a8882d47a98 100644
> >> --- a/hw/intc/xive.c
> >> +++ b/hw/intc/xive.c
> >> @@ -622,8 +622,18 @@ static void xive_router_end_notify(XiveRouter *xrtr, 
> >> uint8_t end_blk,
> >>   * even futher coalescing in the Router
> >>   */
> >>  if (!(end.w0 & END_W0_UCOND_NOTIFY)) {
> >> -qemu_log_mask(LOG_UNIMP, "XIVE: !UCOND_NOTIFY not implemented\n");
> >> -return;
> >> +uint8_t pq = GETFIELD(END_W1_ESn, end.w1);
> >> +bool notify = xive_esb_trigger(&pq);
> >> +
> >> +if (pq != GETFIELD(END_W1_ESn, end.w1)) {
> >> +end.w1 = SETFIELD(END_W1_ESn, end.w1, pq);
> >> +xive_router_set_end(xrtr, end_blk, end_idx, &end);
> >> +}
> >> +
> >> +/* ESn[Q]=1 : end of notification */
> >> +if (!notify) {
> >> +return;
> >> +}
> >>  }
> >>  
> >>  /*
> >> @@ -706,6 +716,151 @@ void xive_eas_pic_print_info(XiveEAS *eas, uint32_t 
> >> lisn, Monitor *mon)
> >> (uint32_t) GETFIELD(EAS_END_DATA, eas->w));
> >>  }
> >>  
> >> +/*
> >> + * END ESB MMIO loads
> >> + */
> >> +static uint64_t xive_end_source_read(void *opaque, hwaddr addr, unsigned 
> >> size)
> >> +{
> >> +XiveENDSource *xsrc = XIVE_END_SOURCE(opaque);
> >> +XiveRouter *xrtr = xsrc->xrtr;
> >> +uint32_t offset = addr & 0xFFF;
> >> +uint8_t end_blk;
> >> +uint32_t end_idx;
> >> +XiveEND end;
> >> +uint32_t end_esmask;
> >> +uint8_t pq;
> >> +uint64_t ret = -1;
> >> +
> >> +end_blk = xrtr->chip_id;
> >> +end_idx = addr >> (xsrc->esb_shift + 1);
> >> +if (xive_router_get_end(xrtr, end_blk, end_idx, &end)) {
> >> +qemu_log_mask(LOG_GUEST_ERROR, "XIVE: No END %x/%x\n", end_blk,
> >> +  end_idx);
> >> +return -1;
> >> +}
> >> +
> >> +if (!(end.w0 & END_W0_VALID)) {
> >> +qemu_log_mask(LOG_GUEST_ERROR, "XIVE: END %x/%x is invalid\n",
> >> +  end_blk, end_idx);
> >> +return -1;
> >> +}
> >> +
> >> +end_esmask = addr_is_even(addr, xsrc->esb_shift) ? END_W1_ESn : 
> >> END_W1_ESe;
> >> +pq =

Re: [Qemu-devel] [PATCH v5 05/36] ppc/xive: introduce the XIVE Event Notification Descriptors

2018-11-22 Thread David Gibson

On Thu, Nov 22, 2018 at 05:49:09PM +1100, Benjamin Herrenschmidt wrote:
> On Thu, 2018-11-22 at 15:41 +1100, David Gibson wrote:
> > 
> > > +void xive_end_reset(XiveEND *end)
> > > +{
> > > +memset(end, 0, sizeof(*end));
> > > +
> > > +/* switch off the escalation and notification ESBs */
> > > +end->w1 = END_W1_ESe_Q | END_W1_ESn_Q;
> > 
> > It's not obvious to me what circumstances this would be called under.
> > Since the ENDs are in system memory, a memset() seems like an odd
> > thing for (virtual) hardware to be doing to it.
> > 
> > > +}
> 
> Not on PAPR ...

Right, so the memset() can go in PAPR specific code.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [Qemu-devel] [PATCH v5 04/36] ppc/xive: introduce the XiveRouter model

2018-11-22 Thread David Gibson

On Thu, Nov 22, 2018 at 08:59:32AM +0100, Cédric Le Goater wrote:
> On 11/22/18 7:50 AM, Benjamin Herrenschmidt wrote:
> > On Thu, 2018-11-22 at 15:44 +1100, David Gibson wrote:
> >>
> >> Sorry, didn't think of this in my first reply.
> >>
> >> 1) Does the hardware ever actually write back to the EAS?  I know it
> >> does for the END, but it's not clear why it would need to for the
> >> EAS.  If not, we don't need the setter.
> > 
> > Nope, though the PAPR model will via hcalls
> 
> Indeed. The H_INT_SET_SOURCE_CONFIG hcall updates the EAT.
> 
> >> 2) The signatures are a bit odd here.  For the setter, a value would
> >> make more sense than a (XiveEAS *), since it's just a word.  For the getter
> >> you could return the EAS value directly rather than using a pointer -
> >> there's already a valid bit in the EAS so you can construct a value
> >> with that cleared if the lisn is out of bounds.
> 
> Yes we could. I think I made it that way to be consistent with the 
> other XIVE internal structures which are bigger : END, NVT

Yeah, but as noted elsewhere I don't really like the get/set model for
the bigger-than-word-size structures.  It gives the impression that
they're atomic updates when they can't be, as well as unnecessarily
copying a bunch of stuff, sometimes on hot paths.

> There might be other reasons in Pnv. One was to use generic accessors 
> to the guest RAM but I didn't do it finally. Take a look at the Pnv
> model and we might decide to change the prototype then. I don't 
> think it's a major change.

Hmmm.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [Qemu-devel] SeaBIOS booting time optimization

2018-11-22 Thread Gerd Hoffmann

On Thu, Nov 22, 2018 at 04:13:38PM +0100, Stefano Garzarella wrote:
> On Thu, Nov 22, 2018 at 12:51 PM Gerd Hoffmann  wrote:
> >
> > On Thu, Nov 22, 2018 at 12:08:55PM +0100, Stefano Garzarella wrote:
> > > Hi,
> > > I continued to investigate how to reduce the boot time with SeaBIOS
> > > and QEMU when it is used with linuxboot_dma.bin (-kernel parameter).
> > > I reached ~12ms with a SeaBIOS configuration (attached) where I
> > > disabled debug output, all Hardware support (except SMM & MTRRs) and I
> > > applied a small patch to disable VGA setup and console (attached).
> >
> > Is there any difference to "qemu -vga none" ?
> 
> Using both (qemu -vga none, and my patch) we are around 10.8 ms.
> Note: using only the patch, Linux is still able to initialize and use the VGA.

But do you want linux to use the vga console if you care about boot times?
I'd expect virtio-console would be fastest in that case.

> - QEMU -vga none + SeaBIOS config (CONFIG_DEBUG_LEVEL=0, disable all
> HW support except
> SMM & MTRRs) + Stephen's TPM patch
>  qemu_init_end: 43.675803
>  fw_start: 43.865178 (+0.189375)
>  fw_do_boot: 58.093161 (+14.227983)
>  linux_start_boot: 59.490308 (+1.397147)
>  linux_start_user: 556.782354 (+497.292046)
> 
> - QEMU -vga none + SeaBIOS config (CONFIG_DEBUG_LEVEL=0, disable all
> HW support except
> SMM & MTRRs, CONFIG_DISABLE_VGA=y) + Stephen's TPM patch
>  qemu_init_end: 42.387412
>  fw_start: 42.579257 (+0.191845)
>  fw_do_boot: 53.381517 (+10.802260)
>  linux_start_boot: 54.848643 (+1.467126)
>  linux_start_user: 498.517050 (+443.668407)

Interesting that CONFIG_DISABLE_VGA=y makes a noticeable difference even
without vga hardware being present.  And not only in seabios but also for
the linux kernel.

Do you know why?

cheers,
  Gerd

Re: [Qemu-devel] [PATCH v1 00/16] packed ring virtio-net backend support

2018-11-22 Thread Wei Xu

On Thu, Nov 22, 2018 at 06:57:31PM +0100, Maxime Coquelin wrote:
> Hi Wei,
> 
> I just tested your series with Tiwei's v3, and it fails
> with ctrl vq enabled:
> qemu-system-x86_64: virtio-net ctrl missing headers

OK, I haven't tried Tiwei's v3 yet, will give it a try.

Wei

> 
> Regards,
> Maxime
> 
> On 11/22/18 3:06 PM, w...@redhat.com wrote:
> >From: Wei Xu 
> >
> >Code base:
> > https://github.com/Whishay/qemu.git
> >
> >rfc v3 -> v1
> >- migration support for both userspace and vhost-net, need tweak vhost
> >   ioctl() to make it work(the code is pasted in the commit message of
> >   vhost migration patch #13).
> >
> >Note:
> >   the high 32-bit guest feature bit is saved as a subsection for
> >   virtio devices which makes packed ring feature bit check unusable when
> >   loading the saved per-queue variables(this is done before loading
> >   subsection which is the last action for device during migration),
> >   so I save and load all the things generally for now, any idea to fix this?
> >
> >- Fixed comments from Jason for rfc v3 sorted by patch #, two comments I
> >   didn't take were(from patch) listed here:
> >09: - introduce new API(virtqueue_fill_n()).
> >   - Didn't take it since userspace backend does not support batching,
> > so only one element is popped and current API should be enough.
> >06 & 07: Refactor split and packed pop()/get_avail_bytes().
> >  - the duplicated code is intertwined with split/packed ring specific
> >things and it might make it unclear, so I only extracted the few
> >common parts outside rcu and keep the others separate.
> >
> >The other revised comments:
> >02: - reuse current 'avail/used' for 'driver/device' in 
> >VRingMemoryRegionCache.
> > - remove event_idx since shadow_avail_idx works.
> >03: - move size recalculation to a separate patch.
> > - keep 'avail/used' in current calculation function name.
> > - initialize 'desc' memory region as 'false' for 1.0('true' for 1.1)
> >04: - delete 'event_idx'
> >05: - rename 'wc' to wrap_counter.
> >06: - converge common part outside rcu section for 1.0/1.1.
> > - move memory barrier for the first 'desc' in between checking flag
> >   and read other fields.
> > - remove unnecessary memory barriers for indirect descriptors.
> > - no need to destroy indirect memory cache since it is generally done
> >   before return from the function.
> > - remove redundant maximum chained descriptors limitation check.
> > - there are some differences(desc name, wrap idx/counter, flags) between
> >   split and packed rings, so keep them separate for now.
> > - amend the comment when recording index and wrap counter for a kick
> >   from guest.
> >07: - calculate fields in descriptor instead of read it when filling.
> > - put memory barrier correctly before filling the flags in descriptor.
> > - replace full memory barrier with a write barrier in fill.
> > - shift to read descriptor flags and descriptor necessarily and
> >   separately in packed_pop().
> > - correct memory barrier in packed_pop() as in packed_fill().
> >08: - reuse 'shadow_avail_idx' instead of adding a new 'event_idx'.
> > - use the compact and verified vring_packed_need_event()
> >   version for vhost net/user.
> >12: - remove the odd cherry-pick comment.
> > - used bit '15' for wrap_counters.
> >
> >rfc v2->v3
> >- addressed performance issue
> >- fixed feedback from v2
> >
> >rfc v1->v2
> >- sync to tiwei's v5
> >- reuse memory cache function with 1.0
> >- dropped detach patch and notification helper(04 & 05 in v1)
> >- guest virtio-net driver unload/reload support
> >- event suppression support(not tested)
> >- addressed feedback from v1
> >
> >Wei Xu (15):
> >   virtio: introduce packed ring definitions
> >   virtio: redefine structure & memory cache for packed ring
> >   virtio: expand offset calculation for packed ring
> >   virtio: add memory region init for packed ring
> >   virtio: init wrap counter for packed ring
> >   virtio: init and desc empty check for packed ring
> >   virtio: get avail bytes check for packed ring
> >   virtio: fill/flush/pop for packed ring
> >   virtio: event suppression support for packed ring
> >   virtio-net: fill head desc after done all in a chain
> >   virtio: add userspace migration of packed ring
> >   virtio: add vhost-net migration of packed ring
> >   virtio: packed ring feature bit for userspace backend
> >   vhost: enable packed ring
> >   virtio: enable packed ring via a new command line
> >
> >  VERSION|   2 +-
> >  hw/net/vhost_net.c |   2 +
> >  hw/net/virtio-net.c|  11 +-
> >  hw/virtio/virtio.c | 756 
> > +++--
> >  include/hw/virtio/virtio.h |   8 +-
> >  include/standard-headers/linux/virtio_config.h |  15 +
> >

[Qemu-devel] [PATCH] MAINTAINERS: Fix ACPI tests data files path

2018-11-22 Thread Philippe Mathieu-Daudé

Missed while moving those files in 438c78dab75.

Signed-off-by: Philippe Mathieu-Daudé 
---
 MAINTAINERS | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1032406c56..6c4f25fb05 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1216,8 +1216,7 @@ F: hw/i386/acpi-build.[hc]
 F: hw/arm/virt-acpi-build.c
 F: tests/bios-tables-test.c
 F: tests/acpi-utils.[hc]
-F: tests/acpi-test-data/*
-F: tests/acpi-test-data/*/*
+F: tests/data/acpi/
 
 ppc4xx
 M: David Gibson 
-- 
2.17.2

Re: [Qemu-devel] [PATCH 1/5] VFIO KABI for migration interface

2018-11-22 Thread Zhao Yan

On Wed, Nov 21, 2018 at 04:39:39AM +0800, Kirti Wankhede wrote:
> - Defined MIGRATION region type and sub-type.
> - Defined VFIO device states during migration process.
> - Defined vfio_device_migration_info structure which will be placed at 0th
>   offset of migration region to get/set VFIO device related information.
>   Defined actions and members of structure usage for each action:
> * To convey VFIO device state to be transitioned to.
> * To get pending bytes yet to be migrated for VFIO device
> * To ask driver to write data to migration region and return number of 
> bytes
>   written in the region
> * In migration resume path, user space app writes to migration region and
>   communicates it to vendor driver.
> * Get bitmap of dirty pages from vendor driver from given start address
> 
> Signed-off-by: Kirti Wankhede 
> Reviewed-by: Neo Jia 
> ---
>  linux-headers/linux/vfio.h | 130 
> +
>  1 file changed, 130 insertions(+)
> 
> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> index 3615a269d378..a6e45cb2cae2 100644
> --- a/linux-headers/linux/vfio.h
> +++ b/linux-headers/linux/vfio.h
> @@ -301,6 +301,10 @@ struct vfio_region_info_cap_type {
>  #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2)
>  #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG  (3)
> 
> +/* Migration region type and sub-type */
> +#define VFIO_REGION_TYPE_MIGRATION (1 << 30)
> +#define VFIO_REGION_SUBTYPE_MIGRATION  (1)
> +
>  /*
>   * The MSIX mappable capability informs that MSIX data of a BAR can be 
> mmapped
>   * which allows direct access to non-MSIX registers which happened to be 
> within
> @@ -602,6 +606,132 @@ struct vfio_device_ioeventfd {
> 
>  #define VFIO_DEVICE_IOEVENTFD  _IO(VFIO_TYPE, VFIO_BASE + 16)
> 
> +/**
> + * VFIO device states :
> + * VFIO User space application should set the device state to indicate vendor
> + * driver in which state the VFIO device should transitioned.
> + * - VFIO_DEVICE_STATE_NONE:
> + *   State when VFIO device is initialized but not yet running.
> + * - VFIO_DEVICE_STATE_RUNNING:
> + *   Transition VFIO device in running state, that is, user space 
> application or
> + *   VM is active.
> + * - VFIO_DEVICE_STATE_MIGRATION_SETUP:
> + *   Transition VFIO device in migration setup state. This is used to prepare
> + *   VFIO device for migration while application or VM and vCPUs are still in
> + *   running state.
> + * - VFIO_DEVICE_STATE_MIGRATION_PRECOPY:
> + *   When VFIO user space application or VM is active and vCPUs are running,
> + *   transition VFIO device in pre-copy state.
> + * - VFIO_DEVICE_STATE_MIGRATION_STOPNCOPY:
> + *   When VFIO user space application or VM is stopped and vCPUs are halted,
> + *   transition VFIO device in stop-and-copy state.
> + * - VFIO_DEVICE_STATE_MIGRATION_SAVE_COMPLETED:
> + *   When VFIO user space application has copied data provided by vendor 
> driver.
> + *   This state is used by vendor driver to clean up all software state that 
> was
> + *   setup during MIGRATION_SETUP state.
> + * - VFIO_DEVICE_STATE_MIGRATION_RESUME:
> + *   Transition VFIO device to resume state, that is, start resuming VFIO 
> device
> + *   when user space application or VM is not running and vCPUs are halted.
> + * - VFIO_DEVICE_STATE_MIGRATION_RESUME_COMPLETED:
> + *   When user space application completes iterations of providing device 
> state
> + *   data, transition device in resume completed state.
> + * - VFIO_DEVICE_STATE_MIGRATION_FAILED:
> + *   Migration process failed due to some reason, transition device to failed
> + *   state. If migration process fails while saving at source, resume device 
> at
> + *   source. If migration process fails while resuming application or VM at
> + *   destination, stop restoration at destination and resume at source.
> + * - VFIO_DEVICE_STATE_MIGRATION_CANCELLED:
> + *   User space application has cancelled migration process either for some
> + *   known reason or due to user's intervention. Transition device to 
> Cancelled
> + *   state, that is, resume device state as it was during running state at
> + *   source.
> + */
> +
> +enum {
> +VFIO_DEVICE_STATE_NONE,
> +VFIO_DEVICE_STATE_RUNNING,
> +VFIO_DEVICE_STATE_MIGRATION_SETUP,
> +VFIO_DEVICE_STATE_MIGRATION_PRECOPY,
> +VFIO_DEVICE_STATE_MIGRATION_STOPNCOPY,
> +VFIO_DEVICE_STATE_MIGRATION_SAVE_COMPLETED,
> +VFIO_DEVICE_STATE_MIGRATION_RESUME,
> +VFIO_DEVICE_STATE_MIGRATION_RESUME_COMPLETED,
> +VFIO_DEVICE_STATE_MIGRATION_FAILED,
> +VFIO_DEVICE_STATE_MIGRATION_CANCELLED,
> +};
> +
> +/**
> + * Structure vfio_device_migration_info is placed at 0th offset of
> + * VFIO_REGION_SUBTYPE_MIGRATION region to get/set VFIO device related 
> migration
> + * information.
> + *
> + * Action Set state:
> + *  To tell vendor driver the state VFIO device should be transitioned 
> to.
> + *  device_state [input] :

Re: [Qemu-devel] [PATCH 3/5] Add migration functions for VFIO devices

2018-11-22 Thread Zhao Yan

On Fri, Nov 23, 2018 at 02:51:39AM +0530, Kirti Wankhede wrote:
> 
> 
> On 11/21/2018 1:09 PM, Zhao, Yan Y wrote:
> > 
> > 
> >> -Original Message-
> >> From: Qemu-devel [mailto:qemu-devel-
> >> bounces+yan.y.zhao=intel@nongnu.org] On Behalf Of Kirti Wankhede
> >> Sent: Wednesday, November 21, 2018 4:40 AM
> >> To: alex.william...@redhat.com; c...@nvidia.com
> >> Cc: zhengxiao...@alibaba-inc.com; Tian, Kevin ; Liu, 
> >> Yi L
> >> ; eskul...@redhat.com; Yang, Ziye 
> >> ;
> >> qemu-devel@nongnu.org; coh...@redhat.com; shuangtai@alibaba-inc.com;
> >> dgilb...@redhat.com; Wang, Zhi A ;
> >> mlevi...@redhat.com; pa...@linux.ibm.com; a...@ozlabs.ru; Kirti Wankhede
> >> ; eau...@redhat.com; fel...@nutanix.com;
> >> jonathan.dav...@nutanix.com; Liu, Changpeng ;
> >> ken@amd.com
> >> Subject: [Qemu-devel] [PATCH 3/5] Add migration functions for VFIO devices
> >>
> >> - Migration function are implemented for VFIO_DEVICE_TYPE_PCI device.
> >> - Added SaveVMHandlers and implemented all basic functions required for 
> >> live
> >>   migration.
> >> - Added VM state change handler to know running or stopped state of VM.
> >> - Added migration state change notifier to get notification on migration 
> >> state
> >>   change. This state is translated to VFIO device state and conveyed to 
> >> vendor
> >>   driver.
> >> - Whether VFIO device supports migration or not is decided based on migration 
> >> region
> >>   query. If migration region query is successful then migration is 
> >> supported
> >>   else migration is blocked.
> >> - Structure vfio_device_migration_info is mapped at 0th offset of migration
> >>   region and should always trapped by VFIO device's driver. Added both 
> >> type of
> >>   access support, trapped or mmapped, for data section of the region.
> >> - To save device state, read data offset and size using structure
> >>   vfio_device_migration_info.data, accordingly copy data from the region.
> >> - To restore device state, write data offset and size in the structure and 
> >> write
> >>   data in the region.
> >> - To get dirty page bitmap, write start address and pfn count then read 
> >> count of
> >>   pfns copied and accordingly read those from the rest of the region or 
> >> mmaped
> >>   part of the region. This copy is iterated till page bitmap for all 
> >> requested
> >>   pfns are copied.
> >>
> >> Signed-off-by: Kirti Wankhede 
> >> Reviewed-by: Neo Jia 
> >> ---
> >>  hw/vfio/Makefile.objs |   2 +-
> >>  hw/vfio/migration.c   | 729
> >> ++
> >>  include/hw/vfio/vfio-common.h |  23 ++
> >>  3 files changed, 753 insertions(+), 1 deletion(-)  create mode 100644
> >> hw/vfio/migration.c
> >>
> >> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs index
> >> a2e7a0a7cf02..2cf2ba1440f2 100644
> >> --- a/hw/vfio/Makefile.objs
> >> +++ b/hw/vfio/Makefile.objs
> >> @@ -1,5 +1,5 @@
> >>  ifeq ($(CONFIG_LINUX), y)
> >> -obj-$(CONFIG_SOFTMMU) += common.o
> >> +obj-$(CONFIG_SOFTMMU) += common.o migration.o
> >>  obj-$(CONFIG_PCI) += pci.o pci-quirks.o display.o
> >>  obj-$(CONFIG_VFIO_CCW) += ccw.o
> >>  obj-$(CONFIG_SOFTMMU) += platform.o
> >> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c new file mode 100644
> >> index ..717fb63e4f43
> >> --- /dev/null
> >> +++ b/hw/vfio/migration.c
> >> @@ -0,0 +1,729 @@
> >> +/*
> >> + * Migration support for VFIO devices
> >> + *
> >> + * Copyright NVIDIA, Inc. 2018
> >> + *
> >> + * This work is licensed under the terms of the GNU GPL, version 2. See
> >> + * the COPYING file in the top-level directory.
> >> + */
> >> +
> >> +#include "qemu/osdep.h"
> >> +#include 
> >> +
> >> +#include "hw/vfio/vfio-common.h"
> >> +#include "cpu.h"
> >> +#include "migration/migration.h"
> >> +#include "migration/qemu-file.h"
> >> +#include "migration/register.h"
> >> +#include "migration/blocker.h"
> >> +#include "migration/misc.h"
> >> +#include "qapi/error.h"
> >> +#include "exec/ramlist.h"
> >> +#include "exec/ram_addr.h"
> >> +#include "pci.h"
> >> +
> >> +/*
> >> + * Flags used as delimiter:
> >> + * 0x => MSB 32-bit all 1s
> >> + * 0xef10 => emulated (virtual) function IO
> >> + * 0x => 16-bits reserved for flags
> >> + */
> >> +#define VFIO_MIG_FLAG_END_OF_STATE  (0xef11ULL)
> >> +#define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xef12ULL)
> >> +#define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xef13ULL)
> >> +
> >> +static void vfio_migration_region_exit(VFIODevice *vbasedev) {
> >> +VFIOMigration *migration = vbasedev->migration;
> >> +
> >> +if (!migration) {
> >> +return;
> >> +}
> >> +
> >> +if (migration->region.buffer.size) {
> >> +vfio_region_exit(>region.buffer);
> >> +vfio_region_finalize(>region.buffer);
> >> +}
> >> +}
> >> +
> >> +static int vfio_migration_region_init(VFIODevice *vbasedev) {
> >> +VFIOMigration *migration = vbasedev->migration;
> >> +Object *obj =

Re: [Qemu-devel] MAINTAINERS leaves too many files uncovered

2018-11-22 Thread Philippe Mathieu-Daudé

Hi Zoltan,

On 22/11/18 22:56, BALATON Zoltan wrote:
> On Thu, 22 Nov 2018, Thomas Huth wrote:
>>>  19 hw/display/sm501.c
>>
>> Maybe Balaton wants to adopt this file?
> 
> As discussed before this was originally part of SH4 but since that seems
> to be not actively maintained any more and latest changes were for
> sam460ex I can add it to sam460ex. I planned to do that when I make any
> changes in the future but if needed I can send a patch for this now.
> 
> While we're there, should I also add pc-bios/canyonlands.dts which is
> the device tree for sam460ex?

Yes, I suggest this set:

pc-bios/canyonlands.dt?
pc-bios/u-boot-sam460*
roms/u-boot-sam460ex

Regards,

Phil.

Re: [Qemu-devel] [PATCH] MAINTAINERS: add missing xtensa patterns

2018-11-22 Thread Philippe Mathieu-Daudé

On 23/11/18 0:08, Max Filippov wrote:
> Signed-off-by: Max Filippov 

Reviewed-by: Philippe Mathieu-Daudé 

> ---
>  MAINTAINERS | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1032406c5607..f4a7e453c06f 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -311,6 +311,8 @@ F: target/xtensa/
>  F: hw/xtensa/
>  F: tests/tcg/xtensa/
>  F: disas/xtensa.c
> +F: include/hw/xtensa/xtensa-isa.h
> +F: default-configs/xtensa*.mak
>  
>  TriCore
>  M: Bastian Koppelmann 
>

Re: [Qemu-devel] [PATCH] Xen PCI passthrough: fix passthrough failure when irq map failure

2018-11-22 Thread Zhao Yan

On Thu, Nov 22, 2018 at 03:18:05PM +0100, Roger Pau Monné wrote:
> On Thu, Nov 22, 2018 at 08:11:20AM -0500, Zhao Yan wrote:
> > On Thu, Oct 18, 2018 at 03:56:36PM +0100, Roger Pau Monné wrote:
> > > On Thu, Oct 18, 2018 at 08:22:41AM +, Zhao, Yan Y wrote:
> > > > Hi
> > > > The background for this patch is that: for some pci device, even if its 
> > > > PCI_INTERRUPT_PIN is not 0, it actually does not support INTx mode, so 
> > > > we should just report error, disable INTx mode and continue the 
> > > > passthrough.
> > > > However, the commit 5a11d0f7 regards this as error condition and let 
> > > > qemu quit passthrough, which is too rigorous.
> > > > 
> > > > Error message is below:
> > > > libxl: error: libxl_qmp.c:287:qmp_handle_error_response: Domain 
> > > > 2:received an error message from QMP server: Mapping machine irq 0 to 
> > > > pirq -1 failed: Operation not permitted
> > > 
> > > I'm having issues figuring out what's happening here.
> > > s->real_device.irq is 0, yet the PCI config space read of
> > > PCI_INTERRUPT_PIN returns something different than 0.
> > > 
> > > AFAICT this is due to some kind of error in Linux, so that even when
> > > the device is supposed to have a valid IRQ the sysfs node it is set to
> > > 0, do you know the actual underlying cause of this?
> > > 
> > > Thanks, Roger.
> > Hi Roger
> > Sorry for the late reply, I just missed this mail...
> > On my side, it's because the hardware actually does not support INTx mode,
> > but its configuration space does not report PCI_INTERRUPT_PIN as 0. It's a
> > hardware bug, but previous versions of qemu can tolerate it, switch to MSI
> > and make passthrough work.
> 
> Then I think it would be better to check both PCI_INTERRUPT_PIN and
> s->real_device.irq before attempting to map the IRQ.
> 
> Making the error non-fatal would mean that a device with a valid IRQ
> could fail to be setup correctly but the guest will still be created,
> and things won't go as expected when the guest attempts to use it.
> 
> Thanks, Roger.
hi roger
Thanks for your suggestion. It's right that "s->real_device.irq" needs to 
be checked before mapping, e.g. whether it's 0.
But on the other hand, maybe xc_physdev_map_pirq() itself can serve as a 
check of "s->real_device.irq"?
Like in our case, it will fail and return -EPERM.
Then error handling is still conducted ==> set INTX_DISABLE flag, even though the 
error is not fatal.

machine_irq = s->real_device.irq;
rc = xc_physdev_map_pirq(xen_xc, xen_domid, machine_irq, &pirq);
if (rc < 0) {
error_setg_errno(errp, errno, "Mapping machine irq %u to"
 " pirq %i failed", machine_irq, pirq);

/* Disable PCI intx assertion (turn on bit10 of devctl) */
cmd |= PCI_COMMAND_INTX_DISABLE;
machine_irq = 0;
s->machine_irq = 0;
So, do you think it's all right just converting fatal error to non-fatal?


Thanks
Yan

Re: [Qemu-devel] [PATCH 06/22] dma/puv3_dma: Convert sysbus initfunction to realize function

2018-11-22 Thread maozy





On 11/20/18 10:46 PM, Peter Maydell wrote:

On 19 November 2018 at 12:08, Mao Zhongyi
 wrote:

Use DeviceClass rather than SysBusDeviceClass in
puv3_dma_class_init().

Cc: g...@mprc.pku.edu.cn

Signed-off-by: Mao Zhongyi 
Signed-off-by: Zhang Shengju 
---
  hw/dma/puv3_dma.c | 10 --
  1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/hw/dma/puv3_dma.c b/hw/dma/puv3_dma.c
index b97a6c1767..c89eade029 100644
--- a/hw/dma/puv3_dma.c
+++ b/hw/dma/puv3_dma.c
@@ -76,7 +76,7 @@ static const MemoryRegionOps puv3_dma_ops = {
  .endianness = DEVICE_NATIVE_ENDIAN,
  };

-static int puv3_dma_init(SysBusDevice *dev)
+static void puv3_dma_realize(DeviceState *dev, Error **errp)
  {
  PUV3DMAState *s = PUV3_DMA(dev);
  int i;
@@ -87,16 +87,14 @@ static int puv3_dma_init(SysBusDevice *dev)

  memory_region_init_io(>iomem, OBJECT(s), _dma_ops, s, "puv3_dma",
  PUV3_REGS_OFFSET);
-sysbus_init_mmio(dev, >iomem);
-
-return 0;
+sysbus_init_mmio(SYS_BUS_DEVICE(dev), >iomem);
  }

  static void puv3_dma_class_init(ObjectClass *klass, void *data)
  {
-SysBusDeviceClass *sdc = SYS_BUS_DEVICE_CLASS(klass);
+DeviceClass *dc = DEVICE_CLASS(klass);

-sdc->init = puv3_dma_init;
+dc->realize = puv3_dma_realize;
  }

  static const TypeInfo puv3_dma_info = {


Reviewed-by: Peter Maydell 

(I note that this device is missing a reset function and is
instead resetting in its init/realize function, but that's a
separate bug. It's also missing vmstate.)


OK, I will fix it later in a separate patch.

Thanks,
Mao



thanks
-- PMM

Re: [Qemu-devel] [PATCH 22/22] core/sysbus: remove the SysBusDeviceClass::initpath

2018-11-22 Thread maozy


Hi, Eduardo

On 11/20/18 7:31 AM, Eduardo Habkost wrote:

On Mon, Nov 19, 2018 at 08:08:20PM +0800, Mao Zhongyi wrote:

Currently, all sysbus devices have been converted to realize(),
so remove this path.

Cc: ehabk...@redhat.com
Cc: th...@redhat.com
Cc: pbonz...@redhat.com
Cc: arm...@redhat.com
Cc: peter.mayd...@linaro.org
Cc: richard.hender...@linaro.org
Cc: alistair.fran...@wdc.com

Signed-off-by: Mao Zhongyi 
Signed-off-by: Zhang Shengju 
---
  hw/core/sysbus.c| 15 ---
  include/hw/sysbus.h |  3 ---
  2 files changed, 18 deletions(-)

diff --git a/hw/core/sysbus.c b/hw/core/sysbus.c
index 7ac36ad3e7..030ad426c1 100644
--- a/hw/core/sysbus.c
+++ b/hw/core/sysbus.c
@@ -201,20 +201,6 @@ void sysbus_init_ioports(SysBusDevice *dev, uint32_t 
ioport, uint32_t size)
  }
  }
  
-/* TODO remove once all sysbus devices have been converted to realize */

-static void sysbus_realize(DeviceState *dev, Error **errp)
-{
-SysBusDevice *sd = SYS_BUS_DEVICE(dev);
-SysBusDeviceClass *sbc = SYS_BUS_DEVICE_GET_CLASS(sd);
-
-if (!sbc->init) {
-return;
-}
-if (sbc->init(sd) < 0) {
-error_setg(errp, "Device initialization failed");
-}
-}


Nice.  :)



-
  DeviceState *sysbus_create_varargs(const char *name,
 hwaddr addr, ...)
  {
@@ -327,7 +313,6 @@ MemoryRegion *sysbus_address_space(SysBusDevice *dev)
  static void sysbus_device_class_init(ObjectClass *klass, void *data)
  {
  DeviceClass *k = DEVICE_CLASS(klass);
-k->realize = sysbus_realize;


Have you ensured this won't break any subclasses that
saved the original realize function on a parent_realize field?


Thanks for the catch.


Now they will have parent_realize set to NULL.


In order to avoid the subclasses whose parent_realize field is
set to NULL, the k->realize function must be retained even
though it doesn't do anything practical. Just like this:


-/* TODO remove once all sysbus devices have been converted to realize*/
 static void sysbus_realize(DeviceState *dev, Error **errp)
 {
-SysBusDevice *sd = SYS_BUS_DEVICE(dev);
-SysBusDeviceClass *sbc = SYS_BUS_DEVICE_GET_CLASS(sd);
-
-if (!sbc->init) {
-return;
-}
-if (sbc->init(sd) < 0) {
-error_setg(errp, "Device initialization failed");
-}
 }

It doesn't look elegant, but I couldn't think of a better way. If you
can give me some hints, I would really appreciate it. :)

Thanks,
Mao




Most of them use device_class_set_parent_realize() to implement
that.


  k->bus_type = TYPE_SYSTEM_BUS;
  /*
   * device_add plugs devices into a suitable bus.  For "real" buses,
diff --git a/include/hw/sysbus.h b/include/hw/sysbus.h
index 0b59a3b8d6..1aedcf05c9 100644
--- a/include/hw/sysbus.h
+++ b/include/hw/sysbus.h
@@ -38,9 +38,6 @@ typedef struct SysBusDevice SysBusDevice;
  typedef struct SysBusDeviceClass {
  /*< private >*/
  DeviceClass parent_class;
-/*< public >*/
-
-int (*init)(SysBusDevice *dev);
  
  /*

   * Let the sysbus device format its own non-PIO, non-MMIO unit address.
--
2.17.1

Re: [Qemu-devel] [PATCH v5 01/36] ppc/xive: introduce a XIVE interrupt source model

2018-11-22 Thread David Gibson

On Thu, Nov 22, 2018 at 08:25:06AM +0100, Cédric Le Goater wrote:
> On 11/22/18 4:05 AM, David Gibson wrote:
> > On Fri, Nov 16, 2018 at 11:56:54AM +0100, Cédric Le Goater wrote:
> >> The first sub-engine of the overall XIVE architecture is the Interrupt
> >> Virtualization Source Engine (IVSE). An IVSE can be integrated into
> >> another logic, like in a PCI PHB or in the main interrupt controller
> >> to manage IPIs.
> >>
> >> Each IVSE instance is associated with an Event State Buffer (ESB) that
> >> contains a two bit state entry for each possible event source. When an
> >> event is signaled to the IVSE, by MMIO or some other means, the
> >> associated interrupt state bits are fetched from the ESB and
> >> modified. Depending on the resulting ESB state, the event is forwarded
> >> to the IVRE sub-engine of the controller doing the routing.
> >>
> >> Each supported ESB entry is associated with either a single page or an
> >> even/odd pair of pages which provide commands to manage the source:
> >> to EOI, to turn off the source for instance.
> >>
> >> On a sPAPR machine, the O/S will obtain the page address of the ESB
> >> entry associated with a source and its characteristic using the
> >> H_INT_GET_SOURCE_INFO hcall. On PowerNV, a similar OPAL call is used.
> >>
> >> The xive_source_notify() routine is in charge of forwarding the source
> >> event notification to the routing engine. It will be filled later on.
> >>
> >> Signed-off-by: Cédric Le Goater 
> > 
> > Ok, this is looking basically pretty good.  Few details to query
> > below.
> > 
> > 
> >> ---
> >>  default-configs/ppc64-softmmu.mak |   1 +
> >>  include/hw/ppc/xive.h | 130 ++
> >>  hw/intc/xive.c| 379 ++
> >>  hw/intc/Makefile.objs |   1 +
> >>  4 files changed, 511 insertions(+)
> >>  create mode 100644 include/hw/ppc/xive.h
> >>  create mode 100644 hw/intc/xive.c
> >>
> >> diff --git a/default-configs/ppc64-softmmu.mak 
> >> b/default-configs/ppc64-softmmu.mak
> >> index aec2855750d6..2d1e7c5c4668 100644
> >> --- a/default-configs/ppc64-softmmu.mak
> >> +++ b/default-configs/ppc64-softmmu.mak
> >> @@ -16,6 +16,7 @@ CONFIG_VIRTIO_VGA=y
> >>  CONFIG_XICS=$(CONFIG_PSERIES)
> >>  CONFIG_XICS_SPAPR=$(CONFIG_PSERIES)
> >>  CONFIG_XICS_KVM=$(call land,$(CONFIG_PSERIES),$(CONFIG_KVM))
> >> +CONFIG_XIVE=$(CONFIG_PSERIES)
> >>  CONFIG_MEM_DEVICE=y
> >>  CONFIG_DIMM=y
> >>  CONFIG_SPAPR_RNG=y
> >> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
> >> new file mode 100644
> >> index ..5fec4b08705d
> >> --- /dev/null
> >> +++ b/include/hw/ppc/xive.h
> >> @@ -0,0 +1,130 @@
> >> +/*
> >> + * QEMU PowerPC XIVE interrupt controller model
> >> + *
> >> + * Copyright (c) 2017-2018, IBM Corporation.
> >> + *
> >> + * This code is licensed under the GPL version 2 or later. See the
> >> + * COPYING file in the top-level directory.
> > 
> > A cheat sheet at the top of this header with the old and new XIVE
> > terms would be quite nice to have.
> 
> Yes. It's a good place. I will put the XIVE acronyms here :
>  
>  EA   Event Assignment
>  EISN Effective Interrupt Source Number
>  END  Event Notification Descriptor
>  ESB  Event State Buffer
>  EQ   Event Queue
>  LISN Logical Interrupt Source Number
>  NVT  Notification Virtual Target
>  TIMA Thread Interrupt Management Area
>  ...

That sounds good, but what I'd also like is showing that NVT == VP and
EAS == IVT and so forth.

> >> + */
> >> +
> >> +#ifndef PPC_XIVE_H
> >> +#define PPC_XIVE_H
> >> +
> >> +#include "hw/sysbus.h"
> > 
> > So, I'm a bit dubious about making the XiveSource a SysBus device -
> > I'm concerned it won't play well with tying it into the other devices
> > like PHB that "own" it in real hardware.
> 
> It does but I can take a look at changing it to a DeviceState. The 
> reset handlers might be a concern.

As a "non bus" device I think you'd need to register your own reset
handler rather than just setting dc->reset.  Otherwise, I think that
should work.

> > I think we'd be better off making it a direct descendent of
> > TYPE_DEVICE which constructs the MMIO region, but doesn't map it.
> 
> At one point, I started working on a XiveESB object doing what I think 
> you are suggesting, but then I removed it. I am reluctant to add more 
> complexity now, the patchset is just growing and growing ... 
> 
> But I agree there are fundamentals to get right for KVM. Let's talk 
> about it after you have looked at the overall patchset, at least up 
> to KVM initial support.

Hm, ok.

> > Then we can have a SysBusDevice (and/or other) wrapper which
> > instantiates the XiveSource core and maps it into somewhere
> > accessible.
> 
> The XIVE controller model does the mapping of the source currently.

I'm.. I'm not sure what you mean by that.   We have a
sysbus_init_mmio() right here which effectively maps in the MMIO
region AFAICT.

Re: [Qemu-devel] [PATCH] spapr: drop redundant statement in spapr_populate_drconf_memory()

2018-11-22 Thread David Gibson

On Thu, Nov 22, 2018 at 03:31:36PM +0100, Greg Kurz wrote:
> Signed-off-by: Greg Kurz 

Applied to ppc-for-3.2.

> ---
>  hw/ppc/spapr.c |2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 9c41098b5781..7771a628f879 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -893,8 +893,6 @@ static int spapr_populate_drconf_memory(sPAPRMachineState 
> *spapr, void *fdt)
>  /* ibm,associativity-lookup-arrays */
>  buf_len = (nr_nodes * 4 + 2) * sizeof(uint32_t);
>  cur_index = int_buf = g_malloc0(buf_len);
> -
> -cur_index = int_buf;
>  int_buf[0] = cpu_to_be32(nr_nodes);
>  int_buf[1] = cpu_to_be32(4); /* Number of entries per associativity list 
> */
>  cur_index += 2;
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

Re: [Qemu-devel] [PATCH for 3.1 v3] spapr: Fix ibm, max-associativity-domains property number of nodes

2018-11-22 Thread David Gibson

On Thu, Nov 22, 2018 at 08:19:27AM -0500, Serhii Popovych wrote:
> Laurent Vivier reported off by one with maximum number of NUMA nodes
> provided by qemu-kvm being less by one than required according to
> description of "ibm,max-associativity-domains" property in LoPAPR.
> 
> It appears that I incorrectly treated LoPAPR description of this
> property assuming it provides last valid domain (NUMA node here)
> instead of maximum number of domains.
> 
>   ### Before hot-add
> 
>   (qemu) info numa
>   3 nodes
>   node 0 cpus: 0
>   node 0 size: 0 MB
>   node 0 plugged: 0 MB
>   node 1 cpus:
>   node 1 size: 1024 MB
>   node 1 plugged: 0 MB
>   node 2 cpus:
>   node 2 size: 0 MB
>   node 2 plugged: 0 MB
> 
>   $ numactl -H
>   available: 2 nodes (0-1)
>   node 0 cpus: 0
>   node 0 size: 0 MB
>   node 0 free: 0 MB
>   node 1 cpus:
>   node 1 size: 999 MB
>   node 1 free: 658 MB
>   node distances:
>   node   0   1
> 0:  10  40
> 1:  40  10
> 
>   ### Hot-add
> 
>   (qemu) object_add memory-backend-ram,id=mem0,size=1G
>   (qemu) device_add pc-dimm,id=dimm1,memdev=mem0,node=2
>   (qemu) [   87.704898] pseries-hotplug-mem: Attempting to hot-add 4 ...
>   
>   [   87.705128] lpar: Attempting to resize HPT to shift 21
>   ... 
> 
>   ### After hot-add
> 
>   (qemu) info numa
>   3 nodes
>   node 0 cpus: 0
>   node 0 size: 0 MB
>   node 0 plugged: 0 MB
>   node 1 cpus:
>   node 1 size: 1024 MB
>   node 1 plugged: 0 MB
>   node 2 cpus:
>   node 2 size: 1024 MB
>   node 2 plugged: 1024 MB
> 
>   $ numactl -H
>   available: 2 nodes (0-1)
>   
>  Still only two nodes (and memory hot-added to node 0 below)
>   node 0 cpus: 0
>   node 0 size: 1024 MB
>   node 0 free: 1021 MB
>   node 1 cpus:
>   node 1 size: 999 MB
>   node 1 free: 658 MB
>   node distances:
>   node   0   1
> 0:  10  40
> 1:  40  10
> 
> After fix applied numactl(8) reports 3 nodes available and memory
> plugged into node 2 as expected.
> 
> >From David Gibson:
> --
>   Qemu makes a distinction between "non NUMA" (nb_numa_nodes == 0) and
>   "NUMA with one node" (nb_numa_nodes == 1).  But from a PAPR guests's
>   point of view these are equivalent.  I don't want to present two
>   different cases to the guest when we don't need to, so even though the
>   guest can handle it, I'd prefer we put a '1' here for both the
>   nb_numa_nodes == 0 and nb_numa_nodes == 1 case.
> 
> This consolidates everything discussed previously on mailing list.
> 
> Fixes: da9f80fbad21 ("spapr: Add ibm,max-associativity-domains property")
> Reported-by: Laurent Vivier 
> Signed-off-by: Serhii Popovych 

Applied, thanks.

> ---
>  hw/ppc/spapr.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 7afd1a1..2ee7201 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1033,7 +1033,7 @@ static void spapr_dt_rtas(sPAPRMachineState *spapr, 
> void *fdt)
>  cpu_to_be32(0),
>  cpu_to_be32(0),
>  cpu_to_be32(0),
> -cpu_to_be32(nb_numa_nodes ? nb_numa_nodes - 1 : 0),
> +cpu_to_be32(nb_numa_nodes ? nb_numa_nodes : 1),
>  };
>  
>  _FDT(rtas = fdt_add_subnode(fdt, 0, "rtas"));

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

[Qemu-devel] [PATCH] MAINTAINERS: add missing xtensa patterns

2018-11-22 Thread Max Filippov

Signed-off-by: Max Filippov 
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1032406c5607..f4a7e453c06f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -311,6 +311,8 @@ F: target/xtensa/
 F: hw/xtensa/
 F: tests/tcg/xtensa/
 F: disas/xtensa.c
+F: include/hw/xtensa/xtensa-isa.h
+F: default-configs/xtensa*.mak
 
 TriCore
 M: Bastian Koppelmann 
-- 
2.11.0

Re: [Qemu-devel] [PATCH] audio/hda: fix guest triggerable assert

2018-11-22 Thread Philippe Mathieu-Daudé

Hi Gerd,

On 22/11/18 14:32, Gerd Hoffmann wrote:
> Guest writes to a readonly register trigger the assert in
> intel_hda_reg_write().  Add a check and just ignore them.

Is this 3.1 material? It seems to apply.

> 
> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1628433
> Signed-off-by: Gerd Hoffmann 
> ---
>  hw/audio/intel-hda.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/hw/audio/intel-hda.c b/hw/audio/intel-hda.c
> index 23a2cf6484..066532713c 100644
> --- a/hw/audio/intel-hda.c
> +++ b/hw/audio/intel-hda.c
> @@ -929,6 +929,10 @@ static void intel_hda_reg_write(IntelHDAState *d, const 
> IntelHDAReg *reg, uint32
>  if (!reg) {
>  return;
>  }
> +if (!reg->wmask) {
> +/* read-only register */

Can you add:

   qemu_log_mask(LOG_GUEST_ERROR,
 "intel-hda: Register %s is read-only\n",
 reg->name);

Regardless:
Reviewed-by: Philippe Mathieu-Daudé 

> +return;
> +}
>  
>  if (d->debug) {
>  time_t now = time(NULL);
>

Re: [Qemu-devel] [PATCH for-3.1] MAINTAINERS: Add an ARM SMMU section

2018-11-22 Thread Philippe Mathieu-Daudé

Hi Eric,

On 22/11/18 19:01, Eric Auger wrote:
> Add a new ARM SMMU section and set Eric Auger as the maintainer
> for ARM SMMU emulation sources.
> 
> Signed-off-by: Eric Auger 
> Suggested-by: Peter Maydell 
> ---
>  MAINTAINERS | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1032406c56..3cac9f0a0c 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -151,6 +151,13 @@ F: disas/arm.c
>  F: disas/arm-a64.cc
>  F: disas/libvixl/
>  
> +ARM SMMU
> +M: Eric Auger 
> +L: qemu-...@nongnu.org
> +S: Maintained

Shouldn't you use 'Supported': "Someone is actually paid to look after
this."?

> +F: hw/arm/smmu*
> +F: include/hw/arm/smmu*
> +
>  CRIS
>  M: Edgar E. Iglesias 
>  S: Maintained
>

Re: [Qemu-devel] [PATCH v2 for-4.0 3/3] elf_ops.h: Use address_space_write() to write memory

2018-11-22 Thread Philippe Mathieu-Daudé

On 22/11/18 18:26, Peter Maydell wrote:
> Currently the load_elf function in elf_ops.h uses
> cpu_physical_memory_write() to write the ELF file to
> memory if it is not handling it as a ROM blob. This
> means we ignore the AddressSpace that the function
> is passed to define where it should be loaded.
> Use address_space_write() instead.
> 
> Signed-off-by: Peter Maydell 
> ---
> v1->v2: handle NULL as

Reviewed-by: Philippe Mathieu-Daudé 

> ---
>  include/hw/elf_ops.h | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/include/hw/elf_ops.h b/include/hw/elf_ops.h
> index 81cecaf27e2..74679ff8da3 100644
> --- a/include/hw/elf_ops.h
> +++ b/include/hw/elf_ops.h
> @@ -482,7 +482,9 @@ static int glue(load_elf, SZ)(const char *name, int fd,
>  rom_add_elf_program(label, data, file_size, mem_size,
>  addr, as);
>  } else {
> -cpu_physical_memory_write(addr, data, file_size);
> +address_space_write(as ? as : &address_space_memory,
> +addr, MEMTXATTRS_UNSPECIFIED,
> +data, file_size);
>  g_free(data);
>  }
>  }
>

Re: [Qemu-devel] [PATCH v2 for-4.0 2/3] monitor: Use address_space_read() to read memory

2018-11-22 Thread Philippe Mathieu-Daudé

On 22/11/18 18:26, Peter Maydell wrote:
> Currently monitor.c reads physical memory using
> cpu_physical_memory_read(). This effectively hard-codes
> assuming that all CPUs have the same view of physical
> memory. Switch to address_space_read() instead, which
> lets us use the AddressSpace for the CPU we're
> reading memory for (falling back to address_space_memory
> if there is no CPU, as happens with the "none" board).
> As a bonus, this allows us to detect failures to read memory.
> 
> Signed-off-by: Peter Maydell 
> Reviewed-by: Dr. David Alan Gilbert 

Reviewed-by: Philippe Mathieu-Daudé 

> ---
>  monitor.c | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/monitor.c b/monitor.c
> index d39390c2f2f..b0e8f2c490a 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -1604,7 +1604,13 @@ static void memory_dump(Monitor *mon, int count, int 
> format, int wsize,
>  if (l > line_size)
>  l = line_size;
>  if (is_physical) {
> -cpu_physical_memory_read(addr, buf, l);
> +AddressSpace *as = cs ? cs->as : &address_space_memory;
> +MemTxResult r = address_space_read(as, addr,
> +   MEMTXATTRS_UNSPECIFIED, buf, 
> l);
> +if (r != MEMTX_OK) {
> +monitor_printf(mon, " Cannot access memory\n");
> +break;
> +}
>  } else {
>  if (cpu_memory_rw_debug(cs, addr, buf, l, 0) < 0) {
>  monitor_printf(mon, " Cannot access memory\n");
>

Re: [Qemu-devel] [PATCH v5 06/36] ppc/xive: add support for the END Event State buffers

2018-11-22 Thread Cédric Le Goater

On 11/22/18 6:13 AM, David Gibson wrote:
> On Fri, Nov 16, 2018 at 11:56:59AM +0100, Cédric Le Goater wrote:
>> The Event Notification Descriptor also contains two Event State
>> Buffers providing further coalescing of interrupts, one for the
>> notification event (ESn) and one for the escalation events (ESe). A
>> MMIO page is assigned for each to control the EOI through loads
>> only. Stores are not allowed.
>>
>> The END ESBs are modeled through an object resembling the 'XiveSource'
>> It is stateless as the END state bits are backed into the XiveEND
>> structure under the XiveRouter and the MMIO accesses follow the same
>> rules as for the standard source ESBs.
>>
>> END ESBs are not supported by the Linux drivers, either on OPAL or on
>> sPAPR. Nevertheless, this provides a means to study the question in the
>> future and validates the XIVE model a bit more.
>>
>> Signed-off-by: Cédric Le Goater 
>> ---
>>  include/hw/ppc/xive.h |  20 ++
>>  hw/intc/xive.c| 160 +-
>>  2 files changed, 178 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
>> index ce62aaf28343..24301bf2076d 100644
>> --- a/include/hw/ppc/xive.h
>> +++ b/include/hw/ppc/xive.h
>> @@ -208,6 +208,26 @@ int xive_router_get_end(XiveRouter *xrtr, uint8_t 
>> end_blk, uint32_t end_idx,
>>  int xive_router_set_end(XiveRouter *xrtr, uint8_t end_blk, uint32_t end_idx,
>>  XiveEND *end);
>>  
>> +/*
>> + * XIVE END ESBs
>> + */
>> +
>> +#define TYPE_XIVE_END_SOURCE "xive-end-source"
>> +#define XIVE_END_SOURCE(obj) \
>> +OBJECT_CHECK(XiveENDSource, (obj), TYPE_XIVE_END_SOURCE)
> 
> Is there a particular reason to make this a full QOM object, rather
> than just embedding it in the XiveRouter?

yes, it should probably be under the XiveRouter you are right because
there is a direct link with the ENDT which is in the XiveRouter. 

But if I remove the chip_id field from the XiveRouter, it becomes a QOM
interface. something to ponder.
 
>> +typedef struct XiveENDSource {
>> +SysBusDevice parent;
>> +
>> +uint32_tnr_ends;
>> +
>> +/* ESB memory region */
>> +uint32_tesb_shift;
>> +MemoryRegionesb_mmio;
>> +
>> +XiveRouter  *xrtr;
>> +} XiveENDSource;
>> +
>>  /*
>>   * For legacy compatibility, the exceptions define up to 256 different
>>   * priorities. P9 implements only 9 levels : 8 active levels [0 - 7]
>> diff --git a/hw/intc/xive.c b/hw/intc/xive.c
>> index 9cb001e7b540..5a8882d47a98 100644
>> --- a/hw/intc/xive.c
>> +++ b/hw/intc/xive.c
>> @@ -622,8 +622,18 @@ static void xive_router_end_notify(XiveRouter *xrtr, 
>> uint8_t end_blk,
>>   * even further coalescing in the Router
>>   */
>>  if (!(end.w0 & END_W0_UCOND_NOTIFY)) {
>> -qemu_log_mask(LOG_UNIMP, "XIVE: !UCOND_NOTIFY not implemented\n");
>> -return;
>> +uint8_t pq = GETFIELD(END_W1_ESn, end.w1);
>> +bool notify = xive_esb_trigger(&pq);
>> +
>> +if (pq != GETFIELD(END_W1_ESn, end.w1)) {
>> +end.w1 = SETFIELD(END_W1_ESn, end.w1, pq);
>> +xive_router_set_end(xrtr, end_blk, end_idx, &end);
>> +}
>> +
>> +/* ESn[Q]=1 : end of notification */
>> +if (!notify) {
>> +return;
>> +}
>>  }
>>  
>>  /*
>> @@ -706,6 +716,151 @@ void xive_eas_pic_print_info(XiveEAS *eas, uint32_t 
>> lisn, Monitor *mon)
>> (uint32_t) GETFIELD(EAS_END_DATA, eas->w));
>>  }
>>  
>> +/*
>> + * END ESB MMIO loads
>> + */
>> +static uint64_t xive_end_source_read(void *opaque, hwaddr addr, unsigned 
>> size)
>> +{
>> +XiveENDSource *xsrc = XIVE_END_SOURCE(opaque);
>> +XiveRouter *xrtr = xsrc->xrtr;
>> +uint32_t offset = addr & 0xFFF;
>> +uint8_t end_blk;
>> +uint32_t end_idx;
>> +XiveEND end;
>> +uint32_t end_esmask;
>> +uint8_t pq;
>> +uint64_t ret = -1;
>> +
>> +end_blk = xrtr->chip_id;
>> +end_idx = addr >> (xsrc->esb_shift + 1);
>> +if (xive_router_get_end(xrtr, end_blk, end_idx, &end)) {
>> +qemu_log_mask(LOG_GUEST_ERROR, "XIVE: No END %x/%x\n", end_blk,
>> +  end_idx);
>> +return -1;
>> +}
>> +
>> +if (!(end.w0 & END_W0_VALID)) {
>> +qemu_log_mask(LOG_GUEST_ERROR, "XIVE: END %x/%x is invalid\n",
>> +  end_blk, end_idx);
>> +return -1;
>> +}
>> +
>> +end_esmask = addr_is_even(addr, xsrc->esb_shift) ? END_W1_ESn : 
>> END_W1_ESe;
>> +pq = GETFIELD(end_esmask, end.w1);
>> +
>> +switch (offset) {
>> +case XIVE_ESB_LOAD_EOI ... XIVE_ESB_LOAD_EOI + 0x7FF:
>> +ret = xive_esb_eoi(&pq);
>> +
>> +/* Forward the source event notification for routing ?? */
>> +break;
>> +
>> +case XIVE_ESB_GET ... XIVE_ESB_GET + 0x3FF:
>> +ret = pq;
>> +break;
>> +
>> +case XIVE_ESB_SET_PQ_00 ... XIVE_ESB_SET_PQ_00 + 0x0FF:
>> +case XIVE_ESB_SET_PQ_01

Re: [Qemu-devel] MAINTAINERS leaves too many files uncovered

2018-11-22 Thread BALATON Zoltan


On Thu, 22 Nov 2018, Thomas Huth wrote:

 19 hw/display/sm501.c


Maybe Balaton wants to adopt this file?


As discussed before this was originally part of SH4 but since that seems 
to be not actively maintained any more and latest changes were for 
sam460ex I can add it to sam460ex. I planned to do that when I make any 
changes in the future but if needed I can send a patch for this now.


While we're there, should I also add pc-bios/canyonlands.dts which is the 
device tree for sam460ex?


Regards,
BALATON Zoltan

Re: [Qemu-devel] [PATCH v3 0/5] migration: improve multithreads

2018-11-22 Thread no-reply

Hi,

This series seems to have some coding style problems. See output below for
more information:

Message-id: 20181122072028.22819-1-xiaoguangr...@tencent.com
Type: series
Subject: [Qemu-devel] [PATCH v3 0/5] migration: improve multithreads

=== TEST SCRIPT BEGIN ===
#!/bin/bash

BASE=base
n=1
total=$(git log --oneline $BASE.. | wc -l)
failed=0

git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram

commits="$(git log --format=%H --reverse $BASE..)"
for c in $commits; do
echo "Checking PATCH $n/$total: $(git log -n 1 --format=%s $c)..."
if ! git show $c --format=email | ./scripts/checkpatch.pl --mailback -; then
failed=1
echo
fi
n=$((n+1))
done

exit $failed
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
1f40f88 tests: add threaded-workqueue-bench
0c560ac migration: use threaded workqueue for decompression
effdcb4 migration: use threaded workqueue for compression
eb91c63 util: introduce threaded workqueue
3bf8b44 bitops: introduce change_bit_atomic

=== OUTPUT BEGIN ===
Checking PATCH 1/5: bitops: introduce change_bit_atomic...
Checking PATCH 2/5: util: introduce threaded workqueue...
WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
#41: 
new file mode 100644

ERROR: externs should be avoided in .c files
#233: FILE: util/threaded-workqueue.c:65:
+uint64_t request_fill_bitmap QEMU_ALIGNED(SMP_CACHE_BYTES);

ERROR: externs should be avoided in .c files
#235: FILE: util/threaded-workqueue.c:67:
+uint64_t request_done_bitmap QEMU_ALIGNED(SMP_CACHE_BYTES);

ERROR: externs should be avoided in .c files
#241: FILE: util/threaded-workqueue.c:73:
+QemuEvent request_valid_ev QEMU_ALIGNED(SMP_CACHE_BYTES);

ERROR: externs should be avoided in .c files
#247: FILE: util/threaded-workqueue.c:79:
+QemuEvent request_free_ev QEMU_ALIGNED(SMP_CACHE_BYTES);

total: 4 errors, 1 warnings, 575 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

Checking PATCH 3/5: migration: use threaded workqueue for compression...
Checking PATCH 4/5: migration: use threaded workqueue for decompression...
Checking PATCH 5/5: tests: add threaded-workqueue-bench...
WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
#36: 
new file mode 100644

WARNING: line over 80 characters
#234: FILE: tests/threaded-workqueue-bench.c:194:
+printf("   -r:   the number of requests handled by each thread 
(default %d).\n",

WARNING: line over 80 characters
#236: FILE: tests/threaded-workqueue-bench.c:196:
+printf("   -m:   the size of the memory (G) used to test (default 
%dG).\n",

ERROR: line over 90 characters
#282: FILE: tests/threaded-workqueue-bench.c:242:
+printf("Run the benchmark: threads %d requests-per-thread: %d memory %ldG 
repeat %d.\n",

total: 1 errors, 3 warnings, 272 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

=== OUTPUT END ===

Test command exited with code: 1


---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [Qemu-devel] [PATCH v5 05/36] ppc/xive: introduce the XIVE Event Notification Descriptors

2018-11-22 Thread Cédric Le Goater

On 11/22/18 5:41 AM, David Gibson wrote:
> On Fri, Nov 16, 2018 at 11:56:58AM +0100, Cédric Le Goater wrote:
>> To complete the event routing, the IVRE sub-engine uses an internal
>> table containing Event Notification Descriptor (END) structures.
>>
>> An END specifies on which Event Queue (EQ) the event notification
>> data, defined in the associated EAS, should be posted when an
>> exception occurs. It also defines which Notification Virtual Target
>> (NVT) should be notified.
>>
>> The Event Queue is a memory page provided by the O/S defining a
>> circular buffer, one per server and priority couple, containing Event
>> Queue entries. These are 4 bytes long, the first bit being a
>> 'generation' bit and the 31 following bits the END Data field. They
>> are pulled by the O/S when the exception occurs.
>>
>> The END Data field is a way to set an invariant logical event source
>> number for an IRQ. It is set with the H_INT_SET_SOURCE_CONFIG hcall
>> when the EISN flag is used.
>>
>> Signed-off-by: Cédric Le Goater 
>> ---
>>  include/hw/ppc/xive.h  |  18 
>>  include/hw/ppc/xive_regs.h |  48 ++
>>  hw/intc/xive.c | 185 -
>>  3 files changed, 248 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h
>> index 5a0696366577..ce62aaf28343 100644
>> --- a/include/hw/ppc/xive.h
>> +++ b/include/hw/ppc/xive.h
>> @@ -193,11 +193,29 @@ typedef struct XiveRouterClass {
>>  /* XIVE table accessors */
>>  int (*get_eas)(XiveRouter *xrtr, uint32_t lisn, XiveEAS *eas);
>>  int (*set_eas)(XiveRouter *xrtr, uint32_t lisn, XiveEAS *eas);
>> +int (*get_end)(XiveRouter *xrtr, uint8_t end_blk, uint32_t end_idx,
>> +   XiveEND *end);
>> +int (*set_end)(XiveRouter *xrtr, uint8_t end_blk, uint32_t end_idx,
>> +   XiveEND *end);
> 
> Hrm.  So unlike the EAS, which is basically just a word, the END is a
> pretty large structure.  

yes. and so will be the NVT.

> It's unclear here if get/set are expected to copy the whole thing out 
> and in, 

That's the plan. 

What I had in mind are memory accessors to the XIVE structures, which 
are local to QEMU for sPAPR and in the guest RAM for PowerNV (Please
take a look at the XIVE PowerNV model).

> or if get give you a pointer into a "live" structure 

no

> and set just does any necessary barriers after an update.
that would be too complex for the PowerNV model I think. There is a cache
in between the software running on the (QEMU) machine and the XIVE HW but
it would be hard to handle. 
 
> Really, for a non-atomic value like this, I'm not sure get/set is the
> right model.

ok. we need something to get them out and in.

> Also as I understand it nearly all the indices in XIVE are broken into
> block/index.  Is there a reason those are folded together into lisn
> for the EAS, but not for the END?

The indexing of the EAT is global to the system and the index defines
which blk to use. The IRQ source numbers on the powerbus are architected 
to be :

#define XIVE_SRCNO(blk, idx)  ((uint32_t)(blk) << 28 | (idx))

and XIVE can use different strategies to identify the XIVE IC in charge 
of routing. It can be a one-to-one chip to block relation as skiboot does. 
Using a block scope table is possible also. Our model only supports one 
block per chip and some shortcuts are taken but not that much in fact.
 
Remote access to the XIVE structures of another chip are done through 
MMIO (not modeled in PowerNV) and the blkid is used to partition the MMIO 
regions. Being local is better for performance because the END and NVT 
tables have a strong relation with the XIVE subengines using them 
(VC and PC). 

Maybe Ben can clarify this if it is badly explained. 

>>  } XiveRouterClass;
>>  
>>  void xive_eas_pic_print_info(XiveEAS *eas, uint32_t lisn, Monitor *mon);
>>  
>>  int xive_router_get_eas(XiveRouter *xrtr, uint32_t lisn, XiveEAS *eas);
>>  int xive_router_set_eas(XiveRouter *xrtr, uint32_t lisn, XiveEAS *eas);
>> +int xive_router_get_end(XiveRouter *xrtr, uint8_t end_blk, uint32_t end_idx,
>> +XiveEND *end);
>> +int xive_router_set_end(XiveRouter *xrtr, uint8_t end_blk, uint32_t end_idx,
>> +XiveEND *end);
>> +
>> +/*
>> + * For legacy compatibility, the exceptions define up to 256 different
>> + * priorities. P9 implements only 9 levels : 8 active levels [0 - 7]
>> + * and the least favored level 0xFF.
>> + */
>> +#define XIVE_PRIORITY_MAX  7
>> +
>> +void xive_end_reset(XiveEND *end);
>> +void xive_end_pic_print_info(XiveEND *end, uint32_t end_idx, Monitor *mon);
>>  
>>  #endif /* PPC_XIVE_H */
>> diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h
>> index 12499b33614c..f97fb2b90bee 100644
>> --- a/include/hw/ppc/xive_regs.h
>> +++ b/include/hw/ppc/xive_regs.h
>> @@ -28,4 +28,52 @@ typedef struct XiveEAS {
>>  #define EAS_END_DATAPPC_BITMASK(33, 63)  /*

Re: [Qemu-devel] [PATCH v3 0/5] migration: improve multithreads

2018-11-22 Thread no-reply

Hi,

This series failed docker-mingw@fedora build test. Please find the testing 
commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

Message-id: 20181122072028.22819-1-xiaoguangr...@tencent.com
Type: series
Subject: [Qemu-devel] [PATCH v3 0/5] migration: improve multithreads

=== TEST SCRIPT BEGIN ===
#!/bin/bash
time make docker-test-mingw@fedora SHOW_ENV=1 J=8
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
1f40f88 tests: add threaded-workqueue-bench
0c560ac migration: use threaded workqueue for decompression
effdcb4 migration: use threaded workqueue for compression
eb91c63 util: introduce threaded workqueue
3bf8b44 bitops: introduce change_bit_atomic

=== OUTPUT BEGIN ===
  BUILD   fedora
make[1]: Entering directory `/var/tmp/patchew-tester-tmp-1vusqh9v/src'
  GEN 
/var/tmp/patchew-tester-tmp-1vusqh9v/src/docker-src.2018-11-22-16.24.23.23218/qemu.tar
Cloning into 
'/var/tmp/patchew-tester-tmp-1vusqh9v/src/docker-src.2018-11-22-16.24.23.23218/qemu.tar.vroot'...
done.
Submodule 'dtc' (https://git.qemu.org/git/dtc.git) registered for path 'dtc'
Cloning into 'dtc'...
Submodule path 'dtc': checked out '88f18909db731a627456f26d779445f84e449536'
Submodule 'ui/keycodemapdb' (https://git.qemu.org/git/keycodemapdb.git) 
registered for path 'ui/keycodemapdb'
Cloning into 'ui/keycodemapdb'...
Submodule path 'ui/keycodemapdb': checked out 
'6b3d716e2b6472eb7189d3220552280ef3d832ce'
  COPYRUNNER
RUN test-mingw in qemu:fedora 
Packages installed:
SDL2-devel-2.0.9-1.fc28.x86_64
bc-1.07.1-5.fc28.x86_64
bison-3.0.4-9.fc28.x86_64
bluez-libs-devel-5.50-1.fc28.x86_64
brlapi-devel-0.6.7-19.fc28.x86_64
bzip2-1.0.6-26.fc28.x86_64
bzip2-devel-1.0.6-26.fc28.x86_64
ccache-3.4.2-2.fc28.x86_64
clang-6.0.1-2.fc28.x86_64
device-mapper-multipath-devel-0.7.4-3.git07e7bd5.fc28.x86_64
findutils-4.6.0-19.fc28.x86_64
flex-2.6.1-7.fc28.x86_64
gcc-8.2.1-5.fc28.x86_64
gcc-c++-8.2.1-5.fc28.x86_64
gettext-0.19.8.1-14.fc28.x86_64
git-2.17.2-1.fc28.x86_64
glib2-devel-2.56.3-2.fc28.x86_64
glusterfs-api-devel-4.1.5-1.fc28.x86_64
gnutls-devel-3.6.4-1.fc28.x86_64
gtk3-devel-3.22.30-1.fc28.x86_64
hostname-3.20-3.fc28.x86_64
libaio-devel-0.3.110-11.fc28.x86_64
libasan-8.2.1-5.fc28.x86_64
libattr-devel-2.4.48-3.fc28.x86_64
libcap-devel-2.25-9.fc28.x86_64
libcap-ng-devel-0.7.9-4.fc28.x86_64
libcurl-devel-7.59.0-8.fc28.x86_64
libfdt-devel-1.4.7-1.fc28.x86_64
libpng-devel-1.6.34-6.fc28.x86_64
librbd-devel-12.2.8-1.fc28.x86_64
libssh2-devel-1.8.0-7.fc28.x86_64
libubsan-8.2.1-5.fc28.x86_64
libusbx-devel-1.0.22-1.fc28.x86_64
libxml2-devel-2.9.8-4.fc28.x86_64
llvm-6.0.1-8.fc28.x86_64
lzo-devel-2.08-12.fc28.x86_64
make-4.2.1-6.fc28.x86_64
mingw32-SDL2-2.0.9-1.fc28.noarch
mingw32-bzip2-1.0.6-9.fc27.noarch
mingw32-curl-7.57.0-1.fc28.noarch
mingw32-glib2-2.56.1-1.fc28.noarch
mingw32-gmp-6.1.2-2.fc27.noarch
mingw32-gnutls-3.6.3-1.fc28.noarch
mingw32-gtk3-3.22.30-1.fc28.noarch
mingw32-libjpeg-turbo-1.5.1-3.fc27.noarch
mingw32-libpng-1.6.29-2.fc27.noarch
mingw32-libssh2-1.8.0-3.fc27.noarch
mingw32-libtasn1-4.13-1.fc28.noarch
mingw32-nettle-3.4-1.fc28.noarch
mingw32-pixman-0.34.0-3.fc27.noarch
mingw32-pkg-config-0.28-9.fc27.x86_64
mingw64-SDL2-2.0.9-1.fc28.noarch
mingw64-bzip2-1.0.6-9.fc27.noarch
mingw64-curl-7.57.0-1.fc28.noarch
mingw64-glib2-2.56.1-1.fc28.noarch
mingw64-gmp-6.1.2-2.fc27.noarch
mingw64-gnutls-3.6.3-1.fc28.noarch
mingw64-gtk3-3.22.30-1.fc28.noarch
mingw64-libjpeg-turbo-1.5.1-3.fc27.noarch
mingw64-libpng-1.6.29-2.fc27.noarch
mingw64-libssh2-1.8.0-3.fc27.noarch
mingw64-libtasn1-4.13-1.fc28.noarch
mingw64-nettle-3.4-1.fc28.noarch
mingw64-pixman-0.34.0-3.fc27.noarch
mingw64-pkg-config-0.28-9.fc27.x86_64
ncurses-devel-6.1-5.20180224.fc28.x86_64
nettle-devel-3.4-2.fc28.x86_64
nss-devel-3.39.0-1.0.fc28.x86_64
numactl-devel-2.0.11-8.fc28.x86_64
package PyYAML is not installed
package libjpeg-devel is not installed
perl-5.26.2-414.fc28.x86_64
pixman-devel-0.34.0-8.fc28.x86_64
python3-3.6.6-1.fc28.x86_64
snappy-devel-1.1.7-5.fc28.x86_64
sparse-0.5.2-1.fc28.x86_64
spice-server-devel-0.14.0-4.fc28.x86_64
systemtap-sdt-devel-4.0-1.fc28.x86_64
tar-1.30-3.fc28.x86_64
usbredir-devel-0.8.0-1.fc28.x86_64
virglrenderer-devel-0.6.0-4.20170210git76b3da97b.fc28.x86_64
vte3-devel-0.36.5-6.fc28.x86_64
which-2.21-8.fc28.x86_64
xen-devel-4.10.2-2.fc28.x86_64
zlib-devel-1.2.11-8.fc28.x86_64

Environment variables:
TARGET_LIST=
PACKAGES=bc bison bluez-libs-devel brlapi-devel bzip2 
bzip2-devel ccache clang device-mapper-multipath-devel 
findutils flex gcc gcc-c++ gettext git glib2-devel 
glusterfs-api-devel gnutls-devel gtk3-devel hostname 
libaio-devel libasan libattr-devel libcap-devel libcap-ng-devel 
libcurl-devel libfdt-devel libjpeg-devel libpng-devel 
librbd-devel libssh2-devel libubsan libusbx-devel libxml2-devel

Re: [Qemu-devel] [PATCH 3/5] Add migration functions for VFIO devices

2018-11-22 Thread Kirti Wankhede




On 11/21/2018 1:09 PM, Zhao, Yan Y wrote:
> 
> 
>> -Original Message-
>> From: Qemu-devel [mailto:qemu-devel-
>> bounces+yan.y.zhao=intel@nongnu.org] On Behalf Of Kirti Wankhede
>> Sent: Wednesday, November 21, 2018 4:40 AM
>> To: alex.william...@redhat.com; c...@nvidia.com
>> Cc: zhengxiao...@alibaba-inc.com; Tian, Kevin ; Liu, 
>> Yi L
>> ; eskul...@redhat.com; Yang, Ziye ;
>> qemu-devel@nongnu.org; coh...@redhat.com; shuangtai@alibaba-inc.com;
>> dgilb...@redhat.com; Wang, Zhi A ;
>> mlevi...@redhat.com; pa...@linux.ibm.com; a...@ozlabs.ru; Kirti Wankhede
>> ; eau...@redhat.com; fel...@nutanix.com;
>> jonathan.dav...@nutanix.com; Liu, Changpeng ;
>> ken@amd.com
>> Subject: [Qemu-devel] [PATCH 3/5] Add migration functions for VFIO devices
>>
>> - Migration function are implemented for VFIO_DEVICE_TYPE_PCI device.
>> - Added SaveVMHandlers and implemented all basic functions required for live
>>   migration.
>> - Added VM state change handler to know running or stopped state of VM.
>> - Added migration state change notifier to get notification on migration 
>> state
>>   change. This state is translated to VFIO device state and conveyed to 
>> vendor
>>   driver.
>> - VFIO device supportd migration or not is decided based of migration region
>>   query. If migration region query is successful then migration is supported
>>   else migration is blocked.
>> - Structure vfio_device_migration_info is mapped at 0th offset of migration
>>   region and should always trapped by VFIO device's driver. Added both type 
>> of
>>   access support, trapped or mmapped, for data section of the region.
>> - To save device state, read data offset and size using structure
>>   vfio_device_migration_info.data, accordingly copy data from the region.
>> - To restore device state, write data offset and size in the structure and 
>> write
>>   data in the region.
>> - To get dirty page bitmap, write start address and pfn count then read 
>> count of
>>   pfns copied and accordingly read those from the rest of the region or 
>> mmaped
>>   part of the region. This copy is iterated till page bitmap for all 
>> requested
>>   pfns are copied.
>>
>> Signed-off-by: Kirti Wankhede 
>> Reviewed-by: Neo Jia 
>> ---
>>  hw/vfio/Makefile.objs |   2 +-
>>  hw/vfio/migration.c   | 729
>> ++
>>  include/hw/vfio/vfio-common.h |  23 ++
>>  3 files changed, 753 insertions(+), 1 deletion(-)  create mode 100644
>> hw/vfio/migration.c
>>
>> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs index
>> a2e7a0a7cf02..2cf2ba1440f2 100644
>> --- a/hw/vfio/Makefile.objs
>> +++ b/hw/vfio/Makefile.objs
>> @@ -1,5 +1,5 @@
>>  ifeq ($(CONFIG_LINUX), y)
>> -obj-$(CONFIG_SOFTMMU) += common.o
>> +obj-$(CONFIG_SOFTMMU) += common.o migration.o
>>  obj-$(CONFIG_PCI) += pci.o pci-quirks.o display.o
>>  obj-$(CONFIG_VFIO_CCW) += ccw.o
>>  obj-$(CONFIG_SOFTMMU) += platform.o
>> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c new file mode 100644
>> index ..717fb63e4f43
>> --- /dev/null
>> +++ b/hw/vfio/migration.c
>> @@ -0,0 +1,729 @@
>> +/*
>> + * Migration support for VFIO devices
>> + *
>> + * Copyright NVIDIA, Inc. 2018
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2. See
>> + * the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include 
>> +
>> +#include "hw/vfio/vfio-common.h"
>> +#include "cpu.h"
>> +#include "migration/migration.h"
>> +#include "migration/qemu-file.h"
>> +#include "migration/register.h"
>> +#include "migration/blocker.h"
>> +#include "migration/misc.h"
>> +#include "qapi/error.h"
>> +#include "exec/ramlist.h"
>> +#include "exec/ram_addr.h"
>> +#include "pci.h"
>> +
>> +/*
>> + * Flags used as delimiter:
>> + * 0x => MSB 32-bit all 1s
>> + * 0xef10 => emulated (virtual) function IO
>> + * 0x => 16-bits reserved for flags
>> + */
>> +#define VFIO_MIG_FLAG_END_OF_STATE  (0xef11ULL)
>> +#define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xef12ULL)
>> +#define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xef13ULL)
>> +
>> +static void vfio_migration_region_exit(VFIODevice *vbasedev) {
>> +VFIOMigration *migration = vbasedev->migration;
>> +
>> +if (!migration) {
>> +return;
>> +}
>> +
>> +if (migration->region.buffer.size) {
>> +vfio_region_exit(>region.buffer);
>> +vfio_region_finalize(>region.buffer);
>> +}
>> +}
>> +
>> +static int vfio_migration_region_init(VFIODevice *vbasedev) {
>> +VFIOMigration *migration = vbasedev->migration;
>> +Object *obj = NULL;
>> +int ret = -EINVAL;
>> +
>> +if (!migration) {
>> +return ret;
>> +}
>> +
>> +/* Migration support added for PCI device only */
>> +if (vbasedev->type == VFIO_DEVICE_TYPE_PCI) {
>> +obj = vfio_pci_get_object(vbasedev);
>> +}
>> +
>> +if (!obj) {
>> +return ret;
>> +}
>> +

Re: [Qemu-devel] [PATCH 0/5] Add migration support for VFIO device

2018-11-22 Thread Kirti Wankhede




On 11/21/2018 11:17 AM, Peter Xu wrote:
> On Wed, Nov 21, 2018 at 02:09:38AM +0530, Kirti Wankhede wrote:
>> Add migration support for VFIO device
> 
> Hi, Kirti,
> 
> I failed to apply the series cleanly onto master.  Could you push the
> tree somewhere so that people might read the work easier?  Or would
> you tell me the base commit, then I can apply it myself.
> 

Sorry for the inconvenience.
These patches are on top of v3.0.0 release (tag: v3.0.0)

Thanks,
Kirti

Re: [Qemu-devel] [PATCH 1/5] VFIO KABI for migration interface

2018-11-22 Thread Kirti Wankhede




On 11/23/2018 12:24 AM, Dr. David Alan Gilbert wrote:
> * Kirti Wankhede (kwankh...@nvidia.com) wrote:
>> - Defined MIGRATION region type and sub-type.
>> - Defined VFIO device states during migration process.
>> - Defined vfio_device_migration_info structure which will be placed at 0th
>>   offset of migration region to get/set VFIO device related information.
>>   Defined actions and members of structure usage for each action:
>> * To convey VFIO device state to be transitioned to.
>> * To get pending bytes yet to be migrated for VFIO device
>> * To ask driver to write data to migration region and return number of 
>> bytes
>>   written in the region
>> * In migration resume path, user space app writes to migration region and
>>   communicates it to vendor driver.
>> * Get bitmap of dirty pages from vendor driver from given start address
>>
>> Signed-off-by: Kirti Wankhede 
>> Reviewed-by: Neo Jia 
> 
> 
> 
>> + * Action Get buffer:
>> + *  On this action, vendor driver should write data to migration region 
>> and
>> + *  return number of bytes written in the region.
>> + *  data.offset [output] : offset in the region from where data is 
>> written.
>> + *  data.size [output] : number of bytes written in migration buffer by
>> + *  vendor driver.
> 
> 
> 
>> + */
>> +
>> +struct vfio_device_migration_info {
>> +__u32 device_state; /* VFIO device state */
>> +struct {
>> +__u64 precopy_only;
>> +__u64 compatible;
>> +__u64 postcopy_only;
>> +__u64 threshold_size;
>> +} pending;
>> +struct {
>> +__u64 offset;   /* offset */
>> +__u64 size; /* size */
>> +} data;
> 
> I'm curious how the offsets/size work; how does the 
> kernel driver know the maximum size of state it's allowed to write?


Migration region looks like:
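 -----------------------------------------------------------------
|vfio_device_migration_info|          data section                |
|                          |     ///////////////////////////      |
 -----------------------------------------------------------------
 ^                          ^                             ^
 offset 0-trapped part      data.offset                   data.size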


The kernel driver defines the size of the migration region and reports it
to the VFIO user space application (QEMU here) through the
VFIO_DEVICE_GET_REGION_INFO ioctl, so the kernel driver can calculate the
size of the data section. The kernel driver can then have either
(data.size >= data section size) or (data.size < data section size),
hence the VFIO user space application needs to know data.size in order
to copy only the relevant data.

> Why would it pick a none-0 offset into the output region?

The data section always follows the vfio_device_migration_info structure
in the region, so data.offset will always be non-zero.
The offset from which data is copied is decided by the kernel driver; the
data section can be trapped or mmapped depending on how the kernel driver
defines the data section. If mmapped, then data.offset should be page
aligned, whereas the initial section which contains the
vfio_device_migration_info structure might not end at a page-aligned
offset.

Thanks,
Kirti

> Without having dug further these feel like i/o rather than just output;
> i.e. the calling process says 'put it at that offset and you've got size
> bytes' and the kernel replies with 'I did put it at offset and I wrote
> only this size bytes'
> 
> Dave
> 
>> +struct {
>> +__u64 start_addr;
>> +__u64 total;
>> +__u64 copied;
>> +} dirty_pfns;
>> +} __attribute__((packed));
>> +
>>  /*  API for Type1 VFIO IOMMU  */
>>  
>>  /**
>> -- 
>> 2.7.0
>>
> --
> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
>

Re: [Qemu-devel] [PATCH v2 1/2] RFC: Acceptance tests: add the build directory to the system PATH

2018-11-22 Thread Wainer dos Santos Moschetta




On 11/21/2018 07:48 PM, Cleber Rosa wrote:

So that when binaries such as qemu-img are searched for, those in the
build tree will be favored.  As a clarification, SRC_ROOT_DIR is
dependent on the location from where tests are executed, so they are
equal to the build directory if one is being used.


On avocado's job.log file I can see the full path of the qemu-img which 
was called, but I wonder if it wouldn't be better to log that 
information somewhere more explicitly. It wouldn't prevent this patch 
from being merged though, it's just an improvement suggestion.




The original motivation is that Avocado libraries such as
avocado.utils.vmimage.get() may use the matching binaries, but it may
also apply to any other binary that test code may eventually attempt
to execute.

Other competing alternatives would be a more explicit path or binary
registration mechanism, in which we tell the libraries such as
avocado.utils.vmimage, the binaries to use in advance.  I think the
model proposed here is simpler though, and is not inconsistent with
the general approach of favoring the built binaries, and falling back
to binaries available in the system.  I'd love to have comments on
that, though.


IMHO it makes sense to pick the built binaries, falling back to system's 
otherwise. Keep it simple (and consistent) unless we eventually need 
something robust, I would go for that approach here.




Signed-off-by: Cleber Rosa 
---
  tests/acceptance/avocado_qemu/__init__.py | 9 +
  1 file changed, 9 insertions(+)

diff --git a/tests/acceptance/avocado_qemu/__init__.py 
b/tests/acceptance/avocado_qemu/__init__.py
index 1e54fd5932..3d5190cbab 100644
--- a/tests/acceptance/avocado_qemu/__init__.py
+++ b/tests/acceptance/avocado_qemu/__init__.py
@@ -49,6 +49,15 @@ class Test(avocado.Test):
  self.cancel("No QEMU binary defined or found in the source tree")
  self.vm = QEMUMachine(self.qemu_bin)
  
+# RFC: avocado.utils.vmimage.get() uses qemu-img, from the

+# system's PATH, to create a snapshot.  This is a transparent,
+# but implicit way of making sure it finds the qemu-img that
+# matches the code being tested (as tests it indirectly too).
+# As for the cleanup, given that in the Avocado test execution
+# model every test is started in a different process, no
+# cleanup is needed.
+os.environ['PATH'] = '%s:%s' % (SRC_ROOT_DIR, os.environ['PATH'])
+
  def tearDown(self):
  if self.vm is not None:
  self.vm.shutdown()


The boot Linux test (added in patch 02) exits with an error when I run 
'make check-acceptance'. I am using Fedora 29 x86_64, which doesn't have 
qemu-img installed system-wide. See below:


--
# make check-acceptance
  AVOCADO tests/acceptance
make: *** [/root/qemu/tests/Makefile.include:940: check-acceptance] Error 9
# cat tests/results/latest/results.tap
1..8
not ok 1 /root/qemu/tests/acceptance/boot_linux.py:BootLinux.test
# grep -e 'ERROR.*CmdNotFoundError' tests/results/latest/job.log
2018-11-22 14:51:30,540 stacktrace   L0047 ERROR| raise 
CmdNotFoundError(cmd, path_paths)
2018-11-22 14:51:30,540 stacktrace   L0047 ERROR| 
avocado.utils.path.CmdNotFoundError: Command 'qemu-img' could not be 
found in any of the PATH dirs: ['/usr/local/bin', '/bin', '/root/bin', 
'/sbin', '/usr/libexec', '/usr/local/sbin', '/root/qemu', '/usr/bin', 
'/usr/sbin']
2018-11-22 14:51:30,572 test L0984 ERROR| 
avocado.utils.path.CmdNotFoundError: Command 'qemu-img' could not be 
found in any of the PATH dirs: ['/usr/local/bin', '/bin', '/root/bin', 
'/sbin', '/usr/libexec', '/usr/local/sbin', '/root/qemu', '/usr/bin', 
'/usr/sbin']
2018-11-22 14:51:30,572 test L0999 ERROR| ERROR 
1-/root/qemu/tests/acceptance/boot_linux.py:BootLinux.test -> 
CmdNotFoundError: Command 'qemu-img' could not be found in any of the 
PATH dirs: ['/usr/local/bin', '/bin', '/root/bin', '/sbin', 
'/usr/libexec', '/usr/local/sbin', '/root/qemu', '/usr/bin', '/usr/sbin']

--

The same test finished successfully when I ran with 'avocado run (...)' 
though.


- Wainer

Re: [Qemu-devel] [PATCH 0/2] Update the inherits_from pointer after stream and commit

2018-11-22 Thread Alberto Garcia

On Thu 22 Nov 2018 06:52:00 PM CET, Kevin Wolf  wrote:
> Not a problem with the series, but I tried to run the test case without
> the fix, and this is what I got:
>
> -{"return": ""}
> +{"return": "Cannot change the option 'backing.detect-zeroes'rn"}
>
> Where does that final "rn" come from? Looks like we have a bug
> somewhere in the error reporting code?

It looks like a \r\n that hasn't been properly interpreted as CRLF.

Berto

Re: [Qemu-devel] [PATCH 3/5] Add migration functions for VFIO devices

2018-11-22 Thread Dr. David Alan Gilbert

* Kirti Wankhede (kwankh...@nvidia.com) wrote:
> - Migration function are implemented for VFIO_DEVICE_TYPE_PCI device.
> - Added SaveVMHandlers and implemented all basic functions required for live
>   migration.
> - Added VM state change handler to know running or stopped state of VM.
> - Added migration state change notifier to get notification on migration state
>   change. This state is translated to VFIO device state and conveyed to vendor
>   driver.
> - VFIO device supportd migration or not is decided based of migration region
>   query. If migration region query is successful then migration is supported
>   else migration is blocked.
> - Structure vfio_device_migration_info is mapped at 0th offset of migration
>   region and should always trapped by VFIO device's driver. Added both type of
>   access support, trapped or mmapped, for data section of the region.
> - To save device state, read data offset and size using structure
>   vfio_device_migration_info.data, accordingly copy data from the region.
> - To restore device state, write data offset and size in the structure and 
> write
>   data in the region.
> - To get dirty page bitmap, write start address and pfn count then read count 
> of
>   pfns copied and accordingly read those from the rest of the region or mmaped
>   part of the region. This copy is iterated till page bitmap for all requested
>   pfns are copied.
> 
> Signed-off-by: Kirti Wankhede 
> Reviewed-by: Neo Jia 

This does need something for device/data versioning.
Please consider adding some 'trace' calls to make it easier to
debug in situ.
Splitting the patch a bit more would also help; see some more comments
below.


> ---
>  hw/vfio/Makefile.objs |   2 +-
>  hw/vfio/migration.c   | 729 
> ++
>  include/hw/vfio/vfio-common.h |  23 ++
>  3 files changed, 753 insertions(+), 1 deletion(-)
>  create mode 100644 hw/vfio/migration.c
> 
> diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs
> index a2e7a0a7cf02..2cf2ba1440f2 100644
> --- a/hw/vfio/Makefile.objs
> +++ b/hw/vfio/Makefile.objs
> @@ -1,5 +1,5 @@
>  ifeq ($(CONFIG_LINUX), y)
> -obj-$(CONFIG_SOFTMMU) += common.o
> +obj-$(CONFIG_SOFTMMU) += common.o migration.o
>  obj-$(CONFIG_PCI) += pci.o pci-quirks.o display.o
>  obj-$(CONFIG_VFIO_CCW) += ccw.o
>  obj-$(CONFIG_SOFTMMU) += platform.o
> diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
> new file mode 100644
> index ..717fb63e4f43
> --- /dev/null
> +++ b/hw/vfio/migration.c
> @@ -0,0 +1,729 @@
> +/*
> + * Migration support for VFIO devices
> + *
> + * Copyright NVIDIA, Inc. 2018
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2. See
> + * the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include 
> +
> +#include "hw/vfio/vfio-common.h"
> +#include "cpu.h"
> +#include "migration/migration.h"
> +#include "migration/qemu-file.h"
> +#include "migration/register.h"
> +#include "migration/blocker.h"
> +#include "migration/misc.h"
> +#include "qapi/error.h"
> +#include "exec/ramlist.h"
> +#include "exec/ram_addr.h"
> +#include "pci.h"
> +
> +/*
> + * Flags used as delimiter:
> + * 0x => MSB 32-bit all 1s
> + * 0xef10 => emulated (virtual) function IO
> + * 0x => 16-bits reserved for flags
> + */
> +#define VFIO_MIG_FLAG_END_OF_STATE  (0xef11ULL)
> +#define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xef12ULL)
> +#define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xef13ULL)
> +
> +static void vfio_migration_region_exit(VFIODevice *vbasedev)
> +{
> +VFIOMigration *migration = vbasedev->migration;
> +
> +if (!migration) {
> +return;
> +}
> +
> +if (migration->region.buffer.size) {
> +vfio_region_exit(>region.buffer);
> +vfio_region_finalize(>region.buffer);
> +}
> +}
> +
> +static int vfio_migration_region_init(VFIODevice *vbasedev)
> +{
> +VFIOMigration *migration = vbasedev->migration;
> +Object *obj = NULL;
> +int ret = -EINVAL;
> +
> +if (!migration) {
> +return ret;

Here and ...
> +}
> +
> +/* Migration support added for PCI device only */
> +if (vbasedev->type == VFIO_DEVICE_TYPE_PCI) {
> +obj = vfio_pci_get_object(vbasedev);
> +}
> +
> +if (!obj) {
> +return ret;

Here, you've failed the migration but not printed an error to say why;
please print something so we can tell what happened.

> +}
> +
> +ret = vfio_region_setup(obj, vbasedev, >region.buffer,
> +migration->region.index, "migration");
> +if (ret) {
> +error_report("Failed to setup VFIO migration region %d: %s",
> +  migration->region.index, strerror(-ret));
> +goto err;
> +}
> +
> +if (!migration->region.buffer.size) {
> +ret = -EINVAL;
> +error_report("Invalid region size of VFIO migration region %d: %s",
> +

Re: [Qemu-devel] [PATCH 4/5] Add vfio_listerner_log_sync to mark dirty pages

2018-11-22 Thread Dr. David Alan Gilbert

* Kirti Wankhede (kwankh...@nvidia.com) wrote:
> vfio_listerner_log_sync gets list of dirty pages from vendor driver and mark
> those pages dirty.
> 
> Signed-off-by: Kirti Wankhede 
> Reviewed-by: Neo Jia 
> ---
>  hw/vfio/common.c | 32 
>  1 file changed, 32 insertions(+)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index fb396cf00ac4..338aad7426f0 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -697,9 +697,41 @@ static void vfio_listener_region_del(MemoryListener 
> *listener,
>  }
>  }
>  
> +static void vfio_listerner_log_sync(MemoryListener *listener,
> +MemoryRegionSection *section)
> +{
> +uint64_t start_addr, size, pfn_count;
> +VFIOGroup *group;
> +VFIODevice *vbasedev;
> +
> +QLIST_FOREACH(group, _group_list, next) {
> +QLIST_FOREACH(vbasedev, >device_list, next) {
> +switch (vbasedev->device_state) {
> +case VFIO_DEVICE_STATE_MIGRATION_PRECOPY:
> +case VFIO_DEVICE_STATE_MIGRATION_STOPNCOPY:
> +continue;
> +
> +default:
> +return;
> +}
> +}
> +}

Is that big loop just trying to find devices not in migration?
Some comments would be good.

Dave

> +start_addr = TARGET_PAGE_ALIGN(section->offset_within_address_space);
> +size = int128_get64(section->size);
> +pfn_count = size >> TARGET_PAGE_BITS;
> +
> +QLIST_FOREACH(group, _group_list, next) {
> +QLIST_FOREACH(vbasedev, >device_list, next) {
> +vfio_get_dirty_page_list(vbasedev, start_addr, pfn_count);
> +}
> +}
> +}
> +
>  static const MemoryListener vfio_memory_listener = {
>  .region_add = vfio_listener_region_add,
>  .region_del = vfio_listener_region_del,
> +.log_sync = vfio_listerner_log_sync,
>  };
>  
>  static void vfio_listener_release(VFIOContainer *container)
> -- 
> 2.7.0
> 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK

Re: [Qemu-devel] [PATCH 1/5] VFIO KABI for migration interface

2018-11-22 Thread Kirti Wankhede




On 11/21/2018 11:43 AM, Tian, Kevin wrote:
>> From: Kirti Wankhede [mailto:kwankh...@nvidia.com]
>> Sent: Wednesday, November 21, 2018 12:24 PM
>>
>>
>> On 11/21/2018 5:56 AM, Tian, Kevin wrote:
 From: Kirti Wankhede [mailto:kwankh...@nvidia.com]
 Sent: Wednesday, November 21, 2018 4:40 AM

 - Defined MIGRATION region type and sub-type.
 - Defined VFIO device states during migration process.
 - Defined vfio_device_migration_info structure which will be placed at
>> 0th
   offset of migration region to get/set VFIO device related information.
   Defined actions and members of structure usage for each action:
 * To convey VFIO device state to be transitioned to.
 * To get pending bytes yet to be migrated for VFIO device
 * To ask driver to write data to migration region and return number of
 bytes
   written in the region
 * In migration resume path, user space app writes to migration region
 and
   communicates it to vendor driver.
 * Get bitmap of dirty pages from vendor driver from given start
>> address

 Signed-off-by: Kirti Wankhede 
 Reviewed-by: Neo Jia 
 ---
  linux-headers/linux/vfio.h | 130
 +
  1 file changed, 130 insertions(+)

 diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
 index 3615a269d378..a6e45cb2cae2 100644
 --- a/linux-headers/linux/vfio.h
 +++ b/linux-headers/linux/vfio.h
 @@ -301,6 +301,10 @@ struct vfio_region_info_cap_type {
  #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG(2)
  #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3)

 +/* Migration region type and sub-type */
 +#define VFIO_REGION_TYPE_MIGRATION(1 << 30)
 +#define VFIO_REGION_SUBTYPE_MIGRATION (1)
 +
  /*
   * The MSIX mappable capability informs that MSIX data of a BAR can be
 mmapped
   * which allows direct access to non-MSIX registers which happened to
>> be
 within
 @@ -602,6 +606,132 @@ struct vfio_device_ioeventfd {

  #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE
 + 16)

 +/**
 + * VFIO device states :
 + * VFIO User space application should set the device state to indicate
 vendor
 + * driver in which state the VFIO device should transitioned.
 + * - VFIO_DEVICE_STATE_NONE:
 + *   State when VFIO device is initialized but not yet running.
 + * - VFIO_DEVICE_STATE_RUNNING:
 + *   Transition VFIO device in running state, that is, user space
>> application
 or
 + *   VM is active.
 + * - VFIO_DEVICE_STATE_MIGRATION_SETUP:
 + *   Transition VFIO device in migration setup state. This is used to
>> prepare
 + *   VFIO device for migration while application or VM and vCPUs are
>> still in
 + *   running state.
 + * - VFIO_DEVICE_STATE_MIGRATION_PRECOPY:
 + *   When VFIO user space application or VM is active and vCPUs are
 running,
 + *   transition VFIO device in pre-copy state.
 + * - VFIO_DEVICE_STATE_MIGRATION_STOPNCOPY:
 + *   When VFIO user space application or VM is stopped and vCPUs are
 halted,
 + *   transition VFIO device in stop-and-copy state.
 + * - VFIO_DEVICE_STATE_MIGRATION_SAVE_COMPLETED:
 + *   When VFIO user space application has copied data provided by
>> vendor
 driver.
 + *   This state is used by vendor driver to clean up all software state 
 that
 was
 + *   setup during MIGRATION_SETUP state.
 + * - VFIO_DEVICE_STATE_MIGRATION_RESUME:
 + *   Transition VFIO device to resume state, that is, start resuming VFIO
 device
 + *   when user space application or VM is not running and vCPUs are
 halted.
 + * - VFIO_DEVICE_STATE_MIGRATION_RESUME_COMPLETED:
 + *   When user space application completes iterations of providing
>> device
 state
 + *   data, transition device in resume completed state.
 + * - VFIO_DEVICE_STATE_MIGRATION_FAILED:
 + *   Migration process failed due to some reason, transition device to
 failed
 + *   state. If migration process fails while saving at source, resume
>> device
 at
 + *   source. If migration process fails while resuming application or VM
>> at
 + *   destination, stop restoration at destination and resume at source.
 + * - VFIO_DEVICE_STATE_MIGRATION_CANCELLED:
 + *   User space application has cancelled migration process either for
>> some
 + *   known reason or due to user's intervention. Transition device to
 Cancelled
 + *   state, that is, resume device state as it was during running state at
 + *   source.
 + */
 +
 +enum {
 +VFIO_DEVICE_STATE_NONE,
 +VFIO_DEVICE_STATE_RUNNING,
 +VFIO_DEVICE_STATE_MIGRATION_SETUP,
 +VFIO_DEVICE_STATE_MIGRATION_PRECOPY,
 +

Re: [Qemu-devel] [PATCH 1/5] VFIO KABI for migration interface

2018-11-22 Thread Dr. David Alan Gilbert

* Kirti Wankhede (kwankh...@nvidia.com) wrote:
> - Defined MIGRATION region type and sub-type.
> - Defined VFIO device states during migration process.
> - Defined vfio_device_migration_info structure which will be placed at 0th
>   offset of migration region to get/set VFIO device related information.
>   Defined actions and members of structure usage for each action:
> * To convey VFIO device state to be transitioned to.
> * To get pending bytes yet to be migrated for VFIO device
> * To ask driver to write data to migration region and return number of 
> bytes
>   written in the region
> * In migration resume path, user space app writes to migration region and
>   communicates it to vendor driver.
> * Get bitmap of dirty pages from vendor driver from given start address
> 
> Signed-off-by: Kirti Wankhede 
> Reviewed-by: Neo Jia 



> + * Action Get buffer:
> + *  On this action, vendor driver should write data to migration region 
> and
> + *  return number of bytes written in the region.
> + *  data.offset [output] : offset in the region from where data is 
> written.
> + *  data.size [output] : number of bytes written in migration buffer by
> + *  vendor driver.



> + */
> +
> +struct vfio_device_migration_info {
> +__u32 device_state; /* VFIO device state */
> +struct {
> +__u64 precopy_only;
> +__u64 compatible;
> +__u64 postcopy_only;
> +__u64 threshold_size;
> +} pending;
> +struct {
> +__u64 offset;   /* offset */
> +__u64 size; /* size */
> +} data;

I'm curious how the offsets/size work; how does the 
kernel driver know the maximum size of state it's allowed to write?
Why would it pick a none-0 offset into the output region?

Without having dug further these feel like i/o rather than just output;
i.e. the calling process says 'put it at that offset and you've got size
bytes' and the kernel replies with 'I did put it at offset and I wrote
only this size bytes'

Dave

> +struct {
> +__u64 start_addr;
> +__u64 total;
> +__u64 copied;
> +} dirty_pfns;
> +} __attribute__((packed));
> +
>  /*  API for Type1 VFIO IOMMU  */
>  
>  /**
> -- 
> 2.7.0
> 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK

[Qemu-devel] [PATCH 5/5] iotests: 030 with block-stream discard

2018-11-22 Thread Andrey Shinkevich

The classes that set up tests for the block-stream command with the
discard option enabled inherit from the existing classes in the 030 file.
Some QMP commands do not have the optional 'discard' argument because
the WRITE permission is not granted when the filter is inserted.
For instance, this is the case when streaming into an inactive layer.

Signed-off-by: Andrey Shinkevich 
---
 tests/qemu-iotests/030 | 143 ++---
 tests/qemu-iotests/030.out |   4 +-
 2 files changed, 111 insertions(+), 36 deletions(-)

diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030
index 5d148b0..eba2fff 100755
--- a/tests/qemu-iotests/030
+++ b/tests/qemu-iotests/030
@@ -29,6 +29,7 @@ test_img = os.path.join(iotests.test_dir, 'test.img')
 
 class TestSingleDrive(iotests.QMPTestCase):
 image_len = 1 * 1024 * 1024 # MB
+do_discard = False
 
 def setUp(self):
 iotests.create_image(backing_img, TestSingleDrive.image_len)
@@ -49,7 +50,8 @@ class TestSingleDrive(iotests.QMPTestCase):
 def test_stream(self):
 self.assert_no_active_block_jobs()
 
-result = self.vm.qmp('block-stream', device='drive0')
+result = self.vm.qmp('block-stream', discard=self.do_discard,
+ device='drive0')
 self.assert_qmp(result, 'return', {})
 
 self.wait_until_completed()
@@ -84,7 +86,8 @@ class TestSingleDrive(iotests.QMPTestCase):
 self.assert_no_active_block_jobs()
 
 self.vm.pause_drive('drive0')
-result = self.vm.qmp('block-stream', device='drive0')
+result = self.vm.qmp('block-stream', discard=self.do_discard,
+ device='drive0')
 self.assert_qmp(result, 'return', {})
 
 self.pause_job('drive0', wait=False)
@@ -117,7 +120,8 @@ class TestSingleDrive(iotests.QMPTestCase):
 empty_map = qemu_io('-f', iotests.imgfmt, '-rU', '-c', 'map', test_img)
 
 # This is a no-op: no data should ever be copied from the base image
-result = self.vm.qmp('block-stream', device='drive0', base=mid_img)
+result = self.vm.qmp('block-stream', discard=self.do_discard,
+ device='drive0', base=mid_img)
 self.assert_qmp(result, 'return', {})
 
 self.wait_until_completed()
@@ -131,7 +135,8 @@ class TestSingleDrive(iotests.QMPTestCase):
 def test_stream_partial(self):
 self.assert_no_active_block_jobs()
 
-result = self.vm.qmp('block-stream', device='drive0', base=backing_img)
+result = self.vm.qmp('block-stream', discard=self.do_discard,
+ device='drive0', base=backing_img)
 self.assert_qmp(result, 'return', {})
 
 self.wait_until_completed()
@@ -144,11 +149,13 @@ class TestSingleDrive(iotests.QMPTestCase):
  'image file map does not match backing file after 
streaming')
 
 def test_device_not_found(self):
-result = self.vm.qmp('block-stream', device='nonexistent')
+result = self.vm.qmp('block-stream', discard=self.do_discard,
+ device='nonexistent')
 self.assert_qmp(result, 'error/class', 'GenericError')
 
 def test_job_id_missing(self):
-result = self.vm.qmp('block-stream', device='mid')
+result = self.vm.qmp('block-stream', discard=self.do_discard,
+ device='mid')
 self.assert_qmp(result, 'error/class', 'GenericError')
 
 
@@ -157,6 +164,7 @@ class TestParallelOps(iotests.QMPTestCase):
 num_imgs = num_ops * 2 + 1
 image_len = num_ops * 512 * 1024
 imgs = []
+do_discard = False
 
 def setUp(self):
 opts = []
@@ -241,13 +249,16 @@ class TestParallelOps(iotests.QMPTestCase):
 result = self.vm.qmp('block-stream', device='node4', 
job_id='stream-node4', base=self.imgs[1], speed=1024*1024)
 self.assert_qmp(result, 'return', {})
 
-result = self.vm.qmp('block-stream', device='node5', 
job_id='stream-node5', base=self.imgs[2])
+result = self.vm.qmp('block-stream', discard=self.do_discard,
+ device='node5', job_id='stream-node5', 
base=self.imgs[2])
 self.assert_qmp(result, 'error/class', 'GenericError')
 
-result = self.vm.qmp('block-stream', device='node3', 
job_id='stream-node3', base=self.imgs[2])
+result = self.vm.qmp('block-stream', discard=self.do_discard,
+ device='node3', job_id='stream-node3', 
base=self.imgs[2])
 self.assert_qmp(result, 'error/class', 'GenericError')
 
-result = self.vm.qmp('block-stream', device='node4', 
job_id='stream-node4-v2')
+result = self.vm.qmp('block-stream', discard=self.do_discard,
+ device='node4', job_id='stream-node4-v2')
 self.assert_qmp(result, 'error/class', 'GenericError')
 
 # block-commit should also fail if it touches nodes used by the stream

[Qemu-devel] [PATCH 3/5] iotests: allow resume_drive by node name

2018-11-22 Thread Andrey Shinkevich

After node graph changes, we may not be able to resume_drive by device
name (backing files are not recursively searched). So, let's allow
resuming by node name. Use a constant name for breakpoints to avoid
introducing extra parameters.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/iotests.py | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 27bb2b6..78a96f0 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -407,11 +407,11 @@ class VM(qtest.QEMUQtestMachine):
 self.pause_drive(drive, "write_aio")
 return
 self.qmp('human-monitor-command',
-command_line='qemu-io %s "break %s bp_%s"' % (drive, 
event, drive))
+command_line='qemu-io %s "break %s bp_0"' % (drive, event))
 
 def resume_drive(self, drive):
 self.qmp('human-monitor-command',
-command_line='qemu-io %s "remove_break bp_%s"' % (drive, 
drive))
+command_line='qemu-io %s "remove_break bp_0"' % (drive))
 
 def hmp_qemu_io(self, drive, cmd):
 '''Write to a given drive using an HMP command'''
@@ -535,13 +535,14 @@ class QMPTestCase(unittest.TestCase):
 
self.assertEqual(self.vm.flatten_qmp_object(json.loads(json_filename[5:])),
  self.vm.flatten_qmp_object(reference))
 
-def cancel_and_wait(self, drive='drive0', force=False, resume=False):
+def cancel_and_wait(self, drive='drive0', force=False,
+resume=False,resume_node=None):
 '''Cancel a block job and wait for it to finish, returning the event'''
 result = self.vm.qmp('block-job-cancel', device=drive, force=force)
 self.assert_qmp(result, 'return', {})
 
 if resume:
-self.vm.resume_drive(drive)
+self.vm.resume_drive(resume_node or drive)
 
 cancelled = False
 result = None
-- 
1.8.3.1

[Qemu-devel] [PATCH 1/5] Discard blocks while copy-on-read

2018-11-22 Thread Andrey Shinkevich

Discards the block duplicated in an intermediate backing file
after the block has been copied into the active layer during a
QMP block-stream operation.
It saves disk space while merging external snapshots.

Signed-off-by: Andrey Shinkevich 
---
 block/stream.c | 428 +++--
 1 file changed, 413 insertions(+), 15 deletions(-)

diff --git a/block/stream.c b/block/stream.c
index 81a7ec8..9e85954 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -12,6 +12,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "trace.h"
 #include "block/block_int.h"
 #include "block/blockjob_int.h"
@@ -35,9 +36,62 @@ typedef struct StreamBlockJob {
 BlockdevOnError on_error;
 char *backing_file_str;
 int bs_flags;
+bool discard;
+BlockDriverState *stream_top_bs;
+GSList *im_nodes;
 } StreamBlockJob;
 
-static int coroutine_fn stream_populate(BlockBackend *blk,
+typedef struct IntermediateNode {
+BlockBackend *blk;
+int flags;
+} IntermediateNode;
+
+static inline void restore_all_im_nodes(StreamBlockJob *s)
+{
+GSList *l;
+BlockDriverState *bs_active;
+BlockDriverState *bs_im;
+IntermediateNode *im_node;
+BlockReopenQueue *queue = NULL;
+Error *local_err = NULL;
+
+assert(s->stream_top_bs && s->stream_top_bs->backing &&
+   s->stream_top_bs->backing->bs);
+bs_active = backing_bs(s->stream_top_bs);
+assert(backing_bs(bs_active));
+
+bdrv_subtree_drained_begin(backing_bs(bs_active));
+
+for (l = s->im_nodes; l; l = l->next) {
+im_node = l->data;
+if (im_node->blk) {
+bs_im = blk_bs(im_node->blk);
+
+if (im_node->flags != bdrv_get_flags(bs_im) && bs_im) {
+queue = bdrv_reopen_queue(queue, bs_im, NULL, im_node->flags);
+}
+/* Give up write permissions before making it read-only */
+blk_set_perm(im_node->blk, 0, BLK_PERM_ALL, _abort);
+blk_unref(im_node->blk);
+bdrv_unref(bs_im);
+}
+g_free(im_node);
+}
+g_slist_free(s->im_nodes);
+s->im_nodes = NULL;
+
+if (queue) {
+bdrv_reopen_multiple(bdrv_get_aio_context(bs_active), queue,
+ _err);
+if (local_err != NULL) {
+error_report_err(local_err);
+}
+}
+
+bdrv_subtree_drained_end(backing_bs(bs_active));
+}
+
+static int coroutine_fn stream_populate(const StreamBlockJob *s,
 int64_t offset, uint64_t bytes,
 void *buf)
 {
@@ -46,19 +100,33 @@ static int coroutine_fn stream_populate(BlockBackend *blk,
 .iov_len  = bytes,
 };
 QEMUIOVector qiov;
+GSList *l;
+IntermediateNode *im_node;
+int ret;
 
+assert(s);
 assert(bytes < SIZE_MAX);
 qemu_iovec_init_external(, , 1);
 
 /* Copy-on-read the unallocated clusters */
-return blk_co_preadv(blk, offset, qiov.size, , BDRV_REQ_COPY_ON_READ);
+ret = blk_co_preadv(s->common.blk, offset, qiov.size, ,
+BDRV_REQ_COPY_ON_READ);
+
+if (ret < 0 || !s->discard) {
+return ret;
+}
+
+for (l = s->im_nodes; l; l = l->next) {
+im_node = l->data;
+blk_co_pdiscard(im_node->blk, offset, bytes);
+}
+
+return ret;
 }
 
-static int stream_prepare(Job *job)
+static int stream_change_backing_file(StreamBlockJob *s,
+  BlockDriverState *bs)
 {
-StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
-BlockJob *bjob = >common;
-BlockDriverState *bs = blk_bs(bjob->blk);
 BlockDriverState *base = s->base;
 Error *local_err = NULL;
 int ret = 0;
@@ -82,6 +150,68 @@ static int stream_prepare(Job *job)
 return ret;
 }
 
+static int stream_exit_discard(StreamBlockJob *s, bool abort)
+{
+BlockJob *bjob = >common;
+BlockDriverState *bs_active = backing_bs(s->stream_top_bs);
+int ret = 0;
+
+/* Make sure that the BDS doesn't go away during bdrv_replace_node,
+ * before we can call bdrv_drained_end */
+bdrv_ref(s->stream_top_bs);
+/* Reopen intermediate images back in read-only mode */
+restore_all_im_nodes(s);
+/* Hold a guest back from writing until we remove the filter */
+bdrv_drained_begin(bs_active);
+/* Dropping WRITE is required before changing the backing file. */
+bdrv_child_try_set_perm(s->stream_top_bs->backing, 0, BLK_PERM_ALL,
+_abort);
+if (abort == false) {
+ret = stream_change_backing_file(s, bs_active);
+}
+/* Remove the filter driver from the graph. Before this, get rid of
+ * the blockers on the intermediate nodes so that the resulting state is
+ * valid. Also give up permissions on stream_top_bs->backing, which might
+ * block the removal. */
+block_job_remove_all_bdrv(bjob);
+

[Qemu-devel] [PATCH 0/5] Discard blocks during block-stream operation

2018-11-22 Thread Andrey Shinkevich

Hello everyone!

The given feature discards blocks with copy-on-read operation while the
streaming process runs. Adding the 'discard' argument to the QMP block-stream
command allows dropping a block in the backing chain after it has been copied
to the active layer. That avoids duplicating the block in the intermediate
backing file. It saves disk space while external snapshots are being
merged.
The method involves inserting a filter above the active layer to allow write
operations in the backing chain. The method is similar to that in the 'commit
active' command (mirror.c).
The permission to write into an inactive layer cannot be obtained due to the
existing child permission mechanism. There is a commented-out hack in the
callback function bdrv_stream_top_pwritev() in block/stream.c that redirects
write operations below the filter node. When uncommented, it enables writing
into the inactive layer and passing all the iotests in the 030 file. Otherwise,
no WRITE permission is granted after the filter insertion above the target node.
Any suggestions to resolve that issue will be appreciated.

The suggestions of Dr. David Alan Gilbert and Alberto Garcia after their first
review have been applied.

Sincerely,

Andrey Shinkevich (5):
  Discard blocks while copy-on-read
  The discard flag for block stream operation
  iotests: allow resume_drive by node name
  iotests: prepare 030 for graph change
  iotests: 030 with block-stream discard

 block/stream.c| 429 --
 blockdev.c|   8 +-
 hmp-commands.hx   |   4 +-
 hmp.c |   4 +-
 include/block/block_int.h |   2 +-
 qapi/block-core.json  |   5 +-
 tests/qemu-iotests/030| 163 +++-
 tests/qemu-iotests/030.out|   4 +-
 tests/qemu-iotests/iotests.py |   9 +-
 9 files changed, 558 insertions(+), 70 deletions(-)

-- 
1.8.3.1

[Qemu-devel] [PATCH 4/5] iotests: prepare 030 for graph change

2018-11-22 Thread Andrey Shinkevich

The discard option for block-stream command requires insertion of the
filter to write into the backing chain. In that case, the job will not
resume by device name. So, the node name is specified.

Signed-off-by: Andrey Shinkevich 
---
 tests/qemu-iotests/030 | 20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030
index 276e06b..5d148b0 100755
--- a/tests/qemu-iotests/030
+++ b/tests/qemu-iotests/030
@@ -36,7 +36,8 @@ class TestSingleDrive(iotests.QMPTestCase):
 qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % 
mid_img, test_img)
 qemu_io('-f', 'raw', '-c', 'write -P 0x1 0 512', backing_img)
 qemu_io('-f', iotests.imgfmt, '-c', 'write -P 0x1 524288 512', mid_img)
-self.vm = iotests.VM().add_drive("blkdebug::" + test_img, 
"backing.node-name=mid")
+self.vm = iotests.VM().add_drive("blkdebug::" + test_img,
+   "node-name=source,backing.node-name=mid")
 self.vm.launch()
 
 def tearDown(self):
@@ -87,7 +88,7 @@ class TestSingleDrive(iotests.QMPTestCase):
 self.assert_qmp(result, 'return', {})
 
 self.pause_job('drive0', wait=False)
-self.vm.resume_drive('drive0')
+self.vm.resume_drive('source')
 self.pause_wait('drive0')
 
 result = self.vm.qmp('query-block-jobs')
@@ -743,7 +744,8 @@ class TestStreamStop(iotests.QMPTestCase):
 qemu_io('-f', 'raw', '-c', 'write -P 0x1 0 32M', backing_img)
 qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % 
backing_img, test_img)
 qemu_io('-f', iotests.imgfmt, '-c', 'write -P 0x1 32M 32M', test_img)
-self.vm = iotests.VM().add_drive("blkdebug::" + test_img)
+self.vm = iotests.VM().add_drive("blkdebug::" + test_img,
+ "node-name=source")
 self.vm.launch()
 
 def tearDown(self):
@@ -764,7 +766,7 @@ class TestStreamStop(iotests.QMPTestCase):
 self.assert_qmp(e, 'event', 'JOB_STATUS_CHANGE')
 self.assert_qmp(e, 'data/id', 'drive0')
 
-self.cancel_and_wait(resume=True)
+self.cancel_and_wait(resume=True, resume_node='source')
 
 class TestSetSpeed(iotests.QMPTestCase):
 image_len = 80 * 1024 * 1024 # MB
@@ -774,7 +776,8 @@ class TestSetSpeed(iotests.QMPTestCase):
 qemu_io('-f', 'raw', '-c', 'write -P 0x1 0 32M', backing_img)
 qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % 
backing_img, test_img)
 qemu_io('-f', iotests.imgfmt, '-c', 'write -P 0x1 32M 32M', test_img)
-self.vm = iotests.VM().add_drive('blkdebug::' + test_img)
+self.vm = iotests.VM().add_drive('blkdebug::' + test_img,
+ "node-name=source")
 self.vm.launch()
 
 def tearDown(self):
@@ -817,7 +820,7 @@ class TestSetSpeed(iotests.QMPTestCase):
 self.assert_qmp(result, 'return[0]/device', 'drive0')
 self.assert_qmp(result, 'return[0]/speed', 8 * 1024 * 1024)
 
-self.cancel_and_wait(resume=True)
+self.cancel_and_wait(resume=True, resume_node='source')
 self.vm.pause_drive('drive0')
 
 # Check setting speed in block-stream works
@@ -828,7 +831,7 @@ class TestSetSpeed(iotests.QMPTestCase):
 self.assert_qmp(result, 'return[0]/device', 'drive0')
 self.assert_qmp(result, 'return[0]/speed', 4 * 1024 * 1024)
 
-self.cancel_and_wait(resume=True)
+self.cancel_and_wait(resume=True, resume_node='source')
 
 def test_set_speed_invalid(self):
 self.assert_no_active_block_jobs()
@@ -845,7 +848,8 @@ class TestSetSpeed(iotests.QMPTestCase):
 result = self.vm.qmp('block-job-set-speed', device='drive0', speed=-1)
 self.assert_qmp(result, 'error/class', 'GenericError')
 
-self.cancel_and_wait(resume=True)
+self.cancel_and_wait(resume=True, resume_node='source')
+
 
 if __name__ == '__main__':
 iotests.main(supported_fmts=['qcow2', 'qed'])
-- 
1.8.3.1

[Qemu-devel] [PATCH 2/5] The discard flag for block stream operation

2018-11-22 Thread Andrey Shinkevich

Adding a parameter to QMP block-stream command to allow discarding
blocks in the backing chain while blocks are being copied to the
active layer.

Signed-off-by: Andrey Shinkevich 
---
 block/stream.c| 3 +--
 blockdev.c| 8 +++-
 hmp-commands.hx   | 4 ++--
 hmp.c | 4 +++-
 include/block/block_int.h | 2 +-
 qapi/block-core.json  | 5 -
 6 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/block/stream.c b/block/stream.c
index 9e85954..e844e94 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -584,10 +584,9 @@ static const BlockJobDriver stream_job_driver = {
 
 void stream_start(const char *job_id, BlockDriverState *bs,
   BlockDriverState *base, const char *backing_file_str,
-  int creation_flags, int64_t speed,
+  int creation_flags, int64_t speed, bool discard,
   BlockdevOnError on_error, Error **errp)
 {
-const bool discard = false;
 StreamBlockJob *s = NULL;
 BlockDriverState *iter;
 int orig_bs_flags;
diff --git a/blockdev.c b/blockdev.c
index 81f95d9..333592e 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3141,6 +3141,7 @@ void qmp_block_stream(bool has_job_id, const char 
*job_id, const char *device,
   bool has_base_node, const char *base_node,
   bool has_backing_file, const char *backing_file,
   bool has_speed, int64_t speed,
+  bool has_discard, bool discard,
   bool has_on_error, BlockdevOnError on_error,
   bool has_auto_finalize, bool auto_finalize,
   bool has_auto_dismiss, bool auto_dismiss,
@@ -3157,6 +3158,10 @@ void qmp_block_stream(bool has_job_id, const char 
*job_id, const char *device,
 on_error = BLOCKDEV_ON_ERROR_REPORT;
 }
 
+if (!has_discard) {
+discard = false;
+}
+
 bs = bdrv_lookup_bs(device, device, errp);
 if (!bs) {
 return;
@@ -3221,7 +3226,8 @@ void qmp_block_stream(bool has_job_id, const char 
*job_id, const char *device,
 }
 
 stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name,
- job_flags, has_speed ? speed : 0, on_error, _err);
+ job_flags, has_speed ? speed : 0,
+ discard, on_error, _err);
 if (local_err) {
 error_propagate(errp, local_err);
 goto out;
diff --git a/hmp-commands.hx b/hmp-commands.hx
index db0c681..a7e2a10 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -95,8 +95,8 @@ ETEXI
 
 {
 .name   = "block_stream",
-.args_type  = "device:B,speed:o?,base:s?",
-.params = "device [speed [base]]",
+.args_type  = "device:B,speed:o?,base:s?,discard:-d",
+.params = "device [speed [base]] [-d]",
 .help   = "copy data from a backing file into a block device",
 .cmd= hmp_block_stream,
 },
diff --git a/hmp.c b/hmp.c
index 7828f93..0d263e4 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1920,9 +1920,11 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict)
 const char *device = qdict_get_str(qdict, "device");
 const char *base = qdict_get_try_str(qdict, "base");
 int64_t speed = qdict_get_try_int(qdict, "speed", 0);
+bool discard = qdict_get_try_bool(qdict, "discard", false);
 
 qmp_block_stream(true, device, device, base != NULL, base, false, NULL,
- false, NULL, qdict_haskey(qdict, "speed"), speed, true,
+ false, NULL, qdict_haskey(qdict, "speed"), speed,
+ true, discard, true,
  BLOCKDEV_ON_ERROR_REPORT, false, false, false, false,
  );
 
diff --git a/include/block/block_int.h b/include/block/block_int.h
index f605622..2660336 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -970,7 +970,7 @@ int is_windows_drive(const char *filename);
  */
 void stream_start(const char *job_id, BlockDriverState *bs,
   BlockDriverState *base, const char *backing_file_str,
-  int creation_flags, int64_t speed,
+  int creation_flags, int64_t speed, bool discard,
   BlockdevOnError on_error, Error **errp);
 
 /**
diff --git a/qapi/block-core.json b/qapi/block-core.json
index d4fe710..f4538fa 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2334,6 +2334,9 @@
 #
 # @speed:  the maximum speed, in bytes per second
 #
+# @discard: true to delete blocks duplicated in old backing files.
+#   (default: false). Since 3.1.
+#
 # @on-error: the action to take on an error (default report).
 #'stop' and 'enospc' can only be used if the block device
 #supports io-status (see BlockInfo).  Since 1.3.
@@ -2366,7 +2369,7 @@
 { 'command': 'block-stream',
   'data': { '*job-id': 'str', 'device': 'str', '*base': 'str',
 '*base-node':

Re: [Qemu-devel] [PATCH] nvme: fix CMB endianness confusion

2018-11-22 Thread Kevin Wolf

Am 22.11.2018 um 19:23 hat Paolo Bonzini geschrieben:
> The CMB is marked as DEVICE_LITTLE_ENDIAN, so the data must be
> read/written as if it was little-endian output (in the case of
> big endian, we get two swaps, one in the memory core and one
> in nvme.c).
> 
> Signed-off-by: Paolo Bonzini 

Thanks, applied to the block branch.

Kevin

[Qemu-devel] [PULL v2 00/16] Block layer patches

2018-11-22 Thread Kevin Wolf

The following changes since commit 47c1cc30e440860aa695358f7c2dd0b9d7b53d16:

  Update version for v3.1.0-rc2 release (2018-11-20 18:10:26 +0000)

are available in the Git repository at:

  git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 6bd858b3117a5aab066f3cf02ca72000eaa10ddb:

  block: Update BlockDriverState.inherits_from on bdrv_drop_intermediate() 
(2018-11-22 19:37:31 +0100)


Block layer patches:

- block: Fix update of BDRV_O_AUTO_RDONLY in update_flags_from_options()
- block: Fix option inheritance after stream/commit job graph changes
- qemu-img: Fix memory leak and typo in error message
- nvme: Fixes for lockups and crashes
- scsi-disk: Fix crash if underlying host file or disk returns error
- Several qemu-iotests fixes and improvements


Alberto Garcia (3):
  block: Fix update of BDRV_O_AUTO_RDONLY in update_flags_from_options()
  block: Update BlockDriverState.inherits_from on bdrv_set_backing_hd()
  block: Update BlockDriverState.inherits_from on bdrv_drop_intermediate()

Daniel P. Berrangé (1):
  iotests: fix nbd test 233 to work correctly with raw images

Eric Blake (2):
  iotests: Skip 233 if certtool not installed
  iotests: Enhance 223 to cover multiple bitmap granularities

Igor Druzhinin (1):
  nvme: call blk_drain in NVMe reset code to avoid lockups

Kevin Wolf (3):
  iotests: Replace time.clock() with Timeout
  iotests: Replace assertEquals() with assertEqual()
  Revert "nvme: fix oob access issue(CVE-2018-16847)"

Logan Gunthorpe (1):
  nvme: fix bug with PCI IRQ pins on teardown

Max Reitz (2):
  qemu-img: Fix typo
  qemu-img: Fix leak

Paolo Bonzini (2):
  nvme: fix out-of-bounds access to the CMB
  nvme: fix CMB endianness confusion

Richard W.M. Jones (1):
  scsi-disk: Fix crash if underlying host file or disk returns error

 block.c   |  41 -
 hw/block/nvme.c   |  19 ++
 hw/scsi/scsi-disk.c   |   2 +-
 qemu-img.c|   3 +-
 tests/nvme-test.c |  68 ++---
 tests/Makefile.include|   2 +-
 tests/qemu-iotests/041|   6 +-
 tests/qemu-iotests/118|  20 +++---
 tests/qemu-iotests/161| 137 ++
 tests/qemu-iotests/161.out|  39 
 tests/qemu-iotests/223|  43 ++---
 tests/qemu-iotests/223.out|  32 +++---
 tests/qemu-iotests/233|   9 ++-
 tests/qemu-iotests/common.tls |   3 +
 tests/qemu-iotests/group  |   1 +
 tests/qemu-iotests/iotests.py |   2 +-
 16 files changed, 364 insertions(+), 63 deletions(-)
 create mode 100755 tests/qemu-iotests/161
 create mode 100644 tests/qemu-iotests/161.out

Re: [Qemu-devel] [PATCH-for-3.1] [REGRESSION FIX] ps2kbd: default to scan enabled after reset

2018-11-22 Thread Hervé Poussineau


Ping again.

Le 18/11/2018 à 11:09, Hervé Poussineau a écrit :

Ping again.

v3.0 didn't contain 143c04c7e0639e53086519592ead15d2556bfbf2, so this commit 
fixes a regression.

Le 10/11/2018 à 21:53, Hervé Poussineau a écrit :

Ping.

Le 21/10/2018 à 21:07, Hervé Poussineau a écrit :

A check for scan_enabled has been added to ps2_keyboard_event in commit
143c04c7e0639e53086519592ead15d2556bfbf2 to prevent stream corruption.
This works well as long as operating system is resetting keyboard, or enabling 
it.

This fixes IBM 40p firmware, which doesn't bother sending KBD_CMD_RESET,
KBD_CMD_ENABLE or KBD_CMD_RESET_ENABLE before trying to use the keyboard.

Signed-off-by: Hervé Poussineau 
---
  hw/input/ps2.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/input/ps2.c b/hw/input/ps2.c
index fdfcadf9a1..eded4f0f8d 100644
--- a/hw/input/ps2.c
+++ b/hw/input/ps2.c
@@ -938,7 +938,7 @@ static void ps2_kbd_reset(void *opaque)
  trace_ps2_kbd_reset(opaque);
  ps2_common_reset(>common);
-    s->scan_enabled = 0;
+    s->scan_enabled = 1;
  s->translate = 0;
  s->scancode_set = 2;
  s->modifiers = 0;

Re: [Qemu-devel] [PATCH] nvme: fix CMB endianness confusion

2018-11-22 Thread Peter Maydell

On 22 November 2018 at 18:23, Paolo Bonzini  wrote:
> The CMB is marked as DEVICE_LITTLE_ENDIAN, so the data must be
> read/written as if it was little-endian output (in the case of
> big endian, we get two swaps, one in the memory core and one
> in nvme.c).
>
> Signed-off-by: Paolo Bonzini 
> ---
>  hw/block/nvme.c | 7 ++-
>  1 file changed, 2 insertions(+), 5 deletions(-)

Tested-by: Peter Maydell 

This is sufficient to get the nvme-test to pass on sparc
(big-endian host).

thanks
-- PMM

[Qemu-devel] [PATCH] nvme: fix CMB endianness confusion

2018-11-22 Thread Paolo Bonzini

The CMB is marked as DEVICE_LITTLE_ENDIAN, so the data must be
read/written as if it was little-endian output (in the case of
big endian, we get two swaps, one in the memory core and one
in nvme.c).

Signed-off-by: Paolo Bonzini 
---
 hw/block/nvme.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 5d92794ef7..8a12fba24f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1175,16 +1175,13 @@ static void nvme_cmb_write(void *opaque, hwaddr addr, 
uint64_t data,
 unsigned size)
 {
 NvmeCtrl *n = (NvmeCtrl *)opaque;
-memcpy(>cmbuf[addr], , size);
+stn_le_p(>cmbuf[addr], size, data);
 }
 
 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
 {
-uint64_t val;
 NvmeCtrl *n = (NvmeCtrl *)opaque;
-
-memcpy(, >cmbuf[addr], size);
-return val;
+return ldn_le_p(>cmbuf[addr], size);
 }
 
 static const MemoryRegionOps nvme_cmb_ops = {
-- 
2.19.1

Re: [Qemu-devel] [RFC PATCH v2 1/3] target/ppc: Add macro definitions for relocated interrupt vectors offsets

2018-11-22 Thread Fabiano Rosas

David Gibson  writes:

> On Wed, Nov 21, 2018 at 04:13:45PM -0200, Fabiano Rosas wrote:
>> The PowerISA prescribes that depending on the values of MSR_IR,
>> MSR_DR, MSR_HV and LPCR_AIL, the interrupt vectors might be relocated
>> by specific offsets.
>> 
>> This patch defines macros for these offsets so that they can be used
>> by another part of the code in a future patch.
>> 
>> Signed-off-by: Fabiano Rosas 
>> ---
>>  target/ppc/cpu.h | 3 +++
>>  target/ppc/excp_helper.c | 4 ++--
>>  2 files changed, 5 insertions(+), 2 deletions(-)
>> 
>> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
>> index ab68abe8a2..5147db4460 100644
>> --- a/target/ppc/cpu.h
>> +++ b/target/ppc/cpu.h
>> @@ -2390,6 +2390,9 @@ enum {
>>  AIL_C000___4000 = 3,
>>  };
>>  
>> +#define AIL_0001_8000_OFFSET 0x18000
>> +#define AIL_C000___4000_OFFSET 0xc0004000ull
>
> Hrm.  Is there really a point making a #define, if the name spells out
> the value?  It's not like you can change the value without having to
> change the places that use it that way?

You're right, this is a bit clumsy.

I just checked and the single step works within SLOF code as well, so
I'll probably need to borrow the AIL-checking logic from excp_helper.c
to get the correct offset so this patch is likely to go away.

Cheers.

[Qemu-devel] [PATCH for-3.1] MAINTAINERS: Add an ARM SMMU section

2018-11-22 Thread Eric Auger

Add a new ARM SMMU section and set Eric Auger as the maintainer
for ARM SMMU emulation sources.

Signed-off-by: Eric Auger 
Suggested-by: Peter Maydell 
---
 MAINTAINERS | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1032406c56..3cac9f0a0c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -151,6 +151,13 @@ F: disas/arm.c
 F: disas/arm-a64.cc
 F: disas/libvixl/
 
+ARM SMMU
+M: Eric Auger 
+L: qemu-...@nongnu.org
+S: Maintained
+F: hw/arm/smmu*
+F: include/hw/arm/smmu*
+
 CRIS
 M: Edgar E. Iglesias 
 S: Maintained
-- 
2.17.2

Re: [Qemu-devel] [PATCH 0/2] Update the inherits_from pointer after stream and commit

2018-11-22 Thread Kevin Wolf

Am 31.10.2018 um 17:16 hat Alberto Garcia geschrieben:
> Hi all,
> 
> when you open an image [A] with a few more images on the backing chain
> you get something like this:
> 
> [E] <- [D] <- [C] <- [B] <- [A]
> 
> Here you can go from [A] to [E] by following the bs->backing
> pointer. At the same time each one of the backing files has an
> 'inherits_from' attribute pointing to their parent, so you can go from
> [E] to [A] following the inherits_from pointer.
> 
> 'inherits_from' is used on bdrv_reopen_queue_child() to decide if a
> node's children must be reopened together with the parent and inherit
> its options.
> 
> If some of the intermediate nodes are removed (either by block-stream or
> by block-commit) you end up with something like this:
> 
>[E] <- [A]
> 
> In this case we would expect [E] to inherit from [A], however its
> inherits_from pointer is NULL and trying to change its options by
> reopening [A] with backing.option=value fails.
> 
> This patch series fixes this. See each individual patch for more
> details.

Thanks, applied to the block branch.

Not a problem with the series, but I tried to run the test case without
the fix, and this is what I got:

-{"return": ""}
+{"return": "Cannot change the option 'backing.detect-zeroes'\r\n"}

Where does that final "\r\n" come from? Looks like we have a bug somewhere
in the error reporting code?

Kevin

Re: [Qemu-devel] [PATCH v1 00/16] packed ring virtio-net backend support

2018-11-22 Thread Maxime Coquelin


Hi Wei,

I just tested your series with Tiwei's v3, and it fails
with ctrl vq enabled:
qemu-system-x86_64: virtio-net ctrl missing headers

Regards,
Maxime

On 11/22/18 3:06 PM, w...@redhat.com wrote:

From: Wei Xu 

Code base:
 https://github.com/Whishay/qemu.git

rfc v3 -> v1
- migration support for both userspace and vhost-net, need tweak vhost
   ioctl() to make it work(the code is pasted in the commit message of
   vhost migration patch #13).

Note:
   the high 32-bit guest feature bit is saved as a subsection for
   virtio devices which makes packed ring feature bit check unusable when
   loading the saved per-queue variables(this is done before loading
   subsection which is the last action for device during migration),
   so I save and load all the things generally for now, any idea to fix this?

- Fixed comments from Jason for rfc v3 sorted by patch #, two comments I
   didn't take were(from patch) listed here:
09: - introduce new API(virtqueue_fill_n()).
   - Didn't take it since userspace backend does not support batching,
 so only one element is popped and current API should be enough.
06 & 07: Refactor split and packed pop()/get_avail_bytes().
  - the duplicated code is intertwined with split/packed ring specific
things and it might make it unclear, so I only extracted the few
common parts outside rcu and kept the others separate.

The other revised comments:
02: - reuse current 'avail/used' for 'driver/device' in VRingMemoryRegionCache.
 - remove event_idx since shadow_avail_idx works.
03: - move size recalculation to a separate patch.
 - keep 'avail/used' in current calculation function name.
 - initialize 'desc' memory region as 'false' for 1.0('true' for 1.1)
04: - delete 'event_idx'
05: - rename 'wc' to wrap_counter.
06: - converge common part outside rcu section for 1.0/1.1.
 - move memory barrier for the first 'desc' in between checking flag
   and read other fields.
 - remove unnecessary memory barriers for indirect descriptors.
 - no need to destroy indirect memory cache since it is generally done
   before return from the function.
 - remove redundant maximum chained descriptors limitation check.
 - there are some differences(desc name, wrap idx/counter, flags) between
   split and packed rings, so keep them separate for now.
 - amend the comment when recording index and wrap counter for a kick
   from guest.
07: - calculate fields in descriptor instead of read it when filling.
 - put memory barrier correctly before filling the flags in descriptor.
 - replace full memory barrier with a write barrier in fill.
 - shift to read descriptor flags and descriptor necessarily and
   separately in packed_pop().
 - correct memory barrier in packed_pop() as in packed_fill().
08: - reuse 'shadow_avail_idx' instead of adding a new 'event_idx'.
 - use the compact and verified vring_packed_need_event()
   version for vhost net/user.
12: - remove the odd cherry-pick comment.
 - used bit '15' for wrap_counters.

rfc v2->v3
- addressed performance issue
- fixed feedback from v2

rfc v1->v2
- sync to tiwei's v5
- reuse memory cache function with 1.0
- dropped detach patch and notification helper(04 & 05 in v1)
- guest virtio-net driver unload/reload support
- event suppression support(not tested)
- addressed feedback from v1

Wei Xu (15):
   virtio: introduce packed ring definitions
   virtio: redefine structure & memory cache for packed ring
   virtio: expand offset calculation for packed ring
   virtio: add memory region init for packed ring
   virtio: init wrap counter for packed ring
   virtio: init and desc empty check for packed ring
   virtio: get avail bytes check for packed ring
   virtio: fill/flush/pop for packed ring
   virtio: event suppression support for packed ring
   virtio-net: fill head desc after done all in a chain
   virtio: add userspace migration of packed ring
   virtio: add vhost-net migration of packed ring
   virtio: packed ring feature bit for userspace backend
   vhost: enable packed ring
   virtio: enable packed ring via a new command line

  VERSION|   2 +-
  hw/net/vhost_net.c |   2 +
  hw/net/virtio-net.c|  11 +-
  hw/virtio/virtio.c | 756 +++--
  include/hw/virtio/virtio.h |   8 +-
  include/standard-headers/linux/virtio_config.h |  15 +
  include/standard-headers/linux/virtio_ring.h   |  43 ++
  7 files changed, 783 insertions(+), 54 deletions(-)

[Qemu-devel] [PATCH v2 for-4.0 1/3] disas.c: Use address_space_read() to read memory

2018-11-22 Thread Peter Maydell

Currently disas.c reads physical memory using
cpu_physical_memory_read(). This effectively hard-codes
assuming that all CPUs have the same view of physical
memory. Switch to address_space_read() instead, which
lets us use the AddressSpace for the CPU we're
disassembling for.

Signed-off-by: Peter Maydell 
Reviewed-by: Philippe Mathieu-Daudé 
---
 disas.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/disas.c b/disas.c
index 5325b7e6be6..f9c517b3588 100644
--- a/disas.c
+++ b/disas.c
@@ -588,7 +588,10 @@ static int
 physical_read_memory(bfd_vma memaddr, bfd_byte *myaddr, int length,
  struct disassemble_info *info)
 {
-cpu_physical_memory_read(memaddr, myaddr, length);
+CPUDebug *s = container_of(info, CPUDebug, info);
+
+address_space_read(s->cpu->as, memaddr, MEMTXATTRS_UNSPECIFIED,
+   myaddr, length);
 return 0;
 }
 
-- 
2.19.1

[Qemu-devel] [RFC v9 10/17] virtio-iommu: Implement probe request

2018-11-22 Thread Eric Auger

This patch implements the PROBE request. At the moment,
no reserved regions are returned as none are registered
per device. Only a NONE property is returned.

Signed-off-by: Eric Auger 

---
v8 -> v9:
- fix filling of properties (changes induced by v0.7 -> v0.8 spec
  evolution)

v7 -> v8:
- adapt to removal of value field in virtio_iommu_probe_property

v6 -> v7:
- adapt to the change in virtio_iommu_probe_resv_mem fields
- use get_endpoint() instead of directly checking the EP
  was registered.

v4 -> v5:
- initialize bufstate.error to false
- add cpu_to_le64(size)
---
 hw/virtio/trace-events   |   2 +
 hw/virtio/virtio-iommu.c | 181 ++-
 2 files changed, 181 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 1f0e143b55..19824c3e91 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -69,3 +69,5 @@ virtio_iommu_unmap_left_interval(uint64_t low, uint64_t high, 
uint64_t next_low,
 virtio_iommu_unmap_right_interval(uint64_t low, uint64_t high, uint64_t 
next_low, uint64_t next_high) "Unmap right [0x%"PRIx64",0x%"PRIx64"], new 
interval=[0x%"PRIx64",0x%"PRIx64"]"
 virtio_iommu_unmap_inc_interval(uint64_t low, uint64_t high) "Unmap inc 
[0x%"PRIx64",0x%"PRIx64"]"
 virtio_iommu_translate_out(uint64_t virt_addr, uint64_t phys_addr, uint32_t 
sid) "0x%"PRIx64" -> 0x%"PRIx64 " for sid=%d"
+virtio_iommu_fill_resv_property(uint32_t devid, uint8_t subtype, uint64_t 
start, uint64_t end, uint32_t flags, size_t filled) "dev= %d, subtype=%d 
start=0x%"PRIx64" end=0x%"PRIx64" flags=%d filled=0x%lx"
+virtio_iommu_fill_none_property(uint32_t devid) "devid=%d"
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index af90413b37..4fc43494d9 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -39,6 +39,10 @@
 
 /* Max size */
 #define VIOMMU_DEFAULT_QUEUE_SIZE 256
+#define VIOMMU_PROBE_SIZE 512
+
+#define SUPPORTED_PROBE_PROPERTIES (\
+1 << VIRTIO_IOMMU_PROBE_T_RESV_MEM)
 
 typedef struct viommu_domain {
 uint32_t id;
@@ -51,6 +55,7 @@ typedef struct viommu_endpoint {
 viommu_domain *domain;
 QLIST_ENTRY(viommu_endpoint) next;
 VirtIOIOMMU *viommu;
+GTree *reserved_regions;
 } viommu_endpoint;
 
 typedef struct viommu_interval {
@@ -65,6 +70,13 @@ typedef struct viommu_mapping {
 uint32_t flags;
 } viommu_mapping;
 
+typedef struct viommu_property_buffer {
+viommu_endpoint *endpoint;
+size_t filled;
+uint8_t *start;
+bool error;
+} viommu_property_buffer;
+
 static inline uint16_t virtio_iommu_get_sid(IOMMUDevice *dev)
 {
 return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);
@@ -104,6 +116,9 @@ static viommu_endpoint 
*virtio_iommu_get_endpoint(VirtIOIOMMU *s,
 ep->viommu = s;
 trace_virtio_iommu_get_endpoint(ep_id);
 g_tree_insert(s->endpoints, GUINT_TO_POINTER(ep_id), ep);
+ep->reserved_regions = g_tree_new_full((GCompareDataFunc)interval_cmp,
+NULL, (GDestroyNotify)g_free,
+(GDestroyNotify)g_free);
 return ep;
 }
 
@@ -117,6 +132,7 @@ static void virtio_iommu_put_endpoint(gpointer data)
 }
 
 trace_virtio_iommu_put_endpoint(ep->id);
+g_tree_destroy(ep->reserved_regions);
 g_free(ep);
 }
 
@@ -352,6 +368,131 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s,
 return VIRTIO_IOMMU_S_INVAL;
 }
 
+/**
+ * virtio_iommu_fill_resv_mem_prop - Add a RESV_MEM probe
+ * property into the probe request buffer
+ *
+ * @key: interval handle
+ * @value: handle to the reserved memory region
+ * @data: handle to the probe request buffer state
+ */
+static gboolean virtio_iommu_fill_resv_mem_prop(gpointer key,
+gpointer value,
+gpointer data)
+{
+struct virtio_iommu_probe_resv_mem *resv =
+(struct virtio_iommu_probe_resv_mem *)value;
+struct virtio_iommu_probe_resv_mem *buf_prop;
+viommu_property_buffer *bufstate = (viommu_property_buffer *)data;
+size_t prop_size = sizeof(*resv);
+
+if (bufstate->filled + prop_size >= VIOMMU_PROBE_SIZE) {
+bufstate->error = true;
+/* get the traversal stopped by returning true */
+return true;
+}
+buf_prop = (struct virtio_iommu_probe_resv_mem *)
+(bufstate->start + bufstate->filled);
+*buf_prop = *resv;
+
+bufstate->filled += prop_size;
+trace_virtio_iommu_fill_resv_property(bufstate->endpoint->id,
+  resv->subtype, resv->start,
+  resv->end, resv->subtype,
+  bufstate->filled);
+return false;
+}
+
+static int virtio_iommu_fill_none_prop(viommu_property_buffer *bufstate)
+{
+struct virtio_iommu_probe_property *prop;
+
+prop = (struct virtio_iommu_probe_property *)
+

[Qemu-devel] [RFC v9 06/17] virtio-iommu: Endpoint and domains structs and helpers

2018-11-22 Thread Eric Auger

This patch introduces domain and endpoint internal
datatypes. Both are stored in RB trees. The domain
owns a list of endpoints attached to it.

Helpers to get/put end points and domains are introduced.
get() helpers will become static in subsequent patches.

Signed-off-by: Eric Auger 

---

v6 -> v7:
- on virtio_iommu_find_add_as the bus number computation may
  not be finalized yet so we cannot register the EPs at that time.
  Hence, let's remove the get_endpoint and also do not use the
  bus number for building the memory region name string (only
  used for debug though).

v4 -> v5:
- initialize as->endpoint_list

v3 -> v4:
- new separate patch
---
 hw/virtio/trace-events   |   4 ++
 hw/virtio/virtio-iommu.c | 125 ++-
 2 files changed, 128 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 9270b0463e..4b15086872 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -61,3 +61,7 @@ virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, 
uint64_t virt_end, uin
 virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) 
"domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
 virtio_iommu_translate(const char *name, uint32_t rid, uint64_t iova, int 
flag) "mr=%s rid=%d addr=0x%"PRIx64" flag=%d"
 virtio_iommu_init_iommu_mr(char *iommu_mr) "init %s"
+virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d"
+virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d"
+virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d"
+virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d"
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index dead062baf..1b9c3ba416 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -33,20 +33,124 @@
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio-access.h"
 #include "hw/virtio/virtio-iommu.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci.h"
 
 /* Max size */
 #define VIOMMU_DEFAULT_QUEUE_SIZE 256
 
+typedef struct viommu_domain {
+uint32_t id;
+GTree *mappings;
+QLIST_HEAD(, viommu_endpoint) endpoint_list;
+} viommu_domain;
+
+typedef struct viommu_endpoint {
+uint32_t id;
+viommu_domain *domain;
+QLIST_ENTRY(viommu_endpoint) next;
+VirtIOIOMMU *viommu;
+} viommu_endpoint;
+
+typedef struct viommu_interval {
+uint64_t low;
+uint64_t high;
+} viommu_interval;
+
 static inline uint16_t virtio_iommu_get_sid(IOMMUDevice *dev)
 {
 return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);
 }
 
+static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
+{
+viommu_interval *inta = (viommu_interval *)a;
+viommu_interval *intb = (viommu_interval *)b;
+
+if (inta->high <= intb->low) {
+return -1;
+} else if (intb->high <= inta->low) {
+return 1;
+} else {
+return 0;
+}
+}
+
+static void virtio_iommu_detach_endpoint_from_domain(viommu_endpoint *ep)
+{
+QLIST_REMOVE(ep, next);
+ep->domain = NULL;
+}
+
+viommu_endpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s, uint32_t ep_id);
+viommu_endpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s, uint32_t ep_id)
+{
+viommu_endpoint *ep;
+
+ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id));
+if (ep) {
+return ep;
+}
+ep = g_malloc0(sizeof(*ep));
+ep->id = ep_id;
+ep->viommu = s;
+trace_virtio_iommu_get_endpoint(ep_id);
+g_tree_insert(s->endpoints, GUINT_TO_POINTER(ep_id), ep);
+return ep;
+}
+
+static void virtio_iommu_put_endpoint(gpointer data)
+{
+viommu_endpoint *ep = (viommu_endpoint *)data;
+
+if (ep->domain) {
+virtio_iommu_detach_endpoint_from_domain(ep);
+g_tree_unref(ep->domain->mappings);
+}
+
+trace_virtio_iommu_put_endpoint(ep->id);
+g_free(ep);
+}
+
+viommu_domain *virtio_iommu_get_domain(VirtIOIOMMU *s, uint32_t domain_id);
+viommu_domain *virtio_iommu_get_domain(VirtIOIOMMU *s, uint32_t domain_id)
+{
+viommu_domain *domain;
+
+domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
+if (domain) {
+return domain;
+}
+domain = g_malloc0(sizeof(*domain));
+domain->id = domain_id;
+domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
+   NULL, (GDestroyNotify)g_free,
+   (GDestroyNotify)g_free);
+g_tree_insert(s->domains, GUINT_TO_POINTER(domain_id), domain);
+QLIST_INIT(&domain->endpoint_list);
+trace_virtio_iommu_get_domain(domain_id);
+return domain;
+}
+
+static void virtio_iommu_put_domain(gpointer data)
+{
+viommu_domain *domain = (viommu_domain *)data;
+viommu_endpoint *iter, *tmp;
+
+QLIST_FOREACH_SAFE(iter, &domain->endpoint_list, next, tmp) {
+virtio_iommu_detach_endpoint_from_domain(iter);
+}
+g_tree_destroy(domain->mappings);
+trace_virtio_iommu_put_domain(domain->id);
+g_free(domain);
+}
+

[Qemu-devel] [RFC v9 03/17] virtio-iommu: Add skeleton

2018-11-22 Thread Eric Auger

This patch adds the skeleton for the virtio-iommu device.

Signed-off-by: Eric Auger 

---
v8 -> v9:
- properly initialize tail

v7 -> v8:
- expose VIRTIO_IOMMU_F_BYPASS and VIRTIO_F_VERSION_1
  features
- set_config dummy implementation + tracing
- add trace in get_features
- set the features on realize() and store the acked ones
- remove inclusion of linux/virtio_iommu.h

v6 -> v7:
- removed qapi-event.h include
- add primary_bus and associated property

v4 -> v5:
- use the new v0.5 terminology (domain, endpoint)
- add the event virtqueue

v3 -> v4:
- use page_size_mask instead of page_sizes
- added set_features()
- added some traces (reset, set_status, set_features)
- empty virtio_iommu_set_config() as the driver MUST NOT
  write to device configuration fields
- add get_config trace

v2 -> v3:
- rebase on 2.10-rc0, ie. use IOMMUMemoryRegion and remove
  iommu_ops.
- advertise VIRTIO_IOMMU_F_MAP_UNMAP feature
- page_sizes set to TARGET_PAGE_SIZE

Conflicts:
hw/virtio/trace-events
---
 hw/virtio/Makefile.objs  |   1 +
 hw/virtio/trace-events   |   9 +
 hw/virtio/virtio-iommu.c | 273 +++
 include/hw/virtio/virtio-iommu.h |  62 +++
 4 files changed, 345 insertions(+)
 create mode 100644 hw/virtio/virtio-iommu.c
 create mode 100644 include/hw/virtio/virtio-iommu.h

diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
index 1b2799cfd8..49eba67a54 100644
--- a/hw/virtio/Makefile.objs
+++ b/hw/virtio/Makefile.objs
@@ -10,6 +10,7 @@ obj-$(CONFIG_VIRTIO_CRYPTO) += virtio-crypto.o
 obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) += 
virtio-crypto-pci.o
 
 obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o
+obj-$(CONFIG_LINUX) += virtio-iommu.o
 obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
 endif
 
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 07bcbe9e85..84da904d8b 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -46,3 +46,12 @@ virtio_balloon_handle_output(const char *name, uint64_t gpa) 
"section name: %s g
 virtio_balloon_get_config(uint32_t num_pages, uint32_t actual) "num_pages: %d 
actual: %d"
 virtio_balloon_set_config(uint32_t actual, uint32_t oldactual) "actual: %d 
oldactual: %d"
 virtio_balloon_to_target(uint64_t target, uint32_t num_pages) "balloon target: 
0x%"PRIx64" num_pages: %d"
+
+# hw/virtio/virtio-iommu.c
+#
+virtio_iommu_device_reset(void) "reset!"
+virtio_iommu_get_features(uint64_t features) "device supports 
features=0x%"PRIx64
+virtio_iommu_set_features(uint64_t features) "features accepted by the driver 
=0x%"PRIx64
+virtio_iommu_device_status(uint8_t status) "driver status = %d"
+virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, 
uint8_t domain_bits, uint32_t probe_size) "page_size_mask=0x%"PRIx64" 
start=0x%"PRIx64" end=0x%"PRIx64" domain_bits=%d probe_size=0x%x"
+virtio_iommu_set_config(uint64_t page_size_mask, uint64_t start, uint64_t end, 
uint8_t domain_bits, uint32_t probe_size) "page_size_mask=0x%"PRIx64" 
start=0x%"PRIx64" end=0x%"PRIx64" domain_bits=%d probe_size=0x%x"
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
new file mode 100644
index 00..d8894f047b
--- /dev/null
+++ b/hw/virtio/virtio-iommu.c
@@ -0,0 +1,273 @@
+/*
+ * virtio-iommu device
+ *
+ * Copyright (c) 2017 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/iov.h"
+#include "qemu-common.h"
+#include "hw/virtio/virtio.h"
+#include "sysemu/kvm.h"
+#include "trace.h"
+
+#include "standard-headers/linux/virtio_ids.h"
+
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/virtio-iommu.h"
+
+/* Max size */
+#define VIOMMU_DEFAULT_QUEUE_SIZE 256
+
+static int virtio_iommu_handle_attach(VirtIOIOMMU *s,
+  struct iovec *iov,
+  unsigned int iov_cnt)
+{
+return -ENOENT;
+}
+static int virtio_iommu_handle_detach(VirtIOIOMMU *s,
+  struct iovec *iov,
+  unsigned int iov_cnt)
+{
+return -ENOENT;
+}
+static int virtio_iommu_handle_map(VirtIOIOMMU *s,
+   struct iovec *iov,
+   unsigned int iov_cnt)
+{
+return -ENOENT;
+}
+static int

[Qemu-devel] [PATCH v2 for-4.0 3/3] elf_ops.h: Use address_space_write() to write memory

2018-11-22 Thread Peter Maydell

Currently the load_elf function in elf_ops.h uses
cpu_physical_memory_write() to write the ELF file to
memory if it is not handling it as a ROM blob. This
means we ignore the AddressSpace that the function
is passed to define where it should be loaded.
Use address_space_write() instead.

Signed-off-by: Peter Maydell 
---
v1->v2: handle NULL as
---
 include/hw/elf_ops.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/hw/elf_ops.h b/include/hw/elf_ops.h
index 81cecaf27e2..74679ff8da3 100644
--- a/include/hw/elf_ops.h
+++ b/include/hw/elf_ops.h
@@ -482,7 +482,9 @@ static int glue(load_elf, SZ)(const char *name, int fd,
 rom_add_elf_program(label, data, file_size, mem_size,
 addr, as);
 } else {
-cpu_physical_memory_write(addr, data, file_size);
+address_space_write(as ? as : &address_space_memory,
+addr, MEMTXATTRS_UNSPECIFIED,
+data, file_size);
 g_free(data);
 }
 }
-- 
2.19.1

[Qemu-devel] [RFC v9 02/17] linux-headers: Partial update for virtio-iommu v0.8

2018-11-22 Thread Eric Auger

Partial sync against Jean-Philippe's branch:
git://linux-arm.org/linux-jpb.git virtio-iommu/v0.8

Signed-off-by: Eric Auger 
---
 include/standard-headers/linux/virtio_ids.h   |   1 +
 include/standard-headers/linux/virtio_iommu.h | 159 ++
 linux-headers/linux/virtio_iommu.h|   1 +
 3 files changed, 161 insertions(+)
 create mode 100644 include/standard-headers/linux/virtio_iommu.h
 create mode 100644 linux-headers/linux/virtio_iommu.h

diff --git a/include/standard-headers/linux/virtio_ids.h 
b/include/standard-headers/linux/virtio_ids.h
index 6d5c3b2d4f..cfe47c5d9a 100644
--- a/include/standard-headers/linux/virtio_ids.h
+++ b/include/standard-headers/linux/virtio_ids.h
@@ -43,5 +43,6 @@
 #define VIRTIO_ID_INPUT18 /* virtio input */
 #define VIRTIO_ID_VSOCK19 /* virtio vsock transport */
 #define VIRTIO_ID_CRYPTO   20 /* virtio crypto */
+#define VIRTIO_ID_IOMMU23 /* virtio IOMMU */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/include/standard-headers/linux/virtio_iommu.h 
b/include/standard-headers/linux/virtio_iommu.h
new file mode 100644
index 00..0a40b21ea9
--- /dev/null
+++ b/include/standard-headers/linux/virtio_iommu.h
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/*
+ * Virtio-iommu definition v0.8
+ *
+ * Copyright (C) 2018 Arm Ltd.
+ */
+#ifndef _LINUX_VIRTIO_IOMMU_H
+#define _LINUX_VIRTIO_IOMMU_H
+
+#include "standard-headers/linux/types.h"
+
+/* Feature bits */
+#define VIRTIO_IOMMU_F_INPUT_RANGE 0
+#define VIRTIO_IOMMU_F_DOMAIN_BITS 1
+#define VIRTIO_IOMMU_F_MAP_UNMAP   2
+#define VIRTIO_IOMMU_F_BYPASS  3
+#define VIRTIO_IOMMU_F_PROBE   4
+
+struct virtio_iommu_config {
+   /* Supported page sizes */
+   uint64_tpage_size_mask;
+   /* Supported IOVA range */
+   struct virtio_iommu_range {
+   uint64_tstart;
+   uint64_tend;
+   } input_range;
+   /* Max domain ID size */
+   uint8_t domain_bits;
+   uint8_t padding[3];
+   /* Probe buffer size */
+   uint32_tprobe_size;
+};
+
+/* Request types */
+#define VIRTIO_IOMMU_T_ATTACH  0x01
+#define VIRTIO_IOMMU_T_DETACH  0x02
+#define VIRTIO_IOMMU_T_MAP 0x03
+#define VIRTIO_IOMMU_T_UNMAP   0x04
+#define VIRTIO_IOMMU_T_PROBE   0x05
+
+/* Status types */
+#define VIRTIO_IOMMU_S_OK  0x00
+#define VIRTIO_IOMMU_S_IOERR   0x01
+#define VIRTIO_IOMMU_S_UNSUPP  0x02
+#define VIRTIO_IOMMU_S_DEVERR  0x03
+#define VIRTIO_IOMMU_S_INVAL   0x04
+#define VIRTIO_IOMMU_S_RANGE   0x05
+#define VIRTIO_IOMMU_S_NOENT   0x06
+#define VIRTIO_IOMMU_S_FAULT   0x07
+
+struct virtio_iommu_req_head {
+   uint8_t type;
+   uint8_t reserved[3];
+};
+
+struct virtio_iommu_req_tail {
+   uint8_t status;
+   uint8_t reserved[3];
+};
+
+struct virtio_iommu_req_attach {
+   struct virtio_iommu_req_headhead;
+   uint32_tdomain;
+   uint32_tendpoint;
+   uint8_t reserved[8];
+   struct virtio_iommu_req_tailtail;
+};
+
+struct virtio_iommu_req_detach {
+   struct virtio_iommu_req_headhead;
+   uint32_tdomain;
+   uint32_tendpoint;
+   uint8_t reserved[8];
+   struct virtio_iommu_req_tailtail;
+};
+
+#define VIRTIO_IOMMU_MAP_F_READ(1 << 0)
+#define VIRTIO_IOMMU_MAP_F_WRITE   (1 << 1)
+#define VIRTIO_IOMMU_MAP_F_EXEC(1 << 2)
+#define VIRTIO_IOMMU_MAP_F_MMIO(1 << 3)
+
+#define VIRTIO_IOMMU_MAP_F_MASK
(VIRTIO_IOMMU_MAP_F_READ |  \
+VIRTIO_IOMMU_MAP_F_WRITE | 
\
+VIRTIO_IOMMU_MAP_F_EXEC |  
\
+VIRTIO_IOMMU_MAP_F_MMIO)
+
+struct virtio_iommu_req_map {
+   struct virtio_iommu_req_headhead;
+   uint32_tdomain;
+   uint64_tvirt_start;
+   uint64_tvirt_end;
+   uint64_t

[Qemu-devel] [RFC v9 14/17] virtio-iommu-pci: Add virtio iommu pci support

2018-11-22 Thread Eric Auger

This patch adds virtio-iommu-pci, which is the pci proxy for
the virtio-iommu device.

Signed-off-by: Eric Auger 

---

v8 -> v9:
- add the msi-bypass property
---
 hw/virtio/virtio-pci.c | 51 ++
 hw/virtio/virtio-pci.h | 14 
 include/hw/pci/pci.h   |  1 +
 qdev-monitor.c |  1 +
 4 files changed, 67 insertions(+)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index a954799267..cdd18afe9e 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -2342,6 +2342,56 @@ static const TypeInfo virtio_balloon_pci_info = {
 .class_init= virtio_balloon_pci_class_init,
 };
 
+/* virtio-iommu-pci */
+
+static Property virtio_iommu_pci_properties[] = {
+DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
+DEFINE_PROP_BOOL("msi-bypass", VirtIOIOMMUPCI, vdev.msi_bypass, true),
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static void virtio_iommu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(vpci_dev);
+DeviceState *vdev = DEVICE(&dev->vdev);
+
+qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
+object_property_set_link(OBJECT(dev),
+ OBJECT(pci_get_bus(&vpci_dev->pci_dev)),
+ "primary-bus", errp);
+object_property_set_bool(OBJECT(vdev), true, "realized", errp);
+}
+
+static void virtio_iommu_pci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+k->realize = virtio_iommu_pci_realize;
+set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+dc->props = virtio_iommu_pci_properties;
+pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_IOMMU;
+pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+pcidev_k->class_id = PCI_CLASS_OTHERS;
+}
+
+static void virtio_iommu_pci_instance_init(Object *obj)
+{
+VirtIOIOMMUPCI *dev = VIRTIO_IOMMU_PCI(obj);
+
+virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+TYPE_VIRTIO_IOMMU);
+}
+
+static const TypeInfo virtio_iommu_pci_info = {
+.name  = TYPE_VIRTIO_IOMMU_PCI,
+.parent= TYPE_VIRTIO_PCI,
+.instance_size = sizeof(VirtIOIOMMUPCI),
+.instance_init = virtio_iommu_pci_instance_init,
+.class_init= virtio_iommu_pci_class_init,
+};
+
 /* virtio-serial-pci */
 
 static void virtio_serial_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
@@ -2712,6 +2762,7 @@ static void virtio_pci_register_types(void)
 #endif
 type_register_static(&virtio_scsi_pci_info);
 type_register_static(&virtio_balloon_pci_info);
+type_register_static(&virtio_iommu_pci_info);
 type_register_static(&virtio_serial_pci_info);
 type_register_static(&virtio_net_pci_info);
 #ifdef CONFIG_VHOST_SCSI
diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
index 813082b0d7..a17100b4a0 100644
--- a/hw/virtio/virtio-pci.h
+++ b/hw/virtio/virtio-pci.h
@@ -26,6 +26,7 @@
 #include "hw/virtio/virtio-input.h"
 #include "hw/virtio/virtio-gpu.h"
 #include "hw/virtio/virtio-crypto.h"
+#include "hw/virtio/virtio-iommu.h"
 #include "hw/virtio/vhost-user-scsi.h"
 #if defined(CONFIG_VHOST_USER) && defined(CONFIG_LINUX)
 #include "hw/virtio/vhost-user-blk.h"
@@ -57,6 +58,7 @@ typedef struct VirtIOInputHostPCI VirtIOInputHostPCI;
 typedef struct VirtIOGPUPCI VirtIOGPUPCI;
 typedef struct VHostVSockPCI VHostVSockPCI;
 typedef struct VirtIOCryptoPCI VirtIOCryptoPCI;
+typedef struct VirtIOIOMMUPCI VirtIOIOMMUPCI;
 
 /* virtio-pci-bus */
 
@@ -363,6 +365,18 @@ struct VirtIOInputHIDPCI {
 VirtIOInputHID vdev;
 };
 
+/*
+ *  * virtio-iommu-pci: This extends VirtioPCIProxy.
+ *   */
+#define TYPE_VIRTIO_IOMMU_PCI "virtio-iommu-pci"
+#define VIRTIO_IOMMU_PCI(obj) \
+OBJECT_CHECK(VirtIOIOMMUPCI, (obj), TYPE_VIRTIO_IOMMU_PCI)
+
+struct VirtIOIOMMUPCI {
+VirtIOPCIProxy parent_obj;
+VirtIOIOMMU vdev;
+};
+
 #ifdef CONFIG_LINUX
 
 #define TYPE_VIRTIO_INPUT_HOST_PCI "virtio-input-host-pci"
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index e6514bba23..a01fd4d2e1 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -85,6 +85,7 @@ extern bool pci_available;
 #define PCI_DEVICE_ID_VIRTIO_RNG 0x1005
 #define PCI_DEVICE_ID_VIRTIO_9P  0x1009
 #define PCI_DEVICE_ID_VIRTIO_VSOCK   0x1012
+#define PCI_DEVICE_ID_VIRTIO_IOMMU   0x1013
 
 #define PCI_VENDOR_ID_REDHAT 0x1b36
 #define PCI_DEVICE_ID_REDHAT_BRIDGE  0x0001
diff --git a/qdev-monitor.c b/qdev-monitor.c
index 07147c63bf..4f1ae056da 100644
--- a/qdev-monitor.c
+++ b/qdev-monitor.c
@@ -62,6 +62,7 @@ static const QDevAlias qdev_alias_table[] = {
 { "virtio-input-host-ccw", "virtio-input-host", QEMU_ARCH_S390X },
 { "virtio-input-host-pci", "virtio-input-host",
 QEMU_ARCH_ALL & ~QEMU_ARCH_S390X },
+{ "virtio-iommu-pci",

[Qemu-devel] [RFC v9 01/17] update-linux-headers: Import virtio_iommu.h

2018-11-22 Thread Eric Auger

Update the script to update the virtio_iommu.h header.

Signed-off-by: Eric Auger 
---
 scripts/update-linux-headers.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index 0a964fe240..55fd271a32 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -159,6 +159,9 @@ fi
 cat <<EOF >$output/linux-headers/linux/virtio_config.h
 #include "standard-headers/linux/virtio_config.h"
 EOF
+cat <<EOF >$output/linux-headers/linux/virtio_iommu.h
+#include "standard-headers/linux/virtio_iommu.h"
+EOF
 cat <<EOF >$output/linux-headers/linux/virtio_ring.h
 #include "standard-headers/linux/virtio_ring.h"
 EOF
-- 
2.17.2

[Qemu-devel] [RFC v9 15/17] hw/arm/virt: Add the virtio-iommu device tree mappings

2018-11-22 Thread Eric Auger

Adds the "virtio,pci-iommu" node in the host bridge node and
the RID mapping, excluding the IOMMU RID.

Signed-off-by: Eric Auger 

---

v8 -> v9:
- disable msi-bypass property
- addition of the subnode is handled in the hotplug handler
  and IOMMU RID is not imposed anymore

v6 -> v7:
- align to the smmu instantiation code

v4 -> v5:
- VirtMachineClass no_iommu added in this patch
- Use object_resolve_path_type
---
 hw/arm/virt.c | 57 +--
 include/hw/arm/virt.h |  2 ++
 2 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index a2b8d8f7c2..b2bbb0ef49 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -29,6 +29,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "monitor/qdev.h"
 #include "qapi/error.h"
 #include "hw/sysbus.h"
 #include "hw/arm/arm.h"
@@ -49,6 +50,7 @@
 #include "qemu/bitops.h"
 #include "qemu/error-report.h"
 #include "hw/pci-host/gpex.h"
+#include "hw/virtio/virtio-pci.h"
 #include "hw/arm/sysbus-fdt.h"
 #include "hw/platform-bus.h"
 #include "hw/arm/fdt.h"
@@ -59,6 +61,7 @@
 #include "qapi/visitor.h"
 #include "standard-headers/linux/input.h"
 #include "hw/arm/smmuv3.h"
+#include "hw/virtio/virtio-iommu.h"
 
 #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \
 static void virt_##major##_##minor##_class_init(ObjectClass *oc, \
@@ -1085,6 +1088,33 @@ static void create_smmu(const VirtMachineState *vms, 
qemu_irq *pic,
 g_free(node);
 }
 
+static void create_virtio_iommu(VirtMachineState *vms, Error **errp)
+{
+const char compat[] = "virtio,pci-iommu";
+uint16_t bdf = vms->virtio_iommu_bdf;
+char *node;
+
+vms->iommu_phandle = qemu_fdt_alloc_phandle(vms->fdt);
+
+node = g_strdup_printf("%s/virtio_iommu@%d", vms->pciehb_nodename, bdf);
+qemu_fdt_add_subnode(vms->fdt, node);
+qemu_fdt_setprop(vms->fdt, node, "compatible", compat, sizeof(compat));
+qemu_fdt_setprop_sized_cells(vms->fdt, node, "reg",
+ 1, bdf << 8 /* phys.hi */,
+ 1, 0/* phys.mid */,
+ 1, 0/* phys.lo  */,
+ 1, 0/* size.hi  */,
+ 1, 0/* size.low */);
+
+qemu_fdt_setprop_cell(vms->fdt, node, "#iommu-cells", 1);
+qemu_fdt_setprop_cell(vms->fdt, node, "phandle", vms->iommu_phandle);
+g_free(node);
+
+qemu_fdt_setprop_cells(vms->fdt, vms->pciehb_nodename, "iommu-map",
+   0x0, vms->iommu_phandle, 0x0, bdf,
+   bdf + 1, vms->iommu_phandle, bdf + 1, 0xffff - bdf);
+}
+
 static void create_pcie(VirtMachineState *vms, qemu_irq *pic)
 {
 hwaddr base_mmio = vms->memmap[VIRT_PCIE_MMIO].base;
@@ -1162,7 +1192,7 @@ static void create_pcie(VirtMachineState *vms, qemu_irq 
*pic)
 }
 }
 
-nodename = g_strdup_printf("/pcie@%" PRIx64, base);
+nodename = vms->pciehb_nodename = g_strdup_printf("/pcie@%" PRIx64, base);
 qemu_fdt_add_subnode(vms->fdt, nodename);
 qemu_fdt_setprop_string(vms->fdt, nodename,
 "compatible", "pci-host-ecam-generic");
@@ -1205,13 +1235,17 @@ static void create_pcie(VirtMachineState *vms, qemu_irq 
*pic)
 if (vms->iommu) {
 vms->iommu_phandle = qemu_fdt_alloc_phandle(vms->fdt);
 
-create_smmu(vms, pic, pci->bus);
+switch (vms->iommu) {
+case VIRT_IOMMU_SMMUV3:
+create_smmu(vms, pic, pci->bus);
+qemu_fdt_setprop_cells(vms->fdt, nodename, "iommu-map",
+   0x0, vms->iommu_phandle, 0x0, 0x1);
+break;
+default:
+g_assert_not_reached();
+}
 
-qemu_fdt_setprop_cells(vms->fdt, nodename, "iommu-map",
-   0x0, vms->iommu_phandle, 0x0, 0x1);
 }
-
-g_free(nodename);
 }
 
 static void create_platform_bus(VirtMachineState *vms, qemu_irq *pic)
@@ -1736,12 +1770,21 @@ static void virt_machine_device_plug_cb(HotplugHandler 
*hotplug_dev,
  SYS_BUS_DEVICE(dev));
 }
 }
+if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
+PCIDevice *pdev = PCI_DEVICE(dev);
+
+vms->iommu = VIRT_IOMMU_VIRTIO;
+vms->virtio_iommu_bdf = pci_get_bdf(pdev);
+object_property_set_bool(OBJECT(dev), false, "msi-bypass", errp);
+create_virtio_iommu(vms, errp);
+}
 }
 
 static HotplugHandler *virt_machine_get_hotplug_handler(MachineState *machine,
 DeviceState *dev)
 {
-if (object_dynamic_cast(OBJECT(dev), TYPE_SYS_BUS_DEVICE)) {
+if (object_dynamic_cast(OBJECT(dev), TYPE_SYS_BUS_DEVICE) ||
+object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_IOMMU_PCI)) {
 return HOTPLUG_HANDLER(machine);
 }
 
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index

Re: [Qemu-devel] [PULL 00/13] Block layer patches

2018-11-22 Thread Peter Maydell

On 22 November 2018 at 16:54, Kevin Wolf  wrote:
> The following changes since commit 47c1cc30e440860aa695358f7c2dd0b9d7b53d16:
>
>   Update version for v3.1.0-rc2 release (2018-11-20 18:10:26 +)
>
> are available in the Git repository at:
>
>   git://repo.or.cz/qemu/kevin.git tags/for-upstream
>
> for you to fetch changes up to 924956b1efc50af7cc334b7a14f56aa213ca27ef:
>
>   iotests: Enhance 223 to cover multiple bitmap granularities (2018-11-22 
> 16:43:52 +0100)
>
> 
> Block layer patches:
>
> - block: Fix update of BDRV_O_AUTO_RDONLY in update_flags_from_options()
> - qemu-img: Fix memory leak and typo in error message
> - nvme: Fixes for lockups and crashes
> - scsi-disk: Fix crash if underlying host file or disk returns error
> - Several qemu-iotests fixes and improvements
>
> 

Hi; this seems to fail make check on s390x, sparc64, ppc64 (ie all
the bigendian hosts):

TEST: tests/nvme-test... (pid=12356)
  /i386/nvme/nop:  OK
  /i386/nvme/cmb_test: **
ERROR:/home/linux1/qemu/tests/nvme-test.c:60:nvmetest_cmb_test:
assertion failed (qpci_io_re
adb(pdev, bar, 0) == 0x99): (0 == 153)
FAIL

thanks
-- PMM

[Qemu-devel] [RFC v9 04/17] virtio-iommu: Decode the command payload

2018-11-22 Thread Eric Auger

This patch adds the command payload decoding and
introduces the functions that will do the actual
command handling. Those functions are not yet implemented.

Signed-off-by: Eric Auger 

---
v7 -> v8:
- handle new domain parameter in detach
- remove reserved checks

v5 -> v6:
- change map/unmap semantics (remove size)

v4 -> v5:
- adopt new v0.5 terminology

v3 -> v4:
- no flags field anymore in struct virtio_iommu_req_unmap
- test reserved on attach/detach, change trace proto
- rebase on v2.10.0.
---
 hw/virtio/trace-events   |  4 ++
 hw/virtio/virtio-iommu.c | 95 ++--
 2 files changed, 95 insertions(+), 4 deletions(-)

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 84da904d8b..e6177ca0e4 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -55,3 +55,7 @@ virtio_iommu_set_features(uint64_t features) "features 
accepted by the driver =0
 virtio_iommu_device_status(uint8_t status) "driver status = %d"
 virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, 
uint8_t domain_bits, uint32_t probe_size) "page_size_mask=0x%"PRIx64" 
start=0x%"PRIx64" end=0x%"PRIx64" domain_bits=%d probe_size=0x%x"
 virtio_iommu_set_config(uint64_t page_size_mask, uint64_t start, uint64_t end, 
uint8_t domain_bits, uint32_t probe_size) "page_size_mask=0x%"PRIx64" 
start=0x%"PRIx64" end=0x%"PRIx64" domain_bits=%d probe_size=0x%x"
+virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
+virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
+virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, 
uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" 
virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d"
+virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) 
"domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index d8894f047b..fc95751c40 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -33,29 +33,116 @@
 /* Max size */
 #define VIOMMU_DEFAULT_QUEUE_SIZE 256
 
+static int virtio_iommu_attach(VirtIOIOMMU *s,
+   struct virtio_iommu_req_attach *req)
+{
+uint32_t domain_id = le32_to_cpu(req->domain);
+uint32_t ep_id = le32_to_cpu(req->endpoint);
+
+trace_virtio_iommu_attach(domain_id, ep_id);
+
+return VIRTIO_IOMMU_S_UNSUPP;
+}
+
+static int virtio_iommu_detach(VirtIOIOMMU *s,
+   struct virtio_iommu_req_detach *req)
+{
+uint32_t domain_id = le32_to_cpu(req->domain);
+uint32_t ep_id = le32_to_cpu(req->endpoint);
+
+trace_virtio_iommu_detach(domain_id, ep_id);
+
+return VIRTIO_IOMMU_S_UNSUPP;
+}
+
+static int virtio_iommu_map(VirtIOIOMMU *s,
+struct virtio_iommu_req_map *req)
+{
+uint32_t domain_id = le32_to_cpu(req->domain);
+uint64_t phys_start = le64_to_cpu(req->phys_start);
+uint64_t virt_start = le64_to_cpu(req->virt_start);
+uint64_t virt_end = le64_to_cpu(req->virt_end);
+uint32_t flags = le32_to_cpu(req->flags);
+
+trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags);
+
+return VIRTIO_IOMMU_S_UNSUPP;
+}
+
+static int virtio_iommu_unmap(VirtIOIOMMU *s,
+  struct virtio_iommu_req_unmap *req)
+{
+uint32_t domain_id = le32_to_cpu(req->domain);
+uint64_t virt_start = le64_to_cpu(req->virt_start);
+uint64_t virt_end = le64_to_cpu(req->virt_end);
+
+trace_virtio_iommu_unmap(domain_id, virt_start, virt_end);
+
+return VIRTIO_IOMMU_S_UNSUPP;
+}
+
+#define get_payload_size(req) (\
+sizeof((req)) - sizeof(struct virtio_iommu_req_tail))
+
 static int virtio_iommu_handle_attach(VirtIOIOMMU *s,
   struct iovec *iov,
   unsigned int iov_cnt)
 {
-return -ENOENT;
+struct virtio_iommu_req_attach req;
+size_t sz, payload_sz;
+
+payload_sz = get_payload_size(req);
+
+sz = iov_to_buf(iov, iov_cnt, 0, , payload_sz);
+if (sz != payload_sz) {
+return VIRTIO_IOMMU_S_INVAL;
+}
+return virtio_iommu_attach(s, );
 }
 static int virtio_iommu_handle_detach(VirtIOIOMMU *s,
   struct iovec *iov,
   unsigned int iov_cnt)
 {
-return -ENOENT;
+struct virtio_iommu_req_detach req;
+size_t sz, payload_sz;
+
+payload_sz = get_payload_size(req);
+
+sz = iov_to_buf(iov, iov_cnt, 0, , payload_sz);
+if (sz != payload_sz) {
+return VIRTIO_IOMMU_S_INVAL;
+}
+return virtio_iommu_detach(s, );
 }
 static int virtio_iommu_handle_map(VirtIOIOMMU *s,
struct iovec *iov,
unsigned int iov_cnt)
 {
-return -ENOENT;
+struct virtio_iommu_req_map req;
+size_t sz, payload_sz;
+
+

[Qemu-devel] [RFC v9 11/17] virtio-iommu: Expose the IOAPIC MSI reserved region when relevant

2018-11-22 Thread Eric Auger

We introduce a new msi_bypass field which indicates whether
the IOAPIC MSI window [0xFEE00000 - 0xFEEFFFFF] must be exposed
as a reserved region. By default the field is set to true at
instantiation time. Later on we will introduce a property at
virtio pci proxy level to turn it off.

Signed-off-by: Eric Auger 

---

v8 -> v9:
- pass IOAPIC_RANGE_END to virtio_iommu_register_resv_region
- take into account the change in the struct virtio_iommu_probe_resv_mem
  definition
- We just introduce the field here. A property will be introduced later on
  at pci proxy level.
---
 hw/virtio/virtio-iommu.c | 36 
 include/hw/virtio/virtio-iommu.h |  1 +
 2 files changed, 37 insertions(+)

diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 4fc43494d9..324518c300 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -41,6 +41,9 @@
 #define VIOMMU_DEFAULT_QUEUE_SIZE 256
 #define VIOMMU_PROBE_SIZE 512
 
+#define IOAPIC_RANGE_START  (0xfee0)
+#define IOAPIC_RANGE_END(0xfeef)
+
 #define SUPPORTED_PROBE_PROPERTIES (\
 1 << VIRTIO_IOMMU_PROBE_T_RESV_MEM)
 
@@ -102,6 +105,30 @@ static void 
virtio_iommu_detach_endpoint_from_domain(viommu_endpoint *ep)
 ep->domain = NULL;
 }
 
+static void virtio_iommu_register_resv_region(viommu_endpoint *ep,
+  uint8_t subtype,
+  uint64_t start, uint64_t end)
+{
+viommu_interval *interval;
+struct virtio_iommu_probe_resv_mem *resv_reg_prop;
+size_t prop_size = sizeof(struct virtio_iommu_probe_resv_mem);
+size_t value_size = prop_size -
+sizeof(struct virtio_iommu_probe_property);
+
+interval = g_malloc0(sizeof(*interval));
+interval->low = start;
+interval->high = end;
+
+resv_reg_prop = g_malloc0(prop_size);
+resv_reg_prop->head.type = VIRTIO_IOMMU_PROBE_T_RESV_MEM;
+resv_reg_prop->head.length = cpu_to_le64(value_size);
+resv_reg_prop->subtype = cpu_to_le64(subtype);
+resv_reg_prop->start = cpu_to_le64(start);
+resv_reg_prop->end = cpu_to_le64(end);
+
+g_tree_insert(ep->reserved_regions, interval, resv_reg_prop);
+}
+
 static viommu_endpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
   uint32_t ep_id)
 {
@@ -119,6 +146,12 @@ static viommu_endpoint 
*virtio_iommu_get_endpoint(VirtIOIOMMU *s,
 ep->reserved_regions = g_tree_new_full((GCompareDataFunc)interval_cmp,
 NULL, (GDestroyNotify)g_free,
 (GDestroyNotify)g_free);
+if (s->msi_bypass) {
+virtio_iommu_register_resv_region(ep, VIRTIO_IOMMU_RESV_MEM_T_MSI,
+  IOAPIC_RANGE_START,
+  IOAPIC_RANGE_END);
+}
+
 return ep;
 }
 
@@ -858,6 +891,9 @@ static void virtio_iommu_set_status(VirtIODevice *vdev, 
uint8_t status)
 
 static void virtio_iommu_instance_init(Object *obj)
 {
+VirtIOIOMMU *s = VIRTIO_IOMMU(obj);
+
+s->msi_bypass = true;
 }
 
 static const VMStateDescription vmstate_virtio_iommu = {
diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h
index f55f48d304..56c8b4e57f 100644
--- a/include/hw/virtio/virtio-iommu.h
+++ b/include/hw/virtio/virtio-iommu.h
@@ -59,6 +59,7 @@ typedef struct VirtIOIOMMU {
 GTree *domains;
 QemuMutex mutex;
 GTree *endpoints;
+bool msi_bypass;
 } VirtIOIOMMU;
 
 #endif
-- 
2.17.2

[Qemu-devel] [PATCH v2 for-4.0 2/3] monitor: Use address_space_read() to read memory

2018-11-22 Thread Peter Maydell

Currently monitor.c reads physical memory using
cpu_physical_memory_read(). This effectively hard-codes
assuming that all CPUs have the same view of physical
memory. Switch to address_space_read() instead, which
lets us use the AddressSpace for the CPU we're
reading memory for (falling back to address_space_memory
if there is no CPU, as happens with the "none" board).
As a bonus, this allows us to detect failures to read memory.

Signed-off-by: Peter Maydell 
Reviewed-by: Dr. David Alan Gilbert 
---
 monitor.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/monitor.c b/monitor.c
index d39390c2f2f..b0e8f2c490a 100644
--- a/monitor.c
+++ b/monitor.c
@@ -1604,7 +1604,13 @@ static void memory_dump(Monitor *mon, int count, int 
format, int wsize,
 if (l > line_size)
 l = line_size;
 if (is_physical) {
-cpu_physical_memory_read(addr, buf, l);
+AddressSpace *as = cs ? cs->as : _space_memory;
+MemTxResult r = address_space_read(as, addr,
+   MEMTXATTRS_UNSPECIFIED, buf, l);
+if (r != MEMTX_OK) {
+monitor_printf(mon, " Cannot access memory\n");
+break;
+}
 } else {
 if (cpu_memory_rw_debug(cs, addr, buf, l, 0) < 0) {
 monitor_printf(mon, " Cannot access memory\n");
-- 
2.19.1

[Qemu-devel] [RFC v9 16/17] hw/arm/virt-acpi-build: Introduce fill_iort_idmap helper

2018-11-22 Thread Eric Auger

To avoid code duplication, let's introduce a helper that
fills one entry of an IORT ID mapping array.

Signed-off-by: Eric Auger 

---

v8: new
---
 hw/arm/virt-acpi-build.c | 43 
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index 5785fb697c..ec7c4835fe 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -396,6 +396,17 @@ build_rsdp(GArray *rsdp_table, BIOSLinker *linker, 
unsigned xsdt_tbl_offset)
 return rsdp_table;
 }
 
+static inline void
+fill_iort_idmap(AcpiIortIdMapping *idmap, int i,
+uint32_t input_base, uint32_t id_count,
+uint32_t output_base, uint32_t output_reference)
+{
+idmap[i].input_base = cpu_to_le32(input_base);
+idmap[i].id_count = cpu_to_le32(id_count);
+idmap[i].output_base = cpu_to_le32(output_base);
+idmap[i].output_reference = cpu_to_le32(output_reference);
+}
+
 static void
 build_iort(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
 {
@@ -453,13 +464,12 @@ build_iort(GArray *table_data, BIOSLinker *linker, 
VirtMachineState *vms)
 smmu->gerr_gsiv = cpu_to_le32(irq + 2);
 smmu->sync_gsiv = cpu_to_le32(irq + 3);
 
-/* Identity RID mapping covering the whole input RID range */
-idmap = >id_mapping_array[0];
-idmap->input_base = 0;
-idmap->id_count = cpu_to_le32(0x);
-idmap->output_base = 0;
-/* output IORT node is the ITS group node (the first node) */
-idmap->output_reference = cpu_to_le32(iort_node_offset);
+/*
+ * Identity RID mapping covering the whole input RID range.
+ * The output IORT node is the ITS group node (the first node).
+ */
+fill_iort_idmap(smmu->id_mapping_array, 0, 0, 0x, 0,
+iort_node_offset);
 }
 
 /* Root Complex Node */
@@ -477,18 +487,17 @@ build_iort(GArray *table_data, BIOSLinker *linker, 
VirtMachineState *vms)
 rc->memory_properties.memory_flags = 0x3; /* CCA = CPM = DCAS = 1 */
 rc->pci_segment_number = 0; /* MCFG pci_segment */
 
-/* Identity RID mapping covering the whole input RID range */
-idmap = >id_mapping_array[0];
-idmap->input_base = 0;
-idmap->id_count = cpu_to_le32(0x);
-idmap->output_base = 0;
-
 if (vms->iommu == VIRT_IOMMU_SMMUV3) {
-/* output IORT node is the smmuv3 node */
-idmap->output_reference = cpu_to_le32(smmu_offset);
+/* Identity RID mapping and output IORT node is the iommu node */
+fill_iort_idmap(rc->id_mapping_array, 0, 0, 0x, 0,
+smmu_offset);
 } else {
-/* output IORT node is the ITS group node (the first node) */
-idmap->output_reference = cpu_to_le32(iort_node_offset);
+/*
+ * Identity RID mapping and the output IORT node is the ITS group
+ * node (the first node).
+ */
+fill_iort_idmap(rc->id_mapping_array, 0, 0, 0x, 0,
+iort_node_offset);
 }
 
 /*
-- 
2.17.2

Re: [Qemu-devel] [RFC 11/48] atomic_template: fix indentation in GEN_ATOMIC_HELPER

2018-11-22 Thread Alex Bennée



Emilio G. Cota  writes:

> Signed-off-by: Emilio G. Cota 

Reviewed-by: Alex Bennée 

> ---
>  accel/tcg/atomic_template.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/accel/tcg/atomic_template.h b/accel/tcg/atomic_template.h
> index efde12fdb2..8d177fefef 100644
> --- a/accel/tcg/atomic_template.h
> +++ b/accel/tcg/atomic_template.h
> @@ -284,7 +284,7 @@ ABI_TYPE ATOMIC_NAME(xchg)(CPUArchState *env, 
> target_ulong addr,
>
>  #define GEN_ATOMIC_HELPER(X)\
>  ABI_TYPE ATOMIC_NAME(X)(CPUArchState *env, target_ulong addr,   \
> - ABI_TYPE val EXTRA_ARGS)   \
> +ABI_TYPE val EXTRA_ARGS)\
>  {   \
>  ATOMIC_MMU_DECLS;   \
>  DATA_TYPE *haddr = ATOMIC_MMU_LOOKUP;   \


--
Alex Bennée

[Qemu-devel] [PATCH v2 for-4.0 0/3] Avoid cpu_physical_memory_read() in generic code

2018-11-22 Thread Peter Maydell

This patchset takes three places in generic code which
use cpu_physical_memory_read(), and changes them to use
address_space_read() instead.

Changes v1->v2:
 * patches 1, 2 unchanged (and reviewed)
 * patch 3: handle as being NULL (which for the
   load_elf APIs means "use address_space_memory")

cpu_physical_memory_{read,rw,write} all implicitly assume
that there is exactly one view of physical memory. This
is sort-of true today, but we'd like to be able to move
to having heterogeneous systems where not all CPUs share
the same view of physical memory.

In disas.c we are disassembling for a particular CPU, so
use that CPU's primary address space (cs->as).

In monitor.c we are reading physical memory for a
particular CPU, so again use that CPU's primary address
space; we fall back to address_space_memory for the case
where there are no CPUs in the system (-machine none).

In elf_ops.h the function was passed an address space to
use, so just use it.

Other places in generic code that use these functions are:
 * dump.c -- the whole UI here seems to assume that there
   is only one view of memory and that is what is being dumped
 * cpu.c:qmp_pmemsave() -- again, the UI assumption is that
   there's only one view of memory
So I've left those alone.

NB: git grep command line for finding callsites:
 git grep '\'

thanks
-- PMM


Peter Maydell (3):
  disas.c: Use address_space_read() to read memory
  monitor: Use address_space_read() to read memory
  elf_ops.h: Use address_space_write() to write memory

 include/hw/elf_ops.h | 4 +++-
 disas.c  | 5 -
 monitor.c| 8 +++-
 3 files changed, 14 insertions(+), 3 deletions(-)

-- 
2.19.1

[Qemu-devel] [RFC v9 13/17] virtio_iommu: Handle reserved regions in translation process

2018-11-22 Thread Eric Auger

When translating an address we need to check if it belongs to
a reserved virtual address range. If it does, there are 2 cases:

- it belongs to a RESERVED region: the guest should neither use
  this address in a MAP request nor instruct the endpoint to DMA
  to it. We report an error.

- It belongs to an MSI region: we bypass the translation.

Signed-off-by: Eric Auger 
---
 hw/virtio/virtio-iommu.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 1246dd6bdf..2ec01f3b9e 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -740,6 +740,7 @@ static IOMMUTLBEntry 
virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 viommu_interval interval;
 bool bypass_allowed;
 bool read_fault, write_fault;
+struct virtio_iommu_probe_resv_mem *reg;
 
 interval.low = addr;
 interval.high = addr + 1;
@@ -772,6 +773,21 @@ static IOMMUTLBEntry 
virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 goto unlock;
 }
 
+reg = g_tree_lookup(ep->reserved_regions, (gpointer)());
+if (reg) {
+switch (reg->subtype) {
+case VIRTIO_IOMMU_RESV_MEM_T_MSI:
+entry.perm = flag;
+break;
+case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
+default:
+virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
+  0, sid, addr);
+break;
+}
+goto unlock;
+}
+
 if (!ep->domain) {
 if (!bypass_allowed) {
 qemu_log_mask(LOG_GUEST_ERROR,
-- 
2.17.2

[Qemu-devel] [RFC v9 09/17] virtio-iommu: Implement translate

2018-11-22 Thread Eric Auger

This patch implements the translate callback

Signed-off-by: Eric Auger 

---
v6 -> v7:
- implemented bypass-mode

v5 -> v6:
- replace error_report by qemu_log_mask

v4 -> v5:
- check the device domain is not NULL
- s/printf/error_report
- set flags to IOMMU_NONE in case of all translation faults
---
 hw/virtio/trace-events   |  1 +
 hw/virtio/virtio-iommu.c | 57 +++-
 2 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index b3c5c2604e..1f0e143b55 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -68,3 +68,4 @@ virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d"
 virtio_iommu_unmap_left_interval(uint64_t low, uint64_t high, uint64_t 
next_low, uint64_t next_high) "Unmap left [0x%"PRIx64",0x%"PRIx64"], new 
interval=[0x%"PRIx64",0x%"PRIx64"]"
 virtio_iommu_unmap_right_interval(uint64_t low, uint64_t high, uint64_t 
next_low, uint64_t next_high) "Unmap right [0x%"PRIx64",0x%"PRIx64"], new 
interval=[0x%"PRIx64",0x%"PRIx64"]"
 virtio_iommu_unmap_inc_interval(uint64_t low, uint64_t high) "Unmap inc 
[0x%"PRIx64",0x%"PRIx64"]"
+virtio_iommu_translate_out(uint64_t virt_addr, uint64_t phys_addr, uint32_t 
sid) "0x%"PRIx64" -> 0x%"PRIx64 " for sid=%d"
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 23f7dc6f7f..af90413b37 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -480,19 +480,74 @@ static IOMMUTLBEntry 
virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 int iommu_idx)
 {
 IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
+VirtIOIOMMU *s = sdev->viommu;
 uint32_t sid;
+viommu_endpoint *ep;
+viommu_mapping *mapping;
+viommu_interval interval;
+bool bypass_allowed;
+
+interval.low = addr;
+interval.high = addr + 1;
 
 IOMMUTLBEntry entry = {
 .target_as = _space_memory,
 .iova = addr,
 .translated_addr = addr,
-.addr_mask = ~(hwaddr)0,
+.addr_mask = (1 << ctz32(s->config.page_size_mask)) - 1,
 .perm = IOMMU_NONE,
 };
 
+bypass_allowed = virtio_has_feature(s->acked_features,
+VIRTIO_IOMMU_F_BYPASS);
+
 sid = virtio_iommu_get_sid(sdev);
 
 trace_virtio_iommu_translate(mr->parent_obj.name, sid, addr, flag);
+qemu_mutex_lock(>mutex);
+
+ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
+if (!ep) {
+if (!bypass_allowed) {
+error_report("%s sid=%d is not known!!", __func__, sid);
+} else {
+entry.perm = flag;
+}
+goto unlock;
+}
+
+if (!ep->domain) {
+if (!bypass_allowed) {
+qemu_log_mask(LOG_GUEST_ERROR,
+  "%s %02x:%02x.%01x not attached to any domain\n",
+  __func__, PCI_BUS_NUM(sid), PCI_SLOT(sid), 
PCI_FUNC(sid));
+} else {
+entry.perm = flag;
+}
+goto unlock;
+}
+
+mapping = g_tree_lookup(ep->domain->mappings, (gpointer)());
+if (!mapping) {
+qemu_log_mask(LOG_GUEST_ERROR,
+  "%s no mapping for 0x%"PRIx64" for sid=%d\n",
+  __func__, addr, sid);
+goto unlock;
+}
+
+if (((flag & IOMMU_RO) && !(mapping->flags & VIRTIO_IOMMU_MAP_F_READ)) ||
+((flag & IOMMU_WO) && !(mapping->flags & VIRTIO_IOMMU_MAP_F_WRITE))) {
+qemu_log_mask(LOG_GUEST_ERROR,
+  "Permission error on 0x%"PRIx64"(%d): allowed=%d\n",
+  addr, flag, mapping->flags);
+goto unlock;
+}
+entry.translated_addr = addr - mapping->virt_addr + mapping->phys_addr;
+entry.perm = flag;
+trace_virtio_iommu_translate_out(addr, entry.translated_addr, sid);
+
+unlock:
+qemu_mutex_unlock(>mutex);
 return entry;
 }
 
-- 
2.17.2

Re: [Qemu-devel] [RFC 10/48] exec: export do_tb_flush

2018-11-22 Thread Alex Bennée



Emilio G. Cota  writes:

> This will be used by plugin code to flush the code cache as well
> as doing other bookkeeping in a safe work environment.

This seems a little excessive given the plugin code could just call
tb_flush() directly. Wouldn't calling tb_flush after scheduling the
plugin_destroy be enough?

If there is a race condition here maybe we could build some sort of
awareness into tb_flush as to the current run state. But having two
entry points to this rather fundamental action seems likely to either be
misused or misunderstood.

>
> Signed-off-by: Emilio G. Cota 
> ---
>  include/exec/exec-all.h   | 1 +
>  accel/tcg/translate-all.c | 2 +-
>  2 files changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 815e5b1e83..232e2f8966 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -427,6 +427,7 @@ void tb_invalidate_phys_range(target_ulong start, 
> target_ulong end);
>  void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs 
> attrs);
>  #endif
>  void tb_flush(CPUState *cpu);
> +void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count);
>  void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
>  TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
> target_ulong cs_base, uint32_t flags,
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index c8b3e0a491..db2d28f8d3 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -1230,7 +1230,7 @@ static gboolean tb_host_size_iter(gpointer key, 
> gpointer value, gpointer data)
>  }
>
>  /* flush all the translation blocks */
> -static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
> +void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
>  {
>  mmap_lock();
>  /* If it is already been done on request of another CPU,


--
Alex Bennée

[Qemu-devel] [RFC v9 07/17] virtio-iommu: Implement attach/detach command

2018-11-22 Thread Eric Auger

This patch implements the endpoint attach/detach to/from
a domain.

Signed-off-by: Eric Auger 

---
---
 hw/virtio/virtio-iommu.c | 40 ++--
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 1b9c3ba416..5c231f865c 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -82,8 +82,8 @@ static void 
virtio_iommu_detach_endpoint_from_domain(viommu_endpoint *ep)
 ep->domain = NULL;
 }
 
-viommu_endpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s, uint32_t ep_id);
-viommu_endpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s, uint32_t ep_id)
+static viommu_endpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
+  uint32_t ep_id)
 {
 viommu_endpoint *ep;
 
@@ -112,8 +112,8 @@ static void virtio_iommu_put_endpoint(gpointer data)
 g_free(ep);
 }
 
-viommu_domain *virtio_iommu_get_domain(VirtIOIOMMU *s, uint32_t domain_id);
-viommu_domain *virtio_iommu_get_domain(VirtIOIOMMU *s, uint32_t domain_id)
+static viommu_domain *virtio_iommu_get_domain(VirtIOIOMMU *s,
+  uint32_t domain_id)
 {
 viommu_domain *domain;
 
@@ -191,10 +191,27 @@ static int virtio_iommu_attach(VirtIOIOMMU *s,
 {
 uint32_t domain_id = le32_to_cpu(req->domain);
 uint32_t ep_id = le32_to_cpu(req->endpoint);
+viommu_domain *domain;
+viommu_endpoint *ep;
 
 trace_virtio_iommu_attach(domain_id, ep_id);
 
-return VIRTIO_IOMMU_S_UNSUPP;
+ep = virtio_iommu_get_endpoint(s, ep_id);
+if (ep->domain) {
+/*
+ * the device is already attached to a domain,
+ * detach it first
+ */
+virtio_iommu_detach_endpoint_from_domain(ep);
+}
+
+domain = virtio_iommu_get_domain(s, domain_id);
+QLIST_INSERT_HEAD(>endpoint_list, ep, next);
+
+ep->domain = domain;
+g_tree_ref(domain->mappings);
+
+return VIRTIO_IOMMU_S_OK;
 }
 
 static int virtio_iommu_detach(VirtIOIOMMU *s,
@@ -202,10 +219,21 @@ static int virtio_iommu_detach(VirtIOIOMMU *s,
 {
 uint32_t domain_id = le32_to_cpu(req->domain);
 uint32_t ep_id = le32_to_cpu(req->endpoint);
+viommu_endpoint *ep;
 
 trace_virtio_iommu_detach(domain_id, ep_id);
 
-return VIRTIO_IOMMU_S_UNSUPP;
+ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id));
+if (!ep) {
+return VIRTIO_IOMMU_S_NOENT;
+}
+
+if (!ep->domain) {
+return VIRTIO_IOMMU_S_INVAL;
+}
+
+virtio_iommu_detach_endpoint_from_domain(ep);
+return VIRTIO_IOMMU_S_OK;
 }
 
 static int virtio_iommu_map(VirtIOIOMMU *s,
-- 
2.17.2

[Qemu-devel] [RFC v9 08/17] virtio-iommu: Implement map/unmap

2018-11-22 Thread Eric Auger

This patch implements virtio_iommu_map/unmap.

Signed-off-by: Eric Auger 

---

v5 -> v6:
- use new v0.6 fields
- replace error_report by qemu_log_mask

v3 -> v4:
- implement unmap semantics as specified in v0.4
---
 hw/virtio/trace-events   |  3 ++
 hw/virtio/virtio-iommu.c | 94 +++-
 2 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 4b15086872..b3c5c2604e 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -65,3 +65,6 @@ virtio_iommu_get_endpoint(uint32_t ep_id) "Alloc endpoint=%d"
 virtio_iommu_put_endpoint(uint32_t ep_id) "Free endpoint=%d"
 virtio_iommu_get_domain(uint32_t domain_id) "Alloc domain=%d"
 virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d"
+virtio_iommu_unmap_left_interval(uint64_t low, uint64_t high, uint64_t 
next_low, uint64_t next_high) "Unmap left [0x%"PRIx64",0x%"PRIx64"], new 
interval=[0x%"PRIx64",0x%"PRIx64"]"
+virtio_iommu_unmap_right_interval(uint64_t low, uint64_t high, uint64_t 
next_low, uint64_t next_high) "Unmap right [0x%"PRIx64",0x%"PRIx64"], new 
interval=[0x%"PRIx64",0x%"PRIx64"]"
+virtio_iommu_unmap_inc_interval(uint64_t low, uint64_t high) "Unmap inc 
[0x%"PRIx64",0x%"PRIx64"]"
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 5c231f865c..23f7dc6f7f 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -18,6 +18,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/log.h"
 #include "qemu/iov.h"
 #include "qemu-common.h"
 #include "hw/virtio/virtio.h"
@@ -57,6 +58,13 @@ typedef struct viommu_interval {
 uint64_t high;
 } viommu_interval;
 
+typedef struct viommu_mapping {
+uint64_t virt_addr;
+uint64_t phys_addr;
+uint64_t size;
+uint32_t flags;
+} viommu_mapping;
+
 static inline uint16_t virtio_iommu_get_sid(IOMMUDevice *dev)
 {
 return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);
@@ -244,10 +252,37 @@ static int virtio_iommu_map(VirtIOIOMMU *s,
 uint64_t virt_start = le64_to_cpu(req->virt_start);
 uint64_t virt_end = le64_to_cpu(req->virt_end);
 uint32_t flags = le32_to_cpu(req->flags);
+viommu_domain *domain;
+viommu_interval *interval;
+viommu_mapping *mapping;
+
+interval = g_malloc0(sizeof(*interval));
+
+interval->low = virt_start;
+interval->high = virt_end;
+
+domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
+if (!domain) {
+return VIRTIO_IOMMU_S_NOENT;
+}
+
+mapping = g_tree_lookup(domain->mappings, (gpointer)interval);
+if (mapping) {
+g_free(interval);
+return VIRTIO_IOMMU_S_INVAL;
+}
 
 trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags);
 
-return VIRTIO_IOMMU_S_UNSUPP;
+mapping = g_malloc0(sizeof(*mapping));
+mapping->virt_addr = virt_start;
+mapping->phys_addr = phys_start;
+mapping->size = virt_end - virt_start + 1;
+mapping->flags = flags;
+
+g_tree_insert(domain->mappings, interval, mapping);
+
+return VIRTIO_IOMMU_S_OK;
 }
 
 static int virtio_iommu_unmap(VirtIOIOMMU *s,
@@ -256,10 +291,65 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s,
 uint32_t domain_id = le32_to_cpu(req->domain);
 uint64_t virt_start = le64_to_cpu(req->virt_start);
 uint64_t virt_end = le64_to_cpu(req->virt_end);
+uint64_t size = virt_end - virt_start + 1;
+viommu_mapping *mapping;
+viommu_interval interval;
+viommu_domain *domain;
 
 trace_virtio_iommu_unmap(domain_id, virt_start, virt_end);
 
-return VIRTIO_IOMMU_S_UNSUPP;
+domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
+if (!domain) {
+qemu_log_mask(LOG_GUEST_ERROR, "%s: no domain\n", __func__);
+return VIRTIO_IOMMU_S_NOENT;
+}
+interval.low = virt_start;
+interval.high = virt_end;
+
+mapping = g_tree_lookup(domain->mappings, (gpointer)());
+
+while (mapping) {
+viommu_interval current;
+uint64_t low  = mapping->virt_addr;
+uint64_t high = mapping->virt_addr + mapping->size - 1;
+
+current.low = low;
+current.high = high;
+
+if (low == interval.low && size >= mapping->size) {
+g_tree_remove(domain->mappings, (gpointer)());
+interval.low = high + 1;
+trace_virtio_iommu_unmap_left_interval(current.low, current.high,
+interval.low, interval.high);
+} else if (high == interval.high && size >= mapping->size) {
+trace_virtio_iommu_unmap_right_interval(current.low, current.high,
+interval.low, interval.high);
+g_tree_remove(domain->mappings, (gpointer)());
+interval.high = low - 1;
+} else if (low > interval.low && high < interval.high) {
+trace_virtio_iommu_unmap_inc_interval(current.low, current.high);
+g_tree_remove(domain->mappings, (gpointer)());
+} else {
+break;
+

[Qemu-devel] [RFC v9 17/17] hw/arm/virt-acpi-build: Add virtio-iommu node in IORT table

2018-11-22 Thread Eric Auger

This patch builds the virtio-iommu node in the ACPI IORT table.

The RID space of the root complex, which spans 0x0-0x10000,
maps to streamid space 0x0-0x10000 in the virtio-iommu, which in
turn maps to deviceid space 0x0-0x10000 in the ITS group.

The iommu RID is excluded, as described in the virtio-iommu
specification.

Signed-off-by: Eric Auger 

---
v8 -> v9:
- iommu RID is not fixed anymore

v7 -> v8:
- exclude the iommu RID (0x8) in the root complex ID mapping
---
 hw/arm/virt-acpi-build.c| 50 ++---
 include/hw/acpi/acpi-defs.h | 21 +++-
 2 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index ec7c4835fe..0e621f6551 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -414,14 +414,14 @@ build_iort(GArray *table_data, BIOSLinker *linker, 
VirtMachineState *vms)
 AcpiIortIdMapping *idmap;
 AcpiIortItsGroup *its;
 AcpiIortTable *iort;
-AcpiIortSmmu3 *smmu;
-size_t node_size, iort_node_offset, iort_length, smmu_offset = 0;
+size_t node_size, iort_node_offset, iort_length, iommu_offset = 0;
 AcpiIortRC *rc;
+int nb_rc_idmappings = 1;
 
 iort = acpi_data_push(table_data, sizeof(*iort));
 
-if (vms->iommu == VIRT_IOMMU_SMMUV3) {
-nb_nodes = 3; /* RC, ITS, SMMUv3 */
+if (vms->iommu) {
+nb_nodes = 3; /* RC, ITS, IOMMU */
 } else {
 nb_nodes = 2; /* RC, ITS */
 }
@@ -446,10 +446,10 @@ build_iort(GArray *table_data, BIOSLinker *linker, 
VirtMachineState *vms)
 its->identifiers[0] = 0; /* MADT translation_id */
 
 if (vms->iommu == VIRT_IOMMU_SMMUV3) {
+AcpiIortSmmu3 *smmu;
 int irq =  vms->irqmap[VIRT_SMMU];
 
-/* SMMUv3 node */
-smmu_offset = iort_node_offset + node_size;
+iommu_offset = iort_node_offset + node_size;
 node_size = sizeof(*smmu) + sizeof(*idmap);
 iort_length += node_size;
 smmu = acpi_data_push(table_data, node_size);
@@ -470,16 +470,38 @@ build_iort(GArray *table_data, BIOSLinker *linker, 
VirtMachineState *vms)
  */
 fill_iort_idmap(smmu->id_mapping_array, 0, 0, 0x, 0,
 iort_node_offset);
+} else if (vms->iommu == VIRT_IOMMU_VIRTIO) {
+AcpiIortPVIommuPCI *iommu;
+
+nb_rc_idmappings = 2;
+iommu_offset = iort_node_offset + node_size;
+node_size = sizeof(*iommu) + sizeof(*idmap);
+iort_length += node_size;
+iommu = acpi_data_push(table_data, node_size);
+
+iommu->type = ACPI_IORT_NODE_PARAVIRT;
+iommu->length = cpu_to_le16(node_size);
+iommu->mapping_count = cpu_to_le32(2);
+iommu->mapping_offset = cpu_to_le32(sizeof(*iommu));
+iommu->devid = cpu_to_le32(vms->virtio_iommu_bdf);
+iommu->model = cpu_to_le32(ACPI_IORT_NODE_PV_VIRTIO_IOMMU_PCI);
+
+/*
+ * Identity RID mapping covering the whole input RID range
+ * output IORT node is the ITS group node (the first node)
+ */
+fill_iort_idmap(iommu->id_mapping_array, 0, 0, 0x, 0,
+iort_node_offset);
 }
 
 /* Root Complex Node */
-node_size = sizeof(*rc) + sizeof(*idmap);
+node_size = sizeof(*rc) + nb_rc_idmappings * sizeof(*idmap);
 iort_length += node_size;
 rc = acpi_data_push(table_data, node_size);
 
 rc->type = ACPI_IORT_NODE_PCI_ROOT_COMPLEX;
 rc->length = cpu_to_le16(node_size);
-rc->mapping_count = cpu_to_le32(1);
+rc->mapping_count = cpu_to_le32(nb_rc_idmappings);
 rc->mapping_offset = cpu_to_le32(sizeof(*rc));
 
 /* fully coherent device */
@@ -490,7 +512,17 @@ build_iort(GArray *table_data, BIOSLinker *linker, 
VirtMachineState *vms)
 if (vms->iommu == VIRT_IOMMU_SMMUV3) {
 /* Identity RID mapping and output IORT node is the iommu node */
 fill_iort_idmap(rc->id_mapping_array, 0, 0, 0x, 0,
-smmu_offset);
+iommu_offset);
+} else if (vms->iommu == VIRT_IOMMU_VIRTIO) {
+/*
+ * Identity mapping with the IOMMU RID (0x8) excluded. The output
+ * IORT node is the iommu node.
+ */
+fill_iort_idmap(rc->id_mapping_array, 0, 0, vms->virtio_iommu_bdf, 0,
+iommu_offset);
+fill_iort_idmap(rc->id_mapping_array, 1, vms->virtio_iommu_bdf + 1,
+0x - vms->virtio_iommu_bdf,
+vms->virtio_iommu_bdf + 1, iommu_offset);
 } else {
 /*
  * Identity RID mapping and the output IORT node is the ITS group
diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h
index af8e023968..b14aa95dc1 100644
--- a/include/hw/acpi/acpi-defs.h
+++ b/include/hw/acpi/acpi-defs.h
@@ -601,7 +601,8 @@ enum {
 ACPI_IORT_NODE_NAMED_COMPONENT = 0x01,
 ACPI_IORT_NODE_PCI_ROOT_COMPLEX = 0x02,
 ACPI_IORT_NODE_SMMU

[Qemu-devel] [RFC v9 05/17] virtio-iommu: Add the iommu regions

2018-11-22 Thread Eric Auger

This patch initializes the iommu memory regions so that
PCIe end point transactions get translated. The translation
function is not yet implemented though.

Signed-off-by: Eric Auger 

---
v6 -> v7:
- use primary_bus
- rebase on new translate proto featuring iommu_idx

v5 -> v6:
- include qapi/error.h
- fix g_hash_table_lookup key in virtio_iommu_find_add_as

v4 -> v5:
- use PCI bus handle as a key
- use get_primary_pci_bus() callback

v3 -> v4:
- add trace_virtio_iommu_init_iommu_mr

v2 -> v3:
- use IOMMUMemoryRegion
- iommu mr name built with BDF
- rename smmu_get_sid into virtio_iommu_get_sid and use PCI_BUILD_BDF
---
 hw/virtio/trace-events   |  2 +
 hw/virtio/virtio-iommu.c | 94 
 include/hw/virtio/virtio-iommu.h |  2 +
 3 files changed, 98 insertions(+)

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index e6177ca0e4..9270b0463e 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -59,3 +59,5 @@ virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) 
"domain=%d endpoint=%d"
 virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d"
 virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, 
uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" 
virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d"
 virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) 
"domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64
+virtio_iommu_translate(const char *name, uint32_t rid, uint64_t iova, int 
flag) "mr=%s rid=%d addr=0x%"PRIx64" flag=%d"
+virtio_iommu_init_iommu_mr(char *iommu_mr) "init %s"
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index fc95751c40..dead062baf 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -22,6 +22,10 @@
 #include "qemu-common.h"
 #include "hw/virtio/virtio.h"
 #include "sysemu/kvm.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "hw/i386/pc.h"
+#include "hw/arm/virt.h"
 #include "trace.h"
 
 #include "standard-headers/linux/virtio_ids.h"
@@ -33,6 +37,50 @@
 /* Max size */
 #define VIOMMU_DEFAULT_QUEUE_SIZE 256
 
+static inline uint16_t virtio_iommu_get_sid(IOMMUDevice *dev)
+{
+return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);
+}
+
+static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
+  int devfn)
+{
+VirtIOIOMMU *s = opaque;
+IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus);
+IOMMUDevice *sdev;
+
+if (!sbus) {
+sbus = g_malloc0(sizeof(IOMMUPciBus) +
+ sizeof(IOMMUDevice *) * IOMMU_PCI_DEVFN_MAX);
+sbus->bus = bus;
+g_hash_table_insert(s->as_by_busptr, bus, sbus);
+}
+
+sdev = sbus->pbdev[devfn];
+if (!sdev) {
+char *name = g_strdup_printf("%s-%d-%d",
+ TYPE_VIRTIO_IOMMU_MEMORY_REGION,
+ pci_bus_num(bus), devfn);
+sdev = sbus->pbdev[devfn] = g_malloc0(sizeof(IOMMUDevice));
+
+sdev->viommu = s;
+sdev->bus = bus;
+sdev->devfn = devfn;
+
+trace_virtio_iommu_init_iommu_mr(name);
+
+memory_region_init_iommu(>iommu_mr, sizeof(sdev->iommu_mr),
+ TYPE_VIRTIO_IOMMU_MEMORY_REGION,
+ OBJECT(s), name,
+ UINT64_MAX);
+address_space_init(>as,
+   MEMORY_REGION(>iommu_mr), TYPE_VIRTIO_IOMMU);
+}
+
+return >as;
+
+}
+
 static int virtio_iommu_attach(VirtIOIOMMU *s,
struct virtio_iommu_req_attach *req)
 {
@@ -204,6 +252,27 @@ static void virtio_iommu_handle_command(VirtIODevice 
*vdev, VirtQueue *vq)
 }
 }
 
+static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
+IOMMUAccessFlags flag,
+int iommu_idx)
+{
+IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
+uint32_t sid;
+
+IOMMUTLBEntry entry = {
+.target_as = _space_memory,
+.iova = addr,
+.translated_addr = addr,
+.addr_mask = ~(hwaddr)0,
+.perm = IOMMU_NONE,
+};
+
+sid = virtio_iommu_get_sid(sdev);
+
+trace_virtio_iommu_translate(mr->parent_obj.name, sid, addr, flag);
+return entry;
+}
+
 static void virtio_iommu_get_config(VirtIODevice *vdev, uint8_t *config_data)
 {
 VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
@@ -286,6 +355,15 @@ static void virtio_iommu_device_realize(DeviceState *dev, 
Error **errp)
 virtio_add_feature(>features, VIRTIO_IOMMU_F_DOMAIN_BITS);
 virtio_add_feature(>features, VIRTIO_IOMMU_F_MAP_UNMAP);
 virtio_add_feature(>features, VIRTIO_IOMMU_F_BYPASS);
+
+memset(s->as_by_bus_num, 0, sizeof(s->as_by_bus_num));
+s->as_by_busptr =

[Qemu-devel] [PATCH 2/2] vfio-ccw: support async command subregion

2018-11-22 Thread Cornelia Huck

A vfio-ccw device may provide an async command subregion for
issuing halt/clear subchannel requests. If it is present, use
it for sending halt/clear request to the device; if not, fall
back to emulation (as done today).

Signed-off-by: Cornelia Huck 
---
 hw/s390x/css.c  |  27 +++--
 hw/vfio/ccw.c   | 109 +++-
 include/hw/s390x/s390-ccw.h |   3 +
 3 files changed, 133 insertions(+), 6 deletions(-)

diff --git a/hw/s390x/css.c b/hw/s390x/css.c
index 04ec5cc970..0897c041c5 100644
--- a/hw/s390x/css.c
+++ b/hw/s390x/css.c
@@ -22,6 +22,7 @@
 #include "trace.h"
 #include "hw/s390x/s390_flic.h"
 #include "hw/s390x/s390-virtio-ccw.h"
+#include "hw/s390x/s390-ccw.h"
 
 typedef struct CrwContainer {
 CRW crw;
@@ -1194,6 +1195,26 @@ static void sch_handle_start_func_virtual(SubchDev *sch)
 
 }
 
+static void sch_handle_halt_func_passthrough(SubchDev *sch)
+{
+int ret;
+
+ret = vfio_ccw_handle_halt(sch);
+if (ret == -ENOSYS) {
+sch_handle_halt_func(sch);
+}
+}
+
+static void sch_handle_clear_func_passthrough(SubchDev *sch)
+{
+int ret;
+
+ret = vfio_ccw_handle_clear(sch);
+if (ret == -ENOSYS) {
+sch_handle_clear_func(sch);
+}
+}
+
 static IOInstEnding sch_handle_start_func_passthrough(SubchDev *sch)
 {
 
@@ -1237,11 +1258,9 @@ IOInstEnding do_subchannel_work_passthrough(SubchDev 
*sch)
 SCSW *s = >curr_status.scsw;
 
 if (s->ctrl & SCSW_FCTL_CLEAR_FUNC) {
-/* TODO: Clear handling */
-sch_handle_clear_func(sch);
+sch_handle_clear_func_passthrough(sch);
 } else if (s->ctrl & SCSW_FCTL_HALT_FUNC) {
-/* TODO: Halt handling */
-sch_handle_halt_func(sch);
+sch_handle_halt_func_passthrough(sch);
 } else if (s->ctrl & SCSW_FCTL_START_FUNC) {
 return sch_handle_start_func_passthrough(sch);
 }
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index 9246729a75..dd0fecf168 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -2,9 +2,12 @@
  * vfio based subchannel assignment support
  *
  * Copyright 2017 IBM Corp.
+ * Copyright 2018 Red Hat, Inc.
+ *
  * Author(s): Dong Jia Shi 
  *Xiao Feng Ren 
  *Pierre Morel 
+ *Cornelia Huck 
  *
  * This work is licensed under the terms of the GNU GPL, version 2 or (at
  * your option) any later version. See the COPYING file in the top-level
@@ -32,6 +35,9 @@ typedef struct VFIOCCWDevice {
 uint64_t io_region_size;
 uint64_t io_region_offset;
 struct ccw_io_region *io_region;
+uint64_t async_cmd_region_size;
+uint64_t async_cmd_region_offset;
+struct ccw_cmd_region *async_cmd_region;
 EventNotifier io_notifier;
 bool force_orb_pfch;
 bool warned_orb_pfch;
@@ -114,6 +120,87 @@ again:
 }
 }
 
+int vfio_ccw_handle_clear(SubchDev *sch)
+{
+S390CCWDevice *cdev = sch->driver_data;
+VFIOCCWDevice *vcdev = DO_UPCAST(VFIOCCWDevice, cdev, cdev);
+struct ccw_cmd_region *region = vcdev->async_cmd_region;
+int ret;
+
+if (!vcdev->async_cmd_region) {
+/* Async command region not available, fall back to emulation */
+return -ENOSYS;
+}
+
+memset(region, 0, sizeof(*region));
+region->command = VFIO_CCW_ASYNC_CMD_CSCH;
+
+again:
+ret = pwrite(vcdev->vdev.fd, region,
+ vcdev->async_cmd_region_size, vcdev->async_cmd_region_offset);
+if (ret != vcdev->async_cmd_region_size) {
+if (errno == EAGAIN) {
+goto again;
+}
+error_report("vfio-ccw: wirte I/O region failed with errno=%d", errno);
+ret = -errno;
+} else {
+ret = region->ret_code;
+}
+switch (ret) {
+case 0:
+case -ENODEV:
+case -EACCES:
+return 0;
+case -EFAULT:
+default:
+sch_gen_unit_exception(sch);
+css_inject_io_interrupt(sch);
+return 0;
+}
+}
+
+int vfio_ccw_handle_halt(SubchDev *sch)
+{
+S390CCWDevice *cdev = sch->driver_data;
+VFIOCCWDevice *vcdev = DO_UPCAST(VFIOCCWDevice, cdev, cdev);
+struct ccw_cmd_region *region = vcdev->async_cmd_region;
+int ret;
+
+if (!vcdev->async_cmd_region) {
+/* Async command region not available, fall back to emulation */
+return -ENOSYS;
+}
+
+memset(region, 0, sizeof(*region));
+region->command = VFIO_CCW_ASYNC_CMD_HSCH;
+
+again:
+ret = pwrite(vcdev->vdev.fd, region,
+ vcdev->async_cmd_region_size, vcdev->async_cmd_region_offset);
+if (ret != vcdev->async_cmd_region_size) {
+if (errno == EAGAIN) {
+goto again;
+}
+error_report("vfio-ccw: wirte I/O region failed with errno=%d", errno);
+ret = -errno;
+} else {
+ret = region->ret_code;
+}
+switch (ret) {
+case 0:
+case -EBUSY:
+case -ENODEV:
+case -EACCES:
+return 0;
+case -EFAULT:
+default:
+sch_gen_unit_exception(sch);
+

[Qemu-devel] [RFC v9 00/17] VIRTIO-IOMMU device

2018-11-22 Thread Eric Auger

This series rebases the virtio-iommu device on qemu 3.1.0-rc2
and implements the v0.8(.1) virtio-iommu spec [1]. The pci proxy
for the virtio-iommu device is now available and needs to be
instantiated from the command line using "-device virtio-iommu-pci".
The iommu machvirt option is not used anymore to instantiate the
virtio-iommu.

At the moment the virtio-iommu-device is only functional in the
ARM virt machine. Indeed, besides its instantiation, links between
the PCIe end points and the IOMMU must be described. This is achieved
by a DT or ACPI (IORT) description. This description is currently only
done in ARM virt.

Best Regards

Eric

This series can be found at:
https://github.com/eauger/qemu/tree/v3.1.0-rc2-virtio-iommu-v0.9

References:
[1] [PATCH v3 0/7] Add virtio-iommu driver

[2] guest branch featuring the virtio-iommu driver v0.8.1 + ACPI
integration not yet officially released by Jean.
https://github.com/eauger/linux/tree/virtio-iommu-v0.8.1

Testing:
- tested with guest using virtio-net-pci
  (,vhost=off,iommu_platform,disable-modern=off,disable-legacy=on)
  and virtio-blk-pci
- VFIO/VHOST integration is not part of this series
- When using the virtio-blk-pci, some EDK2 FW versions feature
  unmapped transactions and in that case the guest fails to boot.

History:

v8 -> v9:
- virtio-iommu-pci device needs to be instantiated from the command
  line (RID is not imposed anymore).
- tail structure properly initialized

v7 -> v8:
- virtio-iommu-pci added
- virt instantiation modified
- DT and ACPI modified to exclude the iommu RID from the mapping
- VIRTIO_IOMMU_F_BYPASS, VIRTIO_F_VERSION_1 features exposed

v6 -> v7:
- rebase on qemu 3.0.0-rc3
- minor update against v0.7
- fix issue with EP not on pci.0 and ACPI probing
- change the instantiation method

v5 -> v6:
- minor update against v0.6 spec
- fix g_hash_table_lookup in virtio_iommu_find_add_as
- replace some error_reports by qemu_log_mask(LOG_GUEST_ERROR, ...)

v4 -> v5:
- event queue and fault reporting
- we now return the IOAPIC MSI region if the virtio-iommu is instantiated
  in a PC machine.
- we bypass transactions on MSI HW region and fault on reserved ones.
- We support ACPI boot with mach-virt (based on IORT proposal)
- We moved to the new driver naming conventions
- simplified mach-virt instantiation
- worked around the disappearing of pci_find_primary_bus
- in virtio_iommu_translate, check the dev->as is not NULL
- initialize as->device_list in virtio_iommu_get_as
- initialize bufstate.error to false in virtio_iommu_probe

v3 -> v4:
- probe request support although no reserved region is returned at
  the moment
- unmap semantics less strict, as specified in v0.4
- device registration, attach/detach revisited
- split into smaller patches to ease review
- propose a way to inform the IOMMU mr about the page_size_mask
  of underlying HW IOMMU, if any
- remove warning associated with the translation of the MSI doorbell

v2 -> v3:
- rebase on top of 2.10-rc0 and especially
  [PATCH qemu v9 0/2] memory/iommu: QOM'fy IOMMU MemoryRegion
- add mutex init
- fix as->mappings deletion using g_tree_ref/unref
- when a dev is attached whereas it is already attached to
  another address space, first detach it
- fix some error values
- page_sizes = TARGET_PAGE_MASK;
- I haven't changed the unmap() semantics yet, waiting for the
  next virtio-iommu spec revision.

v1 -> v2:
- fix redefinition of viommu_as typedef

Eric Auger (17):
  update-linux-headers: Import virtio_iommu.h
  linux-headers: Partial update for virtio-iommu v0.8
  virtio-iommu: Add skeleton
  virtio-iommu: Decode the command payload
  virtio-iommu: Add the iommu regions
  virtio-iommu: Endpoint and domains structs and helpers
  virtio-iommu: Implement attach/detach command
  virtio-iommu: Implement map/unmap
  virtio-iommu: Implement translate
  virtio-iommu: Implement probe request
  virtio-iommu: Expose the IOAPIC MSI reserved region when relevant
  virtio-iommu: Implement fault reporting
  virtio_iommu: Handle reserved regions in translation process
  virtio-iommu-pci: Add virtio iommu pci support
  hw/arm/virt: Add the virtio-iommu device tree mappings
  hw/arm/virt-acpi-build: Introduce fill_iort_idmap helper
  hw/arm/virt-acpi-build: Add virtio-iommu node in IORT table

 hw/arm/virt-acpi-build.c  |   91 +-
 hw/arm/virt.c |   57 +-
 hw/virtio/Makefile.objs   |1 +
 hw/virtio/trace-events|   26 +
 hw/virtio/virtio-iommu.c  | 1040 +
 hw/virtio/virtio-pci.c|   51 +
 hw/virtio/virtio-pci.h|   14 +
 include/hw/acpi/acpi-defs.h   |   21 +-
 include/hw/arm/virt.h |2 +
 include/hw/pci/pci.h  |1 +
 include/hw/virtio/virtio-iommu.h  |   65 ++
 include/standard-headers/linux/virtio_ids.h   |1 +

[Qemu-devel] [RFC v9 12/17] virtio-iommu: Implement fault reporting

2018-11-22 Thread Eric Auger

The event queue allows asynchronous errors to be reported.
The translate function now injects faults when relevant.

Signed-off-by: Eric Auger 
---
 hw/virtio/trace-events   |  1 +
 hw/virtio/virtio-iommu.c | 67 ++--
 2 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 19824c3e91..053a07b3fc 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -71,3 +71,4 @@ virtio_iommu_unmap_inc_interval(uint64_t low, uint64_t high) 
"Unmap inc [0x%"PRI
 virtio_iommu_translate_out(uint64_t virt_addr, uint64_t phys_addr, uint32_t 
sid) "0x%"PRIx64" -> 0x%"PRIx64 " for sid=%d"
 virtio_iommu_fill_resv_property(uint32_t devid, uint8_t subtype, uint64_t 
start, uint64_t end, uint32_t flags, size_t filled) "dev= %d, subtype=%d 
start=0x%"PRIx64" end=0x%"PRIx64" flags=%d filled=0x%lx"
 virtio_iommu_fill_none_property(uint32_t devid) "devid=%d"
+virtio_iommu_report_fault(uint8_t reason, uint32_t flags, uint32_t endpoint, 
uint64_t addr) "FAULT reason=%d flags=%d endpoint=%d address =0x%"PRIx64
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 324518c300..1246dd6bdf 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -683,17 +683,63 @@ push:
 }
 }
 
+static void virtio_iommu_report_fault(VirtIOIOMMU *viommu, uint8_t reason,
+  uint32_t flags, uint32_t endpoint,
+  uint64_t address)
+{
+VirtIODevice *vdev = >parent_obj;
+VirtQueue *vq = viommu->event_vq;
+struct virtio_iommu_fault fault;
+VirtQueueElement *elem;
+size_t sz;
+
+memset(, 0, sizeof(fault));
+fault.reason = reason;
+fault.flags = flags;
+fault.endpoint = endpoint;
+fault.address = address;
+
+for (;;) {
+elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+
+if (!elem) {
+virtio_error(vdev,
+ "no buffer available in event queue to report event");
+return;
+}
+
+if (iov_size(elem->in_sg, elem->in_num) < sizeof(fault)) {
+virtio_error(vdev, "error buffer of wrong size");
+virtqueue_detach_element(vq, elem, 0);
+g_free(elem);
+continue;
+}
+break;
+}
+/* we have a buffer to fill in */
+sz = iov_from_buf(elem->in_sg, elem->in_num, 0,
+  , sizeof(fault));
+assert(sz == sizeof(fault));
+
+trace_virtio_iommu_report_fault(reason, flags, endpoint, address);
+virtqueue_push(vq, elem, sz);
+virtio_notify(vdev, vq);
+g_free(elem);
+
+}
+
 static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 IOMMUAccessFlags flag,
 int iommu_idx)
 {
 IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
 VirtIOIOMMU *s = sdev->viommu;
-uint32_t sid;
+uint32_t sid, flags;
 viommu_endpoint *ep;
 viommu_mapping *mapping;
 viommu_interval interval;
 bool bypass_allowed;
+bool read_fault, write_fault;
 
 interval.low = addr;
 interval.high = addr + 1;
@@ -718,6 +764,8 @@ static IOMMUTLBEntry 
virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 if (!ep) {
 if (!bypass_allowed) {
 error_report("%s sid=%d is not known!!", __func__, sid);
+virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_UNKNOWN,
+  0, sid, 0);
 } else {
 entry.perm = flag;
 }
@@ -729,6 +777,8 @@ static IOMMUTLBEntry 
virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 qemu_log_mask(LOG_GUEST_ERROR,
   "%s %02x:%02x.%01x not attached to any domain\n",
   __func__, PCI_BUS_NUM(sid), PCI_SLOT(sid), 
PCI_FUNC(sid));
+virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_DOMAIN,
+  0, sid, 0);
 } else {
 entry.perm = flag;
 }
@@ -740,14 +790,25 @@ static IOMMUTLBEntry 
virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 qemu_log_mask(LOG_GUEST_ERROR,
   "%s no mapping for 0x%"PRIx64" for sid=%d\n",
   __func__, addr, sid);
+virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
+  0, sid, addr);
 goto unlock;
 }
 
-if (((flag & IOMMU_RO) && !(mapping->flags & VIRTIO_IOMMU_MAP_F_READ)) ||
-((flag & IOMMU_WO) && !(mapping->flags & VIRTIO_IOMMU_MAP_F_WRITE))) {
+read_fault = (flag & IOMMU_RO) &&
+!(mapping->flags & VIRTIO_IOMMU_MAP_F_READ);
+write_fault = (flag & IOMMU_WO) &&
+!(mapping->flags & VIRTIO_IOMMU_MAP_F_WRITE);
+
+flags = read_fault ? VIRTIO_IOMMU_FAULT_F_READ : 0;
+flags |= write_fault ?

[Qemu-devel] [PATCH 1/2] vfio-ccw: new capability chain support

2018-11-22 Thread Cornelia Huck

To be replaced with a real linux-headers update.

Signed-off-by: Cornelia Huck 
---
 linux-headers/linux/vfio.h |  4 
 linux-headers/linux/vfio_ccw.h | 12 
 2 files changed, 16 insertions(+)

diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index ceb6453394..c9ba8d52a0 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -297,12 +297,16 @@ struct vfio_region_info_cap_type {
 
 #define VFIO_REGION_TYPE_PCI_VENDOR_TYPE   (1 << 31)
 #define VFIO_REGION_TYPE_PCI_VENDOR_MASK   (0x)
+#define VFIO_REGION_TYPE_CCW   (1 << 30)
 
 /* 8086 Vendor sub-types */
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1)
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2)
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG  (3)
 
+/* ccw sub-types */
+#define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD  (1)
+
 /*
  * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
  * which allows direct access to non-MSIX registers which happened to be within
diff --git a/linux-headers/linux/vfio_ccw.h b/linux-headers/linux/vfio_ccw.h
index 5bf96c3812..fcc3e69ef5 100644
--- a/linux-headers/linux/vfio_ccw.h
+++ b/linux-headers/linux/vfio_ccw.h
@@ -12,6 +12,7 @@
 
 #include 
 
+/* used for START SUBCHANNEL, always present */
 struct ccw_io_region {
 #define ORB_AREA_SIZE 12
__u8orb_area[ORB_AREA_SIZE];
@@ -22,4 +23,15 @@ struct ccw_io_region {
__u32   ret_code;
 } __attribute__((packed));
 
+/*
+ * used for processing commands that trigger asynchronous actions
+ * Note: this is controlled by a capability
+ */
+#define VFIO_CCW_ASYNC_CMD_HSCH (1 << 0)
+#define VFIO_CCW_ASYNC_CMD_CSCH (1 << 1)
+struct ccw_cmd_region {
+   __u32 command;
+   __u32 ret_code;
+} __attribute__((packed));
+
 #endif
-- 
2.17.2

Re: [Qemu-devel] [PATCH] nvme: fix out-of-bounds access to the CMB

2018-11-22 Thread Peter Maydell

On 20 November 2018 at 18:41, Paolo Bonzini  wrote:
> Because the CMB BAR has a min_access_size of 2, if you read the last
> byte it will try to memcpy *2* bytes from n->cmbuf, causing an off-by-one
> error.  This is CVE-2018-16847.

Maybe we should change the MemoryRegionOps API so that
devices have to explicitly opt in to handling accesses
that span off the end of the region size they've registered?
IIRC we have one or two oddball devices that care about that
(probably mostly x86 IO port devices), but most device
implementations will not be expecting it.

I'm also surprised that the memory subsystem permits a
2 byte access at address sz-1 here, since .impl.unaligned
is not set...

thanks
-- PMM

Re: [Qemu-devel] [RFC 13/48] xxhash: add qemu_xxhash8

2018-11-22 Thread Alex Bennée



Emilio G. Cota  writes:

> It will be used for TB hashing soon.
>
> Signed-off-by: Emilio G. Cota 
> ---
>  include/qemu/xxhash.h | 40 +++-
>  1 file changed, 27 insertions(+), 13 deletions(-)
>
> diff --git a/include/qemu/xxhash.h b/include/qemu/xxhash.h
> index fe35dde328..450427eeaa 100644
> --- a/include/qemu/xxhash.h
> +++ b/include/qemu/xxhash.h
> @@ -49,7 +49,8 @@
>   * contiguous in memory.
>   */
>  static inline uint32_t
> -qemu_xxhash7(uint64_t ab, uint64_t cd, uint32_t e, uint32_t f, uint32_t g)
> +qemu_xxhash8(uint64_t ab, uint64_t cd, uint32_t e, uint32_t f, uint32_t g,
> + uint32_t h)

As we've expanded to bigger and bigger hashes, why is everything after
cd passed as 32-bit values? Isn't this just generating extra register
pressure or is the compiler smart enough to figure it out?

>  {
>  uint32_t v1 = QEMU_XXHASH_SEED + PRIME32_1 + PRIME32_2;
>  uint32_t v2 = QEMU_XXHASH_SEED + PRIME32_2;
> @@ -77,17 +78,24 @@ qemu_xxhash7(uint64_t ab, uint64_t cd, uint32_t e, 
> uint32_t f, uint32_t g)
>  v4 = rol32(v4, 13);
>  v4 *= PRIME32_1;
>
> -h32 = rol32(v1, 1) + rol32(v2, 7) + rol32(v3, 12) + rol32(v4, 18);
> -h32 += 28;
> +v1 += e * PRIME32_2;
> +v1 = rol32(v1, 13);
> +v1 *= PRIME32_1;
>
> -h32 += e * PRIME32_3;
> -h32  = rol32(h32, 17) * PRIME32_4;
> +v2 += f * PRIME32_2;
> +v2 = rol32(v2, 13);
> +v2 *= PRIME32_1;
> +
> +v3 += g * PRIME32_2;
> +v3 = rol32(v3, 13);
> +v3 *= PRIME32_1;
>
> -h32 += f * PRIME32_3;
> -h32  = rol32(h32, 17) * PRIME32_4;
> +v4 += h * PRIME32_2;
> +v4 = rol32(v4, 13);
> +v4 *= PRIME32_1;
>
> -h32 += g * PRIME32_3;
> -h32  = rol32(h32, 17) * PRIME32_4;
> +h32 = rol32(v1, 1) + rol32(v2, 7) + rol32(v3, 12) + rol32(v4, 18);
> +h32 += 32;

How do we validate we haven't broken the distribution of the original
xxhash as we've extended it?

>
>  h32 ^= h32 >> 15;
>  h32 *= PRIME32_2;
> @@ -100,23 +108,29 @@ qemu_xxhash7(uint64_t ab, uint64_t cd, uint32_t e, 
> uint32_t f, uint32_t g)
>
>  static inline uint32_t qemu_xxhash2(uint64_t ab)
>  {
> -return qemu_xxhash7(ab, 0, 0, 0, 0);
> +return qemu_xxhash8(ab, 0, 0, 0, 0, 0);
>  }
>
>  static inline uint32_t qemu_xxhash4(uint64_t ab, uint64_t cd)
>  {
> -return qemu_xxhash7(ab, cd, 0, 0, 0);
> +return qemu_xxhash8(ab, cd, 0, 0, 0, 0);
>  }
>
>  static inline uint32_t qemu_xxhash5(uint64_t ab, uint64_t cd, uint32_t e)
>  {
> -return qemu_xxhash7(ab, cd, e, 0, 0);
> +return qemu_xxhash8(ab, cd, e, 0, 0, 0);
>  }
>
>  static inline uint32_t qemu_xxhash6(uint64_t ab, uint64_t cd, uint32_t e,
>  uint32_t f)
>  {
> -return qemu_xxhash7(ab, cd, e, f, 0);
> +return qemu_xxhash8(ab, cd, e, f, 0, 0);
> +}
> +
> +static inline uint32_t qemu_xxhash7(uint64_t ab, uint64_t cd, uint32_t e,
> +uint32_t f, uint32_t g)
> +{
> +return qemu_xxhash8(ab, cd, e, f, g, 0);
>  }
>
>  #endif /* QEMU_XXHASH_H */


--
Alex Bennée

[Qemu-devel] [PATCH 0/2] vfio-ccw: support hsch/csch (QEMU part)

2018-11-22 Thread Cornelia Huck

[This is the QEMU part, git tree is available at
https://github.com/cohuck/qemu vfio-ccw-caps

The companion Linux kernel patches are available at
https://git.kernel.org/pub/scm/linux/kernel/git/kvms390/vfio-ccw.git 
vfio-ccw-caps]

Currently, vfio-ccw only relays START SUBCHANNEL requests to the real
device. This tends to work well for the most common 'good path' scenarios;
however, as we emulate {HALT,CLEAR} SUBCHANNEL in QEMU, things like
clearing pending requests at the device are currently not supported.
This may be a problem for e.g. error recovery.

This patch series makes use of the newly introduced async command region
to issue hsch/csch; if it is not present, continue to emulate hsch/csch,
as before.

[I'm not quite happy with how this async processing hooks up in css.c;
ideas welcome.]

Very lightly tested (I can interact with a dasd as before; I have not
found a reliable way to trigger hsch/csch in the Linux dasd guest driver.)

Cornelia Huck (2):
  vfio-ccw: new capability chain support
  vfio-ccw: support async command subregion

 hw/s390x/css.c |  27 ++--
 hw/vfio/ccw.c  | 109 -
 include/hw/s390x/s390-ccw.h|   3 +
 linux-headers/linux/vfio.h |   4 ++
 linux-headers/linux/vfio_ccw.h |  12 
 5 files changed, 149 insertions(+), 6 deletions(-)

-- 
2.17.2

[Qemu-devel] [PATCH for-4.0] ui/console: Remove qemu_create_display_surface_guestmem()

2018-11-22 Thread Peter Maydell

The qemu_create_display_surface_guestmem() function was added in
commit a77549b3ffcc24c32ee4e but apparently never used. Remove it.

(The API of this function is in any case awkward as a generic
function: it assumes that a physical address uniquely identifies
a piece of memory in the system, which is mostly but not
always true.)

Signed-off-by: Peter Maydell 
---
 include/ui/console.h |  4 
 ui/console.c | 36 
 2 files changed, 40 deletions(-)

diff --git a/include/ui/console.h b/include/ui/console.h
index c17803c530a..853fcf4eb75 100644
--- a/include/ui/console.h
+++ b/include/ui/console.h
@@ -257,10 +257,6 @@ DisplaySurface *qemu_create_displaysurface_from(int width, 
int height,
 pixman_format_code_t format,
 int linesize, uint8_t *data);
 DisplaySurface *qemu_create_displaysurface_pixman(pixman_image_t *image);
-DisplaySurface *qemu_create_displaysurface_guestmem(int width, int height,
-pixman_format_code_t 
format,
-int linesize,
-uint64_t addr);
 DisplaySurface *qemu_create_message_surface(int w, int h,
 const char *msg);
 PixelFormat qemu_default_pixelformat(int bpp);
diff --git a/ui/console.c b/ui/console.c
index 3a285bae00a..7076becedd5 100644
--- a/ui/console.c
+++ b/ui/console.c
@@ -1385,42 +1385,6 @@ DisplaySurface 
*qemu_create_displaysurface_pixman(pixman_image_t *image)
 return surface;
 }
 
-static void qemu_unmap_displaysurface_guestmem(pixman_image_t *image,
-   void *unused)
-{
-void *data = pixman_image_get_data(image);
-uint32_t size = pixman_image_get_stride(image) *
-pixman_image_get_height(image);
-cpu_physical_memory_unmap(data, size, 0, 0);
-}
-
-DisplaySurface *qemu_create_displaysurface_guestmem(int width, int height,
-pixman_format_code_t 
format,
-int linesize, uint64_t 
addr)
-{
-DisplaySurface *surface;
-hwaddr size;
-void *data;
-
-if (linesize == 0) {
-linesize = width * PIXMAN_FORMAT_BPP(format) / 8;
-}
-
-size = (hwaddr)linesize * height;
-data = cpu_physical_memory_map(addr, , 0);
-if (size != (hwaddr)linesize * height) {
-cpu_physical_memory_unmap(data, size, 0, 0);
-return NULL;
-}
-
-surface = qemu_create_displaysurface_from
-(width, height, format, linesize, data);
-pixman_image_set_destroy_function
-(surface->image, qemu_unmap_displaysurface_guestmem, NULL);
-
-return surface;
-}
-
 DisplaySurface *qemu_create_message_surface(int w, int h,
 const char *msg)
 {
-- 
2.19.1

1 2 3 >

1 - 100 of 268 matches

Mail list logo