Re: [Qemu-devel] [PULL] tftp: fake support for netascii protocol

2016-11-18 Thread Thomas Huth
On 18.11.2016 18:51, Samuel Thibault wrote:
> From: Vincent Bernat 
> 
> Some network equipment requests a file using the netascii
> protocol and this is not configurable. Currently, qemu's tftpd only
> supports the octet protocol. This commit makes it accept the netascii
> protocol as well but does not perform the requested transformation (LF ->
> CR,LF) as it would be far more complex.

That sounds somewhat wrong to me. QEMU now seems to support a transfer
mode that is not really implemented. I think you should at least issue a
qemu_log_mask(LOG_UNIMP, "...") call in that case.

 Thomas


PS: As far as I know, this month, PULL requests should be CC:ed to
Stefan, not Peter, since Peter is away.



Re: [Qemu-devel] [PATCH v2 3/4] xen: create qdev for each backend device

2016-11-18 Thread Stefano Stabellini
On Wed, 2 Nov 2016, Juergen Gross wrote:
> Create a qdev plugged to the xen-sysbus for each new backend device.
> This device can be used as a parent for all needed devices of that
> backend. The id of the new device will be "xen-<type>-<dev>" with
> <type> being the xen backend type (e.g. "qdisk") and <dev> the xen
> backend number of the type under which it is to be found in xenstore.
> 
> Signed-off-by: Juergen Gross 
> ---
>  hw/xen/xen_backend.c | 48 
> +++-
>  hw/xen/xen_pvdev.c   |  5 -
>  include/hw/xen/xen_backend.h |  4 
>  include/hw/xen/xen_pvdev.h   |  1 +
>  4 files changed, 56 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/xen/xen_backend.c b/hw/xen/xen_backend.c
> index 5ad3caa..3cb40b2 100644
> --- a/hw/xen/xen_backend.c
> +++ b/hw/xen/xen_backend.c
> @@ -27,11 +27,13 @@
>  
>  #include "hw/hw.h"
>  #include "hw/sysbus.h"
> +#include "hw/boards.h"
>  #include "sysemu/char.h"
>  #include "qemu/log.h"
>  #include "qapi/error.h"
>  #include "hw/xen/xen_backend.h"
>  #include "hw/xen/xen_pvdev.h"
> +#include "monitor/qdev.h"
>  
>  #include 
>  
> @@ -121,6 +123,12 @@ static struct XenDevice *xen_be_get_xendev(const char 
> *type, int dom, int dev,
>  
>  /* init new xendev */
>  xendev = g_malloc0(ops->size);
> +object_initialize(&xendev->qdev, ops->size, TYPE_XENBACKEND);
> +qdev_set_parent_bus(&xendev->qdev, xen_sysbus);
> +qdev_set_id(&xendev->qdev, g_strdup_printf("xen-%s-%d", type, dev));
> +qdev_init_nofail(&xendev->qdev);
> +object_unref(OBJECT(&xendev->qdev));
> +
>  xendev->type  = type;
>  xendev->dom   = dom;
>  xendev->dev   = dev;
> @@ -163,7 +171,6 @@ static struct XenDevice *xen_be_get_xendev(const char 
> *type, int dom, int dev,
>  return xendev;
>  }
>  
> -

spurious change


>  /*
>   * Sync internal data structures on xenstore updates.
>   * Node specifies the changed field.  node = NULL means
> @@ -541,6 +548,15 @@ err:
>  return -1;
>  }
>  
> +static void xen_set_dynamic_sysbus(void)
> +{
> +Object *machine = qdev_get_machine();
> +ObjectClass *oc = object_get_class(machine);
> +MachineClass *mc = MACHINE_CLASS(oc);
> +
> +mc->has_dynamic_sysbus = true;
> +}
> +
>  int xen_be_register(const char *type, struct XenDevOps *ops)
>  {
>  char path[50];
> @@ -562,6 +578,8 @@ int xen_be_register(const char *type, struct XenDevOps 
> *ops)
>  
>  void xen_be_register_common(void)
>  {
> +xen_set_dynamic_sysbus();
> +
>  xen_be_register("console", &xen_console_ops);
>  xen_be_register("vkbd", &xen_kbdmouse_ops);
>  xen_be_register("qdisk", &xen_blkdev_ops);
> @@ -588,9 +606,36 @@ int xen_be_bind_evtchn(struct XenDevice *xendev)
>  }
>  
>  
> +static Property xendev_properties[] = {
> +DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void xendev_class_init(ObjectClass *klass, void *data)
> +{
> +DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +dc->props = xendev_properties;
> +set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> +}
> +
> +static const TypeInfo xendev_type_info = {
> +.name  = TYPE_XENBACKEND,
> +.parent= TYPE_XENSYSDEV,
> +.class_init= xendev_class_init,
> +.instance_size = sizeof(struct XenDevice),
> +};
> +
> +static void xen_sysbus_class_init(ObjectClass *klass, void *data)
> +{
> +HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(klass);
> +
> +hc->unplug = qdev_simple_device_unplug_cb;
> +}
> +
>  static const TypeInfo xensysbus_info = {
>  .name   = TYPE_XENSYSBUS,
>  .parent = TYPE_BUS,
> +.class_init = xen_sysbus_class_init,
>  .interfaces = (InterfaceInfo[]) {
>  { TYPE_HOTPLUG_HANDLER },
>  { }
> @@ -627,6 +672,7 @@ static void xenbe_register_types(void)
>  {
>  type_register_static(&xensysbus_info);
>  type_register_static(&xensysdev_info);
> +type_register_static(&xendev_type_info);
>  }
>  
>  type_init(xenbe_register_types)
> diff --git a/hw/xen/xen_pvdev.c b/hw/xen/xen_pvdev.c
> index 405e154..773c278 100644
> --- a/hw/xen/xen_pvdev.c
> +++ b/hw/xen/xen_pvdev.c
> @@ -18,10 +18,12 @@
>   */
>  
>  #include "qemu/osdep.h"
> +#include "hw/qdev-core.h"
>  
>  #include "hw/xen/xen_backend.h"
>  #include "hw/xen/xen_pvdev.h"
>  
> +

spurious change

but aside from that:

Reviewed-by: Stefano Stabellini 


>  /* private */
>  static int debug;
>  
> @@ -307,7 +309,8 @@ void xen_pv_del_xendev(struct XenDevice *xendev)
>  }
>  
>  QTAILQ_REMOVE(&xendevs, xendev, next);
> -g_free(xendev);
> +
> +qdev_unplug(&xendev->qdev, NULL);
>  }
>  
>  void xen_pv_insert_xendev(struct XenDevice *xendev)
> diff --git a/include/hw/xen/xen_backend.h b/include/hw/xen/xen_backend.h
> index 38f730e..4f4799a 100644
> --- a/include/hw/xen/xen_backend.h
> +++ b/include/hw/xen/xen_backend.h
> @@ -8,6 +8,10 @@
>  
>  #define TYPE_XENSYSDEV "xen-sysdev"
>  #define TYPE_XENSYSBUS "xen-sysbus"
> +#define TYPE_XENBACKEND "xen-backend"
> +

Re: [Qemu-devel] [PATCH v2 2/4] qdev: add function qdev_set_id()

2016-11-18 Thread Stefano Stabellini
On Wed, 2 Nov 2016, Juergen Gross wrote:
> In order to have an easy way to add a new qdev with a specific id
> carve out the needed functionality from qdev_device_add() into a new
> function qdev_set_id().
> 
> Signed-off-by: Juergen Gross 

Reviewed-by: Stefano Stabellini 


>  include/monitor/qdev.h |  1 +
>  qdev-monitor.c | 36 
>  2 files changed, 21 insertions(+), 16 deletions(-)
> 
> diff --git a/include/monitor/qdev.h b/include/monitor/qdev.h
> index 8e504bc..0ff3331 100644
> --- a/include/monitor/qdev.h
> +++ b/include/monitor/qdev.h
> @@ -12,5 +12,6 @@ void qmp_device_add(QDict *qdict, QObject **ret_data, Error 
> **errp);
>  
>  int qdev_device_help(QemuOpts *opts);
>  DeviceState *qdev_device_add(QemuOpts *opts, Error **errp);
> +void qdev_set_id(DeviceState *dev, const char *id);
>  
>  #endif
> diff --git a/qdev-monitor.c b/qdev-monitor.c
> index 4f78ecb..c73410c 100644
> --- a/qdev-monitor.c
> +++ b/qdev-monitor.c
> @@ -539,10 +539,28 @@ static BusState *qbus_find(const char *path, Error 
> **errp)
>  return bus;
>  }
>  
> +void qdev_set_id(DeviceState *dev, const char *id)
> +{
> +if (id) {
> +dev->id = id;
> +}
> +
> +if (dev->id) {
> +object_property_add_child(qdev_get_peripheral(), dev->id,
> +  OBJECT(dev), NULL);
> +} else {
> +static int anon_count;
> +gchar *name = g_strdup_printf("device[%d]", anon_count++);
> +object_property_add_child(qdev_get_peripheral_anon(), name,
> +  OBJECT(dev), NULL);
> +g_free(name);
> +}
> +}
> +
>  DeviceState *qdev_device_add(QemuOpts *opts, Error **errp)
>  {
>  DeviceClass *dc;
> -const char *driver, *path, *id;
> +const char *driver, *path;
>  DeviceState *dev;
>  BusState *bus = NULL;
>  Error *err = NULL;
> @@ -591,21 +609,7 @@ DeviceState *qdev_device_add(QemuOpts *opts, Error 
> **errp)
>  qdev_set_parent_bus(dev, bus);
>  }
>  
> -id = qemu_opts_id(opts);
> -if (id) {
> -dev->id = id;
> -}
> -
> -if (dev->id) {
> -object_property_add_child(qdev_get_peripheral(), dev->id,
> -  OBJECT(dev), NULL);
> -} else {
> -static int anon_count;
> -gchar *name = g_strdup_printf("device[%d]", anon_count++);
> -object_property_add_child(qdev_get_peripheral_anon(), name,
> -  OBJECT(dev), NULL);
> -g_free(name);
> -}
> +qdev_set_id(dev, qemu_opts_id(opts));
>  
>  /* set properties */
>  if (qemu_opt_foreach(opts, set_property, dev, &err)) {
> -- 
> 2.6.6
> 



Re: [Qemu-devel] [PATCH v2 4/4] xen: attach pvusb usb bus to backend qdev

2016-11-18 Thread Stefano Stabellini
On Wed, 2 Nov 2016, Juergen Gross wrote:
> Attach the usb bus of a new pvusb controller to the qdev associated
> with the Xen backend. Any device connected to that controller can now
> specify the bus and port directly via its properties.
> 
> Signed-off-by: Juergen Gross 

Reviewed-by: Stefano Stabellini 


>  hw/usb/xen-usb.c | 23 ++-
>  1 file changed, 10 insertions(+), 13 deletions(-)
> 
> diff --git a/hw/usb/xen-usb.c b/hw/usb/xen-usb.c
> index 1b3c2fb..8e676e6 100644
> --- a/hw/usb/xen-usb.c
> +++ b/hw/usb/xen-usb.c
> @@ -712,15 +712,10 @@ static void usbback_portid_detach(struct usbback_info 
> *usbif, unsigned port)
>  
>  static void usbback_portid_remove(struct usbback_info *usbif, unsigned port)
>  {
> -USBPort *p;
> -
>  if (!usbif->ports[port - 1].dev) {
>  return;
>  }
>  
> -p = &(usbif->ports[port - 1].port);
> -snprintf(p->path, sizeof(p->path), "%d", 99);
> -
>  object_unparent(OBJECT(usbif->ports[port - 1].dev));
>  usbif->ports[port - 1].dev = NULL;
>  usbback_portid_detach(usbif, port);
> @@ -733,10 +728,10 @@ static void usbback_portid_add(struct usbback_info 
> *usbif, unsigned port,
>  {
>  unsigned speed;
>  char *portname;
> -USBPort *p;
>  Error *local_err = NULL;
>  QDict *qdict;
>  QemuOpts *opts;
> +char *tmp;
>  
>  if (usbif->ports[port - 1].dev) {
>  return;
> @@ -749,11 +744,16 @@ static void usbback_portid_add(struct usbback_info 
> *usbif, unsigned port,
>  return;
>  }
>  portname++;
> -p = &(usbif->ports[port - 1].port);
> -snprintf(p->path, sizeof(p->path), "%s", portname);
>  
>  qdict = qdict_new();
>  qdict_put(qdict, "driver", qstring_from_str("usb-host"));
> +tmp = g_strdup_printf("%s.0", usbif->xendev.qdev.id);
> +qdict_put(qdict, "bus", qstring_from_str(tmp));
> +g_free(tmp);
> +tmp = g_strdup_printf("%s-%u", usbif->xendev.qdev.id, port);
> +qdict_put(qdict, "id", qstring_from_str(tmp));
> +g_free(tmp);
> +qdict_put(qdict, "port", qint_from_int(port));
>  qdict_put(qdict, "hostbus", qint_from_int(atoi(busid)));
>  qdict_put(qdict, "hostport", qstring_from_str(portname));
>  opts = qemu_opts_from_qdict(qemu_find_opts("device"), qdict, &local_err);
> @@ -765,7 +765,6 @@ static void usbback_portid_add(struct usbback_info 
> *usbif, unsigned port,
>  goto err;
>  }
>  QDECREF(qdict);
> -snprintf(p->path, sizeof(p->path), "%d", port);
>  speed = usbif->ports[port - 1].dev->speed;
>  switch (speed) {
>  case USB_SPEED_LOW:
> @@ -799,7 +798,6 @@ static void usbback_portid_add(struct usbback_info 
> *usbif, unsigned port,
>  
>  err:
>  QDECREF(qdict);
> -snprintf(p->path, sizeof(p->path), "%d", 99);
>  xen_pv_printf(&usbif->xendev, 0, "device %s could not be opened\n", 
> busid);
>  }
>  
> @@ -1012,13 +1010,13 @@ static void usbback_alloc(struct XenDevice *xendev)
>  
>  usbif = container_of(xendev, struct usbback_info, xendev);
>  
> -usb_bus_new(&usbif->bus, sizeof(usbif->bus), &xen_usb_bus_ops, 
> xen_sysdev);
> +usb_bus_new(&usbif->bus, sizeof(usbif->bus), &xen_usb_bus_ops,
> +DEVICE(&xendev->qdev));
>  for (i = 0; i < USBBACK_MAXPORTS; i++) {
>  p = &(usbif->ports[i].port);
>  usb_register_port(&usbif->bus, p, usbif, i, &xen_usb_port_ops,
>USB_SPEED_MASK_LOW | USB_SPEED_MASK_FULL |
>USB_SPEED_MASK_HIGH);
> -snprintf(p->path, sizeof(p->path), "%d", 99);
>  }
>  
>  QTAILQ_INIT(&usbif->req_free_q);
> @@ -1066,7 +1064,6 @@ static int usbback_free(struct XenDevice *xendev)
>  }
>  
>  usb_bus_release(&usbif->bus);
> -object_unparent(OBJECT(&usbif->bus));
>  
>  TR_BUS(xendev, "finished\n");
>  
> -- 
> 2.6.6
> 



Re: [Qemu-devel] [PATCH v2 1/4] xen: add an own bus for xen backend devices

2016-11-18 Thread Stefano Stabellini
On Wed, 2 Nov 2016, Juergen Gross wrote:
> Add a bus for Xen backend devices in order to be able to establish a
> dedicated device path for pluggable devices.
> 
> Signed-off-by: Juergen Gross 

Reviewed-by: Stefano Stabellini 


>  hw/xen/xen_backend.c | 19 ---
>  include/hw/xen/xen_backend.h |  4 
>  2 files changed, 20 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/xen/xen_backend.c b/hw/xen/xen_backend.c
> index 41ba5c5..5ad3caa 100644
> --- a/hw/xen/xen_backend.c
> +++ b/hw/xen/xen_backend.c
> @@ -29,14 +29,14 @@
>  #include "hw/sysbus.h"
>  #include "sysemu/char.h"
>  #include "qemu/log.h"
> +#include "qapi/error.h"
>  #include "hw/xen/xen_backend.h"
>  #include "hw/xen/xen_pvdev.h"
>  
>  #include 
>  
> -#define TYPE_XENSYSDEV "xensysdev"
> -
>  DeviceState *xen_sysdev;
> +BusState *xen_sysbus;
>  
>  /* - */
>  
> @@ -528,6 +528,8 @@ int xen_be_init(void)
>  
>  xen_sysdev = qdev_create(NULL, TYPE_XENSYSDEV);
>  qdev_init_nofail(xen_sysdev);
> +xen_sysbus = qbus_create(TYPE_XENSYSBUS, DEVICE(xen_sysdev), 
> "xen-sysbus");
> +qbus_set_bus_hotplug_handler(xen_sysbus, &error_abort);
>  
>  return 0;
>  
> @@ -586,6 +588,15 @@ int xen_be_bind_evtchn(struct XenDevice *xendev)
>  }
>  
>  
> +static const TypeInfo xensysbus_info = {
> +.name   = TYPE_XENSYSBUS,
> +.parent = TYPE_BUS,
> +.interfaces = (InterfaceInfo[]) {
> +{ TYPE_HOTPLUG_HANDLER },
> +{ }
> +}
> +};
> +
>  static int xen_sysdev_init(SysBusDevice *dev)
>  {
>  return 0;
> @@ -602,6 +613,7 @@ static void xen_sysdev_class_init(ObjectClass *klass, 
> void *data)
>  
>  k->init = xen_sysdev_init;
>  dc->props = xen_sysdev_properties;
> +dc->bus_type = TYPE_XENSYSBUS;
>  }
>  
>  static const TypeInfo xensysdev_info = {
> @@ -613,7 +625,8 @@ static const TypeInfo xensysdev_info = {
>  
>  static void xenbe_register_types(void)
>  {
> +type_register_static(&xensysbus_info);
>  type_register_static(&xensysdev_info);
>  }
>  
> -type_init(xenbe_register_types);
> +type_init(xenbe_register_types)
> diff --git a/include/hw/xen/xen_backend.h b/include/hw/xen/xen_backend.h
> index cbda40e..38f730e 100644
> --- a/include/hw/xen/xen_backend.h
> +++ b/include/hw/xen/xen_backend.h
> @@ -6,12 +6,16 @@
>  #include "sysemu/sysemu.h"
>  #include "net/net.h"
>  
> +#define TYPE_XENSYSDEV "xen-sysdev"
> +#define TYPE_XENSYSBUS "xen-sysbus"
> +
>  /* variables */
>  extern xc_interface *xen_xc;
>  extern xenforeignmemory_handle *xen_fmem;
>  extern struct xs_handle *xenstore;
>  extern const char *xen_protocol;
>  extern DeviceState *xen_sysdev;
> +extern BusState *xen_sysbus;
>  
>  int xenstore_mkdir(char *path, int p);
>  int xenstore_write_be_str(struct XenDevice *xendev, const char *node, const 
> char *val);
> -- 
> 2.6.6
> 



[Qemu-devel] [Bug 1594239] Re: After adding more scsi disks for Aarch64 virtual machine, start the VM and got Qemu Error

2016-11-18 Thread PabloSaenz
I’m sorry to post as a newbie here. I just want to confirm the bug
described above by Kevin Zhao. At least that’s what it sounds like to
me. I had a perfectly working VM on Qemu/KVM, with the VirtManager
hypervisor, which I have *only* to be able to access an Audigy pci card
from Windows XP. After I added a /dev/sbd disk as a device, the VM
wouldn’t boot and even froze my system. Then I disconnected the disk
from the VM, and now, though it doesn’t crash or freeze, I still can’t
get into the VM, and I get the error message below:

«Error starting domain: internal error: process exited while connecting
to monitor: 2016-11-18T22:45:31.643085Z qemu-system-x86_64: -drive
file=/home/[folder]/[folder]/VWinXP.raw,format=raw,if=none,id=drive-
ide0-0-0: Could not open '/home/[folder]/[folder]/VWinXP.raw':
Permission denied»

I have tried changing permissions for the .raw file, but −intriguingly, for a
newbie like me− every time I try to open the VM, the permissions are
changed automatically back to:

Owner = Libvirt Qemu
Group = kvm

I know this is not a help forum, but I would be grateful for some
feedback. Been trying to fix this non-stop for the last two days.

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1594239

Title:
  After adding more scsi disks for Aarch64 virtual machine, start the VM
  and got Qemu Error

Status in QEMU:
  Fix Released

Bug description:
  Description
  ===
  Using virt-manager to create a VM in Aarch64, Ubuntu 16.04.
  Add scsi disk to the VM. After adding four or more scsi disks, start the VM and
you will get a Qemu error.

  Steps to reproduce
  ==
  1.Use virt-manager to create a VM.
  2.After the VM is started, add scsi disk to the VM. They will be allocated to 
"sdb,sdc,sdd." .
  3.If we got a disk name > sdg, virt-manager will also assign a virtio-scsi 
controller for this disk.And the VM will be shutdown.
  4.Start the VM, will see the error log.

  
  Expected result
  ===
  Start the vm smoothly.The added disks can work.

  Actual result
  =
  Got the error:
  starting domain: internal error: process exited while connecting to monitor: 
qemu-system-aarch64: /build/qemu-zxCwKP/qemu-2.5+dfsg/migration/savevm.c:620: 
vmstate_register_with_alias_id: Assertion `!se->compat || se->instance_id == 0' 
failed.
  details=Traceback (most recent call last):
File "/usr/share/virt-manager/virtManager/asyncjob.py", line 90, in 
cb_wrapper
  callback(asyncjob, *args, **kwargs)
File "/usr/share/virt-manager/virtManager/asyncjob.py", line 126, in tmpcb
  callback(*args, **kwargs)
File "/usr/share/virt-manager/virtManager/libvirtobject.py", line 83, in 
newfn
  ret = fn(self, *args, **kwargs)
File "/usr/share/virt-manager/virtManager/domain.py", line 1402, in startup
  self._backend.create()
File "/usr/local/lib/python2.7/dist-packages/libvirt.py", line 1035, in 
create
  if ret == -1: raise libvirtError ('virDomainCreate() failed', dom=self)
  libvirtError: internal error: process exited while connecting to monitor: 
qemu-system-aarch64: /build/qemu-zxCwKP/qemu-2.5+dfsg/migration/savevm.c:620: 
vmstate_register_with_alias_id: Assertion `!se->compat || se->instance_id == 0' 
failed.

  
  Environment
  ===
  1. virt-manager version is 1.3.2

  2. Which hypervisor did you use?
  Libvirt+KVM
  $ kvm --version
  QEMU emulator version 2.5.0 (Debian 1:2.5+dfsg-5ubuntu10.1), Copyright 
(c) 2003-2008 Fabrice Bellard
  $ libvirtd --version
  libvirtd (libvirt) 1.3.1

  3. Which storage type did you use?
 In the host file system, all on one physical machine.
  stack@u202154:/opt/stack/nova$ df -hl
  Filesystem Size Used Avail Use% Mounted on
  udev 7.8G 0 7.8G 0% /dev
  tmpfs 1.6G 61M 1.6G 4% /run
  /dev/sda2 917G 41G 830G 5% /
  tmpfs 7.9G 0 7.9G 0% /dev/shm
  tmpfs 5.0M 0 5.0M 0% /run/lock
  tmpfs 7.9G 0 7.9G 0% /sys/fs/cgroup
  /dev/sda1 511M 888K 511M 1% /boot/efi
  cgmfs 100K 0 100K 0% /run/cgmanager/fs
  tmpfs 1.6G 0 1.6G 0% /run/user/1002
  tmpfs 1.6G 0 1.6G 0% /run/user/1000
  tmpfs 1.6G 0 1.6G 0% /run/user/0

  4. Environment information:
 Architecture : AARCH64
 OS: Ubuntu 16.04

  The Qemu command of libvirt is:
  2016-06-20 02:39:46.561+: starting up libvirt version: 1.3.1, package: 
1ubuntu10 (William Grant  Fri, 15 Apr 2016 12:08:21 +1000), 
qemu version: 2.5.0 (Debian 1:2.5+dfsg-5ubuntu10.1), hostname: u202154
  LC_ALL=C PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin 
QEMU_AUDIO_DRV=none /usr/bin/kvm -name cent7 -S -machine virt,accel=kvm,usb=off 
-cpu host -drive 
file=/usr/share/edk2.git/aarch64/QEMU_EFI-pflash.raw,if=pflash,format=raw,unit=0,readonly=on
 -drive 
file=/var/lib/libvirt/qemu/nvram/cent7_VARS.fd,if=pflash,format=raw,unit=1 -m 
2048 -realtime mlock=off -smp 2,sockets=2,cores=1,threads=1 -uuid 
d5462bb6-159e-4dbd-9266-bf8c07fa1695 -nographic -n

Re: [Qemu-devel] ARM documentation

2016-11-18 Thread Programmingkid

On Nov 18, 2016, at 7:31 PM, Programmingkid wrote:

> 
> On Nov 18, 2016, at 4:10 PM, Alistair Francis wrote:
> 
>> On Fri, Nov 11, 2016 at 8:53 AM, G 3  wrote:
>>> 
>>> On Nov 11, 2016, at 11:31 AM, Alex Bennée wrote:
>>> 
 
 G 3  writes:
 
> On Nov 11, 2016, at 8:27 AM, G 3 wrote:
> 
>> 
>> On Nov 11, 2016, at 8:08 AM, Stefan Hajnoczi wrote:
>> 
>>> On Fri, Nov 11, 2016 at 12:09:31AM -0500, G 3 wrote:
 
 I was wondering if there is a list somewhere of all the ARM
 boards QEMU
 supports. I want to add a section to the ARM wiki page that lists
 at least a
 few of them.
>>> 
>>> 
>>> Are you looking for something more detailed than the following?
>>> 
>>> $ arm-softmmu/qemu-system-arm -M \?
>>> Supported machines are:
>>> akitaSharp SL-C1000 (Akita) PDA (PXA270)
>>> ast2500-evb  Aspeed AST2500 EVB (ARM1176)
>>> borzoi   Sharp SL-C3100 (Borzoi) PDA (PXA270)
>>> canon-a1100  Canon PowerShot A1100 IS
>>> cheetah  Palm Tungsten|E aka. Cheetah PDA (OMAP310)
>>> collie   Sharp SL-5500 (Collie) PDA (SA-1110)
>>> connex   Gumstix Connex (PXA255)
>>> cubieboard   cubietech cubieboard
>>> highbank Calxeda Highbank (ECX-1000)
>>> imx25-pdkARM i.MX25 PDK board (ARM926)
>>> integratorcp ARM Integrator/CP (ARM926EJ-S)
>>> kzm  ARM KZM Emulation Baseboard (ARM1136)
>>> lm3s6965evb  Stellaris LM3S6965EVB
>>> lm3s811evb   Stellaris LM3S811EVB
>>> mainstoneMainstone II (PXA27x)
>>> midway   Calxeda Midway (ECX-2000)
>>> musicpal Marvell 88w8618 / MusicPal (ARM926EJ-S)
>>> n800 Nokia N800 tablet aka. RX-34 (OMAP2420)
>>> n810 Nokia N810 tablet aka. RX-44 (OMAP2420)
>>> netduino2Netduino 2 Machine
>>> none empty machine
>>> nuri Samsung NURI board (Exynos4210)
>>> palmetto-bmc OpenPOWER Palmetto BMC (ARM926EJ-S)
>>> raspi2   Raspberry Pi 2
>>> realview-eb  ARM RealView Emulation Baseboard (ARM926EJ-S)
>>> realview-eb-mpcore   ARM RealView Emulation Baseboard (ARM11MPCore)
>>> realview-pb-a8   ARM RealView Platform Baseboard for Cortex-A8
>>> realview-pbx-a9  ARM RealView Platform Baseboard Explore for
>>> Cortex-A9
>>> sabreliteFreescale i.MX6 Quad SABRE Lite Board (Cortex
>>> A9)
>>> smdkc210 Samsung SMDKC210 board (Exynos4210)
>>> spitzSharp SL-C3000 (Spitz) PDA (PXA270)
>>> sx1  Siemens SX1 (OMAP310) V2
>>> sx1-v1   Siemens SX1 (OMAP310) V1
>>> terrier  Sharp SL-C3200 (Terrier) PDA (PXA270)
>>> tosa Sharp SL-6000 (Tosa) PDA (PXA255)
>>> verdex   Gumstix Verdex (PXA270)
>>> versatileab  ARM Versatile/AB (ARM926EJ-S)
>>> versatilepb  ARM Versatile/PB (ARM926EJ-S)
>>> vexpress-a15 ARM Versatile Express for Cortex-A15
>>> vexpress-a9  ARM Versatile Express for Cortex-A9
>>> virt-2.6 QEMU 2.6 ARM Virtual Machine
>>> virt-2.7 QEMU 2.7 ARM Virtual Machine
>>> virt QEMU 2.8 ARM Virtual Machine (alias of virt-2.8)
>>> virt-2.8 QEMU 2.8 ARM Virtual Machine
>>> xilinx-zynq-a9   Xilinx Zynq Platform Baseboard for Cortex-A9
>>> z2   Zipit Z2 (PXA27x)
>>> 
>>> $ aarch64-softmmu/qemu-system-aarch64 -M \?
>>> Supported machines are:
>>> akitaSharp SL-C1000 (Akita) PDA (PXA270)
>>> ast2500-evb  Aspeed AST2500 EVB (ARM1176)
>>> borzoi   Sharp SL-C3100 (Borzoi) PDA (PXA270)
>>> canon-a1100  Canon PowerShot A1100 IS
>>> cheetah  Palm Tungsten|E aka. Cheetah PDA (OMAP310)
>>> collie   Sharp SL-5500 (Collie) PDA (SA-1110)
>>> connex   Gumstix Connex (PXA255)
>>> cubieboard   cubietech cubieboard
>>> highbank Calxeda Highbank (ECX-1000)
>>> imx25-pdkARM i.MX25 PDK board (ARM926)
>>> integratorcp ARM Integrator/CP (ARM926EJ-S)
>>> kzm  ARM KZM Emulation Baseboard (ARM1136)
>>> lm3s6965evb  Stellaris LM3S6965EVB
>>> lm3s811evb   Stellaris LM3S811EVB
>>> mainstoneMainstone II (PXA27x)
>>> midway   Calxeda Midway (ECX-2000)
>>> musicpal Marvell 88w8618 / MusicPal (ARM926EJ-S)
>>> n800 Nokia N800 tablet aka. RX-34 (OMAP2420)
>>> n810 Nokia N810 tablet aka. RX-44 (OMAP2420)
>>> netduino2Netduino 2 Machine
>>> none emp

Re: [Qemu-devel] ARM documentation

2016-11-18 Thread Programmingkid

On Nov 18, 2016, at 4:10 PM, Alistair Francis wrote:

> On Fri, Nov 11, 2016 at 8:53 AM, G 3  wrote:
>> 
>> On Nov 11, 2016, at 11:31 AM, Alex Bennée wrote:
>> 
>>> 
>>> G 3  writes:
>>> 
 On Nov 11, 2016, at 8:27 AM, G 3 wrote:
 
> 
> On Nov 11, 2016, at 8:08 AM, Stefan Hajnoczi wrote:
> 
>> On Fri, Nov 11, 2016 at 12:09:31AM -0500, G 3 wrote:
>>> 
>>> I was wondering if there is a list somewhere of all the ARM
>>> boards QEMU
>>> supports. I want to add a section to the ARM wiki page that lists
>>> at least a
>>> few of them.
>> 
>> 
>> Are you looking for something more detailed than the following?
>> 
>> $ arm-softmmu/qemu-system-arm -M \?
>> Supported machines are:
>> akitaSharp SL-C1000 (Akita) PDA (PXA270)
>> ast2500-evb  Aspeed AST2500 EVB (ARM1176)
>> borzoi   Sharp SL-C3100 (Borzoi) PDA (PXA270)
>> canon-a1100  Canon PowerShot A1100 IS
>> cheetah  Palm Tungsten|E aka. Cheetah PDA (OMAP310)
>> collie   Sharp SL-5500 (Collie) PDA (SA-1110)
>> connex   Gumstix Connex (PXA255)
>> cubieboard   cubietech cubieboard
>> highbank Calxeda Highbank (ECX-1000)
>> imx25-pdkARM i.MX25 PDK board (ARM926)
>> integratorcp ARM Integrator/CP (ARM926EJ-S)
>> kzm  ARM KZM Emulation Baseboard (ARM1136)
>> lm3s6965evb  Stellaris LM3S6965EVB
>> lm3s811evb   Stellaris LM3S811EVB
>> mainstoneMainstone II (PXA27x)
>> midway   Calxeda Midway (ECX-2000)
>> musicpal Marvell 88w8618 / MusicPal (ARM926EJ-S)
>> n800 Nokia N800 tablet aka. RX-34 (OMAP2420)
>> n810 Nokia N810 tablet aka. RX-44 (OMAP2420)
>> netduino2Netduino 2 Machine
>> none empty machine
>> nuri Samsung NURI board (Exynos4210)
>> palmetto-bmc OpenPOWER Palmetto BMC (ARM926EJ-S)
>> raspi2   Raspberry Pi 2
>> realview-eb  ARM RealView Emulation Baseboard (ARM926EJ-S)
>> realview-eb-mpcore   ARM RealView Emulation Baseboard (ARM11MPCore)
>> realview-pb-a8   ARM RealView Platform Baseboard for Cortex-A8
>> realview-pbx-a9  ARM RealView Platform Baseboard Explore for
>> Cortex-A9
>> sabreliteFreescale i.MX6 Quad SABRE Lite Board (Cortex
>> A9)
>> smdkc210 Samsung SMDKC210 board (Exynos4210)
>> spitzSharp SL-C3000 (Spitz) PDA (PXA270)
>> sx1  Siemens SX1 (OMAP310) V2
>> sx1-v1   Siemens SX1 (OMAP310) V1
>> terrier  Sharp SL-C3200 (Terrier) PDA (PXA270)
>> tosa Sharp SL-6000 (Tosa) PDA (PXA255)
>> verdex   Gumstix Verdex (PXA270)
>> versatileab  ARM Versatile/AB (ARM926EJ-S)
>> versatilepb  ARM Versatile/PB (ARM926EJ-S)
>> vexpress-a15 ARM Versatile Express for Cortex-A15
>> vexpress-a9  ARM Versatile Express for Cortex-A9
>> virt-2.6 QEMU 2.6 ARM Virtual Machine
>> virt-2.7 QEMU 2.7 ARM Virtual Machine
>> virt QEMU 2.8 ARM Virtual Machine (alias of virt-2.8)
>> virt-2.8 QEMU 2.8 ARM Virtual Machine
>> xilinx-zynq-a9   Xilinx Zynq Platform Baseboard for Cortex-A9
>> z2   Zipit Z2 (PXA27x)
>> 
>> $ aarch64-softmmu/qemu-system-aarch64 -M \?
>> Supported machines are:
>> akitaSharp SL-C1000 (Akita) PDA (PXA270)
>> ast2500-evb  Aspeed AST2500 EVB (ARM1176)
>> borzoi   Sharp SL-C3100 (Borzoi) PDA (PXA270)
>> canon-a1100  Canon PowerShot A1100 IS
>> cheetah  Palm Tungsten|E aka. Cheetah PDA (OMAP310)
>> collie   Sharp SL-5500 (Collie) PDA (SA-1110)
>> connex   Gumstix Connex (PXA255)
>> cubieboard   cubietech cubieboard
>> highbank Calxeda Highbank (ECX-1000)
>> imx25-pdkARM i.MX25 PDK board (ARM926)
>> integratorcp ARM Integrator/CP (ARM926EJ-S)
>> kzm  ARM KZM Emulation Baseboard (ARM1136)
>> lm3s6965evb  Stellaris LM3S6965EVB
>> lm3s811evb   Stellaris LM3S811EVB
>> mainstoneMainstone II (PXA27x)
>> midway   Calxeda Midway (ECX-2000)
>> musicpal Marvell 88w8618 / MusicPal (ARM926EJ-S)
>> n800 Nokia N800 tablet aka. RX-34 (OMAP2420)
>> n810 Nokia N810 tablet aka. RX-44 (OMAP2420)
>> netduino2Netduino 2 Machine
>> none empty machine
>> nuri Samsung NURI board (Exynos4210)
>> palmetto-bmc OpenPOWER Palmetto BMC (ARM926EJ-S)
>> raspi2

Re: [Qemu-devel] [PATCH v2 7/9] blkdebug: Add pass-through write_zero and discard support

2016-11-18 Thread Eric Blake
On 11/17/2016 04:47 PM, Max Reitz wrote:
> On 17.11.2016 21:14, Eric Blake wrote:
>> In order to test the effects of artificial geometry constraints
>> on operations like write zero or discard, we first need blkdebug
>> to manage these actions.  Ideally, it would be nice to let these
>> operations also react to injected errors like read/write/flush,
>> but it is not trivial to turn bdrv_aio error injection (where
>> we return BlockAIOCB*) into bdrv_co (where we return int), not
>> to mention the fact that I don't want to conflict with Kevin's
>> concurrent work on refactoring away from bdrv_aio.  So for now,
>> the operations merely have a TODO comment for adding error
>> injection.
>>
>> However, one thing we CAN test is the contract promised by the
>> block layer; namely, if a device has specified limits on
>> alignment or maximum size, then those limits must be obeyed (for
>> now, the blkdebug driver merely inherits limits from whatever it
>> is wrapping, but the next patch will further enhance it to allow
>> specific limit overrides).
>>
>> Tested by setting up an NBD server with export 'foo', then invoking:
>> $ ./qemu-io
>> qemu-io> open -o driver=blkdebug blkdebug::nbd://localhost:10809/foo
>> qemu-io> d 0 15M
>> qemu-io> w -z 0 15M
>>
>> Pre-patch, the server never sees the discard (it was silently
>> eaten by the block layer); post-patch it is passed across the
>> wire.  Likewise, pre-patch the write is always passed with
>> NBD_WRITE (with 15M of zeroes on the wire), while post-patch
>> it can utilize NBD_WRITE_ZEROES (for less traffic).
>>
>> Signed-off-by: Eric Blake 
>> ---
>>  block/blkdebug.c | 61 
>> 
>>  1 file changed, 61 insertions(+)
>>
>> diff --git a/block/blkdebug.c b/block/blkdebug.c
>> index 0a47977..d45826d 100644
>> --- a/block/blkdebug.c
>> +++ b/block/blkdebug.c
> 
> [...]
> 
>> @@ -522,6 +528,59 @@ static BlockAIOCB *blkdebug_aio_flush(BlockDriverState 
>> *bs,
>>  }
>>
>>
>> +static int coroutine_fn blkdebug_co_pwrite_zeroes(BlockDriverState *bs,
>> +  int64_t offset, int count,
>> +  BdrvRequestFlags flags)
>> +{
>> +uint32_t align = MAX(bs->bl.request_alignment,
>> + bs->bl.pwrite_zeroes_alignment);
>> +
>> +/* Regardless of whether the lower layer has a finer granularity,
>> + * we want to treat any unaligned request as unsupported, and
> 
> Why?

Hmm, at the moment, I'm having a hard time coming up with a strong
reason why I did that. I'll retest without it, and see if it still picks
up the regression fixed by 3/5; if so I'll drop it as part of the respin
(since I still have the iotest to fix); if not, I'll have a good reason
why and include it in the commit message.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH v2 4/9] block: Return -ENOTSUP rather than assert on unaligned discards

2016-11-18 Thread Eric Blake
On 11/17/2016 04:01 PM, Max Reitz wrote:
> On 17.11.2016 21:13, Eric Blake wrote:
>> Right now, the block layer rounds discard requests, so that
>> individual drivers are able to assert that discard requests
>> will never be unaligned.  But there are some ISCSI devices
>> that track and coalesce multiple unaligned requests, turning it
>> into an actual discard if the requests eventually cover an
>> entire page, which implies that it is better to always pass
>> discard requests as low down the stack as possible.
>>
>> In isolation, this patch has no semantic effect, since the
>> block layer currently never passes an unaligned request through.
>> But the block layer already has code that silently ignores
>> drivers that return -ENOTSUP for a discard request that cannot
>> be honored (as well as drivers that return 0 even when nothing
>> was done).  But the next patch will update the block layer to
>> fragment discard requests, so that clients are guaranteed that
>> they are either dealing with an unaligned head or tail, or an
>> aligned core, making it similar to the block layer semantics of
>> write zero fragmentation.
>>

>> +++ b/block/iscsi.c
>> @@ -1083,7 +1083,9 @@ coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, 
>> int64_t offset, int count)
>>  struct IscsiTask iTask;
>>  struct unmap_list list;
>>
>> -assert(is_byte_request_lun_aligned(offset, count, iscsilun));
>> +if (!is_byte_request_lun_aligned(offset, count, iscsilun)) {
>> +return -ENOTSUP;
>> +}
>>
>>  if (!iscsilun->lbp.lbpu) {
>>  /* UNMAP is not supported by the target */
> 
> Next line is:
> 
>>  return 0;
> 
> Hmm... -ENOTSUP would be the obvious return value here, too. That might
> interfere with your next patch, though.

Shouldn't interfere. I guess no one value is better than the other; I
can respin to pick a consistent value (I'd lean towards -ENOTSUP) if you
think it is worth it; but I'd rather get this into 2.8 without worrying
about it.

> 
>> diff --git a/block/qcow2.c b/block/qcow2.c
>> index e22f6dc..7cfcd84 100644
>> --- a/block/qcow2.c
>> +++ b/block/qcow2.c
>> @@ -2491,6 +2491,11 @@ static coroutine_fn int 
>> qcow2_co_pdiscard(BlockDriverState *bs,
>>  int ret;
>>  BDRVQcow2State *s = bs->opaque;
>>
>> +if (!QEMU_IS_ALIGNED(offset | count, s->cluster_size)) {
> 
> Ha! I like "offset | count".

It only works because we know that qcow2 guarantees that s->cluster_size
is a power of 2 (it does not work at the block layer, where the
bs->bl.pdiscard_align need not be a power of 2).

> 
>> +assert(count < s->cluster_size);
> 
> Maybe add a comment for this assertion? E.g. "The block layer will only
> generate unaligned discard requests that are smaller than the alignment".

Sure, if the maintainer wants a respin.

> 
>> +return -ENOTSUP;
>> +}
>> +
>>  qemu_co_mutex_lock(&s->lock);
>>  ret = qcow2_discard_clusters(bs, offset, count >> BDRV_SECTOR_BITS,
>>   QCOW2_DISCARD_REQUEST, false);
>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>> index 1fb9173..4c9af89 100644
>> --- a/block/sheepdog.c
>> +++ b/block/sheepdog.c
>> @@ -2829,8 +2829,9 @@ static coroutine_fn int 
>> sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
>>  iov.iov_len = sizeof(zero);
>>  discard_iov.iov = &iov;
>>  discard_iov.niov = 1;
>> -assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
>> -assert((count & (BDRV_SECTOR_SIZE - 1)) == 0);
>> +if (!QEMU_IS_ALIGNED(offset | count, BDRV_SECTOR_SIZE)) {

Again, works because this is a power of 2.

>> +return -ENOTSUP;
>> +}
> 
> Out of interest: Where does sheepdog tell the block layer that requests
> have to be aligned to this value?

It's Magic !  Don't tell anyone that I told you :)

Actually, it's because sheepdog still uses .bdrv_co_readv
(sector-based), and not .bdrv_co_preadv (byte-based), so the block layer
automatically sets bs->bl.request_alignment to BDRV_SECTOR_SIZE for
sheepdog and all other old-school drivers.  The block layer then treats
bs->bl.request_alignment as the very minimum that can be changed in the
image at a time, so it only makes sense that sheepdog can't react to a
discard request aligned below those limits.

This code is weakening from an assertion to an early error return, and
then 5/9 is what starts calling the code even with something aligned
smaller than a sector.

Someday the sheepdog driver may be relaxed to implement byte-based
callbacks, and then we may want to delete the early error return of
-ENOTSUP for requests smaller than 512; but that depends on whether
sheepdog uses bytes or sectors over the wire.

> 
> With this patch, it doesn't matter though, it only did before, so:
> 
> Reviewed-by: Max Reitz 
> 
>>  acb = sd_aio_setup(bs, &discard_iov, offset >> BDRV_SECTOR_BITS,
>> count >> BDRV_SECTOR_BITS);
>>  acb->aiocb_type = AIOCB_DISCARD_OBJ;
>>
> 
> 

-- 
Eric Blake   eblake redhat

[Qemu-devel] [PATCH for-2.8] dma/rc4030: translate memory accesses only when they occur

2016-11-18 Thread Hervé Poussineau
This simplifies the code a lot, and this fixes big memory leaks
introduced in a3d586f704609a45b6037534cb2f34da5dfd8895

Windows NT is now able to boot without using gigabytes of ram on the host.

Signed-off-by: Hervé Poussineau 
---
 hw/dma/rc4030.c | 158 +---
 1 file changed, 36 insertions(+), 122 deletions(-)

diff --git a/hw/dma/rc4030.c b/hw/dma/rc4030.c
index 41fc043..5f10b9d 100644
--- a/hw/dma/rc4030.c
+++ b/hw/dma/rc4030.c
@@ -34,8 +34,6 @@
 //
 /* rc4030 emulation */
 
-#define MAX_TL_ENTRIES 512
-
 typedef struct dma_pagetable_entry {
 int32_t frame;
 int32_t owner;
@@ -91,14 +89,8 @@ typedef struct rc4030State
 qemu_irq timer_irq;
 qemu_irq jazz_bus_irq;
 
-/* biggest translation table */
-MemoryRegion dma_tt;
-/* translation table memory region alias, added to system RAM */
-MemoryRegion dma_tt_alias;
 /* whole DMA memory region, root of DMA address space */
 MemoryRegion dma_mr;
-/* translation table entry aliases, added to DMA memory region */
-MemoryRegion dma_mrs[MAX_TL_ENTRIES];
 AddressSpace dma_as;
 
 MemoryRegion iomem_chipset;
@@ -256,96 +248,6 @@ static uint64_t rc4030_read(void *opaque, hwaddr addr, 
unsigned int size)
 return val;
 }
 
-static void rc4030_dma_as_update_one(rc4030State *s, int index, uint32_t frame)
-{
-if (index < MAX_TL_ENTRIES) {
-memory_region_set_enabled(&s->dma_mrs[index], false);
-}
-
-if (!frame) {
-return;
-}
-
-if (index >= MAX_TL_ENTRIES) {
-qemu_log_mask(LOG_UNIMP,
-  "rc4030: trying to use too high "
-  "translation table entry %d (max allowed=%d)",
-  index, MAX_TL_ENTRIES);
-return;
-}
-memory_region_set_alias_offset(&s->dma_mrs[index], frame);
-memory_region_set_enabled(&s->dma_mrs[index], true);
-}
-
-static void rc4030_dma_tt_write(void *opaque, hwaddr addr, uint64_t data,
-unsigned int size)
-{
-rc4030State *s = opaque;
-
-/* write memory */
-memcpy(memory_region_get_ram_ptr(&s->dma_tt) + addr, &data, size);
-
-/* update dma address space (only if frame field has been written) */
-if (addr % sizeof(dma_pagetable_entry) == 0) {
-int index = addr / sizeof(dma_pagetable_entry);
-memory_region_transaction_begin();
-rc4030_dma_as_update_one(s, index, (uint32_t)data);
-memory_region_transaction_commit();
-}
-}
-
-static const MemoryRegionOps rc4030_dma_tt_ops = {
-.write = rc4030_dma_tt_write,
-.impl.min_access_size = 4,
-.impl.max_access_size = 4,
-};
-
-static void rc4030_dma_tt_update(rc4030State *s, uint32_t new_tl_base,
- uint32_t new_tl_limit)
-{
-int entries, i;
-dma_pagetable_entry *dma_tl_contents;
-
-if (s->dma_tl_limit) {
-/* write old dma tl table to physical memory */
-memory_region_del_subregion(get_system_memory(), &s->dma_tt_alias);
-cpu_physical_memory_write(s->dma_tl_limit & 0x7fff,
-  memory_region_get_ram_ptr(&s->dma_tt),
-  memory_region_size(&s->dma_tt_alias));
-}
-object_unparent(OBJECT(&s->dma_tt_alias));
-
-s->dma_tl_base = new_tl_base;
-s->dma_tl_limit = new_tl_limit;
-new_tl_base &= 0x7fff;
-
-if (s->dma_tl_limit) {
-uint64_t dma_tt_size;
-if (s->dma_tl_limit <= memory_region_size(&s->dma_tt)) {
-dma_tt_size = s->dma_tl_limit;
-} else {
-dma_tt_size = memory_region_size(&s->dma_tt);
-}
-memory_region_init_alias(&s->dma_tt_alias, OBJECT(s),
- "dma-table-alias",
- &s->dma_tt, 0, dma_tt_size);
-dma_tl_contents = memory_region_get_ram_ptr(&s->dma_tt);
-cpu_physical_memory_read(new_tl_base, dma_tl_contents, dma_tt_size);
-
-memory_region_transaction_begin();
-entries = dma_tt_size / sizeof(dma_pagetable_entry);
-for (i = 0; i < entries; i++) {
-rc4030_dma_as_update_one(s, i, dma_tl_contents[i].frame);
-}
-memory_region_add_subregion(get_system_memory(), new_tl_base,
-&s->dma_tt_alias);
-memory_region_transaction_commit();
-} else {
-memory_region_init(&s->dma_tt_alias, OBJECT(s),
-   "dma-table-alias", 0);
-}
-}
-
 static void rc4030_write(void *opaque, hwaddr addr, uint64_t data,
  unsigned int size)
 {
@@ -362,11 +264,11 @@ static void rc4030_write(void *opaque, hwaddr addr, 
uint64_t data,
 break;
 /* DMA transl. table base */
 case 0x0018:
-rc4030_dma_tt_update(s, val, s->dma_tl_limit);
+s->dma_tl_base = val;
 

[Qemu-devel] [PATCH for-2.8] scsi/esp: do not raise an interrupt when reading the FIFO register

2016-11-18 Thread Hervé Poussineau
This fixes SCSI adapter self-tests done in MIPS Jazz emulation,
broken since ff589551c8e8e9e95e211b9d8daafb4ed39f1aec.

Signed-off-by: Hervé Poussineau 
---
 hw/scsi/esp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/hw/scsi/esp.c b/hw/scsi/esp.c
index 1f2f2d3..5a5a4e9 100644
--- a/hw/scsi/esp.c
+++ b/hw/scsi/esp.c
@@ -406,11 +406,9 @@ uint64_t esp_reg_read(ESPState *s, uint32_t saddr)
 /* Data out.  */
 qemu_log_mask(LOG_UNIMP, "esp: PIO data read not implemented\n");
 s->rregs[ESP_FIFO] = 0;
-esp_raise_irq(s);
 } else if (s->ti_rptr < s->ti_wptr) {
 s->ti_size--;
 s->rregs[ESP_FIFO] = s->ti_buf[s->ti_rptr++];
-esp_raise_irq(s);
 }
 if (s->ti_rptr == s->ti_wptr) {
 s->ti_rptr = 0;
-- 
2.1.4




Re: [Qemu-devel] [RFC PATCH 6/8] quorum: Avoid bdrv_aio_writev() for rewrites

2016-11-18 Thread Eric Blake
On 11/18/2016 06:21 AM, Kevin Wolf wrote:

>>> +ret = bdrv_co_pwritev(s->children[co->i],
>>> +  acb->sector_num * BDRV_SECTOR_SIZE,
>>> +  acb->nb_sectors * BDRV_SECTOR_SIZE,
>>> +  acb->qiov, 0);
>>> +(void) ret;
>>
>> Why do you need 'ret' at all? If it's a placeholder to remind us to do
>> something with this value in the future, you can simply add a FIXME
>> comment.
> 
> I'm not sure whether we want to fix anything, it looks intentional to
> me. I just wanted to be explicit about the ignored return value, both
> for human readers and for tools like Coverity.

In bdrv_co_flush(), we have:

/* Return value is ignored - it's ok if wait queue is empty */
qemu_co_queue_next(&bs->flush_queue);

I don't know if Coverity would squawk, but the cast to void looks a bit
stranger to me than a comment, where what we did in bdrv_co_flush()
seems reasonable.  There's also the patch proposal to introduce
ignore_value() in place of a cast to void, which is a bit more
self-documenting about places that intentionally ignore a return value
while still shutting Coverity up:

https://lists.gnu.org/archive/html/qemu-devel/2016-09/msg05165.html

> 
>>> +/* one less rewrite to do */
>>> +acb->rewrite_count--;
>>> +qemu_coroutine_enter_if_inactive(acb->co);
>>
>> I think you should only enter acb->co when acb->rewrite_count reaches
>> zero.
>>
>> In all other cases the main coroutine simply iterates inside the while()
>> loop, verifies that the number is still positive and yields again.
>>
>> The same applies to all other cases of qemu_coroutine_enter_if_inactive,
>> by the way (I failed to notice it in patch #5).
> 
> I think I like it better this way because it keeps the loop condition
> local to the caller instead of spreading it across the caller and the
> places that reenter. On the other hand, I can see that not doing the
> extra context switch might be a little more efficient.

Do we have a feel for how many context switches this would save?  If
it's in the noise, cleaner code is probably a win; but if it is a
hotspot, then we should definitely try the optimization.

> 
> If you feel strongly about this, I will change it.
> 
> Kevin
> 
> 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] ARM documentation

2016-11-18 Thread Alistair Francis
On Fri, Nov 11, 2016 at 8:53 AM, G 3  wrote:
>
> On Nov 11, 2016, at 11:31 AM, Alex Bennée wrote:
>
>>
>> G 3  writes:
>>
>>> On Nov 11, 2016, at 8:27 AM, G 3 wrote:
>>>

 On Nov 11, 2016, at 8:08 AM, Stefan Hajnoczi wrote:

> On Fri, Nov 11, 2016 at 12:09:31AM -0500, G 3 wrote:
>>
>> I was wondering if there is a list somewhere of all the ARM
>> boards QEMU
>> supports. I want to add a section to the ARM wiki page that lists
>> at least a
>> few of them.
>
>
> Are you looking for something more detailed than the following?
>
> $ arm-softmmu/qemu-system-arm -M \?
> Supported machines are:
> akitaSharp SL-C1000 (Akita) PDA (PXA270)
> ast2500-evb  Aspeed AST2500 EVB (ARM1176)
> borzoi   Sharp SL-C3100 (Borzoi) PDA (PXA270)
> canon-a1100  Canon PowerShot A1100 IS
> cheetah  Palm Tungsten|E aka. Cheetah PDA (OMAP310)
> collie   Sharp SL-5500 (Collie) PDA (SA-1110)
> connex   Gumstix Connex (PXA255)
> cubieboard   cubietech cubieboard
> highbank Calxeda Highbank (ECX-1000)
> imx25-pdkARM i.MX25 PDK board (ARM926)
> integratorcp ARM Integrator/CP (ARM926EJ-S)
> kzm  ARM KZM Emulation Baseboard (ARM1136)
> lm3s6965evb  Stellaris LM3S6965EVB
> lm3s811evb   Stellaris LM3S811EVB
> mainstoneMainstone II (PXA27x)
> midway   Calxeda Midway (ECX-2000)
> musicpal Marvell 88w8618 / MusicPal (ARM926EJ-S)
> n800 Nokia N800 tablet aka. RX-34 (OMAP2420)
> n810 Nokia N810 tablet aka. RX-44 (OMAP2420)
> netduino2Netduino 2 Machine
> none empty machine
> nuri Samsung NURI board (Exynos4210)
> palmetto-bmc OpenPOWER Palmetto BMC (ARM926EJ-S)
> raspi2   Raspberry Pi 2
> realview-eb  ARM RealView Emulation Baseboard (ARM926EJ-S)
> realview-eb-mpcore   ARM RealView Emulation Baseboard (ARM11MPCore)
> realview-pb-a8   ARM RealView Platform Baseboard for Cortex-A8
> realview-pbx-a9  ARM RealView Platform Baseboard Explore for
> Cortex-A9
> sabreliteFreescale i.MX6 Quad SABRE Lite Board (Cortex
> A9)
> smdkc210 Samsung SMDKC210 board (Exynos4210)
> spitzSharp SL-C3000 (Spitz) PDA (PXA270)
> sx1  Siemens SX1 (OMAP310) V2
> sx1-v1   Siemens SX1 (OMAP310) V1
> terrier  Sharp SL-C3200 (Terrier) PDA (PXA270)
> tosa Sharp SL-6000 (Tosa) PDA (PXA255)
> verdex   Gumstix Verdex (PXA270)
> versatileab  ARM Versatile/AB (ARM926EJ-S)
> versatilepb  ARM Versatile/PB (ARM926EJ-S)
> vexpress-a15 ARM Versatile Express for Cortex-A15
> vexpress-a9  ARM Versatile Express for Cortex-A9
> virt-2.6 QEMU 2.6 ARM Virtual Machine
> virt-2.7 QEMU 2.7 ARM Virtual Machine
> virt QEMU 2.8 ARM Virtual Machine (alias of virt-2.8)
> virt-2.8 QEMU 2.8 ARM Virtual Machine
> xilinx-zynq-a9   Xilinx Zynq Platform Baseboard for Cortex-A9
> z2   Zipit Z2 (PXA27x)
>
> $ aarch64-softmmu/qemu-system-aarch64 -M \?
> Supported machines are:
> akitaSharp SL-C1000 (Akita) PDA (PXA270)
> ast2500-evb  Aspeed AST2500 EVB (ARM1176)
> borzoi   Sharp SL-C3100 (Borzoi) PDA (PXA270)
> canon-a1100  Canon PowerShot A1100 IS
> cheetah  Palm Tungsten|E aka. Cheetah PDA (OMAP310)
> collie   Sharp SL-5500 (Collie) PDA (SA-1110)
> connex   Gumstix Connex (PXA255)
> cubieboard   cubietech cubieboard
> highbank Calxeda Highbank (ECX-1000)
> imx25-pdkARM i.MX25 PDK board (ARM926)
> integratorcp ARM Integrator/CP (ARM926EJ-S)
> kzm  ARM KZM Emulation Baseboard (ARM1136)
> lm3s6965evb  Stellaris LM3S6965EVB
> lm3s811evb   Stellaris LM3S811EVB
> mainstoneMainstone II (PXA27x)
> midway   Calxeda Midway (ECX-2000)
> musicpal Marvell 88w8618 / MusicPal (ARM926EJ-S)
> n800 Nokia N800 tablet aka. RX-34 (OMAP2420)
> n810 Nokia N810 tablet aka. RX-44 (OMAP2420)
> netduino2Netduino 2 Machine
> none empty machine
> nuri Samsung NURI board (Exynos4210)
> palmetto-bmc OpenPOWER Palmetto BMC (ARM926EJ-S)
> raspi2   Raspberry Pi 2
> realview-eb  ARM RealView Emulation Baseboard (ARM926EJ-S)
> realview-eb-mpcore   ARM RealView Emulation Baseboard (ARM11MPCore)

Re: [Qemu-devel] [PATCH 1/2] compiler: add ignore_value() macro

2016-11-18 Thread Eric Blake
On 10/25/2016 10:21 AM, Felipe Franciosi wrote:
> 
>> On 12 Oct 2016, at 18:20, Felipe Franciosi  wrote:
>>
>>
>>> On 21 Sep 2016, at 19:15, Eric Blake  wrote:
>>>
>>> On 09/21/2016 10:27 AM, Felipe Franciosi wrote:
 On GCC versions 3.4 and newer, simply using (void) in front of a
 function that has been declared with WUR will no longer suppress a
 compilation warning. This commit brings the ignore_value() macro from
 GNULIB's ignore_value.h, licensed under the terms of LGPLv2+.

 See the link below for the original author's comment:
 https://lists.nongnu.org/archive/html/qemu-devel/2016-09/msg05148.html

 Signed-off-by: Felipe Franciosi 
 ---
 include/qemu/compiler.h | 8 
 1 file changed, 8 insertions(+)
>>>
>>> Reviewed-by: Eric Blake 
>>>

> 
> Ping?

It's missed 2.8, but I still think it is a useful patch; I just
commented on another thread about a case where a cast-to-void might look
better with the use of ignore_value().

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [RFC v2 0/3] virtio-net: Add support to MTU feature

2016-11-18 Thread Aaron Conole
Maxime Coquelin  writes:

> On 11/18/2016 07:15 PM, Aaron Conole wrote:
>> Maxime Coquelin  writes:
>>
>>> This series implements Virtio spec update from Aaron Conole which
>>> defines a way for the host to expose its max MTU to the guest.
>>>
>>> Changes since RFC v1:
>>> -
>>>  - Rebased on top of v2.8.0-rc0 (2.7.90)
>>>  - Write MTU unconditionally in netcfg to avoid memory leak (Paolo)
>>>  - Add host_mtu property to be able to disable the feature from QEMU
>>>
>>> Maxime Coquelin (3):
>>>   vhost-user: Add new protocol feature MTU
>>>   vhost-net: Add new MTU feature support
>>>   virtio-net: Add MTU feature support
>>>
>>>  hw/net/vhost_net.c | 11 +++
>>>  hw/net/virtio-net.c| 14 ++
>>>  hw/virtio/vhost-user.c | 11 +++
>>>  include/hw/virtio/vhost.h  |  1 +
>>>  include/hw/virtio/virtio-net.h |  1 +
>>>  include/net/vhost_net.h|  2 ++
>>>  6 files changed, 40 insertions(+)
>>
>> I ran this with a VM, but it seems the offered maximum MTU was of value
>> 0 - is this expected with this version?  How can I change the offered
>> value?  Sorry, I'm not as familiar with QEMU/libvirt side of the world.
>
> The way I implemented it, the MTU value is to be provided by the
> vhost-user process (e.g. OVS/DPDK). I added a Vhost protocol
> feature for this. The sequence is:
> 1. Qemu sends a VHOST_USER_GET_PROTOCOL_FEATURES request
> 2. DPDK replies with the supported features
> 3. If DPDK supports VHOST_USER_PROTOCOL_F_MTU, Qemu sends a
>    VHOST_USER_GET_MTU request
> 4. DPDK replies with the MTU value
>
> Does that make sense?

In the case of a vhost-user backed port, yes (so for instance, if I use
ovs+dpdk vhost-user in client or server mode).  However, what about the
non-dpdk case, where I still use a virtio-net driver in kernel and want
to have it backed with, say, a tap device in the host attached to
virbr0 (or some other bridge).  It should still pull the mtu from that
device and offer it, I think.

> Another possibility would be that we could directly pass the MTU value
> to Qemu. It may be easier to implement, and to handle migration.
> Problem is that if we do this, this is not the vSwitch that decides the
> MTU to set.

It might be better to determine the MTU by looking at what actually
provides the back-end for the networking.

> Regards,
> Maxime



Re: [Qemu-devel] [RFC v2 0/3] virtio-net: Add support to MTU feature

2016-11-18 Thread Maxime Coquelin



On 11/18/2016 07:15 PM, Aaron Conole wrote:

Maxime Coquelin  writes:


This series implements Virtio spec update from Aaron Conole which
defines a way for the host to expose its max MTU to the guest.

Changes since RFC v1:
-
 - Rebased on top of v2.8.0-rc0 (2.7.90)
 - Write MTU unconditionally in netcfg to avoid memory leak (Paolo)
 - Add host_mtu property to be able to disable the feature from QEMU

Maxime Coquelin (3):
  vhost-user: Add new protocol feature MTU
  vhost-net: Add new MTU feature support
  virtio-net: Add MTU feature support

 hw/net/vhost_net.c | 11 +++
 hw/net/virtio-net.c| 14 ++
 hw/virtio/vhost-user.c | 11 +++
 include/hw/virtio/vhost.h  |  1 +
 include/hw/virtio/virtio-net.h |  1 +
 include/net/vhost_net.h|  2 ++
 6 files changed, 40 insertions(+)


I ran this with a VM, but it seems the offered maximum MTU was of value
0 - is this expected with this version?  How can I change the offered
value?  Sorry, I'm not as familiar with QEMU/libvirt side of the world.


The way I implemented it, the MTU value is to be provided by the
vhost-user process (e.g. OVS/DPDK). I added a Vhost protocol
feature for this. The sequence is:
1. Qemu sends a VHOST_USER_GET_PROTOCOL_FEATURES request
2. DPDK replies with the supported features
3. If DPDK supports VHOST_USER_PROTOCOL_F_MTU, Qemu sends a
   VHOST_USER_GET_MTU request
4. DPDK replies with the MTU value

Does that make sense?

Another possibility would be that we could directly pass the MTU value
to Qemu. It may be easier to implement, and to handle migration.
Problem is that if we do this, this is not the vSwitch that decides the
MTU to set.

Regards,
Maxime



Re: [Qemu-devel] [PATCH] xen_disk: convert discard input to byte ranges

2016-11-18 Thread Eric Blake
On 11/18/2016 11:41 AM, Olaf Hering wrote:
> On Fri, Nov 18, Eric Blake wrote:
> 
>> On 11/18/2016 04:24 AM, Olaf Hering wrote:
>>> +/* Overflowing byte limit? */
>>> +if ((sec_start + sec_count) > ((INT64_MAX + INT_MAX) >> 
>>> BDRV_SECTOR_BITS)) {
>> This is undefined.  INT64_MAX + anything non-negative overflows int64,
> 
> The expanded value used to be stored into a uint64_t before it was used
> here. A "cleanup" introduced this error. Thanks for spotting.
> 
>> If you are trying to detect guests that make a request that would cover
>> more than INT64_MAX bytes, you can simplify.  Besides, for as much
>> storage as there is out there, I seriously doubt ANYONE will ever have
>> 2^63 bytes addressable through a single device.  Why not just write it as:
>>
>> if ((INT64_MAX >> BDRV_SECTOR_BITS) - sec_count < sec_start) {
> 
> That would always be false I think. I will resubmit with this:
> if ((sec_start + sec_count) > (INT64_MAX >> BDRV_SECTOR_BITS)) {

You're testing whether something overflows, but you don't want to cause
overflow as part of the test.  So rearrange the comparison (subtract
sec_count from both sides) so that sec_start+sec_count is never
computed, and you get:

if (sec_start > (INT64_MAX >> BDRV_SECTOR_BITS) - sec_count)

but that's exactly the expression I wrote above.

> 
> Regarding the cast for ->req, it has type blkif_request_t, but the
> pointer needs to be assigned to type blkif_request_discard_t.

Then why is the cast to (void*) instead of (blkif_request_discard_t*) ?

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [RFC v2 2/3] vhost-net: Add new MTU feature support

2016-11-18 Thread Aaron Conole
Maxime Coquelin  writes:

> If the VHOST_USER_F_MTU feature is negotiated, vhost-net makes the
> advised MTU available to virtio-net through a vhost_net_get_mtu()

s/advised/maximum/

> call.
>
> Cc: Michael S. Tsirkin 
> Cc: Aaron Conole 
> Signed-off-by: Maxime Coquelin 
> ---
>  hw/net/vhost_net.c  | 11 +++
>  include/net/vhost_net.h |  2 ++
>  2 files changed, 13 insertions(+)
>
> diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
> index f2d49ad..21057d6 100644
> --- a/hw/net/vhost_net.c
> +++ b/hw/net/vhost_net.c
> @@ -74,6 +74,7 @@ static const int user_feature_bits[] = {
>  VIRTIO_NET_F_HOST_ECN,
>  VIRTIO_NET_F_HOST_UFO,
>  VIRTIO_NET_F_MRG_RXBUF,
> +VIRTIO_NET_F_MTU,
>  
>  /* This bit implies RARP isn't sent by QEMU out of band */
>  VIRTIO_NET_F_GUEST_ANNOUNCE,
> @@ -435,6 +436,11 @@ int vhost_set_vring_enable(NetClientState *nc, int 
> enable)
>  return 0;
>  }
>  
> +uint64_t vhost_net_get_mtu(struct vhost_net *net)
> +{
> +return net->dev.mtu;
> +}
> +
>  #else
>  uint64_t vhost_net_get_max_queues(VHostNetState *net)
>  {
> @@ -501,4 +507,9 @@ int vhost_set_vring_enable(NetClientState *nc, int enable)
>  {
>  return 0;
>  }
> +
> +uint64_t vhost_net_get_mtu(struct vhost_net *net)
> +{
> +return 0;
> +}
>  #endif
> diff --git a/include/net/vhost_net.h b/include/net/vhost_net.h
> index 5a08eff..37de17b 100644
> --- a/include/net/vhost_net.h
> +++ b/include/net/vhost_net.h
> @@ -35,4 +35,6 @@ int vhost_set_vring_enable(NetClientState * nc, int enable);
>  
>  uint64_t vhost_net_get_acked_features(VHostNetState *net);
>  
> +uint64_t vhost_net_get_mtu(struct vhost_net *net);
> +
>  #endif



Re: [Qemu-devel] [RFC v2 0/3] virtio-net: Add support to MTU feature

2016-11-18 Thread Aaron Conole
Maxime Coquelin  writes:

> This series implements Virtio spec update from Aaron Conole which
> defines a way for the host to expose its max MTU to the guest.
>
> Changes since RFC v1:
> -
>  - Rebased on top of v2.8.0-rc0 (2.7.90)
>  - Write MTU unconditionally in netcfg to avoid memory leak (Paolo)
>  - Add host_mtu property to be able to disable the feature from QEMU
>
> Maxime Coquelin (3):
>   vhost-user: Add new protocol feature MTU
>   vhost-net: Add new MTU feature support
>   virtio-net: Add MTU feature support
>
>  hw/net/vhost_net.c | 11 +++
>  hw/net/virtio-net.c| 14 ++
>  hw/virtio/vhost-user.c | 11 +++
>  include/hw/virtio/vhost.h  |  1 +
>  include/hw/virtio/virtio-net.h |  1 +
>  include/net/vhost_net.h|  2 ++
>  6 files changed, 40 insertions(+)

I ran this with a VM, but it seems the offered maximum MTU was of value
0 - is this expected with this version?  How can I change the offered
value?  Sorry, I'm not as familiar with QEMU/libvirt side of the world.

-Aaron



Re: [Qemu-devel] [PATCH] ipmi: fix qemu crash while migrating with ipmi

2016-11-18 Thread Corey Minyard

On 11/18/2016 02:22 AM, Zhuangyanying wrote:

From: ZhuangYanying 

QEMU crashes on the source side while migrating, after starting the IPMI
service inside the VM.


Yeah, I see the issue with that, it won't always work.

Reviewed-by: Corey Minyard 

Thanks.



./x86_64-softmmu/qemu-system-x86_64 --enable-kvm -smp 4 -m 4096 \
-drive 
file=/work/suse/suse11_sp3_64_vt,format=raw,if=none,id=drive-virtio-disk0,cache=none
 \
-device 
virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0
 \
-vnc :99 -monitor vc -device ipmi-bmc-sim,id=bmc0 -device 
isa-ipmi-kcs,bmc=bmc0,ioport=0xca2

Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7ffec4268700 (LWP 7657)]
__memcpy_ssse3_back () at ../sysdeps/x86_64/multiarch/memcpy-ssse3-back.S:2757
(gdb) bt
  #0  __memcpy_ssse3_back () at 
../sysdeps/x86_64/multiarch/memcpy-ssse3-back.S:2757
  #1  0x559ef775 in memcpy (__len=3, __src=0xc1421c, __dest=)
  at /usr/include/bits/string3.h:51
  #2  qemu_put_buffer (f=0x57a97690, buf=0xc1421c , size=3)
  at migration/qemu-file.c:346
  #3  0x559eef66 in vmstate_save_state (f=f@entry=0x57a97690,
  vmsd=0x55f8a5a0 , opaque=0x57231160,
  vmdesc=vmdesc@entry=0x5798cc40) at migration/vmstate.c:333
  #4  0x557cfe45 in vmstate_save (f=f@entry=0x57a97690, 
se=se@entry=0x57231de0,
  vmdesc=vmdesc@entry=0x5798cc40) at 
/mnt/sdb/zyy/qemu/migration/savevm.c:720
  #5  0x557d2be7 in qemu_savevm_state_complete_precopy 
(f=0x57a97690,
  iterable_only=iterable_only@entry=false) at 
/mnt/sdb/zyy/qemu/migration/savevm.c:1128
  #6  0x559ea102 in migration_completion (start_time=,
  old_vm_running=, current_active_state=,
  s=0x560eaa80 ) at migration/migration.c:1707
  #7  migration_thread (opaque=0x560eaa80 ) at 
migration/migration.c:1855
  #8  0x73900dc5 in start_thread (arg=0x7ffec4268700) at 
pthread_create.c:308
  #9  0x7fffefc6c71d in clone () at 
../sysdeps/unix/sysv/linux/x86_64/clone.S:113

Signed-off-by: Zhuang Yanying 
---
  hw/ipmi/isa_ipmi_kcs.c | 6 ++
  1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/hw/ipmi/isa_ipmi_kcs.c b/hw/ipmi/isa_ipmi_kcs.c
index 9a38f8a..8044497 100644
--- a/hw/ipmi/isa_ipmi_kcs.c
+++ b/hw/ipmi/isa_ipmi_kcs.c
@@ -433,10 +433,8 @@ const VMStateDescription vmstate_ISAIPMIKCSDevice = {
  VMSTATE_BOOL(kcs.use_irq, ISAIPMIKCSDevice),
  VMSTATE_BOOL(kcs.irqs_enabled, ISAIPMIKCSDevice),
  VMSTATE_UINT32(kcs.outpos, ISAIPMIKCSDevice),
-VMSTATE_VBUFFER_UINT32(kcs.outmsg, ISAIPMIKCSDevice, 1, NULL, 0,
-   kcs.outlen),
-VMSTATE_VBUFFER_UINT32(kcs.inmsg, ISAIPMIKCSDevice, 1, NULL, 0,
-   kcs.inlen),
+VMSTATE_UINT8_ARRAY(kcs.outmsg, ISAIPMIKCSDevice, MAX_IPMI_MSG_SIZE),
+VMSTATE_UINT8_ARRAY(kcs.inmsg, ISAIPMIKCSDevice, MAX_IPMI_MSG_SIZE),
  VMSTATE_BOOL(kcs.write_end, ISAIPMIKCSDevice),
  VMSTATE_UINT8(kcs.status_reg, ISAIPMIKCSDevice),
  VMSTATE_UINT8(kcs.data_out_reg, ISAIPMIKCSDevice),






Re: [Qemu-devel] [PULL] tftp: fake support for netascii protocol

2016-11-18 Thread no-reply
Hi,

Your series seems to have some coding style problems. See output below for
more information:

Subject: [Qemu-devel] [PULL] tftp: fake support for netascii protocol
Type: series
Message-id: 20161118175128.17192-2-samuel.thiba...@ens-lyon.org

=== TEST SCRIPT BEGIN ===
#!/bin/bash

BASE=base
n=1
total=$(git log --oneline $BASE.. | wc -l)
failed=0

# Useful git options
git config --local diff.renamelimit 0
git config --local diff.renames True

commits="$(git log --format=%H --reverse $BASE..)"
for c in $commits; do
echo "Checking PATCH $n/$total: $(git log -n 1 --format=%s $c)..."
if ! git show $c --format=email | ./scripts/checkpatch.pl --mailback -; then
failed=1
echo
fi
n=$((n+1))
done

exit $failed
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 * [new tag] 
patchew/20161118175128.17192-2-samuel.thiba...@ens-lyon.org -> 
patchew/20161118175128.17192-2-samuel.thiba...@ens-lyon.org
Switched to a new branch 'test'
01931bf tftp: fake support for netascii protocol

=== OUTPUT BEGIN ===
Checking PATCH 1/1: tftp: fake support for netascii protocol...
ERROR: suspect code indent for conditional statements (2, 6)
#27: FILE: slirp/tftp.c:329:
+  if (strcasecmp(&tp->x.tp_buf[k], "octet") == 0) {
+  k += 6;

ERROR: suspect code indent for conditional statements (2, 6)
#29: FILE: slirp/tftp.c:331:
+  } else if (strcasecmp(&tp->x.tp_buf[k], "netascii") == 0) {
+  k += 9;

total: 2 errors, 0 warnings, 18 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

=== OUTPUT END ===

Test command exited with code: 1


---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@freelists.org

Re: [Qemu-devel] [PATCH] tftp: fake support for netascii protocol

2016-11-18 Thread Samuel Thibault
Hello,

Vincent Bernat, on Thu 17 Nov 2016 13:22:32 +0100, wrote:
>  ❦ 17 novembre 2016 13:20 +0100, Vincent Bernat  :
> 
> > Some network equipment requests files using the netascii
> > protocol, and this is not configurable. Currently, qemu's tftpd only
> > supports the octet protocol. This commit makes it accept the netascii
> > protocol as well, but it does not perform the requested transformation
> > (LF -> CR,LF) as that would be far more complex. The current
> > implementation is good enough. A user always has the choice to
> > pre-encode the served file correctly.
> 
> Signed-off-by: Vincent Bernat 

Thanks, I've pushed to my tree and requested a pull.

Samuel



[Qemu-devel] [PULL] tftp: fake support for netascii protocol

2016-11-18 Thread Samuel Thibault
From: Vincent Bernat 

Some network equipment requests files using the netascii
protocol, and this is not configurable. Currently, qemu's tftpd only
supports the octet protocol. This commit makes it accept the netascii
protocol as well, but it does not perform the requested transformation
(LF -> CR,LF) as that would be far more complex. The current
implementation is good enough. A user always has the choice to
pre-encode the served file correctly.

Signed-off-by: Vincent Bernat 
Signed-off-by: Samuel Thibault 
---
 slirp/tftp.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/slirp/tftp.c b/slirp/tftp.c
index c185906..ab1c05d 100644
--- a/slirp/tftp.c
+++ b/slirp/tftp.c
@@ -326,13 +326,15 @@ static void tftp_handle_rrq(Slirp *slirp, struct 
sockaddr_storage *srcsas,
 return;
   }
 
-  if (strcasecmp(&tp->x.tp_buf[k], "octet") != 0) {
+  if (strcasecmp(&tp->x.tp_buf[k], "octet") == 0) {
+  k += 6;
+  } else if (strcasecmp(&tp->x.tp_buf[k], "netascii") == 0) {
+  k += 9;
+  } else {
   tftp_send_error(spt, 4, "Unsupported transfer mode", tp);
   return;
   }
 
-  k += 6; /* skipping octet */
-
   /* do sanity checks on the filename */
   if (!strncmp(req_fname, "../", 3) ||
   req_fname[strlen(req_fname) - 1] == '/' ||
-- 
2.10.2




[Qemu-devel] [PULL] tftp: fake support for netascii protocol

2016-11-18 Thread Samuel Thibault
The following changes since commit b0bcc86d2a87456f5a276f941dc775b265b309cf:

  Update version for v2.8.0-rc0 release (2016-11-15 20:55:12 +0000)

are available in the git repository at:

  http://people.debian.org/~sthibault/qemu.git tags/samuel-thibault

for you to fetch changes up to 43fccf87c92a6a88a7294597b719f17fd1b41d3d:

  tftp: fake support for netascii protocol (2016-11-18 18:49:02 +0100)


slirp updates


Vincent Bernat (1):
  tftp: fake support for netascii protocol

 slirp/tftp.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)



Re: [Qemu-devel] [PATCH] xen_disk: convert discard input to byte ranges

2016-11-18 Thread Olaf Hering
On Fri, Nov 18, Eric Blake wrote:

> On 11/18/2016 04:24 AM, Olaf Hering wrote:
> > +/* Overflowing byte limit? */
> > +if ((sec_start + sec_count) > ((INT64_MAX + INT_MAX) >> 
> > BDRV_SECTOR_BITS)) {
> This is undefined.  INT64_MAX + anything non-negative overflows int64,

The expanded value used to be stored into a uint64_t before it was used
here. A "cleanup" introduced this error. Thanks for spotting.

> If you are trying to detect guests that make a request that would cover
> more than INT64_MAX bytes, you can simplify.  Besides, for as much
> storage as there is out there, I seriously doubt ANYONE will ever have
> 2^63 bytes addressable through a single device.  Why not just write it as:
> 
> if ((INT64_MAX >> BDRV_SECTOR_BITS) - sec_count < sec_start) {

That would always be false I think. I will resubmit with this:
if ((sec_start + sec_count) > (INT64_MAX >> BDRV_SECTOR_BITS)) {

Regarding the cast for ->req, it has type blkif_request_t, but the
pointer needs to be assigned to type blkif_request_discard_t.


Olaf


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH v3] ivshmem: Fix 64 bit memory bar configuration

2016-11-18 Thread Paolo Bonzini


On 18/11/2016 16:27, Michael S. Tsirkin wrote:
> On Thu, Nov 17, 2016 at 08:31:03PM +0800, Zhuangyanying wrote:
>> From: Zhuang Yanying 
>>
>> Device ivshmem property use64=0 is designed to make the device
>> expose a 32 bit shared memory BAR instead of 64 bit one.  The
>> default is a 64 bit BAR, except pc-1.2 and older retain a 32 bit
>> BAR.  A 32 bit BAR can support only up to 1 GiB of shared memory.
>>
>> This worked as designed until commit 5400c02 accidentally flipped
>> its sense: since then, we misinterpret use64=0 as use64=1 and vice
>> versa.  Worse, the default got flipped as well.  Devices
>> ivshmem-plain and ivshmem-doorbell are not affected.
>>
>> Fix by restoring the test of IVShmemState member not_legacy_32bit
>> that got messed up in commit 5400c02.  Also update its
>> initialization for devices ivshmem-plain and ivshmem-doorbell.
>> Without that, they'd regress to 32 bit BARs.
>>
>> Cc: qemu-sta...@nongnu.org
>> Signed-off-by: Zhuang Yanying 
>> Reviewed-by: Gonglei 
>> Reviewed-by: Marc-André Lureau 
> 
> This is UTF-8 encoded, but your mail header says
> Content-Transfer-Encoding: 8bit
> so git am fails to apply this.

The problematic header is more likely

Content-Type: text/plain; charset=iso-8859-1

Paolo

> 
> 
>> ---
>>  hw/misc/ivshmem.c | 4 +++-
>>  1 file changed, 3 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
>> index 230e51b..abeaf3d 100644
>> --- a/hw/misc/ivshmem.c
>> +++ b/hw/misc/ivshmem.c
>> @@ -858,7 +858,7 @@ static void ivshmem_common_realize(PCIDevice *dev, Error 
>> **errp)
>>  pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
>>   &s->ivshmem_mmio);
>>  
>> -if (!s->not_legacy_32bit) {
>> +if (s->not_legacy_32bit) {
>>  attr |= PCI_BASE_ADDRESS_MEM_TYPE_64;
>>  }
>>  
>> @@ -1045,6 +1045,7 @@ static void ivshmem_plain_init(Object *obj)
>>   ivshmem_check_memdev_is_busy,
>>   OBJ_PROP_LINK_UNREF_ON_RELEASE,
>>   &error_abort);
>> +s->not_legacy_32bit = 1;
>>  }
>>  
>>  static void ivshmem_plain_realize(PCIDevice *dev, Error **errp)
>> @@ -1116,6 +1117,7 @@ static void ivshmem_doorbell_init(Object *obj)
>>  
>>  s->features |= (1 << IVSHMEM_MSI);
>>  s->legacy_size = SIZE_MAX;  /* whatever the server sends */
>> +s->not_legacy_32bit = 1;
>>  }
>>  
>>  static void ivshmem_doorbell_realize(PCIDevice *dev, Error **errp)
>> -- 
>> 1.8.3.1
>>



Re: [Qemu-devel] [QEMU PATCH v13 0/4] migration: migrate QTAILQ

2016-11-18 Thread Jianjun Duan
ping

On 11/16/2016 04:07 PM, Jianjun Duan wrote:
> Hi all,
> 
> I addressed some review comments. Comments are welcome. 
> 
> v13: - Changed some QTAILQ related macro names to match existing ones. 
> 
> Previous versions are:
> 
> v12: - Fixed type for put_qtailq which caused build break.
> (link: http://lists.gnu.org/archive/html/qemu-devel/2016-11/msg01328.html)
> 
> v11: - Split error_report statements into a separate patch.
>  - Changed the signature of put. It now returns int type.
>  - Minor changes to QTAILQ macros. 
>  
> v10: - Fixed a typo.
> (link: http://lists.nongnu.org/archive/html/qemu-ppc/2016-10/msg01206.html)
> 
> v9: - No more hard encoding of QTAILQ layout information
> (link: http://lists.nongnu.org/archive/html/qemu-ppc/2016-10/msg01042.html)
> 
> v8: - Fixed a style issue. 
> (link: http://lists.nongnu.org/archive/html/qemu-ppc/2016-10/msg00874.html)
> 
> v7: - Fixed merge errors.
> - Simplified macro definitions related to pointer arithmetic based QTAILQ 
> access.
> - Added test case for QTAILQ migration in tests/test-vmstate.c.
> (link: http://lists.nongnu.org/archive/html/qemu-ppc/2016-10/msg00711.html)
> 
> 
> v6: - Split from Power specific patches. 
> - Dropped VMS_LINKED flag.
> - Rebased to master.
> - Added comments to clarify about put/get in VMStateInfo.  
> (link: http://lists.nongnu.org/archive/html/qemu-ppc/2016-10/msg00336.html)
> 
> v5: - Rebased to David's ppc-for-2.8. 
> (link: https://lists.nongnu.org/archive/html/qemu-devel/2016-10/msg00270.html)
> 
> v4: - Introduce a way to set customized instance_id in SaveStateEntry. Use it
>   to set instance_id for DRC using its unique index to address David 
>   Gibson's concern.
> - Rename VMS_CSTM to VMS_LINKED based on Paolo Bonzini's suggestions.
> - Clean up qjson stuff in put_qtailq. 
> - Add trace for put_qtailq and get_qtailq based on David Gilbert's 
>   suggestion.
> - Based on David's ppc-for-2.7. 
> (link: https://lists.nongnu.org/archive/html/qemu-devel/2016-06/msg07720.html)
> 
> v3: - Simplify overall design following discussion with Paolo. No longer need
>   metadata to migrate QTAILQ.
> - Extend VMStateInfo instead of adding similar fields to VMStateField.
> - Clean up macros in qemu/queue.h.
> (link: https://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg05695.html)
> 
> v2: - Introduce a general approach to migrate QTAILQ in qemu/queue.h.
> - Migrate signalled field in the DRC state.
> - Put the newly added migrating fields in subsections so that backward 
>   migration is not broken.  
> - Set detach_cb field right after migration so that a migrated hot-unplug
>   event could finish its course.
> (link: https://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg04188.html)
> 
> v1: - Initial version.
> (link: https://lists.nongnu.org/archive/html/qemu-devel/2016-04/msg02601.html)
> 
> 
> Jianjun Duan (4):
>   migration: extend VMStateInfo
>   migration: migrate QTAILQ
>   tests/migration: Add test for QTAILQ migration
>   migration: add error_report
> 
>  hw/display/virtio-gpu.c |   8 +-
>  hw/intc/s390_flic_kvm.c |   8 +-
>  hw/net/vmxnet3.c|  24 --
>  hw/nvram/eeprom93xx.c   |   8 +-
>  hw/nvram/fw_cfg.c   |   8 +-
>  hw/pci/msix.c   |   8 +-
>  hw/pci/pci.c|  16 +++-
>  hw/pci/shpc.c   |   7 +-
>  hw/scsi/scsi-bus.c  |   8 +-
>  hw/timer/twl92230.c |   8 +-
>  hw/usb/redirect.c   |  24 --
>  hw/virtio/virtio-pci.c  |   8 +-
>  hw/virtio/virtio.c  |  15 +++-
>  include/migration/vmstate.h |  39 --
>  include/qemu/queue.h|  60 +++
>  migration/savevm.c  |   7 +-
>  migration/trace-events  |   4 +
>  migration/vmstate.c | 184 
> +++-
>  target-alpha/machine.c  |   6 +-
>  target-arm/machine.c|  14 +++-
>  target-i386/machine.c   |  26 +--
>  target-mips/machine.c   |  14 +++-
>  target-ppc/machine.c|  12 ++-
>  target-sparc/machine.c  |   6 +-
>  tests/test-vmstate.c| 160 ++
>  25 files changed, 577 insertions(+), 105 deletions(-)
> 




Re: [Qemu-devel] [PATCH v14 00/22] Add Mediated device support

2016-11-18 Thread Alex Williamson
On Fri, 18 Nov 2016 17:09:59 +0100
Daniel Vetter  wrote:

> On Fri, Nov 18, 2016 at 4:40 PM, Alex Williamson
>  wrote:
> >> Alex, could you do a pull request of mdev for Daniel's drm-intel tree?
> >> We need to send KVMGT mdev support pull base on that.  
> >
> > No, this is not how I intend or prefer to merge this.  This is a large
> > change for vfio and it is not exclusive to KVMGT.  We have linux-next
> > to facilitate handling dependencies between subsystems during
> > development and a two week merge window to allow managing how these
> > changes enter the mainline tree.  If I were to have this pulled into
> > drm-intel it ties my hands as to how I can manage changes within my
> > functional area.  I want these two weeks of linux-next exposure for
> > vetting the changes and resolving any remaining issues.  I'm not going
> > to compromise my ability to react to such issues.  linux-next inclusion
> > should be sufficient for you to coordinate through the drm tree, though
> > Daniel will need to be made aware of the dependency.  I will however
> > plan to send my pull request to Linus early in the merge window to
> > accommodate dependent changes also being included for v4.10. Hope
> > you understand, thanks,  
> 
> My understanding was that the mdev changes are needed to be able to
> apply the kvmgt stuff, and otherwise it won't build. For that I need a
> stable git tag&pull request (can be specific topic branch, which means
> subsystems can land in any order, or the full subsystem tree, which
> means dependencies need to be tracked correctly). I am not going to
> resolve that in the merge window, since in drm we want everything
> lined up _before_ that opens (the feature cutoff is this w/e, but
> there's some wiggle room ofc).
> 
> Sounds like there's just not enough time to line all the things up in
> time for 4.10, and the i915/kvmgt stuff needs to be postponed to 4.11.

My only alternate suggestion is that perhaps the KVMGT code can be
sufficiently partitioned off with #ifdefs that could be removed later,
allowing mdev and KVMGT to be merged independently.  We have only just
added mdev to linux-next, my intention is that the next two weeks are
for finding and correcting issues.  There are still outstanding API
changes, specifically for KVMGT being proposed by Intel included with
that. Thanks,

Alex



Re: [Qemu-devel] [PATCH for-2.8 2/3] migration: spapr_drc: defined VMStateDescription struct

2016-11-18 Thread Michael Roth
Quoting David Gibson (2016-11-18 00:04:33)
> On Thu, Nov 17, 2016 at 07:40:26PM -0600, Michael Roth wrote:
> > From: Jianjun Duan 
> > 
> > To manage hotplug/unplug of dynamic resources such as PCI cards,
> > memory, and CPU on sPAPR guests, a firmware abstraction known as
> > a Dynamic Resource Connector (DRC) is used to assign a particular
> > dynamic resource to the guest, and provide an interface for the
> > guest to manage configuration/removal of the resource associated
> > with it.
> > 
> > To migrate the hotplugged resources in migration, the
> > associated DRC state need be migrated. To migrate the DRC state,
> > we defined the VMStateDescription struct for spapr_drc to enable
> > the transmission of spapr_drc state in migration.
> > 
> > Not all the elements in the DRC state are migrated. Only those
> > ones modifiable or needed by guest actions or device add/remove
> > operation are migrated. From the perspective of device
> > hotplugging, if we hotplug a device on the source, we need to
> > "coldplug" it on the target. The states across two hosts for the
> > same device are not the same. Ideally we want the states be same
> > after migration so that the device would function as hotplugged
> > on the target. For example we can unplug it. The minimum DRC
> > state we need to transfer should cover all the pieces changed by
> > hotplugging. Out of the elements of the DRC state, isolation_state,
> > allocation_state, and configured are involved in the DR state
> > transition diagram from PAPR+ 2.7, 13.4. configured and signalled
> > are needed in attaching and detaching devices. indicator_state
> > provides users with hardware state information. These 6 elements
> > are migrated.
> > 
> > detach_cb in the DRC state is a function pointer that cannot be
> > migrated. We set it right after DRC state is migrated so that
> > a migrated hot-unplug event could finish its work.
> > 
> > The instance_id is used to identify objects in migration. We set
> > instance_id of DRC using the unique index so that it is the same
> > across migration.
> > 
> > Signed-off-by: Jianjun Duan 
> > * add migration for awaiting_allocation state
> > Signed-off-by: Michael Roth 
> > ---
> >  hw/ppc/spapr_drc.c | 70 
> > ++
> >  hw/ppc/spapr_pci.c | 22 +++
> >  include/hw/ppc/spapr_drc.h |  9 ++
> >  3 files changed, 101 insertions(+)
> > 
> > diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
> > index a0c44ee..1ec6551 100644
> > --- a/hw/ppc/spapr_drc.c
> > +++ b/hw/ppc/spapr_drc.c
> > @@ -632,6 +632,72 @@ static void spapr_dr_connector_instance_init(Object 
> > *obj)
> >  NULL, NULL, NULL, NULL);
> >  }
> >  
> > +static bool spapr_drc_needed(void *opaque)
> > +{
> > +sPAPRDRConnector *drc = (sPAPRDRConnector *)opaque;
> > +sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> > +bool rc = false;
> > +sPAPRDREntitySense value;
> > +
> > +drck->entity_sense(drc, &value);
> > +/* If no dev is plugged in there is no need to migrate the DRC state */
> > +if (value != SPAPR_DR_ENTITY_SENSE_PRESENT) {
> > +return false;
> > +}
> > +/*
> > + * If there is dev plugged in, we need to migrate the DRC state when
> > + * it is different from cold-plugged state
> > + */
> > +switch(drc->type) {
> > +/* for PCI type */
> > +case SPAPR_DR_CONNECTOR_TYPE_PCI:
> > +rc = !((drc->isolation_state == 
> > SPAPR_DR_ISOLATION_STATE_UNISOLATED) &&
> > +   (drc->allocation_state == SPAPR_DR_ALLOCATION_STATE_USABLE) 
> > &&
> > +   drc->configured && drc->signalled && 
> > !drc->awaiting_release);
> > +break;
> > +/* for LMB type */
> > +case SPAPR_DR_CONNECTOR_TYPE_LMB:
> > +rc = !((drc->isolation_state == SPAPR_DR_ISOLATION_STATE_ISOLATED) 
> > &&
> > +   (drc->allocation_state == 
> > SPAPR_DR_ALLOCATION_STATE_UNUSABLE) &&
> > +   drc->configured && drc->signalled && 
> > !drc->awaiting_release);
> > +break;
> > +default:
> > +;
> > +}
> > +
> > +return rc;
> > +}
> > +
> > +/* detach_cb needs be set since it is not migrated */
> > +static void postmigrate_set_detach_cb(sPAPRDRConnector *drc,
> > +  spapr_drc_detach_cb *detach_cb)
> > +{
> > +drc->detach_cb = detach_cb;
> > +}
> > +
> > +/* return the unique drc index as instance_id for qom interfaces*/
> > +static int get_instance_id(DeviceState *dev)
> > +{
> > +return (int)get_index(SPAPR_DR_CONNECTOR(OBJECT(dev)));
> > +}
> > +
> > +static const VMStateDescription vmstate_spapr_drc = {
> > +.name = "spapr_drc",
> > +.version_id = 1,
> > +.minimum_version_id = 1,
> > +.needed = spapr_drc_needed,
> > +.fields  = (VMStateField []) {
> > +VMSTATE_UINT32(isolation_state, sPAPRDRConnector),
> > +VMSTATE_UINT32(allocation_state, sPAPRDRCo

Re: [Qemu-devel] [PATCH for-2.8 0/3] spapr: fix breakage of memory unplug after migration

2016-11-18 Thread Michael Roth
Quoting David Gibson (2016-11-17 23:45:05)
> On Thu, Nov 17, 2016 at 07:40:24PM -0600, Michael Roth wrote:
> > These patches are based on David's ppc-for-2.8 tree, and are also
> > available from:
> > 
> >   https://github.com/mdroth/qemu/commits/spapr-cas-migration
> > 
> > Currently, memory hotplugged to a pseries guest cannot be removed after
> > the guest has been migrated. This is due to 2 issues:
> > 
> > 1) The coldplugged state of memory on the target side is one where the
> >corresponding DRC's allocation state is:
> > 
> >  allocation_state == unallocated,
> >  awaiting_allocation == true,
> > 
> >When the guest attempts to unplug memory on the target side, it first
> >checks that allocation_state == allocated. If we fix this, the guest
> >can successfully notify QEMU of completion on its end, but then the
> >DRC code sees that awaiting_allocation == true, so it defers the
> >finalizing of the LMB and corresponding DIMM since it assumes that
> >the DIMM must have been previously allocated before it can be removed.
> > 
> >To address this, we pull in patches 1-2 from Jian Jun's DRC migration
> >series:
> > 
> >  https://lists.gnu.org/archive/html/qemu-ppc/2016-10/msg00048.html
> > 
> >with some minor changes relating to prior review comments, and
> >the addition of migrating the DRC's awaiting_allocation value, which
> >wasn't part of the original patch. This doesn't address the full scope
> >of the issues Jian Jun was looking at (involving synchronizing state
> >when migration occurs during fairly small race windows), just this
> >particular case, which is more user visible since the time window is
> >indefinite.
> > 
> > 2) The ability to unplug memory is gated on the QEMU side by a check as
> >to whether or not support for newer-style hotplug events was negotiated
> >via CAS during boot. The check is performed by checking the corresponding
> >entry in the sPAPROptionVector structure. However, since this value isn't
> >migrated currently, we are unable to unplug until after the guest 
> > reboots.
> > 
> >We address that here by adding migration support for sPAPROptionVectors,
> >and including the CAS-negotiated vector as part of the migration stream
> >for any cases where we advertise newer-style hotplug event support to
> >the guest.
> > 
> > David,
> > 
> > These fixes ended up going out much later than planned. I'm not sure
> > if you're planning another pull for 2.8 or not, and realize there are
> > some patches here not specifically pseries-related so it's
> > understandable if we opt to pursue these for 2.9/2.8.1 instead. But if
> > possible I'm hoping to get these in so that the memory unplug
> > support is fully functional for 2.8.
> 
> Yeah, I'm still expecting to push a few bugfixes in before 2.8.  So,
> I've merged these patches into ppc-for-2.8 (fixing a couple of trivial
> style nits along the way).  I have a couple of comments that I'll make
> on the patches, but they're not important enough to stop these going
> in ASAP.
> 
> Unfortunately, of course, this is not the only migration breakage we
> have at the moment.  I'm presently wrestling with both breakage due to
> changes in the insns_flags masks, and due to the reworking of the mmio
> windows for the PHB.

Ok, thanks for the heads up. FYI I'm still hoping to get the insns_flags
fix in for 2.7.1 (which is a bit behind at this point, should have schedule
and initial tree posted next week though), so I will keep an eye out for
those.

> 
> > 
> > 
> >  hw/core/qdev.c  |  6 +-
> >  hw/ppc/spapr.c  | 68 
> > 
> >  hw/ppc/spapr_drc.c  | 70 
> > ++
> >  hw/ppc/spapr_ovec.c | 12 
> >  hw/ppc/spapr_pci.c  | 22 ++
> >  include/hw/ppc/spapr_drc.h  |  9 +
> >  include/hw/ppc/spapr_ovec.h |  4 
> >  include/hw/qdev-core.h  |  9 +
> >  migration/savevm.c  |  4 ++--
> >  9 files changed, 201 insertions(+), 3 deletions(-)
> > 
> 
> -- 
> David Gibson| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
> | _way_ _around_!
> http://www.ozlabs.org/~dgibson




Re: [Qemu-devel] [PATCH for-2.8 3/3] spapr: migration support for CAS-negotiated option vectors

2016-11-18 Thread Michael Roth
Quoting Michael Roth (2016-11-17 19:40:27)
> With the addition of the OV5_HP_EVT option vector, we now have
> certain functionality (namely, memory unplug) that checks at run-time
> for whether or not the guest negotiated the option via CAS. Because
> we don't currently migrate these negotiated values, we are unable
> to unplug memory from a guest after it's been migrated until after
> the guest is rebooted and CAS-negotiation is repeated.
> 
> This patch fixes this by adding CAS-negotiated options to the
> migration stream. We do this using a subsection, since the
> negotiated value of OV5_HP_EVT is the only option currently needed
> to maintain proper functionality for a running guest.
> 
> Signed-off-by: Michael Roth 
> ---
>  hw/ppc/spapr.c  | 68 
> +
>  hw/ppc/spapr_ovec.c | 12 
>  include/hw/ppc/spapr_ovec.h |  4 +++
>  3 files changed, 84 insertions(+)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 0cbab24..9e08aed 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1267,6 +1267,70 @@ static bool version_before_3(void *opaque, int 
> version_id)
>  return version_id < 3;
>  }
> 
> +static bool spapr_ov5_cas_needed(void *opaque)
> +{
> +sPAPRMachineState *spapr = opaque;
> +sPAPROptionVector *ov5_mask = spapr_ovec_new();
> +sPAPROptionVector *ov5_legacy = spapr_ovec_new();
> +sPAPROptionVector *ov5_removed = spapr_ovec_new();
> +bool cas_needed;
> +
> +/* Prior to the introduction of sPAPROptionVector, we had two option
> + * vectors we dealt with: OV5_FORM1_AFFINITY, and OV5_DRCONF_MEMORY.
> + * Both of these options encode machine topology into the device-tree
> + * in such a way that the now-booted OS should still be able to interact
> + * appropriately with QEMU regardless of what options were actually
> + * negotiatied on the source side.
> + *
> + * As such, we can avoid migrating the CAS-negotiated options if these
> + * are the only options available on the current machine/platform.
> + * Since these are the only options available for pseries-2.7 and
> + * earlier, this allows us to maintain old->new/new->old migration
> + * compatibility.
> + *
> + * For QEMU 2.8+, there are additional CAS-negotiatable options available
> + * via default pseries-2.8 machines and explicit command-line parameters.
> + * Some of these options, like OV5_HP_EVT, *do* require QEMU to be aware
> + * of the actual CAS-negotiated values to continue working properly. For
> + * example, availability of memory unplug depends on knowing whether
> + * OV5_HP_EVT was negotiated via CAS.
> + *
> + * Thus, for any cases where the set of available CAS-negotiatable
> + * options extends beyond OV5_FORM1_AFFINITY and OV5_DRCONF_MEMORY, we
> + * include the CAS-negotiated options in the migration stream.
> + */
> +spapr_ovec_set(ov5_mask, OV5_FORM1_AFFINITY);
> +spapr_ovec_set(ov5_mask, OV5_DRCONF_MEMORY);
> +
> +/* spapr_ovec_diff returns true if bits were removed. we avoid using
> + * the mask itself since in the future it's possible "legacy" bits may be
> + * removed via machine options, which could generate a false positive
> + * that breaks migration.
> + */
> +spapr_ovec_intersect(ov5_legacy, spapr->ov5, ov5_mask);
> +cas_needed = spapr_ovec_diff(ov5_removed, spapr->ov5, ov5_legacy);
> +
> +spapr_ovec_cleanup(ov5_mask);
> +spapr_ovec_cleanup(ov5_legacy);
> +spapr_ovec_cleanup(ov5_removed);
> +
> +error_report("MIGRATION NEEDED: %d", cas_needed);

Argh, sorry, I just noticed this stray debug comment that slipped in.
Would you prefer a v2, or just removing it in-tree?

> +
> +return cas_needed;
> +}
> +
> +static const VMStateDescription vmstate_spapr_ov5_cas = {
> +.name = "spapr_option_vector_ov5_cas",
> +.version_id = 1,
> +.minimum_version_id = 1,
> +.needed = spapr_ov5_cas_needed,
> +.fields = (VMStateField[]) {
> +VMSTATE_STRUCT_POINTER_V(ov5_cas, sPAPRMachineState, 1,
> + vmstate_spapr_ovec, sPAPROptionVector),
> +VMSTATE_END_OF_LIST()
> +},
> +};
> +
>  static const VMStateDescription vmstate_spapr = {
>  .name = "spapr",
>  .version_id = 3,
> @@ -1282,6 +1346,10 @@ static const VMStateDescription vmstate_spapr = {
>  VMSTATE_PPC_TIMEBASE_V(tb, sPAPRMachineState, 2),
>  VMSTATE_END_OF_LIST()
>  },
> +.subsections = (const VMStateDescription*[]) {
> +&vmstate_spapr_ov5_cas,
> +NULL
> +}
>  };
> 
>  static int htab_save_setup(QEMUFile *f, void *opaque)
> diff --git a/hw/ppc/spapr_ovec.c b/hw/ppc/spapr_ovec.c
> index c2a0d18..3eb1d59 100644
> --- a/hw/ppc/spapr_ovec.c
> +++ b/hw/ppc/spapr_ovec.c
> @@ -37,6 +37,17 @@
>   */
>  struct sPAPROptionVector {
>  unsigned long *bitmap;
> +int32_t bitmap_size; /* only used for migration

[Qemu-devel] [PATCH 4/5] sheepdog: simplify inflight_aio_head management

2016-11-18 Thread Paolo Bonzini
Add to the list in add_aio_request and, indirectly, resend_aioreq.  Inline
free_aio_req in the caller, it does not simply undo alloc_aio_req's job.

Cc: jc...@redhat.com
Cc: qemu-bl...@nongnu.org
Cc: Hitoshi Mitake 
Cc: Liu Yuan 
Signed-off-by: Paolo Bonzini 
---
 block/sheepdog.c | 23 ++-
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index c09de31..e5ac733 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -479,16 +479,6 @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, 
SheepdogAIOCB *acb,
 return aio_req;
 }
 
-static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
-{
-SheepdogAIOCB *acb = aio_req->aiocb;
-
-QLIST_REMOVE(aio_req, aio_siblings);
-g_free(aio_req);
-
-acb->nr_pending--;
-}
-
 static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
  QEMUIOVector *qiov, int64_t sector_num, int 
nb_sectors,
  int type)
@@ -730,7 +720,6 @@ static coroutine_fn void reconnect_to_sdog(void *opaque)
 while (!QLIST_EMPTY(&s->failed_aio_head)) {
 aio_req = QLIST_FIRST(&s->failed_aio_head);
 QLIST_REMOVE(aio_req, aio_siblings);
-QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
 resend_aioreq(s, aio_req);
 }
 }
@@ -825,6 +814,7 @@ static void coroutine_fn aio_read_response(void *opaque)
 */
 s->co_recv = NULL;
 
+QLIST_REMOVE(aio_req, aio_siblings);
 switch (rsp.result) {
 case SD_RES_SUCCESS:
 break;
@@ -849,8 +839,9 @@ static void coroutine_fn aio_read_response(void *opaque)
 break;
 }
 
-free_aio_req(s, aio_req);
-if (!acb->nr_pending) {
+g_free(aio_req);
+
+if (!--acb->nr_pending) {
 /*
  * We've finished all requests which belong to the AIOCB, so
  * we can switch back to sd_co_readv/writev now.
@@ -1108,6 +1099,8 @@ static void coroutine_fn 
add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
 uint64_t old_oid = aio_req->base_oid;
 bool create = aio_req->create;
 
+QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
+
 if (!nr_copies) {
 error_report("bug");
 }
@@ -1981,7 +1974,6 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
 iov.iov_len = sizeof(s->inode);
 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
 data_len, offset, 0, false, 0, offset);
-QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
 if (--acb->nr_pending) {
 qemu_coroutine_yield();
@@ -2183,8 +2175,6 @@ static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB 
*acb)
 old_oid,
 acb->aiocb_type == AIOCB_DISCARD_OBJ ?
 0 : done);
-QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
-
 add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
 acb->aiocb_type);
 done:
@@ -2278,7 +2268,6 @@ static int coroutine_fn 
sd_co_flush_to_disk(BlockDriverState *bs)
 acb.nr_pending++;
 aio_req = alloc_aio_req(s, &acb, vid_to_vdi_oid(s->inode.vdi_id),
 0, 0, 0, false, 0, 0);
-QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
 add_aio_request(s, aio_req, NULL, 0, acb.aiocb_type);
 
 if (--acb.nr_pending) {
-- 
2.9.3





[Qemu-devel] [PATCH 5/5] sheepdog: reorganize check for overlapping requests

2016-11-18 Thread Paolo Bonzini
Wrap the code that was copied repeatedly in the two functions,
sd_aio_setup and sd_aio_complete.

Cc: jc...@redhat.com
Cc: qemu-bl...@nongnu.org
Cc: Hitoshi Mitake 
Cc: Liu Yuan 
Signed-off-by: Paolo Bonzini 
---
 block/sheepdog.c | 66 ++--
 1 file changed, 30 insertions(+), 36 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index e5ac733..07271bc 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -479,6 +479,19 @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, 
SheepdogAIOCB *acb,
 return aio_req;
 }
 
+static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB 
*acb)
+{
+SheepdogAIOCB *cb;
+
+retry:
+QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
+if (AIOCBOverlapping(acb, cb)) {
+qemu_co_queue_wait(&s->overlapping_queue);
+goto retry;
+}
+}
+}
+
 static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
  QEMUIOVector *qiov, int64_t sector_num, int 
nb_sectors,
  int type)
@@ -505,6 +518,13 @@ static void sd_aio_setup(SheepdogAIOCB *acb, 
BDRVSheepdogState *s,
 acb->min_dirty_data_idx = UINT32_MAX;
 acb->max_dirty_data_idx = 0;
 acb->aiocb_type = type;
+
+if (type == AIOCB_FLUSH_CACHE) {
+return;
+}
+
+wait_for_overlapping_aiocb(s, acb);
+QLIST_INSERT_HEAD(&s->inflight_aiocb_head, acb, aiocb_siblings);
 }
 
 /* Return -EIO in case of error, file descriptor on success */
@@ -2187,18 +2207,14 @@ static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB 
*acb)
 }
 }
 
-static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
+static void sd_aio_complete(SheepdogAIOCB *acb)
 {
-SheepdogAIOCB *cb;
-
-QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
-if (AIOCBOverlapping(aiocb, cb)) {
-return true;
-}
+if (acb->aiocb_type == AIOCB_FLUSH_CACHE) {
+return;
 }
 
-QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings);
-return false;
+QLIST_REMOVE(acb, aiocb_siblings);
+qemu_co_queue_restart_all(&acb->s->overlapping_queue);
 }
 
 static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
@@ -2217,18 +2233,10 @@ static coroutine_fn int sd_co_writev(BlockDriverState 
*bs, int64_t sector_num,
 }
 
 sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
-
-retry:
-if (check_overlapping_aiocb(s, &acb)) {
-qemu_co_queue_wait(&s->overlapping_queue);
-goto retry;
-}
-
 sd_co_rw_vector(&acb);
 sd_write_done(&acb);
+sd_aio_complete(&acb);
 
-QLIST_REMOVE(&acb, aiocb_siblings);
-qemu_co_queue_restart_all(&s->overlapping_queue);
 return acb.ret;
 }
 
@@ -2239,17 +2247,9 @@ static coroutine_fn int sd_co_readv(BlockDriverState 
*bs, int64_t sector_num,
 BDRVSheepdogState *s = bs->opaque;
 
 sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_READ_UDATA);
-
-retry:
-if (check_overlapping_aiocb(s, &acb)) {
-qemu_co_queue_wait(&s->overlapping_queue);
-goto retry;
-}
-
 sd_co_rw_vector(&acb);
+sd_aio_complete(&acb);
 
-QLIST_REMOVE(&acb, aiocb_siblings);
-qemu_co_queue_restart_all(&s->overlapping_queue);
 return acb.ret;
 }
 
@@ -2273,6 +2273,8 @@ static int coroutine_fn 
sd_co_flush_to_disk(BlockDriverState *bs)
 if (--acb.nr_pending) {
 qemu_coroutine_yield();
 }
+
+sd_aio_complete(&acb);
 return acb.ret;
 }
 
@@ -2727,17 +2729,9 @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState 
*bs, int64_t offset,
 assert((count & (BDRV_SECTOR_SIZE - 1)) == 0);
 sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS,
  count >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ);
-
-retry:
-if (check_overlapping_aiocb(s, &acb)) {
-qemu_co_queue_wait(&s->overlapping_queue);
-goto retry;
-}
-
 sd_co_rw_vector(&acb);
+sd_aio_complete(&acb);
 
-QLIST_REMOVE(&acb, aiocb_siblings);
-qemu_co_queue_restart_all(&s->overlapping_queue);
 return acb.ret;
 }
 
-- 
2.9.3




[Qemu-devel] [PATCH 2/5] sheepdog: reorganize coroutine flow

2016-11-18 Thread Paolo Bonzini
Delimit co_recv's lifetime clearly in aio_read_response.

Do a simple qemu_coroutine_enter in aio_read_response, letting
sd_co_writev call sd_write_done.

Handle nr_pending in the same way in sd_co_rw_vector,
sd_write_done and sd_co_flush_to_disk.

Remove sd_co_rw_vector's return value; just leave with no
pending requests.

Cc: jc...@redhat.com
Cc: qemu-bl...@nongnu.org
Cc: Hitoshi Mitake 
Cc: Liu Yuan 
Signed-off-by: Paolo Bonzini 
---
 block/sheepdog.c | 115 ---
 1 file changed, 41 insertions(+), 74 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index d2b14fd..f849941 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -345,8 +345,6 @@ struct SheepdogAIOCB {
 enum AIOCBState aiocb_type;
 
 Coroutine *coroutine;
-void (*aio_done_func)(SheepdogAIOCB *);
-
 int nr_pending;
 
 uint32_t min_affect_data_idx;
@@ -449,14 +447,13 @@ static const char * sd_strerror(int err)
  *
  * 1. In sd_co_rw_vector, we send the I/O requests to the server and
  *link the requests to the inflight_list in the
- *BDRVSheepdogState.  The function exits without waiting for
+ *BDRVSheepdogState.  The function yields while waiting for
  *receiving the response.
  *
  * 2. We receive the response in aio_read_response, the fd handler to
- *the sheepdog connection.  If metadata update is needed, we send
- *the write request to the vdi object in sd_write_done, the write
- *completion function.  We switch back to sd_co_readv/writev after
- *all the requests belonging to the AIOCB are finished.
+ *the sheepdog connection.  We switch back to sd_co_readv/sd_co_writev
+ *after all the requests belonging to the AIOCB are finished.  If
+ *needed, sd_co_writev will send another request for the vdi object.
  */
 
 static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
@@ -491,12 +488,6 @@ static inline void free_aio_req(BDRVSheepdogState *s, 
AIOReq *aio_req)
 acb->nr_pending--;
 }
 
-static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
-{
-qemu_coroutine_enter(acb->coroutine);
-qemu_aio_unref(acb);
-}
-
 static const AIOCBInfo sd_aiocb_info = {
 .aiocb_size = sizeof(SheepdogAIOCB),
 };
@@ -517,7 +508,6 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, 
QEMUIOVector *qiov,
 acb->sector_num = sector_num;
 acb->nb_sectors = nb_sectors;
 
-acb->aio_done_func = NULL;
 acb->coroutine = qemu_coroutine_self();
 acb->ret = 0;
 acb->nr_pending = 0;
@@ -788,9 +778,6 @@ static void coroutine_fn aio_read_response(void *opaque)
 
 switch (acb->aiocb_type) {
 case AIOCB_WRITE_UDATA:
-/* this coroutine context is no longer suitable for co_recv
- * because we may send data to update vdi objects */
-s->co_recv = NULL;
 if (!is_data_obj(aio_req->oid)) {
 break;
 }
@@ -838,6 +825,11 @@ static void coroutine_fn aio_read_response(void *opaque)
 }
 }
 
+/* No more data for this aio_req (reload_inode below uses its own file
+ * descriptor handler which doesn't use co_recv).
+*/
+s->co_recv = NULL;
+
 switch (rsp.result) {
 case SD_RES_SUCCESS:
 break;
@@ -855,7 +847,7 @@ static void coroutine_fn aio_read_response(void *opaque)
 aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id);
 }
 resend_aioreq(s, aio_req);
-goto out;
+return;
 default:
 acb->ret = -EIO;
 error_report("%s", sd_strerror(rsp.result));
@@ -868,13 +860,10 @@ static void coroutine_fn aio_read_response(void *opaque)
  * We've finished all requests which belong to the AIOCB, so
  * we can switch back to sd_co_readv/writev now.
  */
-acb->aio_done_func(acb);
+qemu_coroutine_enter(acb->coroutine);
 }
-out:
-s->co_recv = NULL;
-return;
+
 err:
-s->co_recv = NULL;
 reconnect_to_sdog(opaque);
 }
 
@@ -1973,7 +1962,6 @@ static int sd_truncate(BlockDriverState *bs, int64_t 
offset)
 /*
  * This function is called after writing data objects.  If we need to
  * update metadata, this sends a write request to the vdi object.
- * Otherwise, this switches back to sd_co_readv/writev.
  */
 static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
 {
@@ -1986,6 +1974,7 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
 mx = acb->max_dirty_data_idx;
 if (mn <= mx) {
 /* we need to update the vdi object. */
+++acb->nr_pending;
 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
 mn * sizeof(s->inode.data_vdi_id[0]);
 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
@@ -1999,13 +1988,10 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB 
*acb)
 data_len, offset, 0, false, 0, offset);
 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
 add_ai

[Qemu-devel] [PATCH 3/5] sheepdog: do not use BlockAIOCB

2016-11-18 Thread Paolo Bonzini
Sheepdog's AIOCBs are completely internal entities for a group of
requests and do not need dynamic allocation.

Cc: jc...@redhat.com
Cc: qemu-bl...@nongnu.org
Cc: Hitoshi Mitake 
Cc: Liu Yuan 
Signed-off-by: Paolo Bonzini 
---
 block/sheepdog.c | 99 ++--
 1 file changed, 39 insertions(+), 60 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index f849941..c09de31 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -306,6 +306,7 @@ static inline size_t count_data_objs(const struct 
SheepdogInode *inode)
 } while (0)
 
 typedef struct SheepdogAIOCB SheepdogAIOCB;
+typedef struct BDRVSheepdogState BDRVSheepdogState;
 
 typedef struct AIOReq {
 SheepdogAIOCB *aiocb;
@@ -334,7 +335,7 @@ enum AIOCBState {
|| y->max_affect_data_idx < x->min_affect_data_idx))
 
 struct SheepdogAIOCB {
-BlockAIOCB common;
+BDRVSheepdogState *s;
 
 QEMUIOVector *qiov;
 
@@ -362,7 +363,7 @@ struct SheepdogAIOCB {
 QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
 };
 
-typedef struct BDRVSheepdogState {
+struct BDRVSheepdogState {
 BlockDriverState *bs;
 AioContext *aio_context;
 
@@ -389,7 +390,7 @@ typedef struct BDRVSheepdogState {
 
 CoQueue overlapping_queue;
 QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
-} BDRVSheepdogState;
+};
 
 typedef struct BDRVSheepdogReopenState {
 int fd;
@@ -488,20 +489,15 @@ static inline void free_aio_req(BDRVSheepdogState *s, 
AIOReq *aio_req)
 acb->nr_pending--;
 }
 
-static const AIOCBInfo sd_aiocb_info = {
-.aiocb_size = sizeof(SheepdogAIOCB),
-};
-
-static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
-   int64_t sector_num, int nb_sectors)
+static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
+ QEMUIOVector *qiov, int64_t sector_num, int 
nb_sectors,
+ int type)
 {
-SheepdogAIOCB *acb;
 uint32_t object_size;
-BDRVSheepdogState *s = bs->opaque;
 
 object_size = (UINT32_C(1) << s->inode.block_size_shift);
 
-acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);
+acb->s = s;
 
 acb->qiov = qiov;
 
@@ -518,8 +514,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, 
QEMUIOVector *qiov,
 
 acb->min_dirty_data_idx = UINT32_MAX;
 acb->max_dirty_data_idx = 0;
-
-return acb;
+acb->aiocb_type = type;
 }
 
 /* Return -EIO in case of error, file descriptor on success */
@@ -1965,7 +1960,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t 
offset)
  */
 static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
 {
-BDRVSheepdogState *s = acb->common.bs->opaque;
+BDRVSheepdogState *s = acb->s;
 struct iovec iov;
 AIOReq *aio_req;
 uint32_t offset, data_len, mn, mx;
@@ -2103,16 +2098,15 @@ out:
  * Returns 1 when we need to wait a response, 0 when there is no sent
  * request and -errno in error cases.
  */
-static void coroutine_fn sd_co_rw_vector(void *p)
+static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
 {
-SheepdogAIOCB *acb = p;
 int ret = 0;
 unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
 unsigned long idx;
 uint32_t object_size;
 uint64_t oid;
 uint64_t offset;
-BDRVSheepdogState *s = acb->common.bs->opaque;
+BDRVSheepdogState *s = acb->s;
 SheepdogInode *inode = &s->inode;
 AIOReq *aio_req;
 
@@ -2220,7 +2214,7 @@ static bool check_overlapping_aiocb(BDRVSheepdogState *s, 
SheepdogAIOCB *aiocb)
 static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
 int nb_sectors, QEMUIOVector *qiov)
 {
-SheepdogAIOCB *acb;
+SheepdogAIOCB acb;
 int ret;
 int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
 BDRVSheepdogState *s = bs->opaque;
@@ -2232,76 +2226,65 @@ static coroutine_fn int sd_co_writev(BlockDriverState 
*bs, int64_t sector_num,
 }
 }
 
-acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
-acb->aiocb_type = AIOCB_WRITE_UDATA;
+sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
 
 retry:
-if (check_overlapping_aiocb(s, acb)) {
+if (check_overlapping_aiocb(s, &acb)) {
 qemu_co_queue_wait(&s->overlapping_queue);
 goto retry;
 }
 
-sd_co_rw_vector(acb);
-sd_write_done(acb);
+sd_co_rw_vector(&acb);
+sd_write_done(&acb);
 
-QLIST_REMOVE(acb, aiocb_siblings);
+QLIST_REMOVE(&acb, aiocb_siblings);
 qemu_co_queue_restart_all(&s->overlapping_queue);
-ret = acb->ret;
-qemu_aio_unref(acb);
-return ret;
+return acb.ret;
 }
 
 static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, QEMUIOVector *qiov)
 {
-SheepdogAIOCB *acb;
-int ret;
+SheepdogAIOCB acb;
 BDRVSheepdogState *s = bs->opaque;
 
-acb = sd_aio_setup(bs, qiov, se

[Qemu-devel] [PATCH 1/5] sheepdog: remove unused cancellation support

2016-11-18 Thread Paolo Bonzini
SheepdogAIOCB is internal to sheepdog.c, hence it is never canceled.

Cc: jc...@redhat.com
Cc: qemu-bl...@nongnu.org
Cc: Hitoshi Mitake 
Cc: Liu Yuan 
Signed-off-by: Paolo Bonzini 
---
 block/sheepdog.c | 52 
 1 file changed, 52 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 1fb9173..d2b14fd 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -347,7 +347,6 @@ struct SheepdogAIOCB {
 Coroutine *coroutine;
 void (*aio_done_func)(SheepdogAIOCB *);
 
-bool cancelable;
 int nr_pending;
 
 uint32_t min_affect_data_idx;
@@ -486,7 +485,6 @@ static inline void free_aio_req(BDRVSheepdogState *s, 
AIOReq *aio_req)
 {
 SheepdogAIOCB *acb = aio_req->aiocb;
 
-acb->cancelable = false;
 QLIST_REMOVE(aio_req, aio_siblings);
 g_free(aio_req);
 
@@ -499,57 +497,8 @@ static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB 
*acb)
 qemu_aio_unref(acb);
 }
 
-/*
- * Check whether the specified acb can be canceled
- *
- * We can cancel aio when any request belonging to the acb is:
- *  - Not processed by the sheepdog server.
- *  - Not linked to the inflight queue.
- */
-static bool sd_acb_cancelable(const SheepdogAIOCB *acb)
-{
-BDRVSheepdogState *s = acb->common.bs->opaque;
-AIOReq *aioreq;
-
-if (!acb->cancelable) {
-return false;
-}
-
-QLIST_FOREACH(aioreq, &s->inflight_aio_head, aio_siblings) {
-if (aioreq->aiocb == acb) {
-return false;
-}
-}
-
-return true;
-}
-
-static void sd_aio_cancel(BlockAIOCB *blockacb)
-{
-SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
-BDRVSheepdogState *s = acb->common.bs->opaque;
-AIOReq *aioreq, *next;
-
-if (sd_acb_cancelable(acb)) {
-/* Remove outstanding requests from failed queue.  */
-QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
-   next) {
-if (aioreq->aiocb == acb) {
-free_aio_req(s, aioreq);
-}
-}
-
-assert(acb->nr_pending == 0);
-if (acb->common.cb) {
-acb->common.cb(acb->common.opaque, -ECANCELED);
-}
-sd_finish_aiocb(acb);
-}
-}
-
 static const AIOCBInfo sd_aiocb_info = {
 .aiocb_size = sizeof(SheepdogAIOCB),
-.cancel_async   = sd_aio_cancel,
 };
 
 static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
@@ -569,7 +518,6 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, 
QEMUIOVector *qiov,
 acb->nb_sectors = nb_sectors;
 
 acb->aio_done_func = NULL;
-acb->cancelable = true;
 acb->coroutine = qemu_coroutine_self();
 acb->ret = 0;
 acb->nr_pending = 0;
-- 
2.9.3





[Qemu-devel] [PATCH for-2.9 v2 0/5] Sheepdog cleanups

2016-11-18 Thread Paolo Bonzini


Cleaning up the code and removing duplication makes it simpler to
later adapt it for the multiqueue work.

Tested against sheepdog 1.0.  I also tested taking snapshots and reverting
to older snapshots, but the latter only worked with "dog vdi rollback".
Neither loadvm nor qemu-img worked for me.

Paolo

v1->v2: placate patchew

Paolo Bonzini (5):
  sheepdog: remove unused cancellation support
  sheepdog: reorganize coroutine flow
  sheepdog: do not use BlockAIOCB
  sheepdog: simplify inflight_aio_head management
  sheepdog: reorganize check for overlapping requests

 block/sheepdog.c | 289 ---
 1 file changed, 83 insertions(+), 206 deletions(-)

-- 
2.9.3




Re: [Qemu-devel] [PATCH] xen_disk: convert discard input to byte ranges

2016-11-18 Thread Eric Blake
On 11/18/2016 04:24 AM, Olaf Hering wrote:
> The guest sends discard requests as u64 sector/count pairs, but the
> block layer operates internally with s64/s32 pairs. The conversion
> leads to IO errors in the guest, the discard request is not processed.
> 
>   domU.cfg:
>   'vdev=xvda, format=qcow2, backendtype=qdisk, target=/x.qcow2'
>   domU:
>   mkfs.ext4 -F /dev/xvda
>   Discarding device blocks: failed - Input/output error
> 
> Fix this by splitting the request into chunks of BDRV_REQUEST_MAX_SECTORS.
> Add input range checking to avoid overflow.
> 
> Signed-off-by: Olaf Hering 
> ---
>  hw/block/xen_disk.c | 45 +++--
>  1 file changed, 39 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
> index 3a7dc19..c3f572f 100644
> --- a/hw/block/xen_disk.c
> +++ b/hw/block/xen_disk.c
> @@ -660,6 +660,41 @@ static void qemu_aio_complete(void *opaque, int ret)
>  qemu_bh_schedule(ioreq->blkdev->bh);
>  }
>  
> +static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t 
> sector_number,
> +  uint64_t nr_sectors)
> +{
> +struct XenBlkDev *blkdev = ioreq->blkdev;
> +int64_t byte_offset;
> +int byte_chunk;
> +uint64_t sec_start = sector_number;
> +uint64_t sec_count = nr_sectors;
> +uint64_t byte_remaining;
> +uint64_t limit = BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS;

[For reference, this limit is the same as rounding INT32_MAX down to the
nearest multiple of 512 bytes, or 0x7ffffe00]

> +
> +/* Wrap around? */
> +if ((sec_start + sec_count) < sec_count) {
> +return false;
> +}
> +/* Overflowing byte limit? */
> +if ((sec_start + sec_count) > ((INT64_MAX + INT_MAX) >> 
> BDRV_SECTOR_BITS)) {

This is undefined.  INT64_MAX + anything positive overflows int64,
and even if you treat overflow as defined by twos-complement
representation (which creates a negative number), shifting a negative
number is also undefined.

If you are trying to detect guests that make a request that would cover
more than INT64_MAX bytes, you can simplify.  Besides, for as much
storage as there is out there, I seriously doubt ANYONE will ever have
2^63 bytes addressable through a single device.  Why not just write it as:

if ((INT64_MAX >> BDRV_SECTOR_BITS) - sec_count < sec_start) {

> +return false;
> +}
> +
> +byte_offset = sec_start << BDRV_SECTOR_BITS;
> +byte_remaining = sec_count << BDRV_SECTOR_BITS;
> +
> +do {
> +byte_chunk = byte_remaining > limit ? limit : byte_remaining;
> +ioreq->aio_inflight++;
> +blk_aio_pdiscard(blkdev->blk, byte_offset, byte_chunk,
> + qemu_aio_complete, ioreq);
> +byte_remaining -= byte_chunk;
> +byte_offset += byte_chunk;
> +} while (byte_remaining > 0);

This part looks reasonable.

> +
> +return true;
> +}
> +
>  static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
>  {
>  struct XenBlkDev *blkdev = ioreq->blkdev;
> @@ -708,12 +743,10 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
>  break;
>  case BLKIF_OP_DISCARD:
>  {
> -struct blkif_request_discard *discard_req = (void *)&ioreq->req;

The old code had it...

> -ioreq->aio_inflight++;
> -blk_aio_pdiscard(blkdev->blk,
> - discard_req->sector_number << BDRV_SECTOR_BITS,
> - discard_req->nr_sectors << BDRV_SECTOR_BITS,
> - qemu_aio_complete, ioreq);
> +struct blkif_request_discard *req = (void *)&ioreq->req;

...but C doesn't require a cast to void*. As long as you are touching
this, you could remove the cast (unless I'm missing something, and the
cast is also there to cast away const).

> +if (!blk_split_discard(ioreq, req->sector_number, req->nr_sectors)) {
> +goto err;
> +}
>  break;
>  }
>  default:
> 
> 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH for-2.9 0/5] Sheepdog cleanups

2016-11-18 Thread no-reply
Hi,

Your series failed automatic build test. Please find the testing commands and
their output below. If you have docker installed, you can probably reproduce it
locally.

Subject: [Qemu-devel] [PATCH for-2.9 0/5] Sheepdog cleanups
Type: series
Message-id: 20161118155500.11050-1-pbonz...@redhat.com

=== TEST SCRIPT BEGIN ===
#!/bin/bash
set -e
git submodule update --init dtc
# Let docker tests dump environment info
export SHOW_ENV=1
export J=16
make docker-test-quick@centos6
make docker-test-mingw@fedora
make docker-test-build@min-glib
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
9c884e7 sheepdog: reorganize check for overlapping requests
de86cde sheepdog: simplify inflight_aio_head management
36a2558 sheepdog: do not use BlockAIOCB
1fd9950 sheepdog: reorganize coroutine flow
b7f8f7b sheepdog: remove unused cancellation support

=== OUTPUT BEGIN ===
Submodule 'dtc' (git://git.qemu-project.org/dtc.git) registered for path 'dtc'
Cloning into 'dtc'...
Submodule path 'dtc': checked out '65cc4d2748a2c2e6f27f1cf39e07a5dbabd80ebf'
  BUILD   centos6
make[1]: Entering directory `/var/tmp/patchew-tester-tmp-pdjhtmz8/src'
  ARCHIVE qemu.tgz
  ARCHIVE dtc.tgz
  COPYRUNNER
RUN test-quick in qemu:centos6 
Packages installed:
SDL-devel-1.2.14-7.el6_7.1.x86_64
ccache-3.1.6-2.el6.x86_64
epel-release-6-8.noarch
gcc-4.4.7-17.el6.x86_64
git-1.7.1-4.el6_7.1.x86_64
glib2-devel-2.28.8-5.el6.x86_64
libfdt-devel-1.4.0-1.el6.x86_64
make-3.81-23.el6.x86_64
package g++ is not installed
pixman-devel-0.32.8-1.el6.x86_64
tar-1.23-15.el6_8.x86_64
zlib-devel-1.2.3-29.el6.x86_64

Environment variables:
PACKAGES=libfdt-devel ccache tar git make gcc g++ zlib-devel 
glib2-devel SDL-devel pixman-devel epel-release
HOSTNAME=9b45bd20533a
TERM=xterm
MAKEFLAGS= -j16
HISTSIZE=1000
J=16
USER=root
CCACHE_DIR=/var/tmp/ccache
EXTRA_CONFIGURE_OPTS=
V=
SHOW_ENV=1
MAIL=/var/spool/mail/root
PATH=/usr/lib/ccache:/usr/lib64/ccache:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
PWD=/
LANG=en_US.UTF-8
TARGET_LIST=
HISTCONTROL=ignoredups
SHLVL=1
HOME=/root
TEST_DIR=/tmp/qemu-test
LOGNAME=root
LESSOPEN=||/usr/bin/lesspipe.sh %s
FEATURES= dtc
DEBUG=
G_BROKEN_FILENAMES=1
CCACHE_HASHDIR=
_=/usr/bin/env

Configure options:
--enable-werror --target-list=x86_64-softmmu,aarch64-softmmu 
--prefix=/var/tmp/qemu-build/install
No C++ compiler available; disabling C++ specific optional code
Install prefix/var/tmp/qemu-build/install
BIOS directory/var/tmp/qemu-build/install/share/qemu
binary directory  /var/tmp/qemu-build/install/bin
library directory /var/tmp/qemu-build/install/lib
module directory  /var/tmp/qemu-build/install/lib/qemu
libexec directory /var/tmp/qemu-build/install/libexec
include directory /var/tmp/qemu-build/install/include
config directory  /var/tmp/qemu-build/install/etc
local state directory   /var/tmp/qemu-build/install/var
Manual directory  /var/tmp/qemu-build/install/share/man
ELF interp prefix /usr/gnemul/qemu-%M
Source path   /tmp/qemu-test/src
C compilercc
Host C compiler   cc
C++ compiler  
Objective-C compiler cc
ARFLAGS   rv
CFLAGS-O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -g 
QEMU_CFLAGS   -I/usr/include/pixman-1-pthread -I/usr/include/glib-2.0 
-I/usr/lib64/glib-2.0/include   -fPIE -DPIE -m64 -mcx16 -D_GNU_SOURCE 
-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -Wstrict-prototypes 
-Wredundant-decls -Wall -Wundef -Wwrite-strings -Wmissing-prototypes 
-fno-strict-aliasing -fno-common -fwrapv  -Wendif-labels -Wmissing-include-dirs 
-Wempty-body -Wnested-externs -Wformat-security -Wformat-y2k -Winit-self 
-Wignored-qualifiers -Wold-style-declaration -Wold-style-definition 
-Wtype-limits -fstack-protector-all
LDFLAGS   -Wl,--warn-common -Wl,-z,relro -Wl,-z,now -pie -m64 -g 
make  make
install   install
pythonpython -B
smbd  /usr/sbin/smbd
module supportno
host CPU  x86_64
host big endian   no
target list   x86_64-softmmu aarch64-softmmu
tcg debug enabled no
gprof enabled no
sparse enabledno
strip binariesyes
profiler  no
static build  no
pixmansystem
SDL support   yes (1.2.14)
GTK support   no 
GTK GL supportno
VTE support   no 
TLS priority  NORMAL
GNUTLS supportno
GNUTLS rndno
libgcrypt no
libgcrypt kdf no
nettleno 
nettle kdfno
libtasn1  no
curses supportno
virgl support no
curl support  no
mingw32 support   no
Audio drivers oss
Block whitelist (rw) 
Block whitelist (ro) 
VirtFS supportno
VNC support   yes
VNC SASL support  no
VNC JPEG support  no
VNC PNG support   no
xen support   no
brlapi supportno
bluez  supportno
Documentation no
PIE   yes
vde support   no
netmap supportno
Linux AIO support no
ATTR/XATTR support yes
Install blobs yes
KVM support   yes
COLO su

[Qemu-devel] [PATCH 4/5] sheepdog: simplify inflight_aio_head management

2016-11-18 Thread Paolo Bonzini
Add to the list in add_aio_request and, indirectly, resend_aioreq.  Inline
free_aio_req in the caller; it does not simply undo alloc_aio_req's job.

Cc: Hitoshi Mitake 
Cc: Liu Yuan 
Signed-off-by: Paolo Bonzini 
---
 block/sheepdog.c | 23 ++-
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 5071c41..5aef382 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -479,16 +479,6 @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, 
SheepdogAIOCB *acb,
 return aio_req;
 }
 
-static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
-{
-SheepdogAIOCB *acb = aio_req->aiocb;
-
-QLIST_REMOVE(aio_req, aio_siblings);
-g_free(aio_req);
-
-acb->nr_pending--;
-}
-
 static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
  QEMUIOVector *qiov, int64_t sector_num, int 
nb_sectors,
  int type)
@@ -730,7 +720,6 @@ static coroutine_fn void reconnect_to_sdog(void *opaque)
 while (!QLIST_EMPTY(&s->failed_aio_head)) {
 aio_req = QLIST_FIRST(&s->failed_aio_head);
 QLIST_REMOVE(aio_req, aio_siblings);
-QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
 resend_aioreq(s, aio_req);
 }
 }
@@ -825,6 +814,7 @@ static void coroutine_fn aio_read_response(void *opaque)
 */
 s->co_recv = NULL;
 
+QLIST_REMOVE(aio_req, aio_siblings);
 switch (rsp.result) {
 case SD_RES_SUCCESS:
 break;
@@ -849,8 +839,9 @@ static void coroutine_fn aio_read_response(void *opaque)
 break;
 }
 
-free_aio_req(s, aio_req);
-if (!acb->nr_pending) {
+g_free(aio_req);
+
+if (!--acb->nr_pending) {
 /*
  * We've finished all requests which belong to the AIOCB, so
  * we can switch back to sd_co_readv/writev now.
@@ -1108,6 +1099,8 @@ static void coroutine_fn 
add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
 uint64_t old_oid = aio_req->base_oid;
 bool create = aio_req->create;
 
+QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
+
 if (!nr_copies) {
 error_report("bug");
 }
@@ -1981,7 +1974,6 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
 iov.iov_len = sizeof(s->inode);
 aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
 data_len, offset, 0, false, 0, offset);
-QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
 if (--acb->nr_pending) {
 qemu_coroutine_yield();
@@ -2183,8 +2175,6 @@ static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB 
*acb)
 old_oid,
 acb->aiocb_type == AIOCB_DISCARD_OBJ ?
 0 : done);
-QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
-
 add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
 acb->aiocb_type);
 done:
@@ -2278,7 +2268,6 @@ static int coroutine_fn 
sd_co_flush_to_disk(BlockDriverState *bs)
 acb.nr_pending++;
 aio_req = alloc_aio_req(s, &acb, vid_to_vdi_oid(s->inode.vdi_id),
 0, 0, 0, false, 0, 0);
-QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
 add_aio_request(s, aio_req, NULL, 0, acb.aiocb_type);
 
 if (--acb.nr_pending) {
-- 
2.9.3





Re: [Qemu-devel] [PATCH v7 RFC] block/vxhs: Initial commit to add Veritas HyperScale VxHS block device support

2016-11-18 Thread Jeff Cody
On Fri, Nov 18, 2016 at 10:34:54AM +, Ketan Nilangekar wrote:
> 
> 
> 
> 
> 
> On 11/18/16, 12:56 PM, "Jeff Cody"  wrote:
> 
> >On Wed, Nov 16, 2016 at 08:12:41AM +, Stefan Hajnoczi wrote:
> >> On Tue, Nov 15, 2016 at 10:38 PM, ashish mittal  
> >> wrote:
> >> > On Wed, Sep 28, 2016 at 2:45 PM, Stefan Hajnoczi  
> >> > wrote:
> >> >> On Tue, Sep 27, 2016 at 09:09:49PM -0700, Ashish Mittal wrote:
> >> >> 5.
> >> >> I don't see any endianness handling or portable alignment of struct
> >> >> fields in the network protocol code.  Binary network protocols need to
> >> >> take care of these issue for portability.  This means libqnio compiled
> >> >> for different architectures will not work.  Do you plan to support any
> >> >> other architectures besides x86?
> >> >>
> >> >
> >> > No, we support only x86 and do not plan to support any other arch.
> >> > Please let me know if this necessitates any changes to the configure
> >> > script.
> >> 
> >> I think no change to ./configure is necessary.  The library will only
> >> ship on x86 so other platforms will never attempt to compile the code.
> >> 
> >> >> 6.
> >> >> The networking code doesn't look robust: kvset uses assert() on input
> >> >> from the network so the other side of the connection could cause SIGABRT
> >> >> (coredump), the client uses the msg pointer as the cookie for the
> >> >> response packet so the server can easily crash the client by sending a
> >> >> bogus cookie value, etc.  Even on the client side these things are
> >> >> troublesome but on a server they are guaranteed security issues.  I
> >> >> didn't look into it deeply.  Please audit the code.
> >> >>
> >> >
> >> > By design, our solution on OpenStack platform uses a closed set of
> >> > nodes communicating on dedicated networks. VxHS servers on all the
> >> > nodes are on a dedicated network. Clients (qemu) connects to these
> >> > only after reading the server IP from the XML (read by libvirt). The
> >> > XML cannot be modified without proper access. Therefore, IMO this
> >> > problem would be  relevant only if someone were to use qnio as a
> >> > generic mode of communication/data transfer, but for our use-case, we
> >> > will not run into this problem. Is this explanation acceptable?
> >> 
> >> No.  The trust model is that the guest is untrusted and in the worst
> >> case may gain code execution in QEMU due to security bugs.
> >> 
> >> You are assuming block/vxhs.c and libqnio are trusted but that
> >> assumption violates the trust model.
> >> 
> >> In other words:
> >> 1. Guest exploits a security hole inside QEMU and gains code execution
> >> on the host.
> >> 2. Guest uses VxHS client file descriptor on host to send a malicious
> >> packet to VxHS server.
> >> 3. VxHS server is compromised by guest.
> >> 4. Compromised VxHS server sends malicious packets to all other
> >> connected clients.
> >> 5. All clients have been compromised.
> >> 
> >> This means both the VxHS client and server must be robust.  They have
> >> to validate inputs to avoid buffer overflows, assertion failures,
> >> infinite loops, etc.
> >> 
> >> Stefan
> >
> >
> >The libqnio code is important with respect to the VxHS driver.  It is a bit
> >different than other existing external protocol drivers, in that the current
> >user and developer base is small, and the code itself is pretty new.  So I
> >think for the VxHS driver here upstream, we really do need to get some of
> >the libqnio issues squared away.  I don't know if we've ever explicitly
> >addressed the extent to which libqnio issues affect the driver
> >merging, so I figure it is probably worth discussing here.
> >
> >To try and consolidate libqnio discussion, here is what I think I've read /
> >seen from others as the major issues that should be addressed in libqnio:
> >
> >* Code auditing, static analysis, and general code cleanup.  Things like
> >  memory leaks shouldn't be happening, and some prior libqnio compiler
> >  warnings imply that there is more code analysis that should be done with
> >  libqnio.
> >
> >  (With regards to memory leaks:  Valgrind may be useful to track these down:
> >
> ># valgrind  ./qemu-io -c 'write -pP 0xae 66000 128k' \
> >vxhs://localhost/test.raw
> >
> >==30369== LEAK SUMMARY:
> >==30369==definitely lost: 4,168 bytes in 2 blocks
> >==30369==indirectly lost: 1,207,720 bytes in 58,085 blocks) 
> 
> We have done and are doing exhaustive memory leak tests using valgrind.
> Memory leaks within qnio have been addressed to some extent. We will post
> detailed valgrind results to this thread.
>

That is good to hear.  I ran the above on the latest HEAD from the qnio
github repo, so I look forward to checking out the latest code once it is
available.

> >
> >* Potential security issues such as buffer overruns, input validation, etc., 
> >  need to be audited.
> 
> We know of a few such issues from previous comments and have addressed
> some of those. If there are any important out

[Qemu-devel] [PATCH 3/5] sheepdog: do not use BlockAIOCB

2016-11-18 Thread Paolo Bonzini
Sheepdog's AIOCBs are completely internal entities for a group of
requests and do not need dynamic allocation.

Cc: Hitoshi Mitake 
Cc: Liu Yuan 
Signed-off-by: Paolo Bonzini 
---
 block/sheepdog.c | 95 ++--
 1 file changed, 37 insertions(+), 58 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index f849941..5071c41 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -306,6 +306,7 @@ static inline size_t count_data_objs(const struct 
SheepdogInode *inode)
 } while (0)
 
 typedef struct SheepdogAIOCB SheepdogAIOCB;
+typedef struct BDRVSheepdogState BDRVSheepdogState;
 
 typedef struct AIOReq {
 SheepdogAIOCB *aiocb;
@@ -334,7 +335,7 @@ enum AIOCBState {
|| y->max_affect_data_idx < x->min_affect_data_idx))
 
 struct SheepdogAIOCB {
-BlockAIOCB common;
+BDRVSheepdogState *s;
 
 QEMUIOVector *qiov;
 
@@ -488,20 +489,15 @@ static inline void free_aio_req(BDRVSheepdogState *s, 
AIOReq *aio_req)
 acb->nr_pending--;
 }
 
-static const AIOCBInfo sd_aiocb_info = {
-.aiocb_size = sizeof(SheepdogAIOCB),
-};
-
-static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
-   int64_t sector_num, int nb_sectors)
+static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
+ QEMUIOVector *qiov, int64_t sector_num, int 
nb_sectors,
+ int type)
 {
-SheepdogAIOCB *acb;
 uint32_t object_size;
-BDRVSheepdogState *s = bs->opaque;
 
 object_size = (UINT32_C(1) << s->inode.block_size_shift);
 
-acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL);
+acb->s = s;
 
 acb->qiov = qiov;
 
@@ -518,8 +514,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, 
QEMUIOVector *qiov,
 
 acb->min_dirty_data_idx = UINT32_MAX;
 acb->max_dirty_data_idx = 0;
-
-return acb;
+acb->aiocb_type = type;
 }
 
 /* Return -EIO in case of error, file descriptor on success */
@@ -1965,7 +1960,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t 
offset)
  */
 static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
 {
-BDRVSheepdogState *s = acb->common.bs->opaque;
+BDRVSheepdogState *s = acb->s;
 struct iovec iov;
 AIOReq *aio_req;
 uint32_t offset, data_len, mn, mx;
@@ -2103,16 +2098,15 @@ out:
  * Returns 1 when we need to wait a response, 0 when there is no sent
  * request and -errno in error cases.
  */
-static void coroutine_fn sd_co_rw_vector(void *p)
+static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB *acb)
 {
-SheepdogAIOCB *acb = p;
 int ret = 0;
 unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
 unsigned long idx;
 uint32_t object_size;
 uint64_t oid;
 uint64_t offset;
-BDRVSheepdogState *s = acb->common.bs->opaque;
+BDRVSheepdogState *s = acb->s;
 SheepdogInode *inode = &s->inode;
 AIOReq *aio_req;
 
@@ -2220,7 +2214,7 @@ static bool check_overlapping_aiocb(BDRVSheepdogState *s, 
SheepdogAIOCB *aiocb)
 static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
 int nb_sectors, QEMUIOVector *qiov)
 {
-SheepdogAIOCB *acb;
+SheepdogAIOCB acb;
 int ret;
 int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE;
 BDRVSheepdogState *s = bs->opaque;
@@ -2232,76 +2226,65 @@ static coroutine_fn int sd_co_writev(BlockDriverState 
*bs, int64_t sector_num,
 }
 }
 
-acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
-acb->aiocb_type = AIOCB_WRITE_UDATA;
+sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
 
 retry:
-if (check_overlapping_aiocb(s, acb)) {
+if (check_overlapping_aiocb(s, &acb)) {
 qemu_co_queue_wait(&s->overlapping_queue);
 goto retry;
 }
 
-sd_co_rw_vector(acb);
-sd_write_done(acb);
+sd_co_rw_vector(&acb);
+sd_write_done(&acb);
 
-QLIST_REMOVE(acb, aiocb_siblings);
+QLIST_REMOVE(&acb, aiocb_siblings);
 qemu_co_queue_restart_all(&s->overlapping_queue);
-ret = acb->ret;
-qemu_aio_unref(acb);
-return ret;
+return acb.ret;
 }
 
 static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, QEMUIOVector *qiov)
 {
-SheepdogAIOCB *acb;
-int ret;
+SheepdogAIOCB acb;
 BDRVSheepdogState *s = bs->opaque;
 
-acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
-acb->aiocb_type = AIOCB_READ_UDATA;
+sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_READ_UDATA);
 
 retry:
-if (check_overlapping_aiocb(s, acb)) {
+if (check_overlapping_aiocb(s, &acb)) {
 qemu_co_queue_wait(&s->overlapping_queue);
 goto retry;
 }
 
-sd_co_rw_vector(acb);
+sd_co_rw_vector(&acb);
 
-QLIST_REMOVE(acb, aiocb_siblings);
+QLIST_REMOVE(&acb, aiocb_siblings);
 qemu_co_queue_restart_all(&s->overlapping_queue)

[Qemu-devel] [PATCH 2/5] sheepdog: reorganize coroutine flow

2016-11-18 Thread Paolo Bonzini
Delimit co_recv's lifetime clearly in aio_read_response.

Do a simple qemu_coroutine_enter in aio_read_response, letting
sd_co_writev call sd_write_done.

Handle nr_pending in the same way in sd_co_rw_vector,
sd_write_done and sd_co_flush_to_disk.

Remove sd_co_rw_vector's return value; just leave with no
pending requests.

Cc: Hitoshi Mitake 
Cc: Liu Yuan 
Signed-off-by: Paolo Bonzini 
---
 block/sheepdog.c | 115 ---
 1 file changed, 41 insertions(+), 74 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index d2b14fd..f849941 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -345,8 +345,6 @@ struct SheepdogAIOCB {
 enum AIOCBState aiocb_type;
 
 Coroutine *coroutine;
-void (*aio_done_func)(SheepdogAIOCB *);
-
 int nr_pending;
 
 uint32_t min_affect_data_idx;
@@ -449,14 +447,13 @@ static const char * sd_strerror(int err)
  *
  * 1. In sd_co_rw_vector, we send the I/O requests to the server and
  *link the requests to the inflight_list in the
- *BDRVSheepdogState.  The function exits without waiting for
+ *BDRVSheepdogState.  The function yields while waiting for
  *receiving the response.
  *
  * 2. We receive the response in aio_read_response, the fd handler to
- *the sheepdog connection.  If metadata update is needed, we send
- *the write request to the vdi object in sd_write_done, the write
- *completion function.  We switch back to sd_co_readv/writev after
- *all the requests belonging to the AIOCB are finished.
+ *the sheepdog connection.  We switch back to sd_co_readv/sd_co_writev
+ *after all the requests belonging to the AIOCB are finished.  If
+ *needed, sd_co_writev will send another request for the vdi object.
  */
 
 static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
@@ -491,12 +488,6 @@ static inline void free_aio_req(BDRVSheepdogState *s, 
AIOReq *aio_req)
 acb->nr_pending--;
 }
 
-static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb)
-{
-qemu_coroutine_enter(acb->coroutine);
-qemu_aio_unref(acb);
-}
-
 static const AIOCBInfo sd_aiocb_info = {
 .aiocb_size = sizeof(SheepdogAIOCB),
 };
@@ -517,7 +508,6 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, 
QEMUIOVector *qiov,
 acb->sector_num = sector_num;
 acb->nb_sectors = nb_sectors;
 
-acb->aio_done_func = NULL;
 acb->coroutine = qemu_coroutine_self();
 acb->ret = 0;
 acb->nr_pending = 0;
@@ -788,9 +778,6 @@ static void coroutine_fn aio_read_response(void *opaque)
 
 switch (acb->aiocb_type) {
 case AIOCB_WRITE_UDATA:
-/* this coroutine context is no longer suitable for co_recv
- * because we may send data to update vdi objects */
-s->co_recv = NULL;
 if (!is_data_obj(aio_req->oid)) {
 break;
 }
@@ -838,6 +825,11 @@ static void coroutine_fn aio_read_response(void *opaque)
 }
 }
 
+/* No more data for this aio_req (reload_inode below uses its own file
+ * descriptor handler which doesn't use co_recv).
+*/
+s->co_recv = NULL;
+
 switch (rsp.result) {
 case SD_RES_SUCCESS:
 break;
@@ -855,7 +847,7 @@ static void coroutine_fn aio_read_response(void *opaque)
 aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id);
 }
 resend_aioreq(s, aio_req);
-goto out;
+return;
 default:
 acb->ret = -EIO;
 error_report("%s", sd_strerror(rsp.result));
@@ -868,13 +860,10 @@ static void coroutine_fn aio_read_response(void *opaque)
  * We've finished all requests which belong to the AIOCB, so
  * we can switch back to sd_co_readv/writev now.
  */
-acb->aio_done_func(acb);
+qemu_coroutine_enter(acb->coroutine);
 }
-out:
-s->co_recv = NULL;
-return;
+
 err:
-s->co_recv = NULL;
 reconnect_to_sdog(opaque);
 }
 
@@ -1973,7 +1962,6 @@ static int sd_truncate(BlockDriverState *bs, int64_t 
offset)
 /*
  * This function is called after writing data objects.  If we need to
  * update metadata, this sends a write request to the vdi object.
- * Otherwise, this switches back to sd_co_readv/writev.
  */
 static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
 {
@@ -1986,6 +1974,7 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
 mx = acb->max_dirty_data_idx;
 if (mn <= mx) {
 /* we need to update the vdi object. */
+++acb->nr_pending;
 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
 mn * sizeof(s->inode.data_vdi_id[0]);
 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
@@ -1999,13 +1988,10 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB 
*acb)
 data_len, offset, 0, false, 0, offset);
 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
 add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDAT

[Qemu-devel] [PULL 7/7] acpi: Use apic_id_limit when calculating legacy ACPI table size

2016-11-18 Thread Michael S. Tsirkin
From: Eduardo Habkost 

The code that calculates the legacy ACPI table size for migration
compatibility uses max_cpus when calculating legacy_aml_len (the size of
the DSDT and SSDT tables). However, the SSDT grows according to APIC ID
limit, not max_cpus.

The bug is not triggered very often because of the 4k alignment on the
table size. But it can be triggered if you are unlucky enough to cross a
4k boundary.

Change the legacy_aml_len calculation to use apic_id_limit, to calculate
the right size.

Signed-off-by: Eduardo Habkost 
Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Michael S. Tsirkin 
---
 hw/i386/acpi-build.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index a155857..45a2ccf 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2860,7 +2860,7 @@ void acpi_build(AcpiBuildTables *tables, MachineState 
*machine)
  */
 int legacy_aml_len =
 pcmc->legacy_acpi_table_size +
-ACPI_BUILD_LEGACY_CPU_AML_SIZE * max_cpus;
+ACPI_BUILD_LEGACY_CPU_AML_SIZE * pcms->apic_id_limit;
 int legacy_table_size =
 ROUND_UP(tables_blob->len - aml_len + legacy_aml_len,
  ACPI_BUILD_ALIGN_SIZE);
-- 
MST




[Qemu-devel] [PULL 6/7] ipmi: fix qemu crash while migrating with ipmi

2016-11-18 Thread Michael S. Tsirkin
From: ZhuangYanying 

QEMU crashes on the source side while migrating, after starting the ipmi
service inside the VM.

./x86_64-softmmu/qemu-system-x86_64 --enable-kvm -smp 4 -m 4096 \
-drive 
file=/work/suse/suse11_sp3_64_vt,format=raw,if=none,id=drive-virtio-disk0,cache=none
 \
-device 
virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0
 \
-vnc :99 -monitor vc -device ipmi-bmc-sim,id=bmc0 -device 
isa-ipmi-kcs,bmc=bmc0,ioport=0xca2

Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7ffec4268700 (LWP 7657)]
__memcpy_ssse3_back () at ../sysdeps/x86_64/multiarch/memcpy-ssse3-back.S:2757
(gdb) bt
 #0  __memcpy_ssse3_back () at 
../sysdeps/x86_64/multiarch/memcpy-ssse3-back.S:2757
 #1  0x559ef775 in memcpy (__len=3, __src=0xc1421c, __dest=)
 at /usr/include/bits/string3.h:51
 #2  qemu_put_buffer (f=0x57a97690, buf=0xc1421c , size=3)
 at migration/qemu-file.c:346
 #3  0x559eef66 in vmstate_save_state (f=f@entry=0x57a97690,
 vmsd=0x55f8a5a0 , opaque=0x57231160,
 vmdesc=vmdesc@entry=0x5798cc40) at migration/vmstate.c:333
 #4  0x557cfe45 in vmstate_save (f=f@entry=0x57a97690, 
se=se@entry=0x57231de0,
 vmdesc=vmdesc@entry=0x5798cc40) at 
/mnt/sdb/zyy/qemu/migration/savevm.c:720
 #5  0x557d2be7 in qemu_savevm_state_complete_precopy (f=0x57a97690,
 iterable_only=iterable_only@entry=false) at 
/mnt/sdb/zyy/qemu/migration/savevm.c:1128
 #6  0x559ea102 in migration_completion (start_time=,
 old_vm_running=, current_active_state=,
 s=0x560eaa80 ) at migration/migration.c:1707
 #7  migration_thread (opaque=0x560eaa80 ) at 
migration/migration.c:1855
 #8  0x73900dc5 in start_thread (arg=0x7ffec4268700) at 
pthread_create.c:308
 #9  0x7fffefc6c71d in clone () at 
../sysdeps/unix/sysv/linux/x86_64/clone.S:113

Signed-off-by: Zhuang Yanying 
Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Michael S. Tsirkin 
---
 hw/ipmi/isa_ipmi_kcs.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/hw/ipmi/isa_ipmi_kcs.c b/hw/ipmi/isa_ipmi_kcs.c
index 9a38f8a..8044497 100644
--- a/hw/ipmi/isa_ipmi_kcs.c
+++ b/hw/ipmi/isa_ipmi_kcs.c
@@ -433,10 +433,8 @@ const VMStateDescription vmstate_ISAIPMIKCSDevice = {
 VMSTATE_BOOL(kcs.use_irq, ISAIPMIKCSDevice),
 VMSTATE_BOOL(kcs.irqs_enabled, ISAIPMIKCSDevice),
 VMSTATE_UINT32(kcs.outpos, ISAIPMIKCSDevice),
-VMSTATE_VBUFFER_UINT32(kcs.outmsg, ISAIPMIKCSDevice, 1, NULL, 0,
-   kcs.outlen),
-VMSTATE_VBUFFER_UINT32(kcs.inmsg, ISAIPMIKCSDevice, 1, NULL, 0,
-   kcs.inlen),
+VMSTATE_UINT8_ARRAY(kcs.outmsg, ISAIPMIKCSDevice, MAX_IPMI_MSG_SIZE),
+VMSTATE_UINT8_ARRAY(kcs.inmsg, ISAIPMIKCSDevice, MAX_IPMI_MSG_SIZE),
 VMSTATE_BOOL(kcs.write_end, ISAIPMIKCSDevice),
 VMSTATE_UINT8(kcs.status_reg, ISAIPMIKCSDevice),
 VMSTATE_UINT8(kcs.data_out_reg, ISAIPMIKCSDevice),
-- 
MST




Re: [Qemu-devel] [PATCH v14 00/22] Add Mediated device support

2016-11-18 Thread Daniel Vetter
On Fri, Nov 18, 2016 at 4:40 PM, Alex Williamson
 wrote:
>> Alex, could you do a pull request of mdev for Daniel's drm-intel tree?
>> We need to send KVMGT mdev support pull base on that.
>
> No, this is not how I intend or prefer to merge this.  This is a large
> change for vfio and it is not exclusive to KVMGT.  We have linux-next
> to facilitate handling dependencies between subsystems during
> development and a two week merge window to allow managing how these
> changes enter the mainline tree.  If I were to have this pulled into
> drm-intel it ties my hands as to how I can manage changes within my
> functional area.  I want these two weeks of linux-next exposure for
> vetting the changes and resolving any remaining issues.  I'm not going
> to compromise my ability to react to such issues.  linux-next inclusion
> should be sufficient for you to coordinate through the drm tree, though
> Daniel will need to be made aware of the dependency.  I will however
> plan to send my pull request to Linus early in the merge window to
> accommodate dependent changes also being included for v4.10. Hope
> you understand, thanks,

My understanding was that the mdev changes are needed to be able to
apply the kvmgt stuff, and otherwise it won't build. For that I need a
stable git tag&pull request (can be specific topic branch, which means
subsystems can land in any order, or the full subsystem tree, which
means dependencies need to be tracked correctly). I am not going to
resolve that in the merge window, since in drm we want everything
lined up _before_ that opens (the feature cutoff is this w/e, but
there's some wiggle room ofc).

Sounds like there's just not enough time to line all the things up in
time for 4.10, and the i915/kvmgt stuff needs to be postponed to 4.11.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch



[Qemu-devel] [PULL 4/7] virtio: set ISR on dataplane notifications

2016-11-18 Thread Michael S. Tsirkin
From: Paolo Bonzini 

Dataplane has been omitting forever the step of setting ISR when
an interrupt is raised.  This caused little breakage, because the
specification actually says that ISR may not be updated in MSI mode.

Some versions of the Windows drivers however didn't clear MSI mode
correctly, and proceeded using polling mode (using ISR, not the used
ring index!) for crashdump and hibernation.  If it were just crashdump
and hibernation it would not be a big deal, but recent releases of
Windows do not really shut down, but rather log out and hibernate to
make the next startup faster.  Hence, this manifested as a more serious
hang during shutdown with e.g. Windows 8.1 and virtio-win 1.8.0 RPMs.
Newer versions fixed this, while older versions do not use MSI at all.

The failure has always been there for virtio dataplane, but it became
visible after commits 9ffe337 ("virtio-blk: always use dataplane path
if ioeventfd is active", 2016-10-30) and ad07cd6 ("virtio-scsi: always
use dataplane path if ioeventfd is active", 2016-10-30) made virtio-blk
and virtio-scsi always use the dataplane code under KVM.  The good news
therefore is that it was not a bug in the patches---they were doing
exactly what they were meant for, i.e. shake out remaining dataplane bugs.

The fix is not hard, so it's worth arranging for the broken drivers.
The virtio_should_notify+event_notifier_set pair that is common to
virtio-blk and virtio-scsi dataplane is replaced with a new public
function virtio_notify_irqfd that also sets ISR.  The irqfd emulation
code now need not set ISR anymore, so virtio_irq is removed.

Reviewed-by: Stefan Hajnoczi 
Tested-by: Farhan Ali 
Tested-by: Alex Williamson 
Signed-off-by: Paolo Bonzini 
Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Michael S. Tsirkin 
---
 include/hw/virtio/virtio-scsi.h |  1 -
 include/hw/virtio/virtio.h  |  2 +-
 hw/block/dataplane/virtio-blk.c |  4 +---
 hw/scsi/virtio-scsi-dataplane.c |  7 ---
 hw/scsi/virtio-scsi.c   |  2 +-
 hw/virtio/virtio.c  | 36 
 hw/virtio/trace-events  |  2 +-
 7 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 9fbc7d7..7375196 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -137,6 +137,5 @@ void virtio_scsi_push_event(VirtIOSCSI *s, SCSIDevice *dev,
 void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp);
 int virtio_scsi_dataplane_start(VirtIODevice *s);
 void virtio_scsi_dataplane_stop(VirtIODevice *s);
-void virtio_scsi_dataplane_notify(VirtIODevice *vdev, VirtIOSCSIReq *req);
 
 #endif /* QEMU_VIRTIO_SCSI_H */
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 835b085..ab0e030 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -181,6 +181,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int 
*in_bytes,
unsigned max_in_bytes, unsigned max_out_bytes);
 
 bool virtio_should_notify(VirtIODevice *vdev, VirtQueue *vq);
+void virtio_notify_irqfd(VirtIODevice *vdev, VirtQueue *vq);
 void virtio_notify(VirtIODevice *vdev, VirtQueue *vq);
 
 void virtio_save(VirtIODevice *vdev, QEMUFile *f);
@@ -280,7 +281,6 @@ void virtio_queue_host_notifier_read(EventNotifier *n);
 void virtio_queue_aio_set_host_notifier_handler(VirtQueue *vq, AioContext *ctx,
 void (*fn)(VirtIODevice *,
VirtQueue *));
-void virtio_irq(VirtQueue *vq);
 VirtQueue *virtio_vector_first_queue(VirtIODevice *vdev, uint16_t vector);
 VirtQueue *virtio_vector_next_queue(VirtQueue *vq);
 
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 90ef557..d1f9f63 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -68,9 +68,7 @@ static void notify_guest_bh(void *opaque)
 unsigned i = j + ctzl(bits);
 VirtQueue *vq = virtio_get_queue(s->vdev, i);
 
-if (virtio_should_notify(s->vdev, vq)) {
-event_notifier_set(virtio_queue_get_guest_notifier(vq));
-}
+virtio_notify_irqfd(s->vdev, vq);
 
 bits &= bits - 1; /* clear right-most bit */
 }
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index f2ea29d..6b8d0f0 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -95,13 +95,6 @@ static int virtio_scsi_vring_init(VirtIOSCSI *s, VirtQueue 
*vq, int n,
 return 0;
 }
 
-void virtio_scsi_dataplane_notify(VirtIODevice *vdev, VirtIOSCSIReq *req)
-{
-if (virtio_should_notify(vdev, req->vq)) {
-event_notifier_set(virtio_queue_get_guest_notifier(req->vq));
-}
-}
-
 /* assumes s->ctx held */
 static void virtio_scsi_clear_aio(VirtIOSCSI *s)
 {
diff --git a/hw/scsi/virtio-scsi.c b/hw/s

[Qemu-devel] [PULL 5/7] ivshmem: Fix 64 bit memory bar configuration

2016-11-18 Thread Michael S. Tsirkin
From: Zhuang Yanying 

Device ivshmem property use64=0 is designed to make the device
expose a 32 bit shared memory BAR instead of 64 bit one.  The
default is a 64 bit BAR, except pc-1.2 and older retain a 32 bit
BAR.  A 32 bit BAR can support only up to 1 GiB of shared memory.

This worked as designed until commit 5400c02 accidentally flipped
its sense: since then, we misinterpret use64=0 as use64=1 and vice
versa.  Worse, the default got flipped as well.  Devices
ivshmem-plain and ivshmem-doorbell are not affected.

Fix by restoring the test of IVShmemState member not_legacy_32bit
that got messed up in commit 5400c02.  Also update its
initialization for devices ivshmem-plain and ivshmem-doorbell.
Without that, they'd regress to 32 bit BARs.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Zhuang Yanying 
Reviewed-by: Gonglei 
Reviewed-by: Marc-André Lureau 
Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Michael S. Tsirkin 
Reviewed-by: Markus Armbruster 
---
 hw/misc/ivshmem.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
index 230e51b..abeaf3d 100644
--- a/hw/misc/ivshmem.c
+++ b/hw/misc/ivshmem.c
@@ -858,7 +858,7 @@ static void ivshmem_common_realize(PCIDevice *dev, Error 
**errp)
 pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
  &s->ivshmem_mmio);
 
-if (!s->not_legacy_32bit) {
+if (s->not_legacy_32bit) {
 attr |= PCI_BASE_ADDRESS_MEM_TYPE_64;
 }
 
@@ -1045,6 +1045,7 @@ static void ivshmem_plain_init(Object *obj)
  ivshmem_check_memdev_is_busy,
  OBJ_PROP_LINK_UNREF_ON_RELEASE,
  &error_abort);
+s->not_legacy_32bit = 1;
 }
 
 static void ivshmem_plain_realize(PCIDevice *dev, Error **errp)
@@ -1116,6 +1117,7 @@ static void ivshmem_doorbell_init(Object *obj)
 
 s->features |= (1 << IVSHMEM_MSI);
 s->legacy_size = SIZE_MAX;  /* whatever the server sends */
+s->not_legacy_32bit = 1;
 }
 
 static void ivshmem_doorbell_realize(PCIDevice *dev, Error **errp)
-- 
MST




[Qemu-devel] [PATCH 1/5] sheepdog: remove unused cancellation support

2016-11-18 Thread Paolo Bonzini
SheepdogAIOCB is internal to sheepdog.c, hence it is never canceled.

Cc: Hitoshi Mitake 
Cc: Liu Yuan 
Signed-off-by: Paolo Bonzini 
---
 block/sheepdog.c | 52 
 1 file changed, 52 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 1fb9173..d2b14fd 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -347,7 +347,6 @@ struct SheepdogAIOCB {
 Coroutine *coroutine;
 void (*aio_done_func)(SheepdogAIOCB *);
 
-bool cancelable;
 int nr_pending;
 
 uint32_t min_affect_data_idx;
@@ -486,7 +485,6 @@ static inline void free_aio_req(BDRVSheepdogState *s, 
AIOReq *aio_req)
 {
 SheepdogAIOCB *acb = aio_req->aiocb;
 
-acb->cancelable = false;
 QLIST_REMOVE(aio_req, aio_siblings);
 g_free(aio_req);
 
@@ -499,57 +497,8 @@ static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB 
*acb)
 qemu_aio_unref(acb);
 }
 
-/*
- * Check whether the specified acb can be canceled
- *
- * We can cancel aio when any request belonging to the acb is:
- *  - Not processed by the sheepdog server.
- *  - Not linked to the inflight queue.
- */
-static bool sd_acb_cancelable(const SheepdogAIOCB *acb)
-{
-BDRVSheepdogState *s = acb->common.bs->opaque;
-AIOReq *aioreq;
-
-if (!acb->cancelable) {
-return false;
-}
-
-QLIST_FOREACH(aioreq, &s->inflight_aio_head, aio_siblings) {
-if (aioreq->aiocb == acb) {
-return false;
-}
-}
-
-return true;
-}
-
-static void sd_aio_cancel(BlockAIOCB *blockacb)
-{
-SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb;
-BDRVSheepdogState *s = acb->common.bs->opaque;
-AIOReq *aioreq, *next;
-
-if (sd_acb_cancelable(acb)) {
-/* Remove outstanding requests from failed queue.  */
-QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings,
-   next) {
-if (aioreq->aiocb == acb) {
-free_aio_req(s, aioreq);
-}
-}
-
-assert(acb->nr_pending == 0);
-if (acb->common.cb) {
-acb->common.cb(acb->common.opaque, -ECANCELED);
-}
-sd_finish_aiocb(acb);
-}
-}
-
 static const AIOCBInfo sd_aiocb_info = {
 .aiocb_size = sizeof(SheepdogAIOCB),
-.cancel_async   = sd_aio_cancel,
 };
 
 static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
@@ -569,7 +518,6 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, 
QEMUIOVector *qiov,
 acb->nb_sectors = nb_sectors;
 
 acb->aio_done_func = NULL;
-acb->cancelable = true;
 acb->coroutine = qemu_coroutine_self();
 acb->ret = 0;
 acb->nr_pending = 0;
-- 
2.9.3





[Qemu-devel] [PULL 1/7] virtio-crypto: fix virtio_queue_set_notification() race

2016-11-18 Thread Michael S. Tsirkin
From: Stefan Hajnoczi 

We must check for new virtqueue buffers after re-enabling notifications.
This prevents the race condition where the guest added buffers just
after we stopped popping the virtqueue but before we re-enabled
notifications.

I think the virtio-crypto code was based on virtio-net but this crucial
detail was missed.  virtio-net does not have the race condition because
it processes the virtqueue one more time after re-enabling
notifications.

Cc: Gonglei 
Signed-off-by: Stefan Hajnoczi 
Tested-by: Alexey Kardashevskiy 
Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Michael S. Tsirkin 
Reviewed-by: Gonglei 
---
 hw/virtio/virtio-crypto.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index 3293843..847dc9d 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -692,8 +692,17 @@ static void virtio_crypto_dataq_bh(void *opaque)
 return;
 }
 
-virtio_crypto_handle_dataq(vdev, q->dataq);
-virtio_queue_set_notification(q->dataq, 1);
+for (;;) {
+virtio_crypto_handle_dataq(vdev, q->dataq);
+virtio_queue_set_notification(q->dataq, 1);
+
+/* Are we done or did the guest add more buffers? */
+if (virtio_queue_empty(q->dataq)) {
+break;
+}
+
+virtio_queue_set_notification(q->dataq, 0);
+}
 }
 
 static void
-- 
MST




[Qemu-devel] [PATCH for-2.9 0/5] Sheepdog cleanups

2016-11-18 Thread Paolo Bonzini
Cleaning up the code and removing duplication makes it simpler to
later adapt it for the multiqueue work.

Tested against sheepdog 1.0.  I also tested taking snapshots and reverting
to older snapshots, but the latter only worked with "dog vdi rollback".
Neither loadvm nor qemu-img worked for me.

Paolo Bonzini (5):
  sheepdog: remove unused cancellation support
  sheepdog: reorganize coroutine flow
  sheepdog: do not use BlockAIOCB
  sheepdog: simplify inflight_aio_head management
  sheepdog: reorganize check for overlapping requests

 block/sheepdog.c | 285 ---
 1 file changed, 81 insertions(+), 204 deletions(-)

-- 
2.9.3




Re: [Qemu-devel] [PATCH] hw/pci: disable pci-bridge's shpc by default

2016-11-18 Thread Michael S. Tsirkin
On Fri, Nov 18, 2016 at 04:52:01PM +0100, Andrew Jones wrote:
> On Wed, Nov 16, 2016 at 07:05:25PM +0200, Marcel Apfelbaum wrote:
> > On 11/16/2016 06:44 PM, Andrew Jones wrote:
> > > On Sat, Nov 05, 2016 at 06:46:34PM +0200, Marcel Apfelbaum wrote:
> > > > On 11/03/2016 09:40 PM, Michael S. Tsirkin wrote:
> > > > > On Thu, Nov 03, 2016 at 01:05:44PM +0200, Marcel Apfelbaum wrote:
> > > > > > On 11/03/2016 06:18 AM, Michael S. Tsirkin wrote:
> > > > > > > On Wed, Nov 02, 2016 at 05:16:42PM +0200, Marcel Apfelbaum wrote:
> > > > > > > > The shpc component is optional while  ACPI hotplug is used
> > > > > > > > for hot-plugging PCI devices into a PCI-PCI bridge.
> > > > > > > > Disabling the shpc by default will make slot 0 usable at boot 
> > > > > > > > time
> > > > > > 
> > > > > > Hi Michael
> > > > > > 
> > > > > > > 
> > > > > > > at the cost of breaking all hotplug for all non-acpi users.
> > > > > > > 
> > > > > > 
> > > > > > Do we have a non-acpi user that is able to use the shpc component 
> > > > > > as-is today?
> > > > > 
> > > > > power and some arm systems I guess?
> > > > > 
> > > > 
> > > > Adding Andrew , maybe he can give us an answer.
> > > 
> > > Not really :-) My lack of PCI knowledge makes that difficult. I'd be happy
> > > to help with an experiment though. Can you give me command line arguments,
> > > qmp commands, etc. that I should use to try it out? I imagine I should
> > > just boot an ARM guest using DT (instead of ACPI) and then attempt to
> > > hotplug a PCI device. I'm not sure, however, what, if any, special
> > > configuration I need in order to ensure I'm testing what you're
> > > interested in.
> > > 
> > 
> > Hi Drew,
> > 
> > 
> > Just run QEMU with '-device pci-bridge,chassis_nr=1,id=bridge1 -monitor 
> > stdio'
> > with an ARM guest using DT and wait until the guest finish booting.
> > 
> > Then run at hmp:
> > device_add virtio-net-pci,bus=bridge1,id=net2
> > 
> > Next run lspci in the guest to see the new device.
> 
> Thanks for the instructions Marcel. Here's the results
> 
>  $QEMU -machine virt,accel=$ACCEL -cpu $CPU -nographic -m 4096 -smp 8 \
>-bios /usr/share/AAVMF/AAVMF_CODE.fd \
>-device pci-bridge,chassis_nr=1,id=bridge1 \
>-drive file=$FEDORA_IMG,if=none,id=dr0,format=qcow2 \
>-device virtio-blk-pci,bus=bridge1,addr=01,drive=dr0,id=disk0 \
>-netdev user,id=hostnet0 \
>-device virtio-net-pci,bus=bridge1,addr=02,netdev=hostnet0,id=net0
> 
>  # lspci
>  00:00.0 Host bridge: Red Hat, Inc. Device 0008
>  00:01.0 PCI bridge: Red Hat, Inc. QEMU PCI-PCI bridge
>  01:01.0 SCSI storage controller: Red Hat, Inc Virtio block device
>  01:02.0 Ethernet controller: Red Hat, Inc Virtio network device
> 
>  (qemu) device_add virtio-net-pci,bus=bridge1,id=net2
>  Unsupported PCI slot 0 for standard hotplug controller. Valid slots are
>  between 1 and 31.
> 
> (Tried again giving addr=03)
> 
>  (qemu) device_add virtio-net-pci,bus=bridge1,id=net2,addr=03
> 
> (Seemed to work, but...)
> 
>  # lspci
>  00:00.0 Host bridge: Red Hat, Inc. Device 0008
>  00:01.0 PCI bridge: Red Hat, Inc. QEMU PCI-PCI bridge
>  01:01.0 SCSI storage controller: Red Hat, Inc Virtio block device
>  01:02.0 Ethernet controller: Red Hat, Inc Virtio network device
> 
> (Doesn't show up in lspci. So I guess it doesn't work)
> 
> > 
> > 
> > BTW, will an ARM guest run 'fast' enough to be usable on a x86 machine?
> > If yes, any pointers on how to create such a guest?
> 
> You can run AArch64 guests on x86 machines. It's not super fast though...
> Certainly I wouldn't want to create my guest image using TCG. So, assuming
> you acquire an image somewhere (or create it on a real machine), then you
> can use the above command line, just change 
> 
> ACCEL=kvm CPU=host to ACCEL=tcg CPU=cortex-a57
> 
> Thanks,
> drew

http://wiki.qemu.org/Testing/System_Images

has some images.
If you have a better one to contribute, upload it there.

-- 
MST



[Qemu-devel] [PULL 0/7] virtio, vhost, pc: fixes

2016-11-18 Thread Michael S. Tsirkin
The following changes since commit 453ac8835b002263a6b7b0843e7c90fa8b19c869:

  docs: add PCIe devices placement guidelines (2016-11-15 17:20:38 +0200)

are available in the git repository at:

  git://git.kernel.org/pub/scm/virt/kvm/mst/qemu.git tags/for_upstream

for you to fetch changes up to 4b5b47abbf23246bd8dde4c6faaed8b7249d8654:

  acpi: Use apic_id_limit when calculating legacy ACPI table size (2016-11-18 
17:50:09 +0200)


virtio, vhost, pc: fixes

Most notably this fixes a regression with vhost introduced by the pull before
last.

Signed-off-by: Michael S. Tsirkin 


Eduardo Habkost (1):
  acpi: Use apic_id_limit when calculating legacy ACPI table size

Paolo Bonzini (3):
  virtio: introduce grab/release_ioeventfd to fix vhost
  virtio: access ISR atomically
  virtio: set ISR on dataplane notifications

Stefan Hajnoczi (1):
  virtio-crypto: fix virtio_queue_set_notification() race

Zhuang Yanying (1):
  ivshmem: Fix 64 bit memory bar configuration

ZhuangYanying (1):
  ipmi: fix qemu crash while migrating with ipmi

 include/hw/virtio/virtio-bus.h  | 14 +
 include/hw/virtio/virtio-scsi.h |  1 -
 include/hw/virtio/virtio.h  |  4 ++-
 hw/block/dataplane/virtio-blk.c |  4 +--
 hw/i386/acpi-build.c|  2 +-
 hw/ipmi/isa_ipmi_kcs.c  |  6 ++--
 hw/misc/ivshmem.c   |  4 ++-
 hw/scsi/virtio-scsi-dataplane.c |  7 -
 hw/scsi/virtio-scsi.c   |  2 +-
 hw/virtio/vhost.c   | 14 -
 hw/virtio/virtio-bus.c  | 58 +---
 hw/virtio/virtio-crypto.c   | 13 ++--
 hw/virtio/virtio-mmio.c |  6 ++--
 hw/virtio/virtio-pci.c  |  9 ++
 hw/virtio/virtio.c  | 66 +++--
 hw/virtio/trace-events  |  2 +-
 16 files changed, 154 insertions(+), 58 deletions(-)




[Qemu-devel] [PULL 2/7] virtio: introduce grab/release_ioeventfd to fix vhost

2016-11-18 Thread Michael S. Tsirkin
From: Paolo Bonzini 

Following the recent refactoring of virtio notifiers [1], more specifically
the patch ed08a2a0b ("virtio: use virtio_bus_set_host_notifier to
start/stop ioeventfd") that uses virtio_bus_set_host_notifier [2]
by default, core virtio code requires 'ioeventfd_started' to be set
to true/false when the host notifiers are configured.

When vhost is stopped and started, however, there is a stop followed by
another start. Since ioeventfd_started was never set to true, the 'stop'
operation triggered by virtio_bus_set_host_notifier() will not result
in a call to virtio_pci_ioeventfd_assign(assign=false). This leaves
the memory regions with stale notifiers and results in the next start
triggering the following assertion:

  kvm_mem_ioeventfd_add: error adding ioeventfd: File exists
  Aborted

This patch reintroduces (hopefully in a cleaner way) the concept
that was present with ioeventfd_disabled before the refactoring.
When ioeventfd_grabbed>0, ioeventfd_started tracks whether ioeventfd
should be enabled or not, but ioeventfd is actually not started at
all until vhost releases the host notifiers.

[1] http://lists.nongnu.org/archive/html/qemu-devel/2016-10/msg07748.html
[2] http://lists.nongnu.org/archive/html/qemu-devel/2016-10/msg07760.html

Reported-by: Felipe Franciosi 
Reported-by: Christian Borntraeger 
Reported-by: Alex Williamson 
Fixes: ed08a2a0b ("virtio: use virtio_bus_set_host_notifier to start/stop 
ioeventfd")
Reviewed-by: Cornelia Huck 
Reviewed-by: Stefan Hajnoczi 
Tested-by: Alexey Kardashevskiy 
Tested-by: Farhan Ali 
Tested-by: Alex Williamson 
Signed-off-by: Paolo Bonzini 
Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Michael S. Tsirkin 
---
 include/hw/virtio/virtio-bus.h | 14 ++
 include/hw/virtio/virtio.h |  2 ++
 hw/virtio/vhost.c  | 14 +-
 hw/virtio/virtio-bus.c | 58 ++
 hw/virtio/virtio.c | 16 
 5 files changed, 86 insertions(+), 18 deletions(-)

diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h
index fdf7fda..8a51e2c 100644
--- a/include/hw/virtio/virtio-bus.h
+++ b/include/hw/virtio/virtio-bus.h
@@ -97,6 +97,16 @@ struct VirtioBusState {
  * Set if ioeventfd has been started.
  */
 bool ioeventfd_started;
+
+/*
+ * Set if ioeventfd has been grabbed by vhost.  When ioeventfd
+ * is grabbed by vhost, we track its started/stopped state (which
+ * depends in turn on the virtio status register), but do not
+ * register a handler for the ioeventfd.  When ioeventfd is
+ * released, if ioeventfd_started is true we finally register
+ * the handler so that QEMU's device model can use ioeventfd.
+ */
+int ioeventfd_grabbed;
 };
 
 void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp);
@@ -131,6 +141,10 @@ bool virtio_bus_ioeventfd_enabled(VirtioBusState *bus);
 int virtio_bus_start_ioeventfd(VirtioBusState *bus);
 /* Stop the ioeventfd. */
 void virtio_bus_stop_ioeventfd(VirtioBusState *bus);
+/* Tell the bus that vhost is grabbing the ioeventfd. */
+int virtio_bus_grab_ioeventfd(VirtioBusState *bus);
+/* Tell the bus that vhost is not using the ioeventfd anymore. */
+void virtio_bus_release_ioeventfd(VirtioBusState *bus);
 /* Switch from/to the generic ioeventfd handler */
 int virtio_bus_set_host_notifier(VirtioBusState *bus, int n, bool assign);
 
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 5951997..835b085 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -272,6 +272,8 @@ void virtio_queue_set_guest_notifier_fd_handler(VirtQueue 
*vq, bool assign,
 bool with_irqfd);
 int virtio_device_start_ioeventfd(VirtIODevice *vdev);
 void virtio_device_stop_ioeventfd(VirtIODevice *vdev);
+int virtio_device_grab_ioeventfd(VirtIODevice *vdev);
+void virtio_device_release_ioeventfd(VirtIODevice *vdev);
 bool virtio_device_ioeventfd_enabled(VirtIODevice *vdev);
 EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq);
 void virtio_queue_host_notifier_read(EventNotifier *n);
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 30aee88..f7f7023 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1214,17 +1214,17 @@ void vhost_dev_cleanup(struct vhost_dev *hdev)
 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
 {
 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
-VirtioBusState *vbus = VIRTIO_BUS(qbus);
-VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
 int i, r, e;
 
-if (!k->ioeventfd_assign) {
+/* We will pass the notifiers to the kernel, make sure that QEMU
+ * doesn't interfere.
+ */
+r = virtio_device_grab_ioeventfd(vdev);
+if (r < 0) {
 error_report("binding does not support host notifiers");
-r = -ENOSYS;
 goto fail;
 }
 
-virtio_device_stop_ioeventfd(vdev);
 for (i = 0; i < hde

[Qemu-devel] [PATCH 5/5] sheepdog: reorganize check for overlapping requests

2016-11-18 Thread Paolo Bonzini
Wrap the code that was copied repeatedly in the two functions,
sd_aio_setup and sd_aio_complete.

Cc: Hitoshi Mitake 
Cc: Liu Yuan 
Signed-off-by: Paolo Bonzini 
---
 block/sheepdog.c | 66 ++--
 1 file changed, 30 insertions(+), 36 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 5aef382..4d1c031 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -479,6 +479,19 @@ static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, 
SheepdogAIOCB *acb,
 return aio_req;
 }
 
+static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB 
*acb)
+{
+SheepdogAIOCB *cb;
+
+retry:
+QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
+if (AIOCBOverlapping(acb, cb)) {
+qemu_co_queue_wait(&s->overlapping_queue);
+goto retry;
+}
+}
+}
+
 static void sd_aio_setup(SheepdogAIOCB *acb, BDRVSheepdogState *s,
  QEMUIOVector *qiov, int64_t sector_num, int 
nb_sectors,
  int type)
@@ -505,6 +518,13 @@ static void sd_aio_setup(SheepdogAIOCB *acb, 
BDRVSheepdogState *s,
 acb->min_dirty_data_idx = UINT32_MAX;
 acb->max_dirty_data_idx = 0;
 acb->aiocb_type = type;
+
+if (type == AIOCB_FLUSH_CACHE) {
+return;
+}
+
+wait_for_overlapping_aiocb(s, acb);
+QLIST_INSERT_HEAD(&s->inflight_aiocb_head, acb, aiocb_siblings);
 }
 
 /* Return -EIO in case of error, file descriptor on success */
@@ -2187,18 +2207,14 @@ static void coroutine_fn sd_co_rw_vector(SheepdogAIOCB 
*acb)
 }
 }
 
-static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
+static void sd_aio_complete(SheepdogAIOCB *acb)
 {
-SheepdogAIOCB *cb;
-
-QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
-if (AIOCBOverlapping(aiocb, cb)) {
-return true;
-}
+if (acb->aiocb_type == AIOCB_FLUSH_CACHE) {
+return;
 }
 
-QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings);
-return false;
+QLIST_REMOVE(acb, aiocb_siblings);
+qemu_co_queue_restart_all(&acb->s->overlapping_queue);
 }
 
 static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
@@ -2217,18 +2233,10 @@ static coroutine_fn int sd_co_writev(BlockDriverState 
*bs, int64_t sector_num,
 }
 
 sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_WRITE_UDATA);
-
-retry:
-if (check_overlapping_aiocb(s, &acb)) {
-qemu_co_queue_wait(&s->overlapping_queue);
-goto retry;
-}
-
 sd_co_rw_vector(&acb);
 sd_write_done(&acb);
+sd_aio_complete(&acb);
 
-QLIST_REMOVE(&acb, aiocb_siblings);
-qemu_co_queue_restart_all(&s->overlapping_queue);
 return acb.ret;
 }
 
@@ -2239,17 +2247,9 @@ static coroutine_fn int sd_co_readv(BlockDriverState 
*bs, int64_t sector_num,
 BDRVSheepdogState *s = bs->opaque;
 
 sd_aio_setup(&acb, s, qiov, sector_num, nb_sectors, AIOCB_READ_UDATA);
-
-retry:
-if (check_overlapping_aiocb(s, &acb)) {
-qemu_co_queue_wait(&s->overlapping_queue);
-goto retry;
-}
-
 sd_co_rw_vector(&acb);
+sd_aio_complete(&acb);
 
-QLIST_REMOVE(&acb, aiocb_siblings);
-qemu_co_queue_restart_all(&s->overlapping_queue);
 return acb.ret;
 }
 
@@ -2273,6 +2273,8 @@ static int coroutine_fn 
sd_co_flush_to_disk(BlockDriverState *bs)
 if (--acb.nr_pending) {
 qemu_coroutine_yield();
 }
+
+sd_aio_complete(&acb);
 return acb.ret;
 }
 
@@ -2727,17 +2729,9 @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState 
*bs, int64_t offset,
 assert((count & (BDRV_SECTOR_SIZE - 1)) == 0);
 sd_aio_setup(&acb, s, &discard_iov, offset >> BDRV_SECTOR_BITS,
  count >> BDRV_SECTOR_BITS, AIOCB_DISCARD_OBJ);
-
-retry:
-if (check_overlapping_aiocb(s, &acb)) {
-qemu_co_queue_wait(&s->overlapping_queue);
-goto retry;
-}
-
 sd_co_rw_vector(&acb);
+sd_aio_complete(&acb);
 
-QLIST_REMOVE(&acb, aiocb_siblings);
-qemu_co_queue_restart_all(&s->overlapping_queue);
 return acb.ret;
 }
 
-- 
2.9.3




[Qemu-devel] [PULL 3/7] virtio: access ISR atomically

2016-11-18 Thread Michael S. Tsirkin
From: Paolo Bonzini 

This will be needed once dataplane will be able to set it outside
the big QEMU lock.

Reviewed-by: Stefan Hajnoczi 
Tested-by: Farhan Ali 
Tested-by: Alex Williamson 
Signed-off-by: Paolo Bonzini 
Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Michael S. Tsirkin 
---
 hw/virtio/virtio-mmio.c |  6 +++---
 hw/virtio/virtio-pci.c  |  9 +++--
 hw/virtio/virtio.c  | 22 +-
 3 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c
index a30270f..17412cb 100644
--- a/hw/virtio/virtio-mmio.c
+++ b/hw/virtio/virtio-mmio.c
@@ -191,7 +191,7 @@ static uint64_t virtio_mmio_read(void *opaque, hwaddr 
offset, unsigned size)
 return virtio_queue_get_addr(vdev, vdev->queue_sel)
 >> proxy->guest_page_shift;
 case VIRTIO_MMIO_INTERRUPTSTATUS:
-return vdev->isr;
+return atomic_read(&vdev->isr);
 case VIRTIO_MMIO_STATUS:
 return vdev->status;
 case VIRTIO_MMIO_HOSTFEATURESSEL:
@@ -299,7 +299,7 @@ static void virtio_mmio_write(void *opaque, hwaddr offset, 
uint64_t value,
 }
 break;
 case VIRTIO_MMIO_INTERRUPTACK:
-vdev->isr &= ~value;
+atomic_and(&vdev->isr, ~value);
 virtio_update_irq(vdev);
 break;
 case VIRTIO_MMIO_STATUS:
@@ -347,7 +347,7 @@ static void virtio_mmio_update_irq(DeviceState *opaque, 
uint16_t vector)
 if (!vdev) {
 return;
 }
-level = (vdev->isr != 0);
+level = (atomic_read(&vdev->isr) != 0);
 DPRINTF("virtio_mmio setting IRQ %d\n", level);
 qemu_set_irq(proxy->irq, level);
 }
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 97b32fe..521ba0b 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -73,7 +73,7 @@ static void virtio_pci_notify(DeviceState *d, uint16_t vector)
 msix_notify(&proxy->pci_dev, vector);
 else {
 VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
-pci_set_irq(&proxy->pci_dev, vdev->isr & 1);
+pci_set_irq(&proxy->pci_dev, atomic_read(&vdev->isr) & 1);
 }
 }
 
@@ -449,8 +449,7 @@ static uint32_t virtio_ioport_read(VirtIOPCIProxy *proxy, 
uint32_t addr)
 break;
 case VIRTIO_PCI_ISR:
 /* reading from the ISR also clears it. */
-ret = vdev->isr;
-vdev->isr = 0;
+ret = atomic_xchg(&vdev->isr, 0);
 pci_irq_deassert(&proxy->pci_dev);
 break;
 case VIRTIO_MSI_CONFIG_VECTOR:
@@ -1379,9 +1378,7 @@ static uint64_t virtio_pci_isr_read(void *opaque, hwaddr 
addr,
 {
 VirtIOPCIProxy *proxy = opaque;
 VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
-uint64_t val = vdev->isr;
-
-vdev->isr = 0;
+uint64_t val = atomic_xchg(&vdev->isr, 0);
 pci_irq_deassert(&proxy->pci_dev);
 
 return val;
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index b7d5828..138a414 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -945,7 +945,7 @@ void virtio_reset(void *opaque)
 vdev->guest_features = 0;
 vdev->queue_sel = 0;
 vdev->status = 0;
-vdev->isr = 0;
+atomic_set(&vdev->isr, 0);
 vdev->config_vector = VIRTIO_NO_VECTOR;
 virtio_notify_vector(vdev, vdev->config_vector);
 
@@ -1318,10 +1318,22 @@ void virtio_del_queue(VirtIODevice *vdev, int n)
 vdev->vq[n].vring.num_default = 0;
 }
 
+static void virtio_set_isr(VirtIODevice *vdev, int value)
+{
+uint8_t old = atomic_read(&vdev->isr);
+
+/* Do not write ISR if it does not change, so that its cacheline remains
+ * shared in the common case where the guest does not read it.
+ */
+if ((old & value) != value) {
+atomic_or(&vdev->isr, value);
+}
+}
+
 void virtio_irq(VirtQueue *vq)
 {
 trace_virtio_irq(vq);
-vq->vdev->isr |= 0x01;
+virtio_set_isr(vq->vdev, 0x1);
 virtio_notify_vector(vq->vdev, vq->vector);
 }
 
@@ -1355,7 +1367,7 @@ void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
 }
 
 trace_virtio_notify(vdev, vq);
-vdev->isr |= 0x01;
+virtio_set_isr(vq->vdev, 0x1);
 virtio_notify_vector(vdev, vq->vector);
 }
 
@@ -1364,7 +1376,7 @@ void virtio_notify_config(VirtIODevice *vdev)
 if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
 return;
 
-vdev->isr |= 0x03;
+virtio_set_isr(vdev, 0x3);
 vdev->generation++;
 virtio_notify_vector(vdev, vdev->config_vector);
 }
@@ -1895,7 +1907,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
 
 vdev->device_id = device_id;
 vdev->status = 0;
-vdev->isr = 0;
+atomic_set(&vdev->isr, 0);
 vdev->queue_sel = 0;
 vdev->config_vector = VIRTIO_NO_VECTOR;
 vdev->vq = g_malloc0(sizeof(VirtQueue) * VIRTIO_QUEUE_MAX);
-- 
MST




Re: [Qemu-devel] [PATCH] hw/pci: disable pci-bridge's shpc by default

2016-11-18 Thread Andrew Jones
On Wed, Nov 16, 2016 at 07:05:25PM +0200, Marcel Apfelbaum wrote:
> On 11/16/2016 06:44 PM, Andrew Jones wrote:
> > On Sat, Nov 05, 2016 at 06:46:34PM +0200, Marcel Apfelbaum wrote:
> > > On 11/03/2016 09:40 PM, Michael S. Tsirkin wrote:
> > > > On Thu, Nov 03, 2016 at 01:05:44PM +0200, Marcel Apfelbaum wrote:
> > > > > On 11/03/2016 06:18 AM, Michael S. Tsirkin wrote:
> > > > > > On Wed, Nov 02, 2016 at 05:16:42PM +0200, Marcel Apfelbaum wrote:
> > > > > > > The shpc component is optional while  ACPI hotplug is used
> > > > > > > for hot-plugging PCI devices into a PCI-PCI bridge.
> > > > > > > Disabling the shpc by default will make slot 0 usable at boot time
> > > > > 
> > > > > Hi Michael
> > > > > 
> > > > > > 
> > > > > > at the cost of breaking all hotplug for all non-acpi users.
> > > > > > 
> > > > > 
> > > > > Do we have a non-acpi user that is able to use the shpc component 
> > > > > as-is today?
> > > > 
> > > > power and some arm systems I guess?
> > > > 
> > > 
> > > Adding Andrew , maybe he can give us an answer.
> > 
> > Not really :-) My lack of PCI knowledge makes that difficult. I'd be happy
> > to help with an experiment though. Can you give me command line arguments,
> > qmp commands, etc. that I should use to try it out? I imagine I should
> > just boot an ARM guest using DT (instead of ACPI) and then attempt to
> > hotplug a PCI device. I'm not sure, however, what, if any, special
> > configuration I need in order to ensure I'm testing what you're
> > interested in.
> > 
> 
> Hi Drew,
> 
> 
> Just run QEMU with '-device pci-bridge,chassis_nr=1,id=bridge1 -monitor stdio'
> with an ARM guest using DT and wait until the guest finishes booting.
> 
> Then run at hmp:
> device_add virtio-net-pci,bus=bridge1,id=net2
> 
> Next run lspci in the guest to see the new device.

Thanks for the instructions Marcel. Here are the results

 $QEMU -machine virt,accel=$ACCEL -cpu $CPU -nographic -m 4096 -smp 8 \
   -bios /usr/share/AAVMF/AAVMF_CODE.fd \
   -device pci-bridge,chassis_nr=1,id=bridge1 \
   -drive file=$FEDORA_IMG,if=none,id=dr0,format=qcow2 \
   -device virtio-blk-pci,bus=bridge1,addr=01,drive=dr0,id=disk0 \
   -netdev user,id=hostnet0 \
   -device virtio-net-pci,bus=bridge1,addr=02,netdev=hostnet0,id=net0

 # lspci
 00:00.0 Host bridge: Red Hat, Inc. Device 0008
 00:01.0 PCI bridge: Red Hat, Inc. QEMU PCI-PCI bridge
 01:01.0 SCSI storage controller: Red Hat, Inc Virtio block device
 01:02.0 Ethernet controller: Red Hat, Inc Virtio network device

 (qemu) device_add virtio-net-pci,bus=bridge1,id=net2
 Unsupported PCI slot 0 for standard hotplug controller. Valid slots are
 between 1 and 31.

(Tried again giving addr=03)

 (qemu) device_add virtio-net-pci,bus=bridge1,id=net2,addr=03

(Seemed to work, but...)

 # lspci
 00:00.0 Host bridge: Red Hat, Inc. Device 0008
 00:01.0 PCI bridge: Red Hat, Inc. QEMU PCI-PCI bridge
 01:01.0 SCSI storage controller: Red Hat, Inc Virtio block device
 01:02.0 Ethernet controller: Red Hat, Inc Virtio network device

(Doesn't show up in lspci. So I guess it doesn't work)

> 
> 
> BTW, will an ARM guest run 'fast' enough to be usable on a x86 machine?
> If yes, any pointers on how to create such a guest?

You can run AArch64 guests on x86 machines. It's not super fast though...
Certainly I wouldn't want to create my guest image using TCG. So, assuming
you acquire an image somewhere (or create it on a real machine), then you
can use the above command line, just change 

ACCEL=kvm CPU=host to ACCEL=tcg CPU=cortex-a57

Thanks,
drew



Re: [Qemu-devel] [PULL 0/1] Tracing patches

2016-11-18 Thread Stefan Hajnoczi
On Fri, Nov 18, 2016 at 03:01:28PM +, Stefan Hajnoczi wrote:
> The following changes since commit b0bcc86d2a87456f5a276f941dc775b265b309cf:
> 
>   Update version for v2.8.0-rc0 release (2016-11-15 20:55:12 +)
> 
> are available in the git repository at:
> 
>   git://github.com/stefanha/qemu.git tags/tracing-pull-request
> 
> for you to fetch changes up to d4f7ca59017835784c6872dfab0e269d9b41b05a:
> 
>   trace: fix generated code build break (2016-11-18 11:09:58 +)
> 
> 
> 
> 
> 
> Greg Kurz (1):
>   trace: fix generated code build break
> 
>  scripts/tracetool.py | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> -- 
> 2.7.4
> 
> 

Thanks, applied to my staging tree:
https://github.com/stefanha/qemu/commits/staging

Stefan


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH v14 00/22] Add Mediated device support

2016-11-18 Thread Alex Williamson
On Fri, 18 Nov 2016 17:16:32 +0800
Zhenyu Wang  wrote:

> On 2016.11.17 16:51:45 -0700, Alex Williamson wrote:
> > On Thu, 17 Nov 2016 23:29:38 +
> > "Tian, Kevin"  wrote:
> >   
> > > > From: Alex Williamson [mailto:alex.william...@redhat.com]
> > > > Sent: Friday, November 18, 2016 5:25 AM
> > > > 
> > > > On Thu, 17 Nov 2016 02:16:12 +0530
> > > > Kirti Wankhede  wrote:
> > > > >
> > > > >  Documentation/ABI/testing/sysfs-bus-vfio-mdev |  111 ++
> > > > >  Documentation/vfio-mediated-device.txt|  399 +++
> > > > >  MAINTAINERS   |9 +
> > > > >  drivers/vfio/Kconfig  |1 +
> > > > >  drivers/vfio/Makefile |1 +
> > > > >  drivers/vfio/mdev/Kconfig |   17 +
> > > > >  drivers/vfio/mdev/Makefile|5 +
> > > > >  drivers/vfio/mdev/mdev_core.c |  385 +++
> > > > >  drivers/vfio/mdev/mdev_driver.c   |  119 ++
> > > > >  drivers/vfio/mdev/mdev_private.h  |   41 +
> > > > >  drivers/vfio/mdev/mdev_sysfs.c|  286 +
> > > > >  drivers/vfio/mdev/vfio_mdev.c |  180 +++
> > > > >  drivers/vfio/pci/vfio_pci.c   |   83 +-
> > > > >  drivers/vfio/platform/vfio_platform_common.c  |   31 +-
> > > > >  drivers/vfio/vfio.c   |  340 +-
> > > > >  drivers/vfio/vfio_iommu_type1.c   |  872 +++---
> > > > >  include/linux/mdev.h  |  177 +++
> > > > >  include/linux/vfio.h  |   32 +-
> > > > >  include/uapi/linux/vfio.h |   10 +
> > > > >  samples/vfio-mdev/Makefile|   13 +
> > > > >  samples/vfio-mdev/mtty.c  | 1503
> > > > +
> > > > >  21 files changed, 4358 insertions(+), 257 deletions(-)
> > > > >  create mode 100644 Documentation/ABI/testing/sysfs-bus-vfio-mdev
> > > > >  create mode 100644 Documentation/vfio-mediated-device.txt
> > > > >  create mode 100644 drivers/vfio/mdev/Kconfig
> > > > >  create mode 100644 drivers/vfio/mdev/Makefile
> > > > >  create mode 100644 drivers/vfio/mdev/mdev_core.c
> > > > >  create mode 100644 drivers/vfio/mdev/mdev_driver.c
> > > > >  create mode 100644 drivers/vfio/mdev/mdev_private.h
> > > > >  create mode 100644 drivers/vfio/mdev/mdev_sysfs.c
> > > > >  create mode 100644 drivers/vfio/mdev/vfio_mdev.c
> > > > >  create mode 100644 include/linux/mdev.h
> > > > >  create mode 100644 samples/vfio-mdev/Makefile
> > > > >  create mode 100644 samples/vfio-mdev/mtty.c
> > > > 
> > > > As discussed, I dropped patch 12, updated the documentation, and added
> > > > 'retries' initialization.  This is now applied to my next branch for
> > > > v4.10.  Thanks to the reviewers and Kirti and Neo for your hard work!
> > > > Thanks,
> > > > 
> > > 
> > > That's great news! Alex, do you have an idea when this series may
> > > hit linux-next? :-)  
> > 
> > Whenever there's a new build, hopefully within the next 24hrs, but I
> > don't really know the schedule.  Thanks,
> >   
> 
> Alex, could you do a pull request of mdev for Daniel's drm-intel tree?
> We need to send the KVMGT mdev support pull based on that.

No, this is not how I intend or prefer to merge this.  This is a large
change for vfio and it is not exclusive to KVMGT.  We have linux-next
to facilitate handling dependencies between subsystems during
development and a two week merge window to allow managing how these
changes enter the mainline tree.  If I were to have this pulled into
drm-intel it ties my hands as to how I can manage changes within my
functional area.  I want these two weeks of linux-next exposure for
vetting the changes and resolving any remaining issues.  I'm not going
to compromise my ability to react to such issues.  linux-next inclusion
should be sufficient for you to coordinate through the drm tree, though
Daniel will need to be made aware of the dependency.  I will however
plan to send my pull request to Linus early in the merge window to
accommodate dependent changes also being included for v4.10. Hope
you understand, thanks,

Alex



Re: [Qemu-devel] [PATCH] xen_disk: convert discard input to byte ranges

2016-11-18 Thread Kevin Wolf
Am 18.11.2016 um 15:35 hat Eric Blake geschrieben:
> On 11/18/2016 08:19 AM, Olaf Hering wrote:
> > Am 18. November 2016 14:43:18 MEZ, schrieb Eric Blake :
> >> On 11/18/2016 04:24 AM, Olaf Hering wrote:
> >>> The guest sends discard requests as u64 sector/count pairs, but the
> >>> block layer operates internally with s64/s32 pairs. The conversion
> >>> leads to IO errors in the guest, the discard request is not
> >> processed.
> >>
> >> Doesn't the block layer already split discard requests into 2^31 byte
> >> chunks?
> > 
> > How would it do that without valid input?  It was wrong before the sectors 
> > to bytes conversion, and now it's even worse given that all the world fits 
> > into an int.
> 
> Then it sounds like the real bug is that the block layer
> bdrv_co_pdiscard() is buggy for taking 'int count' instead of 'uint64_t
> count'.  Eventually, I think the entire block layer should be fixed to
> allow 64-bit count everywhere, and then auto-fragment it back down to 31
> bits (or even smaller, like NBD's 32M limit or Linux loopback device 64k
> limit) as needed, rather than making all the backends reimplement
> fragmentation.
> 
> > 
> > Remember that there is no API to let the guest know about the limitations 
> > of the host. 
> 
> Correct. But the goal of the block layer is to hide the quirks, so that
> the code handling the guest requests can offload all the common work to
> one place.
> 
> Kevin, is it too late for 2.8 for patches that change bdrv_co_pdiscard
> to take a 64-bit count?  Or would that still be under bug-fix category
> because of the xen use case?

Given that we're already a few weeks into the freeze, I would very much
prefer an isolated patch for xen_disk for 2.8, and then it can be
cleaned up during the 2.9 cycle.

Kevin


pgpHlfi_sYFpP.pgp
Description: PGP signature


Re: [Qemu-devel] [PATCH v3] ivshmem: Fix 64 bit memory bar configuration

2016-11-18 Thread Michael S. Tsirkin
On Fri, Nov 18, 2016 at 05:27:41PM +0200, Michael S. Tsirkin wrote:
> On Thu, Nov 17, 2016 at 08:31:03PM +0800, Zhuangyanying wrote:
> > From: Zhuang Yanying 
> > 
> > Device ivshmem property use64=0 is designed to make the device
> > expose a 32 bit shared memory BAR instead of 64 bit one.  The
> > default is a 64 bit BAR, except pc-1.2 and older retain a 32 bit
> > BAR.  A 32 bit BAR can support only up to 1 GiB of shared memory.
> > 
> > This worked as designed until commit 5400c02 accidentally flipped
> > its sense: since then, we misinterpret use64=0 as use64=1 and vice
> > versa.  Worse, the default got flipped as well.  Devices
> > ivshmem-plain and ivshmem-doorbell are not affected.
> > 
> > Fix by restoring the test of IVShmemState member not_legacy_32bit
> > that got messed up in commit 5400c02.  Also update its
> > > initialization for devices ivshmem-plain and ivshmem-doorbell.
> > Without that, they'd regress to 32 bit BARs.
> > 
> > Cc: qemu-sta...@nongnu.org
> > Signed-off-by: Zhuang Yanying 
> > Reviewed-by: Gonglei 
> > Reviewed-by: Marc-André Lureau 
> 
> This is UTF-8 encoded, but your mail header says
> Content-Transfer-Encoding: 8bit
> so git am fails to apply this.

In fact this is the problem:
Content-Type: text/plain; charset="n"

This is not a valid charset. I fixed it to
Content-Type: text/plain; charset="utf-8"

and it applies.

Pls take care in the future.

> 
> 
> > ---
> >  hw/misc/ivshmem.c | 4 +++-
> >  1 file changed, 3 insertions(+), 1 deletion(-)
> > 
> > diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
> > index 230e51b..abeaf3d 100644
> > --- a/hw/misc/ivshmem.c
> > +++ b/hw/misc/ivshmem.c
> > @@ -858,7 +858,7 @@ static void ivshmem_common_realize(PCIDevice *dev, 
> > Error **errp)
> >  pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
> >   &s->ivshmem_mmio);
> >  
> > -if (!s->not_legacy_32bit) {
> > +if (s->not_legacy_32bit) {
> >  attr |= PCI_BASE_ADDRESS_MEM_TYPE_64;
> >  }
> >  
> > @@ -1045,6 +1045,7 @@ static void ivshmem_plain_init(Object *obj)
> >   ivshmem_check_memdev_is_busy,
> >   OBJ_PROP_LINK_UNREF_ON_RELEASE,
> >   &error_abort);
> > +s->not_legacy_32bit = 1;
> >  }
> >  
> >  static void ivshmem_plain_realize(PCIDevice *dev, Error **errp)
> > @@ -1116,6 +1117,7 @@ static void ivshmem_doorbell_init(Object *obj)
> >  
> >  s->features |= (1 << IVSHMEM_MSI);
> >  s->legacy_size = SIZE_MAX;  /* whatever the server sends */
> > +s->not_legacy_32bit = 1;
> >  }
> >  
> >  static void ivshmem_doorbell_realize(PCIDevice *dev, Error **errp)
> > -- 
> > 1.8.3.1
> > 



Re: [Qemu-devel] [PATCH v3] ivshmem: Fix 64 bit memory bar configuration

2016-11-18 Thread Michael S. Tsirkin
On Thu, Nov 17, 2016 at 08:31:03PM +0800, Zhuangyanying wrote:
> From: Zhuang Yanying 
> 
> Device ivshmem property use64=0 is designed to make the device
> expose a 32 bit shared memory BAR instead of 64 bit one.  The
> default is a 64 bit BAR, except pc-1.2 and older retain a 32 bit
> BAR.  A 32 bit BAR can support only up to 1 GiB of shared memory.
> 
> This worked as designed until commit 5400c02 accidentally flipped
> its sense: since then, we misinterpret use64=0 as use64=1 and vice
> versa.  Worse, the default got flipped as well.  Devices
> ivshmem-plain and ivshmem-doorbell are not affected.
> 
> Fix by restoring the test of IVShmemState member not_legacy_32bit
> that got messed up in commit 5400c02.  Also update its
> initialization for devices ivshmem-plain and ivshmem-doorbell.
> Without that, they'd regress to 32 bit BARs.
> 
> Cc: qemu-sta...@nongnu.org
> Signed-off-by: Zhuang Yanying 
> Reviewed-by: Gonglei 
> Reviewed-by: Marc-André Lureau 

This is UTF-8 encoded, but your mail header says
Content-Transfer-Encoding: 8bit
so git am fails to apply this.



> ---
>  hw/misc/ivshmem.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
> index 230e51b..abeaf3d 100644
> --- a/hw/misc/ivshmem.c
> +++ b/hw/misc/ivshmem.c
> @@ -858,7 +858,7 @@ static void ivshmem_common_realize(PCIDevice *dev, Error 
> **errp)
>  pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
>   &s->ivshmem_mmio);
>  
> -if (!s->not_legacy_32bit) {
> +if (s->not_legacy_32bit) {
>  attr |= PCI_BASE_ADDRESS_MEM_TYPE_64;
>  }
>  
> @@ -1045,6 +1045,7 @@ static void ivshmem_plain_init(Object *obj)
>   ivshmem_check_memdev_is_busy,
>   OBJ_PROP_LINK_UNREF_ON_RELEASE,
>   &error_abort);
> +s->not_legacy_32bit = 1;
>  }
>  
>  static void ivshmem_plain_realize(PCIDevice *dev, Error **errp)
> @@ -1116,6 +1117,7 @@ static void ivshmem_doorbell_init(Object *obj)
>  
>  s->features |= (1 << IVSHMEM_MSI);
>  s->legacy_size = SIZE_MAX;  /* whatever the server sends */
> +s->not_legacy_32bit = 1;
>  }
>  
>  static void ivshmem_doorbell_realize(PCIDevice *dev, Error **errp)
> -- 
> 1.8.3.1
> 



Re: [Qemu-devel] [PATCH 0/3] virtio: disable notifications in blk and scsi

2016-11-18 Thread Stefan Hajnoczi
On Fri, Nov 18, 2016 at 04:21:33PM +0200, Michael S. Tsirkin wrote:
> On Fri, Nov 18, 2016 at 10:58:47AM +, Stefan Hajnoczi wrote:
> > On Thu, Nov 17, 2016 at 07:38:45PM +0200, Michael S. Tsirkin wrote:
> > > On Thu, Nov 17, 2016 at 01:27:49PM +, Stefan Hajnoczi wrote:
> > > > On Thu, Nov 17, 2016 at 12:17:57AM +0200, Michael S. Tsirkin wrote:
> > > > > On Wed, Nov 16, 2016 at 09:53:06PM +, Stefan Hajnoczi wrote:
> > > > > > Disabling notifications during virtqueue processing reduces the 
> > > > > > number of
> > > > > > exits.  The virtio-net device already uses 
> > > > > > virtio_queue_set_notifications() but
> > > > > > virtio-blk and virtio-scsi do not.
> > > > > > 
> > > > > > The following benchmark shows a 15% reduction in virtio-blk-pci 
> > > > > > MMIO exits:
> > > > > > 
> > > > > >   (host)$ qemu-system-x86_64 \
> > > > > >   -enable-kvm -m 1024 -cpu host \
> > > > > >   -drive if=virtio,id=drive0,file=f24.img,format=raw,\
> > > > > >  cache=none,aio=native
> > > > > >   (guest)$ fio # jobs=4, iodepth=8, direct=1, randread
> > > > > >   (host)$ sudo perf record -a -e kvm:kvm_fast_mmio
> > > > > > 
> > > > > > Number of kvm_fast_mmio events:
> > > > > > Unpatched: 685k
> > > > > > Patched: 592k (-15%, lower is better)
> > > > > 
> > > > > Any chance to see a gain in actual benchmark numbers?
> > > > > This is important to make sure we are not just
> > > > > shifting overhead around.
> > > > 
> > > > Good idea.  I reran this morning without any tracing and compared
> > > > against bare metal.
> > > > 
> > > > Total reads for a 30-second 4 KB random read benchmark with 4 processes
> > > > x iodepth=8:
> > > > 
> > > > Bare metal: 26440 MB
> > > > Unpatched:  19799 MB
> > > > Patched:21252 MB
> > > > 
> > > > Patched vs Unpatched: +7% improvement
> > > > Patched vs Bare metal: 20% virtualization overhead
> > > > 
> > > > The disk image is a 8 GB raw file on XFS on LVM on dm-crypt on a Samsung
> > > > MZNLN256HCHP 256 GB SATA SSD.  This is just my laptop.
> > > > 
> > > > Seems like a worthwhile improvement to me.
> > > > 
> > > > Stefan
> > > 
> > > Sure. Pls remember to ping or re-post after the release.
> > 
> > How about a -next tree?
> 
> -next would make sense if we did Linus style short merge
> cycles followed by a long stabilization period.
> 
> With current QEMU style -next seems counter-productive, we do freezes in
> particular so people focus on stabilization, with -next everyone except
> maintainers just keeps going as usual, and maintainers must handle
> double the load.
> 
> > I've found that useful for block, net, and tracing in the past.  Most of
> > the time it means patch authors can rest assured their patches will be
> > merged without further action.  It allows development of features that
> > depend on out-of-tree patches.
> > 
> > Stefan
> 
> Less work for authors, more work for me ... I'd rather distribute the load.

Okay.

Stefan


signature.asc
Description: PGP signature


[Qemu-devel] [Bug 1626972] Re: QEMU memfd_create fallback mechanism change for security drivers

2016-11-18 Thread Billy Olsen
** Also affects: cloud-archive
   Importance: Undecided
   Status: New

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1626972

Title:
  QEMU memfd_create fallback mechanism change for security drivers

Status in Ubuntu Cloud Archive:
  In Progress
Status in QEMU:
  In Progress
Status in qemu package in Ubuntu:
  In Progress
Status in qemu source package in Xenial:
  In Progress
Status in qemu source package in Yakkety:
  In Progress
Status in qemu source package in Zesty:
  In Progress

Bug description:
  And, when libvirt starts using apparmor, and creating apparmor
  profiles for every virtual machine created in the compute nodes,
  mitaka qemu (2.5 - and upstream also) uses a fallback mechanism for
  creating shared memory for live-migrations. This fallback mechanism,
  on 3.13 kernels - which don't have the memfd_create() system call -
  tries to create files in the /tmp/ directory and fails, causing
  live-migration not to work.

  Trusty with kernel 3.13 + Mitaka with qemu 2.5 + apparmor capability =
  can't live migrate.

  From qemu 2.5, logic is on :

  void *qemu_memfd_alloc(const char *name, size_t size, unsigned int seals, int 
*fd)
  {
  if (memfd_create)... ### only works with HWE kernels

  else ### 3.13 kernels, gets blocked by apparmor
 tmpdir = g_get_tmp_dir
 ...
 mfd = mkstemp(fname)
  }

  And you can see the errors:

  From the host trying to send the virtual machine:

  2016-08-15 16:36:26.160 1974 ERROR nova.virt.libvirt.driver 
[req-0cac612b-8d53-4610-b773-d07ad6bacb91 691a581cfa7046278380ce82b1c38ddd 
133ebc3585c041aebaead8c062cd6511 - - -] [instance: 
2afa1131-bc8c-43d2-9c4a-962c1bf7723e] Migration operation has aborted
  2016-08-15 16:36:26.248 1974 ERROR nova.virt.libvirt.driver 
[req-0cac612b-8d53-4610-b773-d07ad6bacb91 691a581cfa7046278380ce82b1c38ddd 
133ebc3585c041aebaead8c062cd6511 - - -] [instance: 
2afa1131-bc8c-43d2-9c4a-962c1bf7723e] Live Migration failure: internal error: 
unable to execute QEMU command 'migrate': Migration disabled: failed to 
allocate shared memory

  From the host trying to receive the virtual machine:

  Aug 15 16:36:19 tkcompute01 kernel: [ 1194.356794] type=1400 
audit(1471289779.791:72): apparmor="STATUS" operation="profile_load" 
profile="unconfined" name="libvirt-2afa1131-bc8c-43d2-9c4a-962c1bf7723e" 
pid=12565 comm="apparmor_parser"
  Aug 15 16:36:19 tkcompute01 kernel: [ 1194.357048] type=1400 
audit(1471289779.791:73): apparmor="STATUS" operation="profile_load" 
profile="unconfined" name="qemu_bridge_helper" pid=12565 comm="apparmor_parser"
  Aug 15 16:36:20 tkcompute01 kernel: [ 1194.877027] type=1400 
audit(1471289780.311:74): apparmor="STATUS" operation="profile_replace" 
profile="unconfined" name="libvirt-2afa1131-bc8c-43d2-9c4a-962c1bf7723e" 
pid=12613 comm="apparmor_parser"
  Aug 15 16:36:20 tkcompute01 kernel: [ 1194.904407] type=1400 
audit(1471289780.343:75): apparmor="STATUS" operation="profile_replace" 
profile="unconfined" name="qemu_bridge_helper" pid=12613 comm="apparmor_parser"
  Aug 15 16:36:20 tkcompute01 kernel: [ 1194.973064] type=1400 
audit(1471289780.407:76): apparmor="DENIED" operation="mknod" 
profile="libvirt-2afa1131-bc8c-43d2-9c4a-962c1bf7723e" name="/tmp/memfd-tNpKSj" 
pid=12625 comm="qemu-system-x86" requested_mask="c" denied_mask="c" fsuid=107 
ouid=107
  Aug 15 16:36:20 tkcompute01 kernel: [ 1194.979871] type=1400 
audit(1471289780.411:77): apparmor="DENIED" operation="open" 
profile="libvirt-2afa1131-bc8c-43d2-9c4a-962c1bf7723e" name="/tmp/" pid=12625 
comm="qemu-system-x86" requested_mask="r" denied_mask="r" fsuid=107 ouid=0
  Aug 15 16:36:20 tkcompute01 kernel: [ 1194.979881] type=1400 
audit(1471289780.411:78): apparmor="DENIED" operation="open" 
profile="libvirt-2afa1131-bc8c-43d2-9c4a-962c1bf7723e" name="/var/tmp/" 
pid=12625 comm="qemu-system-x86" requested_mask="r" denied_mask="r" fsuid=107 
ouid=0

  When leaving libvirt without apparmor capabilities (thus not confining
  virtual machines on compute nodes), the live migration works as
  expected, so, clearly, apparmor is stepping into the live migration.
  I'm sure that virtual machines have to be confined and that this isn't
  the desired behaviour...

To manage notifications about this bug go to:
https://bugs.launchpad.net/cloud-archive/+bug/1626972/+subscriptions



Re: [Qemu-devel] [kvm-unit-tests PATCH 0/4] kvm-unit-tests: add first GIC MMIO tests

2016-11-18 Thread Itaru Kitayama

Hi Andre,

I've verified that the tests you proposed to the list finish without
issue on the kvm-arm-for-4.9-rc6 kernel.

Itaru



[Qemu-devel] [PULL 0/1] Tracing patches

2016-11-18 Thread Stefan Hajnoczi
The following changes since commit b0bcc86d2a87456f5a276f941dc775b265b309cf:

  Update version for v2.8.0-rc0 release (2016-11-15 20:55:12 +)

are available in the git repository at:

  git://github.com/stefanha/qemu.git tags/tracing-pull-request

for you to fetch changes up to d4f7ca59017835784c6872dfab0e269d9b41b05a:

  trace: fix generated code build break (2016-11-18 11:09:58 +)





Greg Kurz (1):
  trace: fix generated code build break

 scripts/tracetool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

-- 
2.7.4




Re: [Qemu-devel] [RFCv2 01/12] pseries: Always use core objects for CPU construction

2016-11-18 Thread Greg Kurz
On Wed, 16 Nov 2016 09:17:44 +1100
David Gibson  wrote:

> Currently the pseries machine has two paths for constructing CPUs.  On
> newer machine type versions, which support cpu hotplug, it constructs
> cpu core objects, which in turn construct CPU threads.  For older machine
> versions it individually constructs the CPU threads.
> 
> This division is going to make some future changes to the cpu construction
> harder, so this patch unifies them.  Now cpu core objects are always
> created.  This requires some updates to allow core objects to be created
> without a full complement of threads (since older versions allowed a
> number of cpus not a multiple of the threads-per-core).  Likewise it needs
> some changes to the cpu core hot/cold plug path so as not to choke on the
> old machine types without hotplug support.
> 
> For good measure, we move the cpu construction to its own subfunction,
> spapr_init_cpus().
> 
> Signed-off-by: David Gibson 
> ---

Reviewed-by: Greg Kurz 

>  hw/ppc/spapr.c  | 125 
> +++-
>  hw/ppc/spapr_cpu_core.c |  37 +++---
>  include/hw/ppc/spapr.h  |   1 -
>  3 files changed, 90 insertions(+), 73 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 0cbab24..cbac537 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1687,11 +1687,80 @@ static void spapr_validate_node_memory(MachineState 
> *machine, Error **errp)
>  }
>  }
>  
> +static void spapr_init_cpus(sPAPRMachineState *spapr)
> +{
> +MachineState *machine = MACHINE(spapr);
> +MachineClass *mc = MACHINE_GET_CLASS(machine);
> +char *type = spapr_get_cpu_core_type(machine->cpu_model);
> +int smt = kvmppc_smt_threads();
> +int spapr_max_cores, spapr_cores;
> +int i;
> +
> +if (!type) {
> +error_report("Unable to find sPAPR CPU Core definition");
> +exit(1);
> +}
> +
> +if (mc->query_hotpluggable_cpus) {
> +if (smp_cpus % smp_threads) {
> +error_report("smp_cpus (%u) must be multiple of threads (%u)",
> + smp_cpus, smp_threads);
> +exit(1);
> +}
> +if (max_cpus % smp_threads) {
> +error_report("max_cpus (%u) must be multiple of threads (%u)",
> + max_cpus, smp_threads);
> +exit(1);
> +}
> +
> +spapr_max_cores = max_cpus / smp_threads;
> +spapr_cores = smp_cpus / smp_threads;
> +} else {
> +if (max_cpus != smp_cpus) {
> +error_report("This machine version does not support CPU 
> hotplug");
> +exit(1);
> +}
> +
> +spapr_max_cores = QEMU_ALIGN_UP(smp_cpus, smp_threads) / smp_threads;
> +spapr_cores = spapr_max_cores;
> +}
> +
> +spapr->cores = g_new0(Object *, spapr_max_cores);
> +for (i = 0; i < spapr_max_cores; i++) {
> +int core_id = i * smp_threads;
> +
> +if (mc->query_hotpluggable_cpus) {
> +sPAPRDRConnector *drc =
> +spapr_dr_connector_new(OBJECT(spapr),
> +   SPAPR_DR_CONNECTOR_TYPE_CPU,
> +   (core_id / smp_threads) * smt);
> +
> +qemu_register_reset(spapr_drc_reset, drc);
> +}
> +
> +if (i < spapr_cores) {
> +Object *core  = object_new(type);
> +int nr_threads = smp_threads;
> +
> +/* Handle the partially filled core for older machine types */
> +if ((i + 1) * smp_threads >= smp_cpus) {
> +nr_threads = smp_cpus - i * smp_threads;
> +}
> +
> +object_property_set_int(core, nr_threads, "nr-threads",
> +&error_fatal);
> +object_property_set_int(core, core_id, CPU_CORE_PROP_CORE_ID,
> +&error_fatal);
> +object_property_set_bool(core, true, "realized", &error_fatal);
> +}
> +}
> +g_free(type);
> +}
> +
>  /* pSeries LPAR / sPAPR hardware init */
>  static void ppc_spapr_init(MachineState *machine)
>  {
>  sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
> -MachineClass *mc = MACHINE_GET_CLASS(machine);
>  sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
>  const char *kernel_filename = machine->kernel_filename;
>  const char *initrd_filename = machine->initrd_filename;
> @@ -1706,21 +1775,6 @@ static void ppc_spapr_init(MachineState *machine)
>  long load_limit, fw_size;
>  char *filename;
>  int smt = kvmppc_smt_threads();
> -int spapr_cores = smp_cpus / smp_threads;
> -int spapr_max_cores = max_cpus / smp_threads;
> -
> -if (mc->query_hotpluggable_cpus) {
> -if (smp_cpus % smp_threads) {
> -error_report("smp_cpus (%u) must be multiple of threads (%u)",
> - smp_cpus, smp_threads);
> -exit(1);
> -}
> -if (max_cpus 

Re: [Qemu-devel] [PULL 0/1] ivshmem fix for 2.8

2016-11-18 Thread Marc-André Lureau
Hi

On Fri, Nov 18, 2016 at 6:57 PM Stefan Hajnoczi  wrote:

> On Thu, Nov 17, 2016 at 07:26:12PM +0400, Marc-André Lureau wrote:
> > The following changes since commit
> b0bcc86d2a87456f5a276f941dc775b265b309cf:
> >
> >   Update version for v2.8.0-rc0 release (2016-11-15 20:55:12 +)
> >
> > are available in the git repository at:
> >
> >   g...@github.com:elmarco/qemu.git tags/ivshmem-pull-request
>
> This is not a publicly accessible repo URL.  I will manually fetch from
> your repo this time but please update your git-config(1) with a separate
> private pushurl and public url:
>
> [remote "github"]
> url = https://github.com/elmarco/qemu.git
> pushurl = g...@github.com:elmarco/qemu.git
>

fixed, thanks


>
> >
> > for you to fetch changes up to b2b79a696052040389e0f9980801a880ce5a6ae3:
> >
> >   ivshmem: Fix 64 bit memory bar configuration (2016-11-17 18:39:59
> +0400)
> >
> > 
> >
> > 
> >
> > Zhuang Yanying (1):
> >   ivshmem: Fix 64 bit memory bar configuration
> >
> >  hw/misc/ivshmem.c | 4 +++-
> >  1 file changed, 3 insertions(+), 1 deletion(-)
> >
> > --
> > 2.10.0
> >
> >
>
-- 
Marc-André Lureau


[Qemu-devel] [PULL 1/1] trace: fix generated code build break

2016-11-18 Thread Stefan Hajnoczi
From: Greg Kurz 

If the QEMU source dir is

/var/tmp/aaa-qemu-clone

and the build dir is

/var/tmp/qemu-aio-poll-v2

Then I get an error as:

trace/generated-tracers.c:15950:13: error: invalid suffix "_trace_events"
 on integer constant
 TraceEvent *2_trace_events[] = {
 ^
trace/generated-tracers.c:15950:13: error: expected identifier or ‘(’ before
 numeric constant
trace/generated-tracers.c: In function ‘trace_2_register_events’:
trace/generated-tracers.c:17949:32: error: invalid suffix "_trace_events" on
 integer constant
 trace_event_register_group(2_trace_events);
^
make: *** [trace/generated-tracers.o] Error 1

This patch fixes the issue.

Reported-by: Fam Zheng 
Signed-off-by: Greg Kurz 
Tested-by: Fam Zheng 
Signed-off-by: Stefan Hajnoczi 
---
 scripts/tracetool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/tracetool.py b/scripts/tracetool.py
index fe9c9e9..c9e4737 100755
--- a/scripts/tracetool.py
+++ b/scripts/tracetool.py
@@ -70,7 +70,7 @@ def make_group_name(filename):
 
 if dirname == "":
 return "common"
-return re.sub(r"[^A-Za-z0-9]", "_", dirname)
+return "_" + re.sub(r"[^A-Za-z0-9]", "_", dirname)
 
 def main(args):
 global _SCRIPT
-- 
2.7.4




Re: [Qemu-devel] [PULL 0/1] ivshmem fix for 2.8

2016-11-18 Thread Stefan Hajnoczi
On Thu, Nov 17, 2016 at 07:26:12PM +0400, Marc-André Lureau wrote:
> The following changes since commit b0bcc86d2a87456f5a276f941dc775b265b309cf:
> 
>   Update version for v2.8.0-rc0 release (2016-11-15 20:55:12 +)
> 
> are available in the git repository at:
> 
>   g...@github.com:elmarco/qemu.git tags/ivshmem-pull-request
> 
> for you to fetch changes up to b2b79a696052040389e0f9980801a880ce5a6ae3:
> 
>   ivshmem: Fix 64 bit memory bar configuration (2016-11-17 18:39:59 +0400)
> 
> 
> 
> 
> 
> Zhuang Yanying (1):
>   ivshmem: Fix 64 bit memory bar configuration
> 
>  hw/misc/ivshmem.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> -- 
> 2.10.0
> 
> 

Thanks, applied to my staging tree:
https://github.com/stefanha/qemu/commits/staging

Stefan


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PULL 0/1] ivshmem fix for 2.8

2016-11-18 Thread Stefan Hajnoczi
On Thu, Nov 17, 2016 at 07:26:12PM +0400, Marc-André Lureau wrote:
> The following changes since commit b0bcc86d2a87456f5a276f941dc775b265b309cf:
> 
>   Update version for v2.8.0-rc0 release (2016-11-15 20:55:12 +)
> 
> are available in the git repository at:
> 
>   g...@github.com:elmarco/qemu.git tags/ivshmem-pull-request

This is not a publicly accessible repo URL.  I will manually fetch from
your repo this time but please update your git-config(1) with a separate
private pushurl and public url:

[remote "github"]
url = https://github.com/elmarco/qemu.git
pushurl = g...@github.com:elmarco/qemu.git

> 
> for you to fetch changes up to b2b79a696052040389e0f9980801a880ce5a6ae3:
> 
>   ivshmem: Fix 64 bit memory bar configuration (2016-11-17 18:39:59 +0400)
> 
> 
> 
> 
> 
> Zhuang Yanying (1):
>   ivshmem: Fix 64 bit memory bar configuration
> 
>  hw/misc/ivshmem.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> -- 
> 2.10.0
> 
> 


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH v7 RFC] block/vxhs: Initial commit to add Veritas HyperScale VxHS block device support

2016-11-18 Thread Markus Armbruster
Ketan Nilangekar  writes:

> On 11/18/16, 12:56 PM, "Jeff Cody"  wrote:
[...]
>>* Daniel pointed out that there is no authentication method for talking to a
>>  remote server.  This seems a bit scary.  Maybe all that is needed here is
>>  some clarification of the security scheme for authentication?  My
>>  impression from above is that you are relying on the networks being
>>  private to provide some sort of implicit authentication, though, and this
>>  seems fragile (and doesn't protect against a compromised guest or other
>>  process on the server, for one).
>
> Our auth scheme is based on network isolation at L2/L3 level.

Stefan already explained the trust model.  Since understanding it is
crucial to security work, let me use the opportunity to explain it once
more.

The guest is untrusted.  It interacts only with QEMU and, if enabled,
KVM.  KVM has a relatively small attack surface, but if the guest
penetrates it, game's over.  There's nothing we can do to mitigate.
QEMU has a much larger attack surface, but we *can* do something to
mitigate a compromise: nothing on the host trusts QEMU.  Second line of
defense.

A line of defense is as strong as its weakest point.  Adding an
interface between QEMU and the host that requires the host to trust QEMU
basically destroys the second line of defense.  That's a big deal.

You might argue that you don't require "the host" to trust, but only
your daemon (or whatever it is your driver talks to).  But that puts
that daemon in the same security domain as QEMU itself, i.e. it should
not be trusted by anything else on the host.  Now you have a second
problem.

If you rely on "network isolation at L2/L3 level", chances are
*everything* on this isolated network joins QEMU's security domain.  You
almost certainly need a separate isolated network per guest to have a
chance at being credible.  Even then, I'd rather not bet my own money on
it.

It's better to stick to the common trust model, and have *nothing* on
the host trust QEMU.

> If there is a simplified authentication mechanism which we can implement 
> without imposing significant penalties on IO performance, please let us know 
> and we will implement that if feasible.

Daniel already listed available mechanisms.



Re: [Qemu-devel] [kvm-unit-tests PATCH 4/4] arm/arm64: GICv3: add TYPER test

2016-11-18 Thread Andrew Jones
On Thu, Nov 17, 2016 at 05:57:52PM +, Andre Przywara wrote:
> Add a simple test for the GICv3 TYPER register, which does only one basic
> check to ensure we have actually enough interrupt IDs if we support
> LPIs.
> Allow a GICv3 guest to do the common MMIO checks as well, where the
> register semantics are shared with a GICv2.
> 
> Signed-off-by: Andre Przywara 
> ---
>  arm/gic.c | 34 +++---
>  arm/unittests.cfg |  6 ++
>  2 files changed, 37 insertions(+), 3 deletions(-)
> 
> diff --git a/arm/gic.c b/arm/gic.c
> index 02b1be1..7de0e47 100644
> --- a/arm/gic.c
> +++ b/arm/gic.c
> @@ -327,6 +327,30 @@ static bool test_typer_v2(uint32_t reg)
>   return true;
>  }
>  
> +static bool test_typer_v3(uint32_t reg)
> +{
> + int nr_intids;
> +
> + report("GIC emulation %ssupport%s MBIs", 1,
> +reg & BIT(16) ? "" : "does not ",
> +reg & BIT(16) ? "s" : "");

Could just do the test once

 ("...%s...",  reg & BIT(16) ? "supports" : "does not support"

> + report("GIC emulation %ssupport%s LPIs", 1,
> +reg & BIT(17) ? "" : "does not ",
> +reg & BIT(17) ? "s" : "");
> + report("GIC emulation %ssupport%s Aff3", 1,
> +reg & BIT(24) ? "" : "does not ",
> +reg & BIT(24) ? "s" : "");
> +
> + nr_intids = BIT(((reg >> 19) & 0x1f) + 1);
> + report("%d interrupt IDs implemented", 1, nr_intids);
> +
> + if (reg & BIT(17))
> + report("%d LPIs supported", nr_intids > 8192,
> +nr_intids > 8192 ? nr_intids - 8192 : 0);

I'm wondering if we should try to keep the number of report lines
the same host to host. So anywhere we can't do a PASS/FAIL test we
should do a SKIP. Doing that will allow us to cleanly diff test
results between hosts. (I'm not sure I've been doing a good job of
that with the existing tests though...)

> +
> + return true;

No need to return a value.

> +}
> +
>  #define BYTE(reg32, byte) (((reg32) >> ((byte) * 8)) & 0xff)
>  #define REPLACE_BYTE(reg32, byte, new) (((reg32) & ~(0xff << ((byte) * 8))) 
> |\
>   ((new) << ((byte) * 8)))
> @@ -460,8 +484,9 @@ static int gic_test_mmio(int gic_version)
>   idreg = gic_dist_base + 0xfe8;
>   break;
>   case 0x3:
> - report_abort("GICv3 MMIO tests NYI");
> - return -1;
> + gic_dist_base = gicv3_dist_base();
> + idreg = gic_dist_base + 0xffe8;

No define for this ID reg?

> + break;
>   default:
>   report_abort("GIC version %d not supported", gic_version);
>   return 0;
> @@ -471,7 +496,10 @@ static int gic_test_mmio(int gic_version)
>   nr_irqs = 32 * ((reg & 0x1f) + 1);
>   report("number of implemented SPIs: %d", 1, nr_irqs - 32);
>  
> - test_typer_v2(reg);
> + if (gic_version == 2)
> + test_typer_v2(reg);
> + else
> + test_typer_v3(reg);

Maybe we should use a switch here too, preparing for v4

>  
>   report("IIDR: 0x%x", 1, readl(gic_dist_base + GICD_IIDR));
>  
> diff --git a/arm/unittests.cfg b/arm/unittests.cfg
> index 0162e5a..b432346 100644
> --- a/arm/unittests.cfg
> +++ b/arm/unittests.cfg
> @@ -78,3 +78,9 @@ file = gic.flat
>  smp = $MAX_SMP
>  extra_params = -machine gic-version=3 -append 'ipi'
>  groups = gic
> +
> +[gicv3-mmio]
> +file = gic.flat
> +smp = $MAX_SMP
> +extra_params = -machine gic-version=3 -append 'mmio'
> +groups = gic
> -- 
> 2.9.0
> 
>

Thanks,
drew 



Re: [Qemu-devel] [PATCH] xen_disk: convert discard input to byte ranges

2016-11-18 Thread Eric Blake
On 11/18/2016 08:19 AM, Olaf Hering wrote:
> Am 18. November 2016 14:43:18 MEZ, schrieb Eric Blake :
>> On 11/18/2016 04:24 AM, Olaf Hering wrote:
>>> The guest sends discard requests as u64 sector/count pairs, but the
>>> block layer operates internally with s64/s32 pairs. The conversion
>>> leads to IO errors in the guest, the discard request is not
>> processed.
>>
>> Doesn't the block layer already split discard requests into 2^31 byte
>> chunks?
> 
> How would it do that without valid input?  It was wrong before the sectors to 
> bytes conversion, and now it's even worse given that all the world fits into 
> an int.

Then it sounds like the real bug is that the block layer's
bdrv_co_pdiscard() takes 'int count' instead of 'uint64_t
count'.  Eventually, I think the entire block layer should be fixed to
allow 64-bit count everywhere, and then auto-fragment it back down to 31
bits (or even smaller, like NBD's 32M limit or Linux loopback device 64k
limit) as needed, rather than making all the backends reimplement
fragmentation.

> 
> Remember that there is no API to let the guest know about the limitations of 
> the host. 

Correct. But the goal of the block layer is to hide the quirks, so that
the code handling the guest requests can offload all the common work to
one place.

Kevin, is it too late for 2.8 for patches that change bdrv_co_pdiscard
to take a 64-bit count?  Or would that still be under bug-fix category
because of the xen use case?

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [RFC v2 1/3] vhost-user: Add new protocol feature MTU

2016-11-18 Thread Aaron Conole
Maxime Coquelin  writes:

> This patch adds VHOST_USER_PROTOCOL_F_MTU protocol feature.
>
> If supported, QEMU sends VHOST_USER_GET_MTU request to the client,
> and expects a u64 reply containing the MTU advised for the guest.
>
> Cc: Michael S. Tsirkin 
> Cc: Aaron Conole 
> Signed-off-by: Maxime Coquelin 
> ---
>  hw/virtio/vhost-user.c| 11 +++
>  include/hw/virtio/vhost.h |  1 +
>  2 files changed, 12 insertions(+)
>
> diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> index 7ee92b3..eaf007d 100644
> --- a/hw/virtio/vhost-user.c
> +++ b/hw/virtio/vhost-user.c
> @@ -32,6 +32,7 @@ enum VhostUserProtocolFeature {
>  VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
>  VHOST_USER_PROTOCOL_F_RARP = 2,
>  VHOST_USER_PROTOCOL_F_REPLY_ACK = 3,
> +VHOST_USER_PROTOCOL_F_MTU = 4,
>  
>  VHOST_USER_PROTOCOL_F_MAX
>  };
> @@ -59,6 +60,7 @@ typedef enum VhostUserRequest {
>  VHOST_USER_GET_QUEUE_NUM = 17,
>  VHOST_USER_SET_VRING_ENABLE = 18,
>  VHOST_USER_SEND_RARP = 19,
> +VHOST_USER_GET_MTU = 20,
>  VHOST_USER_MAX
>  } VhostUserRequest;
>  
> @@ -186,6 +188,7 @@ static bool vhost_user_one_time_request(VhostUserRequest 
> request)
>  case VHOST_USER_RESET_OWNER:
>  case VHOST_USER_SET_MEM_TABLE:
>  case VHOST_USER_GET_QUEUE_NUM:
> +case VHOST_USER_GET_MTU:
>  return true;
>  default:
>  return false;
> @@ -602,6 +605,14 @@ static int vhost_user_init(struct vhost_dev *dev, void 
> *opaque)
>  return err;
>  }
>  }
> +
> +/* query the MTU we support if backend supports MTU feature */
> +if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MTU)) {
> +err = vhost_user_get_u64(dev, VHOST_USER_GET_MTU, &dev->mtu);
> +if (err < 0) {
> +return err;
> +}
> +}
>  }
>  
>  if (dev->migration_blocker == NULL &&
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index 1fe5aad..c674a05 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -51,6 +51,7 @@ struct vhost_dev {
>  uint64_t backend_features;
>  uint64_t protocol_features;
>  uint64_t max_queues;
> +uint64_t mtu;

Just a question: why is the MTU stored as a u64?  Wouldn't uint16_t make
more sense?  Then we can be sure we never have an excessively large mtu
value.

What do you think?

>  bool started;
>  bool log_enabled;
>  uint64_t log_size;



[Qemu-devel] [PATCH v3 3/3] virtio: set ISR on dataplane notifications

2016-11-18 Thread Paolo Bonzini
Dataplane has been omitting forever the step of setting ISR when
an interrupt is raised.  This caused little breakage, because the
specification actually says that ISR may not be updated in MSI mode.

Some versions of the Windows drivers however didn't clear MSI mode
correctly, and proceeded using polling mode (using ISR, not the used
ring index!) for crashdump and hibernation.  If it were just crashdump
and hibernation it would not be a big deal, but recent releases of
Windows do not really shut down, but rather log out and hibernate to
make the next startup faster.  Hence, this manifested as a more serious
hang during shutdown with e.g. Windows 8.1 and virtio-win 1.8.0 RPMs.
Newer versions fixed this, while older versions do not use MSI at all.

The failure has always been there for virtio dataplane, but it became
visible after commits 9ffe337 ("virtio-blk: always use dataplane path
if ioeventfd is active", 2016-10-30) and ad07cd6 ("virtio-scsi: always
use dataplane path if ioeventfd is active", 2016-10-30) made virtio-blk
and virtio-scsi always use the dataplane code under KVM.  The good news
therefore is that it was not a bug in the patches---they were doing
exactly what they were meant for, i.e. shake out remaining dataplane bugs.

The fix is not hard, so it's worth arranging for the broken drivers.
The virtio_should_notify+event_notifier_set pair that is common to
virtio-blk and virtio-scsi dataplane is replaced with a new public
function virtio_notify_irqfd that also sets ISR.  The irqfd emulation
code now need not set ISR anymore, so virtio_irq is removed.

Signed-off-by: Paolo Bonzini 
---
 hw/block/dataplane/virtio-blk.c |  4 +---
 hw/scsi/virtio-scsi-dataplane.c |  7 ---
 hw/scsi/virtio-scsi.c   |  2 +-
 hw/virtio/trace-events  |  2 +-
 hw/virtio/virtio.c  | 36 
 include/hw/virtio/virtio-scsi.h |  1 -
 include/hw/virtio/virtio.h  |  2 +-
 7 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 90ef557..d1f9f63 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -68,9 +68,7 @@ static void notify_guest_bh(void *opaque)
 unsigned i = j + ctzl(bits);
 VirtQueue *vq = virtio_get_queue(s->vdev, i);
 
-if (virtio_should_notify(s->vdev, vq)) {
-event_notifier_set(virtio_queue_get_guest_notifier(vq));
-}
+virtio_notify_irqfd(s->vdev, vq);
 
 bits &= bits - 1; /* clear right-most bit */
 }
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index f2ea29d..6b8d0f0 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -95,13 +95,6 @@ static int virtio_scsi_vring_init(VirtIOSCSI *s, VirtQueue 
*vq, int n,
 return 0;
 }
 
-void virtio_scsi_dataplane_notify(VirtIODevice *vdev, VirtIOSCSIReq *req)
-{
-if (virtio_should_notify(vdev, req->vq)) {
-event_notifier_set(virtio_queue_get_guest_notifier(req->vq));
-}
-}
-
 /* assumes s->ctx held */
 static void virtio_scsi_clear_aio(VirtIOSCSI *s)
 {
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 3e5ae6a..10fd687 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -69,7 +69,7 @@ static void virtio_scsi_complete_req(VirtIOSCSIReq *req)
 qemu_iovec_from_buf(&req->resp_iov, 0, &req->resp, req->resp_size);
 virtqueue_push(vq, &req->elem, req->qsgl.size + req->resp_iov.size);
 if (s->dataplane_started && !s->dataplane_fenced) {
-virtio_scsi_dataplane_notify(vdev, req);
+virtio_notify_irqfd(vdev, vq);
 } else {
 virtio_notify(vdev, vq);
 }
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 8756cef..7b6f55e 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -5,7 +5,7 @@ virtqueue_fill(void *vq, const void *elem, unsigned int len, 
unsigned int idx) "
 virtqueue_flush(void *vq, unsigned int count) "vq %p count %u"
 virtqueue_pop(void *vq, void *elem, unsigned int in_num, unsigned int out_num) 
"vq %p elem %p in_num %u out_num %u"
 virtio_queue_notify(void *vdev, int n, void *vq) "vdev %p n %d vq %p"
-virtio_irq(void *vq) "vq %p"
+virtio_notify_irqfd(void *vdev, void *vq) "vdev %p vq %p"
 virtio_notify(void *vdev, void *vq) "vdev %p vq %p"
 virtio_set_status(void *vdev, uint8_t val) "vdev %p val %u"
 
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 138a414..1af2de2 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1330,13 +1330,6 @@ static void virtio_set_isr(VirtIODevice *vdev, int value)
 }
 }
 
-void virtio_irq(VirtQueue *vq)
-{
-trace_virtio_irq(vq);
-virtio_set_isr(vq->vdev, 0x1);
-virtio_notify_vector(vq->vdev, vq->vector);
-}
-
 bool virtio_should_notify(VirtIODevice *vdev, VirtQueue *vq)
 {
 uint16_t old, new;
@@ -1360,6 +1353,33 @@ bool virtio_should_notify(VirtIODev

[Qemu-devel] [PATCH v3 2/3] virtio: access ISR atomically

2016-11-18 Thread Paolo Bonzini
This will be needed once dataplane will be able to set it outside
the big QEMU lock.

Signed-off-by: Paolo Bonzini 
---
 hw/virtio/virtio-mmio.c |  6 +++---
 hw/virtio/virtio-pci.c  |  9 +++--
 hw/virtio/virtio.c  | 22 +-
 3 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c
index a30270f..17412cb 100644
--- a/hw/virtio/virtio-mmio.c
+++ b/hw/virtio/virtio-mmio.c
@@ -191,7 +191,7 @@ static uint64_t virtio_mmio_read(void *opaque, hwaddr 
offset, unsigned size)
 return virtio_queue_get_addr(vdev, vdev->queue_sel)
 >> proxy->guest_page_shift;
 case VIRTIO_MMIO_INTERRUPTSTATUS:
-return vdev->isr;
+return atomic_read(&vdev->isr);
 case VIRTIO_MMIO_STATUS:
 return vdev->status;
 case VIRTIO_MMIO_HOSTFEATURESSEL:
@@ -299,7 +299,7 @@ static void virtio_mmio_write(void *opaque, hwaddr offset, 
uint64_t value,
 }
 break;
 case VIRTIO_MMIO_INTERRUPTACK:
-vdev->isr &= ~value;
+atomic_and(&vdev->isr, ~value);
 virtio_update_irq(vdev);
 break;
 case VIRTIO_MMIO_STATUS:
@@ -347,7 +347,7 @@ static void virtio_mmio_update_irq(DeviceState *opaque, 
uint16_t vector)
 if (!vdev) {
 return;
 }
-level = (vdev->isr != 0);
+level = (atomic_read(&vdev->isr) != 0);
 DPRINTF("virtio_mmio setting IRQ %d\n", level);
 qemu_set_irq(proxy->irq, level);
 }
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 97b32fe..521ba0b 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -73,7 +73,7 @@ static void virtio_pci_notify(DeviceState *d, uint16_t vector)
 msix_notify(&proxy->pci_dev, vector);
 else {
 VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
-pci_set_irq(&proxy->pci_dev, vdev->isr & 1);
+pci_set_irq(&proxy->pci_dev, atomic_read(&vdev->isr) & 1);
 }
 }
 
@@ -449,8 +449,7 @@ static uint32_t virtio_ioport_read(VirtIOPCIProxy *proxy, 
uint32_t addr)
 break;
 case VIRTIO_PCI_ISR:
 /* reading from the ISR also clears it. */
-ret = vdev->isr;
-vdev->isr = 0;
+ret = atomic_xchg(&vdev->isr, 0);
 pci_irq_deassert(&proxy->pci_dev);
 break;
 case VIRTIO_MSI_CONFIG_VECTOR:
@@ -1379,9 +1378,7 @@ static uint64_t virtio_pci_isr_read(void *opaque, hwaddr 
addr,
 {
 VirtIOPCIProxy *proxy = opaque;
 VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
-uint64_t val = vdev->isr;
-
-vdev->isr = 0;
+uint64_t val = atomic_xchg(&vdev->isr, 0);
 pci_irq_deassert(&proxy->pci_dev);
 
 return val;
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index b7d5828..138a414 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -945,7 +945,7 @@ void virtio_reset(void *opaque)
 vdev->guest_features = 0;
 vdev->queue_sel = 0;
 vdev->status = 0;
-vdev->isr = 0;
+atomic_set(&vdev->isr, 0);
 vdev->config_vector = VIRTIO_NO_VECTOR;
 virtio_notify_vector(vdev, vdev->config_vector);
 
@@ -1318,10 +1318,22 @@ void virtio_del_queue(VirtIODevice *vdev, int n)
 vdev->vq[n].vring.num_default = 0;
 }
 
+static void virtio_set_isr(VirtIODevice *vdev, int value)
+{
+uint8_t old = atomic_read(&vdev->isr);
+
+/* Do not write ISR if it does not change, so that its cacheline remains
+ * shared in the common case where the guest does not read it.
+ */
+if ((old & value) != value) {
+atomic_or(&vdev->isr, value);
+}
+}
+
 void virtio_irq(VirtQueue *vq)
 {
 trace_virtio_irq(vq);
-vq->vdev->isr |= 0x01;
+virtio_set_isr(vq->vdev, 0x1);
 virtio_notify_vector(vq->vdev, vq->vector);
 }
 
@@ -1355,7 +1367,7 @@ void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
 }
 
 trace_virtio_notify(vdev, vq);
-vdev->isr |= 0x01;
+virtio_set_isr(vq->vdev, 0x1);
 virtio_notify_vector(vdev, vq->vector);
 }
 
@@ -1364,7 +1376,7 @@ void virtio_notify_config(VirtIODevice *vdev)
 if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
 return;
 
-vdev->isr |= 0x03;
+virtio_set_isr(vdev, 0x3);
 vdev->generation++;
 virtio_notify_vector(vdev, vdev->config_vector);
 }
@@ -1895,7 +1907,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
 
 vdev->device_id = device_id;
 vdev->status = 0;
-vdev->isr = 0;
+atomic_set(&vdev->isr, 0);
 vdev->queue_sel = 0;
 vdev->config_vector = VIRTIO_NO_VECTOR;
 vdev->vq = g_malloc0(sizeof(VirtQueue) * VIRTIO_QUEUE_MAX);
-- 
2.9.3





[Qemu-devel] [PATCH v3 1/3] virtio: introduce grab/release_ioeventfd to fix vhost

2016-11-18 Thread Paolo Bonzini
Following the recent refactoring of virtio notifiers [1], more specifically
the patch ed08a2a0b ("virtio: use virtio_bus_set_host_notifier to
start/stop ioeventfd") that uses virtio_bus_set_host_notifier [2]
by default, core virtio code requires 'ioeventfd_started' to be set
to true/false when the host notifiers are configured.

When vhost is stopped and started, however, there is a stop followed by
another start. Since ioeventfd_started was never set to true, the 'stop'
operation triggered by virtio_bus_set_host_notifier() will not result
in a call to virtio_pci_ioeventfd_assign(assign=false). This leaves
the memory regions with stale notifiers and results in the next start
triggering the following assertion:

  kvm_mem_ioeventfd_add: error adding ioeventfd: File exists
  Aborted

This patch reintroduces (hopefully in a cleaner way) the concept
that was present with ioeventfd_disabled before the refactoring.
When ioeventfd_grabbed>0, ioeventfd_started tracks whether ioeventfd
should be enabled or not, but ioeventfd is actually not started at
all until vhost releases the host notifiers.

[1] http://lists.nongnu.org/archive/html/qemu-devel/2016-10/msg07748.html
[2] http://lists.nongnu.org/archive/html/qemu-devel/2016-10/msg07760.html

Reported-by: Felipe Franciosi 
Reported-by: Christian Borntraeger 
Reported-by: Alex Williamson 
Fixes: ed08a2a0b ("virtio: use virtio_bus_set_host_notifier to start/stop 
ioeventfd")
Signed-off-by: Paolo Bonzini 
Message-Id: <2016192855.26350-1-pbonz...@redhat.com>
Signed-off-by: Paolo Bonzini 
---
 hw/virtio/vhost.c  | 14 +-
 hw/virtio/virtio-bus.c | 58 ++
 hw/virtio/virtio.c | 16 
 include/hw/virtio/virtio-bus.h | 14 ++
 include/hw/virtio/virtio.h |  2 ++
 5 files changed, 86 insertions(+), 18 deletions(-)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 30aee88..f7f7023 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1214,17 +1214,17 @@ void vhost_dev_cleanup(struct vhost_dev *hdev)
 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
 {
 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
-VirtioBusState *vbus = VIRTIO_BUS(qbus);
-VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
 int i, r, e;
 
-if (!k->ioeventfd_assign) {
+/* We will pass the notifiers to the kernel, make sure that QEMU
+ * doesn't interfere.
+ */
+r = virtio_device_grab_ioeventfd(vdev);
+if (r < 0) {
 error_report("binding does not support host notifiers");
-r = -ENOSYS;
 goto fail;
 }
 
-virtio_device_stop_ioeventfd(vdev);
 for (i = 0; i < hdev->nvqs; ++i) {
 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
  true);
@@ -1244,7 +1244,7 @@ fail_vq:
 }
 assert (e >= 0);
 }
-virtio_device_start_ioeventfd(vdev);
+virtio_device_release_ioeventfd(vdev);
 fail:
 return r;
 }
@@ -1267,7 +1267,7 @@ void vhost_dev_disable_notifiers(struct vhost_dev *hdev, 
VirtIODevice *vdev)
 }
 assert (r >= 0);
 }
-virtio_device_start_ioeventfd(vdev);
+virtio_device_release_ioeventfd(vdev);
 }
 
 /* Test and clear event pending status.
diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c
index bf61f66..d6c0c72 100644
--- a/hw/virtio/virtio-bus.c
+++ b/hw/virtio/virtio-bus.c
@@ -147,6 +147,39 @@ void virtio_bus_set_vdev_config(VirtioBusState *bus, 
uint8_t *config)
 }
 }
 
+/* On success, ioeventfd ownership belongs to the caller.  */
+int virtio_bus_grab_ioeventfd(VirtioBusState *bus)
+{
+VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(bus);
+
+/* vhost can be used even if ioeventfd=off in the proxy device,
+ * so do not check k->ioeventfd_enabled.
+ */
+if (!k->ioeventfd_assign) {
+return -ENOSYS;
+}
+
+if (bus->ioeventfd_grabbed == 0 && bus->ioeventfd_started) {
+virtio_bus_stop_ioeventfd(bus);
+/* Remember that we need to restart ioeventfd
+ * when ioeventfd_grabbed becomes zero.
+ */
+bus->ioeventfd_started = true;
+}
+bus->ioeventfd_grabbed++;
+return 0;
+}
+
+void virtio_bus_release_ioeventfd(VirtioBusState *bus)
+{
+assert(bus->ioeventfd_grabbed != 0);
+if (--bus->ioeventfd_grabbed == 0 && bus->ioeventfd_started) {
+/* Force virtio_bus_start_ioeventfd to act.  */
+bus->ioeventfd_started = false;
+virtio_bus_start_ioeventfd(bus);
+}
+}
+
 int virtio_bus_start_ioeventfd(VirtioBusState *bus)
 {
 VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(bus);
@@ -161,10 +194,14 @@ int virtio_bus_start_ioeventfd(VirtioBusState *bus)
 if (bus->ioeventfd_started) {
 return 0;
 }
-r = vdc->start_ioeventfd(vdev);
-if (r < 0) {
-error_report("%s: failed. Fallback to userspace (slower).", __func__);
-return r;
+
+/* Only

[Qemu-devel] [PATCH v3 for-2.8 0/3] virtio fixes

2016-11-18 Thread Paolo Bonzini
Patch 1 fixes vhost, patches 2-3 fix Windows hibernation.
v3 only adds more comments to patches 2 and 3.

Paolo

Paolo Bonzini (3):
  virtio: introduce grab/release_ioeventfd to fix vhost
  virtio: access ISR atomically
  virtio: set ISR on dataplane notifications

 hw/block/dataplane/virtio-blk.c |  4 +--
 hw/scsi/virtio-scsi-dataplane.c |  7 -
 hw/scsi/virtio-scsi.c   |  2 +-
 hw/virtio/trace-events  |  2 +-
 hw/virtio/vhost.c   | 14 -
 hw/virtio/virtio-bus.c  | 58 +---
 hw/virtio/virtio-mmio.c |  6 ++--
 hw/virtio/virtio-pci.c  |  9 ++
 hw/virtio/virtio.c  | 66 +++--
 include/hw/virtio/virtio-bus.h  | 14 +
 include/hw/virtio/virtio-scsi.h |  1 -
 include/hw/virtio/virtio.h  |  4 ++-
 12 files changed, 137 insertions(+), 50 deletions(-)

-- 
2.9.3




Re: [Qemu-devel] [PATCH 1/3] virtio: introduce grab/release_ioeventfd to fix vhost

2016-11-18 Thread Michael S. Tsirkin
On Fri, Nov 18, 2016 at 09:15:32AM +0100, Christian Borntraeger wrote:
> On 11/16/2016 07:05 PM, Paolo Bonzini wrote:
> > Following the recent refactoring of virtio notifiers [1], more specifically
> > the patch ed08a2a0b ("virtio: use virtio_bus_set_host_notifier to
> > start/stop ioeventfd") that uses virtio_bus_set_host_notifier [2]
> > by default, core virtio code requires 'ioeventfd_started' to be set
> > to true/false when the host notifiers are configured.
> > 
> > When vhost is stopped and started, however, there is a stop followed by
> > another start. Since ioeventfd_started was never set to true, the 'stop'
> > operation triggered by virtio_bus_set_host_notifier() will not result
> > in a call to virtio_pci_ioeventfd_assign(assign=false). This leaves
> > the memory regions with stale notifiers and results on the next start
> > triggering the following assertion:
> > 
> >   kvm_mem_ioeventfd_add: error adding ioeventfd: File exists
> >   Aborted
> > 
> > This patch reintroduces (hopefully in a cleaner way) the concept
> > that was present with ioeventfd_disabled before the refactoring.
> > When ioeventfd_grabbed>0, ioeventfd_started tracks whether ioeventfd
> > should be enabled or not, but ioeventfd is actually not started at
> > all until vhost releases the host notifiers.
> > 
> > [1] http://lists.nongnu.org/archive/html/qemu-devel/2016-10/msg07748.html
> > [2] http://lists.nongnu.org/archive/html/qemu-devel/2016-10/msg07760.html
> > 
> > Reported-by: Felipe Franciosi 
> > Reported-by: Christian Borntraeger 
> > Reported-by: Alex Williamson 
> > Fixes: ed08a2a0b ("virtio: use virtio_bus_set_host_notifier to start/stop 
> > ioeventfd")
> > Signed-off-by: Paolo Bonzini 
> > Message-Id: <2016192855.26350-1-pbonz...@redhat.com>
> > Signed-off-by: Paolo Bonzini 
> > ---
> > v1->v2: more comments [Cornelia]
> 
> 
> As this seems to fix a functional issue, is there any chance to apply this
> patch now and not wait for the discussion about patch 2 and 3 to calm down?

It's in my tree, will be in the next pull.




Re: [Qemu-devel] [PATCH 0/3] virtio: disable notifications in blk and scsi

2016-11-18 Thread Michael S. Tsirkin
On Fri, Nov 18, 2016 at 10:58:47AM +, Stefan Hajnoczi wrote:
> On Thu, Nov 17, 2016 at 07:38:45PM +0200, Michael S. Tsirkin wrote:
> > On Thu, Nov 17, 2016 at 01:27:49PM +, Stefan Hajnoczi wrote:
> > > On Thu, Nov 17, 2016 at 12:17:57AM +0200, Michael S. Tsirkin wrote:
> > > > On Wed, Nov 16, 2016 at 09:53:06PM +, Stefan Hajnoczi wrote:
> > > > > Disabling notifications during virtqueue processing reduces the 
> > > > > number of
> > > > > exits.  The virtio-net device already uses 
> > > > > virtio_queue_set_notifications() but
> > > > > virtio-blk and virtio-scsi do not.
> > > > > 
> > > > > The following benchmark shows a 15% reduction in virtio-blk-pci MMIO 
> > > > > exits:
> > > > > 
> > > > >   (host)$ qemu-system-x86_64 \
> > > > >   -enable-kvm -m 1024 -cpu host \
> > > > >   -drive if=virtio,id=drive0,file=f24.img,format=raw,\
> > > > >  cache=none,aio=native
> > > > >   (guest)$ fio # jobs=4, iodepth=8, direct=1, randread
> > > > >   (host)$ sudo perf record -a -e kvm:kvm_fast_mmio
> > > > > 
> > > > > Number of kvm_fast_mmio events:
> > > > > Unpatched: 685k
> > > > > Patched: 592k (-15%, lower is better)
> > > > 
> > > > Any chance to see a gain in actual benchmark numbers?
> > > > This is important to make sure we are not just
> > > > shifting overhead around.
> > > 
> > > Good idea.  I reran this morning without any tracing and compared
> > > against bare metal.
> > > 
> > > Total reads for a 30-second 4 KB random read benchmark with 4 processes
> > > x iodepth=8:
> > > 
> > > Bare metal: 26440 MB
> > > Unpatched:  19799 MB
> > > Patched:21252 MB
> > > 
> > > Patched vs Unpatched: +7% improvement
> > > Patched vs Bare metal: 20% virtualization overhead
> > > 
> > > The disk image is a 8 GB raw file on XFS on LVM on dm-crypt on a Samsung
> > > MZNLN256HCHP 256 GB SATA SSD.  This is just my laptop.
> > > 
> > > Seems like a worthwhile improvement to me.
> > > 
> > > Stefan
> > 
> > Sure. Pls remember to ping or re-post after the release.
> 
> How about a -next tree?

-next would make sense if we did Linus style short merge
cycles followed by a long stabilization period.

With current QEMU style -next seems counter-productive, we do freezes in
particular so people focus on stabilization, with -next everyone except
maintainers just keeps going as usual, and maintainers must handle
double the load.

> I've found that useful for block, net, and tracing in the past.  Most of
> the time it means patch authors can rest assured their patches will be
> merged without further action.  It allows development of features that
> depend on out-of-tree patches.
> 
> Stefan

Less work for authors, more work for me ... I'd rather distribute the load.

-- 
MST



Re: [Qemu-devel] [kvm-unit-tests PATCH 3/4] arm/arm64: GICv2: add GICD_ITARGETSR testing

2016-11-18 Thread Andrew Jones
On Thu, Nov 17, 2016 at 05:57:51PM +, Andre Przywara wrote:
> Some tests for the ITARGETS registers.
> Bits corresponding to non-existent CPUs must be RAZ/WI.
> These registers must be byte-accessible, also check that accesses beyond
> the implemented IRQ limit are actually read-as-zero/write-ignore.
> 
> Signed-off-by: Andre Przywara 
> ---
>  arm/gic.c | 54 ++
>  lib/arm/asm/gic.h |  1 +
>  2 files changed, 55 insertions(+)
> 
> diff --git a/arm/gic.c b/arm/gic.c
> index a27da2c..02b1be1 100644
> --- a/arm/gic.c
> +++ b/arm/gic.c
> @@ -397,6 +397,57 @@ static bool test_priorities(int nr_irqs, void *priptr)
>   return true;
>  }
>  
> +static bool test_targets(int nr_irqs)
> +{
> + void *targetsptr = gicv2_dist_base() + GICD_ITARGETSR;
> + u32 orig_targets;
> + u32 cpu_mask;
> + u32 pattern, reg;
> +
> + orig_targets = readl(targetsptr + 32);
> + report_prefix_push("ITARGETSR");
> +
> + cpu_mask = (1 << nr_cpus) - 1;

Shouldn't this be 1 << (nr_cpus - 1) ?

Is this test always going to be gicv2-only? We should probably comment it,
if so. We don't want to risk this being run when nr_cpus can be larger
than 8.

> + cpu_mask |= cpu_mask << 8;
> + cpu_mask |= cpu_mask << 16;
> +
> + /* Check that bits for non implemented CPUs are RAZ/WI. */
> + if (nr_cpus < 8) {
> + writel(0x, targetsptr + 32);
> + report("bits for %d non-existent CPUs masked",
> +!(readl(targetsptr + 32) & ~cpu_mask), 8 - nr_cpus);
> + } else {
> + report_skip("CPU masking (all CPUs implemented)");
> + }
> +
> + report("accesses beyond limit RAZ/WI",
> +test_readonly_32(targetsptr + nr_irqs, true));
> +
> + pattern = 0x0103020f;
> + writel(pattern, targetsptr + 32);
> + reg = readl(targetsptr + 32);
> + report("register content preserved (%08x => %08x)",
> +reg == (pattern & cpu_mask), pattern & cpu_mask, reg);
> +
> + /*
> +  * The TARGETS registers are byte accessible, do a byte-wide
> +  * read and write of known content to check for this.
> +  */
> + reg = readb(targetsptr + 33);
> + report("byte reads successful (0x%08x => 0x%02x)",
> +reg == (BYTE(pattern, 1) & cpu_mask),
> +pattern & cpu_mask, reg);
> +
> + pattern = REPLACE_BYTE(pattern, 2, 0x04);
> + writeb(BYTE(pattern, 2), targetsptr + 34);
> + reg = readl(targetsptr + 32);
> + report("byte writes successful (0x%02x => 0x%08x)",
> +reg == (pattern & cpu_mask), BYTE(pattern, 2), reg);

Last patch also had a byte addressability test. Maybe we should make
a helper function?

> +
> + writel(orig_targets, targetsptr + 32);
> + return true;

Function can/should be void.

> +}
> +
>  static int gic_test_mmio(int gic_version)
>  {
>   u32 reg;
> @@ -436,6 +487,9 @@ static int gic_test_mmio(int gic_version)
>  
>   test_priorities(nr_irqs, gic_dist_base + GICD_IPRIORITYR);
>  
> + if (gic_version == 2)
> + test_targets(nr_irqs);
> +
>   return 0;
>  }
>  
> diff --git a/lib/arm/asm/gic.h b/lib/arm/asm/gic.h
> index cef748d..6f170cb 100644
> --- a/lib/arm/asm/gic.h
> +++ b/lib/arm/asm/gic.h
> @@ -14,6 +14,7 @@
>  #define GICD_IGROUPR 0x0080
>  #define GICD_ISENABLER   0x0100
>  #define GICD_IPRIORITYR  0x0400
> +#define GICD_ITARGETSR   0x0800
>  #define GICD_SGIR0x0f00
>  #define GICD_ICPIDR2 0x0fe8
>  
> -- 
> 2.9.0
> 
>

Thanks,
drew



Re: [Qemu-devel] [PATCH] xen_disk: convert discard input to byte ranges

2016-11-18 Thread Olaf Hering
Am 18. November 2016 14:43:18 MEZ, schrieb Eric Blake :
>On 11/18/2016 04:24 AM, Olaf Hering wrote:
>> The guest sends discard requests as u64 sector/count pairs, but the
>> block layer operates internally with s64/s32 pairs. The conversion
>> leads to IO errors in the guest, the discard request is not
>processed.
>
>Doesn't the block layer already split discard requests into 2^31 byte
>chunks?

How would it do that without valid input?  It was wrong before the sectors to 
bytes conversion, and now it's even worse given that all the world fits into an 
int.

Remember that there is no API to let the guest know about the limitations of 
the host. 

Olaf



Re: [Qemu-devel] [PATCH v5 12/17] qapi: rename QAPIExprError/QAPILineError

2016-11-18 Thread Markus Armbruster
Marc-André Lureau  writes:

> - Original Message -
>> Make the summary line "qapi: Rename QAPIExprError to QAPILineError".
>> 
>
> ok
>
>> Marc-André Lureau  writes:
>> 
>> > There is nothing specific about expressions in this exception,
>> > the following patch will use it without expressions.
>> >
>> > Signed-off-by: Marc-André Lureau 
>> > ---
>> >  scripts/qapi.py | 146
>> >  
>> >  1 file changed, 73 insertions(+), 73 deletions(-)
>> >
>> > diff --git a/scripts/qapi.py b/scripts/qapi.py
>> > index 21bc32f..4d1b0e4 100644
>> > --- a/scripts/qapi.py
>> > +++ b/scripts/qapi.py
>> > @@ -110,11 +110,11 @@ class QAPISchemaError(Exception):
>> >  "%s:%d:%d: %s" % (self.fname, self.line, self.col, self.msg)
>> >  
>> >  
>> > -class QAPIExprError(Exception):
>> > -def __init__(self, expr_info, msg):
>> > +class QAPILineError(Exception):
>> > +def __init__(self, info, msg):
>> >  Exception.__init__(self)
>> > -assert expr_info
>> > -self.info = expr_info
>> > +assert info
>> > +self.info = info
>> >  self.msg = msg
>> >  
>> >  def __str__(self):
>> 
>> Since we're talking about misnamed / awkward error stuff:
>> 
>> * QAPISchemaError is really a parse error.  __init__()'s schema argument
>>   isn't a QAPISchema, it's a QAPISchemaParser.
>> 
>> * Method __str__() is mostly duplicated.
>> 
>> How do you like the following untested sketch?
>
> I like it, though I would consider it separately from this series.

Punting ideas we both like to later series is fair.  But when we intend
to rework the error classes later, renaming one of them now is perhaps
not worth the churn.

> Here I systematically renamed the existing class ("line" since it's
> about location in file), your proposed change has more implications I
> don't really want to get into here.

Well, QAPISchemaError is just as much about "location in file" as
QAPIExprError.  They just specify the location differently: one by
reference to the parser's current location, and the other by an "info"
dictionary.

[...]



Re: [Qemu-devel] [PATCH v2 1/9] nbd: Allow unmap and fua during write zeroes

2016-11-18 Thread Paolo Bonzini


On 17/11/2016 22:10, Max Reitz wrote:
> On 17.11.2016 21:13, Eric Blake wrote:
>> Commit fa778fff wired up support to send the NBD_CMD_WRITE_ZEROES,
>> but forgot to inform the block layer that FUA unmapping of zeroes is
>> supported.  Without BDRV_REQ_MAY_UNMAP listed as a supported flag,
>> the block layer will always insist on the NBD layer passing
>> NBD_CMD_FLAG_NO_HOLE, resulting in the server always allocating
>> things even when it was desired to let the server punch holes.
>> Similarly, failing to set BDRV_REQ_FUA means that the client may
>> send unnecessary NBD_CMD_FLUSH when it could have instead used the
>> NBD_CMD_FLAG_FUA bit.
>>
>> CC: qemu-sta...@nongnu.org
>> Signed-off-by: Eric Blake 
>> ---
>>  block/nbd-client.c | 4 
>>  1 file changed, 4 insertions(+)
> 
> Reviewed-by: Max Reitz 

I'll take this one.

Paolo



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH v3 for-2.9 0/3] q35: add negotiable broadcast SMI

2016-11-18 Thread Michael S. Tsirkin
On Fri, Nov 18, 2016 at 11:36:56AM +0100, Laszlo Ersek wrote:
> This is v3 of the series, with updates based on the v2 discussion:
> .
> 
> I've added feature negotiation via the APM_STS ("scratchpad") register.
> A new spec file called "docs/specs/q35-apm-sts.txt" is included.
> 
> Tested with new OVMF patches (about to send out those as well).
> Regression tested with SeaBIOS (beyond simple functional tests with
> maximum SeaBIOS logging enabled, I used gdb to step through the new
> ich9_apm_status_changed() callback to see if it was behaving compatibly
> with SeaBIOS).
> 
> The series was developed and tested on top of v2.7.0, because v2.8.0-rc0
> crashes very quickly for me when running OVMF:
> 
>   kvm_io_ioeventfd_add: error adding ioeventfd: File exists
> 
> It is my understanding that there are patches on the list for this:
> 
>   [Qemu-devel] [PATCH v2 for-2.8 0/3] virtio fixes
> 
> Anyway, the series rebases to v2.8.0-rc0 without as much as context
> differences.
> 
> Cc: "Kevin O'Connor" 
> Cc: "Michael S. Tsirkin" 
> Cc: Gerd Hoffmann 
> Cc: Paolo Bonzini 

I'll review. Pls remember it will have to be re-posted or pinged
after 2.8 is out.

> Thanks
> Laszlo
> 
> Laszlo Ersek (3):
>   hw/isa/apm: introduce callback for APM_STS_IOPORT writes
>   hw/isa/lpc_ich9: add SMI feature negotiation via APM_STS
>   hw/isa/lpc_ich9: ICH9_APM_STS_F_BROADCAST_SMI: inject SMI on all VCPUs
> 
>  docs/specs/q35-apm-sts.txt | 80 
> ++
>  include/hw/i386/ich9.h |  9 ++
>  include/hw/isa/apm.h   |  9 +++---
>  hw/acpi/piix4.c|  2 +-
>  hw/isa/apm.c   | 15 ++---
>  hw/isa/lpc_ich9.c  | 64 +++--
>  hw/isa/vt82c686.c  |  2 +-
>  7 files changed, 168 insertions(+), 13 deletions(-)
>  create mode 100644 docs/specs/q35-apm-sts.txt
> 
> -- 
> 2.9.2



Re: [Qemu-devel] [PATCH v5 03/17] qga/schema: improve guest-set-vcpus Returns: section

2016-11-18 Thread Markus Armbruster
Marc-André Lureau  writes:

> Hi
>
> - Original Message -
>> Marc-André Lureau  writes:
>> 
>> > The documentation parser we are going to add finishes a section after an
>> > empty line.
>> 
>> Is this still true?
>
> No longer true, the parser only breaks on argument empty line, as explained 
> in cover letter.
>
> I'll bring back the empty lines.

Feel free to keep them out, and justify your change with a consistency
argument.

[...]



Re: [Qemu-devel] [kvm-unit-tests PATCH 1/4] arm/arm64: GIC: basic GICv2 MMIO tests

2016-11-18 Thread Andre Przywara
Hi Drew,

On 18/11/16 13:06, Andrew Jones wrote:
> Hi Andre,
> 
> I'm so pleased to see this series. Thank you!
> 
> On Thu, Nov 17, 2016 at 05:57:49PM +, Andre Przywara wrote:
>> This adds an MMIO subtest to the GIC test.
>> It accesses some generic GICv2 registers and does some sanity tests,
>> like checking for some of them being read-only.
>>
>> Signed-off-by: Andre Przywara 
>> ---
>>  arm/gic.c | 99 
>> +++
>>  arm/unittests.cfg |  6 
>>  lib/arm/asm/gic.h |  2 ++
>>  3 files changed, 107 insertions(+)
>>
>> diff --git a/arm/gic.c b/arm/gic.c
>> index 638b8b1..ba2585b 100644
>> --- a/arm/gic.c
>> +++ b/arm/gic.c
>> @@ -3,6 +3,7 @@
>>   *
>>   * GICv2
>>   *   + test sending/receiving IPIs
>> + *   + MMIO access tests
>>   * GICv3
>>   *   + test sending/receiving IPIs
>>   *
>> @@ -274,6 +275,98 @@ static struct gic gicv3 = {
>>  },
>>  };
>>  
>> +static bool test_ro_pattern_32(void *address, u32 pattern, u32 orig)
>> +{
>> +u32 reg;
>> +
>> +writel(pattern, address);
>> +reg = readl(address);
>> +
>> +if (reg != orig)
>> +writel(orig, address);
>> +
>> +return reg == orig;
>> +}
>> +
>> +static bool test_readonly_32(void *address, bool razwi)
>> +{
>> +u32 orig, pattern;
>> +
>> +orig = readl(address);
>> +if (razwi && orig)
>> +return false;
>> +
>> +pattern = 0x;
>> +if (orig != pattern) {
>> +if (!test_ro_pattern_32(address, pattern, orig))
>> +return false;
>> +}
>> +
>> +pattern = 0xa5a55a5a;
>> +if (orig != pattern) {
>> +if (!test_ro_pattern_32(address, pattern, orig))
>> +return false;
>> +}
>> +
>> +pattern = 0;
>> +if (orig != pattern) {
>> +if (!test_ro_pattern_32(address, pattern, orig))
>> +return false;
>> +}
>> +
>> +return true;
>> +}
>> +
>> +static bool test_typer_v2(uint32_t reg)
>> +{
>> +int nr_gic_cpus = ((reg >> 5) & 0x7) + 1;
>> +
>> +report("all %d CPUs have interrupts", nr_cpus == nr_gic_cpus,
>> +   nr_gic_cpus);
>> +
>> +return true;
> 
> This test function can be a void.
> 
>> +}
>> +
>> +static int gic_test_mmio(int gic_version)
>> +{
>> +u32 reg;
>> +int nr_irqs;
>> +void *gic_dist_base, *idreg;
>> +
>> +switch(gic_version) {
>> +case 0x2:
>> +gic_dist_base = gicv2_dist_base();
>> +idreg = gic_dist_base + 0xfe8;
> 
> I see below you introduce GICD_ICPIDR2, so I guess you can use it here.
> 
>> +break;
>> +case 0x3:
>> +report_abort("GICv3 MMIO tests NYI");
>> +return -1;
> 
> can't reach this return

But we need to tell GCC about this, because otherwise we get all kinds of
warnings (including bogus "maybe unused" warnings).
__attribute__ ((noreturn)) seems the way to go here, but this is
currently giving me a hard time ...

>> +default:
>> +report_abort("GIC version %d not supported", gic_version);
>> +return 0;
> 
> can't reach this return
> 
>> +}
>> +
>> +reg = readl(gic_dist_base + GICD_TYPER);
>> +nr_irqs = 32 * ((reg & 0x1f) + 1);
> 
> Any reason to avoid using GICD_TYPER_IRQS() here?

When I first wrote this I wasn't aware of it; on second thought I then
wanted to avoid using the macro copied from Linux.

But you are right, I will use it here.

> 
>> +report("number of implemented SPIs: %d", 1, nr_irqs - 32);
> 
> We usually just use printf for informational output (but we should
> probably add a 'report_info' in order to keep the prefixes. I can
> do that now.) Anyway, please s/1/true

I saw your patch, will use that.

>> +
>> +test_typer_v2(reg);
>> +
>> +report("IIDR: 0x%x", 1, readl(gic_dist_base + GICD_IIDR));
>> +
>> +report("GICD_TYPER is read-only",
>> +   test_readonly_32(gic_dist_base + GICD_TYPER, false));
>> +report("GICD_IIDR is read-only",
>> +   test_readonly_32(gic_dist_base + GICD_IIDR, false));
>> +
>> +reg = readl(idreg);
>> +report("ICPIDR2 is read-only (0x%x)",
>> +   test_readonly_32(idreg, false),
>> +   reg);
>> +
>> +return 0;
> 
> You may want %08x for all your register printing.
> 
> Since you either abort or always return success, then this function can be
> a void.
> 
>> +}
>> +
>>  int main(int argc, char **argv)
>>  {
>>  char pfx[8];
>> @@ -332,6 +425,12 @@ int main(int argc, char **argv)
>>  }
>>  ipi_test();
>>  
>> +} else if (!strcmp(argv[1], "mmio")) {
>> +report_prefix_push(argv[1]);
>> +
>> +gic_test_mmio(gic_version());
> 
> Any reason to pass gic_version() here instead of just using it
> in gic_test_mmio?

Not really, I originally wanted to pass this variable on in a clean
fashion to allow sharing tests.
But using the function shouldn't make any difference anymore, so I can
easily replace it.

"Yes,

Re: [Qemu-devel] [kvm-unit-tests PATCH 2/4] arm/arm64: GICv2: add GICD_IPRIORITYR testing

2016-11-18 Thread Andrew Jones
On Thu, Nov 17, 2016 at 05:57:50PM +, Andre Przywara wrote:
> Some tests for the IPRIORITY registers. The significant number of bits
> is IMPLEMENTATION DEFINED, but should be the same for every IRQ.
> Also these registers must be byte-accessible.
> Check that accesses beyond the implemented IRQ limit are actually
> read-as-zero/write-ignore.
> 
> Signed-off-by: Andre Przywara 
> ---
>  arm/gic.c | 72 
> +++
>  1 file changed, 72 insertions(+)
> 
> diff --git a/arm/gic.c b/arm/gic.c
> index ba2585b..a27da2c 100644
> --- a/arm/gic.c
> +++ b/arm/gic.c
> @@ -327,6 +327,76 @@ static bool test_typer_v2(uint32_t reg)
>   return true;
>  }
>  
> +#define BYTE(reg32, byte) (((reg32) >> ((byte) * 8)) & 0xff)
> +#define REPLACE_BYTE(reg32, byte, new) (((reg32) & ~(0xff << ((byte) * 8))) 
> |\
> + ((new) << ((byte) * 8)))
> +
> +static bool test_priorities(int nr_irqs, void *priptr)
> +{
> + u32 orig_prio, reg, pri_bits;
> + u32 pri_mask, pattern;
> +
> + orig_prio = readl(priptr + 32);
> + report_prefix_push("IPRIORITYR");
> +
> + /*
> +  * Determine implemented number of priority bits by writing all 1's
> +  * and checking the number of cleared bits in the value read back.
> +  */
> + writel(0xffffffff, priptr + 32);
> + pri_mask = readl(priptr + 32);
> +
> + reg = ~pri_mask;
> + report("consistent priority masking (0x%08x)",
> +(((reg >> 16) == (reg & 0x)) &&
> + ((reg & 0xff) == ((reg >> 8) & 0xff))), pri_mask);
> +
> + reg = reg & 0xff;
> + for (pri_bits = 8; reg & 1; reg >>= 1, pri_bits--)
> + ;
> + report("implements at least 4 priority bits (%d)",
> +pri_bits >= 4, pri_bits);
> +
> + pattern = 0;
> + writel(pattern, priptr + 32);
> + report("clearing priorities", readl(priptr + 32) == pattern);
> +
> + pattern = 0x;
> + writel(pattern, priptr + 32);
> + report("filling priorities",
> +readl(priptr + 32) == (pattern & pri_mask));
> +
> + report("accesses beyond limit RAZ/WI",
> +test_readonly_32(priptr + nr_irqs, true));
> +
> + writel(pattern, priptr + nr_irqs - 4);
> + report("accessing last SPIs",
> +readl(priptr + nr_irqs - 4) == (pattern & pri_mask));
> +
> + pattern = 0xff7fbf3f;
> + writel(pattern, priptr + 32);
> + report("priorities are preserved",
> +readl(priptr + 32) == (pattern & pri_mask));
> +
> + /*
> +  * The PRIORITY registers are byte accessible, do a byte-wide
> +  * read and write of known content to check for this.
> +  */
> + reg = readb(priptr + 33);
> + report("byte reads successful (0x%08x => 0x%02x)",
> +reg == (BYTE(pattern, 1) & pri_mask), pattern & pri_mask, reg);
> +
> + pattern = REPLACE_BYTE(pattern, 2, 0x1f);
> + writeb(BYTE(pattern, 2), priptr + 34);
> + reg = readl(priptr + 32);
> + report("byte writes successful (0x%02x => 0x%08x)",
> +reg == (pattern & pri_mask), BYTE(pattern, 2) & pri_mask, reg);
> +
> + report_prefix_pop();
> + writel(orig_prio, priptr + 32);
> + return true;

Might be nice to have FIRST_SPI and maybe LAST_SPI macros to avoid all
the +32's

This function always returns true, so it can be a void.

> +}
> +
>  static int gic_test_mmio(int gic_version)
>  {
>   u32 reg;
> @@ -364,6 +434,8 @@ static int gic_test_mmio(int gic_version)
>  test_readonly_32(idreg, false),
>  reg);
>  
> + test_priorities(nr_irqs, gic_dist_base + GICD_IPRIORITYR);

Feel free to add state like nr_irqs and dist_base to the gic struct
defined in arm/gic.c. That struct should provide the abstraction
needed to handle both gicv2 and gicv3 and contain anything that the
test functions need to refer to frequently. Using it should help
reduce the amount of parameters passed around.

> +
>   return 0;
>  }
>  
> -- 
> 2.9.0

Otherwise looks good to me

Reviewed-by: Andrew Jones 
> 
> 



Re: [Qemu-devel] [PATCH] xen_disk: convert discard input to byte ranges

2016-11-18 Thread Eric Blake
On 11/18/2016 04:24 AM, Olaf Hering wrote:
> The guest sends discard requests as u64 sector/count pairs, but the
> block layer operates internally with s64/s32 pairs. The conversion
> leads to IO errors in the guest, the discard request is not processed.

Doesn't the block layer already split discard requests into 2^31 byte
chunks?

> 
>   domU.cfg:
>   'vdev=xvda, format=qcow2, backendtype=qdisk, target=/x.qcow2'
>   domU:
>   mkfs.ext4 -F /dev/xvda
>   Discarding device blocks: failed - Input/output error
> 
> Fix this by splitting the request into chunks of BDRV_REQUEST_MAX_SECTORS.
> Add input range checking to avoid overflow.
> 
> Signed-off-by: Olaf Hering 
> ---
>  hw/block/xen_disk.c | 45 +++--
>  1 file changed, 39 insertions(+), 6 deletions(-)
> 

> @@ -708,12 +743,10 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
>  break;
>  case BLKIF_OP_DISCARD:
>  {
> -struct blkif_request_discard *discard_req = (void *)&ioreq->req;
> -ioreq->aio_inflight++;
> -blk_aio_pdiscard(blkdev->blk,

That is, blk_aio_pdiscard() calls into bdrv_co_pdiscard() which is
supposed to be fragmenting things as needed.  Can you trace what is
going wrong there?  You shouldn't have to reimplement fragmentation if
the block layer is doing it correctly.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH v7 RFC] block/vxhs: Initial commit to add Veritas HyperScale VxHS block device support

2016-11-18 Thread Daniel P. Berrange
On Fri, Nov 18, 2016 at 01:25:43PM +, Ketan Nilangekar wrote:
> 
> 
> > On Nov 18, 2016, at 5:25 PM, Daniel P. Berrange  wrote:
> > 
> >> On Fri, Nov 18, 2016 at 11:36:02AM +, Ketan Nilangekar wrote:
> >> 
> >> 
> >> 
> >> 
> >> 
> >>> On 11/18/16, 3:32 PM, "Stefan Hajnoczi"  wrote:
> >>> 
>  On Fri, Nov 18, 2016 at 02:26:21AM -0500, Jeff Cody wrote:
>  * Daniel pointed out that there is no authentication method for talking 
>  to a
>   remote server.  This seems a bit scary.  Maybe all that is needed here 
>  is
>   some clarification of the security scheme for authentication?  My
>   impression from above is that you are relying on the networks being
>   private to provide some sort of implicit authentication, though, and 
>  this
>   seems fragile (and doesn't protect against a compromised guest or other
>   process on the server, for one).
> >>> 
> >>> Exactly, from the QEMU trust model you must assume that QEMU has been
> >>> compromised by the guest.  The escaped guest can connect to the VxHS
> >>> server since it controls the QEMU process.
> >>> 
> >>> An escaped guest must not have access to other guests' volumes.
> >>> Therefore authentication is necessary.
> >> 
> >> Just so I am clear on this, how will such an escaped guest get to know
> >> the other guest vdisk IDs?
> > 
> > There can be multiple approaches depending on the deployment scenario.
> > At the very simplest it could directly read the IDs out of the libvirt
> > XML files in /var/run/libvirt. Or it can run "ps" to list other running
> > QEMU processes and see the vdisk IDs in the command line args of those
> > processes. Or the mgmt app may be creating vdisk IDs based on some
> > particular scheme, and the attacker may have info about this which lets
> > them determine likely IDs.  Or the QEMU may have previously been
> > permitted to use the disk and remembered the ID for use later
> > after access to the disk has been removed.
> > 
> 
> Are we talking about a compromised guest here or compromised hypervisor?
> How will a compromised guest read the xml file or list running qemu
> processes?

Compromised QEMU process, aka hypervisor userspace


Regards,
Daniel
-- 
|: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org  -o- http://virt-manager.org :|
|: http://entangle-photo.org   -o-http://search.cpan.org/~danberr/ :|



[Qemu-devel] [RFC Design Doc v2] Enable Shared Virtual Memory feature in pass-through scenarios

2016-11-18 Thread Liu, Yi L
What's changed from v1:
a) Solution changed: 
Switched to extending the existing QEMU vIOMMU emulator to support virtual SVM
for pass-through devices. Make use of VFIO to do data propagation.

b) A SVM Virtualization Architecture diagram is added.

Content
===
1. Feature description
2. Why use it?
3. How to enable it
4. How to test

Details
===
1. Feature description
Shared virtual memory (SVM) lets an application program share its virtual
address space with SVM-capable devices. The feature in this design lets
application programs (running in a guest) share their virtual address space
with assigned devices (e.g. graphics processors or accelerators).

For SVM details, you may want to refer to section 2.5.1.1 of the Intel VT-d
spec and section 5.6 of the OpenCL spec. For details about SVM address
translation, please refer to section 3 of the Intel VT-d spec.
You are also welcome to discuss it directly in this thread.

Link to related specs:
http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf
https://www.khronos.org/registry/cl/specs/opencl-2.0.pdf


2. Why use it?
It is common to pass devices through to a guest and expect to achieve
performance as close as possible to that on the host. With this feature
enabled, the application programs in the guest would be able to pass data
structures to assigned devices without unnecessary overhead.


3. How to enable it
There is an existing IOMMU emulator in host user space (QEMU). The solution here
is to extend the IOMMU emulator to support SVM for assigned devices. So far, the
vIOMMU exposed to the guest is only for emulated devices. This design focuses
on virtual SVM for assigned devices. Virtual IOVA and virtual interrupt
remapping are not included here.

The enabling work would include the following items.

a) IOMMU Register Access Emulation
Already exists in QEMU; needs some extensions to support SVM, e.g. support for
page request service related registers.

b) vIOMMU Capability
Report SVM related capabilities(PASID,PRS,DT etc.) and cache mode in vIOMMU
capability/ex-capability register.

c) QI Handling Emulation
Already exists in QEMU; needs to shadow the QIs related to assigned devices to
the physical IOMMU.
i. ex-context entry cache invalidation(nested mode setting, guest PASID table
pointer shadowing)
ii. 1st level translation cache invalidation
iii. Response for recoverable faults

d) Address translation in virtual SVM
For requests with PASID from an assigned device, the address translation would
be subject to nested mode. For SVM-capable devices which are assigned to a
guest, the corresponding extended context entry should have the NESTE bit set.
The guest PASID table pointer would be shadowed to the ex-context entry. It
would be a GPA, as nested mode is on.

e) Recoverable Address Translation Faults Handling Emulation
These faults are serviced by page requests when the device supports PRS. For
assigned devices, the host IOMMU driver would get page requests from the pIOMMU.
Here, we need a mechanism to drain the page requests from devices which are
assigned to a guest. In this design it would be done through VFIO. Page request
descriptors would be propagated to user space and then exposed to the guest
IOMMU driver. This requires the following support:
i. a mechanism to notify the vIOMMU emulator to fetch a PRQ descriptor
ii. a notification framework in QEMU to trigger the PRQ descriptor fetch when
notified

f) Non-Recoverable Address Translation Handling Emulation
Non-recoverable fault propagation is similar to that of recoverable faults. In
this design, fault data would be propagated to user space (QEMU) through VFIO.
The vIOMMU emulator then emulates the fault, either filling the data into the
vIOMMU fault record registers or into the memory-resident fault log region,
depending on the fault reporting type.

g) SVM Virtualization Architecture
**
 Guest +--+   
+->|   vIOMMU driver  |   
|  +--+   
| | | 
++(1) |(2)  |(3)  
 || | 
*||*|*
 Host User   |V V 
 Space++  
  |   Qemu vIOMMU  |  
  ++  
 |||||
*|||||
 Host Kernel |(1) |(2) |(4) |(5) |(6) 
 Space   |

Re: [Qemu-devel] [PATCH v7 RFC] block/vxhs: Initial commit to add Veritas HyperScale VxHS block device support

2016-11-18 Thread Ketan Nilangekar


> On Nov 18, 2016, at 5:25 PM, Daniel P. Berrange  wrote:
> 
>> On Fri, Nov 18, 2016 at 11:36:02AM +, Ketan Nilangekar wrote:
>> 
>> 
>> 
>> 
>> 
>>> On 11/18/16, 3:32 PM, "Stefan Hajnoczi"  wrote:
>>> 
 On Fri, Nov 18, 2016 at 02:26:21AM -0500, Jeff Cody wrote:
 * Daniel pointed out that there is no authentication method for talking to a
  remote server.  This seems a bit scary.  Maybe all that is needed here is
  some clarification of the security scheme for authentication?  My
  impression from above is that you are relying on the networks being
  private to provide some sort of implicit authentication, though, and this
  seems fragile (and doesn't protect against a compromised guest or other
  process on the server, for one).
>>> 
>>> Exactly, from the QEMU trust model you must assume that QEMU has been
>>> compromised by the guest.  The escaped guest can connect to the VxHS
>>> server since it controls the QEMU process.
>>> 
>>> An escaped guest must not have access to other guests' volumes.
>>> Therefore authentication is necessary.
>> 
>> Just so I am clear on this, how will such an escaped guest get to know
>> the other guest vdisk IDs?
> 
> There can be multiple approaches depending on the deployment scenario.
> At the very simplest it could directly read the IDs out of the libvirt
> XML files in /var/run/libvirt. Or it can run "ps" to list other running
> QEMU processes and see the vdisk IDs in the command line args of those
> processes. Or the mgmt app may be creating vdisk IDs based on some
> particular scheme, and the attacker may have info about this which lets
> them determine likely IDs.  Or the QEMU may have previously been
> permitted to use the disk and remembered the ID for use later
> after access to the disk has been removed.
> 

Are we talking about a compromised guest here or compromised hypervisor? How 
will a compromised guest read the xml file or list running qemu processes?

> IOW, you can't rely on security-through-obscurity of the vdisk IDs
> 
> Regards,
> Daniel
> -- 
> |: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
> |: http://libvirt.org  -o- http://virt-manager.org :|
> |: http://entangle-photo.org   -o-http://search.cpan.org/~danberr/ :|



Re: [Qemu-devel] [kvm-unit-tests PATCH 1/4] arm/arm64: GIC: basic GICv2 MMIO tests

2016-11-18 Thread Andrew Jones
Hi Andre,

I'm so pleased to see this series. Thank you!

On Thu, Nov 17, 2016 at 05:57:49PM +, Andre Przywara wrote:
> This adds an MMIO subtest to the GIC test.
> It accesses some generic GICv2 registers and does some sanity tests,
> like checking for some of them being read-only.
> 
> Signed-off-by: Andre Przywara 
> ---
>  arm/gic.c | 99 
> +++
>  arm/unittests.cfg |  6 
>  lib/arm/asm/gic.h |  2 ++
>  3 files changed, 107 insertions(+)
> 
> diff --git a/arm/gic.c b/arm/gic.c
> index 638b8b1..ba2585b 100644
> --- a/arm/gic.c
> +++ b/arm/gic.c
> @@ -3,6 +3,7 @@
>   *
>   * GICv2
>   *   + test sending/receiving IPIs
> + *   + MMIO access tests
>   * GICv3
>   *   + test sending/receiving IPIs
>   *
> @@ -274,6 +275,98 @@ static struct gic gicv3 = {
>   },
>  };
>  
> +static bool test_ro_pattern_32(void *address, u32 pattern, u32 orig)
> +{
> + u32 reg;
> +
> + writel(pattern, address);
> + reg = readl(address);
> +
> + if (reg != orig)
> + writel(orig, address);
> +
> + return reg == orig;
> +}
> +
> +static bool test_readonly_32(void *address, bool razwi)
> +{
> + u32 orig, pattern;
> +
> + orig = readl(address);
> + if (razwi && orig)
> + return false;
> +
> + pattern = 0x;
> + if (orig != pattern) {
> + if (!test_ro_pattern_32(address, pattern, orig))
> + return false;
> + }
> +
> + pattern = 0xa5a55a5a;
> + if (orig != pattern) {
> + if (!test_ro_pattern_32(address, pattern, orig))
> + return false;
> + }
> +
> + pattern = 0;
> + if (orig != pattern) {
> + if (!test_ro_pattern_32(address, pattern, orig))
> + return false;
> + }
> +
> + return true;
> +}
> +
> +static bool test_typer_v2(uint32_t reg)
> +{
> + int nr_gic_cpus = ((reg >> 5) & 0x7) + 1;
> +
> + report("all %d CPUs have interrupts", nr_cpus == nr_gic_cpus,
> +nr_gic_cpus);
> +
> + return true;

This test function can be a void.

> +}
> +
> +static int gic_test_mmio(int gic_version)
> +{
> + u32 reg;
> + int nr_irqs;
> + void *gic_dist_base, *idreg;
> +
> + switch(gic_version) {
> + case 0x2:
> + gic_dist_base = gicv2_dist_base();
> + idreg = gic_dist_base + 0xfe8;

I see below you introduce GICD_ICPIDR2, so I guess you can use it here.

> + break;
> + case 0x3:
> + report_abort("GICv3 MMIO tests NYI");
> + return -1;

can't reach this return

> + default:
> + report_abort("GIC version %d not supported", gic_version);
> + return 0;

can't reach this return

> + }
> +
> + reg = readl(gic_dist_base + GICD_TYPER);
> + nr_irqs = 32 * ((reg & 0x1f) + 1);

Any reason to avoid using GICD_TYPER_IRQS() here?

> + report("number of implemented SPIs: %d", 1, nr_irqs - 32);

We usually just use printf for informational output (but we should
probably add a 'report_info' in order to keep the prefixes. I can
do that now.) Anyway, please s/1/true

> +
> + test_typer_v2(reg);
> +
> + report("IIDR: 0x%x", 1, readl(gic_dist_base + GICD_IIDR));
> +
> + report("GICD_TYPER is read-only",
> +test_readonly_32(gic_dist_base + GICD_TYPER, false));
> + report("GICD_IIDR is read-only",
> +test_readonly_32(gic_dist_base + GICD_IIDR, false));
> +
> + reg = readl(idreg);
> + report("ICPIDR2 is read-only (0x%x)",
> +test_readonly_32(idreg, false),
> +reg);
> +
> + return 0;

You may want %08x for all your register printing.

Since you either abort or always return success, then this function can be
a void.

> +}
> +
>  int main(int argc, char **argv)
>  {
>   char pfx[8];
> @@ -332,6 +425,12 @@ int main(int argc, char **argv)
>   }
>   ipi_test();
>  
> + } else if (!strcmp(argv[1], "mmio")) {
> + report_prefix_push(argv[1]);
> +
> + gic_test_mmio(gic_version());

Any reason to pass gic_version() here instead of just using it
in gic_test_mmio?

> +
> + report_prefix_pop();
>   } else {
>   report_abort("Unknown subtest '%s'", argv[1]);
>   }
> diff --git a/arm/unittests.cfg b/arm/unittests.cfg
> index c7392c7..0162e5a 100644
> --- a/arm/unittests.cfg
> +++ b/arm/unittests.cfg
> @@ -67,6 +67,12 @@ smp = $((($MAX_SMP < 8)?$MAX_SMP:8))
>  extra_params = -machine gic-version=2 -append 'ipi'
>  groups = gic
>  
> +[gicv2-mmio]
> +file = gic.flat
> +smp = $((($MAX_SMP < 8)?$MAX_SMP:8))
> +extra_params = -machine gic-version=2 -append 'mmio'
> +groups = gic
> +
>  [gicv3-ipi]
>  file = gic.flat
>  smp = $MAX_SMP
> diff --git a/lib/arm/asm/gic.h b/lib/arm/asm/gic.h
> index c2267b6..cef748d 100644
> --- a/lib/arm/asm/gic.h
> +++ b/lib/arm/asm/gic.h
> @@ -10,10 +10,12 @@
>  /* Distributor registers */

Re: [Qemu-devel] [PATCH] tests/postcopy: Use KVM on ppc64 only if it is KVM-HV

2016-11-18 Thread Laurent Vivier


On 18/11/2016 13:53, Greg Kurz wrote:
> Hi Laurent,

Hi Greg,

> On Thu, 17 Nov 2016 21:22:33 +0100
> Laurent Vivier  wrote:
> 
>> On 16/11/2016 15:17, Greg Kurz wrote:
>>> On Wed, 16 Nov 2016 14:17:47 +0100
>>> Thomas Huth  wrote:
>>>   
 On 16.11.2016 13:37, Greg Kurz wrote:  
> On Wed, 16 Nov 2016 12:24:50 +
> "Dr. David Alan Gilbert"  wrote:
> 
>> * Greg Kurz (gr...@kaod.org) wrote:
>>> On Wed, 16 Nov 2016 09:39:31 +0100
>>> Thomas Huth  wrote:
>>>   
 The ppc64 postcopy test does not work with KVM-PR, and it is also
 causing annoying warning messages when run on a x86 host. So let's
 use KVM here only if we know that we're running with KVM-HV (which
 automatically also means that we're running on a ppc64 host), and
 fall back to TCG otherwise.
   
>> [..]
>>> The changes to the code look ok and I prefer to spend time chasing the
>>> KVM PR issue rather than arguing on a comment...  
>>
>> For the problem itself, it seems to appear only after a
>> BOOK3S_INTERRUPT_SYSCALL interrupt for an KVM_EXIT_PAPR_HCALL
>> (H_PUT_TERM_CHAR). In this case, KVM has to exit to QEMU to manage the
>> output. The following interrupt is always an BOOK3S_INTERRUPT_PROGRAM
>> with an emulation failure.
>>
> 
> Which specific problem are you referring to ? 
..
> 2) "Unexpected 32 on dest_serial serial" accompanied by the following in dmesg
> 
> [131613.428616] Couldn't emulate instruction 0x (op 0 xop 0)
> [131613.503515] kvmppc_handle_exit_pr: emulation at d8 failed ()

This one, with the test running on a bare-metal PowerMac G5 (F24), 4.9.0-rc5
kernel. This is the only error I get, and I get it every time.

Laurent




Re: [Qemu-devel] [PATCH] add migration capability to bypass the shared memory

2016-11-18 Thread Alexander Graf

Juan,

It looks like Lai is waiting for a reply from you on this email :).


Alex

On 08/30/2016 06:11 AM, Lai Jiangshan wrote:

On Wed, Aug 10, 2016 at 5:03 PM, Juan Quintela  wrote:

Lai Jiangshan  wrote:

Hi

First of all, I like the patchset a lot, but I would prefer to split it
to find "possible" bugs along the way, especially in postcopy, but not only there.

Hello, thanks for review and comments

I tried to make the patch be sane and tight.
I don't see any strong reason to split it without complicating the patch.


[very nice description of the patch]

Nothing to say about the QMP and shared memory detection, looks correct
to me.


diff --git a/migration/ram.c b/migration/ram.c
index 815bc0e..880972d 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -605,6 +605,28 @@ static void migration_bitmap_sync_init(void)
  num_dirty_pages_period = 0;
  xbzrle_cache_miss_prev = 0;
  iterations_prev = 0;
+migration_dirty_pages = 0;
+}
+
+static void migration_bitmap_init(unsigned long *bitmap)
+{
+RAMBlock *block;
+
+bitmap_clear(bitmap, 0, last_ram_offset() >> TARGET_PAGE_BITS);
+rcu_read_lock();
+QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+if (!migrate_bypass_shared_memory() || !qemu_ram_is_shared(block)) {
+bitmap_set(bitmap, block->offset >> TARGET_PAGE_BITS,
+   block->used_length >> TARGET_PAGE_BITS);
+
+/*
+ * Count the total number of pages used by ram blocks not including
+ * any gaps due to alignment or unplugs.
+ */
+ migration_dirty_pages += block->used_length >> TARGET_PAGE_BITS;
+ }
+}
+rcu_read_unlock();
  }

We can split this function in a different patch.

It calls the new function migrate_bypass_shared_memory(),
so it is not a good idea to split it out.


I haven't fully checked
whether we care about taking the rcu lock here.  The thing that I am more
interested in is knowing what happens when we don't set
migration_dirty_pages to the full number of "possible" memory pages.

I haven't tested it with postcopy; I don't know how to use postcopy.
From my review I can't find any obvious bugs related to it.

I don't think there is any good reason to use migrate_bypass
and postcopy together.  I can disable migrate_bypass
when postcopy==true if you want.


Once here, should we check for ROM regions?

BTW, couldn't we use:

int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
{
 RAMBlock *block;
 int ret = 0;

 rcu_read_lock();
 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 ret = func(block->idstr, block->host, block->offset,
block->used_length, opaque);
 if (ret) {
 break;
 }
 }
 rcu_read_unlock();
 return ret;
}


The patch introduces only one "QLIST_FOREACH_RCU(ram_list.blocks)"
but
# git grep 'QLIST_FOREACH_RCU.*ram_list'  | wc -l
#   16

I don't want to introduce qemu_ram_foreach_block()
and touch another 15 places.
I hope someone does it after this is merged.





  static void migration_bitmap_sync(void)
@@ -631,7 +653,9 @@ static void migration_bitmap_sync(void)
  qemu_mutex_lock(&migration_bitmap_mutex);
  rcu_read_lock();
  QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
-migration_bitmap_sync_range(block->offset, block->used_length);
+if (!migrate_bypass_shared_memory() || !qemu_ram_is_shared(block)) {
+migration_bitmap_sync_range(block->offset, block->used_length);
+}
  }
  rcu_read_unlock();
  qemu_mutex_unlock(&migration_bitmap_mutex);

Oops, another place where we were not using qemu_ram_foreach_block :p



@@ -1926,19 +1950,14 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
  ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
  migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
  migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
-bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
+migration_bitmap_init(migration_bitmap_rcu->bmap);

  if (migrate_postcopy_ram()) {
  migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
-bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
+bitmap_copy(migration_bitmap_rcu->unsentmap,
+ migration_bitmap_rcu->bmap, ram_bitmap_pages);
  }

I think that if we go this route, we should move the whole if inside the
migration_bitmap_init?

good! I will do it when I update the patch.

Thanks,
Lai


-/*
- * Count the total number of pages used by ram blocks not including any
- * gaps due to alignment or unplugs.
- */
-migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
-
  memory_global_dirty_log_start();
  migration_bitmap_sync();
  qemu_mutex_unlock_ramlist();


As said, very happy with the patch.  And it got much simpler than I
would have expected.

Thanks, Juan.





Re: [Qemu-devel] [PATCH] tests/postcopy: Use KVM on ppc64 only if it is KVM-HV

2016-11-18 Thread Greg Kurz
Hi Laurent,

On Thu, 17 Nov 2016 21:22:33 +0100
Laurent Vivier  wrote:

> On 16/11/2016 15:17, Greg Kurz wrote:
> > On Wed, 16 Nov 2016 14:17:47 +0100
> > Thomas Huth  wrote:
> >   
> >> On 16.11.2016 13:37, Greg Kurz wrote:  
> >>> On Wed, 16 Nov 2016 12:24:50 +
> >>> "Dr. David Alan Gilbert"  wrote:
> >>> 
>  * Greg Kurz (gr...@kaod.org) wrote:
> > On Wed, 16 Nov 2016 09:39:31 +0100
> > Thomas Huth  wrote:
> >   
> >> The ppc64 postcopy test does not work with KVM-PR, and it is also
> >> causing annoying warning messages when run on a x86 host. So let's
> >> use KVM here only if we know that we're running with KVM-HV (which
> >> automatically also means that we're running on a ppc64 host), and
> >> fall back to TCG otherwise.
> >>   
> [..]
> > The changes to the code look ok and I prefer to spend time chasing the
> > KVM PR issue rather than arguing on a comment...  
> 
> For the problem itself, it seems to appear only after a
> BOOK3S_INTERRUPT_SYSCALL interrupt for an KVM_EXIT_PAPR_HCALL
> (H_PUT_TERM_CHAR). In this case, KVM has to exit to QEMU to manage the
> output. The following interrupt is always an BOOK3S_INTERRUPT_PROGRAM
> with an emulation failure.
> 

Which specific problem are you referring to ? 

On my side, when running postcopy-test in a nested guest, I hit one of the
three following issues (in decreasing order of probability of occurrence):

1) "Memory content inconsistency at ..." like Stefan

2) "Unexpected 32 on dest_serial serial" accompanied by the following in dmesg

[131613.428616] Couldn't emulate instruction 0x (op 0 xop 0)
[131613.503515] kvmppc_handle_exit_pr: emulation at d8 failed ()

3) hang because the destination QEMU is looping on:

ioctl(19, KVM_RUN, 0)   = 2 (RESUME_HOST)


Host runs OpenPower HostOS (kernel 4.9, QEMU 2.7) and guest runs fedora25.

Cheers.

--
Greg

> Laurent




Re: [Qemu-devel] [PATCH v5 16/17] docs: add qemu logo

2016-11-18 Thread Markus Armbruster
Marc-André Lureau  writes:

> The pdf (needed by texi2pdf for vectorized images) was generated thanks
> to inkscape, from the pc-bios/qemu_logo.svg file.

Since we don't want to build-depend on inkscape, we need to commit the
.pdf.  Not that bad, as the .svg is unlikely to change.  But let's
explain that in the commit message.  Recommend to throw in the inkscape
command line.

>
> Signed-off-by: Marc-André Lureau 

I'm afraid this fails when build tree != source tree.  I get

$ make pdf
  GEN docs/qemu-qmp-ref.pdf
/usr/bin/texi2dvi: pdfetex exited with bad status, quitting.
Makefile:558: recipe for target 'docs/qemu-qmp-ref.pdf' failed
make: *** [docs/qemu-qmp-ref.pdf] Error 1
$ tail qemu-qmp-ref.log 
  @let @centersub @relax 
l.12 @center @image{docs/qemu_logo}

@Texinfo supports .png, .jpg, .jpeg, and .pdf images with PDF output, and 
none 
of those formats could be found. (.eps cannot be supported due to the 
design of
 the PDF format; use regular TeX (DVI output) for that.) 


!pdfTeX error: pdfetex (file docs/qemu_logo.): cannot find image file
 ==> Fatal error occurred, no output PDF file produced!

make dvi fails for me, too.

> ---
>  docs/qemu-ga-ref.texi  |   4 
>  docs/qemu-qmp-ref.texi |   4 
>  docs/qemu_logo.pdf | Bin 0 -> 9117 bytes
>  3 files changed, 8 insertions(+)
>  create mode 100644 docs/qemu_logo.pdf
>
> diff --git a/docs/qemu-ga-ref.texi b/docs/qemu-ga-ref.texi
> index 02ecdb7..f7ed73e 100644
> --- a/docs/qemu-ga-ref.texi
> +++ b/docs/qemu-ga-ref.texi
> @@ -6,6 +6,10 @@
>  
>  @settitle QEMU Guest Agent Protocol Reference
>  
> +@iftex
> +@center @image{docs/qemu_logo}
> +@end iftex
> +
>  @copying
>  This is the QEMU Guest Agent Protocol reference manual.
>  
> diff --git a/docs/qemu-qmp-ref.texi b/docs/qemu-qmp-ref.texi
> index ccc03cb..0f7e9e4 100644
> --- a/docs/qemu-qmp-ref.texi
> +++ b/docs/qemu-qmp-ref.texi
> @@ -6,6 +6,10 @@
>  
>  @settitle QEMU QMP Reference Manual
>  
> +@iftex
> +@center @image{docs/qemu_logo}
> +@end iftex
> +

Quoting the Texinfo manual:

Here is the synopsis of the '@image' command:

 @image{FILENAME[, WIDTH[, HEIGHT[, ALTTEXT[, EXTENSION}

  The FILENAME argument is mandatory, and must not have an extension,
because the different processors support different formats:

   * TeX (DVI output) reads the file 'FILENAME.eps' (Encapsulated
 PostScript format).

   * pdfTeX reads 'FILENAME.pdf', 'FILENAME.png', 'FILENAME.jpg', or
 'FILENAME.jpeg' (in that order).  It also tries uppercase versions
 of the extensions.  The PDF format does not support EPS images, so
 such must be converted first.

   * For Info, 'makeinfo' includes 'FILENAME.txt' verbatim (more or less
 as if it were in '@verbatim').  The Info output may also include a
 reference to 'FILENAME.png' or 'FILENAME.jpg'.  (See below.)

   * For HTML, 'makeinfo' outputs a reference to 'FILENAME.png',
 'FILENAME.jpg', 'FILENAME.jpeg' or 'FILENAME.gif' (in that order).
 If none of those exist, it gives an error, and outputs a reference
 to 'FILENAME.jpg' anyway.

   * For Docbook, 'makeinfo' outputs references to 'FILENAME.eps',
 'FILENAME.gif' 'FILENAME.jpeg', 'FILENAME.jpg', 'FILENAME.pdf',
 'FILENAME.png' and 'FILENAME.svg', for every file found.  Also,
 'FILENAME.txt' is included verbatim, if present.  (The subsequent
 Docbook processor is supposed to choose the appropriate one.)

   * For Info and HTML output, 'makeinfo' uses the optional fifth
 argument EXTENSION to '@image' for the filename extension, if it is
 specified and the file is found.  Any leading period should be
 included in EXTENSION.  For example:

  @image{foo.xpm}

  If you want to install image files for use by Info readers too, we
recommend putting them in a subdirectory like 'FOO-figures' for a
package FOO.  Copying the files into '$(infodir)/FOO-figures/' should be
done in your 'Makefile'.

End quote.

You provide qemu_logo.pdf, which satisfies pdfTeX (our make target pdf)
and Docbook (we don't care).  Should we provide files to satisfy Info
and HTML?

In some cases, my version of texinfo seems to ignore the @image when it can't
find any file.  That's why make info and make html still work.

>  @copying
>  This is the QEMU QMP reference manual.
>  
> diff --git a/docs/qemu_logo.pdf b/docs/qemu_logo.pdf
> new file mode 100644
> index 
> ..294cb7dec50de73c786925671300fb0abdf9d641
> GIT binary patch
[...]



Re: [Qemu-devel] [PATCH 16/25] tcg/i386: Handle ctz and clz opcodes

2016-11-18 Thread Bastian Koppelmann
On 11/18/2016 12:03 AM, Richard Henderson wrote:
> On 11/17/2016 11:09 PM, Bastian Koppelmann wrote:
>> On 11/17/2016 08:59 PM, Richard Henderson wrote:
>>> On 11/17/2016 08:53 PM, Richard Henderson wrote:
 On 11/17/2016 05:50 PM, Bastian Koppelmann wrote:
> On 11/16/2016 08:25 PM, Richard Henderson wrote:
>> +
>> +OP_32_64(clz):
>> +if (const_args[2]) {
>> +tcg_debug_assert(have_bmi1);
>> +tcg_debug_assert(args[2] == (rexw ? 64 : 32));
>> +tcg_out_modrm(s, OPC_LZCNT + rexw, args[0], args[1]);
>> +} else {
>> +/* ??? See above.  */
>> +tcg_out_modrm(s, OPC_BSR + rexw, args[0], args[1]);
>
> The Intel ISA manual states that it finds the bit index of the most
> significant set bit, where the least significant bit is index 0. So for
> the input 0x2 this should return 1. However, this is not the number of
> leading zeros.

 Oh, of course you're right.  I thought I was testing this, but while
 alpha does
 have this operation, it turns out it isn't used much.
>>>
>>> Alternately, what I tested was on a haswell machine, which takes the
>>> LZCNT path, which *does* produce the intended results.  Just the BSR
>>> path doesn't.
>>
>> Luckily my old laptop is a Core 2 Duo without LZCNT :)
> 
> Heh.  Well, I've given it another few tests with LZCNT hacked off, and
> with i686 32-bit.  Here's an incremental update.  Wherein I also note
> that lzcnt isn't in the same cpuid flag as tzcnt.  Double whoops.

My processor[1] seems to lie about the LZCNT cpuid flag. It says it has
LZCNT but executes it as BSR. According to [2], the ABM flag is used to
indicate LZCNT support.
Cheers,
Bastian


[1]
$ cat /proc/cpuinfo
processor   : 0
vendor_id   : GenuineIntel
cpu family  : 6
model   : 23
model name  : Intel(R) Core(TM)2 Duo CPU P8400  @ 2.26GHz
stepping: 10
microcode   : 0xa0b
cpu MHz : 1600.000
cache size  : 3072 KB
physical id : 0
siblings: 2
core id : 0
cpu cores   : 2
apicid  : 0
initial apicid  : 0
fpu : yes
fpu_exception   : yes
cpuid level : 13
wp  : yes
flags   : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov
pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm
constant_tsc arch_perfmon pebs bts rep_good nopl aperfmperf eagerfpu pni
dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 xsave
lahf_lm tpr_shadow vnmi flexpriority dtherm ida
bugs:
bogomips: 4523.35
clflush size: 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:

[2] https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets



  1   2   >