date:20160524

Re: [Qemu-devel] [for-2.7 PATCH v3 00/15] Core based CPU hotplug for PowerPC sPAPR

2016-05-24 Thread Thomas Huth

On 12.05.2016 05:48, Bharata B Rao wrote:
> Hi,
> 
> This is v3 of "Core based CPU hotplug for PowerPC sPAPR". The hotplug
> semantics looks like this:
> 
> (qemu) device_add POWER8E-spapr-cpu-core,id=core2,core=16[,threads=4]
> (qemu) device_add POWER8E_v2.1-spapr-cpu-core,id=core2,core=16[,threads=4]

*ping*

Andreas, could you please, please review this series? At least the
generic patches (patch 1-6 and patch 13-15)?

Thanks!

  Thomas

[Qemu-devel] [PATCH v6 3/5] hw/char: QOM'ify lm32_juart.c

2016-05-24 Thread xiaoqiang zhao

* Drop the old SysBus init function
* Call qemu_chr_add_handlers in the realize callback
* Use qdev chardev prop instead of qemu_char_get_next_serial

Signed-off-by: xiaoqiang zhao 
---
 hw/char/lm32_juart.c  | 17 -
 hw/lm32/lm32.h|  3 ++-
 hw/lm32/lm32_boards.c |  5 +++--
 hw/lm32/milkymist.c   |  2 +-
 4 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/hw/char/lm32_juart.c b/hw/char/lm32_juart.c
index 5bf8acf..28c2cf7 100644
--- a/hw/char/lm32_juart.c
+++ b/hw/char/lm32_juart.c
@@ -114,17 +114,13 @@ static void juart_reset(DeviceState *d)
 s->jrx = 0;
 }
 
-static int lm32_juart_init(SysBusDevice *dev)
+static void lm32_juart_realize(DeviceState *dev, Error **errp)
 {
 LM32JuartState *s = LM32_JUART(dev);
 
-/* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */
-s->chr = qemu_char_get_next_serial();
 if (s->chr) {
 qemu_chr_add_handlers(s->chr, juart_can_rx, juart_rx, juart_event, s);
 }
-
-return 0;
 }
 
 static const VMStateDescription vmstate_lm32_juart = {
@@ -138,16 +134,19 @@ static const VMStateDescription vmstate_lm32_juart = {
 }
 };
 
+static Property lm32_juart_properties[] = {
+DEFINE_PROP_CHR("chardev", LM32JuartState, chr),
+DEFINE_PROP_END_OF_LIST(),
+};
+
 static void lm32_juart_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
-SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
 
-k->init = lm32_juart_init;
 dc->reset = juart_reset;
 dc->vmsd = &vmstate_lm32_juart;
-/* Reason: init() method uses qemu_char_get_next_serial() */
-dc->cannot_instantiate_with_device_add_yet = true;
+dc->props = lm32_juart_properties;
+dc->realize = lm32_juart_realize;
 }
 
 static const TypeInfo lm32_juart_info = {
diff --git a/hw/lm32/lm32.h b/hw/lm32/lm32.h
index 18aa6fd..a993f00 100644
--- a/hw/lm32/lm32.h
+++ b/hw/lm32/lm32.h
@@ -16,11 +16,12 @@ static inline DeviceState *lm32_pic_init(qemu_irq cpu_irq)
 return dev;
 }
 
-static inline DeviceState *lm32_juart_init(void)
+static inline DeviceState *lm32_juart_init(CharDriverState *chr)
 {
 DeviceState *dev;
 
 dev = qdev_create(NULL, TYPE_LM32_JUART);
+qdev_prop_set_chr(dev, "chardev", chr);
 qdev_init_nofail(dev);
 
 return dev;
diff --git a/hw/lm32/lm32_boards.c b/hw/lm32/lm32_boards.c
index c029056..2ae6555 100644
--- a/hw/lm32/lm32_boards.c
+++ b/hw/lm32/lm32_boards.c
@@ -31,6 +31,7 @@
 #include "lm32_hwsetup.h"
 #include "lm32.h"
 #include "exec/address-spaces.h"
+#include "sysemu/sysemu.h"
 
 typedef struct {
 LM32CPU *cpu;
@@ -136,7 +137,7 @@ static void lm32_evr_init(MachineState *machine)
 sysbus_create_simple("lm32-timer", timer1_base, irq[timer1_irq]);
 
 /* make sure juart isn't the first chardev */
-env->juart_state = lm32_juart_init();
+env->juart_state = lm32_juart_init(serial_hds[1]);
 
 reset_info->bootstrap_pc = flash_base;
 
@@ -238,7 +239,7 @@ static void lm32_uclinux_init(MachineState *machine)
 sysbus_create_simple("lm32-timer", timer2_base, irq[timer2_irq]);
 
 /* make sure juart isn't the first chardev */
-env->juart_state = lm32_juart_init();
+env->juart_state = lm32_juart_init(serial_hds[1]);
 
 reset_info->bootstrap_pc = flash_base;
 
diff --git a/hw/lm32/milkymist.c b/hw/lm32/milkymist.c
index 1abdf6e..ebd8a3c 100644
--- a/hw/lm32/milkymist.c
+++ b/hw/lm32/milkymist.c
@@ -175,7 +175,7 @@ milkymist_init(MachineState *machine)
 0x2000, 0x1000, 0x2002, 0x2000);
 
 /* make sure juart isn't the first chardev */
-env->juart_state = lm32_juart_init();
+env->juart_state = lm32_juart_init(serial_hds[1]);
 
 if (kernel_filename) {
 uint64_t entry;
-- 
2.1.4

Re: [Qemu-devel] [QEMU RFC PATCH v2 4/6] Migration: migrate QTAILQ

2016-05-24 Thread David Gibson

On Tue, May 24, 2016 at 10:55:07AM -0700, Jianjun Duan wrote:
> A recursive structure has elements of the same type in itself. Currently
> we cannot directly transfer a QTAILQ instance, or any recursive
> structure such as lists in migration because of the limitation in the
> migration code. Here we introduce a general approach to transfer such
> structures. In our approach a recursive structure is tagged with VMS_CSTM.
> We extend VMStateField with 3 fields: meta_data to store the meta data
> about the recursive structure in question, extend_get to load the structure
> from migration stream to memory, and extend_put to dump the structure into
> the migration stream. This extension mirrors VMStateInfo. We then modify
> vmstate_save_state and vmstate_load_state so that when VMS_CSTM is
> encountered, extend_put and extend_get are called respectively with the
> knowledge of the meta data.

All the talk about recursive structures just obscures the issue.  The
point is you're building a way of migrating QTAILQs - just stick to
that.

> To make it work for QTAILQ in qemu/queue.h, we created the meta data
> format, extend_get and extend_put for it. We will use this approach to
> transfer pending_events and ccs_list in spapr state.
> 
> We also create some macros in qemu/queue.h to get the layout information
> about QTAILQ. This ensures that we do not depend on the implementation
> details about QTAILQ in the migration code.

I'm interested to see what Juan and Dave Gilbert think about this.


> 
> Signed-off-by: Jianjun Duan 
> ---
>  include/migration/vmstate.h | 59 +++
>  include/qemu/queue.h| 38 
>  migration/vmstate.c | 84 
> +
>  3 files changed, 181 insertions(+)
> 
> diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
> index 1622638..bf57b25 100644
> --- a/include/migration/vmstate.h
> +++ b/include/migration/vmstate.h
> @@ -183,6 +183,8 @@ enum VMStateFlags {
>   * to determine the number of entries in the array. Only valid in
>   * combination with one of VMS_VARRAY*. */
>  VMS_MULTIPLY_ELEMENTS = 0x4000,
> +/* For fields which need customized handling, such as QTAILQ in queue.h*/
> +VMS_CSTM= 0x8000,
>  };
>  
>  typedef struct {
> @@ -198,6 +200,14 @@ typedef struct {
>  const VMStateDescription *vmsd;
>  int version_id;
>  bool (*field_exists)(void *opaque, int version_id);
> +/*
> + * Following 3 fields are for VMStateField which needs customized 
> handling,
> + * such as QTAILQ in qemu/queue.h, lists, and tree.
> + */
> +const void *meta_data;
> +int (*extend_get)(QEMUFile *f, const void *metadata, void *opaque);
> +void (*extend_put)(QEMUFile *f, const void *metadata, void *opaque,
> +   QJSON *vmdesc);
>  } VMStateField;
>  
>  struct VMStateDescription {
> @@ -654,6 +664,18 @@ extern const VMStateInfo vmstate_info_bitmap;
>  .offset   = offsetof(_state, _field),\
>  }
>  
> +/* For fields that need customized handling, such as queue, list */
> +#define VMSTATE_CSTM_V(_field, _state, _version, _meta_data, _get, _put)  \
> +{ \
> +.name = (stringify(_field)),  \
> +.version_id   = (_version),   \
> +.flags= VMS_CSTM, \
> +.offset   = offsetof(_state, _field), \
> +.meta_data= &(_meta_data),\
> +.extend_get   = (_get),   \
> +.extend_put   = (_put),   \
> +}
> +
>  /* _f : field name
> _f_n : num of elements field_name
> _n : num of elements
> @@ -970,4 +992,41 @@ int64_t self_announce_delay(int round)
>  
>  void dump_vmstate_json_to_file(FILE *out_fp);
>  
> +
> +/* Meta data for QTAILQ */
> +typedef struct QTAILQMetaData {
> +/* the offset of tqh_first in QTAILQ_HEAD */
> +size_t first;
> +/* the offset of tqh_last in QTAILQ_HEAD */
> +size_t last;
> +/* the offset of tqe_next in QTAILQ_ENTRY */
> +size_t next;
> +/* the offset of tqe_prev in QTAILQ_ENTRY */
> +size_t prev;
> +/* the offset of QTAILQ_ENTRY in a QTAILQ element*/
> +size_t entry;
> +/* size of a QTAILQ element */
> +size_t size;
> +/* vmsd of a QTAILQ element */
> +const VMStateDescription *vmsd;
> +} QTAILQMetaData;
> +
> +#define VMSTATE_QTAILQ_METADATA(_field, _state, _type, _next, _vmsd) { \
> +.first = QTAILQ_FIRST_OFFSET(typeof_field(_state, _field)),\
> +.last =  QTAILQ_LAST_OFFSET(typeof_field(_state, _field)), \
> +.next = QTAILQ_NEXT_OFFSET(_type, _next),  \
> +.prev

[Qemu-devel] [PATCH v6 2/5] hw/char: QOM'ify etraxfs_ser.c

2016-05-24 Thread xiaoqiang zhao

* Drop the old SysBus init function and use instance_init
* Call qemu_chr_add_handlers in the realize callback
* Use qdev chardev prop instead of qemu_char_get_next_serial
* Add etraxfs_ser_create function to create etraxfs serial device

Signed-off-by: xiaoqiang zhao 
---
 hw/char/etraxfs_ser.c | 27 +--
 hw/cris/axis_dev88.c  |  4 ++--
 include/hw/cris/etraxfs.h | 16 
 3 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/hw/char/etraxfs_ser.c b/hw/char/etraxfs_ser.c
index 146b387..04ca04f 100644
--- a/hw/char/etraxfs_ser.c
+++ b/hw/char/etraxfs_ser.c
@@ -159,6 +159,11 @@ static const MemoryRegionOps ser_ops = {
 }
 };
 
+static Property etraxfs_ser_properties[] = {
+DEFINE_PROP_CHR("chardev", ETRAXSerial, chr),
+DEFINE_PROP_END_OF_LIST(),
+};
+
 static void serial_receive(void *opaque, const uint8_t *buf, int size)
 {
 ETRAXSerial *s = opaque;
@@ -209,40 +214,42 @@ static void etraxfs_ser_reset(DeviceState *d)
 
 }
 
-static int etraxfs_ser_init(SysBusDevice *dev)
+static void etraxfs_ser_init(Object *obj)
 {
-ETRAXSerial *s = ETRAX_SERIAL(dev);
+ETRAXSerial *s = ETRAX_SERIAL(obj);
+SysBusDevice *dev = SYS_BUS_DEVICE(obj);
 
 sysbus_init_irq(dev, &s->irq);
-memory_region_init_io(&s->mmio, OBJECT(s), &ser_ops, s,
+memory_region_init_io(&s->mmio, obj, &ser_ops, s,
   "etraxfs-serial", R_MAX * 4);
 sysbus_init_mmio(dev, &s->mmio);
+}
+
+static void etraxfs_ser_realize(DeviceState *dev, Error **errp)
+{
+ETRAXSerial *s = ETRAX_SERIAL(dev);
 
-/* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */
-s->chr = qemu_char_get_next_serial();
 if (s->chr) {
 qemu_chr_add_handlers(s->chr,
   serial_can_receive, serial_receive,
   serial_event, s);
 }
-return 0;
 }
 
 static void etraxfs_ser_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
-SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
 
-k->init = etraxfs_ser_init;
 dc->reset = etraxfs_ser_reset;
-/* Reason: init() method uses qemu_char_get_next_serial() */
-dc->cannot_instantiate_with_device_add_yet = true;
+dc->props = etraxfs_ser_properties;
+dc->realize = etraxfs_ser_realize;
 }
 
 static const TypeInfo etraxfs_ser_info = {
 .name  = TYPE_ETRAX_FS_SERIAL,
 .parent= TYPE_SYS_BUS_DEVICE,
 .instance_size = sizeof(ETRAXSerial),
+.instance_init = etraxfs_ser_init,
 .class_init= etraxfs_ser_class_init,
 };
 
diff --git a/hw/cris/axis_dev88.c b/hw/cris/axis_dev88.c
index 9f58658..60df887 100644
--- a/hw/cris/axis_dev88.c
+++ b/hw/cris/axis_dev88.c
@@ -37,6 +37,7 @@
 #include "sysemu/block-backend.h"
 #include "exec/address-spaces.h"
 #include "sysemu/qtest.h"
+#include "sysemu/sysemu.h"
 
 #define D(x)
 #define DNAND(x)
@@ -341,8 +342,7 @@ void axisdev88_init(MachineState *machine)
 sysbus_create_varargs("etraxfs,timer", 0x3005e000, irq[0x1b], nmi[1], 
NULL);
 
 for (i = 0; i < 4; i++) {
-sysbus_create_simple("etraxfs,serial", 0x30026000 + i * 0x2000,
- irq[0x14 + i]);
+etraxfs_ser_create(0x30026000 + i * 0x2000, irq[0x14 + i], 
serial_hds[i]);
 }
 
 if (kernel_filename) {
diff --git a/include/hw/cris/etraxfs.h b/include/hw/cris/etraxfs.h
index 73a6134..eb66418 100644
--- a/include/hw/cris/etraxfs.h
+++ b/include/hw/cris/etraxfs.h
@@ -46,4 +46,20 @@ etraxfs_eth_init(NICInfo *nd, hwaddr base, int phyaddr,
 return dev;
 }
 
+static inline DeviceState *etraxfs_ser_create(hwaddr addr,
+  qemu_irq irq,
+  CharDriverState *chr)
+{
+DeviceState *dev;
+SysBusDevice *s;
+
+dev = qdev_create(NULL, "etraxfs,serial");
+s = SYS_BUS_DEVICE(dev);
+qdev_prop_set_chr(dev, "chardev", chr);
+qdev_init_nofail(dev);
+sysbus_mmio_map(s, 0, addr);
+sysbus_connect_irq(s, 0, irq);
+return dev;
+}
+
 #endif
-- 
2.1.4

[Qemu-devel] [PATCH v6 0/5] QOM'ify hw/char devices

2016-05-24 Thread xiaoqiang zhao

This patch set trys to QOM'ify hw/char files, see commit messages 
for more details

Thanks Paolo  for your suggestions.

Note:
* CRIS axis_dev88 broad related test is passed and looks ok. 
* lm32 test cases by Michael  is passed and looks good.
* lm32 milkymist need test by Michael.

Changes in v6:
* change lm32_juart_init to accept CharDriverState pointer as a parameter
* QOM'ify milkymist-uart to drop qemu_char_get_next_serial

Changes in v5: 
  drop the call of qemu_char_get_next_serial in board code and 
  direct access serial_hds array

Changes in v4: 
* add wrapper functions to create char device
* drop the sysbus_create_simple function and 
  use the new qdev_create stuff

Changes in v3: 
* use chardev property instead of qemu_char_get_next_serial
* call the functions that touch globals in the realize callback

xiaoqiang zhao (5):
  hw/char: QOM'ify escc.c
  hw/char: QOM'ify etraxfs_ser.c
  hw/char: QOM'ify lm32_juart.c
  hw/char: QOM'ify lm32_uart.c
  hw/char: QOM'ify milkymist-uart.c

 hw/char/escc.c| 30 +++---
 hw/char/etraxfs_ser.c | 27 +--
 hw/char/lm32_juart.c  | 17 -
 hw/char/lm32_uart.c   | 28 +---
 hw/char/milkymist-uart.c  | 10 ++
 hw/cris/axis_dev88.c  |  4 ++--
 hw/lm32/lm32.h| 19 ++-
 hw/lm32/lm32_boards.c |  9 +
 hw/lm32/milkymist-hw.h|  4 +++-
 hw/lm32/milkymist.c   |  4 ++--
 include/hw/cris/etraxfs.h | 16 
 11 files changed, 113 insertions(+), 55 deletions(-)

-- 
2.1.4

Re: [Qemu-devel] [PATCH qemu v16 01/19] vfio: Delay DMA address space listener release

2016-05-24 Thread David Gibson

On Fri, May 13, 2016 at 04:24:53PM -0600, Alex Williamson wrote:
> On Fri, 13 May 2016 17:16:48 +1000
> Alexey Kardashevskiy  wrote:
> 
> > On 05/06/2016 08:39 AM, Alex Williamson wrote:
> > > On Wed,  4 May 2016 16:52:13 +1000
> > > Alexey Kardashevskiy  wrote:
> > >  
> > >> This postpones VFIO container deinitialization to let region_del()
> > >> callbacks (called via vfio_listener_release) do proper clean up
> > >> while the group is still attached to the container.  
> > >
> > > Any mappings within the container should clean themselves up when the
> > > container is deprivleged by removing the last group in the kernel. Is
> > > the issue that that doesn't happen, which would be a spapr vfio kernel
> > > bug, or that our QEMU side structures get all out of whack if we let
> > > that happen?  
> > 
> > My mailbase got corrupted, missed that.
> > 
> > This is mostly for "[PATCH qemu v16 17/19] spapr_iommu, vfio, memory: 
> > Notify IOMMU about starting/stopping being used by VFIO", I should have put 
> > 01/19 and 02/19 right before 17/19, sorry about that.
> 
> Which I object to, it's just ridiculous to have vfio start/stop
> callbacks in a set of generic iommu region ops.

It's ugly, but I don't actually see a better way to do this (the
general concept of having vfio start/stop callbacks, that is, not the
specifics of the patches).

The fact is that how we implement the guest side IOMMU *does* need to
change depending on whether VFIO devices are present or not.  That's
due essentially to incompatibilities between a couple of kernel
mechanisms.  Which in itself is ugly, but nonetheless real.

A (usually blank) vfio on/off callback in the guest side IOMMU ops
seems like the least-bad way to handle this.

> > Every reboot the spapr machine removes all (i.e. one or two) windows and 
> > creates the default one.
> > 
> > I do this by memory_region_del_subregion(iommu_mr) + 
> > memory_region_add_subregion(iommu_mr). Which gets translated to 
> > VFIO_IOMMU_SPAPR_TCE_REMOVE + VFIO_IOMMU_SPAPR_TCE_CREATE via 
> > vfio_memory_listener if there is VFIO; no direct calls from spapr to vfio 
> > => cool. During the machine reset, the VFIO device is there with the   
> > container and groups attached, at some point with no windows.
> > 
> > Now to VFIO plug/unplug.
> > 
> > When VFIO plug happens, vfio_memory_listener is created, region_add() is 
> > called, the hardware window is created (via VFIO_IOMMU_SPAPR_TCE_CREATE).
> > Unplugging should end up doing VFIO_IOMMU_SPAPR_TCE_REMOVE somehow. If 
> > region_del() is not called when the container is being destroyed (as before 
> > this patchset), then the kernel cleans and destroys windows when 
> > close(container->fd) is called or when qemu is killed (and this fd is 
> > naturally closed), I hope this answers the comment from 02/19.
> > 
> > So far so good (right?)
> > 
> > However I also have a guest view of the TCE table, this is what the guest 
> > sees and this is what emulated PCI devices use. This guest view is either 
> > allocated in the KVM (so H_PUT_TCE can be handled quickly right in the host 
> > kernel, even in real mode) or userspace (VFIO case).
> > 
> > I generally want the guest view to be in the KVM. However when I plug VFIO, 
> > I have to move the table to the userspace. When I unplug VFIO, I want to do 
> > the opposite so I need a way to tell spapr that it can move the table. 
> > region_del() seemed a natural way of doing this as region_add() is already 
> > doing the opposite part.
> > 
> > With this patchset, each IOMMU MR gets a usage counter, region_add() does 
> > +1, region_del() does -1 (yeah, not extremely optimal during reset). When 
> > the counter goes from 0 to 1, vfio_start() hook is called, when the counter 
> > becomes 0 - vfio_stop(). Note that we may have multiple VFIO containers on 
> > the same PHB.
> > 
> > Without 01/19 and 02/19, I'll have to repeat region_del()'s counter 
> > decrement steps in vfio_disconnect_container(). And I still cannot move 
> > counting from region_add() to vfio_connect_container() so there will be 
> > asymmetry which I am fine with, I am just checking here - what would be the 
> > best approach here?
> 
> 
> You're imposing on other iommu models (type1) that in order to release
> a container we first deregister the listener, which un-plays all of
> the mappings within that region.  That's inefficient when we can simply
> unset the container and move on.  So you're imposing an inefficiency on
> a separate vfio iommu model for the book keeping of your own.  I don't
> think that's a reasonable approach.  Has it even been testing how that
> affects type1 users?  When a container is closed, clearly it shouldn't
> be contributing to reference counts, so it seems like there must be
> other ways to handle this.

My first guess is to agree, but I'll look at that more carefully when
I actually get to the patch doing that.

What I really don't understand about this one is what the
group<->container conn

[Qemu-devel] [PATCH v6 1/5] hw/char: QOM'ify escc.c

2016-05-24 Thread xiaoqiang zhao

* Drop the old SysBus init function and use instance_init
* Call qemu_chr_add_handlers in the realize callback

Signed-off-by: xiaoqiang zhao 
---
 hw/char/escc.c | 30 +++---
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/hw/char/escc.c b/hw/char/escc.c
index 7bf09a0..8e6a7df 100644
--- a/hw/char/escc.c
+++ b/hw/char/escc.c
@@ -983,9 +983,10 @@ void slavio_serial_ms_kbd_init(hwaddr base, qemu_irq irq,
 sysbus_mmio_map(s, 0, base);
 }
 
-static int escc_init1(SysBusDevice *dev)
+static void escc_init1(Object *obj)
 {
-ESCCState *s = ESCC(dev);
+ESCCState *s = ESCC(obj);
+SysBusDevice *dev = SYS_BUS_DEVICE(obj);
 unsigned int i;
 
 s->chn[0].disabled = s->disabled;
@@ -994,17 +995,26 @@ static int escc_init1(SysBusDevice *dev)
 sysbus_init_irq(dev, &s->chn[i].irq);
 s->chn[i].chn = 1 - i;
 s->chn[i].clock = s->frequency / 2;
-if (s->chn[i].chr) {
-qemu_chr_add_handlers(s->chn[i].chr, serial_can_receive,
-  serial_receive1, serial_event, &s->chn[i]);
-}
 }
 s->chn[0].otherchn = &s->chn[1];
 s->chn[1].otherchn = &s->chn[0];
 
-memory_region_init_io(&s->mmio, OBJECT(s), &escc_mem_ops, s, "escc",
+memory_region_init_io(&s->mmio, obj, &escc_mem_ops, s, "escc",
   ESCC_SIZE << s->it_shift);
 sysbus_init_mmio(dev, &s->mmio);
+}
+
+static void escc_realize(DeviceState *dev, Error **errp)
+{
+ESCCState *s = ESCC(dev);
+unsigned int i;
+
+for (i = 0; i < 2; i++) {
+if (s->chn[i].chr) {
+qemu_chr_add_handlers(s->chn[i].chr, serial_can_receive,
+  serial_receive1, serial_event, &s->chn[i]);
+}
+}
 
 if (s->chn[0].type == mouse) {
 qemu_add_mouse_event_handler(sunmouse_event, &s->chn[0], 0,
@@ -1014,8 +1024,6 @@ static int escc_init1(SysBusDevice *dev)
 s->chn[1].hs = qemu_input_handler_register((DeviceState *)(&s->chn[1]),
&sunkbd_handler);
 }
-
-return 0;
 }
 
 static Property escc_properties[] = {
@@ -1032,10 +1040,9 @@ static Property escc_properties[] = {
 static void escc_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
-SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
 
-k->init = escc_init1;
 dc->reset = escc_reset;
+dc->realize = escc_realize;
 dc->vmsd = &vmstate_escc;
 dc->props = escc_properties;
 set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
@@ -1045,6 +1052,7 @@ static const TypeInfo escc_info = {
 .name  = TYPE_ESCC,
 .parent= TYPE_SYS_BUS_DEVICE,
 .instance_size = sizeof(ESCCState),
+.instance_init = escc_init1,
 .class_init= escc_class_init,
 };
 
-- 
2.1.4

[Qemu-devel] [PATCH v6 4/5] hw/char: QOM'ify lm32_uart.c

2016-05-24 Thread xiaoqiang zhao

* Drop the old SysBus init function and use instance_init
* Call qemu_chr_add_handlers in the realize callback
* Use qdev chardev prop instead of qemu_char_get_next_serial
* Add lm32_uart_create function to create lm32 uart device

Tested-by: Michael Walle 
Acked-by: Michael Walle 
Signed-off-by: xiaoqiang zhao 
---
 hw/char/lm32_uart.c   | 28 +---
 hw/lm32/lm32.h| 16 
 hw/lm32/lm32_boards.c |  4 ++--
 3 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/hw/char/lm32_uart.c b/hw/char/lm32_uart.c
index 036813d..b5c760d 100644
--- a/hw/char/lm32_uart.c
+++ b/hw/char/lm32_uart.c
@@ -249,23 +249,25 @@ static void uart_reset(DeviceState *d)
 s->regs[R_LSR] = LSR_THRE | LSR_TEMT;
 }
 
-static int lm32_uart_init(SysBusDevice *dev)
+static void lm32_uart_init(Object *obj)
 {
-LM32UartState *s = LM32_UART(dev);
+LM32UartState *s = LM32_UART(obj);
+SysBusDevice *dev = SYS_BUS_DEVICE(obj);
 
 sysbus_init_irq(dev, &s->irq);
 
-memory_region_init_io(&s->iomem, OBJECT(s), &uart_ops, s,
+memory_region_init_io(&s->iomem, obj, &uart_ops, s,
   "uart", R_MAX * 4);
 sysbus_init_mmio(dev, &s->iomem);
+}
+
+static void lm32_uart_realize(DeviceState *dev, Error **errp)
+{
+LM32UartState *s = LM32_UART(dev);
 
-/* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */
-s->chr = qemu_char_get_next_serial();
 if (s->chr) {
 qemu_chr_add_handlers(s->chr, uart_can_rx, uart_rx, uart_event, s);
 }
-
-return 0;
 }
 
 static const VMStateDescription vmstate_lm32_uart = {
@@ -278,22 +280,26 @@ static const VMStateDescription vmstate_lm32_uart = {
 }
 };
 
+static Property lm32_uart_properties[] = {
+DEFINE_PROP_CHR("chardev", LM32UartState, chr),
+DEFINE_PROP_END_OF_LIST(),
+};
+
 static void lm32_uart_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
-SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
 
-k->init = lm32_uart_init;
 dc->reset = uart_reset;
 dc->vmsd = &vmstate_lm32_uart;
-/* Reason: init() method uses qemu_char_get_next_serial() */
-dc->cannot_instantiate_with_device_add_yet = true;
+dc->props = lm32_uart_properties;
+dc->realize = lm32_uart_realize;
 }
 
 static const TypeInfo lm32_uart_info = {
 .name  = TYPE_LM32_UART,
 .parent= TYPE_SYS_BUS_DEVICE,
 .instance_size = sizeof(LM32UartState),
+.instance_init = lm32_uart_init,
 .class_init= lm32_uart_class_init,
 };
 
diff --git a/hw/lm32/lm32.h b/hw/lm32/lm32.h
index a993f00..e338bfe 100644
--- a/hw/lm32/lm32.h
+++ b/hw/lm32/lm32.h
@@ -27,4 +27,20 @@ static inline DeviceState *lm32_juart_init(CharDriverState 
*chr)
 return dev;
 }
 
+static inline DeviceState *lm32_uart_create(hwaddr addr,
+qemu_irq irq,
+CharDriverState *chr)
+{
+DeviceState *dev;
+SysBusDevice *s;
+
+dev = qdev_create(NULL, "lm32-uart");
+s = SYS_BUS_DEVICE(dev);
+qdev_prop_set_chr(dev, "chardev", chr);
+qdev_init_nofail(dev);
+sysbus_mmio_map(s, 0, addr);
+sysbus_connect_irq(s, 0, irq);
+return dev;
+}
+
 #endif
diff --git a/hw/lm32/lm32_boards.c b/hw/lm32/lm32_boards.c
index 2ae6555..8f0c307 100644
--- a/hw/lm32/lm32_boards.c
+++ b/hw/lm32/lm32_boards.c
@@ -132,7 +132,7 @@ static void lm32_evr_init(MachineState *machine)
 irq[i] = qdev_get_gpio_in(env->pic_state, i);
 }
 
-sysbus_create_simple("lm32-uart", uart0_base, irq[uart0_irq]);
+lm32_uart_create(uart0_base, irq[uart0_irq], serial_hds[0]);
 sysbus_create_simple("lm32-timer", timer0_base, irq[timer0_irq]);
 sysbus_create_simple("lm32-timer", timer1_base, irq[timer1_irq]);
 
@@ -233,7 +233,7 @@ static void lm32_uclinux_init(MachineState *machine)
 irq[i] = qdev_get_gpio_in(env->pic_state, i);
 }
 
-sysbus_create_simple("lm32-uart", uart0_base, irq[uart0_irq]);
+lm32_uart_create(uart0_base, irq[uart0_irq], serial_hds[0]);
 sysbus_create_simple("lm32-timer", timer0_base, irq[timer0_irq]);
 sysbus_create_simple("lm32-timer", timer1_base, irq[timer1_irq]);
 sysbus_create_simple("lm32-timer", timer2_base, irq[timer2_irq]);
-- 
2.1.4

[Qemu-devel] [PATCH v6 5/5] hw/char: QOM'ify milkymist-uart.c

2016-05-24 Thread xiaoqiang zhao

drop the qemu_char_get_next_serial and use chardev prop instead

Signed-off-by: xiaoqiang zhao 
---
 hw/char/milkymist-uart.c | 10 ++
 hw/lm32/milkymist-hw.h   |  4 +++-
 hw/lm32/milkymist.c  |  2 +-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/hw/char/milkymist-uart.c b/hw/char/milkymist-uart.c
index 03b36b2..72f8484 100644
--- a/hw/char/milkymist-uart.c
+++ b/hw/char/milkymist-uart.c
@@ -200,8 +200,6 @@ static void milkymist_uart_realize(DeviceState *dev, Error 
**errp)
 {
 MilkymistUartState *s = MILKYMIST_UART(dev);
 
-/* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */
-s->chr = qemu_char_get_next_serial();
 if (s->chr) {
 qemu_chr_add_handlers(s->chr, uart_can_rx, uart_rx, uart_event, s);
 }
@@ -229,6 +227,11 @@ static const VMStateDescription vmstate_milkymist_uart = {
 }
 };
 
+static Property milkymist_uart_properties[] = {
+DEFINE_PROP_CHR("chardev", MilkymistUartState, chr),
+DEFINE_PROP_END_OF_LIST(),
+};
+
 static void milkymist_uart_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
@@ -236,8 +239,7 @@ static void milkymist_uart_class_init(ObjectClass *klass, 
void *data)
 dc->realize = milkymist_uart_realize;
 dc->reset = milkymist_uart_reset;
 dc->vmsd = &vmstate_milkymist_uart;
-/* Reason: realize() method uses qemu_char_get_next_serial() */
-dc->cannot_instantiate_with_device_add_yet = true;
+dc->props = milkymist_uart_properties;
 }
 
 static const TypeInfo milkymist_uart_info = {
diff --git a/hw/lm32/milkymist-hw.h b/hw/lm32/milkymist-hw.h
index f857d28..eb6a3a2 100644
--- a/hw/lm32/milkymist-hw.h
+++ b/hw/lm32/milkymist-hw.h
@@ -5,11 +5,13 @@
 #include "net/net.h"
 
 static inline DeviceState *milkymist_uart_create(hwaddr base,
-qemu_irq irq)
+ qemu_irq irq,
+ CharDriverState *chr)
 {
 DeviceState *dev;
 
 dev = qdev_create(NULL, "milkymist-uart");
+qdev_prop_set_chr(dev, "chardev", chr);
 qdev_init_nofail(dev);
 sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base);
 sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, irq);
diff --git a/hw/lm32/milkymist.c b/hw/lm32/milkymist.c
index ebd8a3c..5cae0f1 100644
--- a/hw/lm32/milkymist.c
+++ b/hw/lm32/milkymist.c
@@ -159,7 +159,7 @@ milkymist_init(MachineState *machine)
 }
 g_free(bios_filename);
 
-milkymist_uart_create(0x6000, irq[0]);
+milkymist_uart_create(0x6000, irq[0], serial_hds[0]);
 milkymist_sysctl_create(0x60001000, irq[1], irq[2], irq[3],
 8000, 0x10014d31, 0x041f, 0x0001);
 milkymist_hpdmc_create(0x60002000);
-- 
2.1.4

Re: [Qemu-devel] [PATCH v2 14/14] blockjob: Remove BlockJob.bs

2016-05-24 Thread Changlong Xie


On 05/24/2016 09:47 PM, Kevin Wolf wrote:

There is a single remaining user in qemu-img, which can be trivially
converted to using BlockJob.blk instead.

Signed-off-by: Kevin Wolf 
Reviewed-by: Max Reitz 
---
  blockjob.c   | 1 -
  include/block/blockjob.h | 1 -
  qemu-img.c   | 2 +-
  3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index 2097e1d..c095cc5 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -83,7 +83,6 @@ void *block_job_create(const BlockJobDriver *driver, 
BlockDriverState *bs,

  job->driver= driver;
  job->id= g_strdup(bdrv_get_device_name(bs));
-job->bs= bs;
  job->blk   = blk;
  job->cb= cb;
  job->opaque= opaque;
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 32012af..86d2807 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -82,7 +82,6 @@ struct BlockJob {
  const BlockJobDriver *driver;

  /** The block device on which the job is operating.  */
-BlockDriverState *bs; /* TODO Remove */
  BlockBackend *blk;

  /**
diff --git a/qemu-img.c b/qemu-img.c
index 7ed8ef2..dd2ba0a 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -775,7 +775,7 @@ static void common_block_job_cb(void *opaque, int ret)

  static void run_block_job(BlockJob *job, Error **errp)
  {
-AioContext *aio_context = bdrv_get_aio_context(job->bs);
+AioContext *aio_context = blk_get_aio_context(job->blk);

  do {
  aio_poll(aio_context, true);



Compiled with your block-jobs-bb branch:

changlox ~/w/qemu/qemu% make check-unit -j8 


[ 1 ]  :-(
  CCtests/test-blockjob-txn.o
GTESTER tests/test-x86-cpuid
GTESTER tests/test-cutils
GTESTER tests/test-int128
GTESTER tests/check-qdict
GTESTER tests/check-qfloat
GTESTER tests/check-qint
GTESTER tests/check-qstring
GTESTER tests/check-qlist
GTESTER tests/check-qjson
GTESTER tests/check-qnull
GTESTER tests/test-qmp-output-visitor
GTESTER tests/test-qmp-input-strict
GTESTER tests/test-qmp-input-visitor
GTESTER tests/test-qmp-commands
GTESTER tests/test-string-input-visitor
GTESTER tests/test-string-output-visitor
GTESTER tests/test-qmp-event
GTESTER tests/test-opts-visitor
GTESTER tests/test-coroutine
GTESTER tests/test-visitor-serialization
GTESTER tests/test-iov
GTESTER tests/test-aio
GTESTER tests/test-rfifolock
tests/test-blockjob-txn.c: In function ‘test_block_job_complete’:
tests/test-blockjob-txn.c:33:31: error: ‘BlockJob’ has no member named ‘bs’
 BlockDriverState *bs = job->bs;
   ^
GTESTER tests/test-throttle
GTESTER tests/test-thread-pool
make: *** [tests/test-blockjob-txn.o] Error 1
make: *** Waiting for unfinished jobs

Re: [Qemu-devel] [PATCH 11/12] vfio: register aer resume notification handler for aer resume

2016-05-24 Thread Zhou Jie


Hi, Alex


  3. Stall any access to the device until resume is signaled.


The code below doesn't actually do this, mmaps are disabled, but that
just means any device access will use the slow path through QEMU.  That
path is still fully enabled and so is config space access.

I will modify the code and find other way to
stall any access to the device.


I find that to stall any access to the device,
I need modify the following function.
1. vfio_region_read and vfio_region_write
   For stalling any access to the device bar region.
2. vfio_vga_read and vfio_vga_write
   For stalling any access to the vga device region.
3. vfio_pci_read_config and vfio_pci_write_config
   For stalling any access to the device config space.

What will happen if I don't stall any access to the device?

Sincerely,
Zhou Jie

[Qemu-devel] [PULL v2 6/8] linux-user: remove unavailable syscalls from aarch64

2016-05-24 Thread riku . voipio

From: Riku Voipio 

QEMU lists deprecated system call numbers in for Aarch64. These
are never enabled for Linux kernel, so don't define them in Qemu
either. Remove the ifdef around host_to_target_stat64 since
all architectures need it now.

Reviewed-by: Peter Maydell 
Signed-off-by: Riku Voipio 
---
 linux-user/aarch64/syscall_nr.h | 59 -
 linux-user/syscall.c|  2 --
 2 files changed, 61 deletions(-)

diff --git a/linux-user/aarch64/syscall_nr.h b/linux-user/aarch64/syscall_nr.h
index c8a8599..59511d8 100644
--- a/linux-user/aarch64/syscall_nr.h
+++ b/linux-user/aarch64/syscall_nr.h
@@ -275,62 +275,3 @@
 #define TARGET_NR_mlock2 284
 #define TARGET_NR_copy_file_range 285
 
-#define TARGET_NR_open 1024
-#define TARGET_NR_link 1025
-#define TARGET_NR_unlink 1026
-#define TARGET_NR_mknod 1027
-#define TARGET_NR_chmod 1028
-#define TARGET_NR_chown 1029
-#define TARGET_NR_mkdir 1030
-#define TARGET_NR_rmdir 1031
-#define TARGET_NR_lchown 1032
-#define TARGET_NR_access 1033
-#define TARGET_NR_rename 1034
-#define TARGET_NR_readlink 1035
-#define TARGET_NR_symlink 1036
-#define TARGET_NR_utimes 1037
-#define TARGET_NR_stat 1038
-#define TARGET_NR_lstat 1039
-#define TARGET_NR_pipe 1040
-#define TARGET_NR_dup2 1041
-#define TARGET_NR_epoll_create 1042
-#define TARGET_NR_inotify_init 1043
-#define TARGET_NR_eventfd 1044
-#define TARGET_NR_signalfd 1045
-#define TARGET_NR_sendfile64 1046
-#define TARGET_NR_ftruncate64 1047
-#define TARGET_NR_truncate64 1048
-#define TARGET_NR_stat64 1049
-#define TARGET_NR_lstat64 1050
-#define TARGET_NR_fstat64 1051
-#define TARGET_NR_fcntl64 1052
-/* #define TARGET_NR_fadvise64 1053 */
-#define TARGET_NR_newfstatat 1054
-#define TARGET_NR_fstatfs64 1055
-#define TARGET_NR_statfs64 1056
-#define TARGET_NR_lseek64 1057
-#define TARGET_NR_mmap64 1058
-#define TARGET_NR_alarm 1059
-#define TARGET_NR_getpgrp 1060
-#define TARGET_NR_pause 1061
-#define TARGET_NR_time 1062
-#define TARGET_NR_utime 1063
-#define TARGET_NR_creat 1064
-#define TARGET_NR_getdents 1065
-#define TARGET_NR_futimesat 1066
-#define TARGET_NR_select 1067
-#define TARGET_NR_poll 1068
-#define TARGET_NR_epoll_wait 1069
-#define TARGET_NR_ustat 1070
-#define TARGET_NR_vfork 1071
-#define TARGET_NR_oldwait4 1072
-#define TARGET_NR_recv 1073
-#define TARGET_NR_send 1074
-#define TARGET_NR_bdflush 1075
-#define TARGET_NR_umount 1076
-#define TARGET_NR_uselib 1077
-#define TARGET_NR__sysctl 1078
-#define TARGET_NR_fork 1079
-#define TARGET_NR_syscalls (__NR_fork+1)
-
-#define TARGET_NR_sigreturn 1999
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 8b76169..539183a 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -5231,7 +5231,6 @@ static inline int target_to_host_mlockall_arg(int arg)
 }
 #endif
 
-#if defined(TARGET_NR_stat64) || defined(TARGET_NR_newfstatat)
 static inline abi_long host_to_target_stat64(void *cpu_env,
  abi_ulong target_addr,
  struct stat *host_st)
@@ -5294,7 +5293,6 @@ static inline abi_long host_to_target_stat64(void 
*cpu_env,
 
 return 0;
 }
-#endif
 
 /* ??? Using host futex calls even when target atomic operations
are not really atomic probably breaks things.  However implementing
-- 
2.1.4

[Qemu-devel] [Bug 1585433] Re: if docker-volume-size of baymodel lessthan 3, bay cann't create

2016-05-24 Thread thh

** Project changed: qemu => magnum

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1585433

Title:
  if  docker-volume-size of baymodel lessthan 3, bay cann't create

Status in Magnum:
  New

Bug description:
  magnum is running on centos7,

  
  magnum baymodel-create --name k8sbaymodel5 --image-id 
fedora-23-atomic-20160405 --keypair-id testkey --external-network-id public 
--dns-nameserver 8.8.8.8 --flavor-id m1.small --coe kubernetes 
--docker-volume-size 5

  magnum bay-create --name k8sbay5 --baymodel k8sbaymodel5 --node-count
  1

  Execute the above command can get a completed bay,but when docker-
  volume-size is 1 or 2,the status of bay is FAILED.

To manage notifications about this bug go to:
https://bugs.launchpad.net/magnum/+bug/1585433/+subscriptions

[Qemu-devel] 答复： Re: RFC: Make 'info snapshots' show all of snapshots with multiple devices info

2016-05-24 Thread Lin Ma



>>> Kevin Wolf  2016/5/23 星期一 下午 5:00 >>>
>Am 22.05.2016 um 11:55 hat Lin Ma geschrieben:
>> Currently, the output of 'info snapshots' show fully available snapshots.
>>  
>> In my opinion there are 2 disadvantages:
>> 1. It's opaque, hides some snapshot information to users. It's not convenient
>> if users want to know more about all of snapshots on every block device via
>> monitor.
>>  
>> 2. It uses snapshot id to determine whether the snapshots are 'fully
>> available'.
>> It causes incorrect output in some scenario.
>>  
>> For instance:
>> (qemu) info block
>> drive_image1 (#block113): /opt/vms/SLES12-SP1-JeOS-x86_64-GM/disk0.qcow2
>> (qcow2)
>>   Cache mode:   writeback
>>  
>> drive_image2 (#block349): /opt/vms/SLES12-SP1-JeOS-x86_64-GM/disk1.qcow2
>> (qcow2)
>>   Cache mode:   writeback
>> (qemu)
>> (qemu) info snapshots
>> There is no snapshot available.
>> (qemu)
>> (qemu) snapshot_blkdev_internal drive_image1 snap1
>> (qemu)
>> (qemu) info snapshots
>> There is no suitable snapshot available
>> (qemu)
>> (qemu) savevm checkpoint-1
>> (qemu)
>> (qemu) info snapshots
>> ID   TAG VM SIZEDATE 
>>  VM CLOCK
>> 1 snap1   0 2016-05-22 16:57:31  
>>  00:01:30.567
>> (qemu)
>> 
>> $ qemu-img snapshot -l disk0.qcow2
>> Snapshot list:
>> ID   TAG VM SIZEDATE 
>>  VM CLOCK
>> 1 snap1   0 2016-05-22 16:57:31  
>>  00:01:30.567
>> 2 checkpoint-1  165M 2016-05-22 16:58:07   
>> 00:02:06.813
>> 
>> $ qemu-img snapshot -l disk1.qcow2
>> Snapshot list:
>> ID   TAG VM SIZEDATE 
>>  VM CLOCK
>> 1 checkpoint-1 0 2016-05-22 16:58:07   
>> 00:02:06.813
>>  
>>  
>>  
>> I'd like to patch it to make the output looking like this:
>> (qemu) info snapshots
>>  
>> Snapshot list from drive_image1:
>> ID   TAG VM SIZEDATE 
>>  VM CLOCK
>> 1 snap1   0 2016-05-22 16:57:31  
>>  00:01:30.567
>> 2 checkpoint-1  165M 2016-05-22 16:58:07   
>> 00:02:06.813
>>  
>> 
>> Snapshot list from  drive_image2:
>> ID   TAG VM SIZEDATE 
>>  VM CLOCK
>> 1 checkpoint-1 0 2016-05-22 16:58:07   
>> 00:02:06.813
>
>I think that would clutter the output too much in the common case where
>all images have the same snapshots. How about having a list with all
>loadable snapshots first, and then only an additional list for images
>that have snapshots that aren't present on all images?
>
>(qemu) info snapshots
>List of snapshots present on all disks:
>ID TAG VM SIZEDATE 
> VM CLOCK
>-- checkpoint-1  165M 2016-05-22 16:58:07   
>00:02:06.813
Double short dash, good idea.

>List of partial (non-loadable) snapshots on 'drive_image1':
>ID TAG VM SIZEDATE 
> VM CLOCK
>1   snap1   0 2016-05-22 16:57:31  
> 00:01:30.567
These looks more convenient. I'll do it follow this format, Thanks.
 
 
Lin

[Qemu-devel] [PULL 06/38] linux-user: Support for restarting system calls for x86 targets

2016-05-24 Thread riku . voipio

From: Timothy E Baldwin 

Update the x86 main loop and sigreturn code:
 * on TARGET_ERESTARTSYS, wind guest PC backwards to repeat syscall insn
 * set all guest CPU state within signal.c code rather than passing it
   back out as the "return code" from do_sigreturn()
 * handle TARGET_QEMU_ESIGRETURN in the main loop as the indication
   that the main loop should not touch EAX

Signed-off-by: Timothy Edward Baldwin 
Message-id: 1441497448-32489-5-git-send-email-t.e.baldwi...@members.leeds.ac.uk
Reviewed-by: Peter Maydell 
[PMM: Commit message tweaks; drop TARGET_USE_ERESTARTSYS define]
Signed-off-by: Peter Maydell 
Signed-off-by: Riku Voipio 
---
 linux-user/main.c| 47 +--
 linux-user/signal.c  | 15 +++
 linux-user/syscall.c |  2 --
 3 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/linux-user/main.c b/linux-user/main.c
index ba38aed..57ae76e 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -285,6 +285,7 @@ void cpu_loop(CPUX86State *env)
 CPUState *cs = CPU(x86_env_get_cpu(env));
 int trapnr;
 abi_ulong pc;
+abi_ulong ret;
 target_siginfo_t info;
 
 for(;;) {
@@ -294,28 +295,38 @@ void cpu_loop(CPUX86State *env)
 switch(trapnr) {
 case 0x80:
 /* linux syscall from int $0x80 */
-env->regs[R_EAX] = do_syscall(env,
-  env->regs[R_EAX],
-  env->regs[R_EBX],
-  env->regs[R_ECX],
-  env->regs[R_EDX],
-  env->regs[R_ESI],
-  env->regs[R_EDI],
-  env->regs[R_EBP],
-  0, 0);
+ret = do_syscall(env,
+ env->regs[R_EAX],
+ env->regs[R_EBX],
+ env->regs[R_ECX],
+ env->regs[R_EDX],
+ env->regs[R_ESI],
+ env->regs[R_EDI],
+ env->regs[R_EBP],
+ 0, 0);
+if (ret == -TARGET_ERESTARTSYS) {
+env->eip -= 2;
+} else if (ret != -TARGET_QEMU_ESIGRETURN) {
+env->regs[R_EAX] = ret;
+}
 break;
 #ifndef TARGET_ABI32
 case EXCP_SYSCALL:
 /* linux syscall from syscall instruction */
-env->regs[R_EAX] = do_syscall(env,
-  env->regs[R_EAX],
-  env->regs[R_EDI],
-  env->regs[R_ESI],
-  env->regs[R_EDX],
-  env->regs[10],
-  env->regs[8],
-  env->regs[9],
-  0, 0);
+ret = do_syscall(env,
+ env->regs[R_EAX],
+ env->regs[R_EDI],
+ env->regs[R_ESI],
+ env->regs[R_EDX],
+ env->regs[10],
+ env->regs[8],
+ env->regs[9],
+ 0, 0);
+if (ret == -TARGET_ERESTARTSYS) {
+env->eip -= 2;
+} else if (ret != -TARGET_QEMU_ESIGRETURN) {
+env->regs[R_EAX] = ret;
+}
 break;
 #endif
 case EXCP0B_NOSEG:
diff --git a/linux-user/signal.c b/linux-user/signal.c
index 04c21d0..11ddd05 100644
--- a/linux-user/signal.c
+++ b/linux-user/signal.c
@@ -1024,7 +1024,7 @@ give_sigsegv:
 }
 
 static int
-restore_sigcontext(CPUX86State *env, struct target_sigcontext *sc, int *peax)
+restore_sigcontext(CPUX86State *env, struct target_sigcontext *sc)
 {
 unsigned int err = 0;
 abi_ulong fpstate_addr;
@@ -1042,6 +1042,7 @@ restore_sigcontext(CPUX86State *env, struct 
target_sigcontext *sc, int *peax)
 env->regs[R_EBX] = tswapl(sc->ebx);
 env->regs[R_EDX] = tswapl(sc->edx);
 env->regs[R_ECX] = tswapl(sc->ecx);
+env->regs[R_EAX] = tswapl(sc->eax);
 env->eip = tswapl(sc->eip);
 
 cpu_x86_load_seg(env, R_CS, lduw_p(&sc->cs) | 3);
@@ -1059,7 +1060,6 @@ restore_sigcontext(CPUX86State *env, struct 
target_sigcontext *sc, int *peax)
 cpu_x86_frstor(env, fpstate_addr, 1);
 }
 
-*peax = tswapl(sc->eax);
 return err;
 badframe:
 return 1;
@@ -1071,7 +1071,7 @@ long do_sigreturn(CPUX86State *env)
 abi_ulong frame_addr = env->regs[R_ESP] - 8;
 target_sigset_t target_set;
 sigset_t set;
-int eax, i;
+int i;
 
 trace_user_do_sigreturn(env, frame_addr);
 if (!lock_user_struct(VERIFY_

[Qemu-devel] [PULL v2 4/8] linux-user: Don't assert if guest tries shmdt(0)

2016-05-24 Thread riku . voipio

From: Peter Maydell 

Our implementation of shmat() and shmdt() for linux-user was
using "zero guest address" as its marker for "entry in the
shm_regions[] array is not in use". This meant that if the
guest did a shmdt(0) we would match on an unused array entry
and call page_set_flags() with both start and end addresses zero,
which causes an assertion failure.

Use an explicit in_use flag to manage the shm_regions[] array,
so that we avoid this problem.

Signed-off-by: Peter Maydell 
Reported-by: Pavel Shamis 
Reviewed-by: Laurent Vivier 
Signed-off-by: Riku Voipio 
---
 linux-user/syscall.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index dac5518..8b76169 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -2598,8 +2598,9 @@ static abi_long do_socketcall(int num, abi_ulong vptr)
 #define N_SHM_REGIONS  32
 
 static struct shm_region {
-abi_ulong  start;
-abi_ulong  size;
+abi_ulong start;
+abi_ulong size;
+bool in_use;
 } shm_regions[N_SHM_REGIONS];
 
 struct target_semid_ds
@@ -3291,7 +3292,8 @@ static inline abi_ulong do_shmat(int shmid, abi_ulong 
shmaddr, int shmflg)
((shmflg & SHM_RDONLY)? 0 : PAGE_WRITE));
 
 for (i = 0; i < N_SHM_REGIONS; i++) {
-if (shm_regions[i].start == 0) {
+if (!shm_regions[i].in_use) {
+shm_regions[i].in_use = true;
 shm_regions[i].start = raddr;
 shm_regions[i].size = shm_info.shm_segsz;
 break;
@@ -3308,8 +3310,8 @@ static inline abi_long do_shmdt(abi_ulong shmaddr)
 int i;
 
 for (i = 0; i < N_SHM_REGIONS; ++i) {
-if (shm_regions[i].start == shmaddr) {
-shm_regions[i].start = 0;
+if (shm_regions[i].in_use && shm_regions[i].start == shmaddr) {
+shm_regions[i].in_use = false;
 page_set_flags(shmaddr, shmaddr + shm_regions[i].size, 0);
 break;
 }
-- 
2.1.4

[Qemu-devel] [Bug 1585432] Re: magnum coe-service-list returns error

2016-05-24 Thread thh

I assume you want this to be in the Magnum project, not qemu...?

** Project changed: qemu => magnum

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1585432

Title:
  magnum coe-service-list  returns error

Status in Magnum:
  New

Bug description:
  magnum running on centos7,

  I didn't create any services on k8s cluster,

  but I got the result as follows:

  [root@localhost ~(keystone_admin)]# magnum coe-service-list --bay k8sbay3
  ERROR: Field `ports[0][node_port]' cannot be None (HTTP 500) (Request-ID: 
req-c66b68dd-5437-47fa-a389-7cb56a262fa5)

To manage notifications about this bug go to:
https://bugs.launchpad.net/magnum/+bug/1585432/+subscriptions

[Qemu-devel] ARM invalid co-processor register

2016-05-24 Thread Karthik

Hi,

CPU: Cortex R5F

I have this instruction that invalidates the entire data cache

MCR p15, 0, r0, c15, c5, 0


http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0460c/Chdhgibd.html

This instruction generates undefined exception, and further debugging
showed it is because the co-processor register was not implemented.

To get around, I have added the below entry in the cortexr5_cp_reginfo[]
(target-arm/cpu.c)

{
{.name = "INVALLDC", .cp=15, .opc1 = 0, .crn = 5, .opc2 = 0.
 .access = PL1_W, .type = ARM_CP_NOP | ARM_CP_OVERRIDE},
}

or

I have to add
set_feature(&cpu->env, ARM_FEATURE_DUMMY_C15_REGS);

So, which option is recommended?

Best regards,
Karthik

[Qemu-devel] [PULL 04/38] linux-user: Define TARGET_ERESTART* errno values

2016-05-24 Thread riku . voipio

From: Timothy E Baldwin 

Define TARGET_ERESTARTSYS; like the kernel, we will use this to
indicate that a guest system call should be restarted. We use
the same value the kernel does for this, 512.

Signed-off-by: Timothy Edward Baldwin 
[PMM: split out from the patch which moves and renumbers
 TARGET_QEMU_ESIGRETURN, add comment on usage]
Reviewed-by: Peter Maydell 
Signed-off-by: Peter Maydell 
Signed-off-by: Riku Voipio 
---
 linux-user/errno_defs.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/linux-user/errno_defs.h b/linux-user/errno_defs.h
index 8a1cf76..b7a8c9f 100644
--- a/linux-user/errno_defs.h
+++ b/linux-user/errno_defs.h
@@ -139,3 +139,11 @@
 /* for robust mutexes */
 #define TARGET_EOWNERDEAD  130 /* Owner died */
 #define TARGET_ENOTRECOVERABLE 131 /* State not recoverable */
+
+/* QEMU internal, not visible to the guest. This is returned when a
+ * system call should be restarted, to tell the main loop that it
+ * should wind the guest PC backwards so it will re-execute the syscall
+ * after handling any pending signals. They match with the ones the guest
+ * kernel uses for the same purpose.
+ */
+#define TARGET_ERESTARTSYS 512 /* Restart system call (if SA_RESTART) 
*/
-- 
2.1.4

Re: [Qemu-devel] [PULL v2 3/8] linux-user: set ppc64/ppc64le default CPU to POWER8

2016-05-24 Thread Riku Voipio

Hi,

Sorry ignore these "pull v2" mails, apparently I had some old files
lying around..

On Wed, May 25, 2016 at 08:27:38AM +0300, riku.voi...@linaro.org wrote:
> From: Laurent Vivier 
> 
> Set the default to the latest CPU version to have the
> largest set of available features.
> 
> It is also really needed in little-endian mode because
> POWER7 is not really supported in this mode and some distros
> (at least debian) generate POWER8 code for their ppc64le target.
> 
> Fixes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=813698
> 
> Signed-off-by: Laurent Vivier 
> Reviewed-by: Alexander Graf 
> Reviewed-by: Michael Tokarev 
> Signed-off-by: Riku Voipio 
> ---
>  linux-user/main.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/linux-user/main.c b/linux-user/main.c
> index e719a2d..2a692e0 100644
> --- a/linux-user/main.c
> +++ b/linux-user/main.c
> @@ -4160,7 +4160,7 @@ int main(int argc, char **argv, char **envp)
>  cpu_model = "or1200";
>  #elif defined(TARGET_PPC)
>  # ifdef TARGET_PPC64
> -cpu_model = "POWER7";
> +cpu_model = "POWER8";
>  # else
>  cpu_model = "750";
>  # endif
> -- 
> 2.1.4
> 
>

[Qemu-devel] [PULL v2 1/8] linux-user: fix realloc size of target_fd_trans.

2016-05-24 Thread riku . voipio

From: Laurent Vivier 

target_fd_trans is an array of "TargetFdTrans *": compute size
accordingly. Use g_renew() as proposed by Paolo.

Reported-by: Paolo Bonzini 
Signed-off-by: Laurent Vivier 
Signed-off-by: Riku Voipio 
---
 linux-user/syscall.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 54ce14a..dac5518 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -318,8 +318,8 @@ static void fd_trans_register(int fd, TargetFdTrans *trans)
 if (fd >= target_fd_max) {
 oldmax = target_fd_max;
 target_fd_max = ((fd >> 6) + 1) << 6; /* by slice of 64 entries */
-target_fd_trans = g_realloc(target_fd_trans,
-target_fd_max * sizeof(TargetFdTrans));
+target_fd_trans = g_renew(TargetFdTrans *,
+  target_fd_trans, target_fd_max);
 memset((void *)(target_fd_trans + oldmax), 0,
(target_fd_max - oldmax) * sizeof(TargetFdTrans *));
 }
-- 
2.1.4

Re: [Qemu-devel] [QEMU RFC PATCH v2 1/6] spapr: ensure device trees are always associated with DRC

2016-05-24 Thread David Gibson

On Tue, May 24, 2016 at 10:55:04AM -0700, Jianjun Duan wrote:
> There are possible racing situations involving hotplug events and
> guest migration. For cases where a hotplug event is migrated, or
> the guest is in the process of fetching device tree at the time of
> migration, we need to ensure the device tree is created and
> associated with the corresponding DRC for devices that were
> hotplugged on the source, but 'coldplugged' on the target.
> 
> Signed-off-by: Jianjun Duan 

Applied to ppc-for-2.7, thanks.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature

[Qemu-devel] [PULL 03/38] linux-user: Reindent signal handling

2016-05-24 Thread riku . voipio

From: Timothy E Baldwin 

Some of the signal handling was a mess with a mixture of tabs and 8 space
indents.

Signed-off-by: Timothy Edward Baldwin 
Message-id: 1441497448-32489-3-git-send-email-t.e.baldwi...@members.leeds.ac.uk
Reviewed-by: Peter Maydell 
[PMM: just rebased]
Signed-off-by: Peter Maydell 
Signed-off-by: Riku Voipio 
---
 linux-user/signal.c | 1543 ++-
 1 file changed, 791 insertions(+), 752 deletions(-)

diff --git a/linux-user/signal.c b/linux-user/signal.c
index 96e86c0..04c21d0 100644
--- a/linux-user/signal.c
+++ b/linux-user/signal.c
@@ -157,7 +157,7 @@ static void target_to_host_sigset_internal(sigset_t *d,
 if (target_sigismember(s, i)) {
 sigaddset(d, target_to_host_signal(i));
 }
- }
+}
 }
 
 void target_to_host_sigset(sigset_t *d, const target_sigset_t *s)
@@ -250,18 +250,18 @@ static inline void 
host_to_target_siginfo_noswap(target_siginfo_t *tinfo,
 tinfo->si_code = info->si_code;
 
 if (sig == TARGET_SIGILL || sig == TARGET_SIGFPE || sig == TARGET_SIGSEGV
-|| sig == TARGET_SIGBUS || sig == TARGET_SIGTRAP) {
+|| sig == TARGET_SIGBUS || sig == TARGET_SIGTRAP) {
 /* Should never come here, but who knows. The information for
the target is irrelevant.  */
 tinfo->_sifields._sigfault._addr = 0;
 } else if (sig == TARGET_SIGIO) {
 tinfo->_sifields._sigpoll._band = info->si_band;
-   tinfo->_sifields._sigpoll._fd = info->si_fd;
+tinfo->_sifields._sigpoll._fd = info->si_fd;
 } else if (sig == TARGET_SIGCHLD) {
 tinfo->_sifields._sigchld._pid = info->si_pid;
 tinfo->_sifields._sigchld._uid = info->si_uid;
 tinfo->_sifields._sigchld._status
-= host_to_target_waitstatus(info->si_status);
+= host_to_target_waitstatus(info->si_status);
 tinfo->_sifields._sigchld._utime = info->si_utime;
 tinfo->_sifields._sigchld._stime = info->si_stime;
 } else if (sig >= TARGET_SIGRTMIN) {
@@ -269,7 +269,7 @@ static inline void 
host_to_target_siginfo_noswap(target_siginfo_t *tinfo,
 tinfo->_sifields._rt._uid = info->si_uid;
 /* XXX: potential problem if 64 bit */
 tinfo->_sifields._rt._sigval.sival_ptr
-= (abi_ulong)(unsigned long)info->si_value.sival_ptr;
+= (abi_ulong)(unsigned long)info->si_value.sival_ptr;
 }
 }
 
@@ -723,75 +723,75 @@ int do_sigaction(int sig, const struct target_sigaction 
*act,
 /* from the Linux kernel */
 
 struct target_fpreg {
-   uint16_t significand[4];
-   uint16_t exponent;
+uint16_t significand[4];
+uint16_t exponent;
 };
 
 struct target_fpxreg {
-   uint16_t significand[4];
-   uint16_t exponent;
-   uint16_t padding[3];
+uint16_t significand[4];
+uint16_t exponent;
+uint16_t padding[3];
 };
 
 struct target_xmmreg {
-   abi_ulong element[4];
+abi_ulong element[4];
 };
 
 struct target_fpstate {
-   /* Regular FPU environment */
-abi_ulong   cw;
-abi_ulong   sw;
-abi_ulong   tag;
-abi_ulong   ipoff;
-abi_ulong   cssel;
-abi_ulong   dataoff;
-abi_ulong   datasel;
-   struct target_fpreg _st[8];
-   uint16_tstatus;
-   uint16_tmagic;  /* 0x = regular FPU data only */
-
-   /* FXSR FPU environment */
-abi_ulong   _fxsr_env[6];   /* FXSR FPU env is ignored */
-abi_ulong   mxcsr;
-abi_ulong   reserved;
-   struct target_fpxreg_fxsr_st[8];/* FXSR FPU reg data is ignored 
*/
-   struct target_xmmreg_xmm[8];
-abi_ulong   padding[56];
+/* Regular FPU environment */
+abi_ulong cw;
+abi_ulong sw;
+abi_ulong tag;
+abi_ulong ipoff;
+abi_ulong cssel;
+abi_ulong dataoff;
+abi_ulong datasel;
+struct target_fpreg _st[8];
+uint16_t  status;
+uint16_t  magic;  /* 0x = regular FPU data only */
+
+/* FXSR FPU environment */
+abi_ulong _fxsr_env[6];   /* FXSR FPU env is ignored */
+abi_ulong mxcsr;
+abi_ulong reserved;
+struct target_fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */
+struct target_xmmreg _xmm[8];
+abi_ulong padding[56];
 };
 
 #define X86_FXSR_MAGIC 0x
 
 struct target_sigcontext {
-   uint16_t gs, __gsh;
-   uint16_t fs, __fsh;
-   uint16_t es, __esh;
-   uint16_t ds, __dsh;
-abi_ulong edi;
-abi_ulong esi;
-abi_ulong ebp;
-abi_ulong esp;
-abi_ulong ebx;
-abi_ulong edx;
-abi_ulong ecx;
-abi_ulong eax;
-abi_ulong trapno;
-abi_ulong err;
-abi_ulong eip;
-   uint16_t cs, __csh;
-abi_ulong eflags;
-abi_ulong esp_at_signal;
-   uint16_t ss, __ssh;
-abi_ulong fpstate; /* pointer */
-abi_ulong oldmask;

[Qemu-devel] [PULL 02/38] linux-user: Consistently return host errnos from do_openat()

2016-05-24 Thread riku . voipio

From: Peter Maydell 

The function do_openat() is not consistent about whether it is
returning a host errno or a guest errno in case of failure.
Standardise on returning -1 with errno set (ie caller has
to call get_errno()).

Signed-off-by: Peter Maydell 
Reported-by: Timothy Edward Baldwin 
Signed-off-by: Riku Voipio 
Reviewed-by: Laurent Vivier 
---
 linux-user/syscall.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 5246f36..f4c2e19 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -5559,7 +5559,9 @@ static int open_self_cmdline(void *cpu_env, int fd)
 
 nb_read = read(fd_orig, buf, sizeof(buf));
 if (nb_read < 0) {
+int e = errno;
 fd_orig = close(fd_orig);
+errno = e;
 return -1;
 } else if (nb_read == 0) {
 break;
@@ -5579,7 +5581,9 @@ static int open_self_cmdline(void *cpu_env, int fd)
 
 if (word_skipped) {
 if (write(fd, cp_buf, nb_read) != nb_read) {
+int e = errno;
 close(fd_orig);
+errno = e;
 return -1;
 }
 }
@@ -5599,7 +5603,7 @@ static int open_self_maps(void *cpu_env, int fd)
 
 fp = fopen("/proc/self/maps", "r");
 if (fp == NULL) {
-return -EACCES;
+return -1;
 }
 
 while ((read = getline(&line, &len, fp)) != -1) {
@@ -5743,7 +5747,7 @@ static int open_net_route(void *cpu_env, int fd)
 
 fp = fopen("/proc/net/route", "r");
 if (fp == NULL) {
-return -EACCES;
+return -1;
 }
 
 /* read header */
@@ -5793,7 +5797,7 @@ static int do_openat(void *cpu_env, int dirfd, const char 
*pathname, int flags,
 
 if (is_proc_myself(pathname, "exe")) {
 int execfd = qemu_getauxval(AT_EXECFD);
-return execfd ? execfd : get_errno(sys_openat(dirfd, exec_path, flags, 
mode));
+return execfd ? execfd : sys_openat(dirfd, exec_path, flags, mode);
 }
 
 for (fake_open = fakes; fake_open->filename; fake_open++) {
@@ -5819,7 +5823,9 @@ static int do_openat(void *cpu_env, int dirfd, const char 
*pathname, int flags,
 unlink(filename);
 
 if ((r = fake_open->fill(cpu_env, fd))) {
+int e = errno;
 close(fd);
+errno = e;
 return r;
 }
 lseek(fd, 0, SEEK_SET);
@@ -5827,7 +5833,7 @@ static int do_openat(void *cpu_env, int dirfd, const char 
*pathname, int flags,
 return fd;
 }
 
-return get_errno(sys_openat(dirfd, path(pathname), flags, mode));
+return sys_openat(dirfd, path(pathname), flags, mode);
 }
 
 #define TIMER_MAGIC 0x0caf
-- 
2.1.4

Re: [Qemu-devel] ARM Cortex R5 + VFP3

2016-05-24 Thread Karthik

Hi,

Added enabling VFP co-processor in the bootrom emulation code. Works like a
charm.

Many thanks.

Best regards,
Karthik


On Tue, May 24, 2016 at 7:52 PM, Karthik  wrote:

> Yeah, the micro had a secure boot ROM which we don't have access to.
> Probably it is enabled there. I'll check in the target hardware.
>
> The code generated by the compiler doesn't specifically check for
> non-existent registers, so we should be good using VFP3 in place for
> VFP3-D16.
>
> Best regards,
> Karthik
> On May 24, 2016 7:31 PM, "Peter Maydell"  wrote:
>
>> On 24 May 2016 at 14:49, Karthik  wrote:
>> > ahh okay. The code I don't think writes to CPACR_EL1 register, but it
>> runs
>> > on the hardware anywary.
>>
>> If it does then there's probably some firmware somewhere
>> which is doing the setup for you somehow.
>>
>> As you can see in the Cortex-R5 technical reference manual entry
>> for CPACR:
>>
>> http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0205g/Bgbjhiea.html
>>
>> the hardware reset value for bits 21..23 is 0, meaning access denied,
>> and for FPEXC the EN bit is clear on reset:
>>
>> http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0205g/Bgbjhiea.html
>>
>> By the way, QEMU doesn't implement VFPv3-D16, which is what the R5
>> ought to have -- setting the ARM_FEATURE_VFP3 bit will give you
>> 32 double-precision registers. This is probably close enough that
>> guest code will work, though obviously if the guest specifically
>> tests that registers 16..31 don't exist it will not behave as
>> the hardware does.
>>
>> thanks
>> -- PMM
>>
>

[Qemu-devel] [PULL 05/38] linux-user: Renumber TARGET_QEMU_ESIGRETURN, make it not arch-specific

2016-05-24 Thread riku . voipio

From: Timothy E Baldwin 

Currently we define a QEMU-internal errno TARGET_QEMU_ESIGRETURN
only on the MIPS and PPC targets; move this to errno_defs.h
so it is available for all architectures, and renumber it to 513.
We pick 513 because this is safe from future use as a system call return
value: Linux uses it as ERESTART_NOINTR internally and never allows that
errno to escape to userspace.

Signed-off-by: Timothy Edward Baldwin 
Message-id: 1441497448-32489-4-git-send-email-t.e.baldwi...@members.leeds.ac.uk
[PMM: TARGET_ERESTARTSYS split out into preceding patch, add comment]
Reviewed-by: Peter Maydell 
Reviewed-by: Laurent Vivier 
Signed-off-by: Peter Maydell 
Signed-off-by: Riku Voipio 
---
 linux-user/errno_defs.h| 9 +
 linux-user/mips/target_syscall.h   | 4 
 linux-user/mips64/target_syscall.h | 4 
 linux-user/ppc/target_syscall.h| 2 --
 4 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/linux-user/errno_defs.h b/linux-user/errno_defs.h
index b7a8c9f..65522c4 100644
--- a/linux-user/errno_defs.h
+++ b/linux-user/errno_defs.h
@@ -147,3 +147,12 @@
  * kernel uses for the same purpose.
  */
 #define TARGET_ERESTARTSYS 512 /* Restart system call (if SA_RESTART) 
*/
+
+/* QEMU internal, not visible to the guest. This is returned by the
+ * do_sigreturn() code after a successful sigreturn syscall, to indicate
+ * that it has correctly set the guest registers and so the main loop
+ * should not touch them. We use the value the guest would use for
+ * ERESTART_NOINTR (which is kernel internal) to guarantee that we won't
+ * clash with a valid guest errno now or in the future.
+ */
+#define TARGET_QEMU_ESIGRETURN 513 /* Return from signal */
diff --git a/linux-user/mips/target_syscall.h b/linux-user/mips/target_syscall.h
index 68db160..e8e305c 100644
--- a/linux-user/mips/target_syscall.h
+++ b/linux-user/mips/target_syscall.h
@@ -222,10 +222,6 @@ struct target_pt_regs {
 #define TARGET_ENOTRECOVERABLE 166 /* State not recoverable */
 
 
-
-/* Nasty hack: define a fake errno value for use by sigreturn.  */
-#define TARGET_QEMU_ESIGRETURN 255
-
 #define UNAME_MACHINE "mips"
 #define UNAME_MINIMUM_RELEASE "2.6.32"
 
diff --git a/linux-user/mips64/target_syscall.h 
b/linux-user/mips64/target_syscall.h
index 0e0c2d2..5789e86 100644
--- a/linux-user/mips64/target_syscall.h
+++ b/linux-user/mips64/target_syscall.h
@@ -219,10 +219,6 @@ struct target_pt_regs {
 #define TARGET_ENOTRECOVERABLE 166 /* State not recoverable */
 
 
-
-/* Nasty hack: define a fake errno value for use by sigreturn. */
-#define TARGET_QEMU_ESIGRETURN 255
-
 #define UNAME_MACHINE "mips64"
 #define UNAME_MINIMUM_RELEASE "2.6.32"
 
diff --git a/linux-user/ppc/target_syscall.h b/linux-user/ppc/target_syscall.h
index 35cab59..7ca83c2 100644
--- a/linux-user/ppc/target_syscall.h
+++ b/linux-user/ppc/target_syscall.h
@@ -53,8 +53,6 @@ struct target_revectored_struct {
abi_ulong __map[8]; /* 256 bits */
 };
 
-/* Nasty hack: define a fake errno value for use by sigreturn.  */
-#define TARGET_QEMU_ESIGRETURN 255
 
 /*
  * flags masks
-- 
2.1.4

[Qemu-devel] [PULL 01/38] linux-user: Check array bounds in errno conversion

2016-05-24 Thread riku . voipio

From: Timothy E Baldwin 

Check array bounds in host_to_target_errno() and target_to_host_errno().

Signed-off-by: Timothy Edward Baldwin 
Message-id: 1441497448-32489-2-git-send-email-t.e.baldwi...@members.leeds.ac.uk
[PMM: Add a lower-bound check, use braces on if(), tweak commit message]
Signed-off-by: Peter Maydell 
Signed-off-by: Riku Voipio 
Reviewed-by: Laurent Vivier 
---
 linux-user/syscall.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 032d338..5246f36 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -619,15 +619,19 @@ static uint16_t 
host_to_target_errno_table[ERRNO_TABLE_SIZE] = {
 
 static inline int host_to_target_errno(int err)
 {
-if(host_to_target_errno_table[err])
+if (err >= 0 && err < ERRNO_TABLE_SIZE &&
+host_to_target_errno_table[err]) {
 return host_to_target_errno_table[err];
+}
 return err;
 }
 
 static inline int target_to_host_errno(int err)
 {
-if (target_to_host_errno_table[err])
+if (err >= 0 && err < ERRNO_TABLE_SIZE &&
+target_to_host_errno_table[err]) {
 return target_to_host_errno_table[err];
+}
 return err;
 }
 
-- 
2.1.4

[Qemu-devel] [PULL 07/38] linux-user: Support for restarting system calls for ARM targets

2016-05-24 Thread riku . voipio

From: Timothy E Baldwin 

Update the 32-bit and 64-bit ARM main loop and sigreturn code:
 * on TARGET_ERESTARTSYS, wind guest PC backwards to repeat syscall insn
 * set all guest CPU state within signal.c code on sigreturn
 * handle TARGET_QEMU_ESIGRETURN in the main loop as the indication
   that the main loop should not touch any guest CPU state

Signed-off-by: Timothy Edward Baldwin 
Message-id: 1441497448-32489-6-git-send-email-t.e.baldwi...@members.leeds.ac.uk
Reviewed-by: Peter Maydell 
[PMM: tweak commit message; drop TARGET_USE_ERESTARTSYS define]
Signed-off-by: Peter Maydell 
Signed-off-by: Riku Voipio 
---
 linux-user/arm/target_signal.h |  1 +
 linux-user/main.c  | 48 ++
 linux-user/signal.c| 10 -
 3 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/linux-user/arm/target_signal.h b/linux-user/arm/target_signal.h
index 2b32813..fb31f4c 100644
--- a/linux-user/arm/target_signal.h
+++ b/linux-user/arm/target_signal.h
@@ -26,4 +26,5 @@ static inline abi_ulong get_sp_from_cpustate(CPUARMState 
*state)
return state->regs[13];
 }
 
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/main.c b/linux-user/main.c
index 57ae76e..0e3da58 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -727,6 +727,7 @@ void cpu_loop(CPUARMState *env)
 unsigned int n, insn;
 target_siginfo_t info;
 uint32_t addr;
+abi_ulong ret;
 
 for(;;) {
 cpu_exec_start(cs);
@@ -865,15 +866,20 @@ void cpu_loop(CPUARMState *env)
 break;
 }
 } else {
-env->regs[0] = do_syscall(env,
-  n,
-  env->regs[0],
-  env->regs[1],
-  env->regs[2],
-  env->regs[3],
-  env->regs[4],
-  env->regs[5],
-  0, 0);
+ret = do_syscall(env,
+ n,
+ env->regs[0],
+ env->regs[1],
+ env->regs[2],
+ env->regs[3],
+ env->regs[4],
+ env->regs[5],
+ 0, 0);
+if (ret == -TARGET_ERESTARTSYS) {
+env->regs[15] -= env->thumb ? 2 : 4;
+} else if (ret != -TARGET_QEMU_ESIGRETURN) {
+env->regs[0] = ret;
+}
 }
 } else {
 goto error;
@@ -1056,6 +1062,7 @@ void cpu_loop(CPUARMState *env)
 {
 CPUState *cs = CPU(arm_env_get_cpu(env));
 int trapnr, sig;
+abi_long ret;
 target_siginfo_t info;
 
 for (;;) {
@@ -1065,15 +1072,20 @@ void cpu_loop(CPUARMState *env)
 
 switch (trapnr) {
 case EXCP_SWI:
-env->xregs[0] = do_syscall(env,
-   env->xregs[8],
-   env->xregs[0],
-   env->xregs[1],
-   env->xregs[2],
-   env->xregs[3],
-   env->xregs[4],
-   env->xregs[5],
-   0, 0);
+ret = do_syscall(env,
+ env->xregs[8],
+ env->xregs[0],
+ env->xregs[1],
+ env->xregs[2],
+ env->xregs[3],
+ env->xregs[4],
+ env->xregs[5],
+ 0, 0);
+if (ret == -TARGET_ERESTARTSYS) {
+env->pc -= 4;
+} else if (ret != -TARGET_QEMU_ESIGRETURN) {
+env->xregs[0] = ret;
+}
 break;
 case EXCP_INTERRUPT:
 /* just indicate that signals should be handled asap */
diff --git a/linux-user/signal.c b/linux-user/signal.c
index 11ddd05..14e58b0 100644
--- a/linux-user/signal.c
+++ b/linux-user/signal.c
@@ -1390,7 +1390,7 @@ long do_rt_sigreturn(CPUARMState *env)
 }
 
 unlock_user_struct(frame, frame_addr, 0);
-return env->xregs[0];
+return -TARGET_QEMU_ESIGRETURN;
 
  badframe:
 unlock_user_struct(frame, frame_addr, 0);
@@ -1902,7 +1902,7 @@ static long do_sigreturn_v1(CPUARMState *env)
 send_sig(SIGTRAP, current, 1);
 #endif

[Qemu-devel] [PULL v2 2/8] build: [linux-user] Rename "syscall.h" to "target_syscall.h" in target directories

2016-05-24 Thread riku . voipio

From: Lluís Vilanova 

This fixes double-definitions in linux-user builds when using the UST
tracing backend (which indirectly includes the system's "syscall.h").

Signed-off-by: Lluís Vilanova 
Reviewed-by: Peter Maydell 
Signed-off-by: Riku Voipio 
---
 linux-user/aarch64/syscall.h   |  13 --
 linux-user/aarch64/target_syscall.h|  18 +++
 linux-user/alpha/syscall.h | 257 
 linux-user/alpha/target_syscall.h  | 262 +
 linux-user/arm/syscall.h   |  50 ---
 linux-user/arm/target_syscall.h|  54 +++
 linux-user/cris/syscall.h  |  46 --
 linux-user/cris/target_syscall.h   |  46 ++
 linux-user/i386/syscall.h  | 152 ---
 linux-user/i386/target_syscall.h   | 157 
 linux-user/m68k/syscall.h  |  25 
 linux-user/m68k/target_syscall.h   |  29 
 linux-user/microblaze/syscall.h|  56 ---
 linux-user/microblaze/target_syscall.h |  56 +++
 linux-user/mips/syscall.h  | 233 -
 linux-user/mips/target_syscall.h   | 237 +
 linux-user/mips64/syscall.h| 230 -
 linux-user/mips64/target_syscall.h | 234 +
 linux-user/openrisc/syscall.h  |  29 
 linux-user/openrisc/target_syscall.h   |  34 +
 linux-user/ppc/syscall.h   |  75 --
 linux-user/ppc/target_syscall.h|  80 ++
 linux-user/qemu.h  |   2 +-
 linux-user/s390x/syscall.h |  29 
 linux-user/s390x/target_syscall.h  |  34 +
 linux-user/sh4/syscall.h   |  17 ---
 linux-user/sh4/target_syscall.h|  22 +++
 linux-user/sparc/syscall.h |  20 ---
 linux-user/sparc/target_syscall.h  |  25 
 linux-user/sparc64/syscall.h   |  21 ---
 linux-user/sparc64/target_syscall.h|  26 
 linux-user/tilegx/syscall.h|  43 --
 linux-user/tilegx/target_syscall.h |  43 ++
 linux-user/unicore32/syscall.h |  60 
 linux-user/unicore32/target_syscall.h  |  60 
 linux-user/x86_64/syscall.h| 102 -
 linux-user/x86_64/target_syscall.h | 107 ++
 37 files changed, 1525 insertions(+), 1459 deletions(-)
 delete mode 100644 linux-user/aarch64/syscall.h
 create mode 100644 linux-user/aarch64/target_syscall.h
 delete mode 100644 linux-user/alpha/syscall.h
 create mode 100644 linux-user/alpha/target_syscall.h
 delete mode 100644 linux-user/arm/syscall.h
 create mode 100644 linux-user/arm/target_syscall.h
 delete mode 100644 linux-user/cris/syscall.h
 create mode 100644 linux-user/cris/target_syscall.h
 delete mode 100644 linux-user/i386/syscall.h
 create mode 100644 linux-user/i386/target_syscall.h
 delete mode 100644 linux-user/m68k/syscall.h
 create mode 100644 linux-user/m68k/target_syscall.h
 delete mode 100644 linux-user/microblaze/syscall.h
 create mode 100644 linux-user/microblaze/target_syscall.h
 delete mode 100644 linux-user/mips/syscall.h
 create mode 100644 linux-user/mips/target_syscall.h
 delete mode 100644 linux-user/mips64/syscall.h
 create mode 100644 linux-user/mips64/target_syscall.h
 delete mode 100644 linux-user/openrisc/syscall.h
 create mode 100644 linux-user/openrisc/target_syscall.h
 delete mode 100644 linux-user/ppc/syscall.h
 create mode 100644 linux-user/ppc/target_syscall.h
 delete mode 100644 linux-user/s390x/syscall.h
 create mode 100644 linux-user/s390x/target_syscall.h
 delete mode 100644 linux-user/sh4/syscall.h
 create mode 100644 linux-user/sh4/target_syscall.h
 delete mode 100644 linux-user/sparc/syscall.h
 create mode 100644 linux-user/sparc/target_syscall.h
 delete mode 100644 linux-user/sparc64/syscall.h
 create mode 100644 linux-user/sparc64/target_syscall.h
 delete mode 100644 linux-user/tilegx/syscall.h
 create mode 100644 linux-user/tilegx/target_syscall.h
 delete mode 100644 linux-user/unicore32/syscall.h
 create mode 100644 linux-user/unicore32/target_syscall.h
 delete mode 100644 linux-user/x86_64/syscall.h
 create mode 100644 linux-user/x86_64/target_syscall.h

diff --git a/linux-user/aarch64/syscall.h b/linux-user/aarch64/syscall.h
deleted file mode 100644
index dc72a15..000
--- a/linux-user/aarch64/syscall.h
+++ /dev/null
@@ -1,13 +0,0 @@
-struct target_pt_regs {
-uint64_tregs[31];
-uint64_tsp;
-uint64_tpc;
-uint64_tpstate;
-};
-
-#define UNAME_MACHINE "aarch64"
-#define UNAME_MINIMUM_RELEASE "3.8.0"
-#define TARGET_CLONE_BACKWARDS
-#define TARGET_MINSIGSTKSZ   2048
-#define TARGET_MLOCKALL_MCL_CURRENT 1
-#define TARGET_MLOCKALL_MCL_FUTURE  2
diff --git a/linux-user/aarch64/target_syscall.h 
b/linux-user/aarch64/target_syscall.h
new file mode 100644
index 000..f458018
--- /dev/null
+++ b/linux-user/aarch64/tar

[Qemu-devel] [PULL v2 5/8] linux-user: sync syscall numbers with kernel

2016-05-24 Thread riku . voipio

From: Riku Voipio 

Sync syscall numbers to match the linux v4.5-rc1 kernel.

Reviewed-by: Laurent Vivier 
Signed-off-by: Riku Voipio 
---
 linux-user/aarch64/syscall_nr.h|  2 +-
 linux-user/alpha/syscall_nr.h  |  6 ++
 linux-user/cris/syscall_nr.h   | 24 
 linux-user/i386/syscall_nr.h   | 27 +++
 linux-user/microblaze/syscall_nr.h |  8 
 linux-user/mips64/syscall_nr.h | 23 +++
 linux-user/openrisc/syscall_nr.h   | 28 
 linux-user/ppc/syscall_nr.h| 24 
 linux-user/s390x/syscall_nr.h  | 30 ++
 linux-user/sparc/syscall_nr.h  | 14 ++
 linux-user/sparc64/syscall_nr.h| 14 ++
 linux-user/tilegx/syscall_nr.h |  4 
 linux-user/x86_64/syscall_nr.h | 13 +
 13 files changed, 200 insertions(+), 17 deletions(-)

diff --git a/linux-user/aarch64/syscall_nr.h b/linux-user/aarch64/syscall_nr.h
index 74f4275..c8a8599 100644
--- a/linux-user/aarch64/syscall_nr.h
+++ b/linux-user/aarch64/syscall_nr.h
@@ -262,7 +262,6 @@
 #define TARGET_NR_process_vm_writev 271
 #define TARGET_NR_kcmp 272
 #define TARGET_NR_finit_module 273
-
 #define TARGET_NR_sched_setattr 274
 #define TARGET_NR_sched_getattr 275
 #define TARGET_NR_renameat2 276
@@ -274,6 +273,7 @@
 #define TARGET_NR_userfaultfd 282
 #define TARGET_NR_membarrier 283
 #define TARGET_NR_mlock2 284
+#define TARGET_NR_copy_file_range 285
 
 #define TARGET_NR_open 1024
 #define TARGET_NR_link 1025
diff --git a/linux-user/alpha/syscall_nr.h b/linux-user/alpha/syscall_nr.h
index dde8d5c..00e14bb 100644
--- a/linux-user/alpha/syscall_nr.h
+++ b/linux-user/alpha/syscall_nr.h
@@ -444,3 +444,9 @@
 #define TARGET_NR_process_vm_writev 505
 #define TARGET_NR_kcmp  506
 #define TARGET_NR_finit_module  507
+#define TARGET_NR_sched_setattr 508
+#define TARGET_NR_sched_getattr 509
+#define TARGET_NR_renameat2 510
+#define TARGET_NR_getrandom 511
+#define TARGET_NR_memfd_create  512
+#define TARGET_NR_execveat  513
diff --git a/linux-user/cris/syscall_nr.h b/linux-user/cris/syscall_nr.h
index 694bd02..44f0b64 100644
--- a/linux-user/cris/syscall_nr.h
+++ b/linux-user/cris/syscall_nr.h
@@ -336,3 +336,27 @@
 #define TARGET_NR_preadv 333
 #define TARGET_NR_pwritev334
 #define TARGET_NR_setns  335
+#define TARGET_NR_name_to_handle_at  336
+#define TARGET_NR_open_by_handle_at  337
+#define TARGET_NR_rt_tgsigqueueinfo  338
+#define TARGET_NR_perf_event_open339
+#define TARGET_NR_recvmmsg   340
+#define TARGET_NR_accept4341
+#define TARGET_NR_fanotify_init  342
+#define TARGET_NR_fanotify_mark  343
+#define TARGET_NR_prlimit64  344
+#define TARGET_NR_clock_adjtime  345
+#define TARGET_NR_syncfs 346
+#define TARGET_NR_sendmmsg   347
+#define TARGET_NR_process_vm_readv   348
+#define TARGET_NR_process_vm_writev  349
+#define TARGET_NR_kcmp   350
+#define TARGET_NR_finit_module   351
+#define TARGET_NR_sched_setattr  352
+#define TARGET_NR_sched_getattr  353
+#define TARGET_NR_renameat2  354
+#define TARGET_NR_seccomp355
+#define TARGET_NR_getrandom  356
+#define TARGET_NR_memfd_create   357
+#define TARGET_NR_bpf358
+#define TARGET_NR_execveat   359
diff --git a/linux-user/i386/syscall_nr.h b/linux-user/i386/syscall_nr.h
index c8f7302..fa3f0b4 100644
--- a/linux-user/i386/syscall_nr.h
+++ b/linux-user/i386/syscall_nr.h
@@ -353,3 +353,30 @@
 #define TARGET_NR_process_vm_writev 348
 #define TARGET_NR_kcmp  349
 #define TARGET_NR_finit_module  350
+#define TARGET_NR_sched_setattr 351
+#define TARGET_NR_sched_getattr 352
+#define TARGET_NR_renameat2 353
+#define TARGET_NR_seccomp   354
+#define TARGET_NR_getrandom 355
+#define TARGET_NR_memfd_create  356
+#define TARGET_NR_bpf   357
+#define TARGET_NR_execveat  358
+#define TARGET_NR_socket359
+#define TARGET_NR_socketpair360
+#define TARGET_NR_bind  361
+#define TARGET_NR_connect   362
+#define TARGET_NR_listen363
+#define TARGET_NR_accept4   364
+#define TARGET_NR_getsockopt365
+#define TARGET_NR_setsockopt366
+#define TARGET_NR_getsockname   367
+#define TARGET_NR_getpeername   368
+#define TARGET_NR_sendto369
+#define TARGET_NR_sendmsg   370
+#define TARGET_NR_recvfrom  371
+#define TARGET_NR_recvmsg   372
+#define TARGET_NR_shutdown  373
+#define TARGET_NR_userfaultfd

[Qemu-devel] [PULL v2 3/8] linux-user: set ppc64/ppc64le default CPU to POWER8

2016-05-24 Thread riku . voipio

From: Laurent Vivier 

Set the default to the latest CPU version to have the
largest set of available features.

It is also really needed in little-endian mode because
POWER7 is not really supported in this mode and some distros
(at least debian) generate POWER8 code for their ppc64le target.

Fixes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=813698

Signed-off-by: Laurent Vivier 
Reviewed-by: Alexander Graf 
Reviewed-by: Michael Tokarev 
Signed-off-by: Riku Voipio 
---
 linux-user/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linux-user/main.c b/linux-user/main.c
index e719a2d..2a692e0 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -4160,7 +4160,7 @@ int main(int argc, char **argv, char **envp)
 cpu_model = "or1200";
 #elif defined(TARGET_PPC)
 # ifdef TARGET_PPC64
-cpu_model = "POWER7";
+cpu_model = "POWER8";
 # else
 cpu_model = "750";
 # endif
-- 
2.1.4

[Qemu-devel] [PULL 00/38] linux-user update

2016-05-24 Thread riku . voipio

From: Riku Voipio 

The following changes since commit b0f6ef8915247f3230ffd9b71af9c3dadb6082c7:

  Merge remote-tracking branch 'remotes/amit-virtio-rng/tags/rng-2.7-1' into 
staging (2016-05-24 11:38:22 +0100)

are available in the git repository at:

  git://git.linaro.org/people/riku.voipio/qemu.git tags/pull-linux-user-20160525

for you to fetch changes up to 210a7201d1bd9e25eec01d23edb713f181eabb23:

  linux-user,target-ppc: fix use of MSR_LE (2016-05-24 15:45:46 +0300)


Linux user update for May 2016


Chen Gang (3):
  linux-user/signal.c: Generate opcode data for restorer in setup_rt_frame
  linux-user/signal.c: Use target address instead of host address for 
microblaze restorer
  linux-user/signal.c: Use s390 target space address instead of host space

Laurent Vivier (1):
  linux-user,target-ppc: fix use of MSR_LE

Peter Maydell (11):
  linux-user: Consistently return host errnos from do_openat()
  linux-user: Support for restarting system calls for tilegx targets
  linux-user: Set r14 on exit from microblaze syscall
  linux-user: Use safe_syscall for pselect, select syscalls
  linux-user: Use safe_syscall for futex syscall
  linux-user: Handle negative values in timespec conversion
  linux-user: Handle msgrcv error case correctly
  linux-user: Use g_try_malloc() in do_msgrcv()
  linux-user: x86_64: Don't use 16-bit UIDs
  linux-user: Use direct syscalls for setuid(), etc
  linux-user: arm: Remove ARM_cpsr and similar #defines

Timothy E Baldwin (23):
  linux-user: Check array bounds in errno conversion
  linux-user: Reindent signal handling
  linux-user: Define TARGET_ERESTART* errno values
  linux-user: Renumber TARGET_QEMU_ESIGRETURN, make it not arch-specific
  linux-user: Support for restarting system calls for x86 targets
  linux-user: Support for restarting system calls for ARM targets
  linux-user: Support for restarting system calls for MIPS targets
  linux-user: Support for restarting system calls for PPC targets
  linux-user: Support for restarting system calls for SPARC targets
  linux-user: Support for restarting system calls for SH4 targets
  linux-user: Support for restarting system calls for Alpha targets
  linux-user: Support for restarting system calls for UniCore32 targets
  linux-user: Support for restarting system calls for OpenRISC targets
  linux-user: Support for restarting system calls for M68K targets
  linux-user: Support for restarting system calls for S390 targets
  linux-user: Support for restarting system calls for CRIS targets
  linux-user: Support for restarting system calls for Microblaze targets
  linux-user: Add debug code to exercise restarting system calls
  linux-user: Provide safe_syscall for fixing races between signals and 
syscalls
  linux-user: Use safe_syscall for read and write system calls
  linux-user: Use safe_syscall for open and openat system calls
  linux-user: Use safe_syscall for wait system calls
  linux-user: Use safe_syscall for execve syscall

 Makefile.target   |4 +-
 linux-user/Makefile.objs  |3 +-
 linux-user/alpha/target_signal.h  |1 +
 linux-user/arm/target_signal.h|1 +
 linux-user/arm/target_syscall.h   |   20 +-
 linux-user/cris/target_signal.h   |1 +
 linux-user/elfload.c  |   19 +-
 linux-user/errno_defs.h   |   17 +
 linux-user/host/x86_64/hostdep.h  |   38 +
 linux-user/host/x86_64/safe-syscall.inc.S |   81 +++
 linux-user/m68k/target_signal.h   |1 +
 linux-user/main.c |  225 --
 linux-user/microblaze/target_signal.h |1 +
 linux-user/mips/target_signal.h   |1 +
 linux-user/mips/target_syscall.h  |4 -
 linux-user/mips64/target_signal.h |1 +
 linux-user/mips64/target_syscall.h|4 -
 linux-user/openrisc/target_signal.h   |1 +
 linux-user/ppc/target_signal.h|1 +
 linux-user/ppc/target_syscall.h   |2 -
 linux-user/qemu.h |  127 +++-
 linux-user/s390x/target_signal.h  |1 +
 linux-user/safe-syscall.S |   30 +
 linux-user/sh4/target_signal.h|1 +
 linux-user/signal.c   | 1631 
++-
 linux-user/sparc/target_signal.h  |1 +
 linux-user/sparc64/target_signal.h|1 +
 linux-user/syscall.c  |  266 +--
 linux-user/syscall_defs.h |3 +-
 linux-user/tilegx/target_signal.h |1 +
 30 files changed, 1516 insertions(+), 972 deletions(-)
 create mode 100644 linux-user/host/x86_64/hostdep.h
 create mode 100644 li

[Qemu-devel] [PATCH] nbd: Don't trim unrequested bytes

2016-05-24 Thread Eric Blake

Similar to commit df7b97ff, we are mishandling clients that
give an unaligned NBD_CMD_TRIM request, and potentially
trimming bytes that occur before their request; which in turn
can cause potential unintended data loss (unlikely in
practice, since most clients are sane and issue aligned trim
requests).  However, while we fixed read and write by switching
to the byte interfaces of blk_, we don't yet have a byte
interface for discard.  On the other hand, trim is advisory, so
rounding the user's request to simply ignore the first and last
unaligned sectors (or the entire request, if it is sub-sector
in length) is just fine.

CC: qemu-sta...@nongnu.org
Signed-off-by: Eric Blake 
---
 nbd/server.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/nbd/server.c b/nbd/server.c
index fa862cd..5aed8db 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1153,8 +1153,12 @@ static void nbd_trip(void *opaque)
 break;
 case NBD_CMD_TRIM:
 TRACE("Request type is TRIM");
-ret = blk_co_discard(exp->blk, (request.from + exp->dev_offset)
-   / BDRV_SECTOR_SIZE,
+/* Ignore unaligned head or tail, until block layer adds byte
+ * interface */
+request.len -= (request.from + request.len) % BDRV_SECTOR_SIZE;
+ret = blk_co_discard(exp->blk,
+ DIV_ROUND_UP(request.from + exp->dev_offset,
+  BDRV_SECTOR_SIZE),
  request.len / BDRV_SECTOR_SIZE);
 if (ret < 0) {
 LOG("discard failed");
-- 
2.5.5

Re: [Qemu-devel] [PATCH v2 08/14] mirror: Use BlockBackend for I/O

2016-05-24 Thread Changlong Xie


On 05/25/2016 12:01 PM, Eric Blake wrote:

On 05/24/2016 09:51 PM, Changlong Xie wrote:

On 05/24/2016 09:47 PM, Kevin Wolf wrote:

+s->target = blk_new();

blk_new(errp);


Depends on Kevin's block/next branch, which currently includes:

commit 5d7dd50566a4f9786b95f49448f48fead0bb34d8
Author: Max Reitz 
Date:   Tue May 17 16:41:34 2016 +0200

 block: Drop errp parameter from blk_new()

 blk_new() cannot fail so its Error ** parameter has become superfluous.

 Signed-off-by: Max Reitz 
 Signed-off-by: Kevin Wolf 

So the patch builds as written when applied to the correct branch.
http://repo.or.cz/qemu/kevin.git



Thanks for you reminder.

Re: [Qemu-devel] [PATCH v2 13/14] commit: Use BlockBackend for I/O

2016-05-24 Thread Eric Blake

On 05/24/2016 09:51 PM, Changlong Xie wrote:
> On 05/24/2016 09:47 PM, Kevin Wolf wrote:
>> +s->base = blk_new();
> blk_new(errp);
>> +blk_insert_bs(s->base, base);
>> +
>> +s->top = blk_new();
> blk_new(errp);

Wrong. Even if it weren't for basing it on top of Kevin's branch which
removes the Error parameter, you can't safely pass errp to two functions
in a row (if both functions fail, the second will cause an assertion
error for trying to set an already-set error); the correct usage when
calling multiple functions that can set errors requires the use of a
local Error *err = NULL; up front, then error_propagate(errp, err) after
each place where it can be set.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org

signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH v2 08/14] mirror: Use BlockBackend for I/O

2016-05-24 Thread Eric Blake

On 05/24/2016 09:51 PM, Changlong Xie wrote:
> On 05/24/2016 09:47 PM, Kevin Wolf wrote:
>> +s->target = blk_new();
> blk_new(errp);

Depends on Kevin's block/next branch, which currently includes:

commit 5d7dd50566a4f9786b95f49448f48fead0bb34d8
Author: Max Reitz 
Date:   Tue May 17 16:41:34 2016 +0200

block: Drop errp parameter from blk_new()

blk_new() cannot fail so its Error ** parameter has become superfluous.

Signed-off-by: Max Reitz 
Signed-off-by: Kevin Wolf 

So the patch builds as written when applied to the correct branch.
http://repo.or.cz/qemu/kevin.git

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org

signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH v2 12/14] backup: Use BlockBackend for I/O

2016-05-24 Thread Changlong Xie


On 05/24/2016 09:47 PM, Kevin Wolf wrote:

+job->target = blk_new();

blk_new(errp);

Re: [Qemu-devel] [PATCH v2 08/14] mirror: Use BlockBackend for I/O

2016-05-24 Thread Changlong Xie


On 05/24/2016 09:47 PM, Kevin Wolf wrote:

+s->target = blk_new();

blk_new(errp);

Re: [Qemu-devel] [PATCH v2 13/14] commit: Use BlockBackend for I/O

2016-05-24 Thread Changlong Xie


On 05/24/2016 09:47 PM, Kevin Wolf wrote:

+s->base = blk_new();

blk_new(errp);

+blk_insert_bs(s->base, base);
+
+s->top = blk_new();

blk_new(errp);

Re: [Qemu-devel] [PATCH v2 04/14] block: Convert block job core to BlockBackend

2016-05-24 Thread Changlong Xie


On 05/24/2016 09:47 PM, Kevin Wolf wrote:

+blk = blk_new();

blk_new(errp);

Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume

2016-05-24 Thread Alex Williamson

On Tue, 24 May 2016 13:49:12 +0300
"Michael S. Tsirkin"  wrote:

> On Tue, Apr 26, 2016 at 08:48:15AM -0600, Alex Williamson wrote:
> > I think that means that if we want to switch from a
> > simple halt-on-error to a mechanism for the guest to handle recovery,
> > we need to disable access to the device between being notified that the
> > error occurred and being notified to resume.  
> 
> But this isn't what happens on bare metal.
> Errors are reported asynchronously and host might access the device
> meanwhile.  These accesses might or might not trigger more errors, but
> fundamentally this should not matter too much as device is going to be
> reset.

Bare metal also doesn't have a hypervisor underneath performing a PCI
bus reset, there's only one OS trying to control the device at a time,
so we have some clear differences from bare metal that I don't know we
can avoid.  The thought here was that we need to notify the guest at the
earliest point we can, but let the host recovery run to completion
before allowing the user to interact with the device.  Perhaps there is
no need to block region access to the device (ie. config space & BAR
resources), but I think we do need to somehow synchronize the bus resets
or else we get situations like that observed previously where the bus is
still in reset while userspace trys to proceed with using it.

The next question then would be whether that's QEMU's job or something
that should be done in the host kernel.  It's been proposed to add yet
another eventfd for the kernel vfio-pci to signal QEMU when a resume
notification has occured, but perhaps the better approach would be for
the hot reset ioctl (and base reset ioctl) to handle this situation more
transparently.  We could immediately return -EAGAIN and allow QEMU to
delay itself for any reset ioctl received after the AER error detected
event, but before the resume event.  We could also allow some sort of
timeout, that the ioctl might enter an interruptible sleep, woken on
the resume notification or timeout.  That sounds a bit better to me as
the specification of what's allowed between the error detected
notification and the resume notification is otherwise pretty poorly
defined.  Do you think we can run completely asynchronous, letting the
host and guest bus resets race?  Thanks,

Alex

[Qemu-devel] [Bug 1585433] [NEW] if docker-volume-size of baymodel lessthan 3, bay cann't create

2016-05-24 Thread Michael liu

Public bug reported:

magnum is running on centos7,


magnum baymodel-create --name k8sbaymodel5 --image-id fedora-23-atomic-20160405 
--keypair-id testkey --external-network-id public --dns-nameserver 8.8.8.8 
--flavor-id m1.small --coe kubernetes --docker-volume-size 5

magnum bay-create --name k8sbay5 --baymodel k8sbaymodel5 --node-count 1

Execute the above command can get a completed bay,but when docker-
volume-size is 1 or 2,the status of bay is FAILED.

** Affects: qemu
 Importance: Undecided
 Status: New

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1585433

Title:
  if  docker-volume-size of baymodel lessthan 3, bay cann't create

Status in QEMU:
  New

Bug description:
  magnum is running on centos7,

  
  magnum baymodel-create --name k8sbaymodel5 --image-id 
fedora-23-atomic-20160405 --keypair-id testkey --external-network-id public 
--dns-nameserver 8.8.8.8 --flavor-id m1.small --coe kubernetes 
--docker-volume-size 5

  magnum bay-create --name k8sbay5 --baymodel k8sbaymodel5 --node-count
  1

  Execute the above command can get a completed bay,but when docker-
  volume-size is 1 or 2,the status of bay is FAILED.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1585433/+subscriptions

[Qemu-devel] [Bug 1585432] [NEW] magnum coe-service-list returns error

2016-05-24 Thread Michael liu

Public bug reported:

magnum running on centos7,

I didn't create any services on k8s cluster,

but I got the result as follows:

[root@localhost ~(keystone_admin)]# magnum coe-service-list --bay k8sbay3
ERROR: Field `ports[0][node_port]' cannot be None (HTTP 500) (Request-ID: 
req-c66b68dd-5437-47fa-a389-7cb56a262fa5)

** Affects: qemu
 Importance: Undecided
 Status: New

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1585432

Title:
  magnum coe-service-list  returns error

Status in QEMU:
  New

Bug description:
  magnum running on centos7,

  I didn't create any services on k8s cluster,

  but I got the result as follows:

  [root@localhost ~(keystone_admin)]# magnum coe-service-list --bay k8sbay3
  ERROR: Field `ports[0][node_port]' cannot be None (HTTP 500) (Request-ID: 
req-c66b68dd-5437-47fa-a389-7cb56a262fa5)

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1585432/+subscriptions

Re: [Qemu-devel] [PATCH v2 13/14] commit: Use BlockBackend for I/O

2016-05-24 Thread Eric Blake

On 05/24/2016 07:47 AM, Kevin Wolf wrote:
> This changes the commit block job to use the job's BlockBackend for
> performing its I/O. job->bs isn't used by the commit code any more
> afterwards.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/commit.c | 53 +
>  1 file changed, 33 insertions(+), 20 deletions(-)
> 

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH v2 12/14] backup: Use BlockBackend for I/O

2016-05-24 Thread Eric Blake

On 05/24/2016 07:47 AM, Kevin Wolf wrote:
> This changes the backup block job to use the job's BlockBackend for
> performing its I/O. job->bs isn't used by the backup code any more
> afterwards.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/backup.c| 46 +-
>  block/io.c|  9 -
>  blockdev.c|  4 +---
>  include/block/block.h |  2 --
>  trace-events  |  1 -
>  5 files changed, 22 insertions(+), 40 deletions(-)
> 

>  
> -if (is_write_notifier) {
> -ret = bdrv_co_readv_no_serialising(bs,
> -   start * sectors_per_cluster,
> -   n, &bounce_qiov);
> -} else {
> -ret = bdrv_co_readv(bs, start * sectors_per_cluster, n,
> -&bounce_qiov);
> -}
> +ret = blk_co_preadv(blk, start * job->cluster_size,
> +bounce_qiov.size, &bounce_qiov,
> +is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);

See my question earlier in the series about whether we want a redundant
size parameter in blk_co_preadv() now that it is public.

>  if (ret < 0) {
>  trace_backup_do_cow_read_fail(job, start, ret);
>  if (error_is_read) {
> @@ -155,13 +150,11 @@ static int coroutine_fn backup_do_cow(BackupBlockJob 
> *job,
>  }
>  
>  if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
> -ret = bdrv_co_write_zeroes(job->target,
> -   start * sectors_per_cluster,
> -   n, BDRV_REQ_MAY_UNMAP);
> +ret = blk_co_write_zeroes(job->target, start * job->cluster_size,
> +  n * BDRV_SECTOR_SIZE, 
> BDRV_REQ_MAY_UNMAP);

More rebasing fun.

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH v2 08/14] mirror: Use BlockBackend for I/O

2016-05-24 Thread Eric Blake

On 05/24/2016 07:47 AM, Kevin Wolf wrote:
> This changes the mirror block job to use the job's BlockBackend for
> performing its I/O. job->bs isn't used by the mirroring code any more
> afterwards.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/mirror.c | 74 
> --
>  blockdev.c |  4 +---
>  2 files changed, 42 insertions(+), 36 deletions(-)
> 

> @@ -295,18 +297,19 @@ static void mirror_do_zero_or_discard(MirrorBlockJob *s,
>  s->in_flight++;
>  s->sectors_in_flight += nb_sectors;
>  if (is_discard) {
> -bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
> - mirror_write_complete, op);
> +blk_aio_discard(s->target, sector_num, op->nb_sectors,
> +mirror_write_complete, op);

Eventually, discard should be converted to bytes. (Did I just get
volunteered?)

>  } else {
> -bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
> -  s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
> -  mirror_write_complete, op);
> +blk_aio_write_zeroes(s->target, sector_num * BDRV_SECTOR_SIZE,
> + op->nb_sectors * BDRV_SECTOR_SIZE,
> + s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
> + mirror_write_complete, op);

Conflicts with the write_zeroes stuff I just posted, let me know if I
need to rebase mine.

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH 2/2] memory: Drop FlatRange.romd_mode

2016-05-24 Thread Fam Zheng

On Tue, 05/24 19:47, Laszlo Ersek wrote:
> Which I think satisfies (a->mr == b->mr), but falsifies (a->romd_mode ==
> b->romd_mode).
> 
> In effect, the patch seems to allow merging and equality between FlatRange
> objects when they only differ in romd_mode, and that's wrong.
> 
> Given that the cover letter for this series says "memory: Dead code
> removals", I'm requesting that this patch be simply reverted.

Yes, it is a mistake, let's revert it. Thanks a lot for the analysis!

Fam

[Qemu-devel] [PATCH v6 13/15] qht: add test-qht-par to invoke qht-bench from 'check' target

2016-05-24 Thread Emilio G. Cota

Signed-off-by: Emilio G. Cota 
---
 tests/.gitignore |  1 +
 tests/Makefile   |  5 -
 tests/test-qht-par.c | 56 
 3 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 tests/test-qht-par.c

diff --git a/tests/.gitignore b/tests/.gitignore
index d19023e..840ea39 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -52,6 +52,7 @@ test-qemu-opts
 test-qdist
 test-qga
 test-qht
+test-qht-par
 test-qmp-commands
 test-qmp-commands.h
 test-qmp-event
diff --git a/tests/Makefile b/tests/Makefile
index 176bbd8..b4e4e21 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -74,6 +74,8 @@ check-unit-y += tests/test-qdist$(EXESUF)
 gcov-files-test-qdist-y = util/qdist.c
 check-unit-y += tests/test-qht$(EXESUF)
 gcov-files-test-qht-y = util/qht.c
+check-unit-y += tests/test-qht-par$(EXESUF)
+gcov-files-test-qht-par-y = util/qht.c
 check-unit-y += tests/test-bitops$(EXESUF)
 check-unit-$(CONFIG_HAS_GLIB_SUBPROCESS_TESTS) += 
tests/test-qdev-global-props$(EXESUF)
 check-unit-y += tests/check-qom-interface$(EXESUF)
@@ -398,7 +400,7 @@ test-obj-y = tests/check-qint.o tests/check-qstring.o 
tests/check-qdict.o \
tests/test-opts-visitor.o tests/test-qmp-event.o \
tests/rcutorture.o tests/test-rcu-list.o \
tests/test-qdist.o \
-   tests/test-qht.o tests/qht-bench.o
+   tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o
 
 $(test-obj-y): QEMU_INCLUDES += -Itests
 QEMU_CFLAGS += -I$(SRC_PATH)/tests
@@ -439,6 +441,7 @@ tests/rcutorture$(EXESUF): tests/rcutorture.o 
$(test-util-obj-y)
 tests/test-rcu-list$(EXESUF): tests/test-rcu-list.o $(test-util-obj-y)
 tests/test-qdist$(EXESUF): tests/test-qdist.o $(test-util-obj-y)
 tests/test-qht$(EXESUF): tests/test-qht.o $(test-util-obj-y)
+tests/test-qht-par$(EXESUF): tests/test-qht-par.o tests/qht-bench$(EXESUF) 
$(test-util-obj-y)
 tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y)
 
 tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
diff --git a/tests/test-qht-par.c b/tests/test-qht-par.c
new file mode 100644
index 000..fc0cb23
--- /dev/null
+++ b/tests/test-qht-par.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2016, Emilio G. Cota 
+ *
+ * License: GNU GPL, version 2 or later.
+ *   See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+
+#define TEST_QHT_STRING "tests/qht-bench 1>/dev/null 2>&1 -R -S0.1 -D1 -N1"
+
+static void test_qht(int n_threads, int update_rate, int duration)
+{
+char *str;
+int rc;
+
+str = g_strdup_printf(TEST_QHT_STRING "-n %d -u %d -d %d",
+  n_threads, update_rate, duration);
+rc = system(str);
+g_free(str);
+g_assert_cmpint(rc, ==, 0);
+}
+
+static void test_2th0u1s(void)
+{
+test_qht(2, 0, 1);
+}
+
+static void test_2th20u1s(void)
+{
+test_qht(2, 20, 1);
+}
+
+static void test_2th0u5s(void)
+{
+test_qht(2, 0, 5);
+}
+
+static void test_2th20u5s(void)
+{
+test_qht(2, 20, 5);
+}
+
+int main(int argc, char *argv[])
+{
+g_test_init(&argc, &argv, NULL);
+
+if (g_test_quick()) {
+g_test_add_func("/qht/parallel/2threads-0%updates-1s", test_2th0u1s);
+g_test_add_func("/qht/parallel/2threads-20%updates-1s", test_2th20u1s);
+} else {
+g_test_add_func("/qht/parallel/2threads-0%updates-5s", test_2th0u5s);
+g_test_add_func("/qht/parallel/2threads-20%updates-5s", test_2th20u5s);
+}
+return g_test_run();
+}
-- 
2.5.0

[Qemu-devel] [PATCH v6 00/15] tb hash improvements

2016-05-24 Thread Emilio G. Cota

v5: https://lists.gnu.org/archive/html/qemu-devel/2016-05/msg02366.html

v6 applies cleanly on top of tcg-next (8b1fe3f4 "cpu-exec:
Clean up 'interrupt_request' reloading", tagged "pull-tcg-20160512").

Changes from v5, mostly from Sergey's review:

- processor.h: use #ifdef #elif throughout the file

- tb_hash_func: use uint32 for 'flags' param

- tb_hash_func5: do 'foo >> 32' instead of 'foo >> 31 >> 1', since foo
  is a u64.

- thread.h:
  * qemu_spin_locked: remove acquire semantics; simply use atomic_read().
  * qemu_spin_trylock: return bool instead of 0 or -EBUSY; this saves
a branch.
  * qemu_spin:
+ use __sync_lock_test_and_set and __sync_lock_release; drop
  the patches touching atomic.h.
+ add unlikely() hint to "while (test_and_set)"; this gives a small
  speedup under no contention.

- qht:
  * merge the parallel-writes patch into the QHT patch.
[Richard: I dropped your reviewed-by since the patch changed
 quite a bit.]
  * drop unneeded #includes from qht.h
  * document qht.h using kerneldoc.
  * use unsigned int for storing the seqlock version.
  * fix a couple of typos in the comments at the top of qht.c.
  * explain better the "no duplicated pointer" policy: while trying to
insert an already-existing hash-pointer pair is OK (insert will
just return false), it's not OK to insert different hash-pointer
pairs that share the same pointer value, but not the hashes.
  * Add comment about lookups having to be done in an RCU read-critical
section.
  * remove map->stale; simply check ht->map before and after acquiring
a bucket lock.
  * only use atomic_read/set on bucket pointers, not hashes. Reading
partially-updated hashes is OK, since we'll retry anyway thanks
to the seqlock. Add a comment regarding this at the top of struct
qht_bucket.
  * s/b->n/b->n_buckets/
  * define qht_debug_assert, enabled #ifdef QHT_DEBUG. Use it instead of
assert(), except in one case (slow path) where g_assert_cmpuint is
convenient.
  * use a mutex for ht->lock instead of a spinlock. This makes the resize
code simpler, since holding ht->lock for a bit of time is OK now;
other threads won't be busy-waiting. Document that ht->lock needs
to be grabbed before b->lock.
  * use atomic_rcu_read/set instead of open-coding them.
  * qht_remove: only clear out b->hashes[] and b->pointers[] if they belong
to what was the last entry in the chain.
  * qht_remove: add debug assert against inserting a NULL pointer.

Thanks,

Emilio

[Qemu-devel] [PATCH v6 12/15] qht: add qht-bench, a performance benchmark

2016-05-24 Thread Emilio G. Cota

This serves as a performance benchmark as well as a stress test
for QHT. We can tweak quite a number of things, including the
number of resize threads and how frequently resizes are triggered.

A performance comparison of QHT vs CLHT[1] and ck_hs[2] using
this same benchmark program can be found here:
  http://imgur.com/a/0Bms4

The tests are run on a 64-core AMD Opteron 6376.

Note that ck_hs's performance drops significantly as writes go
up, since it requires an external lock (I used a ck_spinlock)
around every write.

Also, note that CLHT instead of using a seqlock, relies on an
allocator that does not ever return the same address during the
same read-critical section. This gives it a slight performance
advantage over QHT on read-heavy workloads, since the seqlock
writes aren't there.

[1] CLHT: https://github.com/LPD-EPFL/CLHT
  https://infoscience.epfl.ch/record/207109/files/ascy_asplos15.pdf

[2] ck_hs: http://concurrencykit.org/
   http://backtrace.io/blog/blog/2015/03/13/workload-specialization/

A few of those plots are shown in text here, since that site
might not be online forever. Throughput is on Mops/s on the Y axis.

 200K keys, 0 % updates

  450 ++--+--+--+---+---+---+---+--+---+--++
  |   +  +  +   +   +   +   +  +  +N+  |
  400 ++   ---+E+ ++
  |   +++  |
  350 ++  9 ++--+--++   --+E+-+H+ ++
  | |  +H+- | -+N+    +++  |
  300 ++  8 ++ +E+ ++ -+E+  --+H+ ++
  | |  +++  | -+N+-+H+--   |
  250 ++  7 ++--+--++  +++-+E+++
  200 ++1 -+E+-+H+++
  |    qht +-E--+  |
  150 ++  -+E+clht +-H--+ ++
  |     ck +-N--+  |
  100 ++   +E+++
  ||
   50 ++   -+E+   ++
  |   +E+E+  +  +   +   +   +   +  +   +   |
0 ++--E--+--+---+---+---+---+--+---+--++
  1  8  16  24  32  40  48 56  64
Number of threads

 200K keys, 1 % updates

  350 ++--+--+--+---+---+---+---+--+---+--++
  |   +  +  +   +   +   +   +  + -+E+  |
  300 ++ -+H+ ++
  |   +E+--|
  |   9 ++--+--++  +++ |
  250 ++|  +E+   -- | -+E+++
  |   8 ++ --  ++  |
  200 ++|  +++- |  +++  ---+E+++
  |   7 ++--N--++ -+E+--   qht +-E--+  |
  | 1  +++clht +-H--+  |
  150 ++  -+E+  ck +-N--+ ++
  |    |
  100 ++   +E+++
  ||
  |-+E+|
   50 +++H+-+N++N+-+N+--  ++
  |   +E+E+  +  +   +  +N+-+N+-+N++N+-+N+  |
0 ++--E--+--+---+---+---+---+--+---+--++
  1  8  16  24  32  40  48 56  64
Number of threads

 200K keys, 20 % updates

  300 ++--+--+--+---+---+---+---+--+---+--++
  |   +  +  +   +   +   +   +  +   +   |
  |  -+H+  |
  250 ++  ++
  |   9 ++--+--++   --+H+  ---+E+  |
  |   8 ++ +H+--   ++ -+H++E+--|
  200 ++|  +E+--| -+E+--  +++ ++
  |   7 ++  +  ++   ---+H+ +++ qht +-E--+  |
  150 ++  6 ++--N--

[Qemu-devel] [PATCH v6 01/15] compiler.h: add QEMU_ALIGNED() to enforce struct alignment

2016-05-24 Thread Emilio G. Cota

Reviewed-by: Richard Henderson 
Reviewed-by: Alex Bennée 
Signed-off-by: Emilio G. Cota 
---
 include/qemu/compiler.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
index 8f1cc7b..b64f899 100644
--- a/include/qemu/compiler.h
+++ b/include/qemu/compiler.h
@@ -41,6 +41,8 @@
 # define QEMU_PACKED __attribute__((packed))
 #endif
 
+#define QEMU_ALIGNED(X) __attribute__((aligned(X)))
+
 #ifndef glue
 #define xglue(x, y) x ## y
 #define glue(x, y) xglue(x, y)
-- 
2.5.0

[Qemu-devel] [PATCH v6 14/15] tb hash: track translated blocks with qht

2016-05-24 Thread Emilio G. Cota

Having a fixed-size hash table for keeping track of all translation blocks
is suboptimal: some workloads are just too big or too small to get maximum
performance from the hash table. The MRU promotion policy helps improve
performance when the hash table is a little undersized, but it cannot
make up for severely undersized hash tables.

Furthermore, frequent MRU promotions result in writes that are a scalability
bottleneck. For scalability, lookups should only perform reads, not writes.
This is not a big deal for now, but it will become one once MTTCG matures.

The appended fixes these issues by using qht as the implementation of
the TB hash table. This solution is superior to other alternatives considered,
namely:

- master: implementation in QEMU before this patchset
- xxhash: before this patch, i.e. fixed buckets + xxhash hashing + MRU.
- xxhash-rcu: fixed buckets + xxhash + RCU list + MRU.
  MRU is implemented here by adding an intermediate struct
  that contains the u32 hash and a pointer to the TB; this
  allows us, on an MRU promotion, to copy said struct (that is not
  at the head), and put this new copy at the head. After a grace
  period, the original non-head struct can be eliminated, and
  after another grace period, freed.
- qht-fixed-nomru: fixed buckets + xxhash + qht without auto-resize +
   no MRU for lookups; MRU for inserts.
The appended solution is the following:
- qht-dyn-nomru: dynamic number of buckets + xxhash + qht w/ auto-resize +
 no MRU for lookups; MRU for inserts.

The plots below compare the considered solutions. The Y axis shows the
boot time (in seconds) of a debian jessie image with arm-softmmu; the X axis
sweeps the number of buckets (or initial number of buckets for qht-autoresize).
The plots in PNG format (and with errorbars) can be seen here:
  http://imgur.com/a/Awgnq

Each test runs 5 times, and the entire QEMU process is pinned to a
single core for repeatability of results.

Host: Intel Xeon E5-2690

  28 +++-+-+-+++
 A*+ + + master **A*** +
  27 ++* xxhash ##B###++
 |  A**A**   xxhash-rcu $$C$$$ |
  26 C$$  A**A**qht-fixed-nomru*%%D%%%++
 D%%$$  A**A**A*qht-dyn-mru A*EA
  25 ++ %%$$  qht-dyn-nomru &&F&&&++
 B#%   |
  24 ++#C$++
 |  B###  $|
 |  ## C$$ |
  23 ++   #   C$$ ++
 | B##   C$$%%%D
  22 ++  %B##   C$$C$$C$$C$$C$$C
 |D%%B##  @E@@%%%D%%%@@@E@@E
  21 E@@E@@F&&&@@@E@@@&&&D%%B##B##B##B##B##B
 + E@@@   F&&&   +  E@ +  F&&&   + +
  20 +++-+-+-+++
 141618202224
 log2 number of buckets

 Host: Intel i7-4790K

  14.5 ++++-++++
   A**   ++ +master **A*** +
14 ++ ** xxhash ##B###++
  13.5 ++   **   xxhash-rcu $$C$$$++
   |qht-fixed-nomru %%D%%% |
13 ++ A**   qht-dyn-mru @@E@@@++
   | A*A**A** qht-dyn-nomru &&F&&& |
  12.5 C$$   A**A**A*A*****A
12 ++ $$A***  ++
   D%%% $$ |
  11.5 ++  %% ++
   B###  %C$$  |
11 ++  ## D% C$   ++
   | #  %  C$$ |
  10.5 F&&B##D%   C$$C$$C$$C$C$$$$$C
10 E@@E@@B#B##B##E@@E@@@%%%D%D%%%###B##B
   + F&&  D%%B##B##B#B###@@@D%%%   +
   9.5 ++---

[Qemu-devel] [PATCH v6 10/15] qht: QEMU's fast, resizable and scalable Hash Table

2016-05-24 Thread Emilio G. Cota

This is a fast, scalable chained hash table with optional auto-resizing, 
allowing
reads that are concurrent with reads, and reads/writes that are concurrent
with writes to separate buckets.

A hash table with these features will be necessary for the scalability
of the ongoing MTTCG work; before those changes arrive we can already
benefit from the single-threaded speedup that qht also provides.

Signed-off-by: Emilio G. Cota 
---
 include/qemu/qht.h | 183 
 util/Makefile.objs |   1 +
 util/qht.c | 837 +
 3 files changed, 1021 insertions(+)
 create mode 100644 include/qemu/qht.h
 create mode 100644 util/qht.c

diff --git a/include/qemu/qht.h b/include/qemu/qht.h
new file mode 100644
index 000..aec60aa
--- /dev/null
+++ b/include/qemu/qht.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (C) 2016, Emilio G. Cota 
+ *
+ * License: GNU GPL, version 2 or later.
+ *   See the COPYING file in the top-level directory.
+ */
+#ifndef QEMU_QHT_H
+#define QEMU_QHT_H
+
+#include "qemu/osdep.h"
+#include "qemu/seqlock.h"
+#include "qemu/thread.h"
+#include "qemu/qdist.h"
+
+struct qht {
+struct qht_map *map;
+QemuMutex lock; /* serializes setters of ht->map */
+unsigned int mode;
+};
+
+/**
+ * struct qht_stats - Statistics of a QHT
+ * @head_buckets: number of head buckets
+ * @used_head_buckets: number of non-empty head buckets
+ * @entries: total number of entries
+ * @chain: frequency distribution representing the number of buckets in each
+ * chain, excluding empty chains.
+ * @occupancy: frequency distribution representing chain occupancy rate.
+ * Valid range: from 0.0 (empty) to 1.0 (full occupancy).
+ *
+ * An entry is a pointer-hash pair.
+ * Each bucket can host several entries.
+ * Chains are chains of buckets, whose first link is always a head bucket.
+ */
+struct qht_stats {
+size_t head_buckets;
+size_t used_head_buckets;
+size_t entries;
+struct qdist chain;
+struct qdist occupancy;
+};
+
+typedef bool (*qht_lookup_func_t)(const void *obj, const void *userp);
+typedef void (*qht_iter_func_t)(struct qht *ht, void *p, uint32_t h, void *up);
+
+#define QHT_MODE_AUTO_RESIZE 0x1 /* auto-resize when heavily loaded */
+
+/**
+ * qht_init - Initialize a QHT
+ * @ht: QHT to be initialized
+ * @n_elems: number of entries the hash table should be optimized for.
+ * @mode: bitmask with OR'ed QHT_MODE_*
+ */
+void qht_init(struct qht *ht, size_t n_elems, unsigned int mode);
+
+/**
+ * qht_destroy - destroy a previously initialized QHT
+ * @ht: QHT to be destroyed
+ *
+ * Call only when there are no readers/writers left.
+ */
+void qht_destroy(struct qht *ht);
+
+/**
+ * qht_insert - Insert a pointer into the hash table
+ * @ht: QHT to insert to
+ * @p: pointer to be inserted
+ * @hash: hash corresponding to @p
+ *
+ * Attempting to insert a NULL @p is a bug.
+ * Inserting the same pointer @p with different @hash values is a bug.
+ *
+ * Returns true on sucess.
+ * Returns false if the @p-@hash pair already exists in the hash table.
+ */
+bool qht_insert(struct qht *ht, void *p, uint32_t hash);
+
+/**
+ * qht_lookup - Look up a pointer in a QHT
+ * @ht: QHT to be looked up
+ * @func: function to compare existing pointers against @userp
+ * @userp: pointer to pass to @func
+ * @hash: hash of the pointer to be looked up
+ *
+ * Needs to be called under an RCU read-critical section.
+ *
+ * The user-provided @func compares pointers in QHT against @userp.
+ * If the function returns true, a match has been found.
+ *
+ * Returns the corresponding pointer when a match is found.
+ * Returns NULL otherwise.
+ */
+void *qht_lookup(struct qht *ht, qht_lookup_func_t func, const void *userp,
+ uint32_t hash);
+
+/**
+ * qht_remove - remove a pointer from the hash table
+ * @ht: QHT to remove from
+ * @p: pointer to be removed
+ * @hash: hash corresponding to @p
+ *
+ * Attempting to remove a NULL @p is a bug.
+ *
+ * Just-removed @p pointers cannot be immediately freed; they need to remain
+ * valid until the end of the RCU grace period in which qht_remove() is called.
+ * This guarantees that concurrent lookups will always compare against valid
+ * data.
+ *
+ * Returns true on success.
+ * Returns false if the @p-@hash pair was not found.
+ */
+bool qht_remove(struct qht *ht, const void *p, uint32_t hash);
+
+/**
+ * qht_reset - reset a QHT
+ * @ht: QHT to be reset
+ *
+ * All entries in the hash table are reset. No resizing is performed.
+ *
+ * If concurrent readers may exist, the objects pointed to by the hash table
+ * must remain valid for the existing RCU grace period -- see qht_remove().
+ * See also: qht_reset_size()
+ */
+void qht_reset(struct qht *ht);
+
+/**
+ * qht_reset_size - reset and resize a QHT
+ * @ht: QHT to be reset and resized
+ * @n_elems: number of entries the resized hash table should be optimized for.
+ *
+ * Returns true if the resize was necessary and therefore performed

[Qemu-devel] [PATCH v6 11/15] qht: add test program

2016-05-24 Thread Emilio G. Cota

Reviewed-by: Alex Bennée 
Reviewed-by: Richard Henderson 
Signed-off-by: Emilio G. Cota 
---
 tests/.gitignore |   1 +
 tests/Makefile   |   6 ++-
 tests/test-qht.c | 159 +++
 3 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 tests/test-qht.c

diff --git a/tests/.gitignore b/tests/.gitignore
index 7c0d156..ffde5d2 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -50,6 +50,7 @@ test-qdev-global-props
 test-qemu-opts
 test-qdist
 test-qga
+test-qht
 test-qmp-commands
 test-qmp-commands.h
 test-qmp-event
diff --git a/tests/Makefile b/tests/Makefile
index a5af20b..8589b11 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -72,6 +72,8 @@ check-unit-y += tests/test-rcu-list$(EXESUF)
 gcov-files-test-rcu-list-y = util/rcu.c
 check-unit-y += tests/test-qdist$(EXESUF)
 gcov-files-test-qdist-y = util/qdist.c
+check-unit-y += tests/test-qht$(EXESUF)
+gcov-files-test-qht-y = util/qht.c
 check-unit-y += tests/test-bitops$(EXESUF)
 check-unit-$(CONFIG_HAS_GLIB_SUBPROCESS_TESTS) += 
tests/test-qdev-global-props$(EXESUF)
 check-unit-y += tests/check-qom-interface$(EXESUF)
@@ -395,7 +397,8 @@ test-obj-y = tests/check-qint.o tests/check-qstring.o 
tests/check-qdict.o \
tests/test-x86-cpuid.o tests/test-mul64.o tests/test-int128.o \
tests/test-opts-visitor.o tests/test-qmp-event.o \
tests/rcutorture.o tests/test-rcu-list.o \
-   tests/test-qdist.o
+   tests/test-qdist.o \
+   tests/test-qht.o
 
 $(test-obj-y): QEMU_INCLUDES += -Itests
 QEMU_CFLAGS += -I$(SRC_PATH)/tests
@@ -435,6 +438,7 @@ tests/test-int128$(EXESUF): tests/test-int128.o
 tests/rcutorture$(EXESUF): tests/rcutorture.o $(test-util-obj-y)
 tests/test-rcu-list$(EXESUF): tests/test-rcu-list.o $(test-util-obj-y)
 tests/test-qdist$(EXESUF): tests/test-qdist.o $(test-util-obj-y)
+tests/test-qht$(EXESUF): tests/test-qht.o $(test-util-obj-y)
 
 tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\
diff --git a/tests/test-qht.c b/tests/test-qht.c
new file mode 100644
index 000..c8eb930
--- /dev/null
+++ b/tests/test-qht.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (C) 2016, Emilio G. Cota 
+ *
+ * License: GNU GPL, version 2 or later.
+ *   See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+#include "qemu/qht.h"
+
+#define N 5000
+
+static struct qht ht;
+static int32_t arr[N * 2];
+
+static bool is_equal(const void *obj, const void *userp)
+{
+const int32_t *a = obj;
+const int32_t *b = userp;
+
+return *a == *b;
+}
+
+static void insert(int a, int b)
+{
+int i;
+
+for (i = a; i < b; i++) {
+uint32_t hash;
+
+arr[i] = i;
+hash = i;
+
+qht_insert(&ht, &arr[i], hash);
+}
+}
+
+static void rm(int init, int end)
+{
+int i;
+
+for (i = init; i < end; i++) {
+uint32_t hash;
+
+hash = arr[i];
+g_assert_true(qht_remove(&ht, &arr[i], hash));
+}
+}
+
+static void check(int a, int b, bool expected)
+{
+struct qht_stats stats;
+int i;
+
+for (i = a; i < b; i++) {
+void *p;
+uint32_t hash;
+int32_t val;
+
+val = i;
+hash = i;
+p = qht_lookup(&ht, is_equal, &val, hash);
+g_assert_true(!!p == expected);
+}
+qht_statistics_init(&ht, &stats);
+if (stats.used_head_buckets) {
+g_assert_cmpfloat(qdist_avg(&stats.chain), >=, 1.0);
+}
+g_assert_cmpuint(stats.head_buckets, >, 0);
+qht_statistics_destroy(&stats);
+}
+
+static void count_func(struct qht *ht, void *p, uint32_t hash, void *userp)
+{
+unsigned int *curr = userp;
+
+(*curr)++;
+}
+
+static void check_n(size_t expected)
+{
+struct qht_stats stats;
+
+qht_statistics_init(&ht, &stats);
+g_assert_cmpuint(stats.entries, ==, expected);
+qht_statistics_destroy(&stats);
+}
+
+static void iter_check(unsigned int count)
+{
+unsigned int curr = 0;
+
+qht_iter(&ht, count_func, &curr);
+g_assert_cmpuint(curr, ==, count);
+}
+
+static void qht_do_test(unsigned int mode, size_t init_entries)
+{
+qht_init(&ht, 0, mode);
+
+insert(0, N);
+check(0, N, true);
+check_n(N);
+check(-N, -1, false);
+iter_check(N);
+
+rm(101, 102);
+check_n(N - 1);
+insert(N, N * 2);
+check_n(N + N - 1);
+rm(N, N * 2);
+check_n(N - 1);
+insert(101, 102);
+check_n(N);
+
+rm(10, 200);
+check_n(N - 190);
+insert(150, 200);
+check_n(N - 190 + 50);
+insert(10, 150);
+check_n(N);
+
+rm(1, 2);
+check_n(N - 1);
+qht_reset_size(&ht, 0);
+check_n(0);
+check(0, N, false);
+
+qht_destroy(&ht);
+}
+
+static void qht_test(unsigned int mode)
+{
+qht_do_test(mode, 0);
+qht_do_test(mode, 1);
+qht_do_test(mode, 2);
+qht_do_test(mode, 8);
+qht_do_test(mode, 16);
+qht_do_test(mode, 8192);
+qht_do_test(m

[Qemu-devel] [PATCH v6 09/15] qdist: add test program

2016-05-24 Thread Emilio G. Cota

Reviewed-by: Richard Henderson 
Signed-off-by: Emilio G. Cota 
---
 tests/.gitignore   |   1 +
 tests/Makefile |   6 +-
 tests/test-qdist.c | 369 +
 3 files changed, 375 insertions(+), 1 deletion(-)
 create mode 100644 tests/test-qdist.c

diff --git a/tests/.gitignore b/tests/.gitignore
index a06a8ba..7c0d156 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -48,6 +48,7 @@ test-qapi-types.[ch]
 test-qapi-visit.[ch]
 test-qdev-global-props
 test-qemu-opts
+test-qdist
 test-qga
 test-qmp-commands
 test-qmp-commands.h
diff --git a/tests/Makefile b/tests/Makefile
index 9e6..a5af20b 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -70,6 +70,8 @@ check-unit-y += tests/rcutorture$(EXESUF)
 gcov-files-rcutorture-y = util/rcu.c
 check-unit-y += tests/test-rcu-list$(EXESUF)
 gcov-files-test-rcu-list-y = util/rcu.c
+check-unit-y += tests/test-qdist$(EXESUF)
+gcov-files-test-qdist-y = util/qdist.c
 check-unit-y += tests/test-bitops$(EXESUF)
 check-unit-$(CONFIG_HAS_GLIB_SUBPROCESS_TESTS) += 
tests/test-qdev-global-props$(EXESUF)
 check-unit-y += tests/check-qom-interface$(EXESUF)
@@ -392,7 +394,8 @@ test-obj-y = tests/check-qint.o tests/check-qstring.o 
tests/check-qdict.o \
tests/test-qmp-commands.o tests/test-visitor-serialization.o \
tests/test-x86-cpuid.o tests/test-mul64.o tests/test-int128.o \
tests/test-opts-visitor.o tests/test-qmp-event.o \
-   tests/rcutorture.o tests/test-rcu-list.o
+   tests/rcutorture.o tests/test-rcu-list.o \
+   tests/test-qdist.o
 
 $(test-obj-y): QEMU_INCLUDES += -Itests
 QEMU_CFLAGS += -I$(SRC_PATH)/tests
@@ -431,6 +434,7 @@ tests/test-cutils$(EXESUF): tests/test-cutils.o 
util/cutils.o
 tests/test-int128$(EXESUF): tests/test-int128.o
 tests/rcutorture$(EXESUF): tests/rcutorture.o $(test-util-obj-y)
 tests/test-rcu-list$(EXESUF): tests/test-rcu-list.o $(test-util-obj-y)
+tests/test-qdist$(EXESUF): tests/test-qdist.o $(test-util-obj-y)
 
 tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\
diff --git a/tests/test-qdist.c b/tests/test-qdist.c
new file mode 100644
index 000..7625a57
--- /dev/null
+++ b/tests/test-qdist.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (C) 2016, Emilio G. Cota 
+ *
+ * License: GNU GPL, version 2 or later.
+ *   See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include 
+#include "qemu/qdist.h"
+
+#include 
+
+struct entry_desc {
+double x;
+unsigned long count;
+
+/* 0 prints a space, 1-8 prints from qdist_blocks[] */
+int fill_code;
+};
+
+/* See: https://en.wikipedia.org/wiki/Block_Elements */
+static const gunichar qdist_blocks[] = {
+0x2581,
+0x2582,
+0x2583,
+0x2584,
+0x2585,
+0x2586,
+0x2587,
+0x2588
+};
+
+#define QDIST_NR_BLOCK_CODES ARRAY_SIZE(qdist_blocks)
+
+static char *pr_hist(const struct entry_desc *darr, size_t n)
+{
+GString *s = g_string_new("");
+size_t i;
+
+for (i = 0; i < n; i++) {
+int fill = darr[i].fill_code;
+
+if (fill) {
+assert(fill <= QDIST_NR_BLOCK_CODES);
+g_string_append_unichar(s, qdist_blocks[fill - 1]);
+} else {
+g_string_append_c(s, ' ');
+}
+}
+return g_string_free(s, FALSE);
+}
+
+static void
+histogram_check(const struct qdist *dist, const struct entry_desc *darr,
+size_t n, size_t n_bins)
+{
+char *pr = qdist_pr_plain(dist, n_bins);
+char *str = pr_hist(darr, n);
+
+g_assert_cmpstr(pr, ==, str);
+g_free(pr);
+g_free(str);
+}
+
+static void histogram_check_single_full(const struct qdist *dist, size_t 
n_bins)
+{
+struct entry_desc desc = { .fill_code = 8 };
+
+histogram_check(dist, &desc, 1, n_bins);
+}
+
+static void
+entries_check(const struct qdist *dist, const struct entry_desc *darr, size_t 
n)
+{
+size_t i;
+
+for (i = 0; i < n; i++) {
+struct qdist_entry *e = &dist->entries[i];
+
+g_assert_cmpuint(e->count, ==, darr[i].count);
+}
+}
+
+static void
+entries_insert(struct qdist *dist, const struct entry_desc *darr, size_t n)
+{
+size_t i;
+
+for (i = 0; i < n; i++) {
+qdist_add(dist, darr[i].x, darr[i].count);
+}
+}
+
+static void do_test_bin(const struct entry_desc *a, size_t n_a,
+const struct entry_desc *b, size_t n_b)
+{
+struct qdist qda;
+struct qdist qdb;
+
+qdist_init(&qda);
+
+entries_insert(&qda, a, n_a);
+qdist_inc(&qda, a[0].x);
+qdist_add(&qda, a[0].x, -1);
+
+g_assert_cmpuint(qdist_unique_entries(&qda), ==, n_a);
+g_assert_cmpfloat(qdist_xmin(&qda), ==, a[0].x);
+g_assert_cmpfloat(qdist_xmax(&qda), ==, a[n_a - 1].x);
+histogram_check(&qda, a, n_a, 0);
+histogram_check(&qda, a, n_a, n_a);
+
+qdist_bin__internal(&qdb, &qda, n_b);
+g_assert_cmpuint(qdb.n, ==, n_b);
+entries_check(&qd

[Qemu-devel] [PATCH v6 08/15] qdist: add module to represent frequency distributions of data

2016-05-24 Thread Emilio G. Cota

Sometimes it is useful to have a quick histogram to represent a certain
distribution -- for example, when investigating a performance regression
in a hash table due to inadequate hashing.

The appended allows us to easily represent a distribution using Unicode
characters. Further, the data structure keeping track of the distribution
is so simple that obtaining its values for off-line processing is trivial.

Example, taking the last 10 commits to QEMU:

 Characters in commit title  Count
---
 39  1
 48  1
 53  1
 54  2
 57  1
 61  1
 67  1
 78  1
 80  1
qdist_init(&dist);
qdist_inc(&dist, 39);
[...]
qdist_inc(&dist, 80);

char *str = qdist_pr(&dist, 9, QDIST_PR_LABELS);
// -> [39.0,43.6)▂▂ █▂ ▂ ▄[75.4,80.0]
g_free(str);

char *str = qdist_pr(&dist, 4, QDIST_PR_LABELS);
// -> [39.0,49.2)▁█▁▁[69.8,80.0]
g_free(str);

Reviewed-by: Richard Henderson 
Signed-off-by: Emilio G. Cota 
---
 include/qemu/qdist.h |  62 +
 util/Makefile.objs   |   1 +
 util/qdist.c | 386 +++
 3 files changed, 449 insertions(+)
 create mode 100644 include/qemu/qdist.h
 create mode 100644 util/qdist.c

diff --git a/include/qemu/qdist.h b/include/qemu/qdist.h
new file mode 100644
index 000..6d8b701
--- /dev/null
+++ b/include/qemu/qdist.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2016, Emilio G. Cota 
+ *
+ * License: GNU GPL, version 2 or later.
+ *   See the COPYING file in the top-level directory.
+ */
+#ifndef QEMU_QDIST_H
+#define QEMU_QDIST_H
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/bitops.h"
+
+/*
+ * Samples with the same 'x value' end up in the same qdist_entry,
+ * e.g. inc(0.1) and inc(0.1) end up as {x=0.1, count=2}.
+ *
+ * Binning happens only at print time, so that we retain the flexibility to
+ * choose the binning. This might not be ideal for workloads that do not care
+ * much about precision and insert many samples all with different x values;
+ * in that case, pre-binning (e.g. entering both 0.115 and 0.097 as 0.1)
+ * should be considered.
+ */
+struct qdist_entry {
+double x;
+unsigned long count;
+};
+
+struct qdist {
+struct qdist_entry *entries;
+size_t n;
+};
+
+#define QDIST_PR_BORDER BIT(0)
+#define QDIST_PR_LABELS BIT(1)
+/* the remaining options only work if PR_LABELS is set */
+#define QDIST_PR_NODECIMAL  BIT(2)
+#define QDIST_PR_PERCENTBIT(3)
+#define QDIST_PR_100X   BIT(4)
+#define QDIST_PR_NOBINRANGE BIT(5)
+
+void qdist_init(struct qdist *dist);
+void qdist_destroy(struct qdist *dist);
+
+void qdist_add(struct qdist *dist, double x, long count);
+void qdist_inc(struct qdist *dist, double x);
+double qdist_xmin(const struct qdist *dist);
+double qdist_xmax(const struct qdist *dist);
+double qdist_avg(const struct qdist *dist);
+unsigned long qdist_sample_count(const struct qdist *dist);
+size_t qdist_unique_entries(const struct qdist *dist);
+
+/* callers must free the returned string with g_free() */
+char *qdist_pr_plain(const struct qdist *dist, size_t n_groups);
+
+/* callers must free the returned string with g_free() */
+char *qdist_pr(const struct qdist *dist, size_t n_groups, uint32_t opt);
+
+/* Only qdist code and test code should ever call this function */
+void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n);
+
+#endif /* QEMU_QDIST_H */
diff --git a/util/Makefile.objs b/util/Makefile.objs
index a8a777e..702435e 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -32,3 +32,4 @@ util-obj-y += buffer.o
 util-obj-y += timed-average.o
 util-obj-y += base64.o
 util-obj-y += log.o
+util-obj-y += qdist.o
diff --git a/util/qdist.c b/util/qdist.c
new file mode 100644
index 000..3343640
--- /dev/null
+++ b/util/qdist.c
@@ -0,0 +1,386 @@
+/*
+ * qdist.c - QEMU helpers for handling frequency distributions of data.
+ *
+ * Copyright (C) 2016, Emilio G. Cota 
+ *
+ * License: GNU GPL, version 2 or later.
+ *   See the COPYING file in the top-level directory.
+ */
+#include "qemu/qdist.h"
+
+#include 
+#ifndef NAN
+#define NAN (0.0 / 0.0)
+#endif
+
+void qdist_init(struct qdist *dist)
+{
+dist->entries = NULL;
+dist->n = 0;
+}
+
+void qdist_destroy(struct qdist *dist)
+{
+g_free(dist->entries);
+}
+
+static inline int qdist_cmp_double(double a, double b)
+{
+if (a > b) {
+return 1;
+} else if (a < b) {
+return -1;
+}
+return 0;
+}
+
+static int qdist_cmp(const void *ap, const void *bp)
+{
+const struct qdist_entry *a = ap;
+const struct qdist_entry *b = bp;
+
+return qdist_cmp_double(a->x, b->x);
+}
+
+void qdist_add(struct qdist *dist, double x, long count)
+{
+struct qdist_entry *entry = NULL;
+
+if (dist

[Qemu-devel] [PATCH v6 07/15] tb hash: hash phys_pc, pc, and flags with xxhash

2016-05-24 Thread Emilio G. Cota

For some workloads such as arm bootup, tb_phys_hash is performance-critical.
The is due to the high frequency of accesses to the hash table, originated
by (frequent) TLB flushes that wipe out the cpu-private tb_jmp_cache's.
More info:
  https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg05098.html

To dig further into this I modified an arm image booting debian jessie to
immediately shut down after boot. Analysis revealed that quite a bit of time
is unnecessarily spent in tb_phys_hash: the cause is poor hashing that
results in very uneven loading of chains in the hash table's buckets;
the longest observed chain had ~550 elements.

The appended addresses this with two changes:

1) Use xxhash as the hash table's hash function. xxhash is a fast,
   high-quality hashing function.

2) Feed the hashing function with not just tb_phys, but also pc and flags.

This improves performance over using just tb_phys for hashing, since that
resulted in some hash buckets having many TB's, while others getting very few;
with these changes, the longest observed chain on a single hash bucket is
brought down from ~550 to ~40.

Tests show that the other element checked for in tb_find_physical,
cs_base, is always a match when tb_phys+pc+flags are a match,
so hashing cs_base is wasteful. It could be that this is an ARM-only
thing, though. UPDATE:
On Tue, Apr 05, 2016 at 08:41:43 -0700, Richard Henderson wrote:
> The cs_base field is only used by i386 (in 16-bit modes), and sparc (for a TB
> consisting of only a delay slot).
> It may well still turn out to be reasonable to ignore cs_base for hashing.

BTW, after this change the hash table should not be called "tb_hash_phys"
anymore; this is addressed later in this series.

This change gives consistent bootup time improvements. I tested two
host machines:
- Intel Xeon E5-2690: 11.6% less time
- Intel i7-4790K: 19.2% less time

Increasing the number of hash buckets yields further improvements. However,
using a larger, fixed number of buckets can degrade performance for other
workloads that do not translate as many blocks (600K+ for debian-jessie arm
bootup). This is dealt with later in this series.

Reviewed-by: Richard Henderson 
Reviewed-by: Alex Bennée 
Signed-off-by: Emilio G. Cota 
---
 cpu-exec.c |  4 ++--
 include/exec/tb-hash.h |  8 ++--
 translate-all.c| 10 +-
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index 14df1aa..1735032 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -231,13 +231,13 @@ static TranslationBlock *tb_find_physical(CPUState *cpu,
 {
 CPUArchState *env = (CPUArchState *)cpu->env_ptr;
 TranslationBlock *tb, **tb_hash_head, **ptb1;
-unsigned int h;
+uint32_t h;
 tb_page_addr_t phys_pc, phys_page1;

 /* find translated block using physical mappings */
 phys_pc = get_page_addr_code(env, pc);
 phys_page1 = phys_pc & TARGET_PAGE_MASK;
-h = tb_phys_hash_func(phys_pc);
+h = tb_hash_func(phys_pc, pc, flags);

 /* Start at head of the hash entry */
 ptb1 = tb_hash_head = &tcg_ctx.tb_ctx.tb_phys_hash[h];
diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h
index 0f4e8a0..88ccfd1 100644
--- a/include/exec/tb-hash.h
+++ b/include/exec/tb-hash.h
@@ -20,6 +20,9 @@
 #ifndef EXEC_TB_HASH
 #define EXEC_TB_HASH

+#include "exec/exec-all.h"
+#include "exec/tb-hash-xx.h"
+
 /* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for
addresses on the same page.  The top bits are the same.  This allows
TLB invalidation to quickly clear a subset of the hash table.  */
@@ -43,9 +46,10 @@ static inline unsigned int 
tb_jmp_cache_hash_func(target_ulong pc)
| (tmp & TB_JMP_ADDR_MASK));
 }

-static inline unsigned int tb_phys_hash_func(tb_page_addr_t pc)
+static inline
+uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags)
 {
-return (pc >> 2) & (CODE_GEN_PHYS_HASH_SIZE - 1);
+return tb_hash_func5(phys_pc, pc, flags) & (CODE_GEN_PHYS_HASH_SIZE - 1);
 }

 #endif
diff --git a/translate-all.c b/translate-all.c
index b54f472..c48fccb 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -991,12 +991,12 @@ void tb_phys_invalidate(TranslationBlock *tb, 
tb_page_addr_t page_addr)
 {
 CPUState *cpu;
 PageDesc *p;
-unsigned int h;
+uint32_t h;
 tb_page_addr_t phys_pc;

 /* remove the TB from the hash list */
 phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
-h = tb_phys_hash_func(phys_pc);
+h = tb_hash_func(phys_pc, tb->pc, tb->flags);
 tb_hash_remove(&tcg_ctx.tb_ctx.tb_phys_hash[h], tb);

 /* remove the TB from the page list */
@@ -1126,11 +1126,11 @@ static inline void tb_alloc_page(TranslationBlock *tb,
 static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
  tb_page_addr_t phys_page2)
 {
-unsigned int h;
+uint32_t h;
 TranslationBlock **ptb;

-/* add in the physical hash tab

[Qemu-devel] [PATCH v6 02/15] seqlock: remove optional mutex

2016-05-24 Thread Emilio G. Cota

This option is unused; besides, it bloats the struct when not needed.
Let's just let writers define their own locks elsewhere.

Reviewed-by: Alex Bennée 
Reviewed-by: Richard Henderson 
Signed-off-by: Emilio G. Cota 
---
 cpus.c |  2 +-
 include/qemu/seqlock.h | 10 +-
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/cpus.c b/cpus.c
index cbeb1f6..dd86da5 100644
--- a/cpus.c
+++ b/cpus.c
@@ -619,7 +619,7 @@ int cpu_throttle_get_percentage(void)
 
 void cpu_ticks_init(void)
 {
-seqlock_init(&timers_state.vm_clock_seqlock, NULL);
+seqlock_init(&timers_state.vm_clock_seqlock);
 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
cpu_throttle_timer_tick, NULL);
diff --git a/include/qemu/seqlock.h b/include/qemu/seqlock.h
index 70b01fd..e673482 100644
--- a/include/qemu/seqlock.h
+++ b/include/qemu/seqlock.h
@@ -19,22 +19,17 @@
 typedef struct QemuSeqLock QemuSeqLock;
 
 struct QemuSeqLock {
-QemuMutex *mutex;
 unsigned sequence;
 };
 
-static inline void seqlock_init(QemuSeqLock *sl, QemuMutex *mutex)
+static inline void seqlock_init(QemuSeqLock *sl)
 {
-sl->mutex = mutex;
 sl->sequence = 0;
 }
 
 /* Lock out other writers and update the count.  */
 static inline void seqlock_write_lock(QemuSeqLock *sl)
 {
-if (sl->mutex) {
-qemu_mutex_lock(sl->mutex);
-}
 ++sl->sequence;
 
 /* Write sequence before updating other fields.  */
@@ -47,9 +42,6 @@ static inline void seqlock_write_unlock(QemuSeqLock *sl)
 smp_wmb();
 
 ++sl->sequence;
-if (sl->mutex) {
-qemu_mutex_unlock(sl->mutex);
-}
 }
 
 static inline unsigned seqlock_read_begin(QemuSeqLock *sl)
-- 
2.5.0

[Qemu-devel] [PATCH v6 05/15] qemu-thread: add simple test-and-set spinlock

2016-05-24 Thread Emilio G. Cota

From: Guillaume Delbergue 

Signed-off-by: Guillaume Delbergue 
[Rewritten. - Paolo]
Signed-off-by: Paolo Bonzini 
[Emilio's additions: use TAS instead of atomic_xchg; emit acquire/release
 barriers; return bool from trylock; call cpu_relax() while spinning;
 optimize for uncontended locks by acquiring the lock with TAS instead
 of TATAS; add qemu_spin_locked().]
Signed-off-by: Emilio G. Cota 
---
 include/qemu/thread.h | 35 +++
 1 file changed, 35 insertions(+)

diff --git a/include/qemu/thread.h b/include/qemu/thread.h
index bdae6df..c5d71cf 100644
--- a/include/qemu/thread.h
+++ b/include/qemu/thread.h
@@ -1,6 +1,8 @@
 #ifndef __QEMU_THREAD_H
 #define __QEMU_THREAD_H 1
 
+#include "qemu/processor.h"
+#include "qemu/atomic.h"
 
 typedef struct QemuMutex QemuMutex;
 typedef struct QemuCond QemuCond;
@@ -60,4 +62,37 @@ struct Notifier;
 void qemu_thread_atexit_add(struct Notifier *notifier);
 void qemu_thread_atexit_remove(struct Notifier *notifier);
 
+typedef struct QemuSpin {
+int value;
+} QemuSpin;
+
+static inline void qemu_spin_init(QemuSpin *spin)
+{
+__sync_lock_release(&spin->value);
+}
+
+static inline void qemu_spin_lock(QemuSpin *spin)
+{
+while (unlikely(__sync_lock_test_and_set(&spin->value, true))) {
+while (atomic_read(&spin->value)) {
+cpu_relax();
+}
+}
+}
+
+static inline bool qemu_spin_trylock(QemuSpin *spin)
+{
+return __sync_lock_test_and_set(&spin->value, true);
+}
+
+static inline bool qemu_spin_locked(QemuSpin *spin)
+{
+return atomic_read(&spin->value);
+}
+
+static inline void qemu_spin_unlock(QemuSpin *spin)
+{
+__sync_lock_release(&spin->value);
+}
+
 #endif
-- 
2.5.0

[Qemu-devel] [PATCH v6 15/15] translate-all: add tb hash bucket info to 'info jit' dump

2016-05-24 Thread Emilio G. Cota

Examples:

- Good hashing, i.e. tb_hash_func5(phys_pc, pc, flags):
TB count715135/2684354
[...]
TB hash buckets 388775/524288 (74.15% head buckets used)
TB hash occupancy   33.04% avg chain occ. Histogram: [0,10)%|▆ █  
▅▁▃▁▁|[90,100]%
TB hash avg chain   1.017 buckets. Histogram: 1|█▁▁|3

- Not-so-good hashing, i.e. tb_hash_func5(phys_pc, pc, 0):
TB count712636/2684354
[...]
TB hash buckets 344924/524288 (65.79% head buckets used)
TB hash occupancy   31.64% avg chain occ. Histogram: [0,10)%|█ ▆  
▅▁▃▁▂|[90,100]%
TB hash avg chain   1.047 buckets. Histogram: 1|█▁▁▁|4

- Bad hashing, i.e. tb_hash_func5(phys_pc, 0, 0):
TB count702818/2684354
[...]
TB hash buckets 112741/524288 (21.50% head buckets used)
TB hash occupancy   10.15% avg chain occ. Histogram: [0,10)%|█ ▁  
▁|[90,100]%
TB hash avg chain   2.107 buckets. Histogram: [1.0,10.2)|█▁|[83.8,93.0]

- Good hashing, but no auto-resize:
TB count715634/2684354
TB hash buckets 8192/8192 (100.00% head buckets used)
TB hash occupancy   98.30% avg chain occ. Histogram: 
[95.3,95.8)%|▁▁▃▄▃▄▁▇▁█|[99.5,100.0]%
TB hash avg chain   22.070 buckets. Histogram: 
[15.0,16.7)|▁▂▅▄█▅|[30.3,32.0]

Suggested-by: Richard Henderson 
Reviewed-by: Richard Henderson 
Signed-off-by: Emilio G. Cota 
---
 translate-all.c | 36 
 1 file changed, 36 insertions(+)

diff --git a/translate-all.c b/translate-all.c
index 5357737..c8074cf 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -1667,6 +1667,10 @@ void dump_exec_info(FILE *f, fprintf_function 
cpu_fprintf)
 int i, target_code_size, max_target_code_size;
 int direct_jmp_count, direct_jmp2_count, cross_page;
 TranslationBlock *tb;
+struct qht_stats hst;
+uint32_t hgram_opts;
+size_t hgram_bins;
+char *hgram;
 
 target_code_size = 0;
 max_target_code_size = 0;
@@ -1717,6 +1721,38 @@ void dump_exec_info(FILE *f, fprintf_function 
cpu_fprintf)
 direct_jmp2_count,
 tcg_ctx.tb_ctx.nb_tbs ? (direct_jmp2_count * 100) /
 tcg_ctx.tb_ctx.nb_tbs : 0);
+
+qht_statistics_init(&tcg_ctx.tb_ctx.htable, &hst);
+
+cpu_fprintf(f, "TB hash buckets %zu/%zu (%0.2f%% head buckets used)\n",
+hst.used_head_buckets, hst.head_buckets,
+(double)hst.used_head_buckets / hst.head_buckets * 100);
+
+hgram_opts =  QDIST_PR_BORDER | QDIST_PR_LABELS;
+hgram_opts |= QDIST_PR_100X   | QDIST_PR_PERCENT;
+if (qdist_xmax(&hst.occupancy) - qdist_xmin(&hst.occupancy) == 1) {
+hgram_opts |= QDIST_PR_NODECIMAL;
+}
+hgram = qdist_pr(&hst.occupancy, 10, hgram_opts);
+cpu_fprintf(f, "TB hash occupancy   %0.2f%% avg chain occ. Histogram: 
%s\n",
+qdist_avg(&hst.occupancy) * 100, hgram);
+g_free(hgram);
+
+hgram_opts = QDIST_PR_BORDER | QDIST_PR_LABELS;
+hgram_bins = qdist_xmax(&hst.chain) - qdist_xmin(&hst.chain);
+if (hgram_bins > 10) {
+hgram_bins = 10;
+} else {
+hgram_bins = 0;
+hgram_opts |= QDIST_PR_NODECIMAL | QDIST_PR_NOBINRANGE;
+}
+hgram = qdist_pr(&hst.chain, hgram_bins, hgram_opts);
+cpu_fprintf(f, "TB hash avg chain   %0.3f buckets. Histogram: %s\n",
+qdist_avg(&hst.chain), hgram);
+g_free(hgram);
+
+qht_statistics_destroy(&hst);
+
 cpu_fprintf(f, "\nStatistics:\n");
 cpu_fprintf(f, "TB flush count  %d\n", tcg_ctx.tb_ctx.tb_flush_count);
 cpu_fprintf(f, "TB invalidate count %d\n",
-- 
2.5.0

[Qemu-devel] [PATCH v6 03/15] seqlock: rename write_lock/unlock to write_begin/end

2016-05-24 Thread Emilio G. Cota

It is a more appropriate name, now that the mutex embedded
in the seqlock is gone.

Reviewed-by: Alex Bennée 
Reviewed-by: Richard Henderson 
Signed-off-by: Emilio G. Cota 
---
 cpus.c | 28 ++--
 include/qemu/seqlock.h |  4 ++--
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/cpus.c b/cpus.c
index dd86da5..735c9b2 100644
--- a/cpus.c
+++ b/cpus.c
@@ -247,13 +247,13 @@ int64_t cpu_get_clock(void)
 void cpu_enable_ticks(void)
 {
 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
-seqlock_write_lock(&timers_state.vm_clock_seqlock);
+seqlock_write_begin(&timers_state.vm_clock_seqlock);
 if (!timers_state.cpu_ticks_enabled) {
 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 timers_state.cpu_clock_offset -= get_clock();
 timers_state.cpu_ticks_enabled = 1;
 }
-seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+seqlock_write_end(&timers_state.vm_clock_seqlock);
 }
 
 /* disable cpu_get_ticks() : the clock is stopped. You must not call
@@ -263,13 +263,13 @@ void cpu_enable_ticks(void)
 void cpu_disable_ticks(void)
 {
 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
-seqlock_write_lock(&timers_state.vm_clock_seqlock);
+seqlock_write_begin(&timers_state.vm_clock_seqlock);
 if (timers_state.cpu_ticks_enabled) {
 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 timers_state.cpu_clock_offset = cpu_get_clock_locked();
 timers_state.cpu_ticks_enabled = 0;
 }
-seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+seqlock_write_end(&timers_state.vm_clock_seqlock);
 }
 
 /* Correlation between real and virtual time is always going to be
@@ -292,7 +292,7 @@ static void icount_adjust(void)
 return;
 }
 
-seqlock_write_lock(&timers_state.vm_clock_seqlock);
+seqlock_write_begin(&timers_state.vm_clock_seqlock);
 cur_time = cpu_get_clock_locked();
 cur_icount = cpu_get_icount_locked();
 
@@ -313,7 +313,7 @@ static void icount_adjust(void)
 last_delta = delta;
 timers_state.qemu_icount_bias = cur_icount
   - (timers_state.qemu_icount << 
icount_time_shift);
-seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+seqlock_write_end(&timers_state.vm_clock_seqlock);
 }
 
 static void icount_adjust_rt(void *opaque)
@@ -353,7 +353,7 @@ static void icount_warp_rt(void)
 return;
 }
 
-seqlock_write_lock(&timers_state.vm_clock_seqlock);
+seqlock_write_begin(&timers_state.vm_clock_seqlock);
 if (runstate_is_running()) {
 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
  cpu_get_clock_locked());
@@ -372,7 +372,7 @@ static void icount_warp_rt(void)
 timers_state.qemu_icount_bias += warp_delta;
 }
 vm_clock_warp_start = -1;
-seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+seqlock_write_end(&timers_state.vm_clock_seqlock);
 
 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
@@ -397,9 +397,9 @@ void qtest_clock_warp(int64_t dest)
 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 
-seqlock_write_lock(&timers_state.vm_clock_seqlock);
+seqlock_write_begin(&timers_state.vm_clock_seqlock);
 timers_state.qemu_icount_bias += warp;
-seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+seqlock_write_end(&timers_state.vm_clock_seqlock);
 
 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
@@ -466,9 +466,9 @@ void qemu_start_warp_timer(void)
  * It is useful when we want a deterministic execution time,
  * isolated from host latencies.
  */
-seqlock_write_lock(&timers_state.vm_clock_seqlock);
+seqlock_write_begin(&timers_state.vm_clock_seqlock);
 timers_state.qemu_icount_bias += deadline;
-seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+seqlock_write_end(&timers_state.vm_clock_seqlock);
 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 } else {
 /*
@@ -479,11 +479,11 @@ void qemu_start_warp_timer(void)
  * you will not be sending network packets continuously instead of
  * every 100ms.
  */
-seqlock_write_lock(&timers_state.vm_clock_seqlock);
+seqlock_write_begin(&timers_state.vm_clock_seqlock);
 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
 vm_clock_warp_start = clock;
 }
-seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+seqlock_write_end(&timers_state.vm_clock_seqlock);
 timer_mod_anticipate(icount_warp_timer, clock + dea

[Qemu-devel] [PATCH v6 04/15] include/processor.h: define cpu_relax()

2016-05-24 Thread Emilio G. Cota

Taken from the linux kernel.

Reviewed-by: Richard Henderson 
Reviewed-by: Alex Bennée 
Signed-off-by: Emilio G. Cota 
---
 include/qemu/processor.h | 30 ++
 1 file changed, 30 insertions(+)
 create mode 100644 include/qemu/processor.h

diff --git a/include/qemu/processor.h b/include/qemu/processor.h
new file mode 100644
index 000..42bcc99
--- /dev/null
+++ b/include/qemu/processor.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2016, Emilio G. Cota 
+ *
+ * License: GNU GPL, version 2.
+ *   See the COPYING file in the top-level directory.
+ */
+#ifndef QEMU_PROCESSOR_H
+#define QEMU_PROCESSOR_H
+
+#include "qemu/atomic.h"
+
+#if defined(__i386__) || defined(__x86_64__)
+# define cpu_relax() asm volatile("rep; nop" ::: "memory")
+
+#elif defined(__ia64__)
+# define cpu_relax() asm volatile("hint @pause" ::: "memory")
+
+#elif defined(__aarch64__)
+# define cpu_relax() asm volatile("yield" ::: "memory")
+
+#elif defined(__powerpc64__)
+/* set Hardware Multi-Threading (HMT) priority to low; then back to medium */
+# define cpu_relax() asm volatile("or 1, 1, 1;"
+  "or 2, 2, 2;" ::: "memory")
+
+#else
+# define cpu_relax() barrier()
+#endif
+
+#endif /* QEMU_PROCESSOR_H */
-- 
2.5.0

[Qemu-devel] [PATCH v6 06/15] exec: add tb_hash_func5, derived from xxhash

2016-05-24 Thread Emilio G. Cota

This will be used by upcoming changes for hashing the tb hash.

Add this into a separate file to include the copyright notice from
xxhash.

Reviewed-by: Richard Henderson 
Signed-off-by: Emilio G. Cota 
---
 include/exec/tb-hash-xx.h | 94 +++
 1 file changed, 94 insertions(+)
 create mode 100644 include/exec/tb-hash-xx.h

diff --git a/include/exec/tb-hash-xx.h b/include/exec/tb-hash-xx.h
new file mode 100644
index 000..9f3fc05
--- /dev/null
+++ b/include/exec/tb-hash-xx.h
@@ -0,0 +1,94 @@
+/*
+ * xxHash - Fast Hash algorithm
+ * Copyright (C) 2012-2016, Yann Collet
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * + Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * + Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - xxHash source repository : https://github.com/Cyan4973/xxHash
+ */
+#ifndef EXEC_TB_HASH_XX
+#define EXEC_TB_HASH_XX
+
+#include 
+
+#define PRIME32_1   2654435761U
+#define PRIME32_2   2246822519U
+#define PRIME32_3   3266489917U
+#define PRIME32_4668265263U
+#define PRIME32_5374761393U
+
+#define TB_HASH_XX_SEED 1
+
+/*
+ * xxhash32, customized for input variables that are not guaranteed to be
+ * contiguous in memory.
+ */
+static inline
+uint32_t tb_hash_func5(uint64_t a0, uint64_t b0, uint32_t e)
+{
+uint32_t v1 = TB_HASH_XX_SEED + PRIME32_1 + PRIME32_2;
+uint32_t v2 = TB_HASH_XX_SEED + PRIME32_2;
+uint32_t v3 = TB_HASH_XX_SEED + 0;
+uint32_t v4 = TB_HASH_XX_SEED - PRIME32_1;
+uint32_t a = a0 >> 32;
+uint32_t b = a0;
+uint32_t c = b0 >> 32;
+uint32_t d = b0;
+uint32_t h32;
+
+v1 += a * PRIME32_2;
+v1 = rol32(v1, 13);
+v1 *= PRIME32_1;
+
+v2 += b * PRIME32_2;
+v2 = rol32(v2, 13);
+v2 *= PRIME32_1;
+
+v3 += c * PRIME32_2;
+v3 = rol32(v3, 13);
+v3 *= PRIME32_1;
+
+v4 += d * PRIME32_2;
+v4 = rol32(v4, 13);
+v4 *= PRIME32_1;
+
+h32 = rol32(v1, 1) + rol32(v2, 7) + rol32(v3, 12) + rol32(v4, 18);
+h32 += 20;
+
+h32 += e * PRIME32_3;
+h32  = rol32(h32, 17) * PRIME32_4;
+
+h32 ^= h32 >> 15;
+h32 *= PRIME32_2;
+h32 ^= h32 >> 13;
+h32 *= PRIME32_3;
+h32 ^= h32 >> 16;
+
+return h32;
+}
+
+#endif /* EXEC_TB_HASH_XX */
-- 
2.5.0

Re: [Qemu-devel] [patch v6 11/12] vfio: register aer resume notification handler for aer resume

2016-05-24 Thread Zhou Jie

So I needn't disable access to the device between being notified that 
the error occurred and being notified to resume.

This will make code simpler.

Am I right?

On 2016/5/24 18:49, Michael S. Tsirkin wrote:

On Tue, Apr 26, 2016 at 08:48:15AM -0600, Alex Williamson wrote:

I think that means that if we want to switch from a
simple halt-on-error to a mechanism for the guest to handle recovery,
we need to disable access to the device between being notified that the
error occurred and being notified to resume.


But this isn't what happens on bare metal.
Errors are reported asynchronously and host might access the device
meanwhile.  These accesses might or might not trigger more errors, but
fundamentally this should not matter too much as device is going to be
reset.

Re: [Qemu-devel] [PATCH RFC kernel] balloon: speed up inflating/deflating process

2016-05-24 Thread Li, Liang Z

> > > > > > > This is grossly inefficient if you only requested a single page.
> > > > > > > And it's also allocating memory very aggressively without
> > > > > > > ever telling the host what is going on.
> > > > > >
> > > > > > If only requested a single page, there is no need  to send the
> > > > > > entire page bitmap, This RFC patch has already considered about
> this.
> > > > >
> > > > > where's that addressed in code?
> > > > >
> > > >
> > > > By record the start_pfn and end_pfn.
> > > >
> > > > The start_pfn & end_pfn will be updated in set_page_bitmap() and
> > > > will be used in the function tell_host():
> > > >
> > > > --
> > > > 
> > > > ---
> > > > +static void set_page_bitmap(struct virtio_balloon *vb, struct
> > > > +page
> > > > +*page) {
> > > > +   unsigned int i;
> > > > +   unsigned long *bitmap = vb->page_bitmap;
> > > > +   unsigned long balloon_pfn = page_to_balloon_pfn(page);
> > > > +
> > > > +   for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++)
> > > > +   set_bit(balloon_pfn + i, bitmap);
> > >
> > > BTW, there's a page size value in header so there is no longer need
> > > to set multiple bits per page.
> >
> > Yes, you are right.
> >
> > >
> > > > +   if (balloon_pfn < vb->start_pfn)
> > > > +   vb->start_pfn = balloon_pfn;
> > > > +   if (balloon_pfn > vb->end_pfn)
> > > > +   vb->end_pfn = balloon_pfn;
> > > > +}
> > >
> > > Sounds good, but you also need to limit by allocated bitmap size.
> >
> > Why should we limit the page bitmap size? Is it no good to send a large
> page bitmap?
> > or to save the memory used for page bitmap? Or some other reason?
> 
> To save memory. First allocating a large bitmap can fail, second this is 
> pinned
> memory that is wasted - it's unused most of the time while guest is running.

Make sense.

> 
> > >
> > > >
> > > > +   unsigned long start_pfn, end_pfn, flags = 0, bmap_len;
> > > > +   struct scatterlist sg[5];
> > > > +
> > > > +   start_pfn = rounddown(vb->start_pfn, BITS_PER_LONG);
> > > > +   end_pfn = roundup(vb->end_pfn, BITS_PER_LONG);
> > > > +   bmap_len = (end_pfn - start_pfn) / BITS_PER_LONG *
> > > sizeof(long);
> > > > +
> > > > +   sg_init_table(sg, 5);
> > > > +   sg_set_buf(&sg[0], &flags, sizeof(flags));
> > > > +   sg_set_buf(&sg[1], &start_pfn, sizeof(start_pfn));
> > > > +   sg_set_buf(&sg[2], &page_shift, sizeof(page_shift));
> > > > +   sg_set_buf(&sg[3], &bmap_len, sizeof(bmap_len));
> > > > +   sg_set_buf(&sg[4], vb->page_bitmap +
> > > > +(start_pfn / BITS_PER_LONG), bmap_len);
> > >
> > > Looks wrong. start_pfn should start at offset 0 I think ...
> >
> > I don't know what is wrong here, could you tell me why?
> >
> > Thanks!
> >
> > Liang
> 
> start_pfn should mean "bit 0 in bitmap refers to pfn X".
> So it does not make sense to also add it as offset within bitmap.

I got it, but in my code, it's correct, because the page_bitmap is
range from pfn 0 to max_pfn. 
It should be changed if we are going to use a small page bitmap.

Thanks!

Liang


> 
> --
> MST

Re: [Qemu-devel] [PATCH RFC kernel] balloon: speed up inflating/deflating process

2016-05-24 Thread Li, Liang Z

> On Tue, May 24, 2016 at 02:36:08PM +, Li, Liang Z wrote:
> > > > > > > This can be pre-initialized, correct?
> > > > > >
> > > > > > pre-initialized? I am not quite understand your mean.
> > > > >
> > > > > I think you can maintain sg as part of device state and init sg
> > > > > with the
> > > bitmap.
> > > > >
> > > >
> > > > I got it.
> > > >
> > > > > > > This is grossly inefficient if you only requested a single page.
> > > > > > > And it's also allocating memory very aggressively without
> > > > > > > ever telling the host what is going on.
> > > > > >
> > > > > > If only requested a single page, there is no need  to send the
> > > > > > entire page bitmap, This RFC patch has already considered about
> this.
> > > > >
> > > > > where's that addressed in code?
> > > > >
> > > >
> > > > By record the start_pfn and end_pfn.
> > > >
> > > > The start_pfn & end_pfn will be updated in set_page_bitmap() and
> > > > will be used in the function tell_host():
> > > >
> > > > --
> > > > 
> > > > ---
> > > > +static void set_page_bitmap(struct virtio_balloon *vb, struct
> > > > +page
> > > > +*page) {
> > > > +   unsigned int i;
> > > > +   unsigned long *bitmap = vb->page_bitmap;
> > > > +   unsigned long balloon_pfn = page_to_balloon_pfn(page);
> > > > +
> > > > +   for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++)
> > > > +   set_bit(balloon_pfn + i, bitmap);
> > >
> > > BTW, there's a page size value in header so there is no longer need
> > > to set multiple bits per page.
> >
> > Yes, you are right.
> >
> > >
> > > > +   if (balloon_pfn < vb->start_pfn)
> > > > +   vb->start_pfn = balloon_pfn;
> > > > +   if (balloon_pfn > vb->end_pfn)
> > > > +   vb->end_pfn = balloon_pfn;
> > > > +}
> > >
> > > Sounds good, but you also need to limit by allocated bitmap size.
> >
> > Why should we limit the page bitmap size? Is it no good to send a large
> page bitmap?
> > or to save the memory used for page bitmap? Or some other reason?
> 
> To save memory. First allocating a large bitmap can fail, second this is 
> pinned
> memory that is wasted - it's unused most of the time while guest is running.
> 
> > >
> > > >
> > > > +   unsigned long start_pfn, end_pfn, flags = 0, bmap_len;
> > > > +   struct scatterlist sg[5];
> > > > +
> > > > +   start_pfn = rounddown(vb->start_pfn, BITS_PER_LONG);
> > > > +   end_pfn = roundup(vb->end_pfn, BITS_PER_LONG);
> > > > +   bmap_len = (end_pfn - start_pfn) / BITS_PER_LONG *
> > > sizeof(long);
> > > > +
> > > > +   sg_init_table(sg, 5);
> > > > +   sg_set_buf(&sg[0], &flags, sizeof(flags));
> > > > +   sg_set_buf(&sg[1], &start_pfn, sizeof(start_pfn));
> > > > +   sg_set_buf(&sg[2], &page_shift, sizeof(page_shift));
> > > > +   sg_set_buf(&sg[3], &bmap_len, sizeof(bmap_len));
> > > > +   sg_set_buf(&sg[4], vb->page_bitmap +
> > > > +(start_pfn / BITS_PER_LONG), bmap_len);
> > >
> > > Looks wrong. start_pfn should start at offset 0 I think ...
> >
> > I don't know what is wrong here, could you tell me why?
> >
> > Thanks!
> >
> > Liang
> 
> start_pfn should mean "bit 0 in bitmap refers to pfn X".
> So it does not make sense to also add it as offset within bitmap.
> 
> --
> MST

Re: [Qemu-devel] [PATCH v2 06/14] stream: Use BlockBackend for I/O

2016-05-24 Thread Eric Blake

On 05/24/2016 07:47 AM, Kevin Wolf wrote:
> This changes the streaming block job to use the job's BlockBackend for
> performing the COR reads. job->bs isn't used by the streaming code any
> more afterwards.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/io.c|  9 -
>  block/stream.c| 15 +--
>  include/block/block.h |  2 --
>  trace-events  |  1 -
>  4 files changed, 9 insertions(+), 18 deletions(-)
> 

> +++ b/trace-events
> @@ -72,7 +72,6 @@ bdrv_aio_readv(void *bs, int64_t sector_num, int 
> nb_sectors, void *opaque) "bs %
>  bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) 
> "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
>  bdrv_aio_write_zeroes(void *bs, int64_t sector_num, int nb_sectors, int 
> flags, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d flags %#x 
> opaque %p"
>  bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num 
> %"PRId64" nb_sectors %d"
> -bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p 
> sector_num %"PRId64" nb_sectors %d"

Yay - one less sector-based trace point - we're slowly getting to an
all-bytes destination.

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH v2 05/14] block: Make blk_co_preadv/pwritev() public

2016-05-24 Thread Eric Blake

On 05/24/2016 07:47 AM, Kevin Wolf wrote:
> Also add trace points now that the function can be directly called.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/block-backend.c  | 21 ++---
>  include/sysemu/block-backend.h |  6 ++
>  trace-events   |  4 
>  3 files changed, 24 insertions(+), 7 deletions(-)
> 

> @@ -741,11 +742,15 @@ static int blk_check_request(BlockBackend *blk, int64_t 
> sector_num,
>nb_sectors * BDRV_SECTOR_SIZE);
>  }
>  
> -static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
> -  unsigned int bytes, QEMUIOVector *qiov,
> -  BdrvRequestFlags flags)
> +int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
> +   unsigned int bytes, QEMUIOVector *qiov,

Isn't bytes redundant with qiov->size? Or can qiov be NULL?  Should we
assert(!qiov || qiov->size == bytes)?

> +int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
> +unsigned int bytes, QEMUIOVector *qiov,
> +BdrvRequestFlags flags)
>  {

Ditto. When doing write_zeroes, do we want qiov == NULL, must we always
have qiov but just leave qiov->iov[0].base as NULL?  Probably worth
documenting as part of making it public.


> +++ b/include/sysemu/block-backend.h
> @@ -113,6 +113,12 @@ void *blk_get_attached_dev(BlockBackend *blk);
>  void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, void 
> *opaque);
>  int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
>int count);
> +int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
> +   unsigned int bytes, QEMUIOVector *qiov,
> +   BdrvRequestFlags flags);
> +int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
> +   unsigned int bytes, QEMUIOVector *qiov,
> +   BdrvRequestFlags flags);

Note that my earlier addition of blk_aio_pwritev intentionally omitted
the bytes argument, relying solely on qiov->size.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH v2 04/14] block: Convert block job core to BlockBackend

2016-05-24 Thread Eric Blake

On 05/24/2016 07:47 AM, Kevin Wolf wrote:
> This adds a new BlockBackend field to the BlockJob struct, which
> coexists with the BlockDriverState while converting the individual jobs.
> 
> When creating a block job, a new BlockBackend is created on top of the
> given BlockDriverState, and it is destroyed when the BlockJob ends. The
> reference to the BDS is now held by the BlockBackend instead of calling
> bdrv_ref/unref manually.
> 
> We have to be careful when we use bdrv_replace_in_backing_chain() in
> block jobs because this changes the BDS that job->blk points to. At the
> moment block jobs are too tightly coupled with their BDS, so that moving
> a job to another BDS isn't easily possible; therefore, we need to just
> manually undo this change afterwards.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/mirror.c   |  3 +++
>  blockjob.c   | 37 -
>  include/block/blockjob.h |  3 ++-
>  3 files changed, 25 insertions(+), 18 deletions(-)
> 

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH v5 13/18] qht: support parallel writes

2016-05-24 Thread Emilio G. Cota

On Wed, May 25, 2016 at 01:17:21 +0300, Sergey Fedorov wrote:
> >> With this implementation we could:
> >>  (1) get rid of qht_map::stale
> >>  (2) don't waste cycles waiting for resize to complete
> > I'll include this in v6.
> 
> How is it by perf?

Not much of a difference, since resize is a slow path. Calling
qht-bench with lots of update and resize threads performs very
poorly either way =D

I like the change though because using the mutex here simplifies
the resize code; there's no guilt anymore attached to holding the lock
for some time (e.g. when allocating a new, possible quite large,
map), whereas with the spinlock we would allocate it before
acquiring the lock, without knowing whether the allocation would
be needed in the end.

Emilio

Re: [Qemu-devel] [PATCH] docs: Fix a couple of typos in throttle.txt

2016-05-24 Thread Eric Blake

On 05/24/2016 05:59 AM, Alberto Garcia wrote:
> Signed-off-by: Alberto Garcia 
> ---
>  docs/throttle.txt | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)

Reviewed-by: Eric Blake 

> 
> diff --git a/docs/throttle.txt b/docs/throttle.txt
> index 06ed9b3..b4431f6 100644
> --- a/docs/throttle.txt
> +++ b/docs/throttle.txt
> @@ -39,7 +39,7 @@ the parameters for both cases:
>  | throttling.bps-write  | bps_wr|
>  |---+---|
>  
> -It is possible to set limits for both IOPS and bps and the same time,
> +It is possible to set limits for both IOPS and bps at the same time,
>  and for each case we can decide whether to have separate read and
>  write limits or not, but note that if iops-total is set then neither
>  iops-read nor iops-write can be set. The same applies to bps-total and
> @@ -235,7 +235,7 @@ consider the following values:
>- Water leaks from the bucket at a rate of 100 IOPS.
>- Water can be added to the bucket at a rate of 2000 IOPS.
>- The size of the bucket is 2000 x 60 = 12
> -  - If 'iops-total-max-length' is unset then the bucket size is 100.
> +  - If 'iops-total-max' is unset then the bucket size is 100.
>  
>  The bucket is initially empty, therefore water can be added until it's
>  full at a rate of 2000 IOPS (the burst rate). Once the bucket is full
> 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH V3] block/iscsi: allow caching of the allocation map

2016-05-24 Thread Eric Blake

On 05/24/2016 02:40 AM, Peter Lieven wrote:
> until now the allocation map was used only as a hint if a cluster
> is allocated or not. If a block was not allocated (or Qemu had
> no info about the allocation status) a get_block_status call was
> issued to check the allocation status and possibly avoid
> a subsequent read of unallocated sectors. If a block known to be
> allocated the get_block_status call was omitted. In the other case
> a get_block_status call was issued before every read to avoid
> the necessity for a consistent allocation map. To avoid the
> potential overhead of calling get_block_status for each and
> every read request this took only place for the bigger requests.
> 
> This patch enhances this mechanism to cache the allocation
> status and avoid calling get_block_status for blocks where
> the allocation status has been queried before. This allows
> for bypassing the read request even for smaller requests and
> additionally omits calling get_block_status for known to be
> unallocated blocks.
> 
> Signed-off-by: Peter Lieven 
> ---

> +static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
>  {
> -if (iscsilun->allocationmap == NULL) {
> -return;
> +iscsi_allocmap_free(iscsilun);
> +
> +iscsilun->allocmap_size =
> +DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, iscsilun),
> + iscsilun->cluster_sectors);
> +

Computes: ceiling( (num_blocks * block_size / 512) / (cluster_size /
512) ); aka number of clusters.  But we don't independently track the
cluster size, so I don't see any simpler way of writing it, even if we
could be more efficient by not having to first scale through qemu's
sector size.

> +iscsilun->allocmap = bitmap_try_new(iscsilun->allocmap_size);
> +if (!iscsilun->allocmap) {
> +return -ENOMEM;
> +}
> +
> +if (open_flags & BDRV_O_NOCACHE) {
> +/* in case that cache.direct = on all allocmap entries are
> + * treated as invalid to force a relookup of the block
> + * status on every read request */
> +return 0;

Can we cache that we are opened BDRV_O_NOCACHE, so that we don't even
have to bother allocating allocmap when we know we are never changing
its bits?  In other words, can you swap this to be before the
bitmap_try_new()?

> +}
> +
> +iscsilun->allocmap_valid = bitmap_try_new(iscsilun->allocmap_size);
> +if (!iscsilun->allocmap_valid) {
> +/* if we are under memory pressure free the allocmap as well */
> +iscsi_allocmap_free(iscsilun);
> +return -ENOMEM;
>  }
> -bitmap_set(iscsilun->allocationmap,
> -   sector_num / iscsilun->cluster_sectors,
> -   DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));
> +
> +return 0;
>  }
>  
> -static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num,
> -  int nb_sectors)
> +static void
> +iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num,
> +  int nb_sectors, bool allocated, bool valid)
>  {
>  int64_t cluster_num, nb_clusters;
> -if (iscsilun->allocationmap == NULL) {
> +
> +if (iscsilun->allocmap == NULL) {
>  return;
>  }

Here, you are short-circuiting when there is no allocmap, but shouldn't
you also short-circuit if you are BDRV_O_NOCACHE?

Should you assert(!(allocated && !valid)) [or by deMorgan's,
assert(!allocated || valid)], to make sure we are only tracking 3 states
rather than 4?

>  cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors);
>  nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors
>- cluster_num;
> -if (nb_clusters > 0) {
> -bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters);
> +if (allocated) {
> +bitmap_set(iscsilun->allocmap,
> +   sector_num / iscsilun->cluster_sectors,
> +   DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors));

This says that if we have a sub-cluster request, we can round out to
cluster alignment (if our covered part of the cluster is allocated,
presumably the rest is allocated as well)...

> +} else if (nb_clusters > 0) {
> +bitmap_clear(iscsilun->allocmap, cluster_num, nb_clusters);

...while this says if we are marking something clear, we have to round
in (while we can trim the aligned middle, we should not mark any
unaligned head or tail as trimmed, in case they remain allocated due to
the unvisited sectors).  Still, it may be worth comments for why your
rounding between the two calls is different.

> +}
> +
> +if (iscsilun->allocmap_valid == NULL) {
> +return;
> +}

When do we ever have allocmap set but allocmap_valid clear?  Isn't it
easier to assume that both maps are present (we are tracking status) or
absent (we are BDRV_O_NOCACHE)?

> +if (valid) {
> +if (nb_clusters > 0) {
> +bitmap_set(iscsilun->allocmap

Re: [Qemu-devel] [PATCH v2 0/7] dma-helpers, scsi-block: use SG_IO for all I/O on scsi-block

2016-05-24 Thread Mark Cave-Ayland

On 23/05/16 20:36, Mark Cave-Ayland wrote:

> On 23/05/16 13:54, Paolo Bonzini wrote:
> 
>> scsi-block uses the block layer for reads and writes in order to avoid
>> allocating bounce buffers as big as the transferred data.  We know how
>> to split a large transfer to multiple reads and writes, and thus we can
>> use scsi-disk.c's existing code to do I/O in multiple chunks (for non-s/g
>> SCSI hosts) or through the DMA helpers (for s/g SCSI hosts).
>>
>> Unfortunately, this has the side effect of eating the SCSI status except
>> in the very few cases where we can convert an errno code back to a SCSI
>> status.  It puts a big wrench in persistent reservations support in the
>> guest, for example.
>>
>> Luckily, splitting a large transfer into multiple SBC commands is just as
>> easy, and this is what the last patch does.  It takes the original CDB,
>> patches in a modified starting sector and sector count, and executes the
>> SCSI command through blk_aio_ioctl.  It is also easy to pass a QEMUIOVector
>> to SG_IO, so that s/g SCSI hosts keep the performance.
>>
>> This rebases the patches on top of Eric's changes for byte-based
>> BlockBackend access and fixes a few bugs I knew about in the RFC.
>>
>> Patches 1, 5 and 6 are new.
>>
>> Paolo
>>
>> Paolo Bonzini (7):
>>   dma-helpers: change interface to byte-based
>>   dma-helpers: change BlockBackend to opaque value in DMAIOFunc
>>   scsi-disk: introduce a common base class
>>   scsi-disk: introduce dma_readv and dma_writev
>>   scsi-disk: add need_fua_emulation to SCSIDiskClass
>>   scsi-disk: introduce scsi_disk_req_check_error
>>   scsi-block: always use SG_IO
>>
>>  dma-helpers.c|  54 +--
>>  hw/block/nvme.c  |   6 +-
>>  hw/ide/ahci.c|   6 +-
>>  hw/ide/core.c|  20 ++-
>>  hw/ide/internal.h|   6 +-
>>  hw/ide/macio.c   |   2 +-
>>  hw/scsi/scsi-disk.c  | 409 
>> +--
>>  include/sysemu/dma.h |  20 +--
>>  trace-events |   2 +-
>>  9 files changed, 371 insertions(+), 154 deletions(-)
> 
> Hi Paolo,
> 
> I thought I'd give this patchset a spin with a view to seeing whether I
> could switch the macio device back to the now byte-based dma-helpers,
> but came up with a couple of compile errors. Attached is the minor diff
> I applied in order to get a successful compile (apologies for not being
> inline, however I couldn't get my mail client to stop wrapping incorrectly).

After some head-scratching, I've finally managed to install and boot
Darwin PPC using the new byte-based DMA helpers facilitated by this
patch in the macio IDE driver. Just switching over to the new helpers
provided by this patch seemingly allowed the installation to proceed,
but the resulting image was corrupt and refused to boot.

I eventually traced the corruption down to this section of code in
dma_blk_cb() which was incorrectly truncating the unaligned iovecs:

if (dbs->iov.size & ~BDRV_SECTOR_MASK) {
qemu_iovec_discard_back(&dbs->iov, dbs->iov.size &
~BDRV_SECTOR_MASK);
}

This was introduced in
http://lists.gnu.org/archive/html/qemu-devel/2014-07/msg02236.html to
handle non-sector aligned SG lists, although given that this is a
restriction of one particular implementation (PRDT) I'm not sure whether
a plain revert is the correct thing to do or whether an alternative
solution needs to be found.


ATB,

Mark.

Re: [Qemu-devel] [PATCH v2 02/14] block: Cancel jobs first in bdrv_close_all()

2016-05-24 Thread Eric Blake

On 05/24/2016 07:47 AM, Kevin Wolf wrote:
> So far, bdrv_close_all() first removed all root BlockDriverStates of
> BlockBackends and monitor owned BDSes, and then assumed that the
> remaining BDSes must be related to jobs and cancelled these jobs.
> 
> This order doesn't work that well any more when block jobs use
> BlockBackends internally because then they will lose their BDS before
> being cancelled.
> 
> This patch changes bdrv_close_all() to first cancel all jobs and then
> remove all root BDSes from the remaining BBs.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block.c  | 23 ++-
>  blockjob.c   | 13 +
>  include/block/blockjob.h |  7 +++
>  3 files changed, 22 insertions(+), 21 deletions(-)
> 

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

[Qemu-devel] [PATCH 01/13] block: Rename blk_write_zeroes()

2016-05-24 Thread Eric Blake

Commit 983a1600 changed the semantics of blk_write_zeroes() to
be byte-based rather than sector-based, but did not change the
name, which is an open invitation for other code to misuse the
function.  Renaming to pwrite_zeroes() makes it more in line
with other byte-based interfaces, and will help make it easier
to track which remaining write_zeroes interfaces still need
conversion.

Reported-by: Kevin Wolf 
Signed-off-by: Eric Blake 
---
 include/sysemu/block-backend.h | 14 +++---
 block/block-backend.c  | 14 +++---
 block/parallels.c  |  4 ++--
 hw/scsi/scsi-disk.c|  2 +-
 qemu-img.c |  4 ++--
 qemu-io-cmds.c | 22 +++---
 6 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 9d6615c..9571726 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -113,11 +113,11 @@ void *blk_get_attached_dev(BlockBackend *blk);
 void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, void *opaque);
 int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
   int count);
-int blk_write_zeroes(BlockBackend *blk, int64_t offset,
- int count, BdrvRequestFlags flags);
-BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t offset,
- int count, BdrvRequestFlags flags,
- BlockCompletionFunc *cb, void *opaque);
+int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+  int count, BdrvRequestFlags flags);
+BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+  int count, BdrvRequestFlags flags,
+  BlockCompletionFunc *cb, void *opaque);
 int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count);
 int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
BdrvRequestFlags flags);
@@ -195,8 +195,8 @@ int blk_get_open_flags_from_root_state(BlockBackend *blk);

 void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
   BlockCompletionFunc *cb, void *opaque);
-int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t offset,
- int count, BdrvRequestFlags flags);
+int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+  int count, BdrvRequestFlags flags);
 int blk_write_compressed(BlockBackend *blk, int64_t sector_num,
  const uint8_t *buf, int nb_sectors);
 int blk_truncate(BlockBackend *blk, int64_t offset);
diff --git a/block/block-backend.c b/block/block-backend.c
index 4e8298b..b33b8e2 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -857,8 +857,8 @@ int blk_pread_unthrottled(BlockBackend *blk, int64_t 
offset, uint8_t *buf,
 return ret;
 }

-int blk_write_zeroes(BlockBackend *blk, int64_t offset,
- int count, BdrvRequestFlags flags)
+int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+  int count, BdrvRequestFlags flags)
 {
 return blk_prw(blk, offset, NULL, count, blk_write_entry,
flags | BDRV_REQ_ZERO_WRITE);
@@ -973,9 +973,9 @@ static void blk_aio_write_entry(void *opaque)
 blk_aio_complete(acb);
 }

-BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t offset,
- int count, BdrvRequestFlags flags,
- BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+  int count, BdrvRequestFlags flags,
+  BlockCompletionFunc *cb, void *opaque)
 {
 return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
 flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
@@ -1464,8 +1464,8 @@ void *blk_aio_get(const AIOCBInfo *aiocb_info, 
BlockBackend *blk,
 return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
 }

-int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t offset,
- int count, BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
+  int count, BdrvRequestFlags flags)
 {
 return blk_co_pwritev(blk, offset, count, NULL,
   flags | BDRV_REQ_ZERO_WRITE);
diff --git a/block/parallels.c b/block/parallels.c
index 88cface..99fc0f7 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -517,8 +517,8 @@ static int parallels_create(const char *filename, QemuOpts 
*opts, Error **errp)
 if (ret < 0) {
 goto exit;
 }
-ret = blk_write_zeroes(file, BDRV_SECTOR_SIZE,
-   (bat_sectors - 1) << BDRV_SECTOR_BITS, 0);

[Qemu-devel] [PATCH 11/13] raw_bsd: Convert to bdrv_co_pwrite_zeroes()

2016-05-24 Thread Eric Blake

Another step on our continuing quest to switch to byte-based
interfaces.

Signed-off-by: Eric Blake 
---
 block/raw_bsd.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/block/raw_bsd.c b/block/raw_bsd.c
index d9adf90..b1d5237 100644
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -127,12 +127,11 @@ static int64_t coroutine_fn 
raw_co_get_block_status(BlockDriverState *bs,
(sector_num << BDRV_SECTOR_BITS);
 }

-static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors,
-BdrvRequestFlags flags)
+static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs,
+ int64_t offset, int count,
+ BdrvRequestFlags flags)
 {
-return bdrv_co_pwrite_zeroes(bs->file->bs, sector_num << BDRV_SECTOR_BITS,
- nb_sectors << BDRV_SECTOR_BITS, flags);
+return bdrv_co_pwrite_zeroes(bs->file->bs, offset, count, flags);
 }

 static int coroutine_fn raw_co_discard(BlockDriverState *bs,
@@ -253,7 +252,7 @@ BlockDriver bdrv_raw = {
 .bdrv_create  = &raw_create,
 .bdrv_co_readv= &raw_co_readv,
 .bdrv_co_writev_flags = &raw_co_writev_flags,
-.bdrv_co_write_zeroes = &raw_co_write_zeroes,
+.bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
 .bdrv_co_discard  = &raw_co_discard,
 .bdrv_co_get_block_status = &raw_co_get_block_status,
 .bdrv_truncate= &raw_truncate,
-- 
2.5.5

[Qemu-devel] [PATCH 06/13] qcow2: Convert to bdrv_co_pwrite_zeroes()

2016-05-24 Thread Eric Blake

Another step on our continuing quest to switch to byte-based
interfaces.

There are still opportunities to optimize the qcow2 handling
of zero clusters.  For example, if the backing file only has
non-zero data in the portion about to be overwritten, then
we could widen the request and make the entire cluster zero,
rather than falling back to -ENOTSUP.  But for this patch,
intentionally leave the semantics unchanged, even if not
optimal.

Signed-off-by: Eric Blake 
---
 block/qcow2.c | 46 +-
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 978694e..3522fc0 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2428,43 +2428,47 @@ static bool is_zero_cluster_top_locked(BlockDriverState 
*bs, int64_t start)
 return ret == QCOW2_CLUSTER_UNALLOCATED || ret == QCOW2_CLUSTER_ZERO;
 }

-static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
+int64_t offset, int count, BdrvRequestFlags flags)
 {
 int ret;
 BDRVQcow2State *s = bs->opaque;

-int head = sector_num % s->cluster_sectors;
-int tail = (sector_num + nb_sectors) % s->cluster_sectors;
+int head = offset % s->cluster_size;
+int tail = (offset + count) % s->cluster_size;

+/* Widen the write to a full cluster, if the cluster already reads
+ * as zero. */
 if (head != 0 || tail != 0) {
-int64_t cl_end = -1;
+int64_t tail_sector = 0;

-sector_num -= head;
-nb_sectors += head;
-
-if (tail != 0) {
-nb_sectors += s->cluster_sectors - tail;
+offset -= head;
+count += head;
+if (tail) {
+count += s->cluster_size - tail;
 }

-if (!is_zero_cluster(bs, sector_num)) {
+if (!is_zero_cluster(bs, offset >> BDRV_SECTOR_BITS)) {
 return -ENOTSUP;
 }

-if (nb_sectors > s->cluster_sectors) {
-/* Technically the request can cover 2 clusters, f.e. 4k write
-   at s->cluster_sectors - 2k offset. One of these cluster can
-   be zeroed, one unallocated */
-cl_end = sector_num + nb_sectors - s->cluster_sectors;
-if (!is_zero_cluster(bs, cl_end)) {
+if (count > s->cluster_size) {
+/* Technically the request can cover 2 clusters, f.e. 4k
+ * write at s->cluster_sectors - 2k offset. One of these
+ * cluster can be zeroed, one unallocated. Anything larger
+ * and the front end already split it to alignment
+ * boundaries. */
+assert(count == 2 * s->cluster_size);
+tail_sector = (offset >> BDRV_SECTOR_BITS) + s->cluster_sectors;
+if (!is_zero_cluster(bs, tail_sector)) {
 return -ENOTSUP;
 }
 }

 qemu_co_mutex_lock(&s->lock);
 /* We can have new write after previous check */
-if (!is_zero_cluster_top_locked(bs, sector_num) ||
-(cl_end > 0 && !is_zero_cluster_top_locked(bs, cl_end))) {
+if (!is_zero_cluster_top_locked(bs, offset >> BDRV_SECTOR_BITS) ||
+(tail_sector && !is_zero_cluster_top_locked(bs, tail_sector))) {
 qemu_co_mutex_unlock(&s->lock);
 return -ENOTSUP;
 }
@@ -2473,7 +2477,7 @@ static coroutine_fn int 
qcow2_co_write_zeroes(BlockDriverState *bs,
 }

 /* Whatever is left can use real zero clusters */
-ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, nb_sectors);
+ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS);
 qemu_co_mutex_unlock(&s->lock);

 return ret;
@@ -3380,7 +3384,7 @@ BlockDriver bdrv_qcow2 = {
 .bdrv_co_writev = qcow2_co_writev,
 .bdrv_co_flush_to_os= qcow2_co_flush_to_os,

-.bdrv_co_write_zeroes   = qcow2_co_write_zeroes,
+.bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
 .bdrv_co_discard= qcow2_co_discard,
 .bdrv_truncate  = qcow2_truncate,
 .bdrv_write_compressed  = qcow2_write_compressed,
-- 
2.5.5

[Qemu-devel] [PATCH 02/13] block: Track write zero limits in bytes

2016-05-24 Thread Eric Blake

Another step towards removing sector-based interfaces: convert
the maximum write and minimum alignment values from sectorss to
bytes.  Alignment is changed to 'int', since it makes no sense
to have an alignment larger than the maximum write.  Add an
assert that no one was trying to use sectors to get a write
zeroes larger than 2G.  Rename the variables to let the compiler
check that all users are converted to the new semantics.

Signed-off-by: Eric Blake 
---
 include/block/block_int.h |  8 
 block/io.c| 27 +++
 block/iscsi.c |  6 ++
 block/qcow2.c |  2 +-
 block/qed.c   |  2 +-
 block/vmdk.c  |  6 +++---
 6 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index b6f4755..4282ffd 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -328,11 +328,11 @@ typedef struct BlockLimits {
 /* optimal alignment for discard requests in sectors */
 int64_t discard_alignment;

-/* maximum number of sectors that can zeroized at once */
-int max_write_zeroes;
+/* maximum number of bytes that can zeroized at once */
+int max_pwrite_zeroes;

-/* optimal alignment for write zeroes requests in sectors */
-int64_t write_zeroes_alignment;
+/* optimal alignment for write zeroes requests in bytes */
+int pwrite_zeroes_alignment;

 /* optimal transfer length in sectors */
 int opt_transfer_length;
diff --git a/block/io.c b/block/io.c
index 2a2ff84..41b4e9d 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1120,32 +1120,35 @@ static int coroutine_fn 
bdrv_co_do_write_zeroes(BlockDriverState *bs,
 int ret = 0;
 bool need_flush = false;

-int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
-BDRV_REQUEST_MAX_SECTORS);
+int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
+int max_write_zeroes_sectors = max_write_zeroes >> BDRV_SECTOR_BITS;
+int write_zeroes_sector_align =
+bs->bl.pwrite_zeroes_alignment >> BDRV_SECTOR_BITS;

+assert(nb_sectors <= BDRV_REQUEST_MAX_SECTORS);
 while (nb_sectors > 0 && !ret) {
 int num = nb_sectors;

 /* Align request.  Block drivers can expect the "bulk" of the request
  * to be aligned.
  */
-if (bs->bl.write_zeroes_alignment
-&& num > bs->bl.write_zeroes_alignment) {
-if (sector_num % bs->bl.write_zeroes_alignment != 0) {
+if (write_zeroes_sector_align
+&& num > write_zeroes_sector_align) {
+if (sector_num % write_zeroes_sector_align != 0) {
 /* Make a small request up to the first aligned sector.  */
-num = bs->bl.write_zeroes_alignment;
-num -= sector_num % bs->bl.write_zeroes_alignment;
-} else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 
0) {
+num = write_zeroes_sector_align;
+num -= sector_num % write_zeroes_sector_align;
+} else if ((sector_num + num) % write_zeroes_sector_align != 0) {
 /* Shorten the request to the last aligned sector.  num cannot
- * underflow because num > bs->bl.write_zeroes_alignment.
+ * underflow because num > write_zeroes_sector_align.
  */
-num -= (sector_num + num) % bs->bl.write_zeroes_alignment;
+num -= (sector_num + num) % write_zeroes_sector_align;
 }
 }

 /* limit request size */
-if (num > max_write_zeroes) {
-num = max_write_zeroes;
+if (num > max_write_zeroes_sectors) {
+num = max_write_zeroes_sectors;
 }

 ret = -ENOTSUP;
diff --git a/block/iscsi.c b/block/iscsi.c
index 10f3906..0acc3dc 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1706,12 +1706,10 @@ static void iscsi_refresh_limits(BlockDriverState *bs, 
Error **errp)
 }

 if (iscsilun->bl.max_ws_len < 0x) {
-bs->bl.max_write_zeroes =
-sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun);
+bs->bl.max_pwrite_zeroes = iscsilun->bl.max_ws_len;
 }
 if (iscsilun->lbp.lbpws) {
-bs->bl.write_zeroes_alignment =
-sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun);
+bs->bl.pwrite_zeroes_alignment = iscsilun->bl.opt_unmap_gran;
 }
 bs->bl.opt_transfer_length =
 sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun);
diff --git a/block/qcow2.c b/block/qcow2.c
index c9306a7..745b66f 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1193,7 +1193,7 @@ static void qcow2_refresh_limits(BlockDriverState *bs, 
Error **errp)
 {
 BDRVQcow2State *s = bs->opaque;

-bs->bl.write_zeroes_alignment = s->cluster_sectors;
+bs->bl.pwrite_zeroes_alignment = s->cluster_sectors << BDRV_SEC

[Qemu-devel] [PATCH 08/13] gluster: Convert to bdrv_co_pwrite_zeroes()

2016-05-24 Thread Eric Blake

Another step on our continuing quest to switch to byte-based
interfaces.

Signed-off-by: Eric Blake 
---
 block/gluster.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/block/gluster.c b/block/gluster.c
index a8aaacf..15aff4b 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -454,14 +454,13 @@ static void qemu_gluster_reopen_abort(BDRVReopenState 
*state)
 }

 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
+int64_t offset, int count, BdrvRequestFlags flags)
 {
 int ret;
 GlusterAIOCB acb;
 BDRVGlusterState *s = bs->opaque;
-off_t size = nb_sectors * BDRV_SECTOR_SIZE;
-off_t offset = sector_num * BDRV_SECTOR_SIZE;
+off_t size = count;

 acb.size = size;
 acb.ret = 0;
@@ -769,7 +768,7 @@ static BlockDriver bdrv_gluster = {
 .bdrv_co_discard  = qemu_gluster_co_discard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-.bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes,
+.bdrv_co_pwrite_zeroes= qemu_gluster_co_pwrite_zeroes,
 #endif
 .create_opts  = &qemu_gluster_create_opts,
 };
@@ -796,7 +795,7 @@ static BlockDriver bdrv_gluster_tcp = {
 .bdrv_co_discard  = qemu_gluster_co_discard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-.bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes,
+.bdrv_co_pwrite_zeroes= qemu_gluster_co_pwrite_zeroes,
 #endif
 .create_opts  = &qemu_gluster_create_opts,
 };
@@ -823,7 +822,7 @@ static BlockDriver bdrv_gluster_unix = {
 .bdrv_co_discard  = qemu_gluster_co_discard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-.bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes,
+.bdrv_co_pwrite_zeroes= qemu_gluster_co_pwrite_zeroes,
 #endif
 .create_opts  = &qemu_gluster_create_opts,
 };
@@ -850,7 +849,7 @@ static BlockDriver bdrv_gluster_rdma = {
 .bdrv_co_discard  = qemu_gluster_co_discard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
-.bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes,
+.bdrv_co_pwrite_zeroes= qemu_gluster_co_pwrite_zeroes,
 #endif
 .create_opts  = &qemu_gluster_create_opts,
 };
-- 
2.5.5

[Qemu-devel] [PATCH 12/13] vmdk: Convert to bdrv_co_pwrite_zeroes()

2016-05-24 Thread Eric Blake

Another step on our continuing quest to switch to byte-based
interfaces.

Signed-off-by: Eric Blake 
---
 block/vmdk.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index 8494d63..284d7a0 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -1704,15 +1704,14 @@ static int vmdk_write_compressed(BlockDriverState *bs,
 }
 }

-static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
- int64_t sector_num,
- int nb_sectors,
- BdrvRequestFlags flags)
+static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs,
+  int64_t offset,
+  int count,
+  BdrvRequestFlags flags)
 {
 int ret;
 BDRVVmdkState *s = bs->opaque;
-uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
-uint64_t bytes = nb_sectors * BDRV_SECTOR_SIZE;
+uint64_t bytes = count;

 qemu_co_mutex_lock(&s->lock);
 /* write zeroes could fail if sectors not aligned to cluster, test it with
@@ -2403,7 +2402,7 @@ static BlockDriver bdrv_vmdk = {
 .bdrv_co_preadv   = vmdk_co_preadv,
 .bdrv_co_pwritev  = vmdk_co_pwritev,
 .bdrv_write_compressed= vmdk_write_compressed,
-.bdrv_co_write_zeroes = vmdk_co_write_zeroes,
+.bdrv_co_pwrite_zeroes= vmdk_co_pwrite_zeroes,
 .bdrv_close   = vmdk_close,
 .bdrv_create  = vmdk_create,
 .bdrv_co_flush_to_disk= vmdk_co_flush,
-- 
2.5.5

[Qemu-devel] [PATCH 04/13] block: Switch bdrv_write_zeroes() to byte interface

2016-05-24 Thread Eric Blake

Rename to bdrv_pwrite_zeroes() to let the compiler ensure we
cater to the updated semantics.  Do the same for
bdrv_aio_write_zeroes() and bdrv_co_write_zeroes().  For now,
we still require sector alignment in the callers, via assertions.

Signed-off-by: Eric Blake 
---
 include/block/block.h  | 16 
 block/backup.c |  7 ---
 block/blkreplay.c  |  4 +++-
 block/io.c | 39 +++
 block/mirror.c |  7 ---
 block/parallels.c  |  4 +++-
 block/qcow2-cluster.c  |  3 +--
 block/qcow2.c  |  9 -
 block/raw_bsd.c|  3 ++-
 migration/block.c  |  5 +++--
 tests/qemu-iotests/034 |  2 +-
 tests/qemu-iotests/154 |  2 +-
 trace-events   |  4 ++--
 13 files changed, 59 insertions(+), 46 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index b740af8..da4d9b8 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -33,7 +33,7 @@ typedef struct BlockDriverInfo {
  * True if the driver can optimize writing zeroes by unmapping
  * sectors. This is equivalent to the BLKDISCARDZEROES ioctl in Linux
  * with the difference that in qemu a discard is allowed to silently
- * fail. Therefore we have to use bdrv_write_zeroes with the
+ * fail. Therefore we have to use bdrv_pwrite_zeroes with the
  * BDRV_REQ_MAY_UNMAP flag for an optimized zero write with unmapping.
  * After this call the driver has to guarantee that the contents read
  * back as zero. It is additionally required that the block device is
@@ -227,11 +227,11 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num,
   uint8_t *buf, int nb_sectors);
 int bdrv_write(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors);
-int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
-   int nb_sectors, BdrvRequestFlags flags);
-BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, int64_t sector_num,
-  int nb_sectors, BdrvRequestFlags flags,
-  BlockCompletionFunc *cb, void *opaque);
+int bdrv_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+   int count, BdrvRequestFlags flags);
+BlockAIOCB *bdrv_aio_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+   int count, BdrvRequestFlags flags,
+   BlockCompletionFunc *cb, void *opaque);
 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags);
 int bdrv_pread(BlockDriverState *bs, int64_t offset,
void *buf, int count);
@@ -254,8 +254,8 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, 
int64_t sector_num,
  * function is not suitable for zeroing the entire image in a single request
  * because it may allocate memory for the entire region.
  */
-int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
-int nb_sectors, BdrvRequestFlags flags);
+int coroutine_fn bdrv_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+int count, BdrvRequestFlags flags);
 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
 const char *backing_file);
 int bdrv_get_backing_file_depth(BlockDriverState *bs);
diff --git a/block/backup.c b/block/backup.c
index fec45e8..918ff4f 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -154,9 +154,10 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
 }

 if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
-ret = bdrv_co_write_zeroes(job->target,
-   start * sectors_per_cluster,
-   n, BDRV_REQ_MAY_UNMAP);
+ret = bdrv_co_pwrite_zeroes(job->target,
+start * job->cluster_size,
+n << BDRV_SECTOR_BITS,
+BDRV_REQ_MAY_UNMAP);
 } else {
 ret = bdrv_co_writev(job->target,
  start * sectors_per_cluster, n,
diff --git a/block/blkreplay.c b/block/blkreplay.c
index 42f1813..1a721ad 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -107,7 +107,9 @@ static int coroutine_fn 
blkreplay_co_write_zeroes(BlockDriverState *bs,
 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
 uint64_t reqid = request_id++;
-int ret = bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, 
flags);
+int ret = bdrv_co_pwrite_zeroes(bs->file->bs,
+sector_num << BDRV_SECTOR_BITS,
+nb_sectors << BDRV_SECTOR_BITS, flags);
 block_request_create(reqid, bs, qemu_coroutine_self());
 qemu_coroutine_yield();

diff --git a/block/io.c b/block/io.c
index c1d700b..ea8135f 100644
--- a/block/io.c
+++ b/block/io.c
@@ -603,18 +603,21 @@ int bdrv_write(BlockDriverState *

[Qemu-devel] [PATCH 10/13] raw-posix: Convert to bdrv_co_pwrite_zeroes()

2016-05-24 Thread Eric Blake

Another step on our continuing quest to switch to byte-based
interfaces.

Signed-off-by: Eric Blake 
---
 block/raw-posix.c | 37 +++--
 trace-events  |  2 +-
 2 files changed, 16 insertions(+), 23 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index a4f5a1b..bb691f6 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1252,8 +1252,7 @@ static int aio_worker(void *arg)
 }

 static int paio_submit_co(BlockDriverState *bs, int fd,
-int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-int type)
+  int64_t offset, int count, int type)
 {
 RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
 ThreadPool *pool;
@@ -1262,16 +1261,10 @@ static int paio_submit_co(BlockDriverState *bs, int fd,
 acb->aio_type = type;
 acb->aio_fildes = fd;

-acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE;
-acb->aio_offset = sector_num * BDRV_SECTOR_SIZE;
+acb->aio_nbytes = count;
+acb->aio_offset = offset;

-if (qiov) {
-acb->aio_iov = qiov->iov;
-acb->aio_niov = qiov->niov;
-assert(qiov->size == acb->aio_nbytes);
-}
-
-trace_paio_submit_co(sector_num, nb_sectors, type);
+trace_paio_submit_co(offset, count, type);
 pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
 return thread_pool_submit_co(pool, aio_worker, acb);
 }
@@ -1868,17 +1861,17 @@ static coroutine_fn BlockAIOCB 
*raw_aio_discard(BlockDriverState *bs,
cb, opaque, QEMU_AIO_DISCARD);
 }

-static int coroutine_fn raw_co_write_zeroes(
-BlockDriverState *bs, int64_t sector_num,
-int nb_sectors, BdrvRequestFlags flags)
+static int coroutine_fn raw_co_pwrite_zeroes(
+BlockDriverState *bs, int64_t offset,
+int count, BdrvRequestFlags flags)
 {
 BDRVRawState *s = bs->opaque;

 if (!(flags & BDRV_REQ_MAY_UNMAP)) {
-return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+return paio_submit_co(bs, s->fd, offset, count,
   QEMU_AIO_WRITE_ZEROES);
 } else if (s->discard_zeroes) {
-return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+return paio_submit_co(bs, s->fd, offset, count,
   QEMU_AIO_DISCARD);
 }
 return -ENOTSUP;
@@ -1931,7 +1924,7 @@ BlockDriver bdrv_file = {
 .bdrv_create = raw_create,
 .bdrv_has_zero_init = bdrv_has_zero_init_1,
 .bdrv_co_get_block_status = raw_co_get_block_status,
-.bdrv_co_write_zeroes = raw_co_write_zeroes,
+.bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,

 .bdrv_aio_readv = raw_aio_readv,
 .bdrv_aio_writev = raw_aio_writev,
@@ -2293,8 +2286,8 @@ static coroutine_fn BlockAIOCB 
*hdev_aio_discard(BlockDriverState *bs,
cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
 }

-static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
+int64_t offset, int count, BdrvRequestFlags flags)
 {
 BDRVRawState *s = bs->opaque;
 int rc;
@@ -2304,10 +2297,10 @@ static coroutine_fn int 
hdev_co_write_zeroes(BlockDriverState *bs,
 return rc;
 }
 if (!(flags & BDRV_REQ_MAY_UNMAP)) {
-return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+return paio_submit_co(bs, s->fd, offset, count,
   QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV);
 } else if (s->discard_zeroes) {
-return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+return paio_submit_co(bs, s->fd, offset, count,
   QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
 }
 return -ENOTSUP;
@@ -2379,7 +2372,7 @@ static BlockDriver bdrv_host_device = {
 .bdrv_reopen_abort   = raw_reopen_abort,
 .bdrv_create = hdev_create,
 .create_opts = &raw_create_opts,
-.bdrv_co_write_zeroes = hdev_co_write_zeroes,
+.bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,

 .bdrv_aio_readv= raw_aio_readv,
 .bdrv_aio_writev   = raw_aio_writev,
diff --git a/trace-events b/trace-events
index 97be1d7..b5b03ae 100644
--- a/trace-events
+++ b/trace-events
@@ -130,7 +130,7 @@ thread_pool_cancel(void *req, void *opaque) "req %p opaque 
%p"

 # block/raw-win32.c
 # block/raw-posix.c
-paio_submit_co(int64_t sector_num, int nb_sectors, int type) "sector_num 
%"PRId64" nb_sectors %d type %d"
+paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d 
type %d"
 paio_submit(void *acb, void *opaque, int64_t sector_num, int nb_sectors, int 
type) "acb %p opaque %p sector_num %"PRId64" nb_sectors %d type %d"

 # ioport.c
-- 
2.5.5

[Qemu-devel] [PATCH 09/13] qed: Convert to bdrv_co_pwrite_zeroes()

2016-05-24 Thread Eric Blake

Another step on our continuing quest to switch to byte-based
interfaces.

Kill an abuse of the comma operator while at it (fortunately,
the semantics were still right).

Signed-off-by: Eric Blake 
---
 block/qed.c | 25 +
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 0ab5b40..a0be886 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1419,7 +1419,7 @@ typedef struct {
 bool done;
 } QEDWriteZeroesCB;

-static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
+static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
 {
 QEDWriteZeroesCB *cb = opaque;

@@ -1430,10 +1430,10 @@ static void coroutine_fn qed_co_write_zeroes_cb(void 
*opaque, int ret)
 }
 }

-static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
- int64_t sector_num,
- int nb_sectors,
- BdrvRequestFlags flags)
+static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
+  int64_t offset,
+  int count,
+  BdrvRequestFlags flags)
 {
 BlockAIOCB *blockacb;
 BDRVQEDState *s = bs->opaque;
@@ -1443,10 +1443,10 @@ static int coroutine_fn 
bdrv_qed_co_write_zeroes(BlockDriverState *bs,

 /* Refuse if there are untouched backing file sectors */
 if (bs->backing) {
-if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
+if (qed_offset_into_cluster(s, offset) != 0) {
 return -ENOTSUP;
 }
-if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
+if (qed_offset_into_cluster(s, count) != 0) {
 return -ENOTSUP;
 }
 }
@@ -1454,12 +1454,13 @@ static int coroutine_fn 
bdrv_qed_co_write_zeroes(BlockDriverState *bs,
 /* Zero writes start without an I/O buffer.  If a buffer becomes necessary
  * then it will be allocated during request processing.
  */
-iov.iov_base = NULL,
-iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE,
+iov.iov_base = NULL;
+iov.iov_len = count;

 qemu_iovec_init_external(&qiov, &iov, 1);
-blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
- qed_co_write_zeroes_cb, &cb,
+blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov,
+ count >> BDRV_SECTOR_BITS,
+ qed_co_pwrite_zeroes_cb, &cb,
  QED_AIOCB_WRITE | QED_AIOCB_ZERO);
 if (!blockacb) {
 return -EIO;
@@ -1664,7 +1665,7 @@ static BlockDriver bdrv_qed = {
 .bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
 .bdrv_aio_readv   = bdrv_qed_aio_readv,
 .bdrv_aio_writev  = bdrv_qed_aio_writev,
-.bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes,
+.bdrv_co_pwrite_zeroes= bdrv_qed_co_pwrite_zeroes,
 .bdrv_truncate= bdrv_qed_truncate,
 .bdrv_getlength   = bdrv_qed_getlength,
 .bdrv_get_info= bdrv_qed_get_info,
-- 
2.5.5

[Qemu-devel] [PATCH 07/13] blkreplay: Convert to bdrv_co_pwrite_zeroes()

2016-05-24 Thread Eric Blake

Another step on our continuing quest to switch to byte-based
interfaces.

Signed-off-by: Eric Blake 
---
 block/blkreplay.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/block/blkreplay.c b/block/blkreplay.c
index 1a721ad..525c2d5 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -103,13 +103,11 @@ static int coroutine_fn 
blkreplay_co_writev(BlockDriverState *bs,
 return ret;
 }

-static int coroutine_fn blkreplay_co_write_zeroes(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs,
+int64_t offset, int count, BdrvRequestFlags flags)
 {
 uint64_t reqid = request_id++;
-int ret = bdrv_co_pwrite_zeroes(bs->file->bs,
-sector_num << BDRV_SECTOR_BITS,
-nb_sectors << BDRV_SECTOR_BITS, flags);
+int ret = bdrv_co_pwrite_zeroes(bs->file->bs, offset, count, flags);
 block_request_create(reqid, bs, qemu_coroutine_self());
 qemu_coroutine_yield();

@@ -149,7 +147,7 @@ static BlockDriver bdrv_blkreplay = {
 .bdrv_co_readv  = blkreplay_co_readv,
 .bdrv_co_writev = blkreplay_co_writev,

-.bdrv_co_write_zeroes   = blkreplay_co_write_zeroes,
+.bdrv_co_pwrite_zeroes  = blkreplay_co_pwrite_zeroes,
 .bdrv_co_discard= blkreplay_co_discard,
 .bdrv_co_flush  = blkreplay_co_flush,
 };
-- 
2.5.5

[Qemu-devel] [PATCH 03/13] block: Add .bdrv_co_pwrite_zeroes()

2016-05-24 Thread Eric Blake

Update bdrv_co_do_write_zeroes() to be byte-based, and select
between the new byte-based bdrv_co_pwrite_zeroes() or the old
bdrv_co_write_zeroes().  The next patches will convert drivers,
then remove the old interface.

Signed-off-by: Eric Blake 
---
 include/block/block_int.h |  4 ++-
 block/io.c| 81 +--
 2 files changed, 47 insertions(+), 38 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 4282ffd..fa7e3f9 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -165,6 +165,8 @@ struct BlockDriver {
  */
 int coroutine_fn (*bdrv_co_write_zeroes)(BlockDriverState *bs,
 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
+int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs,
+int64_t offset, int count, BdrvRequestFlags flags);
 int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
 int64_t sector_num, int nb_sectors);
 int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs,
@@ -454,7 +456,7 @@ struct BlockDriverState {
 unsigned int request_alignment;
 /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */
 unsigned int supported_write_flags;
-/* Flags honored during write_zeroes (so far: BDRV_REQ_FUA,
+/* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
  * BDRV_REQ_MAY_UNMAP) */
 unsigned int supported_zero_flags;

diff --git a/block/io.c b/block/io.c
index 41b4e9d..c1d700b 100644
--- a/block/io.c
+++ b/block/io.c
@@ -42,8 +42,8 @@ static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
  void *opaque,
  bool is_write);
 static void coroutine_fn bdrv_co_do_rw(void *opaque);
-static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
+static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
+int64_t offset, int count, BdrvRequestFlags flags);

 static void bdrv_parent_drained_begin(BlockDriverState *bs)
 {
@@ -876,10 +876,12 @@ static int coroutine_fn 
bdrv_co_do_copy_on_readv(BlockDriverState *bs,
 goto err;
 }

-if (drv->bdrv_co_write_zeroes &&
+if ((drv->bdrv_co_write_zeroes || drv->bdrv_co_pwrite_zeroes) &&
 buffer_is_zero(bounce_buffer, iov.iov_len)) {
-ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
-  cluster_nb_sectors, 0);
+ret = bdrv_co_do_pwrite_zeroes(bs,
+   cluster_sector_num * BDRV_SECTOR_SIZE,
+   cluster_nb_sectors * BDRV_SECTOR_SIZE,
+   0);
 } else {
 /* This does not change the data on the disk, it is not necessary
  * to flush even in cache=writethrough mode.
@@ -,8 +1113,8 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState 
*bs,

 #define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768

-static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
+int64_t offset, int count, BdrvRequestFlags flags)
 {
 BlockDriver *drv = bs->drv;
 QEMUIOVector qiov;
@@ -1121,40 +1123,45 @@ static int coroutine_fn 
bdrv_co_do_write_zeroes(BlockDriverState *bs,
 bool need_flush = false;

 int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
-int max_write_zeroes_sectors = max_write_zeroes >> BDRV_SECTOR_BITS;
-int write_zeroes_sector_align =
-bs->bl.pwrite_zeroes_alignment >> BDRV_SECTOR_BITS;
+int alignment = MAX(bs->bl.pwrite_zeroes_alignment, BDRV_SECTOR_SIZE);

-assert(nb_sectors <= BDRV_REQUEST_MAX_SECTORS);
-while (nb_sectors > 0 && !ret) {
-int num = nb_sectors;
+while (count > 0 && !ret) {
+int num = count;

 /* Align request.  Block drivers can expect the "bulk" of the request
  * to be aligned.
  */
-if (write_zeroes_sector_align
-&& num > write_zeroes_sector_align) {
-if (sector_num % write_zeroes_sector_align != 0) {
+if (count > alignment) {
+if (offset % alignment) {
 /* Make a small request up to the first aligned sector.  */
-num = write_zeroes_sector_align;
-num -= sector_num % write_zeroes_sector_align;
-} else if ((sector_num + num) % write_zeroes_sector_align != 0) {
+num = alignment - (offset % alignment);
+} else if ((offset + count) % alignment) {
 /* Shorten the request to the last aligned sector.  num cannot
- * underflow because num > write_zeroes_sector_align.
+ * underflow because num > alignment.
  */

[Qemu-devel] [PATCH 13/13] block: Kill bdrv_co_write_zeroes()

2016-05-24 Thread Eric Blake

Now that all drivers have been converted to a byte interface,
we no longer need a sector interface.

Signed-off-by: Eric Blake 
---
 include/block/block_int.h |  2 --
 block/io.c| 15 ++-
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index fa7e3f9..129263a 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -163,8 +163,6 @@ struct BlockDriver {
  * function pointer may be NULL or return -ENOSUP and .bdrv_co_writev()
  * will be called instead.
  */
-int coroutine_fn (*bdrv_co_write_zeroes)(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
 int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs,
 int64_t offset, int count, BdrvRequestFlags flags);
 int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
diff --git a/block/io.c b/block/io.c
index ea8135f..4577228 100644
--- a/block/io.c
+++ b/block/io.c
@@ -880,7 +880,7 @@ static int coroutine_fn 
bdrv_co_do_copy_on_readv(BlockDriverState *bs,
 goto err;
 }

-if ((drv->bdrv_co_write_zeroes || drv->bdrv_co_pwrite_zeroes) &&
+if (drv->bdrv_co_pwrite_zeroes &&
 buffer_is_zero(bounce_buffer, iov.iov_len)) {
 ret = bdrv_co_do_pwrite_zeroes(bs,
cluster_sector_num * BDRV_SECTOR_SIZE,
@@ -1161,16 +1161,6 @@ static int coroutine_fn 
bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
 need_flush = true;
 }
-} else if (drv->bdrv_co_write_zeroes) {
-assert(offset % BDRV_SECTOR_SIZE == 0);
-assert(count % BDRV_SECTOR_SIZE == 0);
-ret = drv->bdrv_co_write_zeroes(bs, offset >> BDRV_SECTOR_BITS,
-num >> BDRV_SECTOR_BITS,
-flags & bs->supported_zero_flags);
-if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
-!(bs->supported_zero_flags & BDRV_REQ_FUA)) {
-need_flush = true;
-}
 } else {
 assert(!bs->supported_zero_flags);
 }
@@ -1250,8 +1240,7 @@ static int coroutine_fn 
bdrv_aligned_pwritev(BlockDriverState *bs,
 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
-!(flags & BDRV_REQ_ZERO_WRITE) &&
-(drv->bdrv_co_pwrite_zeroes || drv->bdrv_co_write_zeroes) &&
+!(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
 qemu_iovec_is_zero(qiov)) {
 flags |= BDRV_REQ_ZERO_WRITE;
 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
-- 
2.5.5

[Qemu-devel] [PATCH 05/13] iscsi: Convert to bdrv_co_pwrite_zeroes()

2016-05-24 Thread Eric Blake

Another step on our continuing quest to switch to byte-based
interfaces.

As this is the first byte-based iscsi interface, convert
is_request_lun_aligned() into two versions, one for sectors
and one for bytes.

Signed-off-by: Eric Blake 
---
 block/iscsi.c | 53 +++--
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 0acc3dc..3dbfd57 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -401,18 +401,25 @@ static int64_t sector_qemu2lun(int64_t sector, IscsiLun 
*iscsilun)
 return sector * BDRV_SECTOR_SIZE / iscsilun->block_size;
 }

-static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors,
-  IscsiLun *iscsilun)
+static bool is_byte_request_lun_aligned(int64_t offset, int count,
+IscsiLun *iscsilun)
 {
-if ((sector_num * BDRV_SECTOR_SIZE) % iscsilun->block_size ||
-(nb_sectors * BDRV_SECTOR_SIZE) % iscsilun->block_size) {
-error_report("iSCSI misaligned request: "
- "iscsilun->block_size %u, sector_num %" PRIi64
- ", nb_sectors %d",
- iscsilun->block_size, sector_num, nb_sectors);
-return 0;
+if (offset % iscsilun->block_size || count % iscsilun->block_size) {
+error_report("iSCSI misaligned request: "
+ "iscsilun->block_size %u, offset %" PRIi64
+ ", count %d",
+ iscsilun->block_size, offset, count);
+return false;
 }
-return 1;
+return true;
+}
+
+static bool is_sector_request_lun_aligned(int64_t sector_num, int nb_sectors,
+  IscsiLun *iscsilun)
+{
+return is_byte_request_lun_aligned(sector_num << BDRV_SECTOR_BITS,
+   nb_sectors << BDRV_SECTOR_BITS,
+   iscsilun);
 }

 static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun)
@@ -461,7 +468,7 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t 
sector_num, int nb_sectors,
 if (fua) {
 assert(iscsilun->dpofua);
 }
-if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
 return -EINVAL;
 }

@@ -541,7 +548,7 @@ static int64_t coroutine_fn 
iscsi_co_get_block_status(BlockDriverState *bs,

 iscsi_co_init_iscsitask(iscsilun, &iTask);

-if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
 ret = -EINVAL;
 goto out;
 }
@@ -638,7 +645,7 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
 uint64_t lba;
 uint32_t num_sectors;

-if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
 return -EINVAL;
 }

@@ -918,7 +925,7 @@ coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t 
sector_num,
 struct IscsiTask iTask;
 struct unmap_list list;

-if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
 return -EINVAL;
 }

@@ -969,8 +976,8 @@ retry:
 }

 static int
-coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
-   int nb_sectors, BdrvRequestFlags flags)
+coroutine_fn iscsi_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+int count, BdrvRequestFlags flags)
 {
 IscsiLun *iscsilun = bs->opaque;
 struct IscsiTask iTask;
@@ -978,7 +985,7 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, 
int64_t sector_num,
 uint32_t nb_blocks;
 bool use_16_for_ws = iscsilun->use_16_for_rw;

-if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+if (!is_byte_request_lun_aligned(offset, count, iscsilun)) {
 return -EINVAL;
 }

@@ -1000,8 +1007,8 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, 
int64_t sector_num,
 return -ENOTSUP;
 }

-lba = sector_qemu2lun(sector_num, iscsilun);
-nb_blocks = sector_qemu2lun(nb_sectors, iscsilun);
+lba = offset / iscsilun->block_size;
+nb_blocks = count / iscsilun->block_size;

 if (iscsilun->zeroblock == NULL) {
 iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size);
@@ -1057,9 +1064,11 @@ retry:
 }

 if (flags & BDRV_REQ_MAY_UNMAP) {
-iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
+iscsi_allocationmap_clear(iscsilun, offset >> BDRV_SECTOR_BITS,
+  count >> BDRV_SECTOR_BITS);
 } else {
-iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
+iscsi_allocationmap_set(iscsilun, off

[Qemu-devel] [PATCH 00/13] Kill sector-based write_zeroes

2016-05-24 Thread Eric Blake

Kevin pointed out that my recent change to byte-based instead
of sector-based blk_write_zeroes() (commit 983a1600) makes life
harder as long as bdrv_write_zeroes is still sector-based, and
where the compiler doesn't flag any change in parameter types.
Complete the conversion, by renaming things (so the compiler
will help flag any future rebase conflicts), and making all
write_zeroes operations nominally take bytes.

Definitely conflicts with Denis' qcow2_co_write_zeroes improvements
series, and probably with Kevin's conversion of block jobs to
BlockBackend. I can rebase if those land on the block branch first.

Eric Blake (13):
  block: Rename blk_write_zeroes()
  block: Track write zero limits in bytes
  block: Add .bdrv_co_pwrite_zeroes()
  block: Switch bdrv_write_zeroes() to byte interface
  iscsi: Convert to bdrv_co_pwrite_zeroes()
  qcow2: Convert to bdrv_co_pwrite_zeroes()
  blkreplay: Convert to bdrv_co_pwrite_zeroes()
  gluster: Convert to bdrv_co_pwrite_zeroes()
  qed: Convert to bdrv_co_pwrite_zeroes()
  raw-posix: Convert to bdrv_co_pwrite_zeroes()
  raw_bsd: Convert to bdrv_co_pwrite_zeroes()
  vmdk: Convert to bdrv_co_pwrite_zeroes()
  block: Kill bdrv_co_write_zeroes()

 include/block/block.h  |  16 +++
 include/block/block_int.h  |  14 +++---
 include/sysemu/block-backend.h |  14 +++---
 block/backup.c |   7 +--
 block/blkreplay.c  |   8 ++--
 block/block-backend.c  |  14 +++---
 block/gluster.c|  15 +++---
 block/io.c | 106 ++---
 block/iscsi.c  |  59 +--
 block/mirror.c |   7 +--
 block/parallels.c  |   8 ++--
 block/qcow2-cluster.c  |   3 +-
 block/qcow2.c  |  57 +++---
 block/qed.c|  27 ++-
 block/raw-posix.c  |  37 ++
 block/raw_bsd.c|  10 ++--
 block/vmdk.c   |  19 
 hw/scsi/scsi-disk.c|   2 +-
 migration/block.c  |   5 +-
 qemu-img.c |   4 +-
 qemu-io-cmds.c |  22 -
 tests/qemu-iotests/034 |   2 +-
 tests/qemu-iotests/154 |   2 +-
 trace-events   |   6 +--
 24 files changed, 238 insertions(+), 226 deletions(-)

-- 
2.5.5

Re: [Qemu-devel] [PATCH v5 13/18] qht: support parallel writes

2016-05-24 Thread Sergey Fedorov

On 25/05/16 01:07, Emilio G. Cota wrote:
> On Mon, May 23, 2016 at 23:28:27 +0300, Sergey Fedorov wrote:
>> What if we turn qht::lock into a mutex and change the function as follows:
>>
>> static inline
>> struct qht_bucket *qht_bucket_lock__no_stale(struct qht *ht,
>> uint32_t hash,
>>  struct qht_map
>> **pmap)   
>> {
>> struct qht_bucket *b;
>> struct qht_map *map;
>>
>> map = atomic_rcu_read(&ht->map);
>> b = qht_map_to_bucket(map, hash);
>>
>> qemu_spin_lock(&b->lock);
>> /* 'ht->map' access is serialized by 'b->lock' here */
>> if (likely(map == ht->map)) {
>> /* no resize in progress; we're done */
>> *pmap = map;
>> return b;
>> }
>> qemu_spin_unlock(&b->lock);
>>
>> /* resize in progress; retry grabbing 'ht->lock' */
>> qemu_mutex_lock(&ht->lock);
>> b = qht_map_to_bucket(ht->map, hash);
>> *pmap = ht->map;
>> qemu_spin_lock(&b->lock);
>> qemu_mutex_unlock(&ht->lock);
>>
>> return b;
>> }
>>
>>
>> With this implementation we could:
>>  (1) get rid of qht_map::stale
>>  (2) don't waste cycles waiting for resize to complete
> I'll include this in v6.

How is it by perf?

Regards,
Sergey

Re: [Qemu-devel] [PATCH v5 13/18] qht: support parallel writes

2016-05-24 Thread Emilio G. Cota

On Mon, May 23, 2016 at 23:28:27 +0300, Sergey Fedorov wrote:
> What if we turn qht::lock into a mutex and change the function as follows:
> 
> static inline
> struct qht_bucket *qht_bucket_lock__no_stale(struct qht *ht,
> uint32_t hash,
>  struct qht_map
> **pmap)   
> {
> struct qht_bucket *b;
> struct qht_map *map;
> 
> map = atomic_rcu_read(&ht->map);
> b = qht_map_to_bucket(map, hash);
> 
> qemu_spin_lock(&b->lock);
> /* 'ht->map' access is serialized by 'b->lock' here */
> if (likely(map == ht->map)) {
> /* no resize in progress; we're done */
> *pmap = map;
> return b;
> }
> qemu_spin_unlock(&b->lock);
> 
> /* resize in progress; retry grabbing 'ht->lock' */
> qemu_mutex_lock(&ht->lock);
> b = qht_map_to_bucket(ht->map, hash);
> *pmap = ht->map;
> qemu_spin_lock(&b->lock);
> qemu_mutex_unlock(&ht->lock);
> 
> return b;
> }
> 
> 
> With this implementation we could:
>  (1) get rid of qht_map::stale
>  (2) don't waste cycles waiting for resize to complete

I'll include this in v6.

Thanks,

Emilio

Re: [Qemu-devel] [PATCH v6 1/3] loader: Allow ELF loader to auto-detect the ELF arch

2016-05-24 Thread Cleber Rosa



On 05/13/2016 05:37 PM, Alistair Francis wrote:


+if (elf_machine < 1) {
+/* The caller didn't specify and ARCH, we can figure it out */


Spotted a comment typo: s/and/an/


+lseek(fd, 0x12, SEEK_SET);
+if (read(fd, &e_machine, sizeof(e_machine)) != sizeof(e_machine)) {
+goto fail;
+}
+elf_machine = e_machine;
+}
+


--
Cleber Rosa
[ Sr Software Engineer - Virtualization Team - Red Hat ]
[ Avocado Test Framework - avocado-framework.github.io ]

Re: [Qemu-devel] [RFC v2 10/11] tcg: drop global lock during TCG code execution

2016-05-24 Thread Sergey Fedorov

On 05/04/16 18:32, Alex Bennée wrote:
> diff --git a/cpu-exec.c b/cpu-exec.c
> index bd50fef..f558508 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -28,6 +28,7 @@
>  #include "qemu/rcu.h"
>  #include "exec/tb-hash.h"
>  #include "exec/log.h"
> +#include "qemu/main-loop.h"
>  #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
>  #include "hw/i386/apic.h"
>  #endif
> @@ -452,6 +453,8 @@ int cpu_exec(CPUState *cpu)
>  for(;;) {
>  interrupt_request = cpu->interrupt_request;
>  if (unlikely(interrupt_request)) {
> +qemu_mutex_lock_iothread();
> +
>  if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
>  /* Mask out external interrupts for this step. */
>  interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
> @@ -504,6 +507,11 @@ int cpu_exec(CPUState *cpu)
> the program flow was changed */
>  next_tb = 0;
>  }
> +
> +if (qemu_mutex_iothread_locked()) {
> +qemu_mutex_unlock_iothread();
> +}
> +

Why do we need to check for "qemu_mutex_iothread_locked()" here?
iothread lock is always held here, isn't it?

>  }
>  if (unlikely(cpu->exit_request
>   || replay_has_interrupt())) {
(snip)
> diff --git a/cputlb.c b/cputlb.c
> index 43b..1412049 100644
> --- a/cputlb.c
> +++ b/cputlb.c
> @@ -18,6 +18,7 @@
>   */
>  
>  #include "qemu/osdep.h"
> +#include "qemu/main-loop.h"

No need in this include.

>  #include "cpu.h"
>  #include "exec/exec-all.h"
>  #include "exec/memory.h"
> diff --git a/exec.c b/exec.c
> index c46c123..9e83673 100644
> --- a/exec.c
> +++ b/exec.c
(snip)
> @@ -2347,8 +2353,14 @@ static void io_mem_init(void)
>  memory_region_init_io(&io_mem_rom, NULL, &unassigned_mem_ops, NULL, 
> NULL, UINT64_MAX);
>  memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, 
> NULL,
>NULL, UINT64_MAX);
> +
> +/* io_mem_notdirty calls tb_invalidate_phys_page_fast,
> + * which must be called without the iothread mutex.
> + */

notdirty_mem_write() operates on page dirty flags. Is it safe to do so
out of iothread lock?

>  memory_region_init_io(&io_mem_notdirty, NULL, ¬dirty_mem_ops, NULL,
>NULL, UINT64_MAX);
> +memory_region_clear_global_locking(&io_mem_notdirty);
> +
>  memory_region_init_io(&io_mem_watch, NULL, &watch_mem_ops, NULL,
>NULL, UINT64_MAX);
>  }
(snip)
> diff --git a/translate-all.c b/translate-all.c
> index 935d24c..0c377bb 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -1318,6 +1318,7 @@ void tb_invalidate_phys_range(tb_page_addr_t start, 
> tb_page_addr_t end)
>   * this TB.
>   *
>   * Called with mmap_lock held for user-mode emulation
> + * If called from generated code, iothread mutex must not be held.

What does that mean? If iothread mutex is not required by the function
then why mention it here at all?

Kind regards,
Sergey

Re: [Qemu-devel] [PATCH v2 2/3] atomics: emit an smp_read_barrier_depends() barrier only for Sparc and Thread Sanitizer

2016-05-24 Thread Emilio G. Cota

On Tue, May 24, 2016 at 23:09:47 +0300, Sergey Fedorov wrote:
> On 24/05/16 23:06, Emilio G. Cota wrote:
> > For correctness, smp_read_barrier_depends() is only required to
> > emit a barrier on Sparc hosts. However, we are currently emitting
> > a consume fence unconditionally.
> >
> > Fix it by keeping the consume fence if we're compiling with Thread
> > Sanitizer, since this might help prevent false warnings. Otherwise,
> > only emit the barrier for Sparc hosts. Note that we still guarantee
> > that smp_read_barrier_depends() is a compiler barrier.
> 
> s/Sparc/Alpha/?

Obviously :-) Typo also in patch 3 -- apologies.

E.

[Qemu-devel] [PATCH v2 3/3] atomics: do not emit consume barrier for atomic_rcu_read

2016-05-24 Thread Emilio G. Cota

Currently we emit a consume-load in atomic_rcu_read. This is
overkill for non-Sparc hosts, and is only useful to make
things easier for Thread Sanitizer, which as far as I understand
works best without explicit fences.

The appended leaves the consume-load in atomic_rcu_read when
compiling with Thread Sanitizer enabled, and resorts to a
relaxed load + smp_read_barrier_depends otherwise.

On an RMO host architecture, such as aarch64, the performance
improvement of this change is easily measurable. For instance,
qht-bench performs an atomic_rcu_read on every lookup. Performance
before and after applying this patch:

$ tests/qht-bench -d 5 -n 1
Before: 9.78 MT/s
After:  10.96 MT/s

Signed-off-by: Emilio G. Cota 
---
 include/qemu/atomic.h | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
index 4a4f2fb..c5b6c8d 100644
--- a/include/qemu/atomic.h
+++ b/include/qemu/atomic.h
@@ -63,13 +63,20 @@
 __atomic_store(ptr, &_val, __ATOMIC_RELAXED); \
 } while(0)
 
-/* Atomic RCU operations imply weak memory barriers */
+#ifdef __SANITIZE_THREAD__
+#define atomic_rcu_read__nocheck(ptr, valptr)   \
+__atomic_load(ptr, valptr, __ATOMIC_CONSUME);
+#else
+#define atomic_rcu_read__nocheck(ptr, valptr)   \
+__atomic_load(ptr, valptr, __ATOMIC_RELAXED);   \
+smp_read_barrier_depends();
+#endif
 
 #define atomic_rcu_read(ptr)  \
 ({\
 QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
 typeof(*ptr) _val;\
-__atomic_load(ptr, &_val, __ATOMIC_CONSUME);  \
+atomic_rcu_read__nocheck(ptr, &_val); \
 _val; \
 })
 
-- 
2.5.0

Re: [Qemu-devel] [PATCH v2 2/3] atomics: emit an smp_read_barrier_depends() barrier only for Sparc and Thread Sanitizer

2016-05-24 Thread Sergey Fedorov

On 24/05/16 23:06, Emilio G. Cota wrote:
> For correctness, smp_read_barrier_depends() is only required to
> emit a barrier on Sparc hosts. However, we are currently emitting
> a consume fence unconditionally.
>
> Fix it by keeping the consume fence if we're compiling with Thread
> Sanitizer, since this might help prevent false warnings. Otherwise,
> only emit the barrier for Sparc hosts. Note that we still guarantee
> that smp_read_barrier_depends() is a compiler barrier.

s/Sparc/Alpha/?

Regards,
Sergey

>
> Signed-off-by: Emilio G. Cota 
> ---
>  include/qemu/atomic.h | 7 +++
>  1 file changed, 7 insertions(+)
>
> diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
> index 5bc4d6c..4a4f2fb 100644
> --- a/include/qemu/atomic.h
> +++ b/include/qemu/atomic.h
> @@ -36,7 +36,14 @@
>  #define smp_wmb()   ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); 
> barrier(); })
>  #define smp_rmb()   ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); 
> barrier(); })
>  
> +#if defined(__SANITIZE_THREAD__)
>  #define smp_read_barrier_depends() ({ barrier(); 
> __atomic_thread_fence(__ATOMIC_CONSUME); barrier(); })
> +#elsif defined(__alpha__)
> +#define smp_read_barrier_depends()   asm volatile("mb":::"memory")
> +#else
> +#define smp_read_barrier_depends()   barrier()
> +#endif
> +
>  
>  /* Weak atomic operations prevent the compiler moving other
>   * loads/stores past the atomic operation load/store. However there is

[Qemu-devel] [PATCH v2 1/3] docs/atomics: update atomic_read/set comparison with Linux

2016-05-24 Thread Emilio G. Cota

Recently Linux did a mass conversion of its atomic_read/set calls
so that they at least are READ/WRITE_ONCE. See Linux's commit
62e8a325 ("atomic, arch: Audit atomic_{read,set}()"). It seems though
that their documentation hasn't been updated to reflect this.

The appended updates our documentation to reflect the change, which
means there is effectively no difference between our atomic_read/set
and the current Linux implementation.

While at it, fix the statement that a barrier is implied by
atomic_read/set, which is incorrect. Volatile/atomic semantics prevent
transformations pertaining the variable they apply to; this, however,
has no effect on surrounding statements like barriers do. For more
details on this, see:
  https://gcc.gnu.org/onlinedocs/gcc/Volatiles.html

Signed-off-by: Emilio G. Cota 
---
 docs/atomics.txt | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/docs/atomics.txt b/docs/atomics.txt
index ef285e3..7540990 100644
--- a/docs/atomics.txt
+++ b/docs/atomics.txt
@@ -326,9 +326,19 @@ and memory barriers, and the equivalents in QEMU:
   use a boxed atomic_t type; atomic operations in QEMU are polymorphic
   and use normal C types.
 
-- atomic_read and atomic_set in Linux give no guarantee at all;
-  atomic_read and atomic_set in QEMU include a compiler barrier
-  (similar to the ACCESS_ONCE macro in Linux).
+- Originally, atomic_read and atomic_set in Linux gave no guarantee
+  at all. Recently they have been updated to implement volatile
+  semantics via ACCESS_ONCE (or the more recent READ/WRITE_ONCE).
+
+  QEMU's atomic_read/set implement, if the compiler supports it, C11
+  atomic relaxed semantics, and volatile semantics otherwise.
+  Both semantics prevent the compiler from doing certain transformations;
+  the difference is that atomic accesses are guaranteed to be atomic,
+  while volatile accesses aren't. Thus, in the volatile case we just cross
+  our fingers hoping that the compiler will generate atomic accesses,
+  since we assume the variables passed are machine-word sized and
+  properly aligned.
+  No barriers are implied by atomic_read/set in either Linux or QEMU.
 
 - most atomic read-modify-write operations in Linux return void;
   in QEMU, all of them return the old value of the variable.
-- 
2.5.0

[Qemu-devel] [PATCH v2 2/3] atomics: emit an smp_read_barrier_depends() barrier only for Sparc and Thread Sanitizer

2016-05-24 Thread Emilio G. Cota

For correctness, smp_read_barrier_depends() is only required to
emit a barrier on Sparc hosts. However, we are currently emitting
a consume fence unconditionally.

Fix it by keeping the consume fence if we're compiling with Thread
Sanitizer, since this might help prevent false warnings. Otherwise,
only emit the barrier for Sparc hosts. Note that we still guarantee
that smp_read_barrier_depends() is a compiler barrier.

Signed-off-by: Emilio G. Cota 
---
 include/qemu/atomic.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
index 5bc4d6c..4a4f2fb 100644
--- a/include/qemu/atomic.h
+++ b/include/qemu/atomic.h
@@ -36,7 +36,14 @@
 #define smp_wmb()   ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); 
barrier(); })
 #define smp_rmb()   ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); 
barrier(); })
 
+#if defined(__SANITIZE_THREAD__)
 #define smp_read_barrier_depends() ({ barrier(); 
__atomic_thread_fence(__ATOMIC_CONSUME); barrier(); })
+#elsif defined(__alpha__)
+#define smp_read_barrier_depends()   asm volatile("mb":::"memory")
+#else
+#define smp_read_barrier_depends()   barrier()
+#endif
+
 
 /* Weak atomic operations prevent the compiler moving other
  * loads/stores past the atomic operation load/store. However there is
-- 
2.5.0

[Qemu-devel] [PATCH v2 0/3] atomics: fix RCU perf. regression + update documentation

2016-05-24 Thread Emilio G. Cota

v1: https://lists.gnu.org/archive/html/qemu-devel/2016-05/msg03661.html

Patch 1 hasn't changed from v1 (where it was patch 2 though).

Patches 2 and 3 fix a not-so-small-after-all RCU performance regression
we introduced when transitioning to __atomic primitives. I got
an arm64 machine to test today and a workload that issues a lot of
atomic_read_rcu's, such as a 100%-lookup qht-bench test,
can gain ~12% in performance. [ in v1's 0/2 message I mentioned rcutorture;
It turns out it's not as dependent on atomic_read_rcu as I thought,
so it's not a good benchmark to measure this effect. ]

Thanks,

Emilio

Re: [Qemu-devel] [PATCH 1/2] atomics: do not use __atomic primitives for RCU atomics

2016-05-24 Thread Sergey Fedorov

On 24/05/16 22:56, Emilio G. Cota wrote:
> On Tue, May 24, 2016 at 09:08:01 +0200, Paolo Bonzini wrote:
>> On 23/05/2016 19:09, Emilio G. Cota wrote:
>>> PS. And really equating smp_wmb/rmb to release/acquire as we have under
>>> #ifdef __ATOMIC is hard to justify, other than to please tsan.
>> That only makes a difference on arm64, right?
>>
>>  acquire release rmb wmb
>> x86  --  --  --  --
>> powerlwsync  lwsync  lwsync  lwsync
>> armv7dmb dmb dmb dmb
>> arm64dmb ishld   dmb ish dmb ishld   dmb ishst
>> ia64 --  --  --  --
> Yes. I now see why we're defining rmb/wmb based on acquire/release:
> it's quite convenient given that the compiler provides them, and
> the (tiny) differences in practice are not worth the trouble of
> adding asm for them. So I take back my comment =)
>
> The gains of getting rid of the consume barrier from atomic_rcu_read
> are clear though; updated patch to follow.

However, maybe it's not such a pain to maintain an optimized version for
AArch64 in assembly :P

Best,
Sergey

[Qemu-devel] [RFC PATCH v4 3/3] VFIO Type1 IOMMU: Add support for mediated devices

2016-05-24 Thread Kirti Wankhede

VFIO Type1 IOMMU driver is designed for the devices which are IOMMU
capable. Mediated device only uses IOMMU TYPE1 API, the underlying
hardware can be managed by an IOMMU domain.

This change exports functions to pin and unpin pages for mediated devices.
It maintains data of pinned pages for mediated domain. This data is used to
verify unpinning request and to unpin remaining pages from detach_group()
if there are any.

Aim of this change is:
- To use most of the code of IOMMU driver for mediated devices
- To support direct assigned device and mediated device by single module

Updated the change to keep mediated domain structure out of domain_list.

Tested by assigning below combinations of devices to a single VM:
- GPU pass through only
- vGPU device only
- One GPU pass through and one vGPU device
- two GPU pass through

Signed-off-by: Kirti Wankhede 
Signed-off-by: Neo Jia 
Change-Id: I9c262abc9c68fd6abf52d91a636bf0cc631593a0
---
 drivers/vfio/vfio_iommu_type1.c | 433 +---
 include/linux/vfio.h|   6 +
 2 files changed, 407 insertions(+), 32 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 75b24e93cedb..5cc7dc0288a3 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson "
@@ -55,6 +56,7 @@ MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
struct list_headdomain_list;
+   struct vfio_domain  *mediated_domain;
struct mutexlock;
struct rb_root  dma_list;
boolv2;
@@ -67,6 +69,13 @@ struct vfio_domain {
struct list_headgroup_list;
int prot;   /* IOMMU_CACHE */
boolfgsp;   /* Fine-grained super pages */
+
+   /* Domain for mediated device which is without physical IOMMU */
+   boolmediated_device;
+
+   struct mm_struct*mm;
+   struct rb_root  pfn_list;   /* pinned Host pfn list */
+   struct mutexpfn_list_lock;  /* mutex for pfn_list */
 };
 
 struct vfio_dma {
@@ -79,10 +88,23 @@ struct vfio_dma {
 
 struct vfio_group {
struct iommu_group  *iommu_group;
+   struct mdev_device  *mdevice;
struct list_headnext;
 };
 
 /*
+ * Guest RAM pinning working set or DMA target
+ */
+struct vfio_pfn {
+   struct rb_node  node;
+   unsigned long   vaddr;  /* virtual addr */
+   dma_addr_t  iova;   /* IOVA */
+   unsigned long   npage;  /* number of pages */
+   unsigned long   pfn;/* Host pfn */
+   size_t  prot;
+   atomic_tref_count;
+};
+/*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
  */
@@ -130,6 +152,64 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, 
struct vfio_dma *old)
rb_erase(&old->node, &iommu->dma_list);
 }
 
+/*
+ * Helper Functions for host pfn list
+ */
+
+static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
+ unsigned long pfn)
+{
+   struct rb_node *node;
+   struct vfio_pfn *vpfn, *ret = NULL;
+
+   mutex_lock(&domain->pfn_list_lock);
+   node = domain->pfn_list.rb_node;
+
+   while (node) {
+   vpfn = rb_entry(node, struct vfio_pfn, node);
+
+   if (pfn < vpfn->pfn)
+   node = node->rb_left;
+   else if (pfn > vpfn->pfn)
+   node = node->rb_right;
+   else {
+   ret = vpfn;
+   break;
+   }
+   }
+
+   mutex_unlock(&domain->pfn_list_lock);
+   return ret;
+}
+
+static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
+{
+   struct rb_node **link, *parent = NULL;
+   struct vfio_pfn *vpfn;
+
+   mutex_lock(&domain->pfn_list_lock);
+   link = &domain->pfn_list.rb_node;
+   while (*link) {
+   parent = *link;
+   vpfn = rb_entry(parent, struct vfio_pfn, node);
+
+   if (new->pfn < vpfn->pfn)
+   link = &(*link)->rb_left;
+   else
+   link = &(*link)->rb_right;
+   }
+
+   rb_link_node(&new->node, parent, link);
+   rb_insert_color(&new->node, &domain->pfn_list);
+   mutex_unlock(&domain->pfn_list_lock);
+}
+
+/* call by holding domain->pfn_list_lock */
+static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
+{
+   rb_erase(&old->node, &domain->pfn_list);
+}
+
 struct vwork {
struct mm_struct*mm;
longnpage;
@@ -228,20 +30

1 2 3 4 >

1 - 100 of 313 matches

Mail list logo