date:20170623

Re: [Qemu-block] [Qemu-devel] [PATCH 0/3] AHCI: re-privatize ahci headers

2017-06-23 Thread no-reply

Hi,

This series seems to have some coding style problems. See output below for
more information:

Type: series
Subject: [Qemu-devel] [PATCH 0/3] AHCI: re-privatize ahci headers
Message-id: 20170623220926.11479-1-js...@redhat.com

=== TEST SCRIPT BEGIN ===
#!/bin/bash
#
# Run ./scripts/checkpatch.pl over every commit in the range $BASE..HEAD,
# oldest first, printing a "Checking PATCH n/total" banner for each one.
# Exit status: 0 if checkpatch.pl succeeded for every commit, 1 if it
# returned a failure status for at least one of them.

BASE=base
n=1
total=$(git log --oneline "$BASE".. | wc -l)
failed=0

# Repository-local git settings applied before generating the patches:
# rename detection enabled, diff.renamelimit set to 0.
git config --local diff.renamelimit 0
git config --local diff.renames True

commits="$(git log --format=%H --reverse "$BASE"..)"
# $commits is intentionally left unquoted: it is a newline-separated list of
# commit hashes and must be split into one word per commit.
for c in $commits; do
  echo "Checking PATCH $n/$total: $(git log -n 1 --format=%s "$c")..."
  # The pipeline's status is checkpatch.pl's status (last command in the pipe).
  if ! git show "$c" --format=email | ./scripts/checkpatch.pl --mailback -; then
    # Remember the failure but keep going so every patch gets checked.
    failed=1
    echo
  fi
  n=$((n+1))
done

exit "$failed"
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
dcb5007 ahci: split public and private interface
cabd116 ahci: Isolate public AHCI interface
d723da8 ahci: add ahci_get_num_ports

=== OUTPUT BEGIN ===
Checking PATCH 1/3: ahci: add ahci_get_num_ports...
Checking PATCH 2/3: ahci: Isolate public AHCI interface...
ERROR: spaces required around that '/' (ctx:VxV)
#214: FILE: include/hw/ide/ahci_public.h:88:
+uint32_t regs[ALLWINNER_AHCI_MMIO_SIZE/4];
   ^

total: 1 errors, 0 warnings, 189 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

Checking PATCH 3/3: ahci: split public and private interface...
ERROR: spaces required around that '/' (ctx:VxV)
#793: FILE: include/hw/ide/ahci.h:88:
+uint32_t regs[ALLWINNER_AHCI_MMIO_SIZE/4];
   ^

total: 1 errors, 0 warnings, 760 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

=== OUTPUT END ===

Test command exited with code: 1


---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@freelists.org

Re: [Qemu-block] [Qemu-devel] [RFC PATCH 1/2] arm/highbank: use defined type name instead of hard-coded string

2017-06-23 Thread John Snow



On 06/09/2017 11:05 AM, Philippe Mathieu-Daudé wrote:
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/arm/highbank.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/arm/highbank.c b/hw/arm/highbank.c
> index d209b97dee..dd809b404b 100644
> --- a/hw/arm/highbank.c
> +++ b/hw/arm/highbank.c
> @@ -31,6 +31,7 @@
>  #include "exec/address-spaces.h"
>  #include "qemu/error-report.h"
>  #include "hw/char/pl011.h"
> +#include "hw/ide/ahci.h"
>  
>  #define SMP_BOOT_ADDR   0x100
>  #define SMP_BOOT_REG0x40
> @@ -341,7 +342,7 @@ static void calxeda_init(MachineState *machine, enum 
> cxmachines machine_id)
>  sysbus_create_simple("pl031", 0xfff35000, pic[19]);
>  sysbus_create_simple("pl022", 0xfff39000, pic[23]);
>  
> -sysbus_create_simple("sysbus-ahci", 0xffe08000, pic[83]);
> +sysbus_create_simple(TYPE_SYSBUS_AHCI, 0xffe08000, pic[83]);
>  
>  if (nd_table[0].used) {
>  qemu_check_nic_model(&nd_table[0], "xgmac");
> 

This patch is fine, I took a stab at fixing the include issues in a
separate series. Try it all out and let me know.

--js

[Qemu-block] [PATCH 3/3] ahci: split public and private interface

2017-06-23 Thread John Snow

Complete the split by renaming ahci_public.h --> ahci.h and
moving the current ahci.h to hw/ide/ahci_internal.h.

Adjust ahci_internal.h to now load ahci.h instead of ahci_public.h.

Finalize the split by switching external users to the new header.

Signed-off-by: John Snow 
---
 hw/ide/ahci.c|   2 +-
 hw/ide/ahci_internal.h   | 359 +
 hw/ide/ich.c |   2 +-
 include/hw/ide/ahci.h| 368 ++-
 include/hw/ide/ahci_public.h |  91 ---
 5 files changed, 411 insertions(+), 411 deletions(-)
 create mode 100644 hw/ide/ahci_internal.h
 delete mode 100644 include/hw/ide/ahci_public.h

diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index 1ab3245..fdb1b11 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -32,7 +32,7 @@
 #include "sysemu/dma.h"
 #include "hw/ide/internal.h"
 #include "hw/ide/pci.h"
-#include "hw/ide/ahci.h"
+#include "hw/ide/ahci_internal.h"
 
 #define DEBUG_AHCI 0
 
diff --git a/hw/ide/ahci_internal.h b/hw/ide/ahci_internal.h
new file mode 100644
index 000..1e21169
--- /dev/null
+++ b/hw/ide/ahci_internal.h
@@ -0,0 +1,359 @@
+/*
+ * QEMU AHCI Emulation
+ *
+ * Copyright (c) 2010 qiaoch...@loongson.cn
+ * Copyright (c) 2010 Roland Elek 
+ * Copyright (c) 2010 Sebastian Herbszt 
+ * Copyright (c) 2010 Alexander Graf 
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef HW_IDE_AHCI_INTERNAL_H
+#define HW_IDE_AHCI_INTERNAL_H
+
+#include "hw/ide/ahci.h"
+#include "hw/sysbus.h"
+
+#define AHCI_MEM_BAR_SIZE 0x1000
+#define AHCI_MAX_PORTS32
+#define AHCI_MAX_SG   168 /* hardware max is 64K */
+#define AHCI_DMA_BOUNDARY 0x
+#define AHCI_USE_CLUSTERING   0
+#define AHCI_MAX_CMDS 32
+#define AHCI_CMD_SZ   32
+#define AHCI_CMD_SLOT_SZ  (AHCI_MAX_CMDS * AHCI_CMD_SZ)
+#define AHCI_RX_FIS_SZ256
+#define AHCI_CMD_TBL_CDB  0x40
+#define AHCI_CMD_TBL_HDR_SZ   0x80
+#define AHCI_CMD_TBL_SZ   (AHCI_CMD_TBL_HDR_SZ + (AHCI_MAX_SG * 16))
+#define AHCI_CMD_TBL_AR_SZ(AHCI_CMD_TBL_SZ * AHCI_MAX_CMDS)
+#define AHCI_PORT_PRIV_DMA_SZ (AHCI_CMD_SLOT_SZ + AHCI_CMD_TBL_AR_SZ + \
+   AHCI_RX_FIS_SZ)
+
+#define AHCI_IRQ_ON_SG(1U << 31)
+#define AHCI_CMD_ATAPI(1 << 5)
+#define AHCI_CMD_WRITE(1 << 6)
+#define AHCI_CMD_PREFETCH (1 << 7)
+#define AHCI_CMD_RESET(1 << 8)
+#define AHCI_CMD_CLR_BUSY (1 << 10)
+
+#define RX_FIS_D2H_REG0x40 /* offset of D2H Register FIS data */
+#define RX_FIS_SDB0x58 /* offset of SDB FIS data */
+#define RX_FIS_UNK0x60 /* offset of Unknown FIS data */
+
+/* global controller registers */
+#define HOST_CAP  0x00 /* host capabilities */
+#define HOST_CTL  0x04 /* global host control */
+#define HOST_IRQ_STAT 0x08 /* interrupt status */
+#define HOST_PORTS_IMPL   0x0c /* bitmap of implemented ports */
+#define HOST_VERSION  0x10 /* AHCI spec. version compliancy */
+
+/* HOST_CTL bits */
+#define HOST_CTL_RESET(1 << 0)  /* reset controller; self-clear */
+#define HOST_CTL_IRQ_EN   (1 << 1)  /* global IRQ enable */
+#define HOST_CTL_AHCI_EN  (1U << 31) /* AHCI enabled */
+
+/* HOST_CAP bits */
+#define HOST_CAP_SSC  (1 << 14) /* Slumber capable */
+#define HOST_CAP_AHCI (1 << 18) /* AHCI only */
+#define HOST_CAP_CLO  (1 << 24) /* Command List Override support */
+#define HOST_CAP_SSS  (1 << 27) /* Staggered Spin-up */
+#define HOST_CAP_NCQ  (1 << 30) /* Native Command Queueing */
+#define HOST_CAP_64   (1U << 31) /* PCI DAC (64-bit DMA) support */
+
+/* registers for each SATA port */
+#define PORT_LST_ADDR 0x00 /* command list DMA addr */
+#define PORT_LST_ADDR_HI  0x04 /* command list DMA addr hi */
+#define PORT_FIS_ADDR 0x08 /* FIS rx buf addr */
+#define PORT_FIS_ADDR_HI  0x0c /* FIS rx buf addr hi */
+#define PORT_IRQ_STAT 0x10 /* interrupt status */
+#define PORT_IRQ_MASK 0x14 /* interrupt enable/disable mask */
+#define PORT_CMD  0x18 /* por

[Qemu-block] [PATCH 2/3] ahci: Isolate public AHCI interface

2017-06-23 Thread John Snow

Begin separating the public/private interface by removing the minimum
set of information used by code outside of hw/ide/ and calling this
a new ahci_public.h file, which will be renamed to ahci.h in a future
patch.

Signed-off-by: John Snow 
---
 include/hw/ide/ahci.h| 57 +++
 include/hw/ide/ahci_public.h | 91 
 2 files changed, 96 insertions(+), 52 deletions(-)
 create mode 100644 include/hw/ide/ahci_public.h

diff --git a/include/hw/ide/ahci.h b/include/hw/ide/ahci.h
index f866bbf..70a0140 100644
--- a/include/hw/ide/ahci.h
+++ b/include/hw/ide/ahci.h
@@ -21,9 +21,10 @@
  *
  */
 
-#ifndef HW_IDE_AHCI_H
-#define HW_IDE_AHCI_H
+#ifndef HW_IDE_AHCI_INTERNAL_H
+#define HW_IDE_AHCI_INTERNAL_H
 
+#include "hw/ide/ahci_public.h"
 #include "hw/sysbus.h"
 
 #define AHCI_MEM_BAR_SIZE 0x1000
@@ -210,14 +211,6 @@
 #define SATA_CAP_REV0x2
 #define SATA_CAP_BAR0x4
 
-typedef struct AHCIControlRegs {
-uint32_tcap;
-uint32_tghc;
-uint32_tirqstatus;
-uint32_timpl;
-uint32_tversion;
-} AHCIControlRegs;
-
 typedef struct AHCIPortRegs {
 uint32_tlst_addr;
 uint32_tlst_addr_hi;
@@ -251,8 +244,6 @@ typedef struct AHCI_SG {
 uint32_tflags_size;
 } QEMU_PACKED AHCI_SG;
 
-typedef struct AHCIDevice AHCIDevice;
-
 typedef struct NCQTransferState {
 AHCIDevice *drive;
 BlockAIOCB *aiocb;
@@ -286,27 +277,13 @@ struct AHCIDevice {
 NCQTransferState ncq_tfs[AHCI_MAX_CMDS];
 };
 
-typedef struct AHCIState {
-DeviceState *container;
-
-AHCIDevice *dev;
-AHCIControlRegs control_regs;
-MemoryRegion mem;
-MemoryRegion idp;   /* Index-Data Pair I/O port space */
-unsigned idp_offset;/* Offset of index in I/O port space */
-uint32_t idp_index; /* Current IDP index */
-int32_t ports;
-qemu_irq irq;
-AddressSpace *as;
-} AHCIState;
-
-typedef struct AHCIPCIState {
+struct AHCIPCIState {
 /*< private >*/
 PCIDevice parent_obj;
 /*< public >*/
 
 AHCIState ahci;
-} AHCIPCIState;
+};
 
 #define TYPE_ICH9_AHCI "ich9-ahci"
 
@@ -372,35 +349,11 @@ void ahci_uninit(AHCIState *s);
 
 void ahci_reset(AHCIState *s);
 
-int32_t ahci_get_num_ports(PCIDevice *dev);
-void ahci_ide_create_devs(PCIDevice *dev, DriveInfo **hd);
-
 #define TYPE_SYSBUS_AHCI "sysbus-ahci"
 #define SYSBUS_AHCI(obj) OBJECT_CHECK(SysbusAHCIState, (obj), TYPE_SYSBUS_AHCI)
 
-typedef struct SysbusAHCIState {
-/*< private >*/
-SysBusDevice parent_obj;
-/*< public >*/
-
-AHCIState ahci;
-uint32_t num_ports;
-} SysbusAHCIState;
-
 #define TYPE_ALLWINNER_AHCI "allwinner-ahci"
 #define ALLWINNER_AHCI(obj) OBJECT_CHECK(AllwinnerAHCIState, (obj), \
TYPE_ALLWINNER_AHCI)
 
-#define ALLWINNER_AHCI_MMIO_OFF  0x80
-#define ALLWINNER_AHCI_MMIO_SIZE 0x80
-
-struct AllwinnerAHCIState {
-/*< private >*/
-SysbusAHCIState parent_obj;
-/*< public >*/
-
-MemoryRegion mmio;
-uint32_t regs[ALLWINNER_AHCI_MMIO_SIZE/4];
-};
-
 #endif /* HW_IDE_AHCI_H */
diff --git a/include/hw/ide/ahci_public.h b/include/hw/ide/ahci_public.h
new file mode 100644
index 000..5a06537
--- /dev/null
+++ b/include/hw/ide/ahci_public.h
@@ -0,0 +1,91 @@
+/*
+ * QEMU AHCI Emulation
+ *
+ * Copyright (c) 2010 qiaoch...@loongson.cn
+ * Copyright (c) 2010 Roland Elek 
+ * Copyright (c) 2010 Sebastian Herbszt 
+ * Copyright (c) 2010 Alexander Graf 
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef HW_IDE_AHCI_H
+#define HW_IDE_AHCI_H
+
+#include "hw/sysbus.h"
+
+typedef struct AHCIDevice AHCIDevice;
+
+typedef struct AHCIControlRegs {
+uint32_tcap;
+uint32_tghc;
+uint32_tirqstatus;
+uint32_timpl;
+uint32_tversion;
+} AHCIControlRegs;
+
+typedef struct AHCIState {
+DeviceState *container;
+
+AHCIDevice *dev;
+AHCIControlRegs control_regs;
+MemoryRegion mem;
+MemoryRegion idp;   /* Index-Data Pair I/O port space */
+unsigned idp_offset;/* Offset of index in I/O port space */
+uint32_t idp_index; /* Current IDP index */
+int32_t ports;
+qemu_irq irq;
+AddressSpace *as;
+} AHCIState;
+
+typedef struct AHCIPCIState AHCIPCIState;
+
+#define TYPE_ICH9_AHCI "ich9-ahci"
+
+#define ICH_AHCI(obj) \
+

[Qemu-block] [PATCH 0/3] AHCI: re-privatize ahci headers

2017-06-23 Thread John Snow

As reported by Philippe Mathieu-Daudé, including AHCI headers can
quickly pull in more dependencies than is sane. To remedy this,
split the AHCI headers into public and private areas as best as we
can and move the private information back into hw/ide/.

The only code movement is performed in patch 2.
Patch 3 looks terrible, but all it is doing is two different moves and
renaming one include directive. If you have suggestions for producing
this diffstat differently, please advise.

John Snow (3):
  ahci: add ahci_get_num_ports
  ahci: Isolate public AHCI interface
  ahci: split public and private interface

 hw/i386/pc_q35.c   |   4 +-
 hw/ide/ahci.c  |  10 +-
 hw/ide/ahci_internal.h | 359 +
 hw/ide/ich.c   |   2 +-
 hw/mips/boston.c   |   4 +-
 include/hw/ide/ahci.h  | 320 +--
 6 files changed, 376 insertions(+), 323 deletions(-)
 create mode 100644 hw/ide/ahci_internal.h

-- 
2.9.4

[Qemu-block] [PATCH 1/3] ahci: add ahci_get_num_ports

2017-06-23 Thread John Snow

Instead of reaching into the PCI state, allow the AHCIDevice to
respond with how many ports it has.

Signed-off-by: John Snow 
---
 hw/i386/pc_q35.c  | 4 ++--
 hw/ide/ahci.c | 8 
 hw/mips/boston.c  | 4 ++--
 include/hw/ide/ahci.h | 1 +
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 1523ef3..8715514 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -241,8 +241,8 @@ static void pc_q35_init(MachineState *machine)
true, "ich9-ahci");
 idebus[0] = qdev_get_child_bus(&ahci->qdev, "ide.0");
 idebus[1] = qdev_get_child_bus(&ahci->qdev, "ide.1");
-g_assert(MAX_SATA_PORTS == ICH_AHCI(ahci)->ahci.ports);
-ide_drive_get(hd, ICH_AHCI(ahci)->ahci.ports);
+g_assert(MAX_SATA_PORTS == ahci_get_num_ports(ahci));
+ide_drive_get(hd, ahci_get_num_ports(ahci));
 ahci_ide_create_devs(ahci, hd);
 } else {
 idebus[0] = idebus[1] = NULL;
diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index f60826d..1ab3245 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -1833,6 +1833,14 @@ static void sysbus_ahci_register_types(void)
 
 type_init(sysbus_ahci_register_types)
 
+int32_t ahci_get_num_ports(PCIDevice *dev)
+{
+AHCIPCIState *d = ICH_AHCI(dev);
+AHCIState *ahci = &d->ahci;
+
+return ahci->ports;
+}
+
 void ahci_ide_create_devs(PCIDevice *dev, DriveInfo **hd)
 {
 AHCIPCIState *d = ICH_AHCI(dev);
diff --git a/hw/mips/boston.c b/hw/mips/boston.c
index a4677f7..ad59404 100644
--- a/hw/mips/boston.c
+++ b/hw/mips/boston.c
@@ -538,8 +538,8 @@ static void boston_mach_init(MachineState *machine)
 ahci = pci_create_simple_multifunction(&PCI_BRIDGE(&pcie2->root)->sec_bus,
PCI_DEVFN(0, 0),
true, TYPE_ICH9_AHCI);
-g_assert(ARRAY_SIZE(hd) == ICH_AHCI(ahci)->ahci.ports);
-ide_drive_get(hd, ICH_AHCI(ahci)->ahci.ports);
+g_assert(ARRAY_SIZE(hd) == ahci_get_num_ports(ahci));
+ide_drive_get(hd, ahci_get_num_ports(ahci));
 ahci_ide_create_devs(ahci, hd);
 
 if (machine->firmware) {
diff --git a/include/hw/ide/ahci.h b/include/hw/ide/ahci.h
index 0ca7c65..f866bbf 100644
--- a/include/hw/ide/ahci.h
+++ b/include/hw/ide/ahci.h
@@ -372,6 +372,7 @@ void ahci_uninit(AHCIState *s);
 
 void ahci_reset(AHCIState *s);
 
+int32_t ahci_get_num_ports(PCIDevice *dev);
 void ahci_ide_create_devs(PCIDevice *dev, DriveInfo **hd);
 
 #define TYPE_SYSBUS_AHCI "sysbus-ahci"
-- 
2.9.4

Re: [Qemu-block] [Qemu-devel] [PATCH v3] live-block-ops.txt: Rename, rewrite, and improve it

2017-06-23 Thread John Snow



On 06/23/2017 04:15 AM, Kashyap Chamarthy wrote:
> On Thu, Jun 22, 2017 at 10:13:03AM -0400, John Snow wrote:
>> On 06/22/2017 04:56 AM, Kashyap Chamarthy wrote:
>>> On Wed, Jun 21, 2017 at 06:49:02PM -0400, John Snow wrote:
> 
> [...]
> 
>>> Yes, I was thinking of this, too -- just link to the 'bitmaps' document.
>>>
>>> A quick side question here: Since upstream QEMU is converging on
>>> Sphinx and rST, I hope you don't mind if I convert docs/devel/bitmaps.md
>>> into rST at some point, for consistency's sake.  I'll file a separate
>>> review for that anyway.  In the long term, all / most other documents
>>> would also be converted.
>>>
>>
>> Of course not. I chose bitmaps.md so that it would be nice to view from
>> the github interface while remaining nice to read in plaintext, but feel
>> free to convert it if we actually do standardize on Sphinx/rST.
>>
>> If you can make the generated output look prettier than the github
>> rendering of the markdown I'll ACK it ;)
> 
> :-) Here's a sneak peek (don't miss the index on your left-hand side):
> 
> 
> https://kashyapc.fedorapeople.org/v3-QEMU-Docs/_build/html/docs/bitmaps.html
> 

Looking good, you've got an errant backslash:

'Let’s assume the full backup is named full\_backup.img.'

I also think we should change

"Let’s assume it is named incremental.0.img."

to

"Let's assume the new incremental image is named incremental.0.img"

so as to avoid "it" when dealing with two images.

Otherwise, LGTM.

> Compare that with the GitHub rendering:
> 
> https://github.com/qemu/qemu/blob/master/docs/devel/bitmaps.md
> 
> And, here's the source (in reStructuredText, despite the ".txt"
> extension) for the 'bitmaps' doc (I made tiny styling changes):
> 
> 
> https://kashyapc.fedorapeople.org/v3-QEMU-Docs/_build/html/_sources/docs/bitmaps.txt
> 
>>> Yes, I fully agree with your suggestion.  I will simply link to the
>>> detailed document you wrote, which I was thinking of anyhow.
>>>
>>> Thanks for your comments!
>>>
>> Sure. You could perhaps mention the different sync modes, including top,
>> none, full and incremental and urge readers to check out the bitmaps
>> document for detailed workings of the incremental mode.
> 
> Yeah, good point.  I'll make that edit in v2.
>

[Qemu-block] [PATCH v10 20/20] docs: document encryption options for qcow, qcow2 and luks

2017-06-23 Thread Daniel P. Berrange

Expand the image format docs to cover the new options for
the qcow, qcow2 and luks disk image formats

Reviewed-by: Alberto Garcia 
Reviewed-by: Eric Blake 
Signed-off-by: Daniel P. Berrange 
---
 qemu-doc.texi | 123 ++
 1 file changed, 115 insertions(+), 8 deletions(-)

diff --git a/qemu-doc.texi b/qemu-doc.texi
index 965ba59..3c3081f 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -541,10 +541,20 @@ File name of a base image (see @option{create} subcommand)
 @item backing_fmt
 Image format of the base image
 @item encryption
-If this option is set to @code{on}, the image is encrypted with 128-bit 
AES-CBC.
+This option is deprecated and equivalent to @code{encrypt.format=aes}
 
-The use of encryption in qcow and qcow2 images is considered to be flawed by
-modern cryptography standards, suffering from a number of design problems:
+@item encrypt.format
+
+If this is set to @code{luks}, it requests that the qcow2 payload (not
+qcow2 header) be encrypted using the LUKS format. The passphrase to
+use to unlock the LUKS key slot is given by the @code{encrypt.key-secret}
+parameter. LUKS encryption parameters can be tuned with the other
+@code{encrypt.*} parameters.
+
+If this is set to @code{aes}, the image is encrypted with 128-bit AES-CBC.
+The encryption key is given by the @code{encrypt.key-secret} parameter.
+This encryption format is considered to be flawed by modern cryptography
+standards, suffering from a number of design problems:
 
 @itemize @minus
 @item The AES-CBC cipher is used with predictable initialization vectors based
@@ -559,10 +569,45 @@ original file must then be securely erased using a 
program like shred,
 though even this is ineffective with many modern storage technologies.
 @end itemize
 
-Use of qcow / qcow2 encryption with QEMU is deprecated, and support for
-it will go away in a future release.  Users are recommended to use an
-alternative encryption technology such as the Linux dm-crypt / LUKS
-system.
+The use of this is no longer supported in system emulators. Support only
+remains in the command line utilities, for the purposes of data liberation
+and interoperability with old versions of QEMU. The @code{luks} format
+should be used instead.
+
+@item encrypt.key-secret
+
+Provides the ID of a @code{secret} object that contains the passphrase
+(@code{encrypt.format=luks}) or encryption key (@code{encrypt.format=aes}).
+
+@item encrypt.cipher-alg
+
+Name of the cipher algorithm and key length. Currently defaults
+to @code{aes-256}. Only used when @code{encrypt.format=luks}.
+
+@item encrypt.cipher-mode
+
+Name of the encryption mode to use. Currently defaults to @code{xts}.
+Only used when @code{encrypt.format=luks}.
+
+@item encrypt.ivgen-alg
+
+Name of the initialization vector generator algorithm. Currently defaults
+to @code{plain64}. Only used when @code{encrypt.format=luks}.
+
+@item encrypt.ivgen-hash-alg
+
+Name of the hash algorithm to use with the initialization vector generator
+(if required). Defaults to @code{sha256}. Only used when 
@code{encrypt.format=luks}.
+
+@item encrypt.hash-alg
+
+Name of the hash algorithm to use for PBKDF algorithm
+Defaults to @code{sha256}. Only used when @code{encrypt.format=luks}.
+
+@item encrypt.iter-time
+
+Amount of time, in milliseconds, to use for PBKDF algorithm per key slot.
+Defaults to @code{2000}. Only used when @code{encrypt.format=luks}.
 
 @item cluster_size
 Changes the qcow2 cluster size (must be between 512 and 2M). Smaller cluster
@@ -637,7 +682,69 @@ Supported options:
 @item backing_file
 File name of a base image (see @option{create} subcommand)
 @item encryption
-If this option is set to @code{on}, the image is encrypted.
+This option is deprecated and equivalent to @code{encrypt.format=aes}
+
+@item encrypt.format
+If this is set to @code{aes}, the image is encrypted with 128-bit AES-CBC.
+The encryption key is given by the @code{encrypt.key-secret} parameter.
+This encryption format is considered to be flawed by modern cryptography
+standards, suffering from a number of design problems enumerated previously
+against the @code{qcow2} image format.
+
+The use of this is no longer supported in system emulators. Support only
+remains in the command line utilities, for the purposes of data liberation
+and interoperability with old versions of QEMU.
+
+Users requiring native encryption should use the @code{qcow2} format
+instead with @code{encrypt.format=luks}.
+
+@item encrypt.key-secret
+
+Provides the ID of a @code{secret} object that contains the encryption
+key (@code{encrypt.format=aes}).
+
+@end table
+
+@item luks
+
+LUKS v1 encryption format, compatible with Linux dm-crypt/cryptsetup
+
+Supported options:
+@table @code
+
+@item key-secret
+
+Provides the ID of a @code{secret} object that contains the passphrase.
+
+@item cipher-alg
+
+Name of the cipher algorithm and key length. Currently defaults
+to @code{aes-256}.
+
+@item cipher-mode
+
+Name of t

[Qemu-block] [PATCH v10 18/20] block: pass option prefix down to crypto layer

2017-06-23 Thread Daniel P. Berrange

While the crypto layer uses a fixed option name "key-secret",
the upper block layer may have a prefix on the options, e.g.
"encrypt.key-secret", in order to avoid clashes between crypto
option names & other block option names. To ensure the crypto
layer can report accurate error messages, we must tell it what
option name prefix was used.

Reviewed-by: Alberto Garcia 
Reviewed-by: Max Reitz 
Signed-off-by: Daniel P. Berrange 
---
 block/crypto.c| 4 ++--
 block/qcow.c  | 7 ---
 block/qcow2.c | 8 
 crypto/block-luks.c   | 8 ++--
 crypto/block-qcow.c   | 8 ++--
 crypto/block.c| 6 --
 crypto/blockpriv.h| 2 ++
 include/crypto/block.h| 6 +-
 tests/test-crypto-block.c | 8 
 9 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/block/crypto.c b/block/crypto.c
index 3ad4b20..c561cba 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -296,7 +296,7 @@ static int block_crypto_open_generic(QCryptoBlockFormat 
format,
 if (flags & BDRV_O_NO_IO) {
 cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
 }
-crypto->block = qcrypto_block_open(open_opts,
+crypto->block = qcrypto_block_open(open_opts, NULL,
block_crypto_read_func,
bs,
cflags,
@@ -340,7 +340,7 @@ static int block_crypto_create_generic(QCryptoBlockFormat 
format,
 return -1;
 }
 
-crypto = qcrypto_block_create(create_opts,
+crypto = qcrypto_block_create(create_opts, NULL,
   block_crypto_init_func,
   block_crypto_write_func,
   &data,
diff --git a/block/qcow.c b/block/qcow.c
index 8a24930..2002c16 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -208,8 +208,8 @@ static int qcow_open(BlockDriverState *bs, QDict *options, 
int flags,
 if (flags & BDRV_O_NO_IO) {
 cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
 }
-s->crypto = qcrypto_block_open(crypto_opts, NULL, NULL,
-   cflags, errp);
+s->crypto = qcrypto_block_open(crypto_opts, "encrypt.",
+   NULL, NULL, cflags, errp);
 if (!s->crypto) {
 ret = -EINVAL;
 goto fail;
@@ -866,7 +866,8 @@ static int qcow_create(const char *filename, QemuOpts 
*opts, Error **errp)
 goto exit;
 }
 
-crypto = qcrypto_block_create(crypto_opts, NULL, NULL, NULL, errp);
+crypto = qcrypto_block_create(crypto_opts, "encrypt.",
+  NULL, NULL, NULL, errp);
 if (!crypto) {
 ret = -EINVAL;
 goto exit;
diff --git a/block/qcow2.c b/block/qcow2.c
index 765220e..b3836d5 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -279,7 +279,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, 
uint64_t start_offset,
 if (flags & BDRV_O_NO_IO) {
 cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
 }
-s->crypto = qcrypto_block_open(s->crypto_opts,
+s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
qcow2_crypto_hdr_read_func,
bs, cflags, errp);
 if (!s->crypto) {
@@ -1313,8 +1313,8 @@ static int qcow2_do_open(BlockDriverState *bs, QDict 
*options, int flags,
 if (flags & BDRV_O_NO_IO) {
 cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
 }
-s->crypto = qcrypto_block_open(s->crypto_opts, NULL, NULL,
-   cflags, errp);
+s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
+   NULL, NULL, cflags, errp);
 if (!s->crypto) {
 ret = -EINVAL;
 goto fail;
@@ -2318,7 +2318,7 @@ static int qcow2_set_up_encryption(BlockDriverState *bs, 
const char *encryptfmt,
 }
 s->crypt_method_header = fmt;
 
-crypto = qcrypto_block_create(cryptoopts,
+crypto = qcrypto_block_create(cryptoopts, "encrypt.",
   qcow2_crypto_hdr_init_func,
   qcow2_crypto_hdr_write_func,
   bs, errp);
diff --git a/crypto/block-luks.c b/crypto/block-luks.c
index 2b97d89..afb8543 100644
--- a/crypto/block-luks.c
+++ b/crypto/block-luks.c
@@ -638,6 +638,7 @@ qcrypto_block_luks_find_key(QCryptoBlock *block,
 static int
 qcrypto_block_luks_open(QCryptoBlock *block,
 QCryptoBlockOpenOptions *options,
+const char *optprefix,
 QCryptoBlockReadFunc readfunc,
 void *opaque,
 unsigned int flags,
@@ -661,7 +662,8 @@ qcrypto_

[Qemu-block] [PATCH v10 13/20] qcow2: add support for LUKS encryption format

2017-06-23 Thread Daniel P. Berrange

This adds support for using LUKS as an encryption format
with the qcow2 file, using the new encrypt.format parameter
to request "luks" format. e.g.

  # qemu-img create --object secret,data=123456,id=sec0 \
   -f qcow2 -o encrypt.format=luks,encrypt.key-secret=sec0 \
   test.qcow2 10G

The legacy "encryption=on" parameter still results in
creation of the old qcow2 AES format (and is equivalent
to the new 'encrypt.format=aes'). e.g. the following are
equivalent:

  # qemu-img create --object secret,data=123456,id=sec0 \
   -f qcow2 -o encryption=on,encrypt.key-secret=sec0 \
   test.qcow2 10G

  # qemu-img create --object secret,data=123456,id=sec0 \
   -f qcow2 -o encrypt.format=aes,encrypt.key-secret=sec0 \
   test.qcow2 10G

With the LUKS format it is necessary to store the LUKS
partition header and key material in the QCow2 file. This
data can be many MB in size, so cannot go into the QCow2
header region directly. Thus the spec defines a FDE
(Full Disk Encryption) header extension that specifies
the offset of a set of clusters to hold the FDE headers,
as well as the length of that region. The LUKS header is
thus stored in these extra allocated clusters before the
main image payload.

Aside from all the cryptographic differences implied by
use of the LUKS format, there is one further key difference
between the use of legacy AES and LUKS encryption in qcow2.
For LUKS, the initialization vectors are generated using
the host physical sector as the input, rather than the
guest virtual sector. This guarantees unique initialization
vectors for all sectors when qcow2 internal snapshots are
used, thus giving stronger protection against watermarking
attacks.

Signed-off-by: Daniel P. Berrange 
---
 block/qcow2-cluster.c  |  14 ++-
 block/qcow2-refcount.c |  10 ++
 block/qcow2.c  | 268 ++--
 block/qcow2.h  |   9 ++
 qapi/block-core.json   |   5 +-
 tests/qemu-iotests/082.out | 270 -
 6 files changed, 484 insertions(+), 92 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 71a5e0d..f06c08f 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -389,13 +389,16 @@ static int coroutine_fn 
do_perform_cow_read(BlockDriverState *bs,
 
 static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
 uint64_t src_cluster_offset,
+uint64_t cluster_offset,
 unsigned offset_in_cluster,
 uint8_t *buffer,
 unsigned bytes)
 {
 if (bytes && bs->encrypted) {
 BDRVQcow2State *s = bs->opaque;
-int64_t sector = (src_cluster_offset + offset_in_cluster)
+int64_t sector = (s->crypt_physical_offset ?
+  (cluster_offset + offset_in_cluster) :
+  (src_cluster_offset + offset_in_cluster))
  >> BDRV_SECTOR_BITS;
 assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
 assert((bytes & ~BDRV_SECTOR_MASK) == 0);
@@ -788,10 +791,11 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta 
*m)
 
 /* Encrypt the data if necessary before writing it */
 if (bs->encrypted) {
-if (!do_perform_cow_encrypt(bs, m->offset, start->offset,
-start_buffer, start->nb_bytes) ||
-!do_perform_cow_encrypt(bs, m->offset, end->offset,
-end_buffer, end->nb_bytes)) {
+if (!do_perform_cow_encrypt(bs, m->offset, m->alloc_offset,
+start->offset, start_buffer,
+start->nb_bytes) ||
+!do_perform_cow_encrypt(bs, m->offset, m->alloc_offset,
+end->offset, end_buffer, end->nb_bytes)) {
 ret = -EIO;
 goto fail;
 }
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 7c06061..81c22e6 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1856,6 +1856,16 @@ static int calculate_refcounts(BlockDriverState *bs, 
BdrvCheckResult *res,
 return ret;
 }
 
+/* encryption */
+if (s->crypto_header.length) {
+ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
+s->crypto_header.offset,
+s->crypto_header.length);
+if (ret < 0) {
+return ret;
+}
+}
+
 return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
 }
 
diff --git a/block/qcow2.c b/block/qcow2.c
index 0a31127..173aeae 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -66,6 +66,7 @@ typedef struct {
 #define  QCOW2_EXT_MAGIC_END 0
 #define  QCOW2_EXT_MAGIC_BACKING_FOR

[Qemu-block] [PATCH v10 08/20] qcow: make encrypt_sectors encrypt in place

2017-06-23 Thread Daniel P. Berrange

Instead of requiring separate input/output buffers for
encrypting data, change encrypt_sectors() to assume
use of a single buffer, encrypting in place. One current
caller uses the same buffer for input/output already
and the other two callers are easily converted to do so.

Reviewed-by: Alberto Garcia 
Reviewed-by: Eric Blake 
Reviewed-by: Max Reitz 
Reviewed-by: Kevin Wolf 
Signed-off-by: Daniel P. Berrange 
---
 block/qcow.c | 45 +++--
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index a442ed7..3a3dbf9 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -322,11 +322,10 @@ static int qcow_set_key(BlockDriverState *bs, const char 
*key)
 }
 
 /* The crypt function is compatible with the linux cryptoloop
-   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
-   supported */
+   algorithm for < 4 GB images. */
 static int encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
-   uint8_t *out_buf, const uint8_t *in_buf,
-   int nb_sectors, bool enc, Error **errp)
+   uint8_t *buf, int nb_sectors, bool enc,
+   Error **errp)
 {
 union {
 uint64_t ll[2];
@@ -345,14 +344,12 @@ static int encrypt_sectors(BDRVQcowState *s, int64_t 
sector_num,
 }
 if (enc) {
 ret = qcrypto_cipher_encrypt(s->cipher,
- in_buf,
- out_buf,
+ buf, buf,
  512,
  errp);
 } else {
 ret = qcrypto_cipher_decrypt(s->cipher,
- in_buf,
- out_buf,
+ buf, buf,
  512,
  errp);
 }
@@ -360,8 +357,7 @@ static int encrypt_sectors(BDRVQcowState *s, int64_t 
sector_num,
 return -1;
 }
 sector_num++;
-in_buf += 512;
-out_buf += 512;
+buf += 512;
 }
 return 0;
 }
@@ -481,13 +477,12 @@ static uint64_t get_cluster_offset(BlockDriverState *bs,
 uint64_t start_sect;
 assert(s->cipher);
 start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
-memset(s->cluster_data + 512, 0x00, 512);
 for(i = 0; i < s->cluster_sectors; i++) {
 if (i < n_start || i >= n_end) {
 Error *err = NULL;
+memset(s->cluster_data, 0x00, 512);
 if (encrypt_sectors(s, start_sect + i,
-s->cluster_data,
-s->cluster_data + 512, 1,
+s->cluster_data, 1,
 true, &err) < 0) {
 error_free(err);
 errno = EIO;
@@ -665,7 +660,7 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, 
int64_t sector_num,
 }
 if (bs->encrypted) {
 assert(s->cipher);
-if (encrypt_sectors(s, sector_num, buf, buf,
+if (encrypt_sectors(s, sector_num, buf,
 n, false, &err) < 0) {
 goto fail;
 }
@@ -700,9 +695,7 @@ static coroutine_fn int qcow_co_writev(BlockDriverState 
*bs, int64_t sector_num,
 BDRVQcowState *s = bs->opaque;
 int index_in_cluster;
 uint64_t cluster_offset;
-const uint8_t *src_buf;
 int ret = 0, n;
-uint8_t *cluster_data = NULL;
 struct iovec hd_iov;
 QEMUIOVector hd_qiov;
 uint8_t *buf;
@@ -710,7 +703,9 @@ static coroutine_fn int qcow_co_writev(BlockDriverState 
*bs, int64_t sector_num,
 
 s->cluster_cache_offset = -1; /* disable compressed cache */
 
-if (qiov->niov > 1) {
+/* We must always copy the iov when encrypting, so we
+ * don't modify the original data buffer during encryption */
+if (bs->encrypted || qiov->niov > 1) {
 buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
 if (buf == NULL) {
 return -ENOMEM;
@@ -740,21 +735,14 @@ static coroutine_fn int qcow_co_writev(BlockDriverState 
*bs, int64_t sector_num,
 if (bs->encrypted) {
 Error *err = NULL;
 assert(s->cipher);
-if (!cluster_data) {
-cluster_data = g_malloc0(s->cluster_size);
-}
-if (encrypt_sectors(s, sector_num, cluster_data, buf,
-n, true, &err) < 0) {
+if (encrypt_sectors(s, sector_num, buf, n, true, &err) < 0) {

[Qemu-block] [PATCH v10 02/20] block: add ability to set a prefix for opt names

2017-06-23 Thread Daniel P. Berrange

When integrating the crypto support with qcow/qcow2, we don't
want to use the bare LUKS option names "hash-alg", "key-secret",
etc. We need to namespace them to match the nested QAPI schema.

e.g. "encrypt.hash-alg", "encrypt.key-secret"

so that they don't clash with any general qcow options at a later
date.

Reviewed-by: Eric Blake 
Reviewed-by: Max Reitz 
Reviewed-by: Alberto Garcia 
Signed-off-by: Daniel P. Berrange 
---
 block/crypto.c | 16 
 block/crypto.h | 40 
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/block/crypto.c b/block/crypto.c
index ea40ba4..9df1e5d 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -129,7 +129,7 @@ static QemuOptsList block_crypto_runtime_opts_luks = {
 .name = "crypto",
 .head = QTAILQ_HEAD_INITIALIZER(block_crypto_runtime_opts_luks.head),
 .desc = {
-BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET,
+BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET(""),
 { /* end of list */ }
 },
 };
@@ -144,13 +144,13 @@ static QemuOptsList block_crypto_create_opts_luks = {
 .type = QEMU_OPT_SIZE,
 .help = "Virtual disk size"
 },
-BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET,
-BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG,
-BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE,
-BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG,
-BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG,
-BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG,
-BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME,
+BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET(""),
+BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG(""),
+BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE(""),
+BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG(""),
+BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG(""),
+BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG(""),
+BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME(""),
 { /* end of list */ }
 },
 };
diff --git a/block/crypto.h b/block/crypto.h
index c0e9b54..3430dcd 100644
--- a/block/crypto.h
+++ b/block/crypto.h
@@ -29,51 +29,51 @@
 #define BLOCK_CRYPTO_OPT_LUKS_HASH_ALG "hash-alg"
 #define BLOCK_CRYPTO_OPT_LUKS_ITER_TIME "iter-time"
 
-#define BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET\
+#define BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET(prefix)\
 {   \
-.name = BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET,   \
+.name = prefix BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET,\
 .type = QEMU_OPT_STRING,\
 .help = "ID of the secret that provides the keyslot passphrase", \
 }
 
-#define BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG   \
+#define BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG(prefix)   \
 {  \
-.name = BLOCK_CRYPTO_OPT_LUKS_CIPHER_ALG,  \
+.name = prefix BLOCK_CRYPTO_OPT_LUKS_CIPHER_ALG,   \
 .type = QEMU_OPT_STRING,   \
 .help = "Name of encryption cipher algorithm", \
 }
 
-#define BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE \
-{ \
-.name = BLOCK_CRYPTO_OPT_LUKS_CIPHER_MODE,\
-.type = QEMU_OPT_STRING,  \
-.help = "Name of encryption cipher mode", \
+#define BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE(prefix)  \
+{  \
+.name = prefix BLOCK_CRYPTO_OPT_LUKS_CIPHER_MODE,  \
+.type = QEMU_OPT_STRING,   \
+.help = "Name of encryption cipher mode",  \
 }
 
-#define BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG   \
-{ \
-.name = BLOCK_CRYPTO_OPT_LUKS_IVGEN_ALG,  \
-.type = QEMU_OPT_STRING,  \
-.help = "Name of IV generator algorithm", \
+#define BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG(prefix) \
+{   \
+.name = prefix BLOCK_CRYPTO_OPT_LUKS_IVGEN_ALG, \
+.type = QEMU_OPT_STRING,\
+.help = "Name of IV generator algorithm",   \
 }
 
-#define BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG\
+#define BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG(prefix)\
 {   \
-.name = BLOCK_CRYPTO_OPT_LUKS_IVGEN_HASH_ALG,   \
+.name = prefix BLOCK_CRYPTO_OPT_LUKS_IVGEN_HASH_ALG,\
 .type = QEMU_OPT_STRING,\
 .help = "Name of IV generator hash algorithm",  \
 }
 
-#define BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG   \
+#define BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG(prefix)   \
 {\
-

[Qemu-block] [PULL 56/61] block: change variable names in BlockDriverState

2017-06-23 Thread Kevin Wolf

From: Manos Pitsidianakis 

Change the 'int count' parameter in the *pwrite_zeroes and *pdiscard related
functions (and some others) to 'int bytes', as in every case it is a byte
count. This helps with code legibility.

Signed-off-by: Manos Pitsidianakis 
Message-id: 20170609101808.13506-1-el13...@mail.ntua.gr
Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Max Reitz 
---
 block/blkdebug.c   | 36 +++
 block/blkreplay.c  |  8 +++
 block/block-backend.c  | 22 +--
 block/file-posix.c | 34 +++---
 block/io.c | 48 +-
 block/iscsi.c  | 20 +-
 block/mirror.c |  8 +++
 block/nbd-client.c |  8 +++
 block/nbd-client.h |  4 ++--
 block/qcow2.c  | 28 
 block/qed.c|  8 +++
 block/raw-format.c |  8 +++
 block/rbd.c|  4 ++--
 block/sheepdog.c   |  6 +++---
 include/block/block.h  |  8 +++
 include/block/block_int.h  |  6 +++---
 include/sysemu/block-backend.h | 20 +-
 qemu-io-cmds.c | 46 
 18 files changed, 161 insertions(+), 161 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index 0618fc7..6431962 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -575,7 +575,7 @@ static int blkdebug_co_flush(BlockDriverState *bs)
 }
 
 static int coroutine_fn blkdebug_co_pwrite_zeroes(BlockDriverState *bs,
-  int64_t offset, int count,
+  int64_t offset, int bytes,
   BdrvRequestFlags flags)
 {
 uint32_t align = MAX(bs->bl.request_alignment,
@@ -586,29 +586,29 @@ static int coroutine_fn 
blkdebug_co_pwrite_zeroes(BlockDriverState *bs,
  * preferred alignment (so that we test the fallback to writes on
  * unaligned portions), and check that the block layer never hands
  * us anything unaligned that crosses an alignment boundary.  */
-if (count < align) {
+if (bytes < align) {
 assert(QEMU_IS_ALIGNED(offset, align) ||
-   QEMU_IS_ALIGNED(offset + count, align) ||
+   QEMU_IS_ALIGNED(offset + bytes, align) ||
DIV_ROUND_UP(offset, align) ==
-   DIV_ROUND_UP(offset + count, align));
+   DIV_ROUND_UP(offset + bytes, align));
 return -ENOTSUP;
 }
 assert(QEMU_IS_ALIGNED(offset, align));
-assert(QEMU_IS_ALIGNED(count, align));
+assert(QEMU_IS_ALIGNED(bytes, align));
 if (bs->bl.max_pwrite_zeroes) {
-assert(count <= bs->bl.max_pwrite_zeroes);
+assert(bytes <= bs->bl.max_pwrite_zeroes);
 }
 
-err = rule_check(bs, offset, count);
+err = rule_check(bs, offset, bytes);
 if (err) {
 return err;
 }
 
-return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
+return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
 }
 
 static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs,
- int64_t offset, int count)
+ int64_t offset, int bytes)
 {
 uint32_t align = bs->bl.pdiscard_alignment;
 int err;
@@ -616,29 +616,29 @@ static int coroutine_fn 
blkdebug_co_pdiscard(BlockDriverState *bs,
 /* Only pass through requests that are larger than requested
  * minimum alignment, and ensure that unaligned requests do not
  * cross optimum discard boundaries. */
-if (count < bs->bl.request_alignment) {
+if (bytes < bs->bl.request_alignment) {
 assert(QEMU_IS_ALIGNED(offset, align) ||
-   QEMU_IS_ALIGNED(offset + count, align) ||
+   QEMU_IS_ALIGNED(offset + bytes, align) ||
DIV_ROUND_UP(offset, align) ==
-   DIV_ROUND_UP(offset + count, align));
+   DIV_ROUND_UP(offset + bytes, align));
 return -ENOTSUP;
 }
 assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
-assert(QEMU_IS_ALIGNED(count, bs->bl.request_alignment));
-if (align && count >= align) {
+assert(QEMU_IS_ALIGNED(bytes, bs->bl.request_alignment));
+if (align && bytes >= align) {
 assert(QEMU_IS_ALIGNED(offset, align));
-assert(QEMU_IS_ALIGNED(count, align));
+assert(QEMU_IS_ALIGNED(bytes, align));
 }
 if (bs->bl.max_pdiscard) {
-assert(count <= bs->bl.max_pdiscard);
+assert(bytes <= bs->bl.max_pdiscard);
 }
 
-err = rule_check(bs, offset, count);
+err = rule_check(bs, offset, bytes);
 if (err) {
 return err;
 }
 
-return bdrv_co_pdiscard(bs->file->bs, offset, count);
+return bdrv_co_pdiscard(bs->file->bs, offset, bytes);
 }

[Qemu-block] [PATCH v10 01/20] block: expose crypto option names / defs to other drivers

2017-06-23 Thread Daniel P. Berrange

The block/crypto.c defines a set of QemuOpts that provide
parameters for encryption. This will also be needed by
the qcow/qcow2 integration, so expose the relevant pieces
in a new block/crypto.h header. Some helper methods taking
QemuOpts are changed to take QDict to simplify usage in
other places.

Reviewed-by: Max Reitz 
Reviewed-by: Eric Blake 
Reviewed-by: Alberto Garcia 
Signed-off-by: Daniel P. Berrange 
---
 block/crypto.c | 82 +---
 block/crypto.h | 91 ++
 2 files changed, 117 insertions(+), 56 deletions(-)
 create mode 100644 block/crypto.h

diff --git a/block/crypto.c b/block/crypto.c
index 10e5ddc..ea40ba4 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -24,16 +24,10 @@
 #include "sysemu/block-backend.h"
 #include "crypto/block.h"
 #include "qapi/opts-visitor.h"
+#include "qapi/qobject-input-visitor.h"
 #include "qapi-visit.h"
 #include "qapi/error.h"
-
-#define BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET "key-secret"
-#define BLOCK_CRYPTO_OPT_LUKS_CIPHER_ALG "cipher-alg"
-#define BLOCK_CRYPTO_OPT_LUKS_CIPHER_MODE "cipher-mode"
-#define BLOCK_CRYPTO_OPT_LUKS_IVGEN_ALG "ivgen-alg"
-#define BLOCK_CRYPTO_OPT_LUKS_IVGEN_HASH_ALG "ivgen-hash-alg"
-#define BLOCK_CRYPTO_OPT_LUKS_HASH_ALG "hash-alg"
-#define BLOCK_CRYPTO_OPT_LUKS_ITER_TIME "iter-time"
+#include "block/crypto.h"
 
 typedef struct BlockCrypto BlockCrypto;
 
@@ -135,11 +129,7 @@ static QemuOptsList block_crypto_runtime_opts_luks = {
 .name = "crypto",
 .head = QTAILQ_HEAD_INITIALIZER(block_crypto_runtime_opts_luks.head),
 .desc = {
-{
-.name = BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET,
-.type = QEMU_OPT_STRING,
-.help = "ID of the secret that provides the encryption key",
-},
+BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET,
 { /* end of list */ }
 },
 };
@@ -154,49 +144,21 @@ static QemuOptsList block_crypto_create_opts_luks = {
 .type = QEMU_OPT_SIZE,
 .help = "Virtual disk size"
 },
-{
-.name = BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET,
-.type = QEMU_OPT_STRING,
-.help = "ID of the secret that provides the encryption key",
-},
-{
-.name = BLOCK_CRYPTO_OPT_LUKS_CIPHER_ALG,
-.type = QEMU_OPT_STRING,
-.help = "Name of encryption cipher algorithm",
-},
-{
-.name = BLOCK_CRYPTO_OPT_LUKS_CIPHER_MODE,
-.type = QEMU_OPT_STRING,
-.help = "Name of encryption cipher mode",
-},
-{
-.name = BLOCK_CRYPTO_OPT_LUKS_IVGEN_ALG,
-.type = QEMU_OPT_STRING,
-.help = "Name of IV generator algorithm",
-},
-{
-.name = BLOCK_CRYPTO_OPT_LUKS_IVGEN_HASH_ALG,
-.type = QEMU_OPT_STRING,
-.help = "Name of IV generator hash algorithm",
-},
-{
-.name = BLOCK_CRYPTO_OPT_LUKS_HASH_ALG,
-.type = QEMU_OPT_STRING,
-.help = "Name of encryption hash algorithm",
-},
-{
-.name = BLOCK_CRYPTO_OPT_LUKS_ITER_TIME,
-.type = QEMU_OPT_NUMBER,
-.help = "Time to spend in PBKDF in milliseconds",
-},
+BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET,
+BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG,
+BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE,
+BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG,
+BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG,
+BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG,
+BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME,
 { /* end of list */ }
 },
 };
 
 
-static QCryptoBlockOpenOptions *
+QCryptoBlockOpenOptions *
 block_crypto_open_opts_init(QCryptoBlockFormat format,
-QemuOpts *opts,
+QDict *opts,
 Error **errp)
 {
 Visitor *v;
@@ -206,7 +168,7 @@ block_crypto_open_opts_init(QCryptoBlockFormat format,
 ret = g_new0(QCryptoBlockOpenOptions, 1);
 ret->format = format;
 
-v = opts_visitor_new(opts);
+v = qobject_input_visitor_new_keyval(QOBJECT(opts));
 
 visit_start_struct(v, NULL, NULL, 0, &local_err);
 if (local_err) {
@@ -240,9 +202,9 @@ block_crypto_open_opts_init(QCryptoBlockFormat format,
 }
 
 
-static QCryptoBlockCreateOptions *
+QCryptoBlockCreateOptions *
 block_crypto_create_opts_init(QCryptoBlockFormat format,
-  QemuOpts *opts,
+  QDict *opts,
   Error **errp)
 {
 Visitor *v;
@@ -252,7 +214,7 @@ block_crypto_create_opts_init(QCryptoBlockFormat format,
 ret = g_new0(QCryptoBlockCreateOptions, 1);
 ret->format = format;
 
-v = opts_visitor_new(opts);
+v = qobject_input_visitor_new_keyval(QOBJECT(opts));
 
 visit_start_struct(v, NULL, NULL, 0, &local_err);
 if (local_err) {
@@ -299,6 +261,7 @@ static int b

[Qemu-block] [PATCH v10 14/20] qcow2: add iotests to cover LUKS encryption support

2017-06-23 Thread Daniel P. Berrange

This extends the 087 iotest to cover LUKS encryption when doing
blockdev-add.

Two further tests are added to validate read/write of LUKS
encrypted images with a single file and with a backing file.

Reviewed-by: Alberto Garcia 
Reviewed-by: Max Reitz 
Signed-off-by: Daniel P. Berrange 
---
 tests/qemu-iotests/087 | 35 ++-
 tests/qemu-iotests/087.out | 14 +++-
 tests/qemu-iotests/188 | 76 
 tests/qemu-iotests/188.out | 18 ++
 tests/qemu-iotests/189 | 86 ++
 tests/qemu-iotests/189.out | 26 ++
 tests/qemu-iotests/group   |  2 ++
 7 files changed, 255 insertions(+), 2 deletions(-)
 create mode 100755 tests/qemu-iotests/188
 create mode 100644 tests/qemu-iotests/188.out
 create mode 100755 tests/qemu-iotests/189
 create mode 100644 tests/qemu-iotests/189.out

diff --git a/tests/qemu-iotests/087 b/tests/qemu-iotests/087
index 1d595b2..f8e4903 100755
--- a/tests/qemu-iotests/087
+++ b/tests/qemu-iotests/087
@@ -119,7 +119,7 @@ run_qemu .
+#
+
+# creator
+owner=berra...@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+status=1   # failure is the default!
+
+_cleanup()
+{
+   _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qcow2
+_supported_proto generic
+_supported_os Linux
+
+
+size=16M
+
+SECRET="secret,id=sec0,data=astrochicken"
+SECRETALT="secret,id=sec0,data=platypus"
+
+_make_test_img --object $SECRET -o 
"encrypt.format=luks,encrypt.key-secret=sec0,encrypt.iter-time=10" $size
+
+IMGSPEC="driver=$IMGFMT,file.filename=$TEST_IMG,encrypt.key-secret=sec0"
+
+QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
+
+echo
+echo "== reading whole image =="
+$QEMU_IO --object $SECRET -c "read -P 0 0 $size" --image-opts $IMGSPEC | 
_filter_qemu_io | _filter_testdir
+
+echo
+echo "== rewriting whole image =="
+$QEMU_IO --object $SECRET -c "write -P 0xa 0 $size" --image-opts $IMGSPEC | 
_filter_qemu_io | _filter_testdir
+
+echo
+echo "== verify pattern =="
+$QEMU_IO --object $SECRET -c "read -P 0xa 0 $size"  --image-opts $IMGSPEC | 
_filter_qemu_io | _filter_testdir
+
+echo
+echo "== verify open failure with wrong password =="
+$QEMU_IO --object $SECRETALT -c "read -P 0xa 0 $size" --image-opts $IMGSPEC | 
_filter_qemu_io | _filter_testdir
+
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/188.out b/tests/qemu-iotests/188.out
new file mode 100644
index 000..8af24e5
--- /dev/null
+++ b/tests/qemu-iotests/188.out
@@ -0,0 +1,18 @@
+QA output created by 188
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=16777216 encrypt.format=luks 
encrypt.key-secret=sec0 encrypt.iter-time=10
+
+== reading whole image ==
+read 16777216/16777216 bytes at offset 0
+16 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== rewriting whole image ==
+wrote 16777216/16777216 bytes at offset 0
+16 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== verify pattern ==
+read 16777216/16777216 bytes at offset 0
+16 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== verify open failure with wrong password ==
+can't open: Invalid password, cannot unlock any keyslot
+*** done
diff --git a/tests/qemu-iotests/189 b/tests/qemu-iotests/189
new file mode 100755
index 000..54ad980
--- /dev/null
+++ b/tests/qemu-iotests/189
@@ -0,0 +1,86 @@
+#!/bin/bash
+#
+# Test encrypted read/write using backing files
+#
+# Copyright (C) 2017 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=berra...@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+status=1   # failure is the default!
+
+_cleanup()
+{
+   _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qcow2
+_supported_proto generic
+_supported_os Linux
+
+
+size=16M
+TEST_IMG_BASE=$TEST_IMG.base
+SECRET0="secret,id=sec0,data=astrochicken"
+SECRET1="secret,id=sec1,data=furby"
+
+TEST_IMG_SAVE=$TEST_IMG
+TEST_IMG=$TEST_IMG_BASE
+echo "== create base =="
+_make_test_img --object $SECRET0 -o 
"encrypt.format=luks,encrypt.key-secret=sec0,encrypt.iter-time=10" $size
+

[Qemu-block] [PULL 58/61] blkdebug: Catch bs->exact_filename overflow

2017-06-23 Thread Kevin Wolf

From: Max Reitz 

The bs->exact_filename field may not be sufficient to store the full
blkdebug node filename. In this case, we should not generate a filename
at all instead of an unusable one.

Cc: qemu-sta...@nongnu.org
Reported-by: Qu Wenruo 
Signed-off-by: Max Reitz 
Message-id: 20170613172006.19685-2-mre...@redhat.com
Reviewed-by: Alberto Garcia 
Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Max Reitz 
---
 block/blkdebug.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index 6431962..a1b24b9 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -839,9 +839,13 @@ static void blkdebug_refresh_filename(BlockDriverState 
*bs, QDict *options)
 }
 
 if (!force_json && bs->file->bs->exact_filename[0]) {
-snprintf(bs->exact_filename, sizeof(bs->exact_filename),
- "blkdebug:%s:%s", s->config_file ?: "",
- bs->file->bs->exact_filename);
+int ret = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+   "blkdebug:%s:%s", s->config_file ?: "",
+   bs->file->bs->exact_filename);
+if (ret >= sizeof(bs->exact_filename)) {
+/* An overflow makes the filename unusable, so do not report any */
+bs->exact_filename[0] = 0;
+}
 }
 
 opts = qdict_new();
-- 
1.8.3.1

[Qemu-block] [PULL 59/61] blkverify: Catch bs->exact_filename overflow

2017-06-23 Thread Kevin Wolf

From: Max Reitz 

The bs->exact_filename field may not be sufficient to store the full
blkverify node filename. In this case, we should not generate a filename
at all instead of an unusable one.

Cc: qemu-sta...@nongnu.org
Reported-by: Qu Wenruo 
Signed-off-by: Max Reitz 
Message-id: 20170613172006.19685-3-mre...@redhat.com
Reviewed-by: Alberto Garcia 
Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Max Reitz 
---
 block/blkverify.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/block/blkverify.c b/block/blkverify.c
index 6b0a603..06369f9 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -301,10 +301,14 @@ static void blkverify_refresh_filename(BlockDriverState 
*bs, QDict *options)
 if (bs->file->bs->exact_filename[0]
 && s->test_file->bs->exact_filename[0])
 {
-snprintf(bs->exact_filename, sizeof(bs->exact_filename),
- "blkverify:%s:%s",
- bs->file->bs->exact_filename,
- s->test_file->bs->exact_filename);
+int ret = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+   "blkverify:%s:%s",
+   bs->file->bs->exact_filename,
+   s->test_file->bs->exact_filename);
+if (ret >= sizeof(bs->exact_filename)) {
+/* An overflow makes the filename unusable, so do not report any */
+bs->exact_filename[0] = 0;
+}
 }
 }
 
-- 
1.8.3.1

[Qemu-block] [PATCH v10 11/20] qcow2: convert QCow2 to use QCryptoBlock for encryption

2017-06-23 Thread Daniel P. Berrange

This converts the qcow2 driver to make use of the QCryptoBlock
APIs for encrypting image content, using the legacy QCow2 AES
scheme.

With this change it is now required to use the QCryptoSecret
object for providing passwords, instead of the current block
password APIs / interactive prompting.

  $QEMU \
-object secret,id=sec0,filename=/home/berrange/encrypted.pw \
-drive file=/home/berrange/encrypted.qcow2,encrypt.key-secret=sec0

The test 087 could be simplified since there is no longer a
difference in behaviour when using blockdev_add with encrypted
images for the running vs stopped CPU state.

Signed-off-by: Daniel P. Berrange 
---
 block/qcow2-cluster.c  |  47 +-
 block/qcow2.c  | 226 ++---
 block/qcow2.h  |   5 +-
 qapi/block-core.json   |  27 +-
 tests/qemu-iotests/049 |   2 +-
 tests/qemu-iotests/049.out |   4 +-
 tests/qemu-iotests/082.out |  27 ++
 tests/qemu-iotests/087 |  28 +++---
 tests/qemu-iotests/087.out |  12 +--
 tests/qemu-iotests/134 |  18 +++-
 tests/qemu-iotests/134.out |  10 +-
 tests/qemu-iotests/158 |  19 ++--
 tests/qemu-iotests/158.out |  14 +--
 tests/qemu-iotests/common  |  10 +-
 14 files changed, 263 insertions(+), 186 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index a570929..71a5e0d 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -357,47 +357,6 @@ static int count_contiguous_clusters_unallocated(int 
nb_clusters,
 return i;
 }
 
-/* The crypt function is compatible with the linux cryptoloop
-   algorithm for < 4 GB images. */
-int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
-  uint8_t *buf, int nb_sectors, bool enc,
-  Error **errp)
-{
-union {
-uint64_t ll[2];
-uint8_t b[16];
-} ivec;
-int i;
-int ret;
-
-for(i = 0; i < nb_sectors; i++) {
-ivec.ll[0] = cpu_to_le64(sector_num);
-ivec.ll[1] = 0;
-if (qcrypto_cipher_setiv(s->cipher,
- ivec.b, G_N_ELEMENTS(ivec.b),
- errp) < 0) {
-return -1;
-}
-if (enc) {
-ret = qcrypto_cipher_encrypt(s->cipher,
- buf, buf,
- 512,
- errp);
-} else {
-ret = qcrypto_cipher_decrypt(s->cipher,
- buf, buf,
- 512,
- errp);
-}
-if (ret < 0) {
-return -1;
-}
-sector_num++;
-buf += 512;
-}
-return 0;
-}
-
 static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
 uint64_t src_cluster_offset,
 unsigned offset_in_cluster,
@@ -438,11 +397,11 @@ static bool coroutine_fn 
do_perform_cow_encrypt(BlockDriverState *bs,
 BDRVQcow2State *s = bs->opaque;
 int64_t sector = (src_cluster_offset + offset_in_cluster)
  >> BDRV_SECTOR_BITS;
-assert(s->cipher);
 assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
 assert((bytes & ~BDRV_SECTOR_MASK) == 0);
-if (qcow2_encrypt_sectors(s, sector, buffer,
-  bytes >> BDRV_SECTOR_BITS, true, NULL) < 0) {
+assert(s->crypto);
+if (qcrypto_block_encrypt(s->crypto, sector, buffer,
+  bytes, NULL) < 0) {
 return false;
 }
 }
diff --git a/block/qcow2.c b/block/qcow2.c
index f4b5207..0a31127 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -37,6 +37,9 @@
 #include "qemu/option_int.h"
 #include "qemu/cutils.h"
 #include "qemu/bswap.h"
+#include "qapi/opts-visitor.h"
+#include "qapi-visit.h"
+#include "block/crypto.h"
 
 /*
   Differences with QCOW:
@@ -461,6 +464,7 @@ static QemuOptsList qcow2_runtime_opts = {
 .type = QEMU_OPT_NUMBER,
 .help = "Clean unused cache entries after this time (in seconds)",
 },
+BLOCK_CRYPTO_OPT_DEF_QCOW_KEY_SECRET("encrypt."),
 { /* end of list */ }
 },
 };
@@ -585,6 +589,7 @@ typedef struct Qcow2ReopenState {
 int overlap_check;
 bool discard_passthrough[QCOW2_DISCARD_MAX];
 uint64_t cache_clean_interval;
+QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
 } Qcow2ReopenState;
 
 static int qcow2_update_options_prepare(BlockDriverState *bs,
@@ -598,9 +603,14 @@ static int qcow2_update_options_prepare(BlockDriverState 
*bs,
 int overlap_check_template = 0;
 uint64_t l2_cache_size, refcount_cache_size;
 int i;
+const char *encryptfmt;
+QDict *encryptopts = NULL;
 Error *local_err = NULL;
 int ret;
 
+qdict_extra

Re: [Qemu-block] [Qemu-devel] [PATCH v2] blockdev: Print a warning for legacy drive options that belong to -device

2017-06-23 Thread Thomas Huth

On 12.05.2017 12:33, Thomas Huth wrote:
> We likely do not want to carry these legacy -drive options along forever.
> Let's emit a deprecation warning for the -drive options that have a
> replacement with the -device option, so that the (hopefully few) remaining
> users are aware of this and can adapt their scripts / behaviour accordingly.
> 
> Signed-off-by: Thomas Huth 
> ---
>  v2:
>  - Check for !qtest_enabled() since tests/hd-geo-test still uses these
>  - Added "addr" to the list, too
>  - Also mark the options as deprecated in the documentation
> 
>  blockdev.c  | 14 ++
>  qemu-options.hx |  5 -
>  2 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/blockdev.c b/blockdev.c
> index 0b38c3d..aef38f0 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -50,6 +50,7 @@
>  #include "qmp-commands.h"
>  #include "block/trace.h"
>  #include "sysemu/arch_init.h"
> +#include "sysemu/qtest.h"
>  #include "qemu/cutils.h"
>  #include "qemu/help_option.h"
>  #include "qemu/throttle-options.h"
> @@ -797,6 +798,9 @@ DriveInfo *drive_new(QemuOpts *all_opts, 
> BlockInterfaceType block_default_type)
>  const char *filename;
>  Error *local_err = NULL;
>  int i;
> +const char *deprecated[] = {
> +"serial", "trans", "secs", "heads", "cyls", "addr"
> +};
>  
>  /* Change legacy command line options into QMP ones */
>  static const struct {
> @@ -880,6 +884,16 @@ DriveInfo *drive_new(QemuOpts *all_opts, 
> BlockInterfaceType block_default_type)
>  "update your scripts.\n");
>  }
>  
> +/* Other deprecated options */
> +if (!qtest_enabled()) {
> +for (i = 0; i < ARRAY_SIZE(deprecated); i++) {
> +if (qemu_opt_get(legacy_opts, deprecated[i]) != NULL) {
> +error_report("'%s' is deprecated, please use the 
> corresponding "
> + "option of '-device' instead", deprecated[i]);
> +}
> +}
> +}
> +
>  /* Media type */
>  value = qemu_opt_get(legacy_opts, "media");
>  if (value) {
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 9d7964d..2f66f1a 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -615,6 +615,8 @@ of available connectors of a given interface type.
>  This option defines the type of the media: disk or cdrom.
>  @item cyls=@var{c},heads=@var{h},secs=@var{s}[,trans=@var{t}]
>  These options have the same definition as they have in @option{-hdachs}.
> +These parameters are deprecated, use the corresponding parameters
> +of @code{-device} instead.
>  @item snapshot=@var{snapshot}
>  @var{snapshot} is "on" or "off" and controls snapshot mode for the given 
> drive
>  (see @option{-snapshot}).
> @@ -631,7 +633,8 @@ an untrusted format header.
>  @item serial=@var{serial}
>  This option specifies the serial number to assign to the device.
>  @item addr=@var{addr}
> -Specify the controller's PCI address (if=virtio only).
> +Specify the controller's PCI address (if=virtio only). This parameter is
> +deprecated, use the corresponding parameter of @code{-device} instead.
>  @item werror=@var{action},rerror=@var{action}
>  Specify which @var{action} to take on write and read errors. Valid actions 
> are:
>  "ignore" (ignore the error and try to continue), "stop" (pause QEMU),
> 

ping²

Any takers?

 Thomas

[Qemu-block] [PATCH v10 07/20] block: deprecate "encryption=on" in favor of "encrypt.format=aes"

2017-06-23 Thread Daniel P. Berrange

Historically the qcow & qcow2 image formats supported a property
"encryption=on" to enable their built-in AES encryption. We'll
soon be supporting LUKS for qcow2, so we need a more general-purpose
way to enable encryption, with a choice of formats.

This introduces an "encrypt.format" option, which will later be
joined by a number of other "encrypt.XXX" options. The use of
an "encrypt." prefix instead of "encrypt-" is done to facilitate
mapping to a nested QAPI schema at a later date.

e.g. the preferred syntax is now

  qemu-img create -f qcow2 -o encrypt.format=aes demo.qcow2

Signed-off-by: Daniel P. Berrange 
---
 block/qcow.c   | 31 ---
 block/qcow2.c  | 34 
 include/block/block_int.h  |  2 +-
 qemu-img.c |  4 +-
 tests/qemu-iotests/049.out | 98 +++---
 tests/qemu-iotests/082.out | 95 
 tests/qemu-iotests/085.out | 38 +-
 tests/qemu-iotests/144.out |  4 +-
 tests/qemu-iotests/185.out |  8 ++--
 9 files changed, 191 insertions(+), 123 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index 49871fb..a442ed7 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -803,10 +803,10 @@ static int qcow_create(const char *filename, QemuOpts 
*opts, Error **errp)
 uint8_t *tmp;
 int64_t total_size = 0;
 char *backing_file = NULL;
-int flags = 0;
 Error *local_err = NULL;
 int ret;
 BlockBackend *qcow_blk;
+const char *encryptfmt = NULL;
 
 /* Read out options */
 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
@@ -818,8 +818,16 @@ static int qcow_create(const char *filename, QemuOpts 
*opts, Error **errp)
 }
 
 backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
-if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) {
-flags |= BLOCK_FLAG_ENCRYPT;
+encryptfmt = qemu_opt_get_del(opts, BLOCK_OPT_ENCRYPT_FORMAT);
+if (encryptfmt) {
+if (qemu_opt_get(opts, BLOCK_OPT_ENCRYPT)) {
+error_setg(errp, "Options " BLOCK_OPT_ENCRYPT " and "
+   BLOCK_OPT_ENCRYPT_FORMAT " are mutually exclusive");
+ret = -EINVAL;
+goto cleanup;
+}
+} else if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) {
+encryptfmt = "aes";
 }
 
 ret = bdrv_create_file(filename, opts, &local_err);
@@ -873,7 +881,13 @@ static int qcow_create(const char *filename, QemuOpts 
*opts, Error **errp)
 l1_size = (total_size + (1LL << shift) - 1) >> shift;
 
 header.l1_table_offset = cpu_to_be64(header_size);
-if (flags & BLOCK_FLAG_ENCRYPT) {
+if (encryptfmt) {
+if (!g_str_equal(encryptfmt, "aes")) {
+error_setg(errp, "Unknown encryption format '%s', expected 'aes'",
+   encryptfmt);
+ret = -EINVAL;
+goto exit;
+}
 header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
 } else {
 header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
@@ -1047,8 +1061,13 @@ static QemuOptsList qcow_create_opts = {
 {
 .name = BLOCK_OPT_ENCRYPT,
 .type = QEMU_OPT_BOOL,
-.help = "Encrypt the image",
-.def_value_str = "off"
+.help = "Encrypt the image with format 'aes'. (Deprecated "
+"in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)",
+},
+{
+.name = BLOCK_OPT_ENCRYPT_FORMAT,
+.type = QEMU_OPT_STRING,
+.help = "Encrypt the image, format choices: 'aes'",
 },
 { /* end of list */ }
 }
diff --git a/block/qcow2.c b/block/qcow2.c
index 088ffe1..67fb50d 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2144,7 +2144,7 @@ static int qcow2_create2(const char *filename, int64_t 
total_size,
  const char *backing_file, const char *backing_format,
  int flags, size_t cluster_size, PreallocMode prealloc,
  QemuOpts *opts, int version, int refcount_order,
- Error **errp)
+ const char *encryptfmt, Error **errp)
 {
 int cluster_bits;
 QDict *options;
@@ -2273,7 +2273,13 @@ static int qcow2_create2(const char *filename, int64_t 
total_size,
 .header_length  = cpu_to_be32(sizeof(*header)),
 };
 
-if (flags & BLOCK_FLAG_ENCRYPT) {
+if (encryptfmt) {
+if (!g_str_equal(encryptfmt, "aes")) {
+error_setg(errp, "Unknown encryption format '%s', expected 'aes'",
+   encryptfmt);
+ret = -EINVAL;
+goto out;
+}
 header->crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
 } else {
 header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
@@ -2402,6 +2408,7 @@ static int qcow2_create(const char *filename, QemuOpts 
*opts, Error **errp)
 int version = 3;
 ui

[Qemu-block] [PATCH v10 09/20] qcow: convert QCow to use QCryptoBlock for encryption

2017-06-23 Thread Daniel P. Berrange

This converts the qcow driver to make use of the QCryptoBlock
APIs for encrypting image content. This is only wired up to
permit use of the legacy QCow encryption format. Users who wish
to have the strong LUKS format should switch to qcow2 instead.

With this change it is now required to use the QCryptoSecret
object for providing passwords, instead of the current block
password APIs / interactive prompting.

  $QEMU \
-object secret,id=sec0,filename=/home/berrange/encrypted.pw \
-drive file=/home/berrange/encrypted.qcow,encrypt.format=aes,\
   encrypt.key-secret=sec0

Though note that running QEMU system emulators with the AES
encryption is no longer supported, so while the above syntax
is valid, QEMU will refuse to actually run the VM in this
particular example.

Likewise when creating images with the legacy AES-CBC format

  qemu-img create -f qcow \
--object secret,id=sec0,filename=/home/berrange/encrypted.pw \
-o encrypt.format=aes,encrypt.key-secret=sec0 \
/home/berrange/encrypted.qcow 64M

Reviewed-by: Max Reitz 
Reviewed-by: Alberto Garcia 
Reviewed-by: Eric Blake 
Signed-off-by: Daniel P. Berrange 
---
 block/crypto.c   |  10 +++
 block/crypto.h   |  20 --
 block/qcow.c | 198 +--
 qapi/block-core.json |  38 +-
 4 files changed, 158 insertions(+), 108 deletions(-)

diff --git a/block/crypto.c b/block/crypto.c
index 9df1e5d..da4be74 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -181,6 +181,11 @@ block_crypto_open_opts_init(QCryptoBlockFormat format,
 v, &ret->u.luks, &local_err);
 break;
 
+case Q_CRYPTO_BLOCK_FORMAT_QCOW:
+visit_type_QCryptoBlockOptionsQCow_members(
+v, &ret->u.qcow, &local_err);
+break;
+
 default:
 error_setg(&local_err, "Unsupported block format %d", format);
 break;
@@ -227,6 +232,11 @@ block_crypto_create_opts_init(QCryptoBlockFormat format,
 v, &ret->u.luks, &local_err);
 break;
 
+case Q_CRYPTO_BLOCK_FORMAT_QCOW:
+visit_type_QCryptoBlockOptionsQCow_members(
+v, &ret->u.qcow, &local_err);
+break;
+
 default:
 error_setg(&local_err, "Unsupported block format %d", format);
 break;
diff --git a/block/crypto.h b/block/crypto.h
index 3430dcd..0f985ea 100644
--- a/block/crypto.h
+++ b/block/crypto.h
@@ -21,6 +21,19 @@
 #ifndef BLOCK_CRYPTO_H__
 #define BLOCK_CRYPTO_H__
 
+#define BLOCK_CRYPTO_OPT_DEF_KEY_SECRET(prefix, helpstr)\
+{   \
+.name = prefix BLOCK_CRYPTO_OPT_QCOW_KEY_SECRET,\
+.type = QEMU_OPT_STRING,\
+.help = helpstr,\
+}
+
+#define BLOCK_CRYPTO_OPT_QCOW_KEY_SECRET "key-secret"
+
+#define BLOCK_CRYPTO_OPT_DEF_QCOW_KEY_SECRET(prefix)\
+BLOCK_CRYPTO_OPT_DEF_KEY_SECRET(prefix, \
+"ID of the secret that provides the AES encryption key")
+
 #define BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET "key-secret"
 #define BLOCK_CRYPTO_OPT_LUKS_CIPHER_ALG "cipher-alg"
 #define BLOCK_CRYPTO_OPT_LUKS_CIPHER_MODE "cipher-mode"
@@ -30,11 +43,8 @@
 #define BLOCK_CRYPTO_OPT_LUKS_ITER_TIME "iter-time"
 
 #define BLOCK_CRYPTO_OPT_DEF_LUKS_KEY_SECRET(prefix)\
-{   \
-.name = prefix BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET,\
-.type = QEMU_OPT_STRING,\
-.help = "ID of the secret that provides the keyslot passphrase", \
-}
+BLOCK_CRYPTO_OPT_DEF_KEY_SECRET(prefix, \
+"ID of the secret that provides the keyslot passphrase")
 
 #define BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG(prefix)   \
 {  \
diff --git a/block/qcow.c b/block/qcow.c
index 3a3dbf9..db0c5a9 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -31,8 +31,10 @@
 #include "qemu/bswap.h"
 #include 
 #include "qapi/qmp/qerror.h"
-#include "crypto/cipher.h"
+#include "qapi/qmp/qstring.h"
+#include "crypto/block.h"
 #include "migration/blocker.h"
+#include "block/crypto.h"
 
 /**/
 /* QEMU COW block driver with compression and encryption support */
@@ -77,7 +79,7 @@ typedef struct BDRVQcowState {
 uint8_t *cluster_cache;
 uint8_t *cluster_data;
 uint64_t cluster_cache_offset;
-QCryptoCipher *cipher; /* NULL if no key yet */
+QCryptoBlock *crypto; /* Disk encryption format driver */
 uint32_t crypt_method_header;
 CoMutex lock;
 Error *migration_blocker;
@@ -97,6 +99,15 @@ static int qcow_probe(const uint8_t *buf, int buf_size, 
const char *filename)
 return 0;
 }
 
+static QemuOptsList

[Qemu-block] [PULL 54/61] qed: Use bdrv_co_* for coroutine_fns

2017-06-23 Thread Kevin Wolf

All functions that are marked coroutine_fn can directly call the
bdrv_co_* version of functions instead of going through the wrapper.

Signed-off-by: Kevin Wolf 
Reviewed-by: Manos Pitsidianakis 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 48f2b0e..c073baa 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -116,7 +116,7 @@ static int coroutine_fn qed_write_header(BDRVQEDState *s)
 };
 qemu_iovec_init_external(&qiov, &iov, 1);
 
-ret = bdrv_preadv(s->bs->file, 0, &qiov);
+ret = bdrv_co_preadv(s->bs->file, 0, qiov.size, &qiov, 0);
 if (ret < 0) {
 goto out;
 }
@@ -124,7 +124,7 @@ static int coroutine_fn qed_write_header(BDRVQEDState *s)
 /* Update header */
 qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
 
-ret = bdrv_pwritev(s->bs->file, 0, &qiov);
+ret = bdrv_co_pwritev(s->bs->file, 0, qiov.size,  &qiov, 0);
 if (ret < 0) {
 goto out;
 }
@@ -796,7 +796,7 @@ static int coroutine_fn qed_read_backing_file(BDRVQEDState 
*s, uint64_t pos,
 qemu_iovec_concat(*backing_qiov, qiov, 0, size);
 
 BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
-ret = bdrv_preadv(s->bs->backing, pos, *backing_qiov);
+ret = bdrv_co_preadv(s->bs->backing, pos, size, *backing_qiov, 0);
 if (ret < 0) {
 return ret;
 }
@@ -844,7 +844,7 @@ static int coroutine_fn 
qed_copy_from_backing_file(BDRVQEDState *s,
 }
 
 BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
-ret = bdrv_pwritev(s->bs->file, offset, &qiov);
+ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
 if (ret < 0) {
 goto out;
 }
@@ -987,7 +987,8 @@ static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
 trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
 
 BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
-ret = bdrv_pwritev(s->bs->file, offset, &acb->cur_qiov);
+ret = bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
+  &acb->cur_qiov, 0);
 if (ret < 0) {
 return ret;
 }
@@ -1004,7 +1005,7 @@ static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
  * region.  The solution is to flush after writing a new data
  * cluster and before updating the L2 table.
  */
-ret = bdrv_flush(s->bs->file->bs);
+ret = bdrv_co_flush(s->bs->file->bs);
 if (ret < 0) {
 return ret;
 }
@@ -1221,7 +1222,8 @@ static int coroutine_fn qed_aio_read_data(void *opaque, 
int ret,
 }
 
 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
-ret = bdrv_preadv(bs->file, offset, &acb->cur_qiov);
+ret = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
+ &acb->cur_qiov, 0);
 if (ret < 0) {
 return ret;
 }
-- 
1.8.3.1

[Qemu-block] [PULL 50/61] qed: Use CoQueue for serialising allocations

2017-06-23 Thread Kevin Wolf

Now that we're running in coroutine context, the ad-hoc serialisation
code (which drops a request that has to wait out of coroutine context)
can be replaced by a CoQueue.

This means that when we resume a serialised request, it is running in
coroutine context again and its I/O isn't blocking any more.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 49 +
 block/qed.h |  3 ++-
 2 files changed, 19 insertions(+), 33 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index a5111fd..cd3ef55 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -269,16 +269,10 @@ static void qed_plug_allocating_write_reqs(BDRVQEDState 
*s)
 
 static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
 {
-QEDAIOCB *acb;
-
 assert(s->allocating_write_reqs_plugged);
 
 s->allocating_write_reqs_plugged = false;
-
-acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
-if (acb) {
-qed_aio_start_io(acb);
-}
+qemu_co_enter_next(&s->allocating_write_reqs);
 }
 
 static void qed_clear_need_check(void *opaque, int ret)
@@ -305,7 +299,7 @@ static void qed_need_check_timer_cb(void *opaque)
 BDRVQEDState *s = opaque;
 
 /* The timer should only fire when allocating writes have drained */
-assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs));
+assert(!s->allocating_acb);
 
 trace_qed_need_check_timer_cb(s);
 
@@ -388,7 +382,7 @@ static int bdrv_qed_do_open(BlockDriverState *bs, QDict 
*options, int flags,
 int ret;
 
 s->bs = bs;
-QSIMPLEQ_INIT(&s->allocating_write_reqs);
+qemu_co_queue_init(&s->allocating_write_reqs);
 
 ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
 if (ret < 0) {
@@ -910,11 +904,6 @@ static void qed_aio_complete_bh(void *opaque)
 qed_release(s);
 }
 
-static void qed_resume_alloc_bh(void *opaque)
-{
-qed_aio_start_io(opaque);
-}
-
 static void qed_aio_complete(QEDAIOCB *acb, int ret)
 {
 BDRVQEDState *s = acb_to_s(acb);
@@ -942,13 +931,10 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
  * next request in the queue.  This ensures that we don't cycle through
  * requests multiple times but rather finish one at a time completely.
  */
-if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
-QEDAIOCB *next_acb;
-QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
-next_acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
-if (next_acb) {
-aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs),
-qed_resume_alloc_bh, next_acb);
+if (acb == s->allocating_acb) {
+s->allocating_acb = NULL;
+if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
+qemu_co_enter_next(&s->allocating_write_reqs);
 } else if (s->header.features & QED_F_NEED_CHECK) {
 qed_start_need_check_timer(s);
 }
@@ -1124,17 +1110,18 @@ static int qed_aio_write_alloc(QEDAIOCB *acb, size_t 
len)
 int ret;
 
 /* Cancel timer when the first allocating request comes in */
-if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
+if (s->allocating_acb == NULL) {
 qed_cancel_need_check_timer(s);
 }
 
 /* Freeze this request if another allocating write is in progress */
-if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
-QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next);
-}
-if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
-s->allocating_write_reqs_plugged) {
-return -EINPROGRESS; /* wait for existing request to finish */
+if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
+if (s->allocating_acb != NULL) {
+qemu_co_queue_wait(&s->allocating_write_reqs, NULL);
+assert(s->allocating_acb == NULL);
+}
+s->allocating_acb = acb;
+return -EAGAIN; /* start over with looking up table entries */
 }
 
 acb->cur_nclusters = qed_bytes_to_clusters(s,
@@ -1313,10 +1300,8 @@ static void qed_aio_next_io(QEDAIOCB *acb)
 ret = qed_aio_read_data(acb, ret, offset, len);
 }
 
-if (ret < 0) {
-if (ret != -EINPROGRESS) {
-qed_aio_complete(acb, ret);
-}
+if (ret < 0 && ret != -EAGAIN) {
+qed_aio_complete(acb, ret);
 return;
 }
 }
diff --git a/block/qed.h b/block/qed.h
index 8644fed..37558e4 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -163,7 +163,8 @@ typedef struct {
 uint32_t l2_mask;
 
 /* Allocating write request queue */
-QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs;
+QEDAIOCB *allocating_acb;
+CoQueue allocating_write_reqs;
 bool allocating_write_reqs_plugged;
 
 /* Periodic flush and clear need check flag */
-- 
1.8.3.1

[Qemu-block] [PATCH v10 19/20] qcow2: report encryption specific image information

2017-06-23 Thread Daniel P. Berrange

Currently 'qemu-img info' reports a simple "encrypted: yes"
field. This is not very useful now that qcow2 can support
multiple encryption formats. Users want to know which format
is in use and some data related to it.

Wire up usage of the qcrypto_block_get_info() method so that
'qemu-img info' can report the encryption format and the
parameters in use:

  $ qemu-img create \
  --object secret,id=sec0,data=123456 \
  -o encrypt.format=luks,encrypt.key-secret=sec0 \
  -f qcow2 demo.qcow2 1G
  Formatting 'demo.qcow2', fmt=qcow2 size=1073741824 \
  encryption=off encrypt.format=luks encrypt.key-secret=sec0 \
  cluster_size=65536 lazy_refcounts=off refcount_bits=16

  $ qemu-img info demo.qcow2
  image: demo.qcow2
  file format: qcow2
  virtual size: 1.0G (1073741824 bytes)
  disk size: 480K
  encrypted: yes
  cluster_size: 65536
  Format specific information:
  compat: 1.1
  lazy refcounts: false
  refcount bits: 16
  encrypt:
  ivgen alg: plain64
  hash alg: sha256
  cipher alg: aes-256
  uuid: 3fa930c4-58c8-4ef7-b3c5-314bb5af21f3
  format: luks
  cipher mode: xts
  slots:
  [0]:
  active: true
  iters: 1839058
  key offset: 4096
  stripes: 4000
  [1]:
  active: false
  key offset: 262144
  [2]:
  active: false
  key offset: 520192
  [3]:
  active: false
  key offset: 778240
  [4]:
  active: false
  key offset: 1036288
  [5]:
  active: false
  key offset: 1294336
  [6]:
  active: false
  key offset: 1552384
  [7]:
  active: false
  key offset: 1810432
  payload offset: 2068480
  master key iters: 438487
  corrupt: false

With the legacy "AES" encryption we just report the format
name

  $ qemu-img create \
  --object secret,id=sec0,data=123456 \
  -o encrypt.format=aes,encrypt.key-secret=sec0 \
  -f qcow2 demo.qcow2 1G
  Formatting 'demo.qcow2', fmt=qcow2 size=1073741824 \
  encryption=off encrypt.format=aes encrypt.key-secret=sec0 \
  cluster_size=65536 lazy_refcounts=off refcount_bits=16

  $ ./qemu-img info demo.qcow2
  image: demo.qcow2
  file format: qcow2
  virtual size: 1.0G (1073741824 bytes)
  disk size: 196K
  encrypted: yes
  cluster_size: 65536
  Format specific information:
  compat: 1.1
  lazy refcounts: false
  refcount bits: 16
  encrypt:
  format: aes
  corrupt: false

Reviewed-by: Alberto Garcia 
Reviewed-by: Max Reitz 
Reviewed-by: Eric Blake 
Signed-off-by: Daniel P. Berrange 
---
 block/qcow2.c| 32 +++-
 qapi/block-core.json | 27 ++-
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index b3836d5..6a590a5 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3241,8 +3241,14 @@ static int qcow2_get_info(BlockDriverState *bs, 
BlockDriverInfo *bdi)
 static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
 {
 BDRVQcow2State *s = bs->opaque;
-ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1);
+ImageInfoSpecific *spec_info;
+QCryptoBlockInfo *encrypt_info = NULL;
 
+if (s->crypto != NULL) {
+encrypt_info = qcrypto_block_get_info(s->crypto, &error_abort);
+}
+
+spec_info = g_new(ImageInfoSpecific, 1);
 *spec_info = (ImageInfoSpecific){
 .type  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
 .u.qcow2.data = g_new(ImageInfoSpecificQCow2, 1),
@@ -3269,6 +3275,30 @@ static ImageInfoSpecific 
*qcow2_get_specific_info(BlockDriverState *bs)
 assert(false);
 }
 
+if (encrypt_info) {
+ImageInfoSpecificQCow2Encryption *qencrypt =
+g_new(ImageInfoSpecificQCow2Encryption, 1);
+switch (encrypt_info->format) {
+case Q_CRYPTO_BLOCK_FORMAT_QCOW:
+qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_AES;
+qencrypt->u.aes = encrypt_info->u.qcow;
+break;
+case Q_CRYPTO_BLOCK_FORMAT_LUKS:
+qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_LUKS;
+qencrypt->u.luks = encrypt_info->u.luks;
+break;
+default:
+abort();
+}
+/* Since we did shallow copy above, erase any pointers
+ * in the original info */
+memset(&encrypt_info->u, 0, sizeof(encrypt_info->u));
+qapi_free_QCryptoBlockInfo(encrypt_info);
+
+spec_info->u.qcow2.data->has_encrypt = true;
+spec_info->u.qcow2.data->encrypt = qencrypt;
+}
+
 return spec_info;
 }
 
diff --git a/qapi/block-core.json b/qapi/block-core.json
index d04d277..9570963 100644
--- a/qapi/blo

[Qemu-block] [PATCH v10 17/20] block: remove all encryption handling APIs

2017-06-23 Thread Daniel P. Berrange

Now that all encryption keys must be provided upfront via
the QCryptoSecret API and associated block driver properties
there is no need for any explicit encryption handling APIs
in the block layer. Encryption can be handled transparently
within the block driver. We only retain an API for querying
whether an image is encrypted or not, since that is a
potentially useful piece of metadata to report to the user.

Reviewed-by: Alberto Garcia 
Reviewed-by: Max Reitz 
Signed-off-by: Daniel P. Berrange 
---
 block.c   | 77 +--
 block/crypto.c|  1 -
 block/qapi.c  |  2 +-
 block/qcow.c  |  8 -
 block/qcow2.c |  1 -
 blockdev.c| 37 ++-
 hmp-commands.hx   |  2 ++
 include/block/block.h |  3 --
 include/block/block_int.h |  1 -
 include/qapi/error.h  |  1 -
 qapi/block-core.json  | 37 ++-
 qapi/common.json  |  5 +--
 12 files changed, 16 insertions(+), 159 deletions(-)

diff --git a/block.c b/block.c
index fa1d06d..440649c 100644
--- a/block.c
+++ b/block.c
@@ -2569,15 +2569,7 @@ static BlockDriverState *bdrv_open_inherit(const char 
*filename,
 goto close_and_fail;
 }
 
-if (!bdrv_key_required(bs)) {
-bdrv_parent_cb_change_media(bs, true);
-} else if (!runstate_check(RUN_STATE_PRELAUNCH)
-   && !runstate_check(RUN_STATE_INMIGRATE)
-   && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */
-error_setg(errp,
-   "Guest must be stopped for opening of encrypted image");
-goto close_and_fail;
-}
+bdrv_parent_cb_change_media(bs, true);
 
 QDECREF(options);
 
@@ -3068,7 +3060,6 @@ static void bdrv_close(BlockDriverState *bs)
 bs->backing_format[0] = '\0';
 bs->total_sectors = 0;
 bs->encrypted = false;
-bs->valid_key = false;
 bs->sg = false;
 QDECREF(bs->options);
 QDECREF(bs->explicit_options);
@@ -3498,72 +3489,6 @@ bool bdrv_is_encrypted(BlockDriverState *bs)
 return bs->encrypted;
 }
 
-bool bdrv_key_required(BlockDriverState *bs)
-{
-BdrvChild *backing = bs->backing;
-
-if (backing && backing->bs->encrypted && !backing->bs->valid_key) {
-return true;
-}
-return (bs->encrypted && !bs->valid_key);
-}
-
-int bdrv_set_key(BlockDriverState *bs, const char *key)
-{
-int ret;
-if (bs->backing && bs->backing->bs->encrypted) {
-ret = bdrv_set_key(bs->backing->bs, key);
-if (ret < 0)
-return ret;
-if (!bs->encrypted)
-return 0;
-}
-if (!bs->encrypted) {
-return -EINVAL;
-} else if (!bs->drv || !bs->drv->bdrv_set_key) {
-return -ENOMEDIUM;
-}
-ret = bs->drv->bdrv_set_key(bs, key);
-if (ret < 0) {
-bs->valid_key = false;
-} else if (!bs->valid_key) {
-/* call the change callback now, we skipped it on open */
-bs->valid_key = true;
-bdrv_parent_cb_change_media(bs, true);
-}
-return ret;
-}
-
-/*
- * Provide an encryption key for @bs.
- * If @key is non-null:
- * If @bs is not encrypted, fail.
- * Else if the key is invalid, fail.
- * Else set @bs's key to @key, replacing the existing key, if any.
- * If @key is null:
- * If @bs is encrypted and still lacks a key, fail.
- * Else do nothing.
- * On failure, store an error object through @errp if non-null.
- */
-void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
-{
-if (key) {
-if (!bdrv_is_encrypted(bs)) {
-error_setg(errp, "Node '%s' is not encrypted",
-  bdrv_get_device_or_node_name(bs));
-} else if (bdrv_set_key(bs, key) < 0) {
-error_setg(errp, QERR_INVALID_PASSWORD);
-}
-} else {
-if (bdrv_key_required(bs)) {
-error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED,
-  "'%s' (%s) is encrypted",
-  bdrv_get_device_or_node_name(bs),
-  bdrv_get_encrypted_filename(bs));
-}
-}
-}
-
 const char *bdrv_get_format_name(BlockDriverState *bs)
 {
 return bs->drv ? bs->drv->format_name : NULL;
diff --git a/block/crypto.c b/block/crypto.c
index da4be74..3ad4b20 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -308,7 +308,6 @@ static int block_crypto_open_generic(QCryptoBlockFormat 
format,
 }
 
 bs->encrypted = true;
-bs->valid_key = true;
 
 ret = 0;
  cleanup:
diff --git a/block/qapi.c b/block/qapi.c
index a40922e..9d724c2 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -45,7 +45,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
 info->ro = bs->read_only;
 info->drv= g_strdup(bs->drv->format_name);
 info->encrypted  = bs->encrypted;
-info->encryption_key_missing = bdrv_key_required(bs);
+

[Qemu-block] [PATCH v10 16/20] block: rip out all traces of password prompting

2017-06-23 Thread Daniel P. Berrange

Now that qcow & qcow2 are wired up to get encryption keys
via the QCryptoSecret object, nothing is relying on the
interactive prompting for passwords. All the code related
to password prompting can thus be ripped out.

Reviewed-by: Alberto Garcia 
Reviewed-by: Max Reitz 
Signed-off-by: Daniel P. Berrange 
---
 hmp.c | 31 -
 include/monitor/monitor.h |  7 -
 include/qemu/osdep.h  |  2 --
 monitor.c | 68 ---
 qapi-schema.json  | 10 +--
 qemu-img.c| 31 -
 qemu-io.c | 20 --
 qmp.c | 12 +
 util/oslib-posix.c| 66 -
 util/oslib-win32.c| 24 -
 10 files changed, 2 insertions(+), 269 deletions(-)

diff --git a/hmp.c b/hmp.c
index 8c72c58..435cb31 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1085,37 +1085,12 @@ void hmp_ringbuf_read(Monitor *mon, const QDict *qdict)
 g_free(data);
 }
 
-static void hmp_cont_cb(void *opaque, int err)
-{
-if (!err) {
-qmp_cont(NULL);
-}
-}
-
-static bool key_is_missing(const BlockInfo *bdev)
-{
-return (bdev->inserted && bdev->inserted->encryption_key_missing);
-}
-
 void hmp_cont(Monitor *mon, const QDict *qdict)
 {
-BlockInfoList *bdev_list, *bdev;
 Error *err = NULL;
 
-bdev_list = qmp_query_block(NULL);
-for (bdev = bdev_list; bdev; bdev = bdev->next) {
-if (key_is_missing(bdev->value)) {
-monitor_read_block_device_key(mon, bdev->value->device,
-  hmp_cont_cb, NULL);
-goto out;
-}
-}
-
 qmp_cont(&err);
 hmp_handle_error(mon, &err);
-
-out:
-qapi_free_BlockInfoList(bdev_list);
 }
 
 void hmp_system_wakeup(Monitor *mon, const QDict *qdict)
@@ -1738,12 +1713,6 @@ void hmp_change(Monitor *mon, const QDict *qdict)
 qmp_blockdev_change_medium(true, device, false, NULL, target,
!!arg, arg, !!read_only, read_only_mode,
&err);
-if (err &&
-error_get_class(err) == ERROR_CLASS_DEVICE_ENCRYPTED) {
-error_free(err);
-monitor_read_block_device_key(mon, device, NULL, NULL);
-return;
-}
 }
 
 hmp_handle_error(mon, &err);
diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index d2b3aaf..83ea4a1 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -23,13 +23,6 @@ void monitor_cleanup(void);
 int monitor_suspend(Monitor *mon);
 void monitor_resume(Monitor *mon);
 
-int monitor_read_bdrv_key_start(Monitor *mon, BlockDriverState *bs,
-BlockCompletionFunc *completion_cb,
-void *opaque);
-int monitor_read_block_device_key(Monitor *mon, const char *device,
-  BlockCompletionFunc *completion_cb,
-  void *opaque);
-
 int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp);
 int monitor_fd_param(Monitor *mon, const char *fdname, Error **errp);
 
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index fb008a2..a5982ef 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -457,8 +457,6 @@ void qemu_set_tty_echo(int fd, bool echo);
 void os_mem_prealloc(int fd, char *area, size_t sz, int smp_cpus,
  Error **errp);
 
-int qemu_read_password(char *buf, int buf_size);
-
 /**
  * qemu_get_pid_name:
  * @pid: pid of a process
diff --git a/monitor.c b/monitor.c
index fcf4fad..7647291 100644
--- a/monitor.c
+++ b/monitor.c
@@ -4123,74 +4123,6 @@ void monitor_cleanup(void)
 qemu_mutex_unlock(&monitor_lock);
 }
 
-static void bdrv_password_cb(void *opaque, const char *password,
- void *readline_opaque)
-{
-Monitor *mon = opaque;
-BlockDriverState *bs = readline_opaque;
-int ret = 0;
-Error *local_err = NULL;
-
-bdrv_add_key(bs, password, &local_err);
-if (local_err) {
-error_report_err(local_err);
-ret = -EPERM;
-}
-if (mon->password_completion_cb)
-mon->password_completion_cb(mon->password_opaque, ret);
-
-monitor_read_command(mon, 1);
-}
-
-int monitor_read_bdrv_key_start(Monitor *mon, BlockDriverState *bs,
-BlockCompletionFunc *completion_cb,
-void *opaque)
-{
-int err;
-
-monitor_printf(mon, "%s (%s) is encrypted.\n", bdrv_get_device_name(bs),
-   bdrv_get_encrypted_filename(bs));
-
-mon->password_completion_cb = completion_cb;
-mon->password_opaque = opaque;
-
-err = monitor_read_password(mon, bdrv_password_cb, bs);
-
-if (err && completion_cb)
-completion_cb(opaque, err);
-
-return err;
-}
-
-int monitor_read_block_device_key(Monitor *mon, const

[Qemu-block] [PATCH v10 15/20] iotests: enable tests 134 and 158 to work with qcow (v1)

2017-06-23 Thread Daniel P. Berrange

The 134 and 158 iotests exercise the legacy qcow2 AES encryption
code path and they work fine with qcow v1 too.

Reviewed-by: Alberto Garcia 
Reviewed-by: Max Reitz 
Signed-off-by: Daniel P. Berrange 
---
 tests/qemu-iotests/134 | 2 +-
 tests/qemu-iotests/158 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/134 b/tests/qemu-iotests/134
index f851d92..9914415 100755
--- a/tests/qemu-iotests/134
+++ b/tests/qemu-iotests/134
@@ -37,7 +37,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.rc
 . ./common.filter
 
-_supported_fmt qcow2
+_supported_fmt qcow qcow2
 _supported_proto generic
 _unsupported_proto vxhs
 _supported_os Linux
diff --git a/tests/qemu-iotests/158 b/tests/qemu-iotests/158
index e280b79..823c120 100755
--- a/tests/qemu-iotests/158
+++ b/tests/qemu-iotests/158
@@ -37,7 +37,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.rc
 . ./common.filter
 
-_supported_fmt qcow2
+_supported_fmt qcow qcow2
 _supported_proto generic
 _unsupported_proto vxhs
 _supported_os Linux
-- 
2.9.3

[Qemu-block] [PATCH v10 12/20] qcow2: extend specification to cover LUKS encryption

2017-06-23 Thread Daniel P. Berrange

Update the qcow2 specification to describe how the LUKS header is
placed inside a qcow2 file, when using LUKS encryption for the
qcow2 payload instead of the legacy AES-CBC encryption.

Reviewed-by: Eric Blake 
Reviewed-by: Alberto Garcia 
Reviewed-by: Max Reitz 
Signed-off-by: Daniel P. Berrange 
---
 docs/specs/qcow2.txt | 103 +++
 1 file changed, 103 insertions(+)

diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt
index 80cdfd0..886a546 100644
--- a/docs/specs/qcow2.txt
+++ b/docs/specs/qcow2.txt
@@ -45,6 +45,7 @@ The first cluster of a qcow2 image contains the file header:
  32 - 35:   crypt_method
 0 for no encryption
 1 for AES encryption
+2 for LUKS encryption
 
  36 - 39:   l1_size
 Number of entries in the active L1 table
@@ -135,6 +136,7 @@ be stored. Each extension has a structure like the 
following:
 0xE2792ACA - Backing file format name
 0x6803f857 - Feature name table
 0x23852875 - Bitmaps extension
+0x0537be77 - Full disk encryption header pointer
 other  - Unknown header extension, can be safely
  ignored
 
@@ -207,6 +209,107 @@ The fields of the bitmaps extension are:
Offset into the image file at which the bitmap directory
starts. Must be aligned to a cluster boundary.
 
+== Full disk encryption header pointer ==
+
+The full disk encryption header must be present if, and only if, the
+'crypt_method' header requires metadata. Currently this is only true
+of the 'LUKS' crypt method. The header extension must be absent for
+other methods.
+
+This header provides the offset at which the crypt method can store
+its additional data, as well as the length of such data.
+
+Byte  0 -  7:   Offset into the image file at which the encryption
+header starts in bytes. Must be aligned to a cluster
+boundary.
+Byte  8 - 15:   Length of the written encryption header in bytes.
+Note actual space allocated in the qcow2 file may
+be larger than this value, since it will be rounded
+to the nearest multiple of the cluster size. Any
+unused bytes in the allocated space will be initialized
+to 0.
+
+For the LUKS crypt method, the encryption header works as follows.
+
+The first 592 bytes of the header clusters will contain the LUKS
+partition header. This is then followed by the key material data areas.
+The size of the key material data areas is determined by the number of
+stripes in the key slot and key size. Refer to the LUKS format
+specification ('docs/on-disk-format.pdf' in the cryptsetup source
+package) for details of the LUKS partition header format.
+
+In the LUKS partition header, the "payload-offset" field will be
+calculated as normal for the LUKS spec. ie the size of the LUKS
+header, plus key material regions, plus padding, relative to the
+start of the LUKS header. This offset value is not required to be
+qcow2 cluster aligned. Its value is currently never used in the
+context of qcow2, since the qcow2 file format itself defines where
+the real payload offset is, but none the less a valid payload offset
+should always be present.
+
+In the LUKS key slots header, the "key-material-offset" is relative
+to the start of the LUKS header clusters in the qcow2 container,
+not the start of the qcow2 file.
+
+Logically the layout looks like
+
+  +-+
+  | QCow2 header|
+  | QCow2 header extension X|
+  | QCow2 header extension FDE  |
+  | QCow2 header extension ...  |
+  | QCow2 header extension Z|
+  +-+
+  | other QCow2 tables  |
+  . .
+  . .
+  +-+
+  | +-+ |
+  | | LUKS partition header   | |
+  | +-+ |
+  | | LUKS key material 1 | |
+  | +-+ |
+  | | LUKS key material 2 | |
+  | +-+ |
+  | | LUKS key material ...   | |
+  | +-+ |
+  | | LUKS key material 8 | |
+  | +-+ |
+  +-+
+  | QCow2 cluster payload   |
+  . .
+  . .
+  . .
+  | |
+  +-+
+
+== Data encryption ==
+
+When an encryption method is requested in the header, the image payload
+data must be encrypted/decrypted on every write/read. The image headers
+and metadata are never encrypted.
+
+The algorithms used for encryption vary depending on the method
+
+ -

[Qemu-block] [PATCH v10 05/20] iotests: skip 042 with qcow which doesn't support zero-sized images

2017-06-23 Thread Daniel P. Berrange

Test 042 is designed to verify operation with zero sized images.
Such images are not supported with qcow (v1), so this test has
always failed.

Reviewed-by: Max Reitz 
Reviewed-by: Alberto Garcia 
Signed-off-by: Daniel P. Berrange 
---
 tests/qemu-iotests/042 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/042 b/tests/qemu-iotests/042
index 351b283..a53e7cb 100755
--- a/tests/qemu-iotests/042
+++ b/tests/qemu-iotests/042
@@ -37,7 +37,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.rc
 . ./common.filter
 
-_supported_fmt qcow2 qcow qed
+_supported_fmt qcow2 qed
 _supported_proto file
 _supported_os Linux
 
-- 
2.9.3

[Qemu-block] [PATCH v10 03/20] qcow: document another weakness of qcow AES encryption

2017-06-23 Thread Daniel P. Berrange

Document that use of guest virtual sector numbers as the basis for
the initialization vectors is a potential weakness, when combined
with internal snapshots or multiple images using the same passphrase.
This fixes the formatting of the itemized list too.

Reviewed-by: Max Reitz 
Reviewed-by: Alberto Garcia 
Signed-off-by: Daniel P. Berrange 
---
 qemu-img.texi | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/qemu-img.texi b/qemu-img.texi
index 5b925ec..f335139 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -567,16 +567,29 @@ The use of encryption in qcow and qcow2 images is 
considered to be flawed by
 modern cryptography standards, suffering from a number of design problems:
 
 @itemize @minus
-@item The AES-CBC cipher is used with predictable initialization vectors based
+@item
+The AES-CBC cipher is used with predictable initialization vectors based
 on the sector number. This makes it vulnerable to chosen plaintext attacks
 which can reveal the existence of encrypted data.
-@item The user passphrase is directly used as the encryption key. A poorly
+@item
+The user passphrase is directly used as the encryption key. A poorly
 chosen or short passphrase will compromise the security of the encryption.
-@item In the event of the passphrase being compromised there is no way to
+@item
+In the event of the passphrase being compromised there is no way to
 change the passphrase to protect data in any qcow images. The files must
 be cloned, using a different encryption passphrase in the new file. The
 original file must then be securely erased using a program like shred,
 though even this is ineffective with many modern storage technologies.
+@item
+Initialization vectors used to encrypt sectors are based on the
+guest virtual sector number, instead of the host physical sector. When
+a disk image has multiple internal snapshots this means that data in
+multiple physical sectors is encrypted with the same initialization
+vector. With the CBC mode, this opens the possibility of watermarking
+attacks if the attacker can collect multiple sectors encrypted with the
+same IV and some predictable data. Having multiple qcow2 images with
+the same passphrase also exposes this weakness since the passphrase
+is directly used as the key.
 @end itemize
 
 Use of qcow / qcow2 encryption is thus strongly discouraged. Users are
-- 
2.9.3

[Qemu-block] [PULL 46/61] qed: Add return value to qed_aio_read/write_data()

2017-06-23 Thread Kevin Wolf

Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
just return an error code and let the caller handle it.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 72 ++---
 block/qed.h | 21 --
 2 files changed, 31 insertions(+), 62 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 4c8ba4a..6f83831 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1205,13 +1205,12 @@ static int qed_aio_write_inplace(QEDAIOCB *acb, 
uint64_t offset, size_t len)
  * Write data cluster
  *
  * @opaque: Write request
- * @ret:QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
- *  or -errno
+ * @ret:QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
  * @offset: Cluster offset in bytes
  * @len:Length in bytes
  */
-static void qed_aio_write_data(void *opaque, int ret,
-   uint64_t offset, size_t len)
+static int qed_aio_write_data(void *opaque, int ret,
+  uint64_t offset, size_t len)
 {
 QEDAIOCB *acb = opaque;
 
@@ -1221,40 +1220,27 @@ static void qed_aio_write_data(void *opaque, int ret,
 
 switch (ret) {
 case QED_CLUSTER_FOUND:
-ret = qed_aio_write_inplace(acb, offset, len);
-break;
+return qed_aio_write_inplace(acb, offset, len);
 
 case QED_CLUSTER_L2:
 case QED_CLUSTER_L1:
 case QED_CLUSTER_ZERO:
-ret = qed_aio_write_alloc(acb, len);
-break;
+return qed_aio_write_alloc(acb, len);
 
 default:
-assert(ret < 0);
-break;
-}
-
-if (ret < 0) {
-if (ret != -EINPROGRESS) {
-qed_aio_complete(acb, ret);
-}
-return;
+g_assert_not_reached();
 }
-qed_aio_next_io(acb, 0);
 }
 
 /**
  * Read data cluster
  *
  * @opaque: Read request
- * @ret:QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1,
- *  or -errno
+ * @ret:QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
  * @offset: Cluster offset in bytes
  * @len:Length in bytes
  */
-static void qed_aio_read_data(void *opaque, int ret,
-  uint64_t offset, size_t len)
+static int qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t 
len)
 {
 QEDAIOCB *acb = opaque;
 BDRVQEDState *s = acb_to_s(acb);
@@ -1265,34 +1251,23 @@ static void qed_aio_read_data(void *opaque, int ret,
 
 trace_qed_aio_read_data(s, acb, ret, offset, len);
 
-if (ret < 0) {
-goto err;
-}
-
 qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
 
 /* Handle zero cluster and backing file reads */
 if (ret == QED_CLUSTER_ZERO) {
 qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
-qed_aio_start_io(acb);
-return;
+return 0;
 } else if (ret != QED_CLUSTER_FOUND) {
-ret = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
-&acb->backing_qiov);
-qed_aio_next_io(acb, ret);
-return;
+return qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
+ &acb->backing_qiov);
 }
 
 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
 ret = bdrv_preadv(bs->file, offset, &acb->cur_qiov);
 if (ret < 0) {
-goto err;
+return ret;
 }
-qed_aio_next_io(acb, 0);
-return;
-
-err:
-qed_aio_complete(acb, ret);
+return 0;
 }
 
 /**
@@ -1301,8 +1276,6 @@ err:
 static void qed_aio_next_io(QEDAIOCB *acb, int ret)
 {
 BDRVQEDState *s = acb_to_s(acb);
-QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
-qed_aio_write_data : qed_aio_read_data;
 uint64_t offset;
 size_t len;
 
@@ -1333,7 +1306,24 @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret)
 /* Find next cluster and start I/O */
 len = acb->end_pos - acb->cur_pos;
 ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
-io_fn(acb, ret, offset, len);
+if (ret < 0) {
+qed_aio_complete(acb, ret);
+return;
+}
+
+if (acb->flags & QED_AIOCB_WRITE) {
+ret = qed_aio_write_data(acb, ret, offset, len);
+} else {
+ret = qed_aio_read_data(acb, ret, offset, len);
+}
+
+if (ret < 0) {
+if (ret != -EINPROGRESS) {
+qed_aio_complete(acb, ret);
+}
+return;
+}
+qed_aio_next_io(acb, 0);
 }
 
 static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
diff --git a/block/qed.h b/block/qed.h
index 51443fa..8644fed 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -177,27 +177,6 @@ enum {
 QED_CLUSTER_L1,/* cluster missing in L1 */
 };
 
-/**
- * qed_find_cluster() completion callback
- *
- * @opaque: User data for completion callback
- * @ret:QED_CLUSTER_FOUND   Success
- *  QED_CLUSTER_L2  Data clust

[Qemu-block] [PATCH v10 10/20] qcow2: make qcow2_encrypt_sectors encrypt in place

2017-06-23 Thread Daniel P. Berrange

Instead of requiring separate input/output buffers for
encrypting data, change qcow2_encrypt_sectors() to assume
use of a single buffer, encrypting in place. The current
callers all used the same buffer for input/output already.

Signed-off-by: Daniel P. Berrange 
---
 block/qcow2-cluster.c | 17 ++---
 block/qcow2.c |  4 ++--
 block/qcow2.h |  3 +--
 3 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 3d341fd..a570929 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -358,11 +358,9 @@ static int count_contiguous_clusters_unallocated(int 
nb_clusters,
 }
 
 /* The crypt function is compatible with the linux cryptoloop
-   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
-   supported */
+   algorithm for < 4 GB images. */
 int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
-  uint8_t *out_buf, const uint8_t *in_buf,
-  int nb_sectors, bool enc,
+  uint8_t *buf, int nb_sectors, bool enc,
   Error **errp)
 {
 union {
@@ -382,14 +380,12 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t 
sector_num,
 }
 if (enc) {
 ret = qcrypto_cipher_encrypt(s->cipher,
- in_buf,
- out_buf,
+ buf, buf,
  512,
  errp);
 } else {
 ret = qcrypto_cipher_decrypt(s->cipher,
- in_buf,
- out_buf,
+ buf, buf,
  512,
  errp);
 }
@@ -397,8 +393,7 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t 
sector_num,
 return -1;
 }
 sector_num++;
-in_buf += 512;
-out_buf += 512;
+buf += 512;
 }
 return 0;
 }
@@ -446,7 +441,7 @@ static bool coroutine_fn 
do_perform_cow_encrypt(BlockDriverState *bs,
 assert(s->cipher);
 assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
 assert((bytes & ~BDRV_SECTOR_MASK) == 0);
-if (qcow2_encrypt_sectors(s, sector, buffer, buffer,
+if (qcow2_encrypt_sectors(s, sector, buffer,
   bytes >> BDRV_SECTOR_BITS, true, NULL) < 0) {
 return false;
 }
diff --git a/block/qcow2.c b/block/qcow2.c
index 67fb50d..f4b5207 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1543,7 +1543,7 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState 
*bs, uint64_t offset,
 assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 Error *err = NULL;
 if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS,
-  cluster_data, cluster_data,
+  cluster_data,
   cur_bytes >> BDRV_SECTOR_BITS,
   false, &err) < 0) {
 error_free(err);
@@ -1677,7 +1677,7 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState 
*bs, uint64_t offset,
 qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);
 
 if (qcow2_encrypt_sectors(s, offset >> BDRV_SECTOR_BITS,
-  cluster_data, cluster_data,
+  cluster_data,
   cur_bytes >>BDRV_SECTOR_BITS,
   true, &err) < 0) {
 error_free(err);
diff --git a/block/qcow2.h b/block/qcow2.h
index 87b15eb..5a3f07e 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -545,8 +545,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t 
min_size,
 int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
 int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
 int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
-  uint8_t *out_buf, const uint8_t *in_buf,
-  int nb_sectors, bool enc, Error **errp);
+  uint8_t *buf, int nb_sectors, bool enc, Error 
**errp);
 
 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
  unsigned int *bytes, uint64_t *cluster_offset);
-- 
2.9.3

[Qemu-block] [PATCH v10 06/20] iotests: skip 048 with qcow which doesn't support resize

2017-06-23 Thread Daniel P. Berrange

Test 048 is designed to verify data preservation during an
image resize. The qcow (v1) format impl has never supported
resize so always fails.

Reviewed-by: Max Reitz 
Reviewed-by: Alberto Garcia 
Signed-off-by: Daniel P. Berrange 
---
 tests/qemu-iotests/048 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/048 b/tests/qemu-iotests/048
index 203c04f..9ed04a0 100755
--- a/tests/qemu-iotests/048
+++ b/tests/qemu-iotests/048
@@ -46,7 +46,7 @@ _compare()
 . ./common.filter
 . ./common.pattern
 
-_supported_fmt raw qcow qcow2 qed luks
+_supported_fmt raw qcow2 qed luks
 _supported_proto file
 _supported_os Linux
 
-- 
2.9.3

[Qemu-block] [PATCH v10 04/20] qcow: require image size to be > 1 for new images

2017-06-23 Thread Daniel P. Berrange

The qcow driver refuses to open images which are less than
2 bytes in size, but will happily create such images. Add
a check in the create path to avoid this discrepancy.

Reviewed-by: Max Reitz 
Reviewed-by: Alberto Garcia 
Reviewed-by: Eric Blake 
Signed-off-by: Daniel P. Berrange 
---
 block/qcow.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/block/qcow.c b/block/qcow.c
index 7bd94dc..49871fb 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -811,6 +811,12 @@ static int qcow_create(const char *filename, QemuOpts 
*opts, Error **errp)
 /* Read out options */
 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
   BDRV_SECTOR_SIZE);
+if (total_size == 0) {
+error_setg(errp, "Image size is too small, cannot be zero length");
+ret = -EINVAL;
+goto cleanup;
+}
+
 backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
 if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) {
 flags |= BLOCK_FLAG_ENCRYPT;
-- 
2.9.3

[Qemu-block] [PULL 43/61] qed: Add return value to qed_aio_write_main()

2017-06-23 Thread Kevin Wolf

Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
just return an error code and let the caller handle it.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 55 ++-
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 3cda01f..a4b13f8 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1022,29 +1022,22 @@ static int qed_aio_write_l2_update(QEDAIOCB *acb, 
uint64_t offset)
 /**
  * Write data to the image file
  */
-static void qed_aio_write_main(void *opaque, int ret)
+static int qed_aio_write_main(QEDAIOCB *acb)
 {
-QEDAIOCB *acb = opaque;
 BDRVQEDState *s = acb_to_s(acb);
 uint64_t offset = acb->cur_cluster +
   qed_offset_into_cluster(s, acb->cur_pos);
+int ret;
 
-trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);
-
-if (ret) {
-qed_aio_complete(acb, ret);
-return;
-}
+trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
 
 BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
 ret = bdrv_pwritev(s->bs->file, offset, &acb->cur_qiov);
-if (ret >= 0) {
-ret = 0;
+if (ret < 0) {
+return ret;
 }
 
-if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
-qed_aio_next_io(acb, ret);
-} else {
+if (acb->find_cluster_ret != QED_CLUSTER_FOUND) {
 if (s->bs->backing) {
 /*
  * Flush new data clusters before updating the L2 table
@@ -1057,20 +1050,16 @@ static void qed_aio_write_main(void *opaque, int ret)
  * cluster and before updating the L2 table.
  */
 ret = bdrv_flush(s->bs->file->bs);
-}
-if (ret) {
-goto err;
+if (ret < 0) {
+return ret;
+}
 }
 ret = qed_aio_write_l2_update(acb, acb->cur_cluster);
-if (ret) {
-goto err;
+if (ret < 0) {
+return ret;
 }
-qed_aio_next_io(acb, 0);
 }
-return;
-
-err:
-qed_aio_complete(acb, ret);
+return 0;
 }
 
 /**
@@ -1102,8 +1091,17 @@ static void qed_aio_write_cow(void *opaque, int ret)
 
 trace_qed_aio_write_postfill(s, acb, start, len, offset);
 ret = qed_copy_from_backing_file(s, start, len, offset);
+if (ret) {
+qed_aio_complete(acb, ret);
+return;
+}
 
-qed_aio_write_main(acb, ret);
+ret = qed_aio_write_main(acb);
+if (ret < 0) {
+qed_aio_complete(acb, ret);
+return;
+}
+qed_aio_next_io(acb, 0);
 }
 
 /**
@@ -1201,6 +1199,8 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
  */
 static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
 {
+int ret;
+
 /* Allocate buffer for zero writes */
 if (acb->flags & QED_AIOCB_ZERO) {
 struct iovec *iov = acb->qiov->iov;
@@ -1220,7 +1220,12 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, 
uint64_t offset, size_t len)
 qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
 
 /* Do the actual write */
-qed_aio_write_main(acb, 0);
+ret = qed_aio_write_main(acb);
+if (ret < 0) {
+qed_aio_complete(acb, ret);
+return;
+}
+qed_aio_next_io(acb, 0);
 }
 
 /**
-- 
1.8.3.1

[Qemu-block] [PULL 57/61] fix: avoid an infinite loop or a dangling pointer problem in img_commit

2017-06-23 Thread Kevin Wolf

From: "sochin.jiang" 

img_commit could fall into an infinite loop calling run_block_job() if
its blockjob fails on any I/O error. Fix this already-known problem.

Signed-off-by: sochin.jiang 
Message-id: 1497509253-28941-1-git-send-email-sochin.ji...@huawei.com
Signed-off-by: Max Reitz 
---
 blockjob.c   |  4 ++--
 include/block/blockjob.h | 18 ++
 qemu-img.c   | 20 +---
 3 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index a0d7e29..70a7818 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -139,7 +139,7 @@ static void block_job_resume(BlockJob *job)
 block_job_enter(job);
 }
 
-static void block_job_ref(BlockJob *job)
+void block_job_ref(BlockJob *job)
 {
 ++job->refcnt;
 }
@@ -148,7 +148,7 @@ static void block_job_attached_aio_context(AioContext 
*new_context,
void *opaque);
 static void block_job_detach_aio_context(void *opaque);
 
-static void block_job_unref(BlockJob *job)
+void block_job_unref(BlockJob *job)
 {
 if (--job->refcnt == 0) {
 BlockDriverState *bs = blk_bs(job->blk);
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 09c7c69..67c0968 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -321,6 +321,24 @@ void block_job_iostatus_reset(BlockJob *job);
 BlockJobTxn *block_job_txn_new(void);
 
 /**
+ * block_job_ref:
+ *
+ * Add a reference to BlockJob refcnt, it will be decreased with
+ * block_job_unref, and then be freed if it comes to be the last
+ * reference.
+ */
+void block_job_ref(BlockJob *job);
+
+/**
+ * block_job_unref:
+ *
+ * Release a reference that was previously acquired with block_job_ref
+ * or block_job_create. If it's the last reference to the object, it will be
+ * freed.
+ */
+void block_job_unref(BlockJob *job);
+
+/**
  * block_job_txn_unref:
  *
  * Release a reference that was previously acquired with block_job_txn_add_job
diff --git a/qemu-img.c b/qemu-img.c
index 0ad698d..e70d515 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -887,22 +887,28 @@ static void common_block_job_cb(void *opaque, int ret)
 static void run_block_job(BlockJob *job, Error **errp)
 {
 AioContext *aio_context = blk_get_aio_context(job->blk);
+int ret = 0;
 
-/* FIXME In error cases, the job simply goes away and we access a dangling
- * pointer below. */
 aio_context_acquire(aio_context);
+block_job_ref(job);
 do {
 aio_poll(aio_context, true);
 qemu_progress_print(job->len ?
 ((float)job->offset / job->len * 100.f) : 0.0f, 0);
-} while (!job->ready);
+} while (!job->ready && !job->completed);
 
-block_job_complete_sync(job, errp);
+if (!job->completed) {
+ret = block_job_complete_sync(job, errp);
+} else {
+ret = job->ret;
+}
+block_job_unref(job);
 aio_context_release(aio_context);
 
-/* A block job may finish instantaneously without publishing any progress,
- * so just signal completion here */
-qemu_progress_print(100.f, 0);
+/* publish completion progress only when success */
+if (!ret) {
+qemu_progress_print(100.f, 0);
+}
 }
 
 static int img_commit(int argc, char **argv)
-- 
1.8.3.1

[Qemu-block] [PATCH v10 00/20] Convert QCow[2] to QCryptoBlock & add LUKS support

2017-06-23 Thread Daniel P. Berrange

Previously posted:

 v1: https://lists.gnu.org/archive/html/qemu-devel/2017-01/msg00201.html
 v2: https://lists.gnu.org/archive/html/qemu-devel/2017-01/msg05147.html
 v3: https://lists.gnu.org/archive/html/qemu-devel/2017-01/msg05671.html
 v4: https://lists.gnu.org/archive/html/qemu-devel/2017-02/msg02293.html
 v5: https://lists.gnu.org/archive/html/qemu-devel/2017-02/msg04653.html
 v6: https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04561.html
 v7: https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg05818.html
 v8: https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg00308.html
 v9: https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg04250.html

Patches 7, 10, 11 & 13 have changes needing new R-b tags since previous
postings

This series is a continuation of previous work to support LUKS in
QEMU. The existing merged code supports LUKS as a standalone
driver which can be layered over/under any other QEMU block device
driver. This works well when using LUKS over protocol drivers (file,
rbd, iscsi, etc, etc), but has some downsides when combined with
format drivers like qcow2.

If you layer LUKS under qcow2 (eg qcow2 -> luks -> file) then you
cannot get any information about the qcow2 file without first
decrypting it, as both the header and payload are encrypted.

If you layer LUKS over qcow2 (eg luks -> qcow2 -> file) then you
cannot distinguish a qcow2 file in which the guest has done LUKS
encryption from a qcow2 file which QEMU itself has encrypted.
More seriously, when encrypting sectors the guest virtual sector
is used as the input for deriving the initialization vectors.
When internal snapshots are used, this means that multiple sectors
in the qcow2 file may be encrypted with the same initialization
vector. This is a security weakness when combined with certain
cryptographic modes.

Integrating LUKS natively into qcow2 allows us to combine the
best aspects of both layering strategies above. In particular
the header remains unencrypted, but initialization vectors are
generated using physical sector numbers preserving security
when snapshots are used. This is a change from previous postings
of this work, where the IVs were (incorrectly) generated based
on the virtual disk sector.

In a previous posting of this work, Fam had suggested that we
do integration by layering luks over qcow2, but having QEMU
block layer automatically create the luks driver above qcow2
based on the qcow2 header crypt_method field. This is not
possible though, because such a scheme would suffer from the
problem of IVs being generated from the virtual disk sector
instead of physical disk sector. So having LUKS specific
code in the qcow2 block driver is unavoidable. In comparison
to the previous posting though, the amount of code in qcow2.c
has been reduced by allowing re-use of code from block/crypto.c
for handling QemuOpts -> QAPI conversion. So the extra code in
qcow2 to support LUKS is < 200 lines.

I have also split the changes to qcow2 up into 2 patches. The
first patch simply introduces use of the QCryptoBlock framework
to qcow2 for the existing (deprecated) AES-CBC encryption method.
The second patch wires up the LUKS support for qcow2. This makes
it clearer which parts of the changes are related to plain code
refactoring, vs enabling the new features. Specifically we can
now see that the LUKS enablement in qcow2 has this footprint:

Changed in v10:

 - Use qemu_opt_get instead of qemu_opt_get_del (Alberto)
 - Rebase & resolve conflicts with latest kevin/block branch (Max)
 - Fix format name in commit message (Max)

Changed in v9:

 - Ensure encryption=off conflicts with encryption.format=XXX (Max)
 - Fix mistaken syntax in commit message examples (Max)
 - Fix typo in spec (Eric)
 - Ensure variable decl is at start of method (Max)
 - Use abort() instead of assert(false)  (Max)

Changed in v8:

 - Fix leak of encryptopts in qcow driver (Alberto)
 - Remove some error_propagate calls (Alberto)
 - Clarify payload offset in spec (Eric)
 - Mention AES deprecation in spec (Eric)
 - Misc typos in spec (Eric)
 - Use error_abort querying specific info (Eric)
 - Document 'encrypt' qapi field (Eric)
 - Resolve conflict in iotests 087

Changed in v7:

 - Add encryption info to 'qemu-img info' output
 - List new encryption parameters in QEMU manual docs.
 - Extend copyright date to include 2017 (Eric)
 - Avoid local error object when not needed (Alberto)
 - Ensure to set 'ret' to an errno value (Alberto)
 - Fix leak of crypto options in qcow (Alberto)
 - Use American spelling of 'favor' (Eric)
 - Fix encryption format name in qapi (Alberto)
 - Fix incorrect option name prefix
 - Rename new iotests to avoid clash

Changed in v6:

 - Changed QAPI / QemuOpts design to use nested struct/union
   for all encryption parameters (Eric/Kevin)
 - Fix cleanup during error conditions (Alberto)

Changed in v5:

 - Remove accidental use of tabs in spec (Alberto)
 - Clarify payload-offset position semantics (Alberto)
 - Fix leak

[Qemu-block] [PULL 49/61] qed: Implement .bdrv_co_readv/writev

2017-06-23 Thread Kevin Wolf

Most of the qed code is now synchronous and matches the coroutine model.
One notable exception is the serialisation between requests which can
still schedule a callback. Before we can replace this with coroutine
locks, let's convert the driver's external interfaces to the coroutine
versions.

We need to be careful to handle both requests that call the completion
callback directly from the calling coroutine (i.e. fully synchronous
code) and requests that involve some callback, so that we need to yield
and wait for the completion callback coming from outside the coroutine.

Signed-off-by: Kevin Wolf 
Reviewed-by: Manos Pitsidianakis 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 97 ++---
 1 file changed, 42 insertions(+), 55 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index e762169..a5111fd 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1322,16 +1322,32 @@ static void qed_aio_next_io(QEDAIOCB *acb)
 }
 }
 
-static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
- int64_t sector_num,
- QEMUIOVector *qiov, int nb_sectors,
- BlockCompletionFunc *cb,
- void *opaque, int flags)
+typedef struct QEDRequestCo {
+Coroutine *co;
+bool done;
+int ret;
+} QEDRequestCo;
+
+static void qed_co_request_cb(void *opaque, int ret)
 {
-QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque);
+QEDRequestCo *co = opaque;
 
-trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
-opaque, flags);
+co->done = true;
+co->ret = ret;
+qemu_coroutine_enter_if_inactive(co->co);
+}
+
+static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t 
sector_num,
+   QEMUIOVector *qiov, int nb_sectors,
+   int flags)
+{
+QEDRequestCo co = {
+.co = qemu_coroutine_self(),
+.done   = false,
+};
+QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, qed_co_request_cb, &co);
+
+trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, &co, flags);
 
 acb->flags = flags;
 acb->qiov = qiov;
@@ -1344,43 +1360,26 @@ static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
 
 /* Start request */
 qed_aio_start_io(acb);
-return &acb->common;
-}
 
-static BlockAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
-  int64_t sector_num,
-  QEMUIOVector *qiov, int nb_sectors,
-  BlockCompletionFunc *cb,
-  void *opaque)
-{
-return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+if (!co.done) {
+qemu_coroutine_yield();
+}
+
+return co.ret;
 }
 
-static BlockAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
-   int64_t sector_num,
-   QEMUIOVector *qiov, int nb_sectors,
-   BlockCompletionFunc *cb,
-   void *opaque)
+static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
+  int64_t sector_num, int nb_sectors,
+  QEMUIOVector *qiov)
 {
-return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
- opaque, QED_AIOCB_WRITE);
+return qed_co_request(bs, sector_num, qiov, nb_sectors, 0);
 }
 
-typedef struct {
-Coroutine *co;
-int ret;
-bool done;
-} QEDWriteZeroesCB;
-
-static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
+static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
+   int64_t sector_num, int nb_sectors,
+   QEMUIOVector *qiov)
 {
-QEDWriteZeroesCB *cb = opaque;
-
-cb->done = true;
-cb->ret = ret;
-if (cb->co) {
-aio_co_wake(cb->co);
-}
+return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
 }
 
 static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
@@ -1388,9 +1387,7 @@ static int coroutine_fn 
bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
   int count,
   BdrvRequestFlags flags)
 {
-BlockAIOCB *blockacb;
 BDRVQEDState *s = bs->opaque;
-QEDWriteZeroesCB cb = { .done = false };
 QEMUIOVector qiov;
 struct iovec iov;
 
@@ -1407,19 +1404,9 @@ static int coroutine_fn 
bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
 iov.iov_len = count;
 
 qemu_iovec_init_external(&qiov, &iov, 1);
-blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov,
- count >> BDRV_SECTOR_BITS,
-

[Qemu-block] [PULL 60/61] block: Do not strcmp() with NULL uri->scheme

2017-06-23 Thread Kevin Wolf

From: Max Reitz 

uri_parse(...)->scheme may be NULL. In fact, probably every field may be
NULL, and the callers do test this for all of the other fields but not
for scheme (except for block/gluster.c; block/vxhs.c does not access
that field at all).

We can easily fix this by using g_strcmp0() instead of strcmp().

Cc: qemu-sta...@nongnu.org
Signed-off-by: Max Reitz 
Message-id: 20170613205726.13544-1-mre...@redhat.com
Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Max Reitz 
---
 block/nbd.c  | 6 +++---
 block/nfs.c  | 2 +-
 block/sheepdog.c | 6 +++---
 block/ssh.c  | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index e946ea9..d529305 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -64,11 +64,11 @@ static int nbd_parse_uri(const char *filename, QDict 
*options)
 }
 
 /* transport */
-if (!strcmp(uri->scheme, "nbd")) {
+if (!g_strcmp0(uri->scheme, "nbd")) {
 is_unix = false;
-} else if (!strcmp(uri->scheme, "nbd+tcp")) {
+} else if (!g_strcmp0(uri->scheme, "nbd+tcp")) {
 is_unix = false;
-} else if (!strcmp(uri->scheme, "nbd+unix")) {
+} else if (!g_strcmp0(uri->scheme, "nbd+unix")) {
 is_unix = true;
 } else {
 ret = -EINVAL;
diff --git a/block/nfs.c b/block/nfs.c
index 6b8b5b6..c3c5de0 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -82,7 +82,7 @@ static int nfs_parse_uri(const char *filename, QDict 
*options, Error **errp)
 error_setg(errp, "Invalid URI specified");
 goto out;
 }
-if (strcmp(uri->scheme, "nfs") != 0) {
+if (g_strcmp0(uri->scheme, "nfs") != 0) {
 error_setg(errp, "URI scheme must be 'nfs'");
 goto out;
 }
diff --git a/block/sheepdog.c b/block/sheepdog.c
index a87ee5f..08d7b11 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1046,11 +1046,11 @@ static void sd_parse_uri(SheepdogConfig *cfg, const 
char *filename,
 }
 
 /* transport */
-if (!strcmp(uri->scheme, "sheepdog")) {
+if (!g_strcmp0(uri->scheme, "sheepdog")) {
 is_unix = false;
-} else if (!strcmp(uri->scheme, "sheepdog+tcp")) {
+} else if (!g_strcmp0(uri->scheme, "sheepdog+tcp")) {
 is_unix = false;
-} else if (!strcmp(uri->scheme, "sheepdog+unix")) {
+} else if (!g_strcmp0(uri->scheme, "sheepdog+unix")) {
 is_unix = true;
 } else {
 error_setg(&err, "URI scheme must be 'sheepdog', 'sheepdog+tcp',"
diff --git a/block/ssh.c b/block/ssh.c
index bac3453..5296441 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -204,7 +204,7 @@ static int parse_uri(const char *filename, QDict *options, 
Error **errp)
 return -EINVAL;
 }
 
-if (strcmp(uri->scheme, "ssh") != 0) {
+if (g_strcmp0(uri->scheme, "ssh") != 0) {
 error_setg(errp, "URI scheme must be 'ssh'");
 goto err;
 }
-- 
1.8.3.1

[Qemu-block] [PULL 52/61] qed: Use a coroutine for need_check_timer

2017-06-23 Thread Kevin Wolf

This fixes the last place where we degraded from AIO to actual blocking
synchronous I/O requests. Putting it into a coroutine means that instead
of blocking, the coroutine simply yields while doing I/O.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 33 +
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index e53f6b5..eac8c2f 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -264,11 +264,23 @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState 
*s)
 qemu_co_enter_next(&s->allocating_write_reqs);
 }
 
-static void qed_clear_need_check(void *opaque, int ret)
+static void qed_need_check_timer_entry(void *opaque)
 {
 BDRVQEDState *s = opaque;
+int ret;
 
-if (ret) {
+/* The timer should only fire when allocating writes have drained */
+assert(!s->allocating_acb);
+
+trace_qed_need_check_timer_cb(s);
+
+qed_acquire(s);
+qed_plug_allocating_write_reqs(s);
+
+/* Ensure writes are on disk before clearing flag */
+ret = bdrv_co_flush(s->bs->file->bs);
+qed_release(s);
+if (ret < 0) {
 qed_unplug_allocating_write_reqs(s);
 return;
 }
@@ -279,25 +291,14 @@ static void qed_clear_need_check(void *opaque, int ret)
 
 qed_unplug_allocating_write_reqs(s);
 
-ret = bdrv_flush(s->bs);
+ret = bdrv_co_flush(s->bs);
 (void) ret;
 }
 
 static void qed_need_check_timer_cb(void *opaque)
 {
-BDRVQEDState *s = opaque;
-
-/* The timer should only fire when allocating writes have drained */
-assert(!s->allocating_acb);
-
-trace_qed_need_check_timer_cb(s);
-
-qed_acquire(s);
-qed_plug_allocating_write_reqs(s);
-
-/* Ensure writes are on disk before clearing flag */
-bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
-qed_release(s);
+Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
+qemu_coroutine_enter(co);
 }
 
 void qed_acquire(BDRVQEDState *s)
-- 
1.8.3.1

[Qemu-block] [PULL 61/61] qemu-img: don't shadow opts variable in img_dd()

2017-06-23 Thread Kevin Wolf

From: Stefan Hajnoczi 

It's confusing when two different variables have the same name in one
function.

Cc: Reda Sallahi 
Signed-off-by: Stefan Hajnoczi 
Message-id: 20170619150002.3033-1-stefa...@redhat.com
Signed-off-by: Max Reitz 
---
 qemu-img.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index e70d515..91ad6be 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -4255,15 +4255,12 @@ static int img_dd(int argc, char **argv)
 case 'U':
 force_share = true;
 break;
-case OPTION_OBJECT: {
-QemuOpts *opts;
-opts = qemu_opts_parse_noisily(&qemu_object_opts,
-   optarg, true);
-if (!opts) {
+case OPTION_OBJECT:
+if (!qemu_opts_parse_noisily(&qemu_object_opts, optarg, true)) {
 ret = -1;
 goto out;
 }
-}   break;
+break;
 case OPTION_IMAGE_OPTS:
 image_opts = true;
 break;
-- 
1.8.3.1

[Qemu-block] [PULL 55/61] block: Remove bdrv_aio_readv/writev/flush()

2017-06-23 Thread Kevin Wolf

These functions are unused now.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/io.c| 171 --
 block/trace-events|   3 -
 include/block/block.h |   8 ---
 3 files changed, 182 deletions(-)

diff --git a/block/io.c b/block/io.c
index e158ae0..132bcbb 100644
--- a/block/io.c
+++ b/block/io.c
@@ -34,14 +34,6 @@
 
 #define NOT_DONE 0x7fff /* used while emulated sync operation in progress 
*/
 
-static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
-  int64_t offset,
-  QEMUIOVector *qiov,
-  BdrvRequestFlags flags,
-  BlockCompletionFunc *cb,
-  void *opaque,
-  bool is_write);
-static void coroutine_fn bdrv_co_do_rw(void *opaque);
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
 int64_t offset, int count, BdrvRequestFlags flags);
 
@@ -2080,28 +2072,6 @@ int bdrv_readv_vmstate(BlockDriverState *bs, 
QEMUIOVector *qiov, int64_t pos)
 /**/
 /* async I/Os */
 
-BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num,
-   QEMUIOVector *qiov, int nb_sectors,
-   BlockCompletionFunc *cb, void *opaque)
-{
-trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque);
-
-assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
-return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
-  0, cb, opaque, false);
-}
-
-BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
-QEMUIOVector *qiov, int nb_sectors,
-BlockCompletionFunc *cb, void *opaque)
-{
-trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque);
-
-assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
-return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
-  0, cb, opaque, true);
-}
-
 void bdrv_aio_cancel(BlockAIOCB *acb)
 {
 qemu_aio_ref(acb);
@@ -2134,147 +2104,6 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb)
 }
 
 /**/
-/* async block device emulation */
-
-typedef struct BlockRequest {
-union {
-/* Used during read, write, trim */
-struct {
-int64_t offset;
-int bytes;
-int flags;
-QEMUIOVector *qiov;
-};
-/* Used during ioctl */
-struct {
-int req;
-void *buf;
-};
-};
-BlockCompletionFunc *cb;
-void *opaque;
-
-int error;
-} BlockRequest;
-
-typedef struct BlockAIOCBCoroutine {
-BlockAIOCB common;
-BdrvChild *child;
-BlockRequest req;
-bool is_write;
-bool need_bh;
-bool *done;
-} BlockAIOCBCoroutine;
-
-static const AIOCBInfo bdrv_em_co_aiocb_info = {
-.aiocb_size = sizeof(BlockAIOCBCoroutine),
-};
-
-static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
-{
-if (!acb->need_bh) {
-bdrv_dec_in_flight(acb->common.bs);
-acb->common.cb(acb->common.opaque, acb->req.error);
-qemu_aio_unref(acb);
-}
-}
-
-static void bdrv_co_em_bh(void *opaque)
-{
-BlockAIOCBCoroutine *acb = opaque;
-
-assert(!acb->need_bh);
-bdrv_co_complete(acb);
-}
-
-static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
-{
-acb->need_bh = false;
-if (acb->req.error != -EINPROGRESS) {
-BlockDriverState *bs = acb->common.bs;
-
-aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
-}
-}
-
-/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
-static void coroutine_fn bdrv_co_do_rw(void *opaque)
-{
-BlockAIOCBCoroutine *acb = opaque;
-
-if (!acb->is_write) {
-acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset,
-acb->req.qiov->size, acb->req.qiov, acb->req.flags);
-} else {
-acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset,
-acb->req.qiov->size, acb->req.qiov, acb->req.flags);
-}
-
-bdrv_co_complete(acb);
-}
-
-static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
-  int64_t offset,
-  QEMUIOVector *qiov,
-  BdrvRequestFlags flags,
-  BlockCompletionFunc *cb,
-  void *opaque,
-  bool is_write)
-{
-Coroutine *co;
-BlockAIOCBCoroutine *acb;
-
-/* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
-bdrv_inc_in_flight(child->bs);
-
-acb = qemu_aio_get(&bdrv

[Qemu-block] [PULL 53/61] qed: Add coroutine_fn to I/O path functions

2017-06-23 Thread Kevin Wolf

Now that we stay in coroutine context for the whole request when doing
reads or writes, we can add coroutine_fn annotations to many functions
that can do I/O or yield directly.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed-cluster.c |  5 +++--
 block/qed.c | 44 
 block/qed.h |  5 +++--
 3 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/block/qed-cluster.c b/block/qed-cluster.c
index 88dc979..d8d6e66 100644
--- a/block/qed-cluster.c
+++ b/block/qed-cluster.c
@@ -86,8 +86,9 @@ static unsigned int 
qed_count_contiguous_clusters(BDRVQEDState *s,
  * On failure QED_CLUSTER_L2 or QED_CLUSTER_L1 is returned for missing L2 or L1
  * table offset, respectively. len is number of contiguous unallocated bytes.
  */
-int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
- size_t *len, uint64_t *img_offset)
+int coroutine_fn qed_find_cluster(BDRVQEDState *s, QEDRequest *request,
+  uint64_t pos, size_t *len,
+  uint64_t *img_offset)
 {
 uint64_t l2_offset;
 uint64_t offset = 0;
diff --git a/block/qed.c b/block/qed.c
index eac8c2f..48f2b0e 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -94,7 +94,7 @@ int qed_write_header_sync(BDRVQEDState *s)
  * This function only updates known header fields in-place and does not affect
  * extra data after the QED header.
  */
-static int qed_write_header(BDRVQEDState *s)
+static int coroutine_fn qed_write_header(BDRVQEDState *s)
 {
 /* We must write full sectors for O_DIRECT but cannot necessarily generate
  * the data following the header if an unrecognized compat feature is
@@ -264,7 +264,7 @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState 
*s)
 qemu_co_enter_next(&s->allocating_write_reqs);
 }
 
-static void qed_need_check_timer_entry(void *opaque)
+static void coroutine_fn qed_need_check_timer_entry(void *opaque)
 {
 BDRVQEDState *s = opaque;
 int ret;
@@ -757,9 +757,9 @@ static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
  * This function reads qiov->size bytes starting at pos from the backing file.
  * If there is no backing file then zeroes are read.
  */
-static int qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
- QEMUIOVector *qiov,
- QEMUIOVector **backing_qiov)
+static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
+  QEMUIOVector *qiov,
+  QEMUIOVector **backing_qiov)
 {
 uint64_t backing_length = 0;
 size_t size;
@@ -811,8 +811,9 @@ static int qed_read_backing_file(BDRVQEDState *s, uint64_t 
pos,
  * @len:Number of bytes
  * @offset: Byte offset in image file
  */
-static int qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
-  uint64_t len, uint64_t offset)
+static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s,
+   uint64_t pos, uint64_t len,
+   uint64_t offset)
 {
 QEMUIOVector qiov;
 QEMUIOVector *backing_qiov = NULL;
@@ -865,8 +866,9 @@ out:
  * The cluster offset may be an allocated byte offset in the image file, the
  * zero cluster marker, or the unallocated cluster marker.
  */
-static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
-unsigned int n, uint64_t cluster)
+static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table,
+ int index, unsigned int n,
+ uint64_t cluster)
 {
 int i;
 for (i = index; i < index + n; i++) {
@@ -878,7 +880,7 @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable 
*table, int index,
 }
 }
 
-static void qed_aio_complete(QEDAIOCB *acb)
+static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
 {
 BDRVQEDState *s = acb_to_s(acb);
 
@@ -911,7 +913,7 @@ static void qed_aio_complete(QEDAIOCB *acb)
 /**
  * Update L1 table with new L2 table offset and write it out
  */
-static int qed_aio_write_l1_update(QEDAIOCB *acb)
+static int coroutine_fn qed_aio_write_l1_update(QEDAIOCB *acb)
 {
 BDRVQEDState *s = acb_to_s(acb);
 CachedL2Table *l2_table = acb->request.l2_table;
@@ -939,7 +941,7 @@ static int qed_aio_write_l1_update(QEDAIOCB *acb)
 /**
  * Update L2 table with new cluster offsets and write them out
  */
-static int qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
+static int coroutine_fn qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
 {
 BDRVQEDState *s = acb_to_s(acb);
 bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
@@ -975,7 +977,7 @@ static int qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t 
offset)
 /**
  * Write dat

[Qemu-block] [PULL 48/61] qed: Remove recursion in qed_aio_next_io()

2017-06-23 Thread Kevin Wolf

Instead of calling itself recursively as the last thing, just convert
qed_aio_next_io() into a loop.

This patch is best reviewed with 'git show -w' because most of it is
just whitespace changes.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 63 +++--
 1 file changed, 32 insertions(+), 31 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index db80987..e762169 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1280,45 +1280,46 @@ static void qed_aio_next_io(QEDAIOCB *acb)
 size_t len;
 int ret;
 
-trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
+while (1) {
+trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
 
-if (acb->backing_qiov) {
-qemu_iovec_destroy(acb->backing_qiov);
-g_free(acb->backing_qiov);
-acb->backing_qiov = NULL;
-}
+if (acb->backing_qiov) {
+qemu_iovec_destroy(acb->backing_qiov);
+g_free(acb->backing_qiov);
+acb->backing_qiov = NULL;
+}
 
-acb->qiov_offset += acb->cur_qiov.size;
-acb->cur_pos += acb->cur_qiov.size;
-qemu_iovec_reset(&acb->cur_qiov);
+acb->qiov_offset += acb->cur_qiov.size;
+acb->cur_pos += acb->cur_qiov.size;
+qemu_iovec_reset(&acb->cur_qiov);
 
-/* Complete request */
-if (acb->cur_pos >= acb->end_pos) {
-qed_aio_complete(acb, 0);
-return;
-}
+/* Complete request */
+if (acb->cur_pos >= acb->end_pos) {
+qed_aio_complete(acb, 0);
+return;
+}
 
-/* Find next cluster and start I/O */
-len = acb->end_pos - acb->cur_pos;
-ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
-if (ret < 0) {
-qed_aio_complete(acb, ret);
-return;
-}
+/* Find next cluster and start I/O */
+len = acb->end_pos - acb->cur_pos;
+ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
+if (ret < 0) {
+qed_aio_complete(acb, ret);
+return;
+}
 
-if (acb->flags & QED_AIOCB_WRITE) {
-ret = qed_aio_write_data(acb, ret, offset, len);
-} else {
-ret = qed_aio_read_data(acb, ret, offset, len);
-}
+if (acb->flags & QED_AIOCB_WRITE) {
+ret = qed_aio_write_data(acb, ret, offset, len);
+} else {
+ret = qed_aio_read_data(acb, ret, offset, len);
+}
 
-if (ret < 0) {
-if (ret != -EINPROGRESS) {
-qed_aio_complete(acb, ret);
+if (ret < 0) {
+if (ret != -EINPROGRESS) {
+qed_aio_complete(acb, ret);
+}
+return;
 }
-return;
 }
-qed_aio_next_io(acb);
 }
 
 static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
-- 
1.8.3.1

[Qemu-block] [PULL 39/61] qed: Make qed_aio_write_main() synchronous

2017-06-23 Thread Kevin Wolf

Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 61 +++--
 1 file changed, 19 insertions(+), 42 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index cfebbae..d164b0e 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -260,13 +260,6 @@ static void qed_aio_start_io(QEDAIOCB *acb)
 qed_aio_next_io(acb, 0);
 }
 
-static void qed_aio_next_io_cb(void *opaque, int ret)
-{
-QEDAIOCB *acb = opaque;
-
-qed_aio_next_io(acb, ret);
-}
-
 static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
 {
 assert(!s->allocating_write_reqs_plugged);
@@ -1042,31 +1035,6 @@ err:
 qed_aio_complete(acb, ret);
 }
 
-static void qed_aio_write_l2_update_cb(void *opaque, int ret)
-{
-QEDAIOCB *acb = opaque;
-qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
-}
-
-/**
- * Flush new data clusters before updating the L2 table
- *
- * This flush is necessary when a backing file is in use.  A crash during an
- * allocating write could result in empty clusters in the image.  If the write
- * only touched a subregion of the cluster, then backing image sectors have
- * been lost in the untouched region.  The solution is to flush after writing a
- * new data cluster and before updating the L2 table.
- */
-static void qed_aio_write_flush_before_l2_update(void *opaque, int ret)
-{
-QEDAIOCB *acb = opaque;
-BDRVQEDState *s = acb_to_s(acb);
-
-if (!bdrv_aio_flush(s->bs->file->bs, qed_aio_write_l2_update_cb, opaque)) {
-qed_aio_complete(acb, -EIO);
-}
-}
-
 /**
  * Write data to the image file
  */
@@ -1076,7 +1044,6 @@ static void qed_aio_write_main(void *opaque, int ret)
 BDRVQEDState *s = acb_to_s(acb);
 uint64_t offset = acb->cur_cluster +
   qed_offset_into_cluster(s, acb->cur_pos);
-BlockCompletionFunc *next_fn;
 
 trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size);
 
@@ -1085,20 +1052,30 @@ static void qed_aio_write_main(void *opaque, int ret)
 return;
 }
 
+BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
+ret = bdrv_pwritev(s->bs->file, offset, &acb->cur_qiov);
+if (ret >= 0) {
+ret = 0;
+}
+
 if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
-next_fn = qed_aio_next_io_cb;
+qed_aio_next_io(acb, ret);
 } else {
 if (s->bs->backing) {
-next_fn = qed_aio_write_flush_before_l2_update;
-} else {
-next_fn = qed_aio_write_l2_update_cb;
+/*
+ * Flush new data clusters before updating the L2 table
+ *
+ * This flush is necessary when a backing file is in use.  A crash
+ * during an allocating write could result in empty clusters in the
+ * image.  If the write only touched a subregion of the cluster,
+ * then backing image sectors have been lost in the untouched
+ * region.  The solution is to flush after writing a new data
+ * cluster and before updating the L2 table.
+ */
+ret = bdrv_flush(s->bs->file->bs);
 }
+qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
 }
-
-BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
-bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
-&acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
-next_fn, acb);
 }
 
 /**
-- 
1.8.3.1

[Qemu-block] [PULL 38/61] qed: Make qed_aio_read_data() synchronous

2017-06-23 Thread Kevin Wolf

Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 8c493bb..cfebbae 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1321,9 +1321,11 @@ static void qed_aio_read_data(void *opaque, int ret,
 }
 
 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
-bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
-   &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
-   qed_aio_next_io_cb, acb);
+ret = bdrv_preadv(bs->file, offset, &acb->cur_qiov);
+if (ret < 0) {
+goto err;
+}
+qed_aio_next_io(acb, 0);
 return;
 
 err:
-- 
1.8.3.1

[Qemu-block] [PULL 51/61] qed: Simplify request handling

2017-06-23 Thread Kevin Wolf

Now that we process a request in the same coroutine from beginning to
end and don't drop out of it any more, we can look like a proper
coroutine-based driver and simply call qed_aio_next_io() and get a
return value from it instead of spawning an additional coroutine that
reenters the parent when it's done.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 101 +---
 block/qed.h |   3 +-
 2 files changed, 22 insertions(+), 82 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index cd3ef55..e53f6b5 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -21,10 +21,6 @@
 #include "qapi/qmp/qerror.h"
 #include "sysemu/block-backend.h"
 
-static const AIOCBInfo qed_aiocb_info = {
-.aiocb_size = sizeof(QEDAIOCB),
-};
-
 static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
   const char *filename)
 {
@@ -253,13 +249,6 @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
 return l2_table;
 }
 
-static void qed_aio_next_io(QEDAIOCB *acb);
-
-static void qed_aio_start_io(QEDAIOCB *acb)
-{
-qed_aio_next_io(acb);
-}
-
 static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
 {
 assert(!s->allocating_write_reqs_plugged);
@@ -751,7 +740,7 @@ static int64_t coroutine_fn 
bdrv_qed_co_get_block_status(BlockDriverState *bs,
 
 static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
 {
-return acb->common.bs->opaque;
+return acb->bs->opaque;
 }
 
 /**
@@ -888,28 +877,10 @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable 
*table, int index,
 }
 }
 
-static void qed_aio_complete_bh(void *opaque)
-{
-QEDAIOCB *acb = opaque;
-BDRVQEDState *s = acb_to_s(acb);
-BlockCompletionFunc *cb = acb->common.cb;
-void *user_opaque = acb->common.opaque;
-int ret = acb->bh_ret;
-
-qemu_aio_unref(acb);
-
-/* Invoke callback */
-qed_acquire(s);
-cb(user_opaque, ret);
-qed_release(s);
-}
-
-static void qed_aio_complete(QEDAIOCB *acb, int ret)
+static void qed_aio_complete(QEDAIOCB *acb)
 {
 BDRVQEDState *s = acb_to_s(acb);
 
-trace_qed_aio_complete(s, acb, ret);
-
 /* Free resources */
 qemu_iovec_destroy(&acb->cur_qiov);
 qed_unref_l2_cache_entry(acb->request.l2_table);
@@ -920,11 +891,6 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
 acb->qiov->iov[0].iov_base = NULL;
 }
 
-/* Arrange for a bh to invoke the completion function */
-acb->bh_ret = ret;
-aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs),
-qed_aio_complete_bh, acb);
-
 /* Start next allocating write request waiting behind this one.  Note that
  * requests enqueue themselves when they first hit an unallocated cluster
  * but they wait until the entire request is finished before waking up the
@@ -1172,7 +1138,7 @@ static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t 
offset, size_t len)
 struct iovec *iov = acb->qiov->iov;
 
 if (!iov->iov_base) {
-iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len);
+iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
 if (iov->iov_base == NULL) {
 return -ENOMEM;
 }
@@ -1231,7 +1197,7 @@ static int qed_aio_read_data(void *opaque, int ret, 
uint64_t offset, size_t len)
 {
 QEDAIOCB *acb = opaque;
 BDRVQEDState *s = acb_to_s(acb);
-BlockDriverState *bs = acb->common.bs;
+BlockDriverState *bs = acb->bs;
 
 /* Adjust offset into cluster */
 offset += qed_offset_into_cluster(s, acb->cur_pos);
@@ -1260,7 +1226,7 @@ static int qed_aio_read_data(void *opaque, int ret, 
uint64_t offset, size_t len)
 /**
  * Begin next I/O or complete the request
  */
-static void qed_aio_next_io(QEDAIOCB *acb)
+static int qed_aio_next_io(QEDAIOCB *acb)
 {
 BDRVQEDState *s = acb_to_s(acb);
 uint64_t offset;
@@ -1282,16 +1248,15 @@ static void qed_aio_next_io(QEDAIOCB *acb)
 
 /* Complete request */
 if (acb->cur_pos >= acb->end_pos) {
-qed_aio_complete(acb, 0);
-return;
+ret = 0;
+break;
 }
 
 /* Find next cluster and start I/O */
 len = acb->end_pos - acb->cur_pos;
 ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
 if (ret < 0) {
-qed_aio_complete(acb, ret);
-return;
+break;
 }
 
 if (acb->flags & QED_AIOCB_WRITE) {
@@ -1301,56 +1266,32 @@ static void qed_aio_next_io(QEDAIOCB *acb)
 }
 
 if (ret < 0 && ret != -EAGAIN) {
-qed_aio_complete(acb, ret);
-return;
+break;
 }
 }
-}
 
-typedef struct QEDRequestCo {
-Coroutine *co;
-bool done;
-int ret;
-} QEDRequestCo;
-
-static void qed_co_request_cb(void *opaque, int ret)
-{
-QEDRequestCo *co = opaque;
-
-co->done = true;
-co->ret = ret;
-qe

[Qemu-block] [PULL 47/61] qed: Remove ret argument from qed_aio_next_io()

2017-06-23 Thread Kevin Wolf

All callers pass ret = 0, so we can just remove it.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 17 ++---
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 6f83831..db80987 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -253,11 +253,11 @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
 return l2_table;
 }
 
-static void qed_aio_next_io(QEDAIOCB *acb, int ret);
+static void qed_aio_next_io(QEDAIOCB *acb);
 
 static void qed_aio_start_io(QEDAIOCB *acb)
 {
-qed_aio_next_io(acb, 0);
+qed_aio_next_io(acb);
 }
 
 static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
@@ -1273,13 +1273,14 @@ static int qed_aio_read_data(void *opaque, int ret, 
uint64_t offset, size_t len)
 /**
  * Begin next I/O or complete the request
  */
-static void qed_aio_next_io(QEDAIOCB *acb, int ret)
+static void qed_aio_next_io(QEDAIOCB *acb)
 {
 BDRVQEDState *s = acb_to_s(acb);
 uint64_t offset;
 size_t len;
+int ret;
 
-trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);
+trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
 
 if (acb->backing_qiov) {
 qemu_iovec_destroy(acb->backing_qiov);
@@ -1287,12 +1288,6 @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret)
 acb->backing_qiov = NULL;
 }
 
-/* Handle I/O error */
-if (ret) {
-qed_aio_complete(acb, ret);
-return;
-}
-
 acb->qiov_offset += acb->cur_qiov.size;
 acb->cur_pos += acb->cur_qiov.size;
 qemu_iovec_reset(&acb->cur_qiov);
@@ -1323,7 +1318,7 @@ static void qed_aio_next_io(QEDAIOCB *acb, int ret)
 }
 return;
 }
-qed_aio_next_io(acb, 0);
+qed_aio_next_io(acb);
 }
 
 static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
-- 
1.8.3.1

[Qemu-block] [PULL 44/61] qed: Add return value to qed_aio_write_cow()

2017-06-23 Thread Kevin Wolf

Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
just return an error code and let the caller handle it.

While refactoring qed_aio_write_alloc() to accommodate the change,
qed_aio_write_zero_cluster() ended up with a single line, so I chose to
inline that line and remove the function completely.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 58 +-
 1 file changed, 21 insertions(+), 37 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index a4b13f8..84864e0 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1065,11 +1065,11 @@ static int qed_aio_write_main(QEDAIOCB *acb)
 /**
  * Populate untouched regions of new data cluster
  */
-static void qed_aio_write_cow(void *opaque, int ret)
+static int qed_aio_write_cow(QEDAIOCB *acb)
 {
-QEDAIOCB *acb = opaque;
 BDRVQEDState *s = acb_to_s(acb);
 uint64_t start, len, offset;
+int ret;
 
 /* Populate front untouched region of new data cluster */
 start = qed_start_of_cluster(s, acb->cur_pos);
@@ -1077,9 +1077,8 @@ static void qed_aio_write_cow(void *opaque, int ret)
 
 trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
 ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
-if (ret) {
-qed_aio_complete(acb, ret);
-return;
+if (ret < 0) {
+return ret;
 }
 
 /* Populate back untouched region of new data cluster */
@@ -1091,17 +1090,11 @@ static void qed_aio_write_cow(void *opaque, int ret)
 
 trace_qed_aio_write_postfill(s, acb, start, len, offset);
 ret = qed_copy_from_backing_file(s, start, len, offset);
-if (ret) {
-qed_aio_complete(acb, ret);
-return;
-}
-
-ret = qed_aio_write_main(acb);
 if (ret < 0) {
-qed_aio_complete(acb, ret);
-return;
+return ret;
 }
-qed_aio_next_io(acb, 0);
+
+return qed_aio_write_main(acb);
 }
 
 /**
@@ -1117,23 +1110,6 @@ static bool qed_should_set_need_check(BDRVQEDState *s)
 return !(s->header.features & QED_F_NEED_CHECK);
 }
 
-static void qed_aio_write_zero_cluster(void *opaque, int ret)
-{
-QEDAIOCB *acb = opaque;
-
-if (ret) {
-qed_aio_complete(acb, ret);
-return;
-}
-
-ret = qed_aio_write_l2_update(acb, 1);
-if (ret < 0) {
-qed_aio_complete(acb, ret);
-return;
-}
-qed_aio_next_io(acb, 0);
-}
-
 /**
  * Write new data cluster
  *
@@ -1145,7 +1121,6 @@ static void qed_aio_write_zero_cluster(void *opaque, int 
ret)
 static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 {
 BDRVQEDState *s = acb_to_s(acb);
-BlockCompletionFunc *cb;
 int ret;
 
 /* Cancel timer when the first allocating request comes in */
@@ -1172,20 +1147,29 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t 
len)
 qed_aio_start_io(acb);
 return;
 }
-
-cb = qed_aio_write_zero_cluster;
 } else {
-cb = qed_aio_write_cow;
 acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
 }
 
 if (qed_should_set_need_check(s)) {
 s->header.features |= QED_F_NEED_CHECK;
 ret = qed_write_header(s);
-cb(acb, ret);
+if (ret < 0) {
+qed_aio_complete(acb, ret);
+return;
+}
+}
+
+if (acb->flags & QED_AIOCB_ZERO) {
+ret = qed_aio_write_l2_update(acb, 1);
 } else {
-cb(acb, 0);
+ret = qed_aio_write_cow(acb);
 }
+if (ret < 0) {
+qed_aio_complete(acb, ret);
+return;
+}
+qed_aio_next_io(acb, 0);
 }
 
 /**
-- 
1.8.3.1

[Qemu-block] [PULL 45/61] qed: Add return value to qed_aio_write_inplace/alloc()

2017-06-23 Thread Kevin Wolf

Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
just return an error code and let the caller handle it.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 43 ---
 1 file changed, 20 insertions(+), 23 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 84864e0..4c8ba4a 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1118,7 +1118,7 @@ static bool qed_should_set_need_check(BDRVQEDState *s)
  *
  * This path is taken when writing to previously unallocated clusters.
  */
-static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
+static int qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 {
 BDRVQEDState *s = acb_to_s(acb);
 int ret;
@@ -1134,7 +1134,7 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 }
 if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
 s->allocating_write_reqs_plugged) {
-return; /* wait for existing request to finish */
+return -EINPROGRESS; /* wait for existing request to finish */
 }
 
 acb->cur_nclusters = qed_bytes_to_clusters(s,
@@ -1144,8 +1144,7 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 if (acb->flags & QED_AIOCB_ZERO) {
 /* Skip ahead if the clusters are already zero */
 if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
-qed_aio_start_io(acb);
-return;
+return 0;
 }
 } else {
 acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
@@ -1155,8 +1154,7 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 s->header.features |= QED_F_NEED_CHECK;
 ret = qed_write_header(s);
 if (ret < 0) {
-qed_aio_complete(acb, ret);
-return;
+return ret;
 }
 }
 
@@ -1166,10 +1164,9 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t 
len)
 ret = qed_aio_write_cow(acb);
 }
 if (ret < 0) {
-qed_aio_complete(acb, ret);
-return;
+return ret;
 }
-qed_aio_next_io(acb, 0);
+return 0;
 }
 
 /**
@@ -1181,10 +1178,8 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t 
len)
  *
  * This path is taken when writing to already allocated clusters.
  */
-static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
+static int qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
 {
-int ret;
-
 /* Allocate buffer for zero writes */
 if (acb->flags & QED_AIOCB_ZERO) {
 struct iovec *iov = acb->qiov->iov;
@@ -1192,8 +1187,7 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t 
offset, size_t len)
 if (!iov->iov_base) {
 iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len);
 if (iov->iov_base == NULL) {
-qed_aio_complete(acb, -ENOMEM);
-return;
+return -ENOMEM;
 }
 memset(iov->iov_base, 0, iov->iov_len);
 }
@@ -1204,12 +1198,7 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, 
uint64_t offset, size_t len)
 qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
 
 /* Do the actual write */
-ret = qed_aio_write_main(acb);
-if (ret < 0) {
-qed_aio_complete(acb, ret);
-return;
-}
-qed_aio_next_io(acb, 0);
+return qed_aio_write_main(acb);
 }
 
 /**
@@ -1232,19 +1221,27 @@ static void qed_aio_write_data(void *opaque, int ret,
 
 switch (ret) {
 case QED_CLUSTER_FOUND:
-qed_aio_write_inplace(acb, offset, len);
+ret = qed_aio_write_inplace(acb, offset, len);
 break;
 
 case QED_CLUSTER_L2:
 case QED_CLUSTER_L1:
 case QED_CLUSTER_ZERO:
-qed_aio_write_alloc(acb, len);
+ret = qed_aio_write_alloc(acb, len);
 break;
 
 default:
-qed_aio_complete(acb, ret);
+assert(ret < 0);
 break;
 }
+
+if (ret < 0) {
+if (ret != -EINPROGRESS) {
+qed_aio_complete(acb, ret);
+}
+return;
+}
+qed_aio_next_io(acb, 0);
 }
 
 /**
-- 
1.8.3.1

[Qemu-block] [PULL 32/61] qed: Remove callback from qed_copy_from_backing_file()

2017-06-23 Thread Kevin Wolf

With this change, qed_aio_write_prefill() and qed_aio_write_postfill()
collapse into a single function. This is reflected by a rename of the
combined function to qed_aio_write_cow().

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 57 +++--
 1 file changed, 23 insertions(+), 34 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index af53b8f..658b31b 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -861,13 +861,9 @@ static int qed_read_backing_file(BDRVQEDState *s, uint64_t 
pos,
  * @pos:Byte position in device
  * @len:Number of bytes
  * @offset: Byte offset in image file
- * @cb: Completion function
- * @opaque: User data for completion function
  */
-static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
-   uint64_t len, uint64_t offset,
-   BlockCompletionFunc *cb,
-   void *opaque)
+static int qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos,
+  uint64_t len, uint64_t offset)
 {
 QEMUIOVector qiov;
 QEMUIOVector *backing_qiov = NULL;
@@ -876,8 +872,7 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, 
uint64_t pos,
 
 /* Skip copy entirely if there is no work to do */
 if (len == 0) {
-cb(opaque, 0);
-return;
+return 0;
 }
 
 iov = (struct iovec) {
@@ -906,7 +901,7 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, 
uint64_t pos,
 ret = 0;
 out:
 qemu_vfree(iov.iov_base);
-cb(opaque, ret);
+return ret;
 }
 
 /**
@@ -1133,42 +1128,36 @@ static void qed_aio_write_main(void *opaque, int ret)
 }
 
 /**
- * Populate back untouched region of new data cluster
+ * Populate untouched regions of new data cluster
  */
-static void qed_aio_write_postfill(void *opaque, int ret)
+static void qed_aio_write_cow(void *opaque, int ret)
 {
 QEDAIOCB *acb = opaque;
 BDRVQEDState *s = acb_to_s(acb);
-uint64_t start = acb->cur_pos + acb->cur_qiov.size;
-uint64_t len =
-qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
-uint64_t offset = acb->cur_cluster +
-  qed_offset_into_cluster(s, acb->cur_pos) +
-  acb->cur_qiov.size;
+uint64_t start, len, offset;
+
+/* Populate front untouched region of new data cluster */
+start = qed_start_of_cluster(s, acb->cur_pos);
+len = qed_offset_into_cluster(s, acb->cur_pos);
 
+trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
+ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
 if (ret) {
 qed_aio_complete(acb, ret);
 return;
 }
 
-trace_qed_aio_write_postfill(s, acb, start, len, offset);
-qed_copy_from_backing_file(s, start, len, offset,
-qed_aio_write_main, acb);
-}
+/* Populate back untouched region of new data cluster */
+start = acb->cur_pos + acb->cur_qiov.size;
+len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
+offset = acb->cur_cluster +
+ qed_offset_into_cluster(s, acb->cur_pos) +
+ acb->cur_qiov.size;
 
-/**
- * Populate front untouched region of new data cluster
- */
-static void qed_aio_write_prefill(void *opaque, int ret)
-{
-QEDAIOCB *acb = opaque;
-BDRVQEDState *s = acb_to_s(acb);
-uint64_t start = qed_start_of_cluster(s, acb->cur_pos);
-uint64_t len = qed_offset_into_cluster(s, acb->cur_pos);
+trace_qed_aio_write_postfill(s, acb, start, len, offset);
+ret = qed_copy_from_backing_file(s, start, len, offset);
 
-trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
-qed_copy_from_backing_file(s, start, len, acb->cur_cluster,
-qed_aio_write_postfill, acb);
+qed_aio_write_main(acb, ret);
 }
 
 /**
@@ -1236,7 +1225,7 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 
 cb = qed_aio_write_zero_cluster;
 } else {
-cb = qed_aio_write_prefill;
+cb = qed_aio_write_cow;
 acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
 }
 
-- 
1.8.3.1

[Qemu-block] [PULL 33/61] qed: Make qed_write_header() synchronous

2017-06-23 Thread Kevin Wolf

Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 76 +++--
 1 file changed, 29 insertions(+), 47 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 658b31b..2665efc 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -92,41 +92,6 @@ int qed_write_header_sync(BDRVQEDState *s)
 return 0;
 }
 
-typedef struct {
-GenericCB gencb;
-BDRVQEDState *s;
-struct iovec iov;
-QEMUIOVector qiov;
-int nsectors;
-uint8_t *buf;
-} QEDWriteHeaderCB;
-
-static void qed_write_header_cb(void *opaque, int ret)
-{
-QEDWriteHeaderCB *write_header_cb = opaque;
-
-qemu_vfree(write_header_cb->buf);
-gencb_complete(write_header_cb, ret);
-}
-
-static void qed_write_header_read_cb(void *opaque, int ret)
-{
-QEDWriteHeaderCB *write_header_cb = opaque;
-BDRVQEDState *s = write_header_cb->s;
-
-if (ret) {
-qed_write_header_cb(write_header_cb, ret);
-return;
-}
-
-/* Update header */
-qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf);
-
-bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov,
-write_header_cb->nsectors, qed_write_header_cb,
-write_header_cb);
-}
-
 /**
  * Update header in-place (does not rewrite backing filename or other strings)
  *
@@ -144,18 +109,35 @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
 
 int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
 size_t len = nsectors * BDRV_SECTOR_SIZE;
-QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb),
-cb, opaque);
-
-write_header_cb->s = s;
-write_header_cb->nsectors = nsectors;
-write_header_cb->buf = qemu_blockalign(s->bs, len);
-write_header_cb->iov.iov_base = write_header_cb->buf;
-write_header_cb->iov.iov_len = len;
-qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1);
-
-bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors,
-   qed_write_header_read_cb, write_header_cb);
+uint8_t *buf;
+struct iovec iov;
+QEMUIOVector qiov;
+int ret;
+
+buf = qemu_blockalign(s->bs, len);
+iov = (struct iovec) {
+.iov_base = buf,
+.iov_len = len,
+};
+qemu_iovec_init_external(&qiov, &iov, 1);
+
+ret = bdrv_preadv(s->bs->file, 0, &qiov);
+if (ret < 0) {
+goto out;
+}
+
+/* Update header */
+qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
+
+ret = bdrv_pwritev(s->bs->file, 0, &qiov);
+if (ret < 0) {
+goto out;
+}
+
+ret = 0;
+out:
+qemu_vfree(buf);
+cb(opaque, ret);
 }
 
 static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
-- 
1.8.3.1

[Qemu-block] [PULL 29/61] qed: Remove callback from qed_find_cluster()

2017-06-23 Thread Kevin Wolf

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed-cluster.c | 39 ++-
 block/qed.c | 24 +++-
 block/qed.h |  4 ++--
 3 files changed, 35 insertions(+), 32 deletions(-)

diff --git a/block/qed-cluster.c b/block/qed-cluster.c
index d279944..88dc979 100644
--- a/block/qed-cluster.c
+++ b/block/qed-cluster.c
@@ -67,22 +67,27 @@ static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
  * @s:  QED state
  * @request:L2 cache entry
  * @pos:Byte position in device
- * @len:Number of bytes
- * @cb: Completion function
- * @opaque: User data for completion function
+ * @len:Number of bytes (may be shortened on return)
+ * @img_offset: Contains offset in the image file on success
  *
  * This function translates a position in the block device to an offset in the
- * image file.  It invokes the cb completion callback to report back the
- * translated offset or unallocated range in the image file.
+ * image file. The translated offset or unallocated range in the image file is
+ * reported back in *img_offset and *len.
  *
  * If the L2 table exists, request->l2_table points to the L2 table cache entry
  * and the caller must free the reference when they are finished.  The cache
  * entry is exposed in this way to avoid callers having to read the L2 table
  * again later during request processing.  If request->l2_table is non-NULL it
  * will be unreferenced before taking on the new cache entry.
+ *
+ * On success QED_CLUSTER_FOUND is returned and img_offset/len are a contiguous
+ * range in the image file.
+ *
+ * On failure QED_CLUSTER_L2 or QED_CLUSTER_L1 is returned for missing L2 or L1
+ * table offset, respectively. len is number of contiguous unallocated bytes.
  */
-void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
-  size_t len, QEDFindClusterFunc *cb, void *opaque)
+int qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
+ size_t *len, uint64_t *img_offset)
 {
 uint64_t l2_offset;
 uint64_t offset = 0;
@@ -93,16 +98,16 @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
 /* Limit length to L2 boundary.  Requests are broken up at the L2 boundary
  * so that a request acts on one L2 table at a time.
  */
-len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos);
+*len = MIN(*len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos);
 
 l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)];
 if (qed_offset_is_unalloc_cluster(l2_offset)) {
-cb(opaque, QED_CLUSTER_L1, 0, len);
-return;
+*img_offset = 0;
+return QED_CLUSTER_L1;
 }
 if (!qed_check_table_offset(s, l2_offset)) {
-cb(opaque, -EINVAL, 0, 0);
-return;
+*img_offset = *len = 0;
+return -EINVAL;
 }
 
 ret = qed_read_l2_table(s, request, l2_offset);
@@ -112,8 +117,7 @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
 }
 
 index = qed_l2_index(s, pos);
-n = qed_bytes_to_clusters(s,
-  qed_offset_into_cluster(s, pos) + len);
+n = qed_bytes_to_clusters(s, qed_offset_into_cluster(s, pos) + *len);
 n = qed_count_contiguous_clusters(s, request->l2_table->table,
   index, n, &offset);
 
@@ -127,10 +131,11 @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
 ret = -EINVAL;
 }
 
-len = MIN(len,
-  n * s->header.cluster_size - qed_offset_into_cluster(s, pos));
+*len = MIN(*len,
+   n * s->header.cluster_size - qed_offset_into_cluster(s, pos));
 
 out:
-cb(opaque, ret, offset, len);
+*img_offset = offset;
 qed_release(s);
+return ret;
 }
diff --git a/block/qed.c b/block/qed.c
index a837a28..290cbcd 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -776,14 +776,14 @@ static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
 .file = file,
 };
 QEDRequest request = { .l2_table = NULL };
+uint64_t offset;
+int ret;
 
-qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb);
+ret = qed_find_cluster(s, &request, cb.pos, &len, &offset);
+qed_is_allocated_cb(&cb, ret, offset, len);
 
-/* Now sleep if the callback wasn't invoked immediately */
-while (cb.status == BDRV_BLOCK_OFFSET_MASK) {
-cb.co = qemu_coroutine_self();
-qemu_coroutine_yield();
-}
+/* The callback was invoked immediately */
+assert(cb.status != BDRV_BLOCK_OFFSET_MASK);
 
 qed_unref_l2_cache_entry(request.l2_table);
 
@@ -1306,8 +1306,6 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
  *  or -errno
  * @offset: Cluster offset in bytes
  * @len:Length in bytes
- *
- * Callback fr

[Qemu-block] [PULL 41/61] qed: Add return value to qed_aio_write_l1_update()

2017-06-23 Thread Kevin Wolf

Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
just return an error code and let the caller handle it.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 19 +--
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 5462faa..e43827f 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -958,18 +958,12 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
 /**
  * Update L1 table with new L2 table offset and write it out
  */
-static void qed_aio_write_l1_update(void *opaque, int ret)
+static int qed_aio_write_l1_update(QEDAIOCB *acb)
 {
-QEDAIOCB *acb = opaque;
 BDRVQEDState *s = acb_to_s(acb);
 CachedL2Table *l2_table = acb->request.l2_table;
 uint64_t l2_offset = l2_table->offset;
-int index;
-
-if (ret) {
-qed_aio_complete(acb, ret);
-return;
-}
+int index, ret;
 
 index = qed_l1_index(s, acb->cur_pos);
 s->l1_table->offsets[index] = l2_table->offset;
@@ -985,7 +979,7 @@ static void qed_aio_write_l1_update(void *opaque, int ret)
 acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
 assert(acb->request.l2_table != NULL);
 
-qed_aio_next_io(acb, ret);
+return ret;
 }
 
 
@@ -1014,7 +1008,12 @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
 if (need_alloc) {
 /* Write out the whole new L2 table */
 ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
-qed_aio_write_l1_update(acb, ret);
+if (ret) {
+goto err;
+}
+ret = qed_aio_write_l1_update(acb);
+qed_aio_next_io(acb, ret);
+
 } else {
 /* Write out only the updated part of the L2 table */
 ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
-- 
1.8.3.1

[Qemu-block] [PULL 42/61] qed: Add return value to qed_aio_write_l2_update()

2017-06-23 Thread Kevin Wolf

Don't recurse into qed_aio_next_io() and qed_aio_complete() here, but
just return an error code and let the caller handle it.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 43 ++-
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index e43827f..3cda01f 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -986,15 +986,11 @@ static int qed_aio_write_l1_update(QEDAIOCB *acb)
 /**
  * Update L2 table with new cluster offsets and write them out
  */
-static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
+static int qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
 {
 BDRVQEDState *s = acb_to_s(acb);
 bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
-int index;
-
-if (ret) {
-goto err;
-}
+int index, ret;
 
 if (need_alloc) {
 qed_unref_l2_cache_entry(acb->request.l2_table);
@@ -1009,21 +1005,18 @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
 /* Write out the whole new L2 table */
 ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
 if (ret) {
-goto err;
+return ret;
 }
-ret = qed_aio_write_l1_update(acb);
-qed_aio_next_io(acb, ret);
-
+return qed_aio_write_l1_update(acb);
 } else {
 /* Write out only the updated part of the L2 table */
 ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
  false);
-qed_aio_next_io(acb, ret);
+if (ret) {
+return ret;
+}
 }
-return;
-
-err:
-qed_aio_complete(acb, ret);
+return 0;
 }
 
 /**
@@ -1065,8 +1058,19 @@ static void qed_aio_write_main(void *opaque, int ret)
  */
 ret = bdrv_flush(s->bs->file->bs);
 }
-qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
+if (ret) {
+goto err;
+}
+ret = qed_aio_write_l2_update(acb, acb->cur_cluster);
+if (ret) {
+goto err;
+}
+qed_aio_next_io(acb, 0);
 }
+return;
+
+err:
+qed_aio_complete(acb, ret);
 }
 
 /**
@@ -1124,7 +1128,12 @@ static void qed_aio_write_zero_cluster(void *opaque, int ret)
 return;
 }
 
-qed_aio_write_l2_update(acb, 0, 1);
+ret = qed_aio_write_l2_update(acb, 1);
+if (ret < 0) {
+qed_aio_complete(acb, ret);
+return;
+}
+qed_aio_next_io(acb, 0);
 }
 
 /**
-- 
1.8.3.1

[Qemu-block] [PULL 27/61] qed: Remove callback from qed_read_table()

2017-06-23 Thread Kevin Wolf

Instead of passing the return value to a callback, return it to the
caller so that the callback can be inlined there.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed-table.c | 79 ++-
 1 file changed, 25 insertions(+), 54 deletions(-)

diff --git a/block/qed-table.c b/block/qed-table.c
index f330538..4270003 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -18,8 +18,7 @@
 #include "qed.h"
 #include "qemu/bswap.h"
 
-static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
-   BlockCompletionFunc *cb, void *opaque)
+static int qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table)
 {
 QEMUIOVector qiov;
 int noffsets;
@@ -50,7 +49,7 @@ static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
 out:
 /* Completion */
 trace_qed_read_table_cb(s, table, ret);
-cb(opaque, ret);
+return ret;
 }
 
 typedef struct {
@@ -156,13 +155,7 @@ static void qed_sync_cb(void *opaque, int ret)
 
 int qed_read_l1_table_sync(BDRVQEDState *s)
 {
-int ret = -EINPROGRESS;
-
-qed_read_table(s, s->header.l1_table_offset,
-   s->l1_table, qed_sync_cb, &ret);
-BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
-
-return ret;
+return qed_read_table(s, s->header.l1_table_offset, s->l1_table);
 }
 
 void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
@@ -184,46 +177,10 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
 return ret;
 }
 
-typedef struct {
-GenericCB gencb;
-BDRVQEDState *s;
-uint64_t l2_offset;
-QEDRequest *request;
-} QEDReadL2TableCB;
-
-static void qed_read_l2_table_cb(void *opaque, int ret)
-{
-QEDReadL2TableCB *read_l2_table_cb = opaque;
-QEDRequest *request = read_l2_table_cb->request;
-BDRVQEDState *s = read_l2_table_cb->s;
-CachedL2Table *l2_table = request->l2_table;
-uint64_t l2_offset = read_l2_table_cb->l2_offset;
-
-qed_acquire(s);
-if (ret) {
-/* can't trust loaded L2 table anymore */
-qed_unref_l2_cache_entry(l2_table);
-request->l2_table = NULL;
-} else {
-l2_table->offset = l2_offset;
-
-qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
-
-/* This is guaranteed to succeed because we just committed the entry
- * to the cache.
- */
-request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
-assert(request->l2_table != NULL);
-}
-qed_release(s);
-
-gencb_complete(&read_l2_table_cb->gencb, ret);
-}
-
 void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
BlockCompletionFunc *cb, void *opaque)
 {
-QEDReadL2TableCB *read_l2_table_cb;
+int ret;
 
 qed_unref_l2_cache_entry(request->l2_table);
 
@@ -237,14 +194,28 @@ void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
 request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
 request->l2_table->table = qed_alloc_table(s);
 
-read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque);
-read_l2_table_cb->s = s;
-read_l2_table_cb->l2_offset = offset;
-read_l2_table_cb->request = request;
-
 BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD);
-qed_read_table(s, offset, request->l2_table->table,
-   qed_read_l2_table_cb, read_l2_table_cb);
+ret = qed_read_table(s, offset, request->l2_table->table);
+
+qed_acquire(s);
+if (ret) {
+/* can't trust loaded L2 table anymore */
+qed_unref_l2_cache_entry(request->l2_table);
+request->l2_table = NULL;
+} else {
+request->l2_table->offset = offset;
+
+qed_commit_l2_cache_entry(&s->l2_cache, request->l2_table);
+
+/* This is guaranteed to succeed because we just committed the entry
+ * to the cache.
+ */
+request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset);
+assert(request->l2_table != NULL);
+}
+qed_release(s);
+
+cb(opaque, ret);
 }
 
 int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
-- 
1.8.3.1

[Qemu-block] [PULL 28/61] qed: Remove callback from qed_read_l2_table()

2017-06-23 Thread Kevin Wolf

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed-cluster.c | 94 ++---
 block/qed-table.c   | 15 +++--
 block/qed.h |  3 +-
 3 files changed, 36 insertions(+), 76 deletions(-)

diff --git a/block/qed-cluster.c b/block/qed-cluster.c
index 8f5da74..d279944 100644
--- a/block/qed-cluster.c
+++ b/block/qed-cluster.c
@@ -61,59 +61,6 @@ static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s,
 return i - index;
 }
 
-typedef struct {
-BDRVQEDState *s;
-uint64_t pos;
-size_t len;
-
-QEDRequest *request;
-
-/* User callback */
-QEDFindClusterFunc *cb;
-void *opaque;
-} QEDFindClusterCB;
-
-static void qed_find_cluster_cb(void *opaque, int ret)
-{
-QEDFindClusterCB *find_cluster_cb = opaque;
-BDRVQEDState *s = find_cluster_cb->s;
-QEDRequest *request = find_cluster_cb->request;
-uint64_t offset = 0;
-size_t len = 0;
-unsigned int index;
-unsigned int n;
-
-qed_acquire(s);
-if (ret) {
-goto out;
-}
-
-index = qed_l2_index(s, find_cluster_cb->pos);
-n = qed_bytes_to_clusters(s,
-  qed_offset_into_cluster(s, find_cluster_cb->pos) +
-  find_cluster_cb->len);
-n = qed_count_contiguous_clusters(s, request->l2_table->table,
-  index, n, &offset);
-
-if (qed_offset_is_unalloc_cluster(offset)) {
-ret = QED_CLUSTER_L2;
-} else if (qed_offset_is_zero_cluster(offset)) {
-ret = QED_CLUSTER_ZERO;
-} else if (qed_check_cluster_offset(s, offset)) {
-ret = QED_CLUSTER_FOUND;
-} else {
-ret = -EINVAL;
-}
-
-len = MIN(find_cluster_cb->len, n * s->header.cluster_size -
-  qed_offset_into_cluster(s, find_cluster_cb->pos));
-
-out:
-find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
-qed_release(s);
-g_free(find_cluster_cb);
-}
-
 /**
  * Find the offset of a data cluster
  *
@@ -137,8 +84,11 @@ out:
 void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
   size_t len, QEDFindClusterFunc *cb, void *opaque)
 {
-QEDFindClusterCB *find_cluster_cb;
 uint64_t l2_offset;
+uint64_t offset = 0;
+unsigned int index;
+unsigned int n;
+int ret;
 
 /* Limit length to L2 boundary.  Requests are broken up at the L2 boundary
  * so that a request acts on one L2 table at a time.
@@ -155,14 +105,32 @@ void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos,
 return;
 }
 
-find_cluster_cb = g_malloc(sizeof(*find_cluster_cb));
-find_cluster_cb->s = s;
-find_cluster_cb->pos = pos;
-find_cluster_cb->len = len;
-find_cluster_cb->cb = cb;
-find_cluster_cb->opaque = opaque;
-find_cluster_cb->request = request;
+ret = qed_read_l2_table(s, request, l2_offset);
+qed_acquire(s);
+if (ret) {
+goto out;
+}
+
+index = qed_l2_index(s, pos);
+n = qed_bytes_to_clusters(s,
+  qed_offset_into_cluster(s, pos) + len);
+n = qed_count_contiguous_clusters(s, request->l2_table->table,
+  index, n, &offset);
+
+if (qed_offset_is_unalloc_cluster(offset)) {
+ret = QED_CLUSTER_L2;
+} else if (qed_offset_is_zero_cluster(offset)) {
+ret = QED_CLUSTER_ZERO;
+} else if (qed_check_cluster_offset(s, offset)) {
+ret = QED_CLUSTER_FOUND;
+} else {
+ret = -EINVAL;
+}
+
+len = MIN(len,
+  n * s->header.cluster_size - qed_offset_into_cluster(s, pos));
 
-qed_read_l2_table(s, request, l2_offset,
-  qed_find_cluster_cb, find_cluster_cb);
+out:
+cb(opaque, ret, offset, len);
+qed_release(s);
 }
diff --git a/block/qed-table.c b/block/qed-table.c
index 4270003..ffecbea 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -177,8 +177,7 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
 return ret;
 }
 
-void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
-   BlockCompletionFunc *cb, void *opaque)
+int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
 {
 int ret;
 
@@ -187,8 +186,7 @@ void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
 /* Check for cached L2 entry */
 request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset);
 if (request->l2_table) {
-cb(opaque, 0);
-return;
+return 0;
 }
 
 request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
@@ -215,17 +213,12 @@ void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset,
 }
 qed_release(s);
 
-cb(opaque, ret);
+return ret;
 }
 
 int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
 {

[Qemu-block] [PULL 40/61] qed: Inline qed_commit_l2_update()

2017-06-23 Thread Kevin Wolf

qed_commit_l2_update() is unconditionally called at the end of
qed_aio_write_l1_update(). Inline it.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 36 ++--
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index d164b0e..5462faa 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -956,15 +956,27 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
 }
 
 /**
- * Commit the current L2 table to the cache
+ * Update L1 table with new L2 table offset and write it out
  */
-static void qed_commit_l2_update(void *opaque, int ret)
+static void qed_aio_write_l1_update(void *opaque, int ret)
 {
 QEDAIOCB *acb = opaque;
 BDRVQEDState *s = acb_to_s(acb);
 CachedL2Table *l2_table = acb->request.l2_table;
 uint64_t l2_offset = l2_table->offset;
+int index;
+
+if (ret) {
+qed_aio_complete(acb, ret);
+return;
+}
 
+index = qed_l1_index(s, acb->cur_pos);
+s->l1_table->offsets[index] = l2_table->offset;
+
+ret = qed_write_l1_table(s, index, 1);
+
+/* Commit the current L2 table to the cache */
 qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
 
 /* This is guaranteed to succeed because we just committed the entry to the
@@ -976,26 +988,6 @@ static void qed_commit_l2_update(void *opaque, int ret)
 qed_aio_next_io(acb, ret);
 }
 
-/**
- * Update L1 table with new L2 table offset and write it out
- */
-static void qed_aio_write_l1_update(void *opaque, int ret)
-{
-QEDAIOCB *acb = opaque;
-BDRVQEDState *s = acb_to_s(acb);
-int index;
-
-if (ret) {
-qed_aio_complete(acb, ret);
-return;
-}
-
-index = qed_l1_index(s, acb->cur_pos);
-s->l1_table->offsets[index] = acb->request.l2_table->offset;
-
-ret = qed_write_l1_table(s, index, 1);
-qed_commit_l2_update(acb, ret);
-}
 
 /**
  * Update L2 table with new cluster offsets and write them out
-- 
1.8.3.1

[Qemu-block] [PULL 37/61] qed: Remove callback from qed_write_table()

2017-06-23 Thread Kevin Wolf

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed-table.c | 47 ---
 block/qed.c   | 12 +++-
 block/qed.h   |  8 +++-
 3 files changed, 22 insertions(+), 45 deletions(-)

diff --git a/block/qed-table.c b/block/qed-table.c
index 0cc93a7..ebee2c5 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -61,12 +61,9 @@ out:
  * @index:  Index of first element
  * @n:  Number of elements
  * @flush:  Whether or not to sync to disk
- * @cb: Completion function
- * @opaque: Argument for completion function
  */
-static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
-unsigned int index, unsigned int n, bool flush,
-BlockCompletionFunc *cb, void *opaque)
+static int qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
+   unsigned int index, unsigned int n, bool flush)
 {
 unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1;
 unsigned int start, end, i;
@@ -118,15 +115,7 @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
 ret = 0;
 out:
 qemu_vfree(new_table);
-cb(opaque, ret);
-}
-
-/**
- * Propagate return value from async callback
- */
-static void qed_sync_cb(void *opaque, int ret)
-{
-*(int *)opaque = ret;
+return ret;
 }
 
 int qed_read_l1_table_sync(BDRVQEDState *s)
@@ -134,23 +123,17 @@ int qed_read_l1_table_sync(BDRVQEDState *s)
 return qed_read_table(s, s->header.l1_table_offset, s->l1_table);
 }
 
-void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n,
-BlockCompletionFunc *cb, void *opaque)
+int qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n)
 {
 BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE);
-qed_write_table(s, s->header.l1_table_offset,
-s->l1_table, index, n, false, cb, opaque);
+return qed_write_table(s, s->header.l1_table_offset,
+   s->l1_table, index, n, false);
 }
 
 int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
 unsigned int n)
 {
-int ret = -EINPROGRESS;
-
-qed_write_l1_table(s, index, n, qed_sync_cb, &ret);
-BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
-
-return ret;
+return qed_write_l1_table(s, index, n);
 }
 
 int qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset)
@@ -197,22 +180,16 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset
 return qed_read_l2_table(s, request, offset);
 }
 
-void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
-unsigned int index, unsigned int n, bool flush,
-BlockCompletionFunc *cb, void *opaque)
+int qed_write_l2_table(BDRVQEDState *s, QEDRequest *request,
+   unsigned int index, unsigned int n, bool flush)
 {
 BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE);
-qed_write_table(s, request->l2_table->offset,
-request->l2_table->table, index, n, flush, cb, opaque);
+return qed_write_table(s, request->l2_table->offset,
+   request->l2_table->table, index, n, flush);
 }
 
 int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
 unsigned int index, unsigned int n, bool flush)
 {
-int ret = -EINPROGRESS;
-
-qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret);
-BDRV_POLL_WHILE(s->bs, ret == -EINPROGRESS);
-
-return ret;
+return qed_write_l2_table(s, request, index, n, flush);
 }
diff --git a/block/qed.c b/block/qed.c
index 95f1050..8c493bb 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1000,7 +1000,8 @@ static void qed_aio_write_l1_update(void *opaque, int ret)
 index = qed_l1_index(s, acb->cur_pos);
 s->l1_table->offsets[index] = acb->request.l2_table->offset;
 
-qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb);
+ret = qed_write_l1_table(s, index, 1);
+qed_commit_l2_update(acb, ret);
 }
 
 /**
@@ -1027,12 +1028,13 @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
 
 if (need_alloc) {
 /* Write out the whole new L2 table */
-qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
-   qed_aio_write_l1_update, acb);
+ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
+qed_aio_write_l1_update(acb, ret);
 } else {
 /* Write out only the updated part of the L2 table */
-qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
-   qed_aio_next_io_cb, acb);
+ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
+ false);
+qed_aio_next_io(acb, ret);
 }

[Qemu-block] [PULL 26/61] qed: Make qed_read_table() synchronous

2017-06-23 Thread Kevin Wolf

Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed-table.c | 56 ++-
 1 file changed, 18 insertions(+), 38 deletions(-)

diff --git a/block/qed-table.c b/block/qed-table.c
index b12c298..f330538 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -18,59 +18,39 @@
 #include "qed.h"
 #include "qemu/bswap.h"
 
-typedef struct {
-GenericCB gencb;
-BDRVQEDState *s;
-QEDTable *table;
-
-struct iovec iov;
+static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
+   BlockCompletionFunc *cb, void *opaque)
+{
 QEMUIOVector qiov;
-} QEDReadTableCB;
+int noffsets;
+int i, ret;
 
-static void qed_read_table_cb(void *opaque, int ret)
-{
-QEDReadTableCB *read_table_cb = opaque;
-QEDTable *table = read_table_cb->table;
-BDRVQEDState *s = read_table_cb->s;
-int noffsets = read_table_cb->qiov.size / sizeof(uint64_t);
-int i;
+struct iovec iov = {
+.iov_base = table->offsets,
+.iov_len = s->header.cluster_size * s->header.table_size,
+};
+qemu_iovec_init_external(&qiov, &iov, 1);
 
-/* Handle I/O error */
-if (ret) {
+trace_qed_read_table(s, offset, table);
+
+ret = bdrv_preadv(s->bs->file, offset, &qiov);
+if (ret < 0) {
 goto out;
 }
 
 /* Byteswap offsets */
 qed_acquire(s);
+noffsets = qiov.size / sizeof(uint64_t);
 for (i = 0; i < noffsets; i++) {
 table->offsets[i] = le64_to_cpu(table->offsets[i]);
 }
 qed_release(s);
 
+ret = 0;
 out:
 /* Completion */
-trace_qed_read_table_cb(s, read_table_cb->table, ret);
-gencb_complete(&read_table_cb->gencb, ret);
-}
-
-static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
-   BlockCompletionFunc *cb, void *opaque)
-{
-QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb),
-cb, opaque);
-QEMUIOVector *qiov = &read_table_cb->qiov;
-
-trace_qed_read_table(s, offset, table);
-
-read_table_cb->s = s;
-read_table_cb->table = table;
-read_table_cb->iov.iov_base = table->offsets,
-read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size,
-
-qemu_iovec_init_external(qiov, &read_table_cb->iov, 1);
-bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov,
-   qiov->size / BDRV_SECTOR_SIZE,
-   qed_read_table_cb, read_table_cb);
+trace_qed_read_table_cb(s, table, ret);
+cb(opaque, ret);
 }
 
 typedef struct {
-- 
1.8.3.1

[Qemu-block] [PULL 21/61] qcow2: Allow reading both COW regions with only one request

2017-06-23 Thread Kevin Wolf

From: Alberto Garcia 

Reading both COW regions requires two separate requests, but it's
perfectly possible to merge them and perform only one. This generally
improves performance, particularly on rotating disk drives. The
downside is that the data in the middle region is read but discarded.

This patch takes a conservative approach and only merges reads when
the size of the middle region is <= 16KB.

Signed-off-by: Alberto Garcia 
Reviewed-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 block/qcow2-cluster.c | 51 ++-
 1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 3c9ace8..20fb531 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -777,20 +777,38 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta 
*m)
 Qcow2COWRegion *start = &m->cow_start;
 Qcow2COWRegion *end = &m->cow_end;
 unsigned buffer_size;
+unsigned data_bytes = end->offset - (start->offset + start->nb_bytes);
+bool merge_reads;
 uint8_t *start_buffer, *end_buffer;
 int ret;
 
 assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
+assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes);
+assert(start->offset + start->nb_bytes <= end->offset);
 
 if (start->nb_bytes == 0 && end->nb_bytes == 0) {
 return 0;
 }
 
-/* Reserve a buffer large enough to store the data from both the
- * start and end COW regions. Add some padding in the middle if
- * necessary to make sure that the end region is optimally aligned */
-buffer_size = QEMU_ALIGN_UP(start->nb_bytes, bdrv_opt_mem_align(bs)) +
-end->nb_bytes;
+/* If we have to read both the start and end COW regions and the
+ * middle region is not too large then perform just one read
+ * operation */
+merge_reads = start->nb_bytes && end->nb_bytes && data_bytes <= 16384;
+if (merge_reads) {
+buffer_size = start->nb_bytes + data_bytes + end->nb_bytes;
+} else {
+/* If we have to do two reads, add some padding in the middle
+ * if necessary to make sure that the end region is optimally
+ * aligned. */
+size_t align = bdrv_opt_mem_align(bs);
+assert(align > 0 && align <= UINT_MAX);
+assert(QEMU_ALIGN_UP(start->nb_bytes, align) <=
+   UINT_MAX - end->nb_bytes);
+buffer_size = QEMU_ALIGN_UP(start->nb_bytes, align) + end->nb_bytes;
+}
+
+/* Reserve a buffer large enough to store all the data that we're
+ * going to read */
 start_buffer = qemu_try_blockalign(bs, buffer_size);
 if (start_buffer == NULL) {
 return -ENOMEM;
@@ -799,15 +817,22 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta 
*m)
 end_buffer = start_buffer + buffer_size - end->nb_bytes;
 
 qemu_co_mutex_unlock(&s->lock);
-/* First we read the existing data from both COW regions */
-ret = do_perform_cow_read(bs, m->offset, start->offset,
-  start_buffer, start->nb_bytes);
-if (ret < 0) {
-goto fail;
-}
+/* First we read the existing data from both COW regions. We
+ * either read the whole region in one go, or the start and end
+ * regions separately. */
+if (merge_reads) {
+ret = do_perform_cow_read(bs, m->offset, start->offset,
+  start_buffer, buffer_size);
+} else {
+ret = do_perform_cow_read(bs, m->offset, start->offset,
+  start_buffer, start->nb_bytes);
+if (ret < 0) {
+goto fail;
+}
 
-ret = do_perform_cow_read(bs, m->offset, end->offset,
-  end_buffer, end->nb_bytes);
+ret = do_perform_cow_read(bs, m->offset, end->offset,
+  end_buffer, end->nb_bytes);
+}
 if (ret < 0) {
 goto fail;
 }
-- 
1.8.3.1

[Qemu-block] [PULL 36/61] qed: Remove GenericCB

2017-06-23 Thread Kevin Wolf

The GenericCB infrastructure isn't used any more. Remove it.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/Makefile.objs |  2 +-
 block/qed-gencb.c   | 33 -
 block/qed.h | 11 ---
 3 files changed, 1 insertion(+), 45 deletions(-)
 delete mode 100644 block/qed-gencb.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index ea95530..f9368b5 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -1,6 +1,6 @@
 block-obj-y += raw-format.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o 
dmg.o
 block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o 
qcow2-cache.o
-block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
+block-obj-y += qed.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
 block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o
 block-obj-y += quorum.o
diff --git a/block/qed-gencb.c b/block/qed-gencb.c
deleted file mode 100644
index faf8ecc..000
--- a/block/qed-gencb.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * QEMU Enhanced Disk Format
- *
- * Copyright IBM, Corp. 2010
- *
- * Authors:
- *  Stefan Hajnoczi   
- *
- * This work is licensed under the terms of the GNU LGPL, version 2 or later.
- * See the COPYING.LIB file in the top-level directory.
- *
- */
-
-#include "qemu/osdep.h"
-#include "qed.h"
-
-void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque)
-{
-GenericCB *gencb = g_malloc(len);
-gencb->cb = cb;
-gencb->opaque = opaque;
-return gencb;
-}
-
-void gencb_complete(void *opaque, int ret)
-{
-GenericCB *gencb = opaque;
-BlockCompletionFunc *cb = gencb->cb;
-void *user_opaque = gencb->opaque;
-
-g_free(gencb);
-cb(user_opaque, ret);
-}
diff --git a/block/qed.h b/block/qed.h
index 6ab5702..46843c4 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -202,17 +202,6 @@ void qed_acquire(BDRVQEDState *s);
 void qed_release(BDRVQEDState *s);
 
 /**
- * Generic callback for chaining async callbacks
- */
-typedef struct {
-BlockCompletionFunc *cb;
-void *opaque;
-} GenericCB;
-
-void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque);
-void gencb_complete(void *opaque, int ret);
-
-/**
  * Header functions
  */
 int qed_write_header_sync(BDRVQEDState *s);
-- 
1.8.3.1

[Qemu-block] [PULL 31/61] qed: Make qed_copy_from_backing_file() synchronous

2017-06-23 Thread Kevin Wolf

Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 78 +++--
 1 file changed, 29 insertions(+), 49 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 1105f19..af53b8f 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -854,44 +854,6 @@ static int qed_read_backing_file(BDRVQEDState *s, uint64_t 
pos,
 return 0;
 }
 
-typedef struct {
-GenericCB gencb;
-BDRVQEDState *s;
-QEMUIOVector qiov;
-QEMUIOVector *backing_qiov;
-struct iovec iov;
-uint64_t offset;
-} CopyFromBackingFileCB;
-
-static void qed_copy_from_backing_file_cb(void *opaque, int ret)
-{
-CopyFromBackingFileCB *copy_cb = opaque;
-qemu_vfree(copy_cb->iov.iov_base);
-gencb_complete(©_cb->gencb, ret);
-}
-
-static void qed_copy_from_backing_file_write(void *opaque, int ret)
-{
-CopyFromBackingFileCB *copy_cb = opaque;
-BDRVQEDState *s = copy_cb->s;
-
-if (copy_cb->backing_qiov) {
-qemu_iovec_destroy(copy_cb->backing_qiov);
-g_free(copy_cb->backing_qiov);
-copy_cb->backing_qiov = NULL;
-}
-
-if (ret) {
-qed_copy_from_backing_file_cb(copy_cb, ret);
-return;
-}
-
-BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
-bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE,
-©_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE,
-qed_copy_from_backing_file_cb, copy_cb);
-}
-
 /**
  * Copy data from backing file into the image
  *
@@ -907,7 +869,9 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, 
uint64_t pos,
BlockCompletionFunc *cb,
void *opaque)
 {
-CopyFromBackingFileCB *copy_cb;
+QEMUIOVector qiov;
+QEMUIOVector *backing_qiov = NULL;
+struct iovec iov;
 int ret;
 
 /* Skip copy entirely if there is no work to do */
@@ -916,17 +880,33 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, 
uint64_t pos,
 return;
 }
 
-copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque);
-copy_cb->s = s;
-copy_cb->offset = offset;
-copy_cb->backing_qiov = NULL;
-copy_cb->iov.iov_base = qemu_blockalign(s->bs, len);
-copy_cb->iov.iov_len = len;
-qemu_iovec_init_external(©_cb->qiov, ©_cb->iov, 1);
+iov = (struct iovec) {
+.iov_base = qemu_blockalign(s->bs, len),
+.iov_len = len,
+};
+qemu_iovec_init_external(&qiov, &iov, 1);
+
+ret = qed_read_backing_file(s, pos, &qiov, &backing_qiov);
+
+if (backing_qiov) {
+qemu_iovec_destroy(backing_qiov);
+g_free(backing_qiov);
+backing_qiov = NULL;
+}
+
+if (ret) {
+goto out;
+}
 
-ret = qed_read_backing_file(s, pos, ©_cb->qiov,
-©_cb->backing_qiov);
-qed_copy_from_backing_file_write(copy_cb, ret);
+BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
+ret = bdrv_pwritev(s->bs->file, offset, &qiov);
+if (ret < 0) {
+goto out;
+}
+ret = 0;
+out:
+qemu_vfree(iov.iov_base);
+cb(opaque, ret);
 }
 
 /**
-- 
1.8.3.1

[Qemu-block] [PULL 35/61] qed: Make qed_write_table() synchronous

2017-06-23 Thread Kevin Wolf

Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed-table.c | 84 ---
 1 file changed, 30 insertions(+), 54 deletions(-)

diff --git a/block/qed-table.c b/block/qed-table.c
index ffecbea..0cc93a7 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -52,46 +52,6 @@ out:
 return ret;
 }
 
-typedef struct {
-GenericCB gencb;
-BDRVQEDState *s;
-QEDTable *orig_table;
-QEDTable *table;
-bool flush; /* flush after write? */
-
-struct iovec iov;
-QEMUIOVector qiov;
-} QEDWriteTableCB;
-
-static void qed_write_table_cb(void *opaque, int ret)
-{
-QEDWriteTableCB *write_table_cb = opaque;
-BDRVQEDState *s = write_table_cb->s;
-
-trace_qed_write_table_cb(s,
- write_table_cb->orig_table,
- write_table_cb->flush,
- ret);
-
-if (ret) {
-goto out;
-}
-
-if (write_table_cb->flush) {
-/* We still need to flush first */
-write_table_cb->flush = false;
-qed_acquire(s);
-bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb,
-   write_table_cb);
-qed_release(s);
-return;
-}
-
-out:
-qemu_vfree(write_table_cb->table);
-gencb_complete(&write_table_cb->gencb, ret);
-}
-
 /**
  * Write out an updated part or all of a table
  *
@@ -108,10 +68,13 @@ static void qed_write_table(BDRVQEDState *s, uint64_t 
offset, QEDTable *table,
 unsigned int index, unsigned int n, bool flush,
 BlockCompletionFunc *cb, void *opaque)
 {
-QEDWriteTableCB *write_table_cb;
 unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1;
 unsigned int start, end, i;
+QEDTable *new_table;
+struct iovec iov;
+QEMUIOVector qiov;
 size_t len_bytes;
+int ret;
 
 trace_qed_write_table(s, offset, table, index, n);
 
@@ -121,28 +84,41 @@ static void qed_write_table(BDRVQEDState *s, uint64_t 
offset, QEDTable *table,
 
 len_bytes = (end - start) * sizeof(uint64_t);
 
-write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque);
-write_table_cb->s = s;
-write_table_cb->orig_table = table;
-write_table_cb->flush = flush;
-write_table_cb->table = qemu_blockalign(s->bs, len_bytes);
-write_table_cb->iov.iov_base = write_table_cb->table->offsets;
-write_table_cb->iov.iov_len = len_bytes;
-qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1);
+new_table = qemu_blockalign(s->bs, len_bytes);
+iov = (struct iovec) {
+.iov_base = new_table->offsets,
+.iov_len = len_bytes,
+};
+qemu_iovec_init_external(&qiov, &iov, 1);
 
 /* Byteswap table */
 for (i = start; i < end; i++) {
 uint64_t le_offset = cpu_to_le64(table->offsets[i]);
-write_table_cb->table->offsets[i - start] = le_offset;
+new_table->offsets[i - start] = le_offset;
 }
 
 /* Adjust for offset into table */
 offset += start * sizeof(uint64_t);
 
-bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
-&write_table_cb->qiov,
-write_table_cb->qiov.size / BDRV_SECTOR_SIZE,
-qed_write_table_cb, write_table_cb);
+ret = bdrv_pwritev(s->bs->file, offset, &qiov);
+trace_qed_write_table_cb(s, table, flush, ret);
+if (ret < 0) {
+goto out;
+}
+
+if (flush) {
+qed_acquire(s);
+ret = bdrv_flush(s->bs);
+qed_release(s);
+if (ret < 0) {
+goto out;
+}
+}
+
+ret = 0;
+out:
+qemu_vfree(new_table);
+cb(opaque, ret);
 }
 
 /**
-- 
1.8.3.1

[Qemu-block] [PULL 23/61] qcow2: Merge the writing of the COW regions with the guest data

2017-06-23 Thread Kevin Wolf

From: Alberto Garcia 

If the guest tries to write data that results in the allocation of a
new cluster, instead of writing the guest data first and then the data
from the COW regions, write everything together using one single I/O
operation.

This can improve the write performance by 25% or more, depending on
several factors such as the media type, the cluster size and the I/O
request size.

Signed-off-by: Alberto Garcia 
Reviewed-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 block/qcow2-cluster.c | 40 
 block/qcow2.c | 64 +++
 block/qcow2.h |  7 ++
 3 files changed, 91 insertions(+), 20 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 3ac26d6..01f2101 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -776,6 +776,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
 assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
 assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes);
 assert(start->offset + start->nb_bytes <= end->offset);
+assert(!m->data_qiov || m->data_qiov->size == data_bytes);
 
 if (start->nb_bytes == 0 && end->nb_bytes == 0) {
 return 0;
@@ -807,7 +808,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
 /* The part of the buffer where the end region is located */
 end_buffer = start_buffer + buffer_size - end->nb_bytes;
 
-qemu_iovec_init(&qiov, 1);
+qemu_iovec_init(&qiov, 2 + (m->data_qiov ? m->data_qiov->niov : 0));
 
 qemu_co_mutex_unlock(&s->lock);
 /* First we read the existing data from both COW regions. We
@@ -842,17 +843,36 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta 
*m)
 }
 }
 
-/* And now we can write everything */
-qemu_iovec_reset(&qiov);
-qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
-ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
-if (ret < 0) {
-goto fail;
+/* And now we can write everything. If we have the guest data we
+ * can write everything in one single operation */
+if (m->data_qiov) {
+qemu_iovec_reset(&qiov);
+if (start->nb_bytes) {
+qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
+}
+qemu_iovec_concat(&qiov, m->data_qiov, 0, data_bytes);
+if (end->nb_bytes) {
+qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
+}
+/* NOTE: we have a write_aio blkdebug event here followed by
+ * a cow_write one in do_perform_cow_write(), but there's only
+ * one single I/O operation */
+BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
+ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
+} else {
+/* If there's no guest data then write both COW regions separately */
+qemu_iovec_reset(&qiov);
+qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
+ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
+if (ret < 0) {
+goto fail;
+}
+
+qemu_iovec_reset(&qiov);
+qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
+ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
 }
 
-qemu_iovec_reset(&qiov);
-qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
-ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
 fail:
 qemu_co_mutex_lock(&s->lock);
 
diff --git a/block/qcow2.c b/block/qcow2.c
index b3ba5da..328b1d4 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1575,6 +1575,44 @@ fail:
 return ret;
 }
 
+/* Check if it's possible to merge a write request with the writing of
+ * the data from the COW regions */
+static bool merge_cow(uint64_t offset, unsigned bytes,
+  QEMUIOVector *hd_qiov, QCowL2Meta *l2meta)
+{
+QCowL2Meta *m;
+
+for (m = l2meta; m != NULL; m = m->next) {
+/* If both COW regions are empty then there's nothing to merge */
+if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
+continue;
+}
+
+/* The data (middle) region must be immediately after the
+ * start region */
+if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
+continue;
+}
+
+/* The end region must be immediately after the data (middle)
+ * region */
+if (m->offset + m->cow_end.offset != offset + bytes) {
+continue;
+}
+
+/* Make sure that adding both COW regions to the QEMUIOVector
+ * does not exceed IOV_MAX */
+if (hd_qiov->niov > IOV_MAX - 2) {
+continue;
+}
+
+m->data_qiov = hd_qiov;
+return true;
+}
+
+return false;
+}
+
 static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
  uint64_t bytes, QEMUIOVec

[Qemu-block] [PULL 16/61] nvme: Add support for Read Data and Write Data in CMBs.

2017-06-23 Thread Kevin Wolf

From: Stephen Bates 

Add the ability for the NVMe model to support both the RDS and WDS
modes in the Controller Memory Buffer.

Although not currently supported in the upstreamed Linux kernel, a fork
with support exists [1] and user-space test programs that build on
this also exist [2].

Useful for testing CMB functionality in preparation for real CMB
enabled NVMe devices (coming soon).

[1] https://github.com/sbates130272/linux-p2pmem
[2] https://github.com/sbates130272/p2pmem-test

Signed-off-by: Stephen Bates 
Reviewed-by: Logan Gunthorpe 
Reviewed-by: Keith Busch 
Signed-off-by: Kevin Wolf 
---
 hw/block/nvme.c | 83 +++--
 hw/block/nvme.h |  1 +
 2 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 381dc7c..6071dc1 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -21,7 +21,7 @@
  *  cmb_size_mb=
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
- * offset 0 in BAR2 and supports SQS only for now.
+ * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
  */
 
 #include "qemu/osdep.h"
@@ -93,8 +93,8 @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
 }
 }
 
-static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
-uint32_t len, NvmeCtrl *n)
+static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
+ uint64_t prp2, uint32_t len, NvmeCtrl *n)
 {
 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
 trans_len = MIN(len, trans_len);
@@ -102,10 +102,15 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t 
prp1, uint64_t prp2,
 
 if (!prp1) {
 return NVME_INVALID_FIELD | NVME_DNR;
+} else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
+   prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
+qsg->nsg = 0;
+qemu_iovec_init(iov, num_prps);
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], 
trans_len);
+} else {
+pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
+qemu_sglist_add(qsg, prp1, trans_len);
 }
-
-pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
-qemu_sglist_add(qsg, prp1, trans_len);
 len -= trans_len;
 if (len) {
 if (!prp2) {
@@ -118,7 +123,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t 
prp1, uint64_t prp2,
 
 nents = (len + n->page_size - 1) >> n->page_bits;
 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
-pci_dma_read(&n->parent_obj, prp2, (void *)prp_list, prp_trans);
+nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
 while (len != 0) {
 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
 
@@ -130,7 +135,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t 
prp1, uint64_t prp2,
 i = 0;
 nents = (len + n->page_size - 1) >> n->page_bits;
 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
-pci_dma_read(&n->parent_obj, prp_ent, (void *)prp_list,
+nvme_addr_read(n, prp_ent, (void *)prp_list,
 prp_trans);
 prp_ent = le64_to_cpu(prp_list[i]);
 }
@@ -140,7 +145,11 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t 
prp1, uint64_t prp2,
 }
 
 trans_len = MIN(len, n->page_size);
-qemu_sglist_add(qsg, prp_ent, trans_len);
+if (qsg->nsg){
+qemu_sglist_add(qsg, prp_ent, trans_len);
+} else {
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - 
n->ctrl_mem.addr], trans_len);
+}
 len -= trans_len;
 i++;
 }
@@ -148,7 +157,11 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t 
prp1, uint64_t prp2,
 if (prp2 & (n->page_size - 1)) {
 goto unmap;
 }
-qemu_sglist_add(qsg, prp2, len);
+if (qsg->nsg) {
+qemu_sglist_add(qsg, prp2, len);
+} else {
+qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - 
n->ctrl_mem.addr], trans_len);
+}
 }
 }
 return NVME_SUCCESS;
@@ -162,16 +175,24 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t 
*ptr, uint32_t len,
 uint64_t prp1, uint64_t prp2)
 {
 QEMUSGList qsg;
+QEMUIOVector iov;
+uint16_t status = NVME_SUCCESS;
 
-if (nvme_map_prp(&qsg, prp1, prp2, len, n)) {
+if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
 return NVME_INVALID_FIELD | NVME_DNR;
 }
-if (dma_buf_read(ptr, len, &qsg)) {
+if (qsg.nsg > 0) {
+if (dma_buf_read(ptr, len, &qsg)) {
+status = NVME_INVALID_FIELD | NVME_DNR;
+}
 qemu_sglist_destroy(&qsg);
-return

[Qemu-block] [PULL 24/61] qcow2: Use offset_into_cluster() and offset_to_l2_index()

2017-06-23 Thread Kevin Wolf

From: Alberto Garcia 

We already have functions for doing these calculations, so let's use
them instead of doing everything by hand. This makes the code a bit
more readable.

Signed-off-by: Alberto Garcia 
Signed-off-by: Kevin Wolf 
---
 block/qcow2-cluster.c | 4 ++--
 block/qcow2.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 01f2101..3d341fd 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -556,7 +556,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t 
offset,
 
 /* find the cluster offset for the given disk offset */
 
-l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+l2_index = offset_to_l2_index(s, offset);
 *cluster_offset = be64_to_cpu(l2_table[l2_index]);
 
 nb_clusters = size_to_clusters(s, bytes_needed);
@@ -693,7 +693,7 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t 
offset,
 
 /* find the cluster offset for the given disk offset */
 
-l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+l2_index = offset_to_l2_index(s, offset);
 
 *new_l2_table = l2_table;
 *new_l2_index = l2_index;
diff --git a/block/qcow2.c b/block/qcow2.c
index 328b1d4..088ffe1 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -356,7 +356,7 @@ static int validate_table_offset(BlockDriverState *bs, 
uint64_t offset,
 }
 
 /* Tables must be cluster aligned */
-if (offset & (s->cluster_size - 1)) {
+if (offset_into_cluster(s, offset) != 0) {
 return -EINVAL;
 }
 
-- 
1.8.3.1

[Qemu-block] [PULL 20/61] qcow2: Split do_perform_cow() into _read(), _encrypt() and _write()

2017-06-23 Thread Kevin Wolf

From: Alberto Garcia 

This patch splits do_perform_cow() into three separate functions to
read, encrypt and write the COW regions.

perform_cow() can now read both regions first, then encrypt them and
finally write them to disk. The memory allocation is also done in
this function now, using one single buffer large enough to hold both
regions.

Signed-off-by: Alberto Garcia 
Reviewed-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 block/qcow2-cluster.c | 117 +-
 1 file changed, 87 insertions(+), 30 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 4c03639..3c9ace8 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -403,34 +403,26 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t 
sector_num,
 return 0;
 }
 
-static int coroutine_fn do_perform_cow(BlockDriverState *bs,
-   uint64_t src_cluster_offset,
-   uint64_t cluster_offset,
-   unsigned offset_in_cluster,
-   unsigned bytes)
+static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
+uint64_t src_cluster_offset,
+unsigned offset_in_cluster,
+uint8_t *buffer,
+unsigned bytes)
 {
-BDRVQcow2State *s = bs->opaque;
 QEMUIOVector qiov;
-struct iovec iov;
+struct iovec iov = { .iov_base = buffer, .iov_len = bytes };
 int ret;
 
 if (bytes == 0) {
 return 0;
 }
 
-iov.iov_len = bytes;
-iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
-if (iov.iov_base == NULL) {
-return -ENOMEM;
-}
-
 qemu_iovec_init_external(&qiov, &iov, 1);
 
 BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
 
 if (!bs->drv) {
-ret = -ENOMEDIUM;
-goto out;
+return -ENOMEDIUM;
 }
 
 /* Call .bdrv_co_readv() directly instead of using the public block-layer
@@ -440,39 +432,63 @@ static int coroutine_fn do_perform_cow(BlockDriverState 
*bs,
 ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
   bytes, &qiov, 0);
 if (ret < 0) {
-goto out;
+return ret;
 }
 
-if (bs->encrypted) {
+return 0;
+}
+
+static bool coroutine_fn do_perform_cow_encrypt(BlockDriverState *bs,
+uint64_t src_cluster_offset,
+unsigned offset_in_cluster,
+uint8_t *buffer,
+unsigned bytes)
+{
+if (bytes && bs->encrypted) {
+BDRVQcow2State *s = bs->opaque;
 int64_t sector = (src_cluster_offset + offset_in_cluster)
  >> BDRV_SECTOR_BITS;
 assert(s->cipher);
 assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
 assert((bytes & ~BDRV_SECTOR_MASK) == 0);
-if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base,
+if (qcow2_encrypt_sectors(s, sector, buffer, buffer,
   bytes >> BDRV_SECTOR_BITS, true, NULL) < 0) {
-ret = -EIO;
-goto out;
+return false;
 }
 }
+return true;
+}
+
+static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
+ uint64_t cluster_offset,
+ unsigned offset_in_cluster,
+ uint8_t *buffer,
+ unsigned bytes)
+{
+QEMUIOVector qiov;
+struct iovec iov = { .iov_base = buffer, .iov_len = bytes };
+int ret;
+
+if (bytes == 0) {
+return 0;
+}
+
+qemu_iovec_init_external(&qiov, &iov, 1);
 
 ret = qcow2_pre_write_overlap_check(bs, 0,
 cluster_offset + offset_in_cluster, bytes);
 if (ret < 0) {
-goto out;
+return ret;
 }
 
 BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
 ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster,
   bytes, &qiov, 0);
 if (ret < 0) {
-goto out;
+return ret;
 }
 
-ret = 0;
-out:
-qemu_vfree(iov.iov_base);
-return ret;
+return 0;
 }
 
 
@@ -760,22 +776,62 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta 
*m)
 BDRVQcow2State *s = bs->opaque;
 Qcow2COWRegion *start = &m->cow_start;
 Qcow2COWRegion *end = &m->cow_end;
+unsigned buffer_size;
+uint8_t *start_buffer, *end_buffer;
 int ret;
 
+assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
+
 if (start->nb_bytes == 0 && end->nb_bytes == 0) {
 return 0;
 }
 
+/* Reserve a buffer large enough to

[Qemu-block] [PULL 34/61] qed: Remove callback from qed_write_header()

2017-06-23 Thread Kevin Wolf

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 32 
 1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 2665efc..95f1050 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -98,8 +98,7 @@ int qed_write_header_sync(BDRVQEDState *s)
  * This function only updates known header fields in-place and does not affect
  * extra data after the QED header.
  */
-static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb,
- void *opaque)
+static int qed_write_header(BDRVQEDState *s)
 {
 /* We must write full sectors for O_DIRECT but cannot necessarily generate
  * the data following the header if an unrecognized compat feature is
@@ -137,7 +136,7 @@ static void qed_write_header(BDRVQEDState *s, 
BlockCompletionFunc cb,
 ret = 0;
 out:
 qemu_vfree(buf);
-cb(opaque, ret);
+return ret;
 }
 
 static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
@@ -289,21 +288,6 @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState 
*s)
 }
 }
 
-static void qed_finish_clear_need_check(void *opaque, int ret)
-{
-/* Do nothing */
-}
-
-static void qed_flush_after_clear_need_check(void *opaque, int ret)
-{
-BDRVQEDState *s = opaque;
-
-bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s);
-
-/* No need to wait until flush completes */
-qed_unplug_allocating_write_reqs(s);
-}
-
 static void qed_clear_need_check(void *opaque, int ret)
 {
 BDRVQEDState *s = opaque;
@@ -314,7 +298,13 @@ static void qed_clear_need_check(void *opaque, int ret)
 }
 
 s->header.features &= ~QED_F_NEED_CHECK;
-qed_write_header(s, qed_flush_after_clear_need_check, s);
+ret = qed_write_header(s);
+(void) ret;
+
+qed_unplug_allocating_write_reqs(s);
+
+ret = bdrv_flush(s->bs);
+(void) ret;
 }
 
 static void qed_need_check_timer_cb(void *opaque)
@@ -1179,6 +1169,7 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 {
 BDRVQEDState *s = acb_to_s(acb);
 BlockCompletionFunc *cb;
+int ret;
 
 /* Cancel timer when the first allocating request comes in */
 if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) {
@@ -1213,7 +1204,8 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 
 if (qed_should_set_need_check(s)) {
 s->header.features |= QED_F_NEED_CHECK;
-qed_write_header(s, cb, acb);
+ret = qed_write_header(s);
+cb(acb, ret);
 } else {
 cb(acb, 0);
 }
-- 
1.8.3.1

[Qemu-block] [PULL 15/61] qemu-iotests: 068: test iothread mode

2017-06-23 Thread Kevin Wolf

From: Stefan Hajnoczi 

Perform the savevm/loadvm test with both iothread on and off.  This
covers the recently found savevm/loadvm hang when iothread is enabled.

Signed-off-by: Stefan Hajnoczi 
Signed-off-by: Kevin Wolf 
---
 tests/qemu-iotests/068 | 23 ++-
 tests/qemu-iotests/068.out | 11 ++-
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/tests/qemu-iotests/068 b/tests/qemu-iotests/068
index 7292643..3801b65 100755
--- a/tests/qemu-iotests/068
+++ b/tests/qemu-iotests/068
@@ -45,11 +45,6 @@ _supported_os Linux
 IMGOPTS="compat=1.1"
 IMG_SIZE=128K
 
-echo
-echo "=== Saving and reloading a VM state to/from a qcow2 image ==="
-echo
-_make_test_img $IMG_SIZE
-
 case "$QEMU_DEFAULT_MACHINE" in
   s390-ccw-virtio)
   platform_parm="-no-shutdown"
@@ -71,10 +66,20 @@ _qemu()
 _filter_qemu | _filter_hmp
 }
 
-# Give qemu some time to boot before saving the VM state
-bash -c 'sleep 1; echo -e "savevm 0\nquit"' | _qemu
-# Now try to continue from that VM state (this should just work)
-echo quit | _qemu -loadvm 0
+for extra_args in \
+"" \
+"-object iothread,id=iothread0 -set device.hba0.iothread=iothread0"; do
+echo
+echo "=== Saving and reloading a VM state to/from a qcow2 image 
($extra_args) ==="
+echo
+
+_make_test_img $IMG_SIZE
+
+# Give qemu some time to boot before saving the VM state
+bash -c 'sleep 1; echo -e "savevm 0\nquit"' | _qemu $extra_args
+# Now try to continue from that VM state (this should just work)
+echo quit | _qemu $extra_args -loadvm 0
+done
 
 # success, all done
 echo "*** done"
diff --git a/tests/qemu-iotests/068.out b/tests/qemu-iotests/068.out
index 0fa5340..aa063cf 100644
--- a/tests/qemu-iotests/068.out
+++ b/tests/qemu-iotests/068.out
@@ -1,6 +1,15 @@
 QA output created by 068
 
-=== Saving and reloading a VM state to/from a qcow2 image ===
+=== Saving and reloading a VM state to/from a qcow2 image () ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=131072
+QEMU X.Y.Z monitor - type 'help' for more information
+(qemu) savevm 0
+(qemu) quit
+QEMU X.Y.Z monitor - type 'help' for more information
+(qemu) quit
+
+=== Saving and reloading a VM state to/from a qcow2 image (-object 
iothread,id=iothread0 -set device.hba0.iothread=iothread0) ===
 
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=131072
 QEMU X.Y.Z monitor - type 'help' for more information
-- 
1.8.3.1

[Qemu-block] [PULL 18/61] qcow2: Use unsigned int for both members of Qcow2COWRegion

2017-06-23 Thread Kevin Wolf

From: Alberto Garcia 

Qcow2COWRegion has two attributes:

- The offset of the COW region from the start of the first cluster
  touched by the I/O request. Since it's always going to be positive
  and the maximum request size is at most INT_MAX, we can use a
  regular unsigned int to store this offset.

- The size of the COW region in bytes. This is guaranteed to be >= 0,
  so we should use an unsigned type instead.

In x86_64 this reduces the size of Qcow2COWRegion from 16 to 8 bytes.
It will also help keep some assertions simpler now that we know that
there are no negative numbers.

The prototype of do_perform_cow() is also updated to reflect these
changes.

Signed-off-by: Alberto Garcia 
Reviewed-by: Eric Blake 
Reviewed-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 block/qcow2-cluster.c | 4 ++--
 block/qcow2.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index d1c419f..a86c5a7 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -406,8 +406,8 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t 
sector_num,
 static int coroutine_fn do_perform_cow(BlockDriverState *bs,
uint64_t src_cluster_offset,
uint64_t cluster_offset,
-   int offset_in_cluster,
-   int bytes)
+   unsigned offset_in_cluster,
+   unsigned bytes)
 {
 BDRVQcow2State *s = bs->opaque;
 QEMUIOVector qiov;
diff --git a/block/qcow2.h b/block/qcow2.h
index 1801dc3..c26ee0a 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -301,10 +301,10 @@ typedef struct Qcow2COWRegion {
  * Offset of the COW region in bytes from the start of the first cluster
  * touched by the request.
  */
-uint64_t offset;
+unsigned offset;
 
 /** Number of bytes to copy */
-int nb_bytes;
+unsigned nb_bytes;
 } Qcow2COWRegion;
 
 /**
-- 
1.8.3.1

[Qemu-block] [PULL 19/61] qcow2: Make perform_cow() call do_perform_cow() twice

2017-06-23 Thread Kevin Wolf

From: Alberto Garcia 

Instead of calling perform_cow() twice with a different COW region
each time, call it just once and make perform_cow() handle both
regions.

This patch simply moves code around. The next one will do the actual
reordering of the COW operations.

Signed-off-by: Alberto Garcia 
Reviewed-by: Eric Blake 
Reviewed-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 block/qcow2-cluster.c | 36 ++--
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index a86c5a7..4c03639 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -414,6 +414,10 @@ static int coroutine_fn do_perform_cow(BlockDriverState 
*bs,
 struct iovec iov;
 int ret;
 
+if (bytes == 0) {
+return 0;
+}
+
 iov.iov_len = bytes;
 iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
 if (iov.iov_base == NULL) {
@@ -751,31 +755,40 @@ uint64_t 
qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
 return cluster_offset;
 }
 
-static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
+static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
 {
 BDRVQcow2State *s = bs->opaque;
+Qcow2COWRegion *start = &m->cow_start;
+Qcow2COWRegion *end = &m->cow_end;
 int ret;
 
-if (r->nb_bytes == 0) {
+if (start->nb_bytes == 0 && end->nb_bytes == 0) {
 return 0;
 }
 
 qemu_co_mutex_unlock(&s->lock);
-ret = do_perform_cow(bs, m->offset, m->alloc_offset, r->offset, 
r->nb_bytes);
-qemu_co_mutex_lock(&s->lock);
-
+ret = do_perform_cow(bs, m->offset, m->alloc_offset,
+ start->offset, start->nb_bytes);
 if (ret < 0) {
-return ret;
+goto fail;
 }
 
+ret = do_perform_cow(bs, m->offset, m->alloc_offset,
+ end->offset, end->nb_bytes);
+
+fail:
+qemu_co_mutex_lock(&s->lock);
+
 /*
  * Before we update the L2 table to actually point to the new cluster, we
  * need to be sure that the refcounts have been increased and COW was
  * handled.
  */
-qcow2_cache_depends_on_flush(s->l2_table_cache);
+if (ret == 0) {
+qcow2_cache_depends_on_flush(s->l2_table_cache);
+}
 
-return 0;
+return ret;
 }
 
 int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
@@ -795,12 +808,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, 
QCowL2Meta *m)
 }
 
 /* copy content of unmodified sectors */
-ret = perform_cow(bs, m, &m->cow_start);
-if (ret < 0) {
-goto err;
-}
-
-ret = perform_cow(bs, m, &m->cow_end);
+ret = perform_cow(bs, m);
 if (ret < 0) {
 goto err;
 }
-- 
1.8.3.1

[Qemu-block] [PULL 17/61] qcow2: Remove unused Error variable in do_perform_cow()

2017-06-23 Thread Kevin Wolf

From: Alberto Garcia 

We are using the return value of qcow2_encrypt_sectors() to detect
problems but we are throwing away the returned Error since we have no
way to report it to the user. Therefore we can simply get rid of the
local Error variable and pass NULL instead.

Alternatively we could try to figure out a way to pass the original
error instead of simply returning -EIO, but that would be more
invasive, so let's keep the current approach.

Signed-off-by: Alberto Garcia 
Reviewed-by: Eric Blake 
Reviewed-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 block/qcow2-cluster.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index d779ea1..d1c419f 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -440,16 +440,14 @@ static int coroutine_fn do_perform_cow(BlockDriverState 
*bs,
 }
 
 if (bs->encrypted) {
-Error *err = NULL;
 int64_t sector = (src_cluster_offset + offset_in_cluster)
  >> BDRV_SECTOR_BITS;
 assert(s->cipher);
 assert((offset_in_cluster & ~BDRV_SECTOR_MASK) == 0);
 assert((bytes & ~BDRV_SECTOR_MASK) == 0);
 if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base,
-  bytes >> BDRV_SECTOR_BITS, true, &err) < 0) {
+  bytes >> BDRV_SECTOR_BITS, true, NULL) < 0) {
 ret = -EIO;
-error_free(err);
 goto out;
 }
 }
-- 
1.8.3.1

[Qemu-block] [PULL 14/61] qemu-iotests: 068: use -drive/-device instead of -hda

2017-06-23 Thread Kevin Wolf

From: Stefan Hajnoczi 

The legacy -hda option does not support -drive/-device parameters.  They
will be required by the next patch that extends this test case.

Signed-off-by: Stefan Hajnoczi 
Signed-off-by: Kevin Wolf 
---
 tests/qemu-iotests/068 | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/068 b/tests/qemu-iotests/068
index 61936d5..7292643 100755
--- a/tests/qemu-iotests/068
+++ b/tests/qemu-iotests/068
@@ -53,15 +53,20 @@ _make_test_img $IMG_SIZE
 case "$QEMU_DEFAULT_MACHINE" in
   s390-ccw-virtio)
   platform_parm="-no-shutdown"
+  hba=virtio-scsi-ccw
   ;;
   *)
   platform_parm=""
+  hba=virtio-scsi-pci
   ;;
 esac
 
 _qemu()
 {
-$QEMU $platform_parm -nographic -monitor stdio -serial none -hda 
"$TEST_IMG" \
+$QEMU $platform_parm -nographic -monitor stdio -serial none \
+  -drive if=none,id=drive0,file="$TEST_IMG",format="$IMGFMT" \
+  -device $hba,id=hba0 \
+  -device scsi-hd,drive=drive0 \
   "$@" |\
 _filter_qemu | _filter_hmp
 }
-- 
1.8.3.1

[Qemu-block] [PULL 13/61] qemu-iotests: 068: extract _qemu() function

2017-06-23 Thread Kevin Wolf

From: Stefan Hajnoczi 

Avoid duplicating the QEMU command-line.

Signed-off-by: Stefan Hajnoczi 
Signed-off-by: Kevin Wolf 
---
 tests/qemu-iotests/068 | 15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/tests/qemu-iotests/068 b/tests/qemu-iotests/068
index 9c1687d..61936d5 100755
--- a/tests/qemu-iotests/068
+++ b/tests/qemu-iotests/068
@@ -59,14 +59,17 @@ case "$QEMU_DEFAULT_MACHINE" in
   ;;
 esac
 
-# Give qemu some time to boot before saving the VM state
-bash -c 'sleep 1; echo -e "savevm 0\nquit"' |\
-$QEMU $platform_parm -nographic -monitor stdio -serial none -hda 
"$TEST_IMG" |\
+_qemu()
+{
+$QEMU $platform_parm -nographic -monitor stdio -serial none -hda 
"$TEST_IMG" \
+  "$@" |\
 _filter_qemu | _filter_hmp
+}
+
+# Give qemu some time to boot before saving the VM state
+bash -c 'sleep 1; echo -e "savevm 0\nquit"' | _qemu
 # Now try to continue from that VM state (this should just work)
-echo quit |\
-$QEMU $platform_parm -nographic -monitor stdio -serial none -hda 
"$TEST_IMG" -loadvm 0 |\
-_filter_qemu | _filter_hmp
+echo quit | _qemu -loadvm 0
 
 # success, all done
 echo "*** done"
-- 
1.8.3.1

[Qemu-block] [PULL 30/61] qed: Make qed_read_backing_file() synchronous

2017-06-23 Thread Kevin Wolf

Note that this code is generally not running in coroutine context, so
this is an actual blocking synchronous operation. We'll fix this in a
moment.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 32 ++--
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 290cbcd..1105f19 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -808,13 +808,13 @@ static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
  * This function reads qiov->size bytes starting at pos from the backing file.
  * If there is no backing file then zeroes are read.
  */
-static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
-  QEMUIOVector *qiov,
-  QEMUIOVector **backing_qiov,
-  BlockCompletionFunc *cb, void *opaque)
+static int qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
+ QEMUIOVector *qiov,
+ QEMUIOVector **backing_qiov)
 {
 uint64_t backing_length = 0;
 size_t size;
+int ret;
 
 /* If there is a backing file, get its length.  Treat the absence of a
  * backing file like a zero length backing file.
@@ -822,8 +822,7 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t 
pos,
 if (s->bs->backing) {
 int64_t l = bdrv_getlength(s->bs->backing->bs);
 if (l < 0) {
-cb(opaque, l);
-return;
+return l;
 }
 backing_length = l;
 }
@@ -836,8 +835,7 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t 
pos,
 
 /* Complete now if there are no backing file sectors to read */
 if (pos >= backing_length) {
-cb(opaque, 0);
-return;
+return 0;
 }
 
 /* If the read straddles the end of the backing file, shorten it */
@@ -849,8 +847,11 @@ static void qed_read_backing_file(BDRVQEDState *s, 
uint64_t pos,
 qemu_iovec_concat(*backing_qiov, qiov, 0, size);
 
 BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
-bdrv_aio_readv(s->bs->backing, pos / BDRV_SECTOR_SIZE,
-   *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque);
+ret = bdrv_preadv(s->bs->backing, pos, *backing_qiov);
+if (ret < 0) {
+return ret;
+}
+return 0;
 }
 
 typedef struct {
@@ -907,6 +908,7 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, 
uint64_t pos,
void *opaque)
 {
 CopyFromBackingFileCB *copy_cb;
+int ret;
 
 /* Skip copy entirely if there is no work to do */
 if (len == 0) {
@@ -922,8 +924,9 @@ static void qed_copy_from_backing_file(BDRVQEDState *s, 
uint64_t pos,
 copy_cb->iov.iov_len = len;
 qemu_iovec_init_external(&copy_cb->qiov, &copy_cb->iov, 1);
 
-qed_read_backing_file(s, pos, &copy_cb->qiov, &copy_cb->backing_qiov,
-  qed_copy_from_backing_file_write, copy_cb);
+ret = qed_read_backing_file(s, pos, &copy_cb->qiov,
+&copy_cb->backing_qiov);
+qed_copy_from_backing_file_write(copy_cb, ret);
 }
 
 /**
@@ -1366,8 +1369,9 @@ static void qed_aio_read_data(void *opaque, int ret,
 qed_aio_start_io(acb);
 return;
 } else if (ret != QED_CLUSTER_FOUND) {
-qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
-  &acb->backing_qiov, qed_aio_next_io_cb, acb);
+ret = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
+&acb->backing_qiov);
+qed_aio_next_io(acb, ret);
 return;
 }
 
-- 
1.8.3.1

[Qemu-block] [PULL 25/61] qed: Use bottom half to resume waiting requests

2017-06-23 Thread Kevin Wolf

The qed driver serialises allocating write requests. When the active
allocation is finished, the AIO callback is called, but after this, the
next allocating request is immediately processed instead of leaving the
coroutine. Resuming another allocation request in the same request
coroutine means that the request now runs in the wrong coroutine.

The following is one of the possible effects of this: The completed
request will generally reenter its request coroutine in a bottom half,
expecting that it completes the request in bdrv_driver_pwritev().
However, if the second request actually yielded before leaving the
coroutine, the reused request coroutine is in an entirely different
place and is reentered prematurely. Not a good idea.

Let's make sure that we exit the coroutine after completing the first
request by resuming the next allocating request only with a bottom
half.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/qed.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index 8d899fd..a837a28 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -967,6 +967,11 @@ static void qed_aio_complete_bh(void *opaque)
 qed_release(s);
 }
 
+static void qed_resume_alloc_bh(void *opaque)
+{
+qed_aio_start_io(opaque);
+}
+
 static void qed_aio_complete(QEDAIOCB *acb, int ret)
 {
 BDRVQEDState *s = acb_to_s(acb);
@@ -995,10 +1000,12 @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
  * requests multiple times but rather finish one at a time completely.
  */
 if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
+QEDAIOCB *next_acb;
 QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
-acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
-if (acb) {
-qed_aio_start_io(acb);
+next_acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
+if (next_acb) {
+aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs),
+qed_resume_alloc_bh, next_acb);
 } else if (s->header.features & QED_F_NEED_CHECK) {
 qed_start_need_check_timer(s);
 }
-- 
1.8.3.1

[Qemu-block] [PULL 22/61] qcow2: Pass a QEMUIOVector to do_perform_cow_{read, write}()

2017-06-23 Thread Kevin Wolf

From: Alberto Garcia 

Instead of passing a single buffer pointer to do_perform_cow_write(),
pass a QEMUIOVector. This will allow us to merge the write requests
for the COW regions and the actual data into a single one.

Although do_perform_cow_read() does not strictly need to change its
API, we're doing it here as well for consistency.

Signed-off-by: Alberto Garcia 
Reviewed-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 block/qcow2-cluster.c | 51 ---
 1 file changed, 24 insertions(+), 27 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 20fb531..3ac26d6 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -406,19 +406,14 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t 
sector_num,
 static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
 uint64_t src_cluster_offset,
 unsigned offset_in_cluster,
-uint8_t *buffer,
-unsigned bytes)
+QEMUIOVector *qiov)
 {
-QEMUIOVector qiov;
-struct iovec iov = { .iov_base = buffer, .iov_len = bytes };
 int ret;
 
-if (bytes == 0) {
+if (qiov->size == 0) {
 return 0;
 }
 
-qemu_iovec_init_external(&qiov, &iov, 1);
-
 BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
 
 if (!bs->drv) {
@@ -430,7 +425,7 @@ static int coroutine_fn 
do_perform_cow_read(BlockDriverState *bs,
  * which can lead to deadlock when block layer copy-on-read is enabled.
  */
 ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
-  bytes, &qiov, 0);
+  qiov->size, qiov, 0);
 if (ret < 0) {
 return ret;
 }
@@ -462,28 +457,23 @@ static bool coroutine_fn 
do_perform_cow_encrypt(BlockDriverState *bs,
 static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
  uint64_t cluster_offset,
  unsigned offset_in_cluster,
- uint8_t *buffer,
- unsigned bytes)
+ QEMUIOVector *qiov)
 {
-QEMUIOVector qiov;
-struct iovec iov = { .iov_base = buffer, .iov_len = bytes };
 int ret;
 
-if (bytes == 0) {
+if (qiov->size == 0) {
 return 0;
 }
 
-qemu_iovec_init_external(&qiov, &iov, 1);
-
 ret = qcow2_pre_write_overlap_check(bs, 0,
-cluster_offset + offset_in_cluster, bytes);
+cluster_offset + offset_in_cluster, qiov->size);
 if (ret < 0) {
 return ret;
 }
 
 BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
 ret = bdrv_co_pwritev(bs->file, cluster_offset + offset_in_cluster,
-  bytes, &qiov, 0);
+  qiov->size, qiov, 0);
 if (ret < 0) {
 return ret;
 }
@@ -780,6 +770,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
 unsigned data_bytes = end->offset - (start->offset + start->nb_bytes);
 bool merge_reads;
 uint8_t *start_buffer, *end_buffer;
+QEMUIOVector qiov;
 int ret;
 
 assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
@@ -816,22 +807,25 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta 
*m)
 /* The part of the buffer where the end region is located */
 end_buffer = start_buffer + buffer_size - end->nb_bytes;
 
+qemu_iovec_init(&qiov, 1);
+
 qemu_co_mutex_unlock(&s->lock);
 /* First we read the existing data from both COW regions. We
  * either read the whole region in one go, or the start and end
  * regions separately. */
 if (merge_reads) {
-ret = do_perform_cow_read(bs, m->offset, start->offset,
-  start_buffer, buffer_size);
+qemu_iovec_add(&qiov, start_buffer, buffer_size);
+ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
 } else {
-ret = do_perform_cow_read(bs, m->offset, start->offset,
-  start_buffer, start->nb_bytes);
+qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
+ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
 if (ret < 0) {
 goto fail;
 }
 
-ret = do_perform_cow_read(bs, m->offset, end->offset,
-  end_buffer, end->nb_bytes);
+qemu_iovec_reset(&qiov);
+qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
+ret = do_perform_cow_read(bs, m->offset, end->offset, &qiov);
 }
 if (ret < 0) {
 goto fail;
@@ -849,14 +843,16 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta 
*m)
 }
 
 /* And now we can write everything */
-ret

[Qemu-block] [PULL 12/61] migration: hold AioContext lock for loadvm qemu_fclose()

2017-06-23 Thread Kevin Wolf

From: Stefan Hajnoczi 

migration_incoming_state_destroy() uses qemu_fclose() on the vmstate
file.  Make sure to call it inside an AioContext acquire/release region.

This fixes a 'qemu: qemu_mutex_unlock: Operation not permitted' abort
in loadvm.

This patch closes the vmstate file before ending the drained region.
Previously we closed the vmstate file after ending the drained region.
The order does not matter.

Signed-off-by: Stefan Hajnoczi 
Signed-off-by: Kevin Wolf 
---
 migration/savevm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index b08df04..c7a49c9 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2300,11 +2300,11 @@ int load_snapshot(const char *name, Error **errp)
 
 aio_context_acquire(aio_context);
 ret = qemu_loadvm_state(f);
+migration_incoming_state_destroy();
 aio_context_release(aio_context);
 
 bdrv_drain_all_end();
 
-migration_incoming_state_destroy();
 if (ret < 0) {
 error_setg(errp, "Error %d while loading VM state", ret);
 return ret;
-- 
1.8.3.1

[Qemu-block] [PULL 08/61] doc: Document generic -blockdev options

2017-06-23 Thread Kevin Wolf

This adds documentation for the -blockdev options that apply to all
nodes independent of the block driver used.

All options that are shared by -blockdev and -drive are now explained in
the section for -blockdev. The documentation of -drive mentions that all
-blockdev options are accepted as well.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Max Reitz 
---
 qemu-options.hx | 108 +---
 1 file changed, 79 insertions(+), 29 deletions(-)

diff --git a/qemu-options.hx b/qemu-options.hx
index 30c4f98..db20866 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -610,6 +610,53 @@ DEF("blockdev", HAS_ARG, QEMU_OPTION_blockdev,
 "  [,read-only=on|off][,detect-zeroes=on|off|unmap]\n"
 "  [,driver specific parameters...]\n"
 "configure a block backend\n", QEMU_ARCH_ALL)
+STEXI
+@item -blockdev @var{option}[,@var{option}[,@var{option}[,...]]]
+@findex -blockdev
+
+Define a new block driver node.
+
+@table @option
+@item Valid options for any block driver node:
+
+@table @code
+@item driver
+Specifies the block driver to use for the given node.
+@item node-name
+This defines the name of the block driver node by which it will be referenced
+later. The name must be unique, i.e. it must not match the name of a different
+block driver node, or (if you use @option{-drive} as well) the ID of a drive.
+
+If no node name is specified, it is automatically generated. The generated node
+name is not intended to be predictable and changes between QEMU invocations.
+For the top level, an explicit node name must be specified.
+@item read-only
+Open the node read-only. Guest write attempts will fail.
+@item cache.direct
+The host page cache can be avoided with @option{cache.direct=on}. This will
+attempt to do disk IO directly to the guest's memory. QEMU may still perform an
+internal copy of the data.
+@item cache.no-flush
+In case you don't care about data integrity over host failures, you can use
+@option{cache.no-flush=on}. This option tells QEMU that it never needs to write
+any data to the disk but can instead keep things in cache. If anything goes
+wrong, like your host losing power, the disk storage getting disconnected
+accidentally, etc. your image will most probably be rendered unusable.
+@item discard=@var{discard}
+@var{discard} is one of "ignore" (or "off") or "unmap" (or "on") and controls
+whether @code{discard} (also known as @code{trim} or @code{unmap}) requests are
+ignored or passed to the filesystem. Some machine types may not support
+discard requests.
+@item detect-zeroes=@var{detect-zeroes}
+@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic
+conversion of plain zero writes by the OS to driver specific optimized
+zero write commands. You may even choose "unmap" if @var{discard} is set
+to "unmap" to allow a zero write to be converted to an @code{unmap} operation.
+@end table
+
+@end table
+
+ETEXI
 
 DEF("drive", HAS_ARG, QEMU_OPTION_drive,
 "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n"
@@ -630,7 +677,12 @@ STEXI
 @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
 @findex -drive
 
-Define a new drive. Valid options are:
+Define a new drive. This includes creating a block driver node (the backend) as
+well as a guest device, and is mostly a shortcut for defining the corresponding
+@option{-blockdev} and @option{-device} options.
+
+@option{-drive} accepts all options that are accepted by @option{-blockdev}. In
+addition, it knows the following options:
 
 @table @option
 @item file=@var{file}
@@ -657,11 +709,31 @@ These options have the same definition as they have in 
@option{-hdachs}.
 @var{snapshot} is "on" or "off" and controls snapshot mode for the given drive
 (see @option{-snapshot}).
 @item cache=@var{cache}
-@var{cache} is "none", "writeback", "unsafe", "directsync" or "writethrough" 
and controls how the host cache is used to access block data.
+@var{cache} is "none", "writeback", "unsafe", "directsync" or "writethrough"
+and controls how the host cache is used to access block data. This is a
+shortcut that sets the @option{cache.direct} and @option{cache.no-flush}
+options (as in @option{-blockdev}), and additionally @option{cache.writeback},
+which provides a default for the @option{write-cache} option of block guest
+devices (as in @option{-device}). The modes correspond to the following
+settings:
+
+@c Our texi2pod.pl script doesn't support @multitable, so fall back to using
+@c plain ASCII art (well, UTF-8 art really). This looks okay both in the 
manpage
+@c and the HTML output.
+@example
+@             │ cache.writeback   cache.direct   cache.no-flush
+─────────────┼────────────────────────────────────────────────
+writeback    │ on                off            off
+none         │ on                on             off
+writethrough │ off               off            off
+directsync   │ off               on

[Qemu-block] [PULL 11/61] virtio-pci: use ioeventfd even when KVM is disabled

2017-06-23 Thread Kevin Wolf

From: Stefan Hajnoczi 

Old kvm.ko versions only supported a tiny number of ioeventfds so
virtio-pci avoids ioeventfds when kvm_has_many_ioeventfds() returns 0.

Do not check kvm_has_many_ioeventfds() when KVM is disabled since it
always returns 0.  Since commit 8c56c1a592b5092d91da8d8943c1d6462a6f
("memory: emulate ioeventfd") it has been possible to use ioeventfds in
qtest or TCG mode.

This patch makes -device virtio-blk-pci,iothread=iothread0 work even
when KVM is disabled.

I have tested that virtio-blk-pci works under TCG both with and without
iothread.

Cc: Michael S. Tsirkin 
Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Kevin Wolf 
---
 hw/virtio/virtio-pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 20d6a08..301920e 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1740,7 +1740,7 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error 
**errp)
 bool pcie_port = pci_bus_is_express(pci_dev->bus) &&
  !pci_bus_is_root(pci_dev->bus);
 
-if (!kvm_has_many_ioeventfds()) {
+if (kvm_enabled() && !kvm_has_many_ioeventfds()) {
 proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD;
 }
 
-- 
1.8.3.1

[Qemu-block] [PULL 09/61] doc: Document driver-specific -blockdev options

2017-06-23 Thread Kevin Wolf

This documents the driver-specific options for the raw, qcow2 and file
block drivers for the man page. For everything else, we refer to the
QAPI documentation.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Max Reitz 
---
 qemu-options.hx | 115 +++-
 1 file changed, 114 insertions(+), 1 deletion(-)

diff --git a/qemu-options.hx b/qemu-options.hx
index db20866..896ff17 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -614,7 +614,18 @@ STEXI
 @item -blockdev @var{option}[,@var{option}[,@var{option}[,...]]]
 @findex -blockdev
 
-Define a new block driver node.
+Define a new block driver node. Some of the options apply to all block drivers,
+other options are only accepted for a specific block driver. See below for a
+list of generic options and options for the most common block drivers.
+
+Options that expect a reference to another node (e.g. @code{file}) can be
+given in two ways. Either you specify the node name of an already existing node
+(file=@var{node-name}), or you define a new node inline, adding options
+for the referenced node after a dot (file.filename=@var{path},file.aio=native).
+
+A block driver node created with @option{-blockdev} can be used for a guest
+device by specifying its node name for the @code{drive} property in a
+@option{-device} argument that defines a block device.
 
 @table @option
 @item Valid options for any block driver node:
@@ -654,6 +665,108 @@ zero write commands. You may even choose "unmap" if 
@var{discard} is set
 to "unmap" to allow a zero write to be converted to an @code{unmap} operation.
 @end table
 
+@item Driver-specific options for @code{file}
+
+This is the protocol-level block driver for accessing regular files.
+
+@table @code
+@item filename
+The path to the image file in the local filesystem
+@item aio
+Specifies the AIO backend (threads/native, default: threads)
+@end table
+Example:
+@example
+-blockdev driver=file,node-name=disk,filename=disk.img
+@end example
+
+@item Driver-specific options for @code{raw}
+
+This is the image format block driver for raw images. It is usually
+stacked on top of a protocol level block driver such as @code{file}.
+
+@table @code
+@item file
+Reference to or definition of the data source block driver node
+(e.g. a @code{file} driver node)
+@end table
+Example 1:
+@example
+-blockdev driver=file,node-name=disk_file,filename=disk.img
+-blockdev driver=raw,node-name=disk,file=disk_file
+@end example
+Example 2:
+@example
+-blockdev driver=raw,node-name=disk,file.driver=file,file.filename=disk.img
+@end example
+
+@item Driver-specific options for @code{qcow2}
+
+This is the image format block driver for qcow2 images. It is usually
+stacked on top of a protocol level block driver such as @code{file}.
+
+@table @code
+@item file
+Reference to or definition of the data source block driver node
+(e.g. a @code{file} driver node)
+
+@item backing
+Reference to or definition of the backing file block device (default is taken
+from the image file). It is allowed to pass an empty string here in order to
+disable the default backing file.
+
+@item lazy-refcounts
+Whether to enable the lazy refcounts feature (on/off; default is taken from the
+image file)
+
+@item cache-size
+The maximum total size of the L2 table and refcount block caches in bytes
+(default: 1048576 bytes or 8 clusters, whichever is larger)
+
+@item l2-cache-size
+The maximum size of the L2 table cache in bytes
+(default: 4/5 of the total cache size)
+
+@item refcount-cache-size
+The maximum size of the refcount block cache in bytes
+(default: 1/5 of the total cache size)
+
+@item cache-clean-interval
+Clean unused entries in the L2 and refcount caches. The interval is in seconds.
+The default value is 0 and it disables this feature.
+
+@item pass-discard-request
+Whether discard requests to the qcow2 device should be forwarded to the data
+source (on/off; default: on if discard=unmap is specified, off otherwise)
+
+@item pass-discard-snapshot
+Whether discard requests for the data source should be issued when a snapshot
+operation (e.g. deleting a snapshot) frees clusters in the qcow2 file (on/off;
+default: on)
+
+@item pass-discard-other
+Whether discard requests for the data source should be issued on other
+occasions where a cluster gets freed (on/off; default: off)
+
+@item overlap-check
+Which overlap checks to perform for writes to the image
+(none/constant/cached/all; default: cached). For details or finer
+granularity control refer to the QAPI documentation of @code{blockdev-add}.
+@end table
+
+Example 1:
+@example
+-blockdev driver=file,node-name=my_file,filename=/tmp/disk.qcow2
+-blockdev 
driver=qcow2,node-name=hda,file=my_file,overlap-check=none,cache-size=16777216
+@end example
+Example 2:
+@example
+-blockdev 
driver=qcow2,node-name=disk,file.driver=http,file.filename=http://example.com/image.qcow2
+@end example
+
+@item Driver-specific options for other drivers
+Please refer

[Qemu-block] [PULL 10/61] throttle: Update throttle-groups.c documentation

2017-06-23 Thread Kevin Wolf

From: Alberto Garcia 

There used to be throttle_timers_{detach,attach}_aio_context() calls
in bdrv_set_aio_context(), but since 7ca7f0f6db1fedd28d490795d778cf239
they are now in blk_set_aio_context().

Signed-off-by: Alberto Garcia 
Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Kevin Wolf 
---
 block/throttle-groups.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index a181cb1..da2b490 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -49,7 +49,7 @@
  * Again, all this is handled internally and is mostly transparent to
  * the outside. The 'throttle_timers' field however has an additional
  * constraint because it may be temporarily invalid (see for example
- * bdrv_set_aio_context()). Therefore in this file a thread will
+ * blk_set_aio_context()). Therefore in this file a thread will
  * access some other BlockBackend's timers only after verifying that
  * that BlockBackend has throttled requests in the queue.
  */
-- 
1.8.3.1

[Qemu-block] [PULL 07/61] migration: use bdrv_drain_all_begin/end() instead of bdrv_drain_all()

2017-06-23 Thread Kevin Wolf

From: Stefan Hajnoczi 

blk/bdrv_drain_all() only takes effect for a single instant and then
resumes block jobs, guest devices, and other external clients like the
NBD server.  This can be handy when performing a synchronous drain
before terminating the program, for example.

Monitor commands usually need to quiesce I/O across an entire code
region so blk/bdrv_drain_all() is not suitable.  They must use
bdrv_drain_all_begin/end() to mark the region.  This prevents new I/O
requests from slipping in or worse - block jobs completing and modifying
the graph.

I audited other blk/bdrv_drain_all() callers but did not find anything
that needs a similar fix.  This patch fixes the savevm/loadvm commands.
Although I haven't encountered a real world issue this makes the code
safer.

Suggested-by: Kevin Wolf 
Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Eric Blake 
Signed-off-by: Kevin Wolf 
---
 migration/savevm.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index 5846d9c..b08df04 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2107,6 +2107,8 @@ int save_snapshot(const char *name, Error **errp)
 }
 vm_stop(RUN_STATE_SAVE_VM);
 
+bdrv_drain_all_begin();
+
 aio_context_acquire(aio_context);
 
 memset(sn, 0, sizeof(*sn));
@@ -2165,6 +2167,9 @@ int save_snapshot(const char *name, Error **errp)
 if (aio_context) {
 aio_context_release(aio_context);
 }
+
+bdrv_drain_all_end();
+
 if (saved_vm_running) {
 vm_start();
 }
@@ -2273,20 +2278,21 @@ int load_snapshot(const char *name, Error **errp)
 }
 
 /* Flush all IO requests so they don't interfere with the new state.  */
-bdrv_drain_all();
+bdrv_drain_all_begin();
 
 ret = bdrv_all_goto_snapshot(name, &bs);
 if (ret < 0) {
 error_setg(errp, "Error %d while activating snapshot '%s' on '%s'",
  ret, name, bdrv_get_device_name(bs));
-return ret;
+goto err_drain;
 }
 
 /* restore the VM state */
 f = qemu_fopen_bdrv(bs_vm_state, 0);
 if (!f) {
 error_setg(errp, "Could not open VM state file");
-return -EINVAL;
+ret = -EINVAL;
+goto err_drain;
 }
 
 qemu_system_reset(SHUTDOWN_CAUSE_NONE);
@@ -2296,6 +2302,8 @@ int load_snapshot(const char *name, Error **errp)
 ret = qemu_loadvm_state(f);
 aio_context_release(aio_context);
 
+bdrv_drain_all_end();
+
 migration_incoming_state_destroy();
 if (ret < 0) {
 error_setg(errp, "Error %d while loading VM state", ret);
@@ -2303,6 +2311,10 @@ int load_snapshot(const char *name, Error **errp)
 }
 
 return 0;
+
+err_drain:
+bdrv_drain_all_end();
+return ret;
 }
 
 void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
-- 
1.8.3.1

[Qemu-block] [PULL 05/61] block: use BDRV_POLL_WHILE() in bdrv_rw_vmstate()

2017-06-23 Thread Kevin Wolf

From: Stefan Hajnoczi 

Calling aio_poll() directly may have been fine previously, but this is
the future, man!  The difference between an aio_poll() loop and
BDRV_POLL_WHILE() is that BDRV_POLL_WHILE() releases the AioContext
around aio_poll().

This allows the IOThread to run fd handlers or BHs to complete the
request.  Failure to release the AioContext causes deadlocks.

Using BDRV_POLL_WHILE() partially fixes a 'savevm' hang with -object
iothread.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Eric Blake 
Reviewed-by: Paolo Bonzini 
Signed-off-by: Kevin Wolf 
---
 block/io.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/io.c b/block/io.c
index 684ea46..e158ae0 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2023,9 +2023,7 @@ bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, 
int64_t pos,
 Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
 
 bdrv_coroutine_enter(bs, co);
-while (data.ret == -EINPROGRESS) {
-aio_poll(bdrv_get_aio_context(bs), true);
-}
+BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
 return data.ret;
 }
 }
-- 
1.8.3.1

[Qemu-block] [PULL 06/61] migration: avoid recursive AioContext locking in save_vmstate()

2017-06-23 Thread Kevin Wolf

From: Stefan Hajnoczi 

AioContext was designed to allow nested acquire/release calls.  It uses
a recursive mutex so callers don't need to worry about nesting...or so
we thought.

BDRV_POLL_WHILE() is used to wait for block I/O requests.  It releases
the AioContext temporarily around aio_poll().  This gives IOThreads a
chance to acquire the AioContext to process I/O completions.

It turns out that recursive locking and BDRV_POLL_WHILE() don't mix.
BDRV_POLL_WHILE() only releases the AioContext once, so the IOThread
will not be able to acquire the AioContext if it was acquired
multiple times.

Instead of trying to release AioContext n times in BDRV_POLL_WHILE(),
this patch simply avoids nested locking in save_vmstate().  It's the
simplest fix and we should step back to consider the big picture with
all the recent changes to block layer threading.

This patch is the final fix to solve 'savevm' hanging with -object
iothread.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Eric Blake 
Reviewed-by: Paolo Bonzini 
Signed-off-by: Kevin Wolf 
---
 migration/savevm.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index 6bfd489..5846d9c 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2144,6 +2144,14 @@ int save_snapshot(const char *name, Error **errp)
 goto the_end;
 }
 
+/* The bdrv_all_create_snapshot() call that follows acquires the AioContext
+ * for itself.  BDRV_POLL_WHILE() does not support nested locking because
+ * it only releases the lock once.  Therefore synchronous I/O will deadlock
+ * unless we release the AioContext before bdrv_all_create_snapshot().
+ */
+aio_context_release(aio_context);
+aio_context = NULL;
+
 ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs);
 if (ret < 0) {
 error_setg(errp, "Error while creating snapshot on '%s'",
@@ -2154,7 +2162,9 @@ int save_snapshot(const char *name, Error **errp)
 ret = 0;
 
  the_end:
-aio_context_release(aio_context);
+if (aio_context) {
+aio_context_release(aio_context);
+}
 if (saved_vm_running) {
 vm_start();
 }
-- 
1.8.3.1

[Qemu-block] [PULL 03/61] qemu-iotests: Test exiting qemu with running job

2017-06-23 Thread Kevin Wolf

When qemu is exited, all running jobs should be cancelled successfully.
This adds a test covering all types of block jobs that currently
exist in qemu.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
---
 tests/qemu-iotests/185 | 206 +
 tests/qemu-iotests/185.out |  59 +
 tests/qemu-iotests/group   |   1 +
 3 files changed, 266 insertions(+)
 create mode 100755 tests/qemu-iotests/185
 create mode 100644 tests/qemu-iotests/185.out

diff --git a/tests/qemu-iotests/185 b/tests/qemu-iotests/185
new file mode 100755
index 000..0eda371
--- /dev/null
+++ b/tests/qemu-iotests/185
@@ -0,0 +1,206 @@
+#!/bin/bash
+#
+# Test exiting qemu while jobs are still running
+#
+# Copyright (C) 2017 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=kw...@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+status=1 # failure is the default!
+
+MIG_SOCKET="${TEST_DIR}/migrate"
+
+_cleanup()
+{
+rm -f "${TEST_IMG}.mid"
+rm -f "${TEST_IMG}.copy"
+_cleanup_test_img
+_cleanup_qemu
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.qemu
+
+_supported_fmt qcow2
+_supported_proto file
+_supported_os Linux
+
+size=64M
+TEST_IMG="${TEST_IMG}.base" _make_test_img $size
+
+echo
+echo === Starting VM ===
+echo
+
+qemu_comm_method="qmp"
+
+_launch_qemu \
+-drive file="${TEST_IMG}.base",cache=$CACHEMODE,driver=$IMGFMT,id=disk
+h=$QEMU_HANDLE
+_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return'
+
+echo
+echo === Creating backing chain ===
+echo
+
+_send_qemu_cmd $h \
+"{ 'execute': 'blockdev-snapshot-sync',
+   'arguments': { 'device': 'disk',
+  'snapshot-file': '$TEST_IMG.mid',
+  'format': '$IMGFMT',
+  'mode': 'absolute-paths' } }" \
+"return"
+
+_send_qemu_cmd $h \
+"{ 'execute': 'human-monitor-command',
+   'arguments': { 'command-line':
+  'qemu-io disk \"write 0 4M\"' } }" \
+"return"
+
+_send_qemu_cmd $h \
+"{ 'execute': 'blockdev-snapshot-sync',
+   'arguments': { 'device': 'disk',
+  'snapshot-file': '$TEST_IMG',
+  'format': '$IMGFMT',
+  'mode': 'absolute-paths' } }" \
+"return"
+
+echo
+echo === Start commit job and exit qemu ===
+echo
+
+# Note that the reference output intentionally includes the 'offset' field in
+# BLOCK_JOB_CANCELLED events for all of the following block jobs. They are
+# predictable and any change in the offsets would hint at a bug in the job
+# throttling code.
+#
+# In order to achieve these predictable offsets, all of the following tests
+# use speed=65536. Each job will perform exactly one iteration before it has
+# to sleep at least for a second, which is plenty of time for the 'quit' QMP
+# command to be received (after receiving the command, the rest runs
+# synchronously, so jobs can arbitrarily continue or complete).
+#
+# The buffer size for commit and streaming is 512k (waiting for 8 seconds after
+# the first request), for active commit and mirror it's large enough to cover
+# the full 4M, and for backup it's the qcow2 cluster size, which we know is
+# 64k. As all of these are at least as large as the speed, we are sure that the
+# offset doesn't advance after the first iteration before qemu exits.
+
+_send_qemu_cmd $h \
+"{ 'execute': 'block-commit',
+   'arguments': { 'device': 'disk',
+  'base':'$TEST_IMG.base',
+  'top': '$TEST_IMG.mid',
+  'speed': 65536 } }" \
+"return"
+
+_send_qemu_cmd $h "{ 'execute': 'quit' }" "return"
+wait=1 _cleanup_qemu
+
+echo
+echo === Start active commit job and exit qemu ===
+echo
+
+_launch_qemu \
+-drive file="${TEST_IMG}",cache=$CACHEMODE,driver=$IMGFMT,id=disk
+h=$QEMU_HANDLE
+_send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" 'return'
+
+_send_qemu_cmd $h \
+"{ 'execute': 'block-commit',
+   'arguments': { 'device': 'disk',
+  'base':'$TEST_IMG.base',
+  'speed': 65536 } }" \
+"return"
+
+_send_qemu_cmd $h "{ 'execute': 'quit' }" "return"
+wait=1 _cleanup_qemu
+
+echo
+echo === Start mirror job and exit qemu ===

[Qemu-block] [PULL 02/61] qemu-iotests: Allow starting new qemu after cleanup

2017-06-23 Thread Kevin Wolf

After _cleanup_qemu(), test cases should be able to start the next qemu
process and call _cleanup_qemu() for that one as well. For this to work
cleanly, we need to improve the cleanup so that the second invocation
doesn't try to kill the qemu instances from the first invocation a
second time (which would result in error messages).

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Max Reitz 
---
 tests/qemu-iotests/common.qemu | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu
index 7a78a00..76ef298 100644
--- a/tests/qemu-iotests/common.qemu
+++ b/tests/qemu-iotests/common.qemu
@@ -222,5 +222,8 @@ function _cleanup_qemu()
 rm -f "${QEMU_FIFO_IN}_${i}" "${QEMU_FIFO_OUT}_${i}"
 eval "exec ${QEMU_IN[$i]}<&-"   # close file descriptors
 eval "exec ${QEMU_OUT[$i]}<&-"
+
+unset QEMU_IN[$i]
+unset QEMU_OUT[$i]
 done
 }
-- 
1.8.3.1

[Qemu-block] [PULL 00/61] Block layer patches

2017-06-23 Thread Kevin Wolf

The following changes since commit 4c8c1cc544dbd5e2564868e61c5037258e393832:

  Merge remote-tracking branch 'remotes/vivier/tags/m68k-for-2.10-pull-request' 
into staging (2017-06-22 19:01:58 +0100)

are available in the git repository at:


  git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 1512008812410ca4054506a7c44343088abdd977:

  Merge remote-tracking branch 'mreitz/tags/pull-block-2017-06-23' into 
queue-block (2017-06-23 14:09:12 +0200)



Block layer patches


Alberto Garcia (9):
  throttle: Update throttle-groups.c documentation
  qcow2: Remove unused Error variable in do_perform_cow()
  qcow2: Use unsigned int for both members of Qcow2COWRegion
  qcow2: Make perform_cow() call do_perform_cow() twice
  qcow2: Split do_perform_cow() into _read(), _encrypt() and _write()
  qcow2: Allow reading both COW regions with only one request
  qcow2: Pass a QEMUIOVector to do_perform_cow_{read,write}()
  qcow2: Merge the writing of the COW regions with the guest data
  qcow2: Use offset_into_cluster() and offset_to_l2_index()

Kevin Wolf (37):
  commit: Fix completion with extra reference
  qemu-iotests: Allow starting new qemu after cleanup
  qemu-iotests: Test exiting qemu with running job
  doc: Document generic -blockdev options
  doc: Document driver-specific -blockdev options
  qed: Use bottom half to resume waiting requests
  qed: Make qed_read_table() synchronous
  qed: Remove callback from qed_read_table()
  qed: Remove callback from qed_read_l2_table()
  qed: Remove callback from qed_find_cluster()
  qed: Make qed_read_backing_file() synchronous
  qed: Make qed_copy_from_backing_file() synchronous
  qed: Remove callback from qed_copy_from_backing_file()
  qed: Make qed_write_header() synchronous
  qed: Remove callback from qed_write_header()
  qed: Make qed_write_table() synchronous
  qed: Remove GenericCB
  qed: Remove callback from qed_write_table()
  qed: Make qed_aio_read_data() synchronous
  qed: Make qed_aio_write_main() synchronous
  qed: Inline qed_commit_l2_update()
  qed: Add return value to qed_aio_write_l1_update()
  qed: Add return value to qed_aio_write_l2_update()
  qed: Add return value to qed_aio_write_main()
  qed: Add return value to qed_aio_write_cow()
  qed: Add return value to qed_aio_write_inplace/alloc()
  qed: Add return value to qed_aio_read/write_data()
  qed: Remove ret argument from qed_aio_next_io()
  qed: Remove recursion in qed_aio_next_io()
  qed: Implement .bdrv_co_readv/writev
  qed: Use CoQueue for serialising allocations
  qed: Simplify request handling
  qed: Use a coroutine for need_check_timer
  qed: Add coroutine_fn to I/O path functions
  qed: Use bdrv_co_* for coroutine_fns
  block: Remove bdrv_aio_readv/writev/flush()
  Merge remote-tracking branch 'mreitz/tags/pull-block-2017-06-23' into 
queue-block

Manos Pitsidianakis (1):
  block: change variable names in BlockDriverState

Max Reitz (3):
  blkdebug: Catch bs->exact_filename overflow
  blkverify: Catch bs->exact_filename overflow
  block: Do not strcmp() with NULL uri->scheme

Stefan Hajnoczi (10):
  block: count bdrv_co_rw_vmstate() requests
  block: use BDRV_POLL_WHILE() in bdrv_rw_vmstate()
  migration: avoid recursive AioContext locking in save_vmstate()
  migration: use bdrv_drain_all_begin/end() instead bdrv_drain_all()
  virtio-pci: use ioeventfd even when KVM is disabled
  migration: hold AioContext lock for loadvm qemu_fclose()
  qemu-iotests: 068: extract _qemu() function
  qemu-iotests: 068: use -drive/-device instead of -hda
  qemu-iotests: 068: test iothread mode
  qemu-img: don't shadow opts variable in img_dd()

Stephen Bates (1):
  nvme: Add support for Read Data and Write Data in CMBs.

sochin.jiang (1):
  fix: avoid an infinite loop or a dangling pointer problem in img_commit

 block/Makefile.objs|   2 +-
 block/blkdebug.c   |  46 +--
 block/blkreplay.c  |   8 +-
 block/blkverify.c  |  12 +-
 block/block-backend.c  |  22 +-
 block/commit.c |   7 +
 block/file-posix.c |  34 +-
 block/io.c | 240 ++---
 block/iscsi.c  |  20 +-
 block/mirror.c |   8 +-
 block/nbd-client.c |   8 +-
 block/nbd-client.h |   4 +-
 block/nbd.c|   6 +-
 block/nfs.c|   2 +-
 block/qcow2-cluster.c  | 201 ---
 block/qcow2.c  |  94 +++--
 block/qcow2.h  |  11 +-
 block/qed-cluster.c| 124 +++
 block/qed-gencb.c  |  33 --
 block/qed-ta

[Qemu-block] [PULL 04/61] block: count bdrv_co_rw_vmstate() requests

2017-06-23 Thread Kevin Wolf

From: Stefan Hajnoczi 

Call bdrv_inc/dec_in_flight() for vmstate reads/writes.  This seems
unnecessary at first glance because vmstate reads/writes are done
synchronously while the guest is stopped.  But we need the bdrv_wakeup()
in bdrv_dec_in_flight() so the main loop sees request completion.
Besides, it's cleaner to count vmstate reads/writes like ordinary
read/write requests.

The bdrv_wakeup() partially fixes a 'savevm' hang with -object iothread.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Eric Blake 
Reviewed-by: Paolo Bonzini 
Signed-off-by: Kevin Wolf 
---
 block/io.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/block/io.c b/block/io.c
index 91611ff..684ea46 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1980,17 +1980,24 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector 
*qiov, int64_t pos,
bool is_read)
 {
 BlockDriver *drv = bs->drv;
+int ret = -ENOTSUP;
+
+bdrv_inc_in_flight(bs);
 
 if (!drv) {
-return -ENOMEDIUM;
+ret = -ENOMEDIUM;
 } else if (drv->bdrv_load_vmstate) {
-return is_read ? drv->bdrv_load_vmstate(bs, qiov, pos)
-   : drv->bdrv_save_vmstate(bs, qiov, pos);
+if (is_read) {
+ret = drv->bdrv_load_vmstate(bs, qiov, pos);
+} else {
+ret = drv->bdrv_save_vmstate(bs, qiov, pos);
+}
 } else if (bs->file) {
-return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
+ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
 }
 
-return -ENOTSUP;
+bdrv_dec_in_flight(bs);
+return ret;
 }
 
 static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
-- 
1.8.3.1

[Qemu-block] [PULL 01/61] commit: Fix completion with extra reference

2017-06-23 Thread Kevin Wolf

commit_complete() can't assume that after its block_job_completed() the
job is actually immediately freed; someone else may still be holding
references. In this case, the op blockers on the intermediate nodes make
the graph reconfiguration in the completion code fail.

Call block_job_remove_all_bdrv() manually so that we know for sure that
any blockers on intermediate nodes are given up.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Max Reitz 
---
 block/commit.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/block/commit.c b/block/commit.c
index af6fa68..8c09c3d 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -119,6 +119,13 @@ static void commit_complete(BlockJob *job, void *opaque)
 }
 g_free(s->backing_file_str);
 blk_unref(s->top);
+
+/* If there is more than one reference to the job (e.g. if called from
+ * block_job_finish_sync()), block_job_completed() won't free it and
+ * therefore the blockers on the intermediate nodes remain. This would
+ * cause bdrv_set_backing_hd() to fail. */
+block_job_remove_all_bdrv(job);
+
 block_job_completed(&s->common, ret);
 g_free(data);
 
-- 
1.8.3.1

Re: [Qemu-block] [Qemu-devel] [PATCH v2 1/4] qapi/qnull: Add own header

2017-06-23 Thread Max Reitz

On 2017-06-22 16:41, Markus Armbruster wrote:
> Max Reitz  writes:
> 
>> On 2017-06-21 18:24, Markus Armbruster wrote:
>>> Max Reitz  writes:
>>>
 Reviewed-by: Kevin Wolf 
 Signed-off-by: Max Reitz 
 ---
  include/qapi/qmp/qnull.h   | 26 ++
  include/qapi/qmp/qobject.h |  8 
  include/qapi/qmp/types.h   |  1 +
  qobject/qnull.c|  1 +
  target/i386/cpu.c  |  6 +-
  tests/check-qnull.c|  2 +-
  6 files changed, 30 insertions(+), 14 deletions(-)
  create mode 100644 include/qapi/qmp/qnull.h

 diff --git a/include/qapi/qmp/qnull.h b/include/qapi/qmp/qnull.h
 new file mode 100644
 index 000..69555ac
 --- /dev/null
 +++ b/include/qapi/qmp/qnull.h
 @@ -0,0 +1,26 @@
 +/*
 + * QNull Module
 + *
 + * Copyright (C) 2009, 2017 Red Hat Inc.
 + *
 + * Authors:
 + *  Luiz Capitulino 
 + *
 + * This work is licensed under the terms of the GNU LGPL, version 2.1 or 
 later.
 + * See the COPYING.LIB file in the top-level directory.
>>>
>>> Copy the boilerplate from qnull.c instead, factual correctness.
>>
>> Sorry, will do.
> 
> No need to be sorry!

I was when I noticed that it was your authorship I failed to announce. :-)

Max



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-block] [PATCH v2 4/4] qemu-iotests: add shrinking image test

2017-06-23 Thread Max Reitz

On 2017-06-13 14:16, Pavel Butsykin wrote:
> Signed-off-by: Pavel Butsykin 
> ---
>  tests/qemu-iotests/163 | 113 
> +
>  tests/qemu-iotests/163.out |   5 ++
>  tests/qemu-iotests/group   |   1 +
>  3 files changed, 119 insertions(+)
>  create mode 100644 tests/qemu-iotests/163
>  create mode 100644 tests/qemu-iotests/163.out

Ideally this test should contain tests for how the L1 and refcount table
shrinking functions actually behave (like what we have for growing
images). Right now we don't know whether they do anything in these test
cases at all.

Max

signature.asc
Description: OpenPGP digital signature

Re: [Qemu-block] [PATCH v2 3/4] qcow2: add shrink image support

2017-06-23 Thread Max Reitz

On 2017-06-22 15:57, Pavel Butsykin wrote:
> 
> On 22.06.2017 01:55, Max Reitz wrote:
>> On 2017-06-13 14:16, Pavel Butsykin wrote:
>>> This patch adds shrinking of the image file for qcow2. As a result,
>>> this allows
>>> us to reduce the virtual image size and free up space on the disk
>>> without
>>> copying the image. Image can be fragmented and shrink is done by
>>> punching holes
>>> in the image file.
>>>
>>> Signed-off-by: Pavel Butsykin 
>>> ---
>>>   block/qcow2-cluster.c  | 42 
>>>   block/qcow2-refcount.c | 65
>>> ++
>>>   block/qcow2.c  | 40 +++
>>>   block/qcow2.h  |  2 ++
>>>   qapi/block-core.json   |  3 ++-
>>>   5 files changed, 141 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>>> index d779ea19cf..a84b7e607e 100644
>>> --- a/block/qcow2-cluster.c
>>> +++ b/block/qcow2-cluster.c
>>> @@ -32,6 +32,48 @@
>>>   #include "qemu/bswap.h"
>>>   #include "trace.h"
>>>   +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size)
>>
>> It's not really a max_size but always an exact size. You don't want it
>> to be any smaller than this.
>>
>>> +{
>>> +BDRVQcow2State *s = bs->opaque;
>>> +int new_l1_size, i, ret;
>>> +
>>> +if (max_size >= s->l1_size) {
>>> +return 0;
>>> +}
>>> +
>>> +new_l1_size = max_size;
>>> +
>>> +#ifdef DEBUG_ALLOC2
>>> +fprintf(stderr, "shrink l1_table from %d to %" PRId64 "\n",
>>> +s->l1_size, new_l1_size);
>>
>> new_l1_size is of type int, not int64_t.
>>
>>> +#endif
>>> +
>>> +BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
>>> +ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
>>> +   sizeof(uint64_t) * new_l1_size,
>>> + (s->l1_size - new_l1_size) *
>>> sizeof(uint64_t), 0);
>>> +if (ret < 0) {
>>> +return ret;
>>> +}
>>> +
>>> +ret = bdrv_flush(bs->file->bs);
>>> +if (ret < 0) {
>>> +return ret;
>>> +}
>>> +
>>> +BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
>>> +for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
>>> +if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
>>> +continue;
>>> +}
>>> +qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
>>> +s->l2_size * sizeof(uint64_t),
>>
>> I'm more of a fan of s->cluster_size instead of s->l2_size *
>> sizeof(uint64_t) but it's not like it matters...
>>
>>> +QCOW2_DISCARD_ALWAYS);
>>> +s->l1_table[i] = 0;
>>
>> I'd probably clear the overhanging s->l1_table entries before
>> bdrv_flush() (because you shouldn't really use them after
>> bdrv_pwrite_zeroes() has returned, even if bdrv_flush() has failed), but
>> it's not absolutely necessary. As long as they still have a refcount of
>> at least one, writing to them will just be useless but not destroy any
>> data.
>>
> 
> You're right, but if it's not necessary, I would prefer to leave it as is,
> just because the overhanging s->l1_table entries are used to release clusters :)

Hm, yes. The question is, how bad are useless writes?

So the worst case scenario is this: You invoke qmp_block_resize() to
shrink the image; the bdrv_flush() call fails somewhere in the middle
but the data is still kind of pending and basically in the image.

Now when you continue to use the image and write data beyond the
intended new end, that data basically ends up nowhere. You can still
read the data just fine and change it, but when you restart qemu, it
will all be gone. So that's weird.

Admittedly, though, bdrv_flush() isn't the only issue here;
bdrv_pwrite_zeroes() is, too. If that fails somewhere in the middle, we
basically have the same situation.

Now if we were to update s->l1_table before the bdrv_pwrite_zeroes()
call, we might end up with the opposite issue: The data appears to be
gone, but after reopening the image, it's back again. The main
difference is that in this case we'll have to allocate L2 tables anew
and this will require writes to the L1 table, so maybe we can actually
succeed in overwriting the old data then... But that's a big maybe.

So all in all we'll very likely get inconsistencies either way, so yes,
it doesn't actually matter. :-)

>>> +}
>>> +return 0;
>>> +}
>>> +
>>>   int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>>>   bool exact_size)
>>>   {
>>> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
>>> index 576ab551d6..e98306acd8 100644
>>> --- a/block/qcow2-refcount.c
>>> +++ b/block/qcow2-refcount.c
>>> @@ -29,6 +29,7 @@
>>>   #include "block/qcow2.h"
>>>   #include "qemu/range.h"
>>>   #include "qemu/bswap.h"
>>> +#include "qemu/cutils.h"
>>> static int64_t alloc_clusters_noref(BlockDriverState *bs,
>>> uint64_t size);
>>>   static int QEMU_

[Qemu-block] [PATCH RFC v3 1/8] block: move ThrottleGroup membership to ThrottleGroupMember

2017-06-23 Thread Manos Pitsidianakis

This commit gathers ThrottleGroup membership details from
BlockBackendPublic into ThrottleGroupMember and refactors existing code
to use the structure.

Signed-off-by: Manos Pitsidianakis 
---
 block/block-backend.c   |  66 +
 block/qapi.c|   8 +-
 block/throttle-groups.c | 304 
 blockdev.c  |   4 +-
 include/block/throttle-groups.h |  15 +-
 include/qemu/throttle.h |  26 
 include/sysemu/block-backend.h  |  20 +--
 tests/test-throttle.c   |  53 +++
 8 files changed, 260 insertions(+), 236 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index a2bbae90b1..90a7abaa53 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -216,9 +216,9 @@ BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm)
 blk->shared_perm = shared_perm;
 blk_set_enable_write_cache(blk, true);
 
-qemu_co_mutex_init(&blk->public.throttled_reqs_lock);
-qemu_co_queue_init(&blk->public.throttled_reqs[0]);
-qemu_co_queue_init(&blk->public.throttled_reqs[1]);
+qemu_co_mutex_init(&blk->public.throttle_group_member.throttled_reqs_lock);
+qemu_co_queue_init(&blk->public.throttle_group_member.throttled_reqs[0]);
+qemu_co_queue_init(&blk->public.throttle_group_member.throttled_reqs[1]);
 block_acct_init(&blk->stats);
 
 notifier_list_init(&blk->remove_bs_notifiers);
@@ -286,7 +286,7 @@ static void blk_delete(BlockBackend *blk)
 assert(!blk->refcnt);
 assert(!blk->name);
 assert(!blk->dev);
-if (blk->public.throttle_state) {
+if (blk->public.throttle_group_member.throttle_state) {
 blk_io_limits_disable(blk);
 }
 if (blk->root) {
@@ -597,9 +597,12 @@ BlockBackend *blk_by_public(BlockBackendPublic *public)
  */
 void blk_remove_bs(BlockBackend *blk)
 {
+ThrottleTimers *tt;
+
 notifier_list_notify(&blk->remove_bs_notifiers, blk);
-if (blk->public.throttle_state) {
-throttle_timers_detach_aio_context(&blk->public.throttle_timers);
+if (blk->public.throttle_group_member.throttle_state) {
+tt = &blk->public.throttle_group_member.throttle_timers;
+throttle_timers_detach_aio_context(tt);
 }
 
 blk_update_root_state(blk);
@@ -621,9 +624,10 @@ int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, 
Error **errp)
 bdrv_ref(bs);
 
 notifier_list_notify(&blk->insert_bs_notifiers, blk);
-if (blk->public.throttle_state) {
+if (blk->public.throttle_group_member.throttle_state) {
 throttle_timers_attach_aio_context(
-&blk->public.throttle_timers, bdrv_get_aio_context(bs));
+&blk->public.throttle_group_member.throttle_timers,
+bdrv_get_aio_context(bs));
 }
 
 return 0;
@@ -985,8 +989,9 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t 
offset,
 bdrv_inc_in_flight(bs);
 
 /* throttling disk I/O */
-if (blk->public.throttle_state) {
-throttle_group_co_io_limits_intercept(blk, bytes, false);
+if (blk->public.throttle_group_member.throttle_state) {
+
throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
+bytes, false);
 }
 
 ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
@@ -1009,10 +1014,10 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, 
int64_t offset,
 }
 
 bdrv_inc_in_flight(bs);
-
 /* throttling disk I/O */
-if (blk->public.throttle_state) {
-throttle_group_co_io_limits_intercept(blk, bytes, true);
+if (blk->public.throttle_group_member.throttle_state) {
+
throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
+bytes, true);
 }
 
 if (!blk->enable_write_cache) {
@@ -1681,15 +1686,17 @@ static AioContext *blk_aiocb_get_aio_context(BlockAIOCB 
*acb)
 void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
 {
 BlockDriverState *bs = blk_bs(blk);
+ThrottleTimers *tt;
 
 if (bs) {
-if (blk->public.throttle_state) {
-throttle_timers_detach_aio_context(&blk->public.throttle_timers);
+if (blk->public.throttle_group_member.throttle_state) {
+tt = &blk->public.throttle_group_member.throttle_timers;
+throttle_timers_detach_aio_context(tt);
 }
 bdrv_set_aio_context(bs, new_context);
-if (blk->public.throttle_state) {
-throttle_timers_attach_aio_context(&blk->public.throttle_timers,
-   new_context);
+if (blk->public.throttle_group_member.throttle_state) {
+tt = &blk->public.throttle_group_member.throttle_timers;
+throttle_timers_attach_aio_context(tt, new_context);
 }
 }
 }
@@ -1907,33 +1914,34 @@ int blk_commit_all(void)
 /* throttling disk I/O limits */
 void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
 {
-throttle_group

[Qemu-block] [PATCH RFC v3 4/8] block: convert ThrottleGroup to object with QOM

2017-06-23 Thread Manos Pitsidianakis

ThrottleGroup is converted to an object to allow easy runtime
configuration of throttling filter nodes in the BDS graph using QOM.

Signed-off-by: Manos Pitsidianakis 
---
 block/throttle-groups.c | 351 
 include/qemu/throttle.h |   4 +
 2 files changed, 355 insertions(+)

diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index 7883cbb511..60079dc8ea 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -25,9 +25,11 @@
 #include "qemu/osdep.h"
 #include "sysemu/block-backend.h"
 #include "block/throttle-groups.h"
+#include "qemu/throttle-options.h"
 #include "qemu/queue.h"
 #include "qemu/thread.h"
 #include "sysemu/qtest.h"
+#include "qapi/error.h"
 
 /* The ThrottleGroup structure (with its ThrottleState) is shared
  * among different ThrottleGroupMembers and it's independent from
@@ -54,6 +56,7 @@
  * that BlockBackend has throttled requests in the queue.
  */
 typedef struct ThrottleGroup {
+Object parent_obj;
 char *name; /* This is constant during the lifetime of the group */
 
 QemuMutex lock; /* This lock protects the following four fields */
@@ -562,3 +565,351 @@ static void throttle_groups_init(void)
 }
 
 block_init(throttle_groups_init);
+
+
+static bool throttle_group_exists(const char *name)
+{
+ThrottleGroup *iter;
+bool ret = false;
+
+qemu_mutex_lock(&throttle_groups_lock);
+/* Look for an existing group with that name */
+QTAILQ_FOREACH(iter, &throttle_groups, list) {
+if (!strcmp(name, iter->name)) {
+ret = true;
+break;
+}
+}
+
+qemu_mutex_unlock(&throttle_groups_lock);
+return ret;
+}
+
+typedef struct ThrottleGroupClass {
+/* private */
+ObjectClass parent_class;
+/* public */
+} ThrottleGroupClass;
+
+
+#define DOUBLE 0
+#define UINT64 1
+#define UNSIGNED 2
+
+typedef struct {
+BucketType type;
+int size; /* field size */
+ptrdiff_t offset; /* offset in LeakyBucket struct. */
+} ThrottleParamInfo;
+
+static ThrottleParamInfo throttle_iops_total_info = {
+THROTTLE_OPS_TOTAL, DOUBLE, offsetof(LeakyBucket, avg),
+};
+
+static ThrottleParamInfo throttle_iops_total_max_info = {
+THROTTLE_OPS_TOTAL, DOUBLE, offsetof(LeakyBucket, max),
+};
+
+static ThrottleParamInfo throttle_iops_total_max_length_info = {
+THROTTLE_OPS_TOTAL, UNSIGNED, offsetof(LeakyBucket, burst_length),
+};
+
+static ThrottleParamInfo throttle_iops_read_info = {
+THROTTLE_OPS_READ, DOUBLE, offsetof(LeakyBucket, avg),
+};
+
+static ThrottleParamInfo throttle_iops_read_max_info = {
+THROTTLE_OPS_READ, DOUBLE, offsetof(LeakyBucket, max),
+};
+
+static ThrottleParamInfo throttle_iops_read_max_length_info = {
+THROTTLE_OPS_READ, UNSIGNED, offsetof(LeakyBucket, burst_length),
+};
+
+static ThrottleParamInfo throttle_iops_write_info = {
+THROTTLE_OPS_WRITE, DOUBLE, offsetof(LeakyBucket, avg),
+};
+
+static ThrottleParamInfo throttle_iops_write_max_info = {
+THROTTLE_OPS_WRITE, DOUBLE, offsetof(LeakyBucket, max),
+};
+
+static ThrottleParamInfo throttle_iops_write_max_length_info = {
+THROTTLE_OPS_WRITE, UNSIGNED, offsetof(LeakyBucket, burst_length),
+};
+
+static ThrottleParamInfo throttle_bps_total_info = {
+THROTTLE_BPS_TOTAL, DOUBLE, offsetof(LeakyBucket, avg),
+};
+
+static ThrottleParamInfo throttle_bps_total_max_info = {
+THROTTLE_BPS_TOTAL, DOUBLE, offsetof(LeakyBucket, max),
+};
+
+static ThrottleParamInfo throttle_bps_total_max_length_info = {
+THROTTLE_BPS_TOTAL, UNSIGNED, offsetof(LeakyBucket, burst_length),
+};
+
+static ThrottleParamInfo throttle_bps_read_info = {
+THROTTLE_BPS_READ, DOUBLE, offsetof(LeakyBucket, avg),
+};
+
+static ThrottleParamInfo throttle_bps_read_max_info = {
+THROTTLE_BPS_READ, DOUBLE, offsetof(LeakyBucket, max),
+};
+
+static ThrottleParamInfo throttle_bps_read_max_length_info = {
+THROTTLE_BPS_READ, UNSIGNED, offsetof(LeakyBucket, burst_length),
+};
+
+static ThrottleParamInfo throttle_bps_write_info = {
+THROTTLE_BPS_WRITE, DOUBLE, offsetof(LeakyBucket, avg),
+};
+
+static ThrottleParamInfo throttle_bps_write_max_info = {
+THROTTLE_BPS_WRITE, DOUBLE, offsetof(LeakyBucket, max),
+};
+
+static ThrottleParamInfo throttle_bps_write_max_length_info = {
+THROTTLE_BPS_WRITE, UNSIGNED, offsetof(LeakyBucket, burst_length),
+};
+
+static ThrottleParamInfo throttle_iops_size_info = {
+0, UINT64, offsetof(ThrottleConfig, op_size),
+};
+
+
+static void throttle_group_obj_complete(UserCreatable *obj, Error **errp)
+{
+char *name = NULL;
+Error *local_error = NULL;
+ThrottleGroup *tg = THROTTLE_GROUP(obj);
+
+name = object_get_canonical_path_component(OBJECT(obj));
+if (throttle_group_exists(name)) {
+error_setg(&local_error, "A throttle group with this name already \
+  exists.");
+goto ret;
+}
+
+qemu_mutex_lock(&throttle_groups_lock);
+tg->name = name;
+qemu_mute

[Qemu-block] [PATCH RFC v3 2/8] block: Add aio_context field in ThrottleGroupMember

2017-06-23 Thread Manos Pitsidianakis

timer_cb() needs to know about the current AioContext of the throttle
request that is woken up. In order to make ThrottleGroupMember backend
agnostic, this information is stored in an aio_context field instead of
accessing it from BlockBackend.

Signed-off-by: Manos Pitsidianakis 
---
 block/block-backend.c   |  1 +
 block/throttle-groups.c | 19 +--
 include/qemu/throttle.h |  1 +
 tests/test-throttle.c   | 65 +++--
 util/throttle.c |  4 +++
 5 files changed, 48 insertions(+), 42 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 90a7abaa53..1d501ec973 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1928,6 +1928,7 @@ void blk_io_limits_disable(BlockBackend *blk)
 /* should be called before blk_set_io_limits if a limit is set */
 void blk_io_limits_enable(BlockBackend *blk, const char *group)
 {
+blk->public.throttle_group_member.aio_context = blk_get_aio_context(blk);
 assert(!blk->public.throttle_group_member.throttle_state);
 throttle_group_register_tgm(&blk->public.throttle_group_member, group);
 }
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index 5e9d8fb4d6..7883cbb511 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -71,6 +71,7 @@ static QemuMutex throttle_groups_lock;
 static QTAILQ_HEAD(, ThrottleGroup) throttle_groups =
 QTAILQ_HEAD_INITIALIZER(throttle_groups);
 
+
 /* Increments the reference count of a ThrottleGroup given its name.
  *
  * If no ThrottleGroup is found with the given name a new one is
@@ -383,9 +384,6 @@ static void coroutine_fn 
throttle_group_restart_queue_entry(void *opaque)
 
 static void throttle_group_restart_queue(ThrottleGroupMember *tgm, bool 
is_write)
 {
-BlockBackendPublic *blkp = container_of(tgm, BlockBackendPublic,
-throttle_group_member);
-BlockBackend *blk = blk_by_public(blkp);
 Coroutine *co;
 RestartData rd = {
 .tgm = tgm,
@@ -393,7 +391,7 @@ static void 
throttle_group_restart_queue(ThrottleGroupMember *tgm, bool is_write
 };
 
 co = qemu_coroutine_create(throttle_group_restart_queue_entry, &rd);
-aio_co_enter(blk_get_aio_context(blk), co);
+aio_co_enter(tgm->aio_context, co);
 }
 
 void throttle_group_restart_tgm(ThrottleGroupMember *tgm)
@@ -447,13 +445,11 @@ void throttle_group_get_config(ThrottleGroupMember *tgm, 
ThrottleConfig *cfg)
 /* ThrottleTimers callback. This wakes up a request that was waiting because it
  * had been throttled.
  *
- * @blk:   the BlockBackend whose request had been throttled
+ * @tgm:   the ThrottleGroupMember whose request had been throttled
  * @is_write:  the type of operation (read/write)
  */
-static void timer_cb(BlockBackend *blk, bool is_write)
+static void timer_cb(ThrottleGroupMember *tgm, bool is_write)
 {
-BlockBackendPublic *blkp = blk_get_public(blk);
-ThrottleGroupMember *tgm = &blkp->throttle_group_member;
 ThrottleState *ts = tgm->throttle_state;
 ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
 
@@ -487,9 +483,6 @@ void throttle_group_register_tgm(ThrottleGroupMember *tgm,
  const char *groupname)
 {
 int i;
-BlockBackendPublic *blkp = container_of(tgm, BlockBackendPublic,
-throttle_group_member);
-BlockBackend *blk = blk_by_public(blkp);
 ThrottleState *ts = throttle_group_incref(groupname);
 ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
 int clock_type = QEMU_CLOCK_REALTIME;
@@ -512,11 +505,11 @@ void throttle_group_register_tgm(ThrottleGroupMember *tgm,
 QLIST_INSERT_HEAD(&tg->head, tgm, round_robin);
 
 throttle_timers_init(&tgm->throttle_timers,
- blk_get_aio_context(blk),
+ tgm->aio_context,
  clock_type,
  read_timer_cb,
  write_timer_cb,
- blk);
+ tgm);
 
 qemu_mutex_unlock(&tg->lock);
 }
diff --git a/include/qemu/throttle.h b/include/qemu/throttle.h
index e99cbfc865..3e92d4d4eb 100644
--- a/include/qemu/throttle.h
+++ b/include/qemu/throttle.h
@@ -160,6 +160,7 @@ void throttle_account(ThrottleState *ts, bool is_write, 
uint64_t size);
  */
 
 typedef struct ThrottleGroupMember {
+AioContext  *aio_context;
 /* throttled_reqs_lock protects the CoQueues for throttled requests.  */
 CoMutex  throttled_reqs_lock;
 CoQueue  throttled_reqs[2];
diff --git a/tests/test-throttle.c b/tests/test-throttle.c
index 0f95da2592..d3298234aa 100644
--- a/tests/test-throttle.c
+++ b/tests/test-throttle.c
@@ -24,8 +24,9 @@
 static AioContext *ctx;
 static LeakyBucketbkt;
 static ThrottleConfig cfg;
+static ThrottleGroupMember tgm;
 static ThrottleState  ts;
-static ThrottleTimers tt;
+static ThrottleTimers *tt;
 
 /* useful function */
 static bool double_cmp(double x, double y)
@@ -153,

[Qemu-block] [PATCH RFC v3 7/8] block: remove legacy I/O throttling

2017-06-23 Thread Manos Pitsidianakis

This commit removes all I/O throttling from block/block-backend.c. In
order to support the existing interface, it is changed to use the
block/throttle.c filter driver.

The throttle filter node that is created by the legacy interface is
stored in a 'throttle_node' field in the BlockBackendPublic of the
device. The legacy throttle node is managed by the legacy interface
completely. More advanced configurations with the filter driver are
possible using the QMP API, but these will be ignored by the legacy
interface.

Signed-off-by: Manos Pitsidianakis 
---
 block/block-backend.c  | 158 ++---
 block/qapi.c   |   8 +--
 block/throttle.c   |   4 ++
 blockdev.c |  55 ++
 include/sysemu/block-backend.h |   8 +--
 tests/test-throttle.c  |  15 ++--
 util/throttle.c|   4 --
 7 files changed, 160 insertions(+), 92 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 1d501ec973..c777943572 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -216,9 +216,6 @@ BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm)
 blk->shared_perm = shared_perm;
 blk_set_enable_write_cache(blk, true);
 
-qemu_co_mutex_init(&blk->public.throttle_group_member.throttled_reqs_lock);
-qemu_co_queue_init(&blk->public.throttle_group_member.throttled_reqs[0]);
-qemu_co_queue_init(&blk->public.throttle_group_member.throttled_reqs[1]);
 block_acct_init(&blk->stats);
 
 notifier_list_init(&blk->remove_bs_notifiers);
@@ -286,8 +283,8 @@ static void blk_delete(BlockBackend *blk)
 assert(!blk->refcnt);
 assert(!blk->name);
 assert(!blk->dev);
-if (blk->public.throttle_group_member.throttle_state) {
-blk_io_limits_disable(blk);
+if (blk->public.throttle_node) {
+blk_io_limits_disable(blk, &error_abort);
 }
 if (blk->root) {
 blk_remove_bs(blk);
@@ -597,13 +594,7 @@ BlockBackend *blk_by_public(BlockBackendPublic *public)
  */
 void blk_remove_bs(BlockBackend *blk)
 {
-ThrottleTimers *tt;
-
 notifier_list_notify(&blk->remove_bs_notifiers, blk);
-if (blk->public.throttle_group_member.throttle_state) {
-tt = &blk->public.throttle_group_member.throttle_timers;
-throttle_timers_detach_aio_context(tt);
-}
 
 blk_update_root_state(blk);
 
@@ -624,12 +615,6 @@ int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, 
Error **errp)
 bdrv_ref(bs);
 
 notifier_list_notify(&blk->insert_bs_notifiers, blk);
-if (blk->public.throttle_group_member.throttle_state) {
-throttle_timers_attach_aio_context(
-&blk->public.throttle_group_member.throttle_timers,
-bdrv_get_aio_context(bs));
-}
-
 return 0;
 }
 
@@ -987,13 +972,6 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t 
offset,
 }
 
 bdrv_inc_in_flight(bs);
-
-/* throttling disk I/O */
-if (blk->public.throttle_group_member.throttle_state) {
-
throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
-bytes, false);
-}
-
 ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
 bdrv_dec_in_flight(bs);
 return ret;
@@ -1014,11 +992,6 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, 
int64_t offset,
 }
 
 bdrv_inc_in_flight(bs);
-/* throttling disk I/O */
-if (blk->public.throttle_group_member.throttle_state) {
-
throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
-bytes, true);
-}
 
 if (!blk->enable_write_cache) {
 flags |= BDRV_REQ_FUA;
@@ -1686,18 +1659,9 @@ static AioContext *blk_aiocb_get_aio_context(BlockAIOCB 
*acb)
 void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
 {
 BlockDriverState *bs = blk_bs(blk);
-ThrottleTimers *tt;
 
 if (bs) {
-if (blk->public.throttle_group_member.throttle_state) {
-tt = &blk->public.throttle_group_member.throttle_timers;
-throttle_timers_detach_aio_context(tt);
-}
 bdrv_set_aio_context(bs, new_context);
-if (blk->public.throttle_group_member.throttle_state) {
-tt = &blk->public.throttle_group_member.throttle_timers;
-throttle_timers_attach_aio_context(tt, new_context);
-}
 }
 }
 
@@ -1914,45 +1878,115 @@ int blk_commit_all(void)
 /* throttling disk I/O limits */
 void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
 {
-throttle_group_config(&blk->public.throttle_group_member, cfg);
+ThrottleGroupMember *tgm;
+
+assert(blk->public.throttle_node);
+tgm = blk->public.throttle_node->opaque;
+throttle_group_config(tgm, cfg);
 }
 
-void blk_io_limits_disable(BlockBackend *blk)
+void blk_io_limits_disable(BlockBackend *blk, Error **errp)
 {
-assert(blk->public.throttle_group_member.throttle_state);
-bdrv_drained_begin(blk_bs(blk))

[Qemu-block] [PATCH RFC v3 3/8] block: add throttle block filter driver

2017-06-23 Thread Manos Pitsidianakis

block/throttle.c uses existing I/O throttle infrastructure inside a
block filter driver. I/O operations are intercepted in the filter's
read/write coroutines, and forwarded to block/throttle-groups.c.

The driver can be used with the command-line option
-drive driver=throttle,file.filename=foo.qcow2,iops-total=...
The configuration flags and semantics are identical to the hardcoded
throttling ones.

Signed-off-by: Manos Pitsidianakis 
---
 block/Makefile.objs |   1 +
 block/throttle.c| 427 
 include/qemu/throttle-options.h |  60 --
 3 files changed, 469 insertions(+), 19 deletions(-)
 create mode 100644 block/throttle.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index ea955302c8..bb811a4d01 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -25,6 +25,7 @@ block-obj-y += accounting.o dirty-bitmap.o
 block-obj-y += write-threshold.o
 block-obj-y += backup.o
 block-obj-$(CONFIG_REPLICATION) += replication.o
+block-obj-y += throttle.o
 
 block-obj-y += crypto.o
 
diff --git a/block/throttle.c b/block/throttle.c
new file mode 100644
index 00..0c17051161
--- /dev/null
+++ b/block/throttle.c
@@ -0,0 +1,427 @@
+/*
+ * QEMU block throttling filter driver infrastructure
+ *
+ * Copyright (c) 2017 Manos Pitsidianakis
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "block/throttle-groups.h"
+#include "qemu/throttle-options.h"
+#include "qapi/error.h"
+
+
+static QemuOptsList throttle_opts = {
+.name = "throttle",
+.head = QTAILQ_HEAD_INITIALIZER(throttle_opts.head),
+.desc = {
+{
+.name = QEMU_OPT_IOPS_TOTAL,
+.type = QEMU_OPT_NUMBER,
+.help = "limit total I/O operations per second",
+},{
+.name = QEMU_OPT_IOPS_READ,
+.type = QEMU_OPT_NUMBER,
+.help = "limit read operations per second",
+},{
+.name = QEMU_OPT_IOPS_WRITE,
+.type = QEMU_OPT_NUMBER,
+.help = "limit write operations per second",
+},{
+.name = QEMU_OPT_BPS_TOTAL,
+.type = QEMU_OPT_NUMBER,
+.help = "limit total bytes per second",
+},{
+.name = QEMU_OPT_BPS_READ,
+.type = QEMU_OPT_NUMBER,
+.help = "limit read bytes per second",
+},{
+.name = QEMU_OPT_BPS_WRITE,
+.type = QEMU_OPT_NUMBER,
+.help = "limit write bytes per second",
+},{
+.name = QEMU_OPT_IOPS_TOTAL_MAX,
+.type = QEMU_OPT_NUMBER,
+.help = "I/O operations burst",
+},{
+.name = QEMU_OPT_IOPS_READ_MAX,
+.type = QEMU_OPT_NUMBER,
+.help = "I/O operations read burst",
+},{
+.name = QEMU_OPT_IOPS_WRITE_MAX,
+.type = QEMU_OPT_NUMBER,
+.help = "I/O operations write burst",
+},{
+.name = QEMU_OPT_BPS_TOTAL_MAX,
+.type = QEMU_OPT_NUMBER,
+.help = "total bytes burst",
+},{
+.name = QEMU_OPT_BPS_READ_MAX,
+.type = QEMU_OPT_NUMBER,
+.help = "total bytes read burst",
+},{
+.name = QEMU_OPT_BPS_WRITE_MAX,
+.type = QEMU_OPT_NUMBER,
+.help = "total bytes write burst",
+},{
+.name = QEMU_OPT_IOPS_TOTAL_MAX_LENGTH,
+.type = QEMU_OPT_NUMBER,
+.help = "length of the iopstotalmax burst period, in seconds",
+},{
+.name = QEMU_OPT_IOPS_READ_MAX_LENGTH,
+.type = QEMU_OPT_NUMBER,
+.help = "length of the iopsreadmax burst period, in seconds",
+},{
+.name = QEMU_OPT_IOPS_WRITE_MAX_LENGTH,
+.type = QEMU_OPT_NUMBER,
+.help = "length of the iopswritemax burst period, in seconds",
+},{
+.name = QEMU_OPT_BPS_TOTAL_MAX_LENGTH,
+.type = QEMU_OPT_NUMBER,
+.help = "length of the bpstotalmax burst period, in seconds",
+},{
+.name = QEMU_OPT_BPS_READ_MAX_LENGTH,
+.type = QEMU_OPT_NUMBER,
+.help = "length of the bpsreadmax burst period, in seconds",
+},{
+.name = QEMU_OPT_BPS_WRITE_MAX_LENGTH,
+.type = QEMU_OPT_NUMBER,
+.help =

[Qemu-block] [PATCH RFC v3 8/8] block: add throttle block filter driver interface tests

2017-06-23 Thread Manos Pitsidianakis

Signed-off-by: Manos Pitsidianakis 
---
 tests/qemu-iotests/184 | 144 +
 tests/qemu-iotests/184.out |  31 ++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 176 insertions(+)
 create mode 100755 tests/qemu-iotests/184
 create mode 100644 tests/qemu-iotests/184.out

diff --git a/tests/qemu-iotests/184 b/tests/qemu-iotests/184
new file mode 100755
index 00..529f9edbec
--- /dev/null
+++ b/tests/qemu-iotests/184
@@ -0,0 +1,144 @@
+#!/bin/bash
+#
+# Test I/O throttle block filter driver interface
+#
+# Copyright (C) 2017 Manos Pitsidianakis
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+status=1   # failure is the default!
+
+_cleanup()
+{
+_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt raw qcow2
+_supported_proto file
+_supported_os Linux
+
+function do_run_qemu()
+{
+echo Testing: "$@" | _filter_imgfmt
+$QEMU -nographic -qmp stdio -serial none "$@"
+echo
+}
+
+function run_qemu()
+{
+do_run_qemu "$@" 2>&1 | _filter_testdir | _filter_qemu | _filter_qmp\
+  | _filter_qemu_io | _filter_generated_node_ids
+}
+
+_make_test_img 64M
+test_throttle=$($QEMU_IMG --help|grep throttle)
+[ "$test_throttle" = "" ] && _supported_fmt throttle
+
+throttle="driver=throttle"
+
+echo
+echo "== checking interface =="
+
+run_qemu <

1 2 >

1 - 100 of 108 matches

Mail list logo