[Qemu-block] [PATCH v2] xen_disk: split discard input to match internal representation

2016-11-23 Thread Olaf Hering
The guest sends discard requests as u64 sector/count pairs, but the
block layer operates internally with s64/s32 pairs. The conversion
leads to IO errors in the guest: the discard request is not processed.

  domU.cfg:
  'vdev=xvda, format=qcow2, backendtype=qdisk, target=/x.qcow2'
  domU:
  mkfs.ext4 -F /dev/xvda
  Discarding device blocks: failed - Input/output error

Fix this by splitting the request into chunks of BDRV_REQUEST_MAX_SECTORS.
Add input range checking to avoid overflow.
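
For illustration only (not part of the patch), the magnitude of the values
involved can be seen in a standalone snippet; the 10 GiB disk size is just an
assumed example:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      /* A whole-device discard of a 10 GiB disk, as the guest sends it. */
      uint64_t nr_sectors = (10ULL << 30) / 512;   /* 20971520 sectors  */
      uint64_t bytes = nr_sectors << 9;            /* 10737418240 bytes */

      /* The unsplit request passed this value as a signed 32-bit byte
       * count, which it no longer fits into. */
      printf("bytes = %llu, as int32 = %d\n",
             (unsigned long long)bytes, (int)bytes);
      return 0;
  }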

Fixes f313520 ("xen_disk: add discard support")

Signed-off-by: Olaf Hering 
---
v2:
 adjust overflow check
 add Fixes revspec because the initial commit also failed to convert u64 to s32
 adjust summary

 hw/block/xen_disk.c | 42 --
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index 3a7dc19..8d74b69 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -660,6 +660,38 @@ static void qemu_aio_complete(void *opaque, int ret)
     qemu_bh_schedule(ioreq->blkdev->bh);
 }
 
+static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t sector_number,
+                              uint64_t nr_sectors)
+{
+    struct XenBlkDev *blkdev = ioreq->blkdev;
+    int64_t byte_offset;
+    int byte_chunk;
+    uint64_t byte_remaining, limit;
+    uint64_t sec_start = sector_number;
+    uint64_t sec_count = nr_sectors;
+
+    /* Wrap around, or overflowing byte limit? */
+    if (sec_start + sec_count < sec_count ||
+	sec_start + sec_count > INT64_MAX >> BDRV_SECTOR_BITS) {
+        return false;
+    }
+
+    limit = BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS;
+    byte_offset = sec_start << BDRV_SECTOR_BITS;
+    byte_remaining = sec_count << BDRV_SECTOR_BITS;
+
+    do {
+        byte_chunk = byte_remaining > limit ? limit : byte_remaining;
+        ioreq->aio_inflight++;
+        blk_aio_pdiscard(blkdev->blk, byte_offset, byte_chunk,
+                         qemu_aio_complete, ioreq);
+        byte_remaining -= byte_chunk;
+        byte_offset += byte_chunk;
+    } while (byte_remaining > 0);
+
+    return true;
+}
+
 static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
 {
     struct XenBlkDev *blkdev = ioreq->blkdev;
@@ -708,12 +740,10 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
         break;
     case BLKIF_OP_DISCARD:
     {
-        struct blkif_request_discard *discard_req = (void *)&ioreq->req;
-        ioreq->aio_inflight++;
-        blk_aio_pdiscard(blkdev->blk,
-                         discard_req->sector_number << BDRV_SECTOR_BITS,
-                         discard_req->nr_sectors << BDRV_SECTOR_BITS,
-                         qemu_aio_complete, ioreq);
+        struct blkif_request_discard *req = (void *)&ioreq->req;
+        if (!blk_split_discard(ioreq, req->sector_number, req->nr_sectors)) {
+            goto err;
+        }
         break;
     }
     default:



Re: [Qemu-block] [Qemu-devel] [PATCH v2] xen_disk: split discard input to match internal representation

2016-11-23 Thread no-reply
Hi,

Your series seems to have some coding style problems. See output below for
more information:

Subject: [Qemu-devel] [PATCH v2] xen_disk: split discard input to match internal representation
Type: series
Message-id: 20161123094914.15675-1-o...@aepfle.de

=== TEST SCRIPT BEGIN ===
#!/bin/bash

BASE=base
n=1
total=$(git log --oneline $BASE.. | wc -l)
failed=0

# Useful git options
git config --local diff.renamelimit 0
git config --local diff.renames True

commits="$(git log --format=%H --reverse $BASE..)"
for c in $commits; do
echo "Checking PATCH $n/$total: $(git log -n 1 --format=%s $c)..."
if ! git show $c --format=email | ./scripts/checkpatch.pl --mailback -; then
failed=1
echo
fi
n=$((n+1))
done

exit $failed
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 * [new tag] 
patchew/1479896194-32672-1-git-send-email-fanc.f...@cn.fujitsu.com -> 
patchew/1479896194-32672-1-git-send-email-fanc.f...@cn.fujitsu.com
 * [new tag] patchew/20161123094914.15675-1-o...@aepfle.de -> 
patchew/20161123094914.15675-1-o...@aepfle.de
Switched to a new branch 'test'
947512f xen_disk: split discard input to match internal representation

=== OUTPUT BEGIN ===
Checking PATCH 1/1: xen_disk: split discard input to match internal representation...
ERROR: code indent should never use tabs
#45: FILE: hw/block/xen_disk.c:675:
+^Isec_start + sec_count > INT64_MAX >> BDRV_SECTOR_BITS) {$

total: 1 errors, 0 warnings, 54 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

=== OUTPUT END ===

Test command exited with code: 1


---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@freelists.org

[Qemu-block] [PATCH v3] xen_disk: split discard input to match internal representation

2016-11-23 Thread Olaf Hering
The guest sends discard requests as u64 sector/count pairs, but the
block layer operates internally with s64/s32 pairs. The conversion
leads to IO errors in the guest: the discard request is not processed.

  domU.cfg:
  'vdev=xvda, format=qcow2, backendtype=qdisk, target=/x.qcow2'
  domU:
  mkfs.ext4 -F /dev/xvda
  Discarding device blocks: failed - Input/output error

Fix this by splitting the request into chunks of BDRV_REQUEST_MAX_SECTORS.
Add input range checking to avoid overflow.

Fixes f313520 ("xen_disk: add discard support")

Signed-off-by: Olaf Hering 
---
v3:
 turn tab into spaces to fix checkpatch warning
v2:
 adjust overflow check
 add Fixes revspec because the initial commit also failed to convert u64 to s32
 adjust summary

 hw/block/xen_disk.c | 42 --
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index 3a7dc19..456a2d5 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -660,6 +660,38 @@ static void qemu_aio_complete(void *opaque, int ret)
     qemu_bh_schedule(ioreq->blkdev->bh);
 }
 
+static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t sector_number,
+                              uint64_t nr_sectors)
+{
+    struct XenBlkDev *blkdev = ioreq->blkdev;
+    int64_t byte_offset;
+    int byte_chunk;
+    uint64_t byte_remaining, limit;
+    uint64_t sec_start = sector_number;
+    uint64_t sec_count = nr_sectors;
+
+    /* Wrap around, or overflowing byte limit? */
+    if (sec_start + sec_count < sec_count ||
+        sec_start + sec_count > INT64_MAX >> BDRV_SECTOR_BITS) {
+        return false;
+    }
+
+    limit = BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS;
+    byte_offset = sec_start << BDRV_SECTOR_BITS;
+    byte_remaining = sec_count << BDRV_SECTOR_BITS;
+
+    do {
+        byte_chunk = byte_remaining > limit ? limit : byte_remaining;
+        ioreq->aio_inflight++;
+        blk_aio_pdiscard(blkdev->blk, byte_offset, byte_chunk,
+                         qemu_aio_complete, ioreq);
+        byte_remaining -= byte_chunk;
+        byte_offset += byte_chunk;
+    } while (byte_remaining > 0);
+
+    return true;
+}
+
 static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
 {
     struct XenBlkDev *blkdev = ioreq->blkdev;
@@ -708,12 +740,10 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
         break;
     case BLKIF_OP_DISCARD:
     {
-        struct blkif_request_discard *discard_req = (void *)&ioreq->req;
-        ioreq->aio_inflight++;
-        blk_aio_pdiscard(blkdev->blk,
-                         discard_req->sector_number << BDRV_SECTOR_BITS,
-                         discard_req->nr_sectors << BDRV_SECTOR_BITS,
-                         qemu_aio_complete, ioreq);
+        struct blkif_request_discard *req = (void *)&ioreq->req;
+        if (!blk_split_discard(ioreq, req->sector_number, req->nr_sectors)) {
+            goto err;
+        }
         break;
     }
     default:



Re: [Qemu-block] [PATCH] xen_disk: convert discard input to byte ranges

2016-11-23 Thread Olaf Hering
On Wed, Nov 23, Olaf Hering wrote:

> > > +if (!blk_split_discard(ioreq, req->sector_number, req->nr_sectors)) {
> > > +goto err;
> > How is error handling supposed to work here?

In the guest the cmd is stuck, instead of getting an IO error:

[   91.966404] mkfs.ext4   D  0  2878   2831 0x
[   91.966406]  88002204bc48 880030530480 88002fae5800 88002204c000
[   91.966407]   7fff 8000 024000c0
[   91.966409]  88002204bc60 815dd985 880038815c00 88002204bd08
[   91.966409] Call Trace:
[   91.966413]  [] schedule+0x35/0x80
[   91.966416]  [] schedule_timeout+0x237/0x2d0
[   91.966419]  [] io_schedule_timeout+0xa6/0x110
[   91.966421]  [] wait_for_completion_io+0xa3/0x110
[   91.966425]  [] submit_bio_wait+0x50/0x60
[   91.966430]  [] blkdev_issue_discard+0x78/0xb0
[   91.966433]  [] blk_ioctl_discard+0x7b/0xa0
[   91.966436]  [] blkdev_ioctl+0x730/0x920
[   91.966440]  [] block_ioctl+0x3d/0x40
[   91.966444]  [] do_vfs_ioctl+0x2cd/0x4a0
[   91.966453]  [] SyS_ioctl+0x74/0x80
[   91.966456]  [] entry_SYSCALL_64_fastpath+0x12/0x6d

Olaf




Re: [Qemu-block] [PATCH] xen_disk: convert discard input to byte ranges

2016-11-23 Thread Olaf Hering
Ping.

On Fri, Nov 18, Olaf Hering wrote:

> On Fri, Nov 18, Olaf Hering wrote:
> 
> > @@ -708,12 +743,10 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
> > +if (!blk_split_discard(ioreq, req->sector_number, req->nr_sectors)) {
> > +goto err;
> 
> How is error handling supposed to work here?
> Initially I forgot the "!", which just hung the mkfs command in domU.
> I have not yet tried to force errors in other part of that function.


Olaf




Re: [Qemu-block] [Qemu-devel] [PATCH for-2.8 v3] xen_disk: split discard input to match internal representation

2016-11-23 Thread Eric Blake
On 11/23/2016 04:39 AM, Olaf Hering wrote:
> The guest sends discard requests as u64 sector/count pairs, but the
> block layer operates internally with s64/s32 pairs. The conversion
> leads to IO errors in the guest, the discard request is not processed.
> 
>   domU.cfg:
>   'vdev=xvda, format=qcow2, backendtype=qdisk, target=/x.qcow2'
>   domU:
>   mkfs.ext4 -F /dev/xvda
>   Discarding device blocks: failed - Input/output error
> 
> Fix this by splitting the request into chunks of BDRV_REQUEST_MAX_SECTORS.
> Add input range checking to avoid overflow.
> 
> Fixes f313520 ("xen_disk: add discard support")
> 
> Signed-off-by: Olaf Hering 
> ---

Qualifies as a bug fix, so requesting 2.8 inclusion.
Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org





Re: [Qemu-block] [RFC PATCH] throttle: move throttling cmdline options to a separate header file

2016-11-23 Thread Greg Kurz
On Tue, 22 Nov 2016 16:02:12 +0100
Pradeep Kiruvale  wrote:

> On 22 November 2016 at 14:55, Greg Kurz  wrote:
> 
> > On Tue, 22 Nov 2016 14:51:12 +0100
> > Alberto Garcia  wrote:
> >
> > > On Tue 22 Nov 2016 01:49:51 PM CET, Greg Kurz wrote:
> > >
> > > > This will allow other subsystems (i.e. fsdev) to implement throttling
> > > > without duplicating the command line options.
> > > >
> > > > Signed-off-by: Greg Kurz 
> > >
> > > Did you check if it's possible / easy to do defining a normal array
> > > instead of a macro and using qemu_opts_append() or something similar?
> > >
> > > Berto
> >
> > No I didn't due to lack of time but maybe Pradeep may have a look in this
> > direction ?
> >
> 
> Only may be next year :)
> 

Ok, and also, it will be the occasion to add the throttling options to
the legacy -virtfs command line option as well.

Cheers.

--
Greg

> -Pradeep
> 
> 
> >
> > Cheers.
> >
> > --
> > Greg
> >




Re: [Qemu-block] [RFC PATCH] throttle: move throttling cmdline options to a separate header file

2016-11-23 Thread Pradeep Kiruvale
On 23 November 2016 at 13:13, Greg Kurz  wrote:

> On Tue, 22 Nov 2016 16:02:12 +0100
> Pradeep Kiruvale  wrote:
>
> > On 22 November 2016 at 14:55, Greg Kurz  wrote:
> >
> > > On Tue, 22 Nov 2016 14:51:12 +0100
> > > Alberto Garcia  wrote:
> > >
> > > > On Tue 22 Nov 2016 01:49:51 PM CET, Greg Kurz wrote:
> > > >
> > > > > This will allow other subsystems (i.e. fsdev) to implement
> throttling
> > > > > without duplicating the command line options.
> > > > >
> > > > > Signed-off-by: Greg Kurz 
> > > >
> > > > Did you check if it's possible / easy to do defining a normal array
> > > > instead of a macro and using qemu_opts_append() or something similar?
> > > >
> > > > Berto
> > >
> > > No I didn't due to lack of time but maybe Pradeep may have a look in
> this
> > > direction ?
> > >
> >
> > Only may be next year :)
> >
>
> Ok, and also, it will be the occasion to add the throttling options to
> the legacy -virtfs command line option as well.
>

Ok, also I think I have to add QMP interfaces for 9p case.

-Pradeep


> Cheers.
>
> --
> Greg
>
> > -Pradeep
> >
> >
> > >
> > > Cheers.
> > >
> > > --
> > > Greg
> > >
>
>


Re: [Qemu-block] [Qemu-devel] [PATCH for-2.8 v3] xen_disk: split discard input to match internal representation

2016-11-23 Thread Kevin Wolf
On 23.11.2016 at 12:40, Eric Blake wrote:
> On 11/23/2016 04:39 AM, Olaf Hering wrote:
> > The guest sends discard requests as u64 sector/count pairs, but the
> > block layer operates internally with s64/s32 pairs. The conversion
> > leads to IO errors in the guest, the discard request is not processed.
> > 
> >   domU.cfg:
> >   'vdev=xvda, format=qcow2, backendtype=qdisk, target=/x.qcow2'
> >   domU:
> >   mkfs.ext4 -F /dev/xvda
> >   Discarding device blocks: failed - Input/output error
> > 
> > Fix this by splitting the request into chunks of BDRV_REQUEST_MAX_SECTORS.
> > Add input range checking to avoid overflow.
> > 
> > Fixes f313520 ("xen_disk: add discard support")
> > 
> > Signed-off-by: Olaf Hering 
> > ---
> 
> Qualifies as a bug fix, so requesting 2.8 inclusion.
> Reviewed-by: Eric Blake 

Stefano, are you going to merge this or should I take a look?

Kevin




Re: [Qemu-block] [PATCH v1 01/18] block/io: add bdrv_aio_{preadv, pwritev}

2016-11-23 Thread Kevin Wolf
On 15.11.2016 at 07:36, Pavel Butsykin wrote:
> It's just byte-based wrappers over bdrv_co_aio_prw_vector(), which provide
>  a byte-based interface for AIO read/write.
> 
> Signed-off-by: Pavel Butsykin 

I'm in the process of phasing out the last users of bdrv_aio_*() so that
this set of interfaces can be removed. I'm doing this because it's an
unnecessary redundancy; we have too many wrapper functions that expose
the same functionality with different syntax. So let's not add new
users.

At first sight, you don't even seem to use bdrv_aio_preadv() for actual
parallelism, but you often have a pattern like this:

void foo_cb(void *opaque)
{
...
qemu_coroutine_enter(acb->co);
}

void caller()
{
...
acb = bdrv_aio_preadv(...);
qemu_coroutine_yield();
}

The code will actually become a lot simpler if you use bdrv_co_preadv()
instead because you don't have to have a callback, but you get pure
sequential code.
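
For illustration, a minimal sketch of the sequential variant described
above; the function and parameter names are placeholders, not taken from
the series:

static coroutine_fn int caller_co(BlockDriverState *bs, uint64_t offset,
                                  uint64_t bytes, QEMUIOVector *qiov,
                                  int flags)
{
    /* No callback and no explicit yield: bdrv_co_preadv() returns only
     * once the read has completed (or failed). */
    return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
}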

The part that actually has some parallelism, pcache_readahead_request(),
already creates its own coroutine, so it runs in the background without
using callback-style interfaces.

Kevin



Re: [Qemu-block] [PATCH v1 02/18] block/pcache: empty pcache driver filter

2016-11-23 Thread Kevin Wolf
On 15.11.2016 at 07:36, Pavel Butsykin wrote:
> The basic version of pcache driver for easy preparation of a patch set.
> 
> Signed-off-by: Pavel Butsykin 
> ---
>  block/Makefile.objs |   1 +
>  block/pcache.c  | 144 
> 
>  2 files changed, 145 insertions(+)
>  create mode 100644 block/pcache.c
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 7d4031d..c60f680 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -4,6 +4,7 @@ block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o 
> qed-cluster.o
>  block-obj-y += qed-check.o
>  block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o
>  block-obj-y += quorum.o
> +block-obj-y += pcache.o
>  block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o
>  block-obj-y += block-backend.o snapshot.o qapi.o
>  block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
> diff --git a/block/pcache.c b/block/pcache.c
> new file mode 100644
> index 000..59461df
> --- /dev/null
> +++ b/block/pcache.c
> @@ -0,0 +1,144 @@
> +/*
> + * Prefetch cache driver filter
> + *
> + * Copyright (C) 2015-2016 Parallels IP Holdings GmbH. All rights reserved.

"All rights reserved" doesn't really agree with licensing under the GPL.
It would be preferable to remove this note to avoid any ambiguity.

> + *
> + * Author: Pavel Butsykin 
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "block/block_int.h"
> +#include "qapi/error.h"
> +#include "qapi/qmp/qstring.h"
> +
> +
> +static QemuOptsList runtime_opts = {
> +.name = "pcache",
> +.head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
> +.desc = {
> +{
> +.name = "x-image",
> +.type = QEMU_OPT_STRING,
> +.help = "[internal use only, will be removed]",
> +},

blkdebug/blkverify have this because they have to support legacy syntax
like -drive file=blkdebug:blkdebug.conf:test.img, i.e. they have to deal
with filenames.

Here we don't have to support a legacy syntax, so I would completely
avoid this from the beginning. You already support the "image" option,
which should be good enough.

> +{ /* end of list */ }
> +},
> +};
> +
> +typedef struct PCacheAIOCB {
> +Coroutine *co;
> +int ret;
> +} PCacheAIOCB;
> +
> +static void pcache_aio_cb(void *opaque, int ret)
> +{
> +PCacheAIOCB *acb = opaque;
> +
> +acb->ret = ret;
> +qemu_coroutine_enter(acb->co);
> +}
> +
> +static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t 
> offset,
> + uint64_t bytes, QEMUIOVector *qiov,
> + int flags)
> +{
> +PCacheAIOCB acb = {
> +.co = qemu_coroutine_self(),
> +};
> +
> +bdrv_aio_preadv(bs->file, offset, qiov, bytes, pcache_aio_cb, &acb);
> +
> +qemu_coroutine_yield();
> +
> +return acb.ret;
> +}

As I commented on patch 1, all of this can be replaced by a single line:

return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);

> +static coroutine_fn int pcache_co_pwritev(BlockDriverState *bs, uint64_t 
> offset,
> +  uint64_t bytes, QEMUIOVector *qiov,
> +  int flags)
> +{
> +PCacheAIOCB acb = {
> +.co = qemu_coroutine_self(),
> +};
> +
> +bdrv_aio_pwritev(bs->file, offset, qiov, bytes, pcache_aio_cb, &acb);
> +
> +qemu_coroutine_yield();
> +
> +return acb.ret;
> +}

Same here.

> +static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags,
> +Error **errp)
> +{
> +QemuOpts *opts;
> +Error *local_err = NULL;
> +int ret = 0;
> +
> +opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
> +qemu_opts_absorb_qdict(opts, options, &local_err);
> +if (local_err) {
> +error_propagate(errp, local_err);
> +ret = -EINVAL;
> +goto fail;
> +}
> +
> +assert(bs->file == NULL);
> +bs->file = bdrv_open_child(qemu_opt_get(opts, "x-image"), options,
> +   "image", bs, &child_format, false, 
> &local_err);

When removing x-image, the first parameter becomes NULL here.
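
For clarity, the resulting call would presumably look like this (a sketch
only, based on the quoted code above):

bs->file = bdrv_open_child(NULL, options, "image", bs, &child_format,
                           false, &local_err);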

> +if (local_err) {
> +ret = -EINVAL;
> +error_propagate(errp, local_err);
> +}
> +fail:
> +qemu_opts_del(opts);
> +return ret;
> +}
> +
> +static void pcache_close(BlockDriverState *bs)
> +{
> +}
> +
> +static void pcache_parse_filename(const char *filename, QDict *options,
> +  Error **errp)
> +{
> +qdict_put(options, "x-image", qstring_from_str(filename));
> +}

This one goes away.

> +static int64_t pcache_getlength(BlockDriverState *bs)
> +{
> +return bdrv_getlength(bs->file->bs);
> +}
> +
> +static bool pcache_recurse_is_first_non_filter(B

Re: [Qemu-block] [PATCH v3] xen_disk: split discard input to match internal representation

2016-11-23 Thread Anthony PERARD
On Wed, Nov 23, 2016 at 10:39:12AM +, Olaf Hering wrote:
> The guest sends discard requests as u64 sector/count pairs, but the
> block layer operates internally with s64/s32 pairs. The conversion
> leads to IO errors in the guest, the discard request is not processed.
> 
>   domU.cfg:
>   'vdev=xvda, format=qcow2, backendtype=qdisk, target=/x.qcow2'
>   domU:
>   mkfs.ext4 -F /dev/xvda
>   Discarding device blocks: failed - Input/output error
> 
> Fix this by splitting the request into chunks of BDRV_REQUEST_MAX_SECTORS.
> Add input range checking to avoid overflow.
> 
> Fixes f313520 ("xen_disk: add discard support")
> 
> Signed-off-by: Olaf Hering 

Acked-by: Anthony PERARD 

-- 
Anthony PERARD



Re: [Qemu-block] [Qemu-devel] [PATCH for-2.8 v3] xen_disk: split discard input to match internal representation

2016-11-23 Thread Stefano Stabellini
On Wed, 23 Nov 2016, Kevin Wolf wrote:
> On 23.11.2016 at 12:40, Eric Blake wrote:
> > On 11/23/2016 04:39 AM, Olaf Hering wrote:
> > > The guest sends discard requests as u64 sector/count pairs, but the
> > > block layer operates internally with s64/s32 pairs. The conversion
> > > leads to IO errors in the guest, the discard request is not processed.
> > > 
> > >   domU.cfg:
> > >   'vdev=xvda, format=qcow2, backendtype=qdisk, target=/x.qcow2'
> > >   domU:
> > >   mkfs.ext4 -F /dev/xvda
> > >   Discarding device blocks: failed - Input/output error
> > > 
> > > Fix this by splitting the request into chunks of BDRV_REQUEST_MAX_SECTORS.
> > > Add input range checking to avoid overflow.
> > > 
> > > Fixes f313520 ("xen_disk: add discard support")
> > > 
> > > Signed-off-by: Olaf Hering 
> > > ---
> > 
> > Qualifies as a bug fix, so requesting 2.8 inclusion.
> > Reviewed-by: Eric Blake 
> 
> Stefano, are you going to merge this or should I take a look?

I can merge it.

Cheers,

Stefano



Re: [Qemu-block] [PATCH v3] xen_disk: split discard input to match internal representation

2016-11-23 Thread Stefano Stabellini
On Wed, 23 Nov 2016, Olaf Hering wrote:
> The guest sends discard requests as u64 sector/count pairs, but the
> block layer operates internally with s64/s32 pairs. The conversion
> leads to IO errors in the guest, the discard request is not processed.
> 
>   domU.cfg:
>   'vdev=xvda, format=qcow2, backendtype=qdisk, target=/x.qcow2'
>   domU:
>   mkfs.ext4 -F /dev/xvda
>   Discarding device blocks: failed - Input/output error
> 
> Fix this by splitting the request into chunks of BDRV_REQUEST_MAX_SECTORS.
> Add input range checking to avoid overflow.
> 
> Fixes f313520 ("xen_disk: add discard support")
> 
> Signed-off-by: Olaf Hering 

Reviewed-by: Stefano Stabellini 


> v3:
>  turn tab into spaces to fix checkpatch warning
> v2:
>  adjust overflow check
>  add Fixes revspec because the initial commit also failed to convert u64 to 
> s32
>  adjust summary
> 
>  hw/block/xen_disk.c | 42 --
>  1 file changed, 36 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
> index 3a7dc19..456a2d5 100644
> --- a/hw/block/xen_disk.c
> +++ b/hw/block/xen_disk.c
> @@ -660,6 +660,38 @@ static void qemu_aio_complete(void *opaque, int ret)
>  qemu_bh_schedule(ioreq->blkdev->bh);
>  }
>  
> +static bool blk_split_discard(struct ioreq *ioreq, blkif_sector_t sector_number,
> +                              uint64_t nr_sectors)
> +{
> +struct XenBlkDev *blkdev = ioreq->blkdev;
> +int64_t byte_offset;
> +int byte_chunk;
> +uint64_t byte_remaining, limit;
> +uint64_t sec_start = sector_number;
> +uint64_t sec_count = nr_sectors;
> +
> +/* Wrap around, or overflowing byte limit? */
> +if (sec_start + sec_count < sec_count ||
> +sec_start + sec_count > INT64_MAX >> BDRV_SECTOR_BITS) {
> +return false;
> +}
> +
> +limit = BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS;
> +byte_offset = sec_start << BDRV_SECTOR_BITS;
> +byte_remaining = sec_count << BDRV_SECTOR_BITS;
> +
> +do {
> +byte_chunk = byte_remaining > limit ? limit : byte_remaining;
> +ioreq->aio_inflight++;
> +blk_aio_pdiscard(blkdev->blk, byte_offset, byte_chunk,
> + qemu_aio_complete, ioreq);
> +byte_remaining -= byte_chunk;
> +byte_offset += byte_chunk;
> +} while (byte_remaining > 0);
> +
> +return true;
> +}
> +
>  static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
>  {
>  struct XenBlkDev *blkdev = ioreq->blkdev;
> @@ -708,12 +740,10 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
>  break;
>  case BLKIF_OP_DISCARD:
>  {
> -struct blkif_request_discard *discard_req = (void *)&ioreq->req;
> -ioreq->aio_inflight++;
> -blk_aio_pdiscard(blkdev->blk,
> - discard_req->sector_number << BDRV_SECTOR_BITS,
> - discard_req->nr_sectors << BDRV_SECTOR_BITS,
> - qemu_aio_complete, ioreq);
> +struct blkif_request_discard *req = (void *)&ioreq->req;
> +if (!blk_split_discard(ioreq, req->sector_number, req->nr_sectors)) {
> +goto err;
> +}
>  break;
>  }
>  default:
> 



Re: [Qemu-block] [PATCH] xen_disk: convert discard input to byte ranges

2016-11-23 Thread Stefano Stabellini
On Wed, 23 Nov 2016, Olaf Hering wrote:
> On Wed, Nov 23, Olaf Hering wrote:
> 
> > > > +if (!blk_split_discard(ioreq, req->sector_number, 
> > > > req->nr_sectors)) {
> > > > +goto err;
> > > How is error handling supposed to work here?
> 
> In the guest the cmd is stuck, instead of getting an IO error:
> 
> [   91.966404] mkfs.ext4   D  0  2878   2831 
> 0x
> [   91.966406]  88002204bc48 880030530480 88002fae5800 
> 88002204c000
> [   91.966407]   7fff 8000 
> 024000c0
> [   91.966409]  88002204bc60 815dd985 880038815c00 
> 88002204bd08
> [   91.966409] Call Trace:
> [   91.966413]  [] schedule+0x35/0x80
> [   91.966416]  [] schedule_timeout+0x237/0x2d0
> [   91.966419]  [] io_schedule_timeout+0xa6/0x110
> [   91.966421]  [] wait_for_completion_io+0xa3/0x110
> [   91.966425]  [] submit_bio_wait+0x50/0x60
> [   91.966430]  [] blkdev_issue_discard+0x78/0xb0
> [   91.966433]  [] blk_ioctl_discard+0x7b/0xa0
> [   91.966436]  [] blkdev_ioctl+0x730/0x920
> [   91.966440]  [] block_ioctl+0x3d/0x40
> [   91.966444]  [] do_vfs_ioctl+0x2cd/0x4a0
> [   91.966453]  [] SyS_ioctl+0x74/0x80
> [   91.966456]  [] entry_SYSCALL_64_fastpath+0x12/0x6d

The error should be sent back to the frontend via the status field. Not
sure why blkfront is not handling it correctly.
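
For reference, a rough sketch of how a blkif backend reports such a
failure via the status field; the surrounding variable names are assumed
here, this is not the exact xen_disk code:

blkif_response_t *rsp = RING_GET_RESPONSE(&blkdev->rings.native,
                                          blkdev->rings.native.rsp_prod_pvt++);
rsp->id        = ioreq->req.id;
rsp->operation = ioreq->req.operation;
rsp->status    = BLKIF_RSP_ERROR;   /* frontend should turn this into -EIO */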



Re: [Qemu-block] [Qemu-devel] [PATCH for-2.8 v3] xen_disk: split discard input to match internal representation

2016-11-23 Thread Olaf Hering
On 23 November 2016 13:27:13 CET, Kevin Wolf  wrote:
>On 23.11.2016 at 12:40, Eric Blake wrote:

>> Qualifies as a bug fix, so requesting 2.8 inclusion.
>> Reviewed-by: Eric Blake 

Is this a can for 2.x?

Olaf 




Re: [Qemu-block] [PATCH v1 04/18] util/rbcache: range-based cache core

2016-11-23 Thread Kevin Wolf
On 15.11.2016 at 07:37, Pavel Butsykin wrote:
> RBCache provides functionality to cache data from block devices
> (basically). A range is used as the main key for searching and storing
> data. The cache is based on red-black trees, so the basic operations
> (search, insert, delete) are performed in O(log n).
> 
> It is important to note that QEMU usually does not require a data cache, but
> in practice there are already cases where caching small amounts of data can
> increase performance. Red-black trees were chosen as the data structure
> because they are fairly simple and perform well for a small number of
> elements. Therefore, with a minimum range of 512 bytes, the recommended
> cache size is no more than 8-16 MB. Also note that this cache implementation
> can store ranges of different lengths, without alignment.
> 
> The generic cache core can easily be used to implement different caching
> policies at the block level, such as read-ahead. It can also be used in some
> special cases, for example to cache data in qcow2 during sequential
> allocating writes to an image with a backing file.
> 
> Signed-off-by: Pavel Butsykin 
> ---
>  MAINTAINERS|   6 ++
>  include/qemu/rbcache.h | 105 +
>  util/Makefile.objs |   1 +
>  util/rbcache.c | 246 
> +
>  4 files changed, 358 insertions(+)
>  create mode 100644 include/qemu/rbcache.h
>  create mode 100644 util/rbcache.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index ddf797b..cb74802 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1365,6 +1365,12 @@ F: include/qemu/rbtree.h
>  F: include/qemu/rbtree_augmented.h
>  F: util/rbtree.c
>  
> +Range-Based Cache
> +M: Denis V. Lunev 
> +S: Supported
> +F: include/qemu/rbcache.h
> +F: util/rbcache.c
> +
>  UUID
>  M: Fam Zheng 
>  S: Supported
> diff --git a/include/qemu/rbcache.h b/include/qemu/rbcache.h
> new file mode 100644
> index 000..c8f0a9f
> --- /dev/null
> +++ b/include/qemu/rbcache.h
> @@ -0,0 +1,105 @@
> +/*
> + * QEMU Range-Based Cache core
> + *
> + * Copyright (C) 2015-2016 Parallels IP Holdings GmbH. All rights reserved.
> + *
> + * Author: Pavel Butsykin 
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +
> +#ifndef RBCACHE_H
> +#define RBCACHE_H
> +
> +#include "qemu/rbtree.h"
> +#include "qemu/queue.h"
> +
> +typedef struct RBCacheNode {
> +struct RbNode rb_node;
> +uint64_t offset;
> +uint64_t bytes;
> +QTAILQ_ENTRY(RBCacheNode) entry;
> +} RBCacheNode;
> +
> +typedef struct RBCache RBCache;
> +
> +typedef RBCacheNode *RBNodeAlloc(uint64_t offset, uint64_t bytes, void 
> *opaque);
> +typedef void RBNodeFree(RBCacheNode *node, void *opaque);

Maybe worth comments describing what these functions do apart from
g_new()/g_free()? I assume that offset and bytes must be initialised
from the parameters. Should rb_node and entry be zeroed?
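
For illustration, a guess at the minimal allocator/free pair the typedefs
seem to imply (the function names are made up; initialisation of rb_node
and entry is assumed to be done by the cache core on insert):

static RBCacheNode *my_node_alloc(uint64_t offset, uint64_t bytes, void *opaque)
{
    RBCacheNode *node = g_new0(RBCacheNode, 1);

    node->offset = offset;
    node->bytes  = bytes;
    return node;
}

static void my_node_free(RBCacheNode *node, void *opaque)
{
    g_free(node);
}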

> +
> +enum eviction_type {
> +RBCACHE_FIFO,
> +RBCACHE_LRU,
> +};
> +
> +/**
> + * rbcache_search:
> + * @rbcache: the cache object.
> + * @offset: the start of the range.
> + * @bytes: the size of the range.
> + *
> + * Returns the node corresponding to the range(offset, bytes),
> + * or NULL if the node was not found.
> + */
> +void *rbcache_search(RBCache *rbcache, uint64_t offset, uint64_t bytes);

What if the range covers multiple nodes? Is it defined which of the
nodes we return or do you just get any?

Why does this function (and the following ones) return void* rather than
RBCacheNode* if they are supposed to return a node?

> +/**
> + * rbcache_insert:
> + * @rbcache: the cache object.
> + * @node: a new node for the cache.
> + *
> + * Returns the new node, or old node if the node already exists.
> + */
> +void *rbcache_insert(RBCache *rbcache, RBCacheNode *node);

What does "if the node already exists" mean? If @node (the very same
object) is already stored in the cache object, or if a node describing
the same range already exists?

> +/**
> + * rbcache_search_and_insert:
> + * @rbcache: the cache object.
> + * @offset: the start of the range.
> + * @bytes: the size of the range.
> + *
> + * rbcache_search_and_insert() is like rbcache_insert(), except that a new 
> node
> + * is allocated inside the function. Returns the new node, or old node if the
> + * node already exists.
> + */
> +void *rbcache_search_and_insert(RBCache *rbcache, uint64_t offset,
> +uint64_t byte);

What happens if a node exists, but only for part of the range?

> +/**
> + * rbcache_remove:
> + * @rbcache: the cache object.
> + * @node: the node to remove.
> + *
> + * Removes the cached range owned by the node.
> + */
> +void rbcache_remove(RBCache *rbcache, RBCacheNode *node);
> +
> +RBCacheNode *rbcache_node_alloc(RBCache *rbcache, uint64_t offset,
> +

Re: [Qemu-block] [Qemu-devel] [PATCH for-2.8 v3] xen_disk: split discard input to match internal representation

2016-11-23 Thread Olaf Hering
On 23 November 2016 21:44:50 CET, Olaf Hering  wrote:

>Is this a can for 2.x?
 candidate 


Olaf