[Qemu-block] [PATCH for 2.8 02/11] virtio: convert to use DMA api

2016-08-29 Thread Jason Wang
Currently, all virtio devices bypass the IOMMU completely. This is because
address_space_memory is assumed and used during DMA emulation. This
patch converts the virtio core API to use the DMA API. The idea is:

- introduce a new transport-specific helper to query the DMA address
  space (only the PCI version is implemented).
- query and use this address space for virtio device accesses to guest
  memory when the IOMMU platform feature (VIRTIO_F_IOMMU_PLATFORM) is
  enabled for the device.
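
As a rough sketch (assuming a virtio_get_dma_as()-style helper built on the
get_dma_as transport hook added below; the wrapper name here is illustrative,
not code from this patch), a virtio core access to guest memory routed
through the per-device DMA address space could look like this:

#include "hw/virtio/virtio.h"
#include "sysemu/dma.h"

static void virtio_dma_read_sketch(VirtIODevice *vdev, hwaddr pa,
                                   void *buf, dma_addr_t len)
{
    /* Use the transport-provided address space instead of
     * address_space_memory, so the access goes through the IOMMU
     * when VIRTIO_F_IOMMU_PLATFORM has been negotiated. */
    AddressSpace *dma_as = virtio_get_dma_as(vdev);

    dma_memory_read(dma_as, pa, buf, len);
}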

Cc: Michael S. Tsirkin 
Cc: Stefan Hajnoczi 
Cc: Kevin Wolf 
Cc: Amit Shah 
Cc: Paolo Bonzini 
Cc: qemu-block@nongnu.org
Signed-off-by: Jason Wang 
---
 hw/block/virtio-blk.c |  2 +-
 hw/char/virtio-serial-bus.c   |  3 +-
 hw/scsi/virtio-scsi.c |  4 ++-
 hw/virtio/virtio-pci.c| 14 +
 hw/virtio/virtio.c| 62 ---
 include/hw/virtio/virtio-access.h | 43 ---
 include/hw/virtio/virtio-bus.h|  1 +
 include/hw/virtio/virtio.h|  8 +++--
 8 files changed, 98 insertions(+), 39 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 331d766..8fd6df7 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -856,7 +856,7 @@ static int virtio_blk_load_device(VirtIODevice *vdev, 
QEMUFile *f,
 }
 }
 
-req = qemu_get_virtqueue_element(f, sizeof(VirtIOBlockReq));
+req = qemu_get_virtqueue_element(vdev, f, sizeof(VirtIOBlockReq));
 virtio_blk_init_request(s, virtio_get_queue(vdev, vq_idx), req);
 req->next = s->rq;
 s->rq = req;
diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
index db57a38..94f19ba 100644
--- a/hw/char/virtio-serial-bus.c
+++ b/hw/char/virtio-serial-bus.c
@@ -682,6 +682,7 @@ static void virtio_serial_post_load_timer_cb(void *opaque)
 static int fetch_active_ports_list(QEMUFile *f,
VirtIOSerial *s, uint32_t nr_active_ports)
 {
+VirtIODevice *vdev = VIRTIO_DEVICE(s);
 uint32_t i;
 
 s->post_load = g_malloc0(sizeof(*s->post_load));
@@ -715,7 +716,7 @@ static int fetch_active_ports_list(QEMUFile *f,
 qemu_get_be64s(f, &port->iov_offset);
 
 port->elem =
-qemu_get_virtqueue_element(f, sizeof(VirtQueueElement));
+qemu_get_virtqueue_element(vdev, f, sizeof(VirtQueueElement));
 
 /*
  *  Port was throttled on source machine.  Let's
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index ce57ef6..4cc7627 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -197,12 +197,14 @@ static void *virtio_scsi_load_request(QEMUFile *f, 
SCSIRequest *sreq)
 SCSIBus *bus = sreq->bus;
 VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus);
 VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+VirtIODevice *vdev = VIRTIO_DEVICE(s);
 VirtIOSCSIReq *req;
 uint32_t n;
 
 qemu_get_be32s(f, &n);
 assert(n < vs->conf.num_queues);
-req = qemu_get_virtqueue_element(f, sizeof(VirtIOSCSIReq) + vs->cdb_size);
+req = qemu_get_virtqueue_element(vdev, f,
+ sizeof(VirtIOSCSIReq) + vs->cdb_size);
 virtio_scsi_init_req(s, vs->cmd_vqs[n], req);
 
 if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICmdReq) + vs->cdb_size,
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 755f921..c10bf55 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1162,6 +1162,14 @@ static int virtio_pci_query_nvectors(DeviceState *d)
 return proxy->nvectors;
 }
 
+static AddressSpace *virtio_pci_get_dma_as(DeviceState *d)
+{
+VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
+PCIDevice *dev = &proxy->pci_dev;
+
+return pci_get_address_space(dev);
+}
+
 static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy,
struct virtio_pci_cap *cap)
 {
@@ -1587,6 +1595,11 @@ static void virtio_pci_device_plugged(DeviceState *d, 
Error **errp)
 }
 
 if (legacy) {
+if (virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM)) {
+error_setg(errp, "VIRTIO_F_IOMMU_PLATFORM is supported by "
+   "neither legacy nor transitional devices");
+return;
+}
 /* legacy and transitional */
 pci_set_word(config + PCI_SUBSYSTEM_VENDOR_ID,
  pci_get_word(config + PCI_VENDOR_ID));
@@ -2452,6 +2465,7 @@ static void virtio_pci_bus_class_init(ObjectClass *klass, 
void *data)
 k->ioeventfd_disabled = virtio_pci_ioeventfd_disabled;
 k->ioeventfd_set_disabled = virtio_pci_ioeventfd_set_disabled;
 k->ioeventfd_assign = virtio_pci_ioeventfd_assign;
+k->get_dma_as = virtio_pci_get_dma_as;
 }
 
 static const TypeInfo virtio_pci_bus_info = {
diff --git a/hw/virtio/virtio.c 

[Qemu-block] [PATCH RFC v2 01/22] block/pcache: empty pcache driver filter

2016-08-29 Thread Pavel Butsykin
A basic skeleton of the pcache driver, to make the rest of the patch set
easier to prepare and review.

Signed-off-by: Pavel Butsykin 
---
 block/Makefile.objs |   1 +
 block/pcache.c  | 156 
 2 files changed, 157 insertions(+)
 create mode 100644 block/pcache.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 2593a2f..7c588ac 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -4,6 +4,7 @@ block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o 
qed-cluster.o
 block-obj-y += qed-check.o
 block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o
 block-obj-y += quorum.o
+block-obj-y += pcache.o
 block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o
 block-obj-y += block-backend.o snapshot.o qapi.o
 block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
diff --git a/block/pcache.c b/block/pcache.c
new file mode 100644
index 000..770bbc0
--- /dev/null
+++ b/block/pcache.c
@@ -0,0 +1,156 @@
+/*
+ * Prefetch cache driver filter
+ *
+ * Copyright (c) 2016 Pavel Butsykin 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block_int.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qstring.h"
+
+
+static const AIOCBInfo pcache_aiocb_info = {
+.aiocb_size = sizeof(BlockAIOCB),
+};
+
+static QemuOptsList runtime_opts = {
+.name = "pcache",
+.head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+.desc = {
+{
+.name = "x-image",
+.type = QEMU_OPT_STRING,
+.help = "[internal use only, will be removed]",
+},
+{ /* end of list */ }
+},
+};
+
+static void pcache_aio_cb(void *opaque, int ret)
+{
+
+BlockAIOCB *acb = opaque;
+
+acb->cb(acb->opaque, ret);
+
+qemu_aio_unref(acb);
+}
+
+static BlockAIOCB *pcache_aio_readv(BlockDriverState *bs,
+int64_t sector_num,
+QEMUIOVector *qiov,
+int nb_sectors,
+BlockCompletionFunc *cb,
+void *opaque)
+{
+BlockAIOCB *acb = qemu_aio_get(&pcache_aiocb_info, bs, cb, opaque);
+
+bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors,
+   pcache_aio_cb, acb);
+return acb;
+}
+
+static BlockAIOCB *pcache_aio_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockCompletionFunc *cb,
+ void *opaque)
+{
+BlockAIOCB *acb = qemu_aio_get(&pcache_aiocb_info, bs, cb, opaque);
+
+bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors,
+pcache_aio_cb, acb);
+return acb;
+}
+
+static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags,
+Error **errp)
+{
+QemuOpts *opts;
+Error *local_err = NULL;
+int ret = 0;
+
+opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+qemu_opts_absorb_qdict(opts, options, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+ret = -EINVAL;
+goto fail;
+}
+
+assert(bs->file == NULL);
+bs->file = bdrv_open_child(qemu_opt_get(opts, "x-image"), options,
+   "image", bs, &child_format, false, &local_err);
+if (local_err) {
+ret = -EINVAL;
+error_propagate(errp, local_err);
+}
+fail:
+qemu_opts_del(opts);
+return ret;
+}
+
+static void pcache_close(BlockDriverState *bs)
+{
+}
+
+static void pcache_parse_filename(const char *filename, QDict *options,
+  Error **errp)
+{
+qdict_put(options, "x-image", qstring_from_str(filename));
+}
+
+static int64_t 

[Qemu-block] [PATCH RFC v2 19/22] block/pcache: add pcache node assert

2016-08-29 Thread Pavel Butsykin
When a node assertion fails, print the fields of the pcache node; this can
be useful for catching bugs.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 52 +++-
 1 file changed, 35 insertions(+), 17 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index 287156a..7b4a9a9 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -38,6 +38,24 @@
 #define DPRINTF(fmt, ...) do { } while (0)
 #endif
 
+#define NODE_PRINT(_node) \
+printf("node:\n"  \
+   "num: %jd size: %d\n"   \
+   "ref: %d\nstatus: %d\n" \
+   "node_wait_cnt: %d\n"   \
+   "data: %p\nlock %u\n",  \
+   (_node)->cm.sector_num, (_node)->cm.nb_sectors,\
+   (_node)->ref, (_node)->status, (_node)->wait.cnt,  \
+   (_node)->data, (_node)->lock.locked)
+
+#define NODE_ASSERT(_assert, _node) \
+do {\
+if (!(_assert)) {   \
+NODE_PRINT(_node);  \
+assert(_assert);\
+}   \
+} while (0)
+
 typedef struct RbNodeKey {
 uint64_tnum;
 uint32_tsize;
@@ -201,7 +219,7 @@ enum {
 static inline void pcache_node_unref(BDRVPCacheState *s, PCNode *node)
 {
 if (atomic_fetch_dec(&node->ref) == 0) {
-assert(node->status == NODE_REMOVE_STATUS);
+NODE_ASSERT(node->status == NODE_REMOVE_STATUS, node);
 
 node->status = NODE_GHOST_STATUS;
 
@@ -217,8 +235,8 @@ static inline void pcache_node_unref(BDRVPCacheState *s, 
PCNode *node)
 
 static inline PCNode *pcache_node_ref(PCNode *node)
 {
-assert(node->status == NODE_SUCCESS_STATUS ||
-   node->status == NODE_WAIT_STATUS);
+NODE_ASSERT(node->status == NODE_SUCCESS_STATUS ||
+node->status == NODE_WAIT_STATUS, node);
 atomic_inc(&node->ref);
 
 return node;
@@ -422,8 +440,8 @@ static PrefCachePartReq *pcache_req_get(PrefCacheAIOCB 
*acb, PCNode *node)
 req->node = node;
 req->acb = acb;
 
-assert(acb->sector_num <= node->cm.sector_num + node->cm.nb_sectors);
-
+NODE_ASSERT(acb->sector_num <= node->cm.sector_num + node->cm.nb_sectors,
+node);
 qemu_iovec_init(&req->qiov, 1);
 qemu_iovec_add(&req->qiov, node->data,
node->cm.nb_sectors << BDRV_SECTOR_BITS);
@@ -554,10 +572,10 @@ static inline void pcache_node_read_wait(PrefCacheAIOCB 
*acb, PCNode *node)
 
 static void pcache_node_read(PrefCacheAIOCB *acb, PCNode* node)
 {
-assert(node->status == NODE_SUCCESS_STATUS ||
-   node->status == NODE_WAIT_STATUS||
-   node->status == NODE_REMOVE_STATUS);
-assert(node->data != NULL);
+NODE_ASSERT(node->status == NODE_SUCCESS_STATUS ||
+node->status == NODE_WAIT_STATUS||
+node->status == NODE_REMOVE_STATUS, node);
+NODE_ASSERT(node->data != NULL, node);
 
 qemu_co_mutex_lock(&node->lock);
 if (node->status == NODE_WAIT_STATUS) {
@@ -694,13 +712,13 @@ static void 
pcache_complete_acb_wait_queue(BDRVPCacheState *s, PCNode *node)
 
 pcache_node_read_buf(wait_acb, node);
 
-assert(node->ref != 0);
+NODE_ASSERT(node->ref != 0, node);
 pcache_node_unref(s, node);
 
 complete_aio_request(wait_acb);
 atomic_dec(&node->wait.cnt);
 }
-assert(atomic_read(&node->wait.cnt) == 0);
+NODE_ASSERT(atomic_read(&node->wait.cnt) == 0, node);
 }
 
 static void pcache_node_submit(PrefCachePartReq *req)
@@ -709,8 +727,8 @@ static void pcache_node_submit(PrefCachePartReq *req)
 BDRVPCacheState *s = req->acb->s;
 
 assert(node != NULL);
-assert(atomic_read(&node->ref) != 0);
-assert(node->data != NULL);
+NODE_ASSERT(atomic_read(&node->ref) != 0, node);
+NODE_ASSERT(node->data != NULL, node);
 
 qemu_co_mutex_lock(&node->lock);
 if (node->status == NODE_WAIT_STATUS) {
@@ -733,7 +751,7 @@ static void pcache_merge_requests(PrefCacheAIOCB *acb)
 QTAILQ_REMOVE(&acb->requests.list, req, entry);
 
 assert(req != NULL);
-assert(node->status == NODE_WAIT_STATUS);
+NODE_ASSERT(node->status == NODE_WAIT_STATUS, node);
 
 pcache_node_submit(req);
 
@@ -768,7 +786,7 @@ static void pcache_try_node_drop(PrefCacheAIOCB *acb)
 return;
 }
 if (node->status != NODE_WAIT_STATUS) {
-assert(node->status == NODE_SUCCESS_STATUS);
+NODE_ASSERT(node->status == NODE_SUCCESS_STATUS, node);
 pcache_node_drop(s, node);
 }
 key.num = node->cm.sector_num + node->cm.nb_sectors;
@@ -1081,8 +1099,8 @@ fail:
 
 static void pcache_node_check_and_free(BDRVPCacheState *s, PCNode *node)
 {
-assert(node->status == NODE_SUCCESS_STATUS);
-assert(node->ref == 0);
+NODE_ASSERT(node->status == NODE_SUCCESS_STATUS, node);
+NODE_ASSERT(node->ref == 0, node);
 
 node->status = NODE_REMOVE_STATUS;
 rb_erase(&node->cm.rb_node, &s->pcache.tree.root);
-- 
2.8.3




[Qemu-block] [PATCH RFC v2 13/22] block/pcache: add generic request complete

2016-08-29 Thread Pavel Butsykin
This change allows us to generalize the completion of all requests.
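
The idea, roughly: every sub-request issued on behalf of one guest request
holds a reference on the ACB, and only the last completion fires the guest
callback. A minimal standalone sketch of the pattern (simplified types and
plain increments instead of the atomic_dec_fetch() used in the hunks below):

typedef struct Acb {
    int ref;                          /* starts at 1 for the request itself */
    void (*cb)(void *opaque, int ret);
    void *opaque;
    int ret;
} Acb;

static void acb_ref(Acb *acb)
{
    acb->ref++;                       /* one reference per extra sub-request */
}

static void acb_unref(Acb *acb)
{
    if (--acb->ref == 0) {            /* last completion reports the result */
        acb->cb(acb->opaque, acb->ret);
    }
}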

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index 435f2b4..1ff4c6a 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -105,6 +105,7 @@ typedef struct PrefCacheAIOCB {
 CoMutex lock;
 uint32_t cnt;
 } requests;
+uint32_t ref;
 QEMUBH   *bh;
 int  ret;
 } PrefCacheAIOCB;
@@ -505,9 +506,11 @@ static void pcache_aio_bh(void *opaque)
 
 static void complete_aio_request(PrefCacheAIOCB *acb)
 {
-acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs),
- pcache_aio_bh, acb);
-qemu_bh_schedule(acb->bh);
+if (atomic_dec_fetch(&acb->ref) == 0) {
+acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs),
+ pcache_aio_bh, acb);
+qemu_bh_schedule(acb->bh);
+}
 }
 
 static void pcache_node_submit(PrefCachePartReq *req)
@@ -585,9 +588,7 @@ static void pcache_aio_cb(void *opaque, int ret)
 pcache_merge_requests(acb);
 }
 
-acb->common.cb(acb->common.opaque, ret);
-
-qemu_aio_unref(acb);
+complete_aio_request(acb);
 }
 
 static PrefCacheAIOCB *pcache_aio_get(BlockDriverState *bs, int64_t sector_num,
@@ -603,6 +604,7 @@ static PrefCacheAIOCB *pcache_aio_get(BlockDriverState *bs, 
int64_t sector_num,
 acb->requests.cnt = 0;
 acb->qiov = qiov;
 acb->aio_type = type;
+acb->ref = 1;
 acb->ret = 0;
 
 QTAILQ_INIT(&acb->requests.list);
-- 
2.8.3




[Qemu-block] [PATCH RFC v2 21/22] block/pcache: add write through node

2016-08-29 Thread Pavel Butsykin
Write-through is another way to keep the cache up to date. Even though this
case should be rare, writing the data into the node is simpler than dropping
the node.
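
In other words, when a guest write overlaps a cached node, the overlapping
bytes are copied from the write buffer into the node instead of invalidating
it. A minimal sketch of that update (flat buffers instead of QEMUIOVector and
no locking, so a simplification of the pcache_node_rw_buf() change below):

#include <stdint.h>
#include <string.h>

static void node_write_through(uint8_t *node_data, uint64_t node_start,
                               uint64_t node_len,
                               const uint8_t *wbuf, uint64_t wstart,
                               uint64_t wlen)
{
    uint64_t start = wstart > node_start ? wstart : node_start;
    uint64_t node_end = node_start + node_len;
    uint64_t wend = wstart + wlen;
    uint64_t end = wend < node_end ? wend : node_end;

    if (start >= end) {
        return;                     /* no overlap, nothing to update */
    }
    /* keep the cached copy in sync with what the guest just wrote */
    memcpy(node_data + (start - node_start),
           wbuf + (start - wstart),
           end - start);
}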

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 28 
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index c5fe689..2b2edf5 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -200,6 +200,8 @@ static QemuOptsList runtime_opts = {
 #define PCACHE_DEFAULT_POOL_STAT_SIZE (1 << MB_BITS)
 #define PCACHE_DEFAULT_MAX_AIO_SIZE (32 << KB_BITS)
 
+#define PCACHE_WRITE_THROUGH_NODE TRUE
+
 enum {
 NODE_SUCCESS_STATUS = 0,
 NODE_WAIT_STATUS= 1,
@@ -538,7 +540,12 @@ static uint64_t ranges_overlap_size(uint64_t node1, 
uint32_t size1,
 return MIN(node1 + size1, node2 + size2) - MAX(node1, node2);
 }
 
-static inline void pcache_node_read_buf(PrefCacheAIOCB *acb, PCNode* node)
+enum {
+NODE_READ_BUF  = 1,
+NODE_WRITE_BUF = 2
+};
+
+static void pcache_node_rw_buf(PrefCacheAIOCB *acb, PCNode* node, uint32_t 
type)
 {
 uint64_t qiov_offs = 0, node_offs = 0;
 uint32_t size;
@@ -554,8 +561,9 @@ static inline void pcache_node_read_buf(PrefCacheAIOCB 
*acb, PCNode* node)
<< BDRV_SECTOR_BITS;
 
 qemu_co_mutex_lock(>lock); /* XXX: use rw lock */
-copy = \
-qemu_iovec_from_buf(acb->qiov, qiov_offs, node->data + node_offs, 
size);
+copy = type & NODE_READ_BUF ?
+qemu_iovec_from_buf(acb->qiov, qiov_offs, node->data + node_offs, size)
+: qemu_iovec_to_buf(acb->qiov, qiov_offs, node->data + node_offs, 
size);
 qemu_co_mutex_unlock(>lock);
 assert(copy == size);
 }
@@ -586,7 +594,7 @@ static void pcache_node_read(PrefCacheAIOCB *acb, PCNode* 
node)
 }
 qemu_co_mutex_unlock(>lock);
 
-pcache_node_read_buf(acb, node);
+pcache_node_rw_buf(acb, node, NODE_READ_BUF);
 pcache_node_unref(acb->s, node);
 }
 
@@ -712,7 +720,7 @@ static void pcache_complete_acb_wait_queue(BDRVPCacheState 
*s, PCNode *node,
 g_slice_free1(sizeof(*link), link);
 
 if (ret == 0) {
-pcache_node_read_buf(wait_acb, node);
+pcache_node_rw_buf(wait_acb, node, NODE_READ_BUF);
 } else {  /* write only fail, because next request can rewrite error */
 wait_acb->ret = ret;
 }
@@ -761,7 +769,7 @@ static void pcache_merge_requests(PrefCacheAIOCB *acb)
 if (acb->ret == 0) {
 pcache_node_submit(req);
 if (!(acb->aio_type & PCACHE_AIO_READAHEAD)) {
-pcache_node_read_buf(acb, node);
+pcache_node_rw_buf(acb, node, NODE_READ_BUF);
 }
 } else {
 pcache_node_drop(acb->s, node);
@@ -774,7 +782,7 @@ static void pcache_merge_requests(PrefCacheAIOCB *acb)
 qemu_co_mutex_unlock(>requests.lock);
 }
 
-static void pcache_try_node_drop(PrefCacheAIOCB *acb)
+static void pcache_update_node_state(PrefCacheAIOCB *acb)
 {
 BDRVPCacheState *s = acb->s;
 RbNodeKey key;
@@ -793,7 +801,11 @@ static void pcache_try_node_drop(PrefCacheAIOCB *acb)
 }
 if (node->status != NODE_WAIT_STATUS) {
 NODE_ASSERT(node->status == NODE_SUCCESS_STATUS, node);
+#if PCACHE_WRITE_THROUGH_NODE
+pcache_node_rw_buf(acb, node, NODE_WRITE_BUF);
+#else
 pcache_node_drop(s, node);
+#endif
 }
 key.num = node->cm.sector_num + node->cm.nb_sectors;
 
@@ -820,7 +832,7 @@ static void pcache_aio_cb(void *opaque, int ret)
 return;
 }
 } else {/* PCACHE_AIO_WRITE */
-pcache_try_node_drop(acb); /* XXX: use write through */
+pcache_update_node_state(acb);
 }
 
 complete_aio_request(acb);
-- 
2.8.3




[Qemu-block] [PATCH RFC v2 03/22] util/rbtree: add rbtree from linux kernel

2016-08-29 Thread Pavel Butsykin
Why don't we use the rbtree from glib? We need a pointer to the parent node.
To store cached chunks in the rbtree efficiently we need to find the next and
previous nodes, and having the parent pointer makes these operations cheap.
In this rbtree implementation (unlike glib's) each node contains a pointer to
its parent. Moreover, this rbtree is more flexible to work with, because to
use it you implement your own insert and search cores; this avoids callbacks
and the heavy performance cost they bring.
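
For illustration, a search core written against this API looks roughly like
the following (MyNode and the integer key are made up for the example; the
series builds its own search on overlapping sector ranges instead):

struct MyNode {
    struct RbNode rb;
    uint64_t key;
};

static struct MyNode *my_tree_search(struct RbRoot *root, uint64_t key)
{
    struct RbNode *n = root->rb_node;

    while (n) {
        struct MyNode *node = RB_ENTRY(n, struct MyNode, rb);

        if (key < node->key) {
            n = n->rb_left;
        } else if (key > node->key) {
            n = n->rb_right;
        } else {
            return node;
        }
    }
    return NULL;
}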

Signed-off-by: Pavel Butsykin 
---
 include/qemu/rbtree.h   | 109 
 include/qemu/rbtree_augmented.h | 237 +
 util/Makefile.objs  |   1 +
 util/rbtree.c   | 570 
 4 files changed, 917 insertions(+)
 create mode 100644 include/qemu/rbtree.h
 create mode 100644 include/qemu/rbtree_augmented.h
 create mode 100644 util/rbtree.c

diff --git a/include/qemu/rbtree.h b/include/qemu/rbtree.h
new file mode 100644
index 000..c87a46f
--- /dev/null
+++ b/include/qemu/rbtree.h
@@ -0,0 +1,109 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli 
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/include/linux/rbtree.h
+
+  To use rbtrees you'll have to implement your own insert and search cores.
+  This will avoid us to use callbacks and to drop drammatically performances.
+  I know it's not the cleaner way,  but in C (not in C++) to get
+  performances and genericity...
+
+  See Documentation/rbtree.txt for documentation and samples.
+*/
+
+#ifndef QEMU_RBTREE_H
+#define QEMU_RBTREE_H
+
+#include 
+#include 
+#include 
+
+struct RbNode {
+uintptr_t __rb_parent_color;
+struct RbNode *rb_right;
+struct RbNode *rb_left;
+} __attribute__((aligned(sizeof(uintptr_t;
+/* The alignment might seem pointless, but allegedly CRIS needs it */
+
+struct RbRoot {
+struct RbNode *rb_node;
+};
+
+
+#define RB_PARENT(r) ((struct RbNode *)((r)->__rb_parent_color & ~3))
+
+#define RB_ROOT (struct RbRoot) { NULL, }
+#define RB_ENTRY(ptr, type, member) container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
+
+/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
+#define RB_EMPTY_NODE(node)  \
+((node)->__rb_parent_color == (uintptr_t)(node))
+#define RB_CLEAR_NODE(node)  \
+((node)->__rb_parent_color = (uintptr_t)(node))
+
+
+extern void rb_insert_color(struct RbNode *, struct RbRoot *);
+extern void rb_erase(struct RbNode *, struct RbRoot *);
+
+
+/* Find logical next and previous nodes in a tree */
+extern struct RbNode *rb_next(const struct RbNode *);
+extern struct RbNode *rb_prev(const struct RbNode *);
+extern struct RbNode *rb_first(const struct RbRoot *);
+extern struct RbNode *rb_last(const struct RbRoot *);
+
+/* Postorder iteration - always visit the parent after its children */
+extern struct RbNode *rb_first_postorder(const struct RbRoot *);
+extern struct RbNode *rb_next_postorder(const struct RbNode *);
+
+/* Fast replacement of a single node without remove/rebalance/add/rebalance */
+extern void rb_replace_node(struct RbNode *victim, struct RbNode *new,
+struct RbRoot *root);
+
+static inline void rb_link_node(struct RbNode *node, struct RbNode *parent,
+struct RbNode **rb_link)
+{
+node->__rb_parent_color = (uintptr_t)parent;
+node->rb_left = node->rb_right = NULL;
+
+*rb_link = node;
+}
+
+#define RB_ENTRY_SAFE(ptr, type, member) \
+({ typeof(ptr) ptr = (ptr);  \
+   ptr ? rb_entry(ptr, type, member) : NULL; \
+})
+
+/**
+ * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of
+ * given type safe against removal of rb_node entry
+ *
+ * @pos:   the 'type *' to use as a loop cursor.
+ * @n: another 'type *' to use as temporary storage
+ * @root:  'rb_root *' of the rbtree.
+ * @field: the name of the rb_node field within 'type'.
+ */
+#define RBTREE_POSTORDER_FOR_EACH_ENTRY_SAFE(pos, n, root, field)\
+for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); 

[Qemu-block] [PATCH RFC v2 11/22] add QEMU style defines for __sync_add_and_fetch

2016-08-29 Thread Pavel Butsykin
Signed-off-by: Pavel Butsykin 
---
 include/qemu/atomic.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
index 7e13fca..7087d0f 100644
--- a/include/qemu/atomic.h
+++ b/include/qemu/atomic.h
@@ -152,6 +152,10 @@
 _old;   \
 })
 
+/* Provide shorter names for GCC atomic builtins, return new value */
+#define atomic_inc_fetch(ptr)  __sync_add_and_fetch(ptr, 1, __ATOMIC_SEQ_CST)
+#define atomic_dec_fetch(ptr)  __sync_add_and_fetch(ptr, -1, __ATOMIC_SEQ_CST)
+
 /* Provide shorter names for GCC atomic builtins, return old value */
 #define atomic_fetch_inc(ptr)  __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST)
 #define atomic_fetch_dec(ptr)  __atomic_fetch_sub(ptr, 1, __ATOMIC_SEQ_CST)
@@ -346,6 +350,10 @@
 #endif
 #endif
 
+/* Provide shorter names for GCC atomic builtins, return new value */
+#define atomic_inc_fetch(ptr)  __sync_add_and_fetch(ptr, 1)
+#define atomic_dec_fetch(ptr)  __sync_add_and_fetch(ptr, -1)
+
 /* Provide shorter names for GCC atomic builtins.  */
 #define atomic_fetch_inc(ptr)  __sync_fetch_and_add(ptr, 1)
 #define atomic_fetch_dec(ptr)  __sync_fetch_and_add(ptr, -1)
-- 
2.8.3




[Qemu-block] [PATCH RFC v2 00/22] I/O prefetch cache

2016-08-29 Thread Pavel Butsykin
The prefetch cache aims to improve the performance of sequential data reads.
Of most interest here are small sequential read requests; such requests can be
optimized by extending them and moving the extra data into the prefetch cache.
However, there are two issues:
 - In aggregate only a small portion of requests is sequential, so the delays
   caused by reading larger volumes of data can lead to an overall decrease
   in performance.
 - With a large number of random requests the cache fills up with redundant
   data.
This pcache implementation solves these and other problems of prefetching
data. The pcache algorithm can be summarised by the following main steps.

1. Monitor I/O requests to identify typical sequences.
This implementation of the prefetch cache works at the storage system level
and only has information about the physical block addresses of I/O requests.
Statistics are collected only from read requests up to a maximum size of 32KB
(by default); each request that matches the criteria falls into a pool of
requests. The request statistics are stored in an rb-tree (lreq.tree), a
simple but, for this purpose, quite efficient data structure.

2. Identifying sequential I/O streams.
For each read request we try to pick up from lreq.tree the chain of requests
of which this request would be the next element. The key used to search for
consecutive requests is the area of sectors preceding the current request.
The size of this area should not be too small, to avoid false readahead.
Sequential streams can be identified even among a large number of random
requests. For example, if the accessed blocks are 100, 1157, 27520, 4, 101,
312, 1337, 102, then while processing request 102 the chain of sequential
requests 100, 101, 102 will be identified, and a decision to do readahead can
be made. A situation may also arise where multiple applications A, B, C
simultaneously perform sequential reads. Each application on its own reads
sequentially, A(100, 101, 102), B(300, 301, 302), C(700, 701, 702), but to
the block device it may look like random reading:
100,300,700,101,301,701,102,302,702. In this case the sequential streams will
still be recognised, because the placement of the requests in the rb-tree
separates the sequential I/O streams.
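
Illustrative only: a "was the preceding area read recently?" check could be
built on the same sector-range key the series uses (the lookback window below
and the helper name are assumptions for the example, not code from a patch):

#include <stdint.h>

typedef struct RbNodeKey {
    uint64_t num;
    uint32_t size;
} RbNodeKey;

static inline void build_lookback_key(uint64_t sector_num, uint32_t lookback,
                                      RbNodeKey *key)
{
    /* The key covers the sectors immediately before the current request;
     * a hit in the request-statistics tree means that area was recently
     * read, i.e. the current request continues a sequential stream. */
    key->num  = sector_num < lookback ? 0 : sector_num - lookback;
    key->size = sector_num < lookback ? (uint32_t)sector_num : lookback;
}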

3. Do the readahead into the cache for recognized sequential data streams.
Once a pcache case has been detected, larger requests are needed to bring
data into the cache. In this implementation pcache uses readahead instead of
extending the request, so the original request goes through as is. There is
no reason to put data into the cache that will never be picked up, but that
would always happen with extended requests. The areas of cached blocks are
also stored in an rb-tree (pcache.tree), a simple but, for this purpose,
quite efficient data structure.

4. Control the size of the prefetch cache pool and the request statistics pool.
To bound the request statistics pool, request data is placed and replaced
according to the FIFO principle; everything is simple there. To bound the
cache memory an LRU list is used, which limits the maximum amount of memory
that pcache can allocate. The LRU is there mainly to prevent eviction of
cache blocks that have only been read partially. The main path is that memory
is released immediately after use: as soon as a chunk of cache memory has
been completely read, it is dropped, since the probability of the same
request repeating is very low. Cases where the same portion of cache memory
is read several times are not optimized, and are not among the cases pcache
can optimize. Thus, using a small amount of cache memory and optimizing the
readahead and memory-release operations, we can read entire volumes of data
with a 100% cache hit rate, without decreasing the effectiveness of random
read requests.

PCache is implemented as a QEMU block filter driver and has several
configurable parameters: total cache size, readahead size, and the maximum
size of a request that can be processed.

Several test cases with different sequential and random read patterns on an
SSD disk were used for performance evaluation. Here are the test results and
the QEMU parameters:

qemu parameters: 
-M pc-i440fx-2.4 --enable-kvm -smp 4 -m 1024 
-drive file=centos7.qcow2,if=none,id=drive-virtio-disk0,format=qcow2,cache=none,
   aio=native,pcache-full-size=4MB,pcache-readahead-size=128KB,
   pcache-max-aio-size=32KB
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x8,drive=drive-virtio-disk0,
id=virtio-disk0
(-set device.virtio-disk0.x-data-plane=on)


* Testcase* Results in iops*
* 

[Qemu-block] [PATCH RFC v2 17/22] block/pcache: skip readahead for non-sequential requests

2016-08-29 Thread Pavel Butsykin
Random reads would trigger a lot of readahead, resulting in a loss of
performance. To avoid this, check whether the request continues a sequential
stream before doing the readahead. It also makes no sense to cache
non-sequential requests, because a cache hit on this data is very unlikely.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 141 +++--
 1 file changed, 138 insertions(+), 3 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index ae7ac8d..7a317fc 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -73,6 +73,10 @@ typedef struct PCNode {
 CoMutex  lock;
 } PCNode;
 
+typedef struct LRNode {
+BlockNode cm;
+} LRNode;
+
 typedef struct ReqStor {
 struct {
 struct RbRoot root;
@@ -91,10 +95,12 @@ typedef struct BDRVPCacheState {
 BlockDriverState **bs;
 
 ReqStor pcache;
+ReqStor lreq;
 
 struct {
 uint32_t cache_size;
 uint32_t readahead_size;
+uint32_t lreq_pool_size;
 } cfg;
 
 #ifdef PCACHE_DEBUG
@@ -166,6 +172,7 @@ static QemuOptsList runtime_opts = {
 #define MB_BITS 20
 #define PCACHE_DEFAULT_CACHE_SIZE (4 << MB_BITS)
 #define PCACHE_DEFAULT_READAHEAD_SIZE (128 << KB_BITS)
+#define PCACHE_DEFAULT_POOL_STAT_SIZE (1 << MB_BITS)
 
 enum {
 NODE_SUCCESS_STATUS = 0,
@@ -181,6 +188,7 @@ enum {
 };
 
 #define PCNODE(_n) ((PCNode *)(_n))
+#define LRNODE(_n) ((LRNode *)(_n))
 
 static inline void pcache_node_unref(BDRVPCacheState *s, PCNode *node)
 {
@@ -262,6 +270,11 @@ static PCNode *pcache_node_search(struct RbRoot *root, 
RbNodeKey *key)
 return node == NULL ? NULL : pcache_node_ref(node);
 }
 
+static inline LRNode *lreq_node_search(struct RbRoot *root, RbNodeKey *key)
+{
+return node_search(root, key);
+}
+
 static void *node_insert(struct RbRoot *root, BlockNode *node)
 {
 struct RbNode **new = &(root->rb_node), *parent = NULL;
@@ -288,6 +301,11 @@ static inline PCNode *pcache_node_insert(struct RbRoot 
*root, PCNode *node)
 return pcache_node_ref(node_insert(root, >cm));
 }
 
+static inline LRNode *lreq_node_insert(struct RbRoot *root, LRNode *node)
+{
+return node_insert(root, >cm);
+}
+
 static inline void *pcache_node_alloc(RbNodeKey* key)
 {
 PCNode *node = g_slice_alloc(sizeof(*node));
@@ -364,6 +382,34 @@ static void pcache_try_shrink(BDRVPCacheState *s)
 }
 }
 
+static void lreq_try_shrink(BDRVPCacheState *s)
+{
+while (s->lreq.curr_size > s->cfg.lreq_pool_size) {
+LRNode *rmv_node;
+/* XXX: need to filter large requests */
+if (QTAILQ_EMPTY(>lreq.lru.list)) {
+DPRINTF("lru lreq list is empty, but curr_size: %d\n",
+s->lreq.curr_size);
+break;
+}
+
+qemu_co_mutex_lock(>lreq.lru.lock);
+rmv_node = LRNODE(QTAILQ_LAST(>lreq.lru.list, lru_head));
+qemu_co_mutex_unlock(>lreq.lru.lock);
+
+atomic_sub(>lreq.curr_size, rmv_node->cm.nb_sectors);
+
+qemu_co_mutex_lock(>lreq.lru.lock);
+QTAILQ_REMOVE(>lreq.lru.list, _node->cm, entry);
+qemu_co_mutex_unlock(>lreq.lru.lock);
+
+qemu_co_mutex_lock(>lreq.tree.lock);
+rb_erase(_node->cm.rb_node, >lreq.tree.root);
+qemu_co_mutex_unlock(>lreq.tree.lock);
+g_slice_free1(sizeof(*rmv_node), rmv_node);
+}
+}
+
 static PrefCachePartReq *pcache_req_get(PrefCacheAIOCB *acb, PCNode *node)
 {
 PrefCachePartReq *req = g_slice_alloc(sizeof(*req));
@@ -437,6 +483,34 @@ static inline PCNode *pcache_node_add(PrefCacheAIOCB *acb, 
RbNodeKey *key)
 return node;
 }
 
+static LRNode *lreq_node_add(PrefCacheAIOCB *acb, RbNodeKey *key)
+{
+BDRVPCacheState *s = acb->s;
+LRNode *new_node = g_slice_alloc(sizeof(*new_node));
+LRNode *found;
+
+new_node->cm.sector_num = key->num;
+new_node->cm.nb_sectors = key->size;
+
+qemu_co_mutex_lock(>lreq.tree.lock);
+found = lreq_node_insert(>lreq.tree.root, new_node);
+qemu_co_mutex_unlock(>lreq.tree.lock);
+if (found != new_node) {
+g_slice_free1(sizeof(*new_node), new_node);
+return NULL;
+}
+
+atomic_add(>lreq.curr_size, new_node->cm.nb_sectors);
+
+lreq_try_shrink(s);
+
+qemu_co_mutex_lock(>lreq.lru.lock);
+QTAILQ_INSERT_HEAD(>lreq.lru.list, _node->cm, entry);
+qemu_co_mutex_unlock(>lreq.lru.lock);
+
+return new_node;
+}
+
 static uint64_t ranges_overlap_size(uint64_t node1, uint32_t size1,
 uint64_t node2, uint32_t size2)
 {
@@ -552,13 +626,24 @@ enum {
 
 static int32_t pcache_prefetch(PrefCacheAIOCB *acb)
 {
+BDRVPCacheState *s = acb->s;
 RbNodeKey key;
-PCNode *node = NULL;
+PCNode *node;
 
 prefetch_init_key(acb, );
-if (pcache_node_find_and_create(acb, , )) {
+
+/* add request statistics */
+lreq_node_add(acb, );
+
+qemu_co_mutex_lock(>pcache.tree.lock); /* XXX: use get_next_node */
+node = 

[Qemu-block] [PATCH RFC v2 05/22] block/pcache: add aio requests into cache

2016-08-29 Thread Pavel Butsykin
Requests are stored in an rbtree; this adds the basic rbtree operations for
working with cache nodes.
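
A small standalone example of how the overlap-based key comparison added
below behaves (the numbers are made up): inserting a request that overlaps an
already cached range compares as "equal", so node_insert() hands back the
existing node instead of adding a duplicate.

#include <assert.h>
#include <stdint.h>

typedef struct RbNodeKey {
    uint64_t num;
    uint32_t size;
} RbNodeKey;

/* same comparison rule as pcache_key_cmp(): 0 means the ranges overlap */
static int key_cmp(const RbNodeKey *k1, const RbNodeKey *k2)
{
    if (k1->num >= k2->num + k2->size) {
        return 1;
    }
    if (k1->num + k1->size <= k2->num) {
        return -1;
    }
    return 0;
}

int main(void)
{
    RbNodeKey cached  = { .num = 104, .size = 8 };   /* sectors [104,112) */
    RbNodeKey request = { .num = 100, .size = 8 };   /* sectors [100,108) */

    assert(key_cmp(&request, &cached) == 0);   /* overlap -> reuse the node */
    assert(key_cmp(&request, &(RbNodeKey){ .num = 112, .size = 4 }) == -1);
    return 0;
}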

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 190 -
 1 file changed, 189 insertions(+), 1 deletion(-)

diff --git a/block/pcache.c b/block/pcache.c
index 7f221d6..f5022f9 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -27,6 +27,7 @@
 #include "block/raw-aio.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qstring.h"
+#include "qemu/rbtree.h"
 
 #define PCACHE_DEBUG
 
@@ -37,9 +38,53 @@
 #define DPRINTF(fmt, ...) do { } while (0)
 #endif
 
+typedef struct RbNodeKey {
+uint64_tnum;
+uint32_tsize;
+} RbNodeKey;
+
+typedef struct BlockNode {
+struct RbNode   rb_node;
+union {
+RbNodeKey   key;
+struct {
+uint64_tsector_num;
+uint32_tnb_sectors;
+};
+};
+QTAILQ_ENTRY(BlockNode) entry;
+} BlockNode;
+
+typedef struct PCNode {
+BlockNode cm;
+
+uint8_t  *data;
+} PCNode;
+
+typedef struct ReqStor {
+struct {
+struct RbRoot root;
+CoMutex   lock;
+} tree;
+
+uint32_t curr_size;
+} ReqStor;
+
+typedef struct BDRVPCacheState {
+BlockDriverState **bs;
+
+ReqStor pcache;
+
+struct {
+QTAILQ_HEAD(pcache_head, BlockNode) head;
+CoMutex lock;
+} list;
+} BDRVPCacheState;
+
 typedef struct PrefCacheAIOCB {
 BlockAIOCB common;
 
+BDRVPCacheState *s;
 QEMUIOVector *qiov;
 uint64_t sector_num;
 uint32_t nb_sectors;
@@ -64,6 +109,124 @@ static QemuOptsList runtime_opts = {
 },
 };
 
+#define PCNODE(_n) ((PCNode *)(_n))
+
+static int pcache_key_cmp(const RbNodeKey *key1, const RbNodeKey *key2)
+{
+assert(key1 != NULL);
+assert(key2 != NULL);
+
+if (key1->num >= key2->num + key2->size) {
+return 1;
+}
+if (key1->num + key1->size <= key2->num) {
+return -1;
+}
+
+return 0;
+}
+
+static void *node_insert(struct RbRoot *root, BlockNode *node)
+{
+struct RbNode **new = &(root->rb_node), *parent = NULL;
+
+/* Figure out where to put new node */
+while (*new) {
+BlockNode *this = container_of(*new, BlockNode, rb_node);
+int result = pcache_key_cmp(&node->key, &this->key);
+if (result == 0) {
+return this;
+}
+parent = *new;
+new = result < 0 ? &((*new)->rb_left) : &((*new)->rb_right);
+}
+/* Add new node and rebalance tree. */
+rb_link_node(&node->rb_node, parent, new);
+rb_insert_color(&node->rb_node, root);
+
+return node;
+}
+
+static inline PCNode *pcache_node_insert(struct RbRoot *root, PCNode *node)
+{
+return node_insert(root, >cm);
+}
+
+static inline void pcache_node_free(PCNode *node)
+{
+g_free(node->data);
+g_slice_free1(sizeof(*node), node);
+}
+
+static inline void *pcache_node_alloc(RbNodeKey* key)
+{
+PCNode *node = g_slice_alloc(sizeof(*node));
+
+node->cm.sector_num = key->num;
+node->cm.nb_sectors = key->size;
+node->data = g_malloc(node->cm.nb_sectors << BDRV_SECTOR_BITS);
+
+return node;
+}
+
+static bool pcache_node_find_and_create(PrefCacheAIOCB *acb, RbNodeKey *key,
+PCNode **out_node)
+{
+BDRVPCacheState *s = acb->s;
+PCNode *new_node = pcache_node_alloc(key);
+PCNode *found;
+
+qemu_co_mutex_lock(&s->pcache.tree.lock);
+found = pcache_node_insert(&s->pcache.tree.root, new_node);
+qemu_co_mutex_unlock(&s->pcache.tree.lock);
+if (found != new_node) {
+pcache_node_free(new_node);
+*out_node = found;
+return false;
+}
+atomic_add(&s->pcache.curr_size, new_node->cm.nb_sectors);
+
+qemu_co_mutex_lock(&s->list.lock);
+QTAILQ_INSERT_HEAD(&s->list.head, &new_node->cm, entry);
+qemu_co_mutex_unlock(&s->list.lock);
+
+*out_node = new_node;
+return true;
+}
+
+static inline void prefetch_init_key(PrefCacheAIOCB *acb, RbNodeKey* key)
+{
+key->num = acb->sector_num;
+key->size = acb->nb_sectors;
+}
+
+enum {
+PREFETCH_NEW_NODE  = 0,
+PREFETCH_FULL_UP   = 1,
+PREFETCH_PART_UP   = 2
+};
+
+static int32_t pcache_prefetch(PrefCacheAIOCB *acb)
+{
+RbNodeKey key;
+PCNode *node = NULL;
+
+prefetch_init_key(acb, &key);
+if (pcache_node_find_and_create(acb, &key, &node)) {
+return PREFETCH_NEW_NODE;
+}
+
+/* Node covers the whole request */
+if (node->cm.sector_num <= acb->sector_num &&
+node->cm.sector_num + node->cm.nb_sectors >= acb->sector_num +
+ acb->nb_sectors)
+{
+return PREFETCH_FULL_UP;
+}
+
+return PREFETCH_PART_UP;
+}
+
 static void pcache_aio_cb(void *opaque, int ret)
 {
 PrefCacheAIOCB *acb = opaque;
@@ -80,6 +243,7 @@ static PrefCacheAIOCB *pcache_aio_get(BlockDriverState *bs, 
int64_t sector_num,
 {
 PrefCacheAIOCB *acb 

[Qemu-block] [PATCH RFC v2 10/22] block/pcache: add check node leak

2016-08-29 Thread Pavel Butsykin
If the pcache has a bug in its node reference counting, then
s->death_node_list can help detect it.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 45 +++--
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index 6114289..a8a57e3 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -87,6 +87,8 @@ typedef struct BDRVPCacheState {
 
 #ifdef PCACHE_DEBUG
 uint64_t shrink_cnt_node;
+QTAILQ_HEAD(death_node_head, BlockNode) death_node_list;
+CoMutex death_node_lock;
 #endif
 } BDRVPCacheState;
 
@@ -152,7 +154,7 @@ enum {
 
 #define PCNODE(_n) ((PCNode *)(_n))
 
-static inline void pcache_node_unref(PCNode *node)
+static inline void pcache_node_unref(BDRVPCacheState *s, PCNode *node)
 {
 assert(node->status == NODE_SUCCESS_STATUS ||
node->status == NODE_REMOVE_STATUS);
@@ -161,6 +163,12 @@ static inline void pcache_node_unref(PCNode *node)
 assert(node->status == NODE_REMOVE_STATUS);
 
 node->status = NODE_GHOST_STATUS;
+
+#ifdef PCACHE_DEBUG
+qemu_co_mutex_lock(>death_node_lock);
+QTAILQ_REMOVE(>death_node_list, >cm, entry);
+qemu_co_mutex_unlock(>death_node_lock);
+#endif
 g_free(node->data);
 g_slice_free1(sizeof(*node), node);
 }
@@ -263,11 +271,17 @@ static void pcache_node_drop(BDRVPCacheState *s, PCNode 
*node)
 QTAILQ_REMOVE(>pcache.lru.list, >cm, entry);
 qemu_co_mutex_unlock(>pcache.lru.lock);
 
+#ifdef PCACHE_DEBUG
+qemu_co_mutex_lock(>death_node_lock);
+QTAILQ_INSERT_HEAD(>death_node_list, >cm, entry);
+qemu_co_mutex_unlock(>death_node_lock);
+#endif
+
 qemu_co_mutex_lock(>pcache.tree.lock);
 rb_erase(>cm.rb_node, >pcache.tree.root);
 qemu_co_mutex_unlock(>pcache.tree.lock);
 
-pcache_node_unref(node);
+pcache_node_unref(s, node);
 }
 
 static void pcache_try_shrink(BDRVPCacheState *s)
@@ -367,7 +381,7 @@ static void pcache_pickup_parts_of_cache(PrefCacheAIOCB 
*acb, PCNode *node,
 up_size = lc_key.size;
 
 if (!pcache_node_find_and_create(acb, _key, _node)) {
-pcache_node_unref(node);
+pcache_node_unref(acb->s, node);
 node = new_node;
 continue;
 }
@@ -377,7 +391,7 @@ static void pcache_pickup_parts_of_cache(PrefCacheAIOCB 
*acb, PCNode *node,
 /* XXX: node read */
 up_size = MIN(node->cm.sector_num + node->cm.nb_sectors - num, size);
 
-pcache_node_unref(node);
+pcache_node_unref(acb->s, node);
 
 size -= up_size;
 num += up_size;
@@ -416,7 +430,7 @@ static int32_t pcache_prefetch(PrefCacheAIOCB *acb)
  acb->nb_sectors)
 {
 /* XXX: node read */
-pcache_node_unref(node);
+pcache_node_unref(acb->s, node);
 return PREFETCH_FULL_UP;
 }
 pcache_pickup_parts_of_cache(acb, node, key.num, key.size);
@@ -459,7 +473,7 @@ static void pcache_merge_requests(PrefCacheAIOCB *acb)
 
 /* XXX: pcache read */
 
-pcache_node_unref(req->node);
+pcache_node_unref(acb->s, req->node);
 
 g_slice_free1(sizeof(*req), req);
 }
@@ -544,6 +558,11 @@ static void pcache_state_init(QemuOpts *opts, 
BDRVPCacheState *s)
 s->pcache.curr_size = 0;
 
 s->cfg_cache_size = cache_size >> BDRV_SECTOR_BITS;
+
+#ifdef PCACHE_DEBUG
+QTAILQ_INIT(>death_node_list);
+qemu_co_mutex_init(>death_node_lock);
+#endif
 }
 
 static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags,
@@ -597,6 +616,20 @@ static void pcache_close(BlockDriverState *bs)
 cnt++;
 }
 DPRINTF("used %d nodes\n", cnt);
+
+#ifdef PCACHE_DEBUG
+if (!QTAILQ_EMPTY(>death_node_list)) {
+cnt = 0;
+DPRINTF("warning: death node list contains of node\n");
+QTAILQ_FOREACH_SAFE(node, >death_node_list, entry, next) {
+QTAILQ_REMOVE(>death_node_list, node, entry);
+g_free(PCNODE(node)->data);
+g_slice_free1(sizeof(*node), node);
+cnt++;
+}
+DPRINTF("death nodes: %d", cnt);
+}
+#endif
 }
 
 static void pcache_parse_filename(const char *filename, QDict *options,
-- 
2.8.3




[Qemu-block] [PATCH RFC v2 02/22] block/pcache: add own AIOCB block

2016-08-29 Thread Pavel Butsykin
Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 43 +++
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index 770bbc0..74a4bc4 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -24,12 +24,22 @@
 
 #include "qemu/osdep.h"
 #include "block/block_int.h"
+#include "block/raw-aio.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qstring.h"
 
+typedef struct PrefCacheAIOCB {
+BlockAIOCB common;
+
+QEMUIOVector *qiov;
+uint64_t sector_num;
+uint32_t nb_sectors;
+int  aio_type;
+int  ret;
+} PrefCacheAIOCB;
 
 static const AIOCBInfo pcache_aiocb_info = {
-.aiocb_size = sizeof(BlockAIOCB),
+.aiocb_size = sizeof(PrefCacheAIOCB),
 };
 
 static QemuOptsList runtime_opts = {
@@ -47,14 +57,29 @@ static QemuOptsList runtime_opts = {
 
 static void pcache_aio_cb(void *opaque, int ret)
 {
+PrefCacheAIOCB *acb = opaque;
 
-BlockAIOCB *acb = opaque;
-
-acb->cb(acb->opaque, ret);
+acb->common.cb(acb->common.opaque, ret);
 
 qemu_aio_unref(acb);
 }
 
+static PrefCacheAIOCB *pcache_aio_get(BlockDriverState *bs, int64_t sector_num,
+  QEMUIOVector *qiov, int nb_sectors,
+  BlockCompletionFunc *cb, void *opaque,
+  int type)
+{
+PrefCacheAIOCB *acb = qemu_aio_get(&pcache_aiocb_info, bs, cb, opaque);
+
+acb->sector_num = sector_num;
+acb->nb_sectors = nb_sectors;
+acb->qiov = qiov;
+acb->aio_type = type;
+acb->ret = 0;
+
+return acb;
+}
+
 static BlockAIOCB *pcache_aio_readv(BlockDriverState *bs,
 int64_t sector_num,
 QEMUIOVector *qiov,
@@ -62,11 +87,12 @@ static BlockAIOCB *pcache_aio_readv(BlockDriverState *bs,
 BlockCompletionFunc *cb,
 void *opaque)
 {
-BlockAIOCB *acb = qemu_aio_get(_aiocb_info, bs, cb, opaque);
+PrefCacheAIOCB *acb = pcache_aio_get(bs, sector_num, qiov, nb_sectors, cb,
+ opaque, QEMU_AIO_READ);
 
 bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors,
pcache_aio_cb, acb);
-return acb;
+return >common;
 }
 
 static BlockAIOCB *pcache_aio_writev(BlockDriverState *bs,
@@ -76,11 +102,12 @@ static BlockAIOCB *pcache_aio_writev(BlockDriverState *bs,
  BlockCompletionFunc *cb,
  void *opaque)
 {
-BlockAIOCB *acb = qemu_aio_get(_aiocb_info, bs, cb, opaque);
+PrefCacheAIOCB *acb = pcache_aio_get(bs, sector_num, qiov, nb_sectors, cb,
+ opaque, QEMU_AIO_WRITE);
 
 bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors,
 pcache_aio_cb, acb);
-return acb;
+return >common;
 }
 
 static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags,
-- 
2.8.3




[Qemu-block] [PATCH RFC v2 22/22] block/pcache: drop used pcache node

2016-08-29 Thread Pavel Butsykin
The pcache is aimed at particular situations, namely sequential reads.
This allows us to drop parts of the cache that have already been used, which
reduces the cache size and the number of evicted nodes.
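
A tiny standalone illustration of the read-counter idea used below: a node is
dropped from the cache once all of its sectors have been read.

#include <assert.h>
#include <stdint.h>

typedef struct {
    uint32_t nb_sectors;
    uint32_t rdcnt;
} Node;

/* returns non-zero once the node has been fully consumed and can be dropped */
static int node_fully_read(Node *n, uint32_t sectors_just_read)
{
    n->rdcnt += sectors_just_read;
    return n->rdcnt >= n->nb_sectors;
}

int main(void)
{
    Node n = { .nb_sectors = 256, .rdcnt = 0 };

    assert(!node_fully_read(&n, 128));   /* half read: keep it cached */
    assert(node_fully_read(&n, 128));    /* fully read: drop it */
    return 0;
}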

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 24 +++-
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index 2b2edf5..3fad4ca 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -88,6 +88,7 @@ typedef struct PCNode {
 uint32_t status;
 uint32_t ref;
 uint8_t  *data;
+uint32_t rdcnt;
 CoMutex  lock;
 } PCNode;
 
@@ -342,6 +343,7 @@ static inline void *pcache_node_alloc(RbNodeKey* key)
 node->cm.nb_sectors = key->size;
 node->ref = 0;
 node->status = NODE_WAIT_STATUS;
+node->rdcnt = 0;
 qemu_co_mutex_init(>lock);
 node->data = g_malloc(node->cm.nb_sectors << BDRV_SECTOR_BITS);
 node->wait.cnt = 0;
@@ -560,11 +562,23 @@ static void pcache_node_rw_buf(PrefCacheAIOCB *acb, 
PCNode* node, uint32_t type)
node->cm.sector_num, node->cm.nb_sectors)
<< BDRV_SECTOR_BITS;
 
-qemu_co_mutex_lock(>lock); /* XXX: use rw lock */
-copy = type & NODE_READ_BUF ?
-qemu_iovec_from_buf(acb->qiov, qiov_offs, node->data + node_offs, size)
-: qemu_iovec_to_buf(acb->qiov, qiov_offs, node->data + node_offs, 
size);
-qemu_co_mutex_unlock(>lock);
+if (type & NODE_READ_BUF) {
+qemu_co_mutex_lock(>lock); /* XXX: use rw lock */
+copy = qemu_iovec_from_buf(acb->qiov, qiov_offs,
+   node->data + node_offs, size);
+qemu_co_mutex_unlock(>lock);
+
+/* pcache node is no longer needed, when it was all read */
+atomic_add(>rdcnt, size >> BDRV_SECTOR_BITS);
+if (node->rdcnt >= node->cm.nb_sectors) {
+pcache_node_drop(acb->s, node);
+}
+} else {
+qemu_co_mutex_lock(>lock); /* XXX: use rw lock */
+copy = qemu_iovec_to_buf(acb->qiov, qiov_offs,
+ node->data + node_offs, size);
+qemu_co_mutex_unlock(>lock);
+}
 assert(copy == size);
 }
 
-- 
2.8.3




[Qemu-block] [PATCH RFC v2 20/22] block/pcache: implement pcache error handling of aio cb

2016-08-29 Thread Pavel Butsykin
Add error handling for AIO requests to the pcache driver. If a request fails,
all requests pending on it fail as well.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 33 ++---
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index 7b4a9a9..c5fe689 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -683,7 +683,7 @@ static void pcache_aio_bh(void *opaque)
 {
 PrefCacheAIOCB *acb = opaque;
 qemu_bh_delete(acb->bh);
-acb->common.cb(acb->common.opaque, 0);
+acb->common.cb(acb->common.opaque, acb->ret);
 qemu_aio_unref(acb);
 }
 
@@ -696,7 +696,8 @@ static void complete_aio_request(PrefCacheAIOCB *acb)
 }
 }
 
-static void pcache_complete_acb_wait_queue(BDRVPCacheState *s, PCNode *node)
+static void pcache_complete_acb_wait_queue(BDRVPCacheState *s, PCNode *node,
+   int ret)
 {
 ACBEntryLink *link, *next;
 
@@ -710,7 +711,11 @@ static void pcache_complete_acb_wait_queue(BDRVPCacheState 
*s, PCNode *node)
 QTAILQ_REMOVE(>wait.list, link, entry);
 g_slice_free1(sizeof(*link), link);
 
-pcache_node_read_buf(wait_acb, node);
+if (ret == 0) {
+pcache_node_read_buf(wait_acb, node);
+} else {  /* write only fail, because next request can rewrite error */
+wait_acb->ret = ret;
+}
 
 NODE_ASSERT(node->ref != 0, node);
 pcache_node_unref(s, node);
@@ -753,16 +758,17 @@ static void pcache_merge_requests(PrefCacheAIOCB *acb)
 assert(req != NULL);
 NODE_ASSERT(node->status == NODE_WAIT_STATUS, node);
 
-pcache_node_submit(req);
-
-if (!(acb->aio_type & PCACHE_AIO_READAHEAD)) {
-pcache_node_read_buf(acb, node);
+if (acb->ret == 0) {
+pcache_node_submit(req);
+if (!(acb->aio_type & PCACHE_AIO_READAHEAD)) {
+pcache_node_read_buf(acb, node);
+}
+} else {
+pcache_node_drop(acb->s, node);
 }
+pcache_complete_acb_wait_queue(acb->s, node, acb->ret);
 
-pcache_complete_acb_wait_queue(acb->s, node);
-
-pcache_node_unref(acb->s, req->node);
-
+pcache_node_unref(acb->s, node);
 g_slice_free1(sizeof(*req), req);
 }
 qemu_co_mutex_unlock(>requests.lock);
@@ -799,6 +805,11 @@ static void pcache_aio_cb(void *opaque, int ret)
 {
 PrefCacheAIOCB *acb = opaque;
 
+if (ret != 0) {
+acb->ret = ret;
+DPRINTF("pcache aio_cb(num: %jd nb: %d) err: %d",
+acb->sector_num, acb->nb_sectors, ret);
+}
 if (acb->aio_type & PCACHE_AIO_READ) {
 if (atomic_dec_fetch(>requests.cnt) > 0) {
 return;
-- 
2.8.3




[Qemu-block] [PATCH RFC v2 07/22] block/pcache: introduce LRU as method of memory

2016-08-29 Thread Pavel Butsykin
This is a simple solution to the problem of evicting cache memory. The LRU is
useful to avoid evicting nodes that have only been partially read.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 74 +-
 1 file changed, 58 insertions(+), 16 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index 54d4526..7504db8 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -67,6 +67,11 @@ typedef struct ReqStor {
 CoMutex   lock;
 } tree;
 
+struct {
+QTAILQ_HEAD(lru_head, BlockNode) list;
+CoMutex lock;
+} lru;
+
 uint32_t curr_size;
 } ReqStor;
 
@@ -75,12 +80,11 @@ typedef struct BDRVPCacheState {
 
 ReqStor pcache;
 
-struct {
-QTAILQ_HEAD(pcache_head, BlockNode) head;
-CoMutex lock;
-} list;
-
 uint32_t cfg_cache_size;
+
+#ifdef PCACHE_DEBUG
+uint64_t shrink_cnt_node;
+#endif
 } BDRVPCacheState;
 
 typedef struct PrefCacheAIOCB {
@@ -182,6 +186,44 @@ static inline void *pcache_node_alloc(RbNodeKey* key)
 return node;
 }
 
+static void pcache_node_drop(BDRVPCacheState *s, PCNode *node)
+{
+atomic_sub(&s->pcache.curr_size, node->cm.nb_sectors);
+
+qemu_co_mutex_lock(&s->pcache.lru.lock);
+QTAILQ_REMOVE(&s->pcache.lru.list, &node->cm, entry);
+qemu_co_mutex_unlock(&s->pcache.lru.lock);
+
+qemu_co_mutex_lock(&s->pcache.tree.lock);
+rb_erase(&node->cm.rb_node, &s->pcache.tree.root);
+qemu_co_mutex_unlock(&s->pcache.tree.lock);
+
+pcache_node_free(node);
+}
+
+static void pcache_try_shrink(BDRVPCacheState *s)
+{
+while (s->pcache.curr_size > s->cfg_cache_size) {
+qemu_co_mutex_lock(>pcache.lru.lock);
+assert(!QTAILQ_EMPTY(>pcache.lru.list));
+PCNode *rmv_node = PCNODE(QTAILQ_LAST(>pcache.lru.list, lru_head));
+qemu_co_mutex_unlock(>pcache.lru.lock);
+
+pcache_node_drop(s, rmv_node);
+#ifdef PCACHE_DEBUG
+atomic_inc(>shrink_cnt_node);
+#endif
+}
+}
+
+static inline void pcache_lru_node_up(BDRVPCacheState *s, PCNode *node)
+{
+qemu_co_mutex_lock(&s->pcache.lru.lock);
+QTAILQ_REMOVE(&s->pcache.lru.list, &node->cm, entry);
+QTAILQ_INSERT_HEAD(&s->pcache.lru.list, &node->cm, entry);
+qemu_co_mutex_unlock(&s->pcache.lru.lock);
+}
+
 static bool pcache_node_find_and_create(PrefCacheAIOCB *acb, RbNodeKey *key,
 PCNode **out_node)
 {
@@ -194,14 +236,17 @@ static bool pcache_node_find_and_create(PrefCacheAIOCB 
*acb, RbNodeKey *key,
 qemu_co_mutex_unlock(>pcache.tree.lock);
 if (found != new_node) {
 pcache_node_free(new_node);
+pcache_lru_node_up(s, found);
 *out_node = found;
 return false;
 }
 atomic_add(>pcache.curr_size, new_node->cm.nb_sectors);
 
-qemu_co_mutex_lock(>list.lock);
-QTAILQ_INSERT_HEAD(>list.head, _node->cm, entry);
-qemu_co_mutex_unlock(>list.lock);
+qemu_co_mutex_lock(>pcache.lru.lock);
+QTAILQ_INSERT_HEAD(>pcache.lru.list, _node->cm, entry);
+qemu_co_mutex_unlock(>pcache.lru.lock);
+
+pcache_try_shrink(s);
 
 *out_node = new_node;
 return true;
@@ -275,10 +320,7 @@ static BlockAIOCB *pcache_aio_readv(BlockDriverState *bs,
 {
 PrefCacheAIOCB *acb = pcache_aio_get(bs, sector_num, qiov, nb_sectors, cb,
  opaque, QEMU_AIO_READ);
-
-if (acb->s->pcache.curr_size < acb->s->cfg_cache_size) {
-pcache_prefetch(acb);
-}
+pcache_prefetch(acb);
 
 bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors,
pcache_aio_cb, acb);
@@ -309,8 +351,8 @@ static void pcache_state_init(QemuOpts *opts, 
BDRVPCacheState *s)
 
 s->pcache.tree.root = RB_ROOT;
 qemu_co_mutex_init(>pcache.tree.lock);
-QTAILQ_INIT(>list.head);
-qemu_co_mutex_init(>list.lock);
+QTAILQ_INIT(>pcache.lru.list);
+qemu_co_mutex_init(>pcache.lru.lock);
 s->pcache.curr_size = 0;
 
 s->cfg_cache_size = cache_size >> BDRV_SECTOR_BITS;
@@ -350,8 +392,8 @@ static void pcache_close(BlockDriverState *bs)
 uint32_t cnt = 0;
 BDRVPCacheState *s = bs->opaque;
 BlockNode *node, *next;
-QTAILQ_FOREACH_SAFE(node, >list.head, entry, next) {
-QTAILQ_REMOVE(>list.head, node, entry);
+QTAILQ_FOREACH_SAFE(node, >pcache.lru.list, entry, next) {
+QTAILQ_REMOVE(>pcache.lru.list, node, entry);
 pcache_node_free(PCNODE(node));
 cnt++;
 }
-- 
2.8.3




[Qemu-block] [PATCH RFC v2 18/22] block/pcache: add pcache skip large aio read

2016-08-29 Thread Pavel Butsykin
This change allows more efficient use of cache memory and filters out the
cases for which pcache isn't efficient. We pass through requests that do not
benefit from the optimization, thereby reducing the number of unnecessary
readaheads.

Add the pcache-max-aio-size open parameter.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 49 -
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index 7a317fc..287156a 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -100,6 +100,7 @@ typedef struct BDRVPCacheState {
 struct {
 uint32_t cache_size;
 uint32_t readahead_size;
+uint32_t max_aio_size;
 uint32_t lreq_pool_size;
 } cfg;
 
@@ -144,6 +145,7 @@ static const AIOCBInfo pcache_aiocb_info = {
 
 #define PCACHE_OPT_CACHE_SIZE "pcache-full-size"
 #define PCACHE_OPT_READAHEAD_SIZE "pcache-readahead-size"
+#define PCACHE_OPT_MAX_AIO_SIZE "pcache-max-aio-size"
 
 static QemuOptsList runtime_opts = {
 .name = "pcache",
@@ -164,6 +166,11 @@ static QemuOptsList runtime_opts = {
 .type = QEMU_OPT_SIZE,
 .help = "Prefetch cache readahead size",
 },
+{
+.name = PCACHE_OPT_MAX_AIO_SIZE,
+.type = QEMU_OPT_SIZE,
+.help = "Maximum size of aio which is handled by pcache",
+},
 { /* end of list */ }
 },
 };
@@ -173,6 +180,7 @@ static QemuOptsList runtime_opts = {
 #define PCACHE_DEFAULT_CACHE_SIZE (4 << MB_BITS)
 #define PCACHE_DEFAULT_READAHEAD_SIZE (128 << KB_BITS)
 #define PCACHE_DEFAULT_POOL_STAT_SIZE (1 << MB_BITS)
+#define PCACHE_DEFAULT_MAX_AIO_SIZE (32 << KB_BITS)
 
 enum {
 NODE_SUCCESS_STATUS = 0,
@@ -386,12 +394,7 @@ static void lreq_try_shrink(BDRVPCacheState *s)
 {
 while (s->lreq.curr_size > s->cfg.lreq_pool_size) {
 LRNode *rmv_node;
-/* XXX: need to filter large requests */
-if (QTAILQ_EMPTY(>lreq.lru.list)) {
-DPRINTF("lru lreq list is empty, but curr_size: %d\n",
-s->lreq.curr_size);
-break;
-}
+assert(!QTAILQ_EMPTY(>lreq.lru.list));
 
 qemu_co_mutex_lock(>lreq.lru.lock);
 rmv_node = LRNODE(QTAILQ_LAST(>lreq.lru.list, lru_head));
@@ -943,6 +946,23 @@ static void pcache_readahead_request(BlockDriverState *bs, 
PrefCacheAIOCB *acb)
 pcache_send_acb_request_list(bs, acb_readahead);
 }
 
+static inline bool pcache_skip_aio_read(BlockDriverState *bs,
+uint64_t sector_num,
+uint32_t nb_sectors)
+{
+BDRVPCacheState *s = bs->opaque;
+
+if (nb_sectors > s->cfg.max_aio_size) {
+return true;
+}
+
+if (bdrv_nb_sectors(bs) < sector_num + nb_sectors) {
+return true;
+}
+
+return false;
+}
+
 static BlockAIOCB *pcache_aio_readv(BlockDriverState *bs,
 int64_t sector_num,
 QEMUIOVector *qiov,
@@ -950,9 +970,16 @@ static BlockAIOCB *pcache_aio_readv(BlockDriverState *bs,
 BlockCompletionFunc *cb,
 void *opaque)
 {
-PrefCacheAIOCB *acb = pcache_aio_get(bs, sector_num, qiov, nb_sectors, cb,
- opaque, PCACHE_AIO_READ);
-int32_t status = pcache_prefetch(acb);
+PrefCacheAIOCB *acb;
+int32_t status;
+
+if (pcache_skip_aio_read(bs, sector_num, nb_sectors)) {
+return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors,
+  cb, opaque);
+}
+acb = pcache_aio_get(bs, sector_num, qiov, nb_sectors, cb,
+ opaque, PCACHE_AIO_READ);
+status = pcache_prefetch(acb);
 if (status == PREFETCH_NEW_NODE) {
 BlockAIOCB *ret = bdrv_aio_readv(bs->file, sector_num, qiov, 
nb_sectors,
  cb, opaque);
@@ -993,9 +1020,12 @@ static void pcache_state_init(QemuOpts *opts, 
BDRVPCacheState *s)
 PCACHE_DEFAULT_CACHE_SIZE);
 uint64_t readahead_size = qemu_opt_get_size(opts, 
PCACHE_OPT_READAHEAD_SIZE,
 PCACHE_DEFAULT_READAHEAD_SIZE);
+uint64_t max_aio_size = qemu_opt_get_size(opts, PCACHE_OPT_MAX_AIO_SIZE,
+  PCACHE_DEFAULT_MAX_AIO_SIZE);
 DPRINTF("pcache configure:\n");
 DPRINTF("pcache-full-size = %jd\n", cache_size);
 DPRINTF("readahead_size = %jd\n", readahead_size);
+DPRINTF("max_aio_size = %jd\n", max_aio_size);
 
 s->pcache.tree.root = RB_ROOT;
 qemu_co_mutex_init(>pcache.tree.lock);
@@ -1012,6 +1042,7 @@ static void pcache_state_init(QemuOpts *opts, 
BDRVPCacheState *s)
 s->cfg.cache_size = cache_size >> BDRV_SECTOR_BITS;
 s->cfg.readahead_size = readahead_size >> 

[Qemu-block] [PATCH RFC v2 08/22] block/pcache: implement pickup parts of the cache

2016-08-29 Thread Pavel Butsykin
Implement obtaining the fragments of the cache that belong to one request
area. This allows handling the case when a request partially hits the
cache.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 60 +-
 1 file changed, 59 insertions(+), 1 deletion(-)
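
As an illustration of the idea, here is a standalone sketch (made-up names,
not code from this patch) of how a request that partially hits the cache is
split into the piece in front of the cached node, the piece covered by it,
and the piece behind it:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint64_t num;   /* first sector */
    uint32_t size;  /* number of sectors */
} Range;

/* Split one request range around a single overlapping cached node: the
 * pieces in front of and behind the node are cache misses, the middle
 * piece is covered by the node. */
static void split_around_node(Range req, Range node)
{
    if (req.num < node.num) {
        uint32_t head = node.num - req.num;
        printf("miss, new node: [%" PRIu64 ", +%" PRIu32 ")\n", req.num, head);
        req.num += head;
        req.size -= head;
    }

    uint64_t node_end = node.num + node.size;
    uint32_t hit = req.size;
    if (node_end - req.num < hit) {
        hit = node_end - req.num;
    }
    printf("hit, cached:    [%" PRIu64 ", +%" PRIu32 ")\n", req.num, hit);
    req.num += hit;
    req.size -= hit;

    if (req.size) {
        printf("miss, new node: [%" PRIu64 ", +%" PRIu32 ")\n", req.num, req.size);
    }
}

int main(void)
{
    /* request [100, 164) against a cached node [120, 136) */
    split_around_node((Range){ .num = 100, .size = 64 },
                      (Range){ .num = 120, .size = 16 });
    return 0;
}

In the patch below the uncovered pieces become new cache nodes (created via
pcache_node_find_and_create()), while the covered piece is read from the
existing node.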

diff --git a/block/pcache.c b/block/pcache.c
index 7504db8..28bd056 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -143,6 +143,24 @@ static int pcache_key_cmp(const RbNodeKey *key1, const RbNodeKey *key2)
 return 0;
 }
 
+static BlockNode *pcache_node_prev(BlockNode* node, RbNodeKey *key)
+{
+while (node) {
+struct RbNode *prev_rb_node = rb_prev(&node->rb_node);
+BlockNode *prev_node;
+if (prev_rb_node == NULL) {
+break;
+}
+prev_node = container_of(prev_rb_node, BlockNode, rb_node);
+if (prev_node->sector_num + prev_node->nb_sectors <= key->num) {
+break;
+}
+node = prev_node;
+}
+
+return node;
+}
+
 static void *node_insert(struct RbRoot *root, BlockNode *node)
 {
 struct RbNode **new = &(root->rb_node), *parent = NULL;
@@ -152,7 +170,7 @@ static void *node_insert(struct RbRoot *root, BlockNode *node)
 BlockNode *this = container_of(*new, BlockNode, rb_node);
 int result = pcache_key_cmp(&node->key, &this->key);
 if (result == 0) {
-return this;
+return pcache_node_prev(this, &node->key);
 }
 parent = *new;
 new = result < 0 ? &((*new)->rb_left) : &((*new)->rb_right);
@@ -258,6 +276,45 @@ static inline void prefetch_init_key(PrefCacheAIOCB *acb, RbNodeKey* key)
 key->size = acb->nb_sectors;
 }
 
+static void pcache_pickup_parts_of_cache(PrefCacheAIOCB *acb, PCNode *node,
+ uint64_t num, uint32_t size)
+{
+uint32_t up_size;
+
+do {
+if (num < node->cm.sector_num) {
+PCNode *new_node;
+RbNodeKey lc_key = {
+.num = num,
+.size = node->cm.sector_num - num,
+};
+up_size = lc_key.size;
+
+if (!pcache_node_find_and_create(acb, &lc_key, &new_node)) {
+node = new_node;
+continue;
+}
+size -= up_size;
+num += up_size;
+}
+/* XXX: node read */
+up_size = MIN(node->cm.sector_num + node->cm.nb_sectors - num, size);
+
+size -= up_size;
+num += up_size;
+if (size != 0) {
+RbNodeKey lc_key = {
+.num = num,
+.size = size,
+};
+if (pcache_node_find_and_create(acb, &lc_key, &node)) {
+size -= lc_key.size;
+assert(size == 0);
+}
+}
+} while (size);
+}
+
 enum {
 PREFETCH_NEW_NODE  = 0,
 PREFETCH_FULL_UP   = 1,
@@ -281,6 +338,7 @@ static int32_t pcache_prefetch(PrefCacheAIOCB *acb)
 {
 return PREFETCH_FULL_UP;
 }
+pcache_pickup_parts_of_cache(acb, node, key.num, key.size);
 
 return PREFETCH_PART_UP;
 }
-- 
2.8.3




[Qemu-block] [PATCH RFC v2 16/22] block/pcache: pcache readahead node around

2016-08-29 Thread Pavel Butsykin
If the next block is cached, then we need to check the size of the node to
ensure a full readahead.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 51 +--
 1 file changed, 49 insertions(+), 2 deletions(-)
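
As an illustration (a standalone sketch with made-up names, not the patch
code): when the readahead window overlaps an already cached node, only the
gap in front of the node is prefetched, and a fresh readahead window is
placed right behind the node unless it would run past the end of the image:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint64_t num;
    uint64_t size;
} Win;

/* Readahead around an already cached node. */
static void readahead_around_node(Win want, Win node,
                                  uint64_t readahead_size,
                                  uint64_t total_sectors)
{
    if (node.num > want.num) {
        /* gap in front of the cached node */
        printf("prefetch head: [%" PRIu64 ", +%" PRIu64 ")\n",
               want.num, node.num - want.num);
    }

    if (node.num + node.size < want.num + want.size) {
        /* the node ends inside the window: start a new readahead after it */
        uint64_t next_num = node.num + node.size;
        if (next_num + readahead_size < total_sectors) {
            printf("prefetch tail: [%" PRIu64 ", +%" PRIu64 ")\n",
                   next_num, readahead_size);
        }
    }
}

int main(void)
{
    /* window [1000, 1256) overlapping a cached node [1100, 1164) */
    readahead_around_node((Win){ .num = 1000, .size = 256 },
                          (Win){ .num = 1100, .size = 64 },
                          256, 1 << 20);
    return 0;
}

This mirrors what readahead_node_prev()/readahead_node_next() below do with
real cache nodes.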

diff --git a/block/pcache.c b/block/pcache.c
index 90b3f85..ae7ac8d 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -427,6 +427,16 @@ static bool pcache_node_find_and_create(PrefCacheAIOCB *acb, RbNodeKey *key,
 return true;
 }
 
+static inline PCNode *pcache_node_add(PrefCacheAIOCB *acb, RbNodeKey *key)
+{
+PCNode *node = NULL;
+if (!pcache_node_find_and_create(acb, key, &node)) {
+pcache_node_unref(acb->s, node);
+return NULL;
+}
+return node;
+}
+
 static uint64_t ranges_overlap_size(uint64_t node1, uint32_t size1,
 uint64_t node2, uint32_t size2)
 {
@@ -735,6 +745,38 @@ static void pcache_send_acb_request_list(BlockDriverState *bs,
 qemu_co_mutex_unlock(&s->requests.lock);
 }
 
+static void readahead_node_prev(PrefCacheAIOCB *acb, PCNode *node,
+RbNodeKey *key)
+{
+RbNodeKey lc_key;
+if (node->cm.key.num <= key->num) {
+return;
+}
+
+lc_key.num = key->num;
+lc_key.size = node->cm.key.num - key->num;
+
+pcache_node_add(acb, &lc_key);
+}
+
+static void readahead_node_next(PrefCacheAIOCB *acb, PCNode *node,
+RbNodeKey *key, uint64_t total_sectors)
+{
+BDRVPCacheState *s;
+RbNodeKey lc_key;
+if (node->cm.key.num + node->cm.key.size >= key->num + key->size) {
+return;
+}
+s = acb->s;
+
+lc_key.num = node->cm.key.num + node->cm.key.size;
+lc_key.size = s->cfg.readahead_size;
+if (total_sectors <= lc_key.num + lc_key.size) {
+return;
+}
+pcache_node_add(acb, &lc_key);
+}
+
 static bool check_allocated_blocks(BlockDriverState *bs, int64_t sector_num,
int32_t nb_sectors)
 {
@@ -777,9 +819,14 @@ static void pcache_readahead_request(BlockDriverState *bs, PrefCacheAIOCB *acb)
acb->common.opaque, PCACHE_AIO_READ |
PCACHE_AIO_READAHEAD);
 if (!pcache_node_find_and_create(acb_readahead, &key, &node)) {
+readahead_node_prev(acb_readahead, node, &key);
+readahead_node_next(acb_readahead, node, &key, total_sectors);
+
 pcache_node_unref(s, node);
-qemu_aio_unref(acb_readahead);
-return;
+if (acb_readahead->requests.cnt == 0) {
+qemu_aio_unref(acb_readahead);
+return;
+}
 }
 pcache_send_acb_request_list(bs, acb_readahead);
 }
-- 
2.8.3




Re: [Qemu-block] [PATCH v3 1/3] qemu-nbd: Add --fork option

2016-08-29 Thread Sascha Silbe
Dear Max,


thanks for taking the time to fix the race condition!


Max Reitz  writes:

> Using the --fork option, one can make qemu-nbd fork the worker process.
> The original process will exit on error of the worker or once the worker
> enters the main loop.

> @@ -773,7 +780,7 @@ int main(int argc, char **argv)
>  return 0;
>  }
>
> -if (device && !verbose) {
> +if ((device && !verbose) || fork_process) {
>  int stderr_fd[2];
>  pid_t pid;
>  int ret;

Looking at the surrounding (unchanged) code I see that qemu-nbd already
implemented a daemon mode. It's just that it's completely undocumented
and hinges on both the --device and the --verbose option. Yuck.

It seems there are two things --verbose does (from a user point of
view):

1. Print "NBD device %s is now connected to %s" and keep stderr open.

   Debug messages are always printed to stderr, but in non-verbose
   daemon mode they end up at /dev/null.

   This is more or less what one usually expects from an option named
   --verbose. Except that it only affects daemon mode and messages are
   always printed (but end up at /dev/null).

2. Disable daemon mode.

   I might expect this for an option named --debug, but certainly not
   for --verbose...


A clean way forward would be something like this:

1. Introduce --foreground / --daemon, --quiet

   Default to daemon mode with silent output if --connect is given,
   foreground mode with visible output otherwise. Set non-daemon mode
   with visible output if --verbose is given. Let --foreground /
   --daemon / --quiet override any default or implicit value. Document
   that --verbose implicitly disables daemon mode for compatibility with
   previous versions and that future versions may stop doing so
   (i.e. users should use either --verbose --foreground or --verbose
   --daemon). See the sketch after this list.

3. At some point in the future (qemu 3.0?) we can stop having --verbose
   imply --foreground.
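
As a sketch (made-up names, not qemu-nbd code), the precedence described in
point 1 could be resolved after option parsing like this:

#include <stdbool.h>
#include <stdio.h>

enum { MODE_UNSET, MODE_FOREGROUND, MODE_DAEMON };

static void resolve_mode(int mode_opt, bool quiet_opt, bool verbose,
                         bool have_device, bool *daemonize, bool *silent)
{
    /* implicit defaults: daemonize silently when --connect was given */
    *daemonize = have_device;
    *silent = have_device;

    /* --verbose keeps today's behaviour: foreground, visible output */
    if (verbose) {
        *daemonize = false;
        *silent = false;
    }

    /* explicit --foreground / --daemon / --quiet override everything */
    if (mode_opt == MODE_FOREGROUND) {
        *daemonize = false;
    } else if (mode_opt == MODE_DAEMON) {
        *daemonize = true;
    }
    if (quiet_opt) {
        *silent = true;
    }
}

int main(void)
{
    bool daemonize, silent;

    /* e.g. "qemu-nbd -c /dev/nbd0 --verbose": foreground, visible output */
    resolve_mode(MODE_UNSET, false, true, true, &daemonize, &silent);
    printf("daemonize=%d silent=%d\n", daemonize, silent);
    return 0;
}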


I can give it a try if it's out of scope for your current task.


Sascha
-- 
Softwareentwicklung Sascha Silbe, Niederhofenstraße 5/1, 71229 Leonberg
https://se-silbe.de/
USt-IdNr. DE281696641




Re: [Qemu-block] [PATCH for-2.8 v3 2/3] module: Don't load the same module if requested multiple times

2016-08-29 Thread Max Reitz
On 17.08.2016 09:20, Fam Zheng wrote:
> Use a hash table to keep record of all loaded modules, and return early
> if the requested module is already loaded.
> 
> Signed-off-by: Fam Zheng 
> ---
>  util/module.c | 19 +--
>  1 file changed, 17 insertions(+), 2 deletions(-)
> 
> diff --git a/util/module.c b/util/module.c
> index a5f7fbd..63efad6 100644
> --- a/util/module.c
> +++ b/util/module.c
> @@ -163,14 +163,29 @@ void module_load_one(const char *prefix, const char *lib_name)
>  char *fname = NULL;
>  char *exec_dir;
>  char *dirs[3];
> +char *module_name;
>  int i = 0;
>  int ret;
> +static GHashTable *loaded_modules;
>  
>  if (!g_module_supported()) {
>  fprintf(stderr, "Module is not supported by system.\n");
>  return;
>  }
>  
> +if (!loaded_modules) {
> +loaded_modules = g_hash_table_new(g_str_hash, g_str_equal);
> +}
> +
> +module_name = g_strdup_printf("%s%s", prefix, lib_name);
> +
> +if (g_hash_table_lookup(loaded_modules, module_name)) {
> +fprintf(stderr, "module is already loaded: %s\n", module_name);

I'm not quite happy with this warning message. Loading a module is
automatically initiated by internal code in qemu, i.e. never done by the
user. Therefore, printing a message for the user does not make much
sense to me since the user cannot do anything about this.

If it is truly wrong to attempt to load a module more than once, this
should be an assertion.

However, I think it's perfectly fine to just allow qemu code to try to
load a module more than once and just ignore the request if we've
already loaded the module (as the commit message implies). In this case,
we don't need an error message or warning, though.
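
For illustration, a minimal standalone sketch (plain GLib, names made up) of
the "silently ignore repeated loads" variant:

#include <glib.h>
#include <stdio.h>

static GHashTable *loaded_modules;

static void load_module_once(const char *prefix, const char *lib_name)
{
    char *module_name;

    if (!loaded_modules) {
        loaded_modules = g_hash_table_new(g_str_hash, g_str_equal);
    }

    module_name = g_strdup_printf("%s%s", prefix, lib_name);
    if (g_hash_table_lookup(loaded_modules, module_name)) {
        g_free(module_name);
        return;                          /* already loaded: nothing to do */
    }
    g_hash_table_insert(loaded_modules, module_name, module_name);

    printf("loading %s\n", module_name);  /* stand-in for the real dlopen path */
}

int main(void)
{
    load_module_once("block-", "qcow2");
    load_module_once("block-", "qcow2"); /* second call is silently ignored */
    return 0;
}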

Max

> +g_free(module_name);
> +return;
> +}
> +g_hash_table_insert(loaded_modules, module_name, module_name);
> +
>  exec_dir = qemu_get_exec_dir();
>  dirs[i++] = g_strdup_printf("%s", CONFIG_QEMU_MODDIR);
>  dirs[i++] = g_strdup_printf("%s/..", exec_dir ? : "");
> @@ -180,8 +195,8 @@ void module_load_one(const char *prefix, const char *lib_name)
>  exec_dir = NULL;
>  
>  for (i = 0; i < ARRAY_SIZE(dirs); i++) {
> -fname = g_strdup_printf("%s/%s%s%s",
> -dirs[i], prefix, lib_name, HOST_DSOSUF);
> +fname = g_strdup_printf("%s/%s%s",
> +dirs[i], module_name, HOST_DSOSUF);
>  ret = module_load_file(fname);
>  g_free(fname);
>  fname = NULL;
> 






Re: [Qemu-block] [PATCH for-2.8 v3 1/3] scripts: Allow block module to not define BlockDriver

2016-08-29 Thread Max Reitz
On 17.08.2016 09:20, Fam Zheng wrote:
> Signed-off-by: Fam Zheng 
> ---
>  scripts/modules/module_block.py | 5 -
>  1 file changed, 5 deletions(-)
> 
> diff --git a/scripts/modules/module_block.py b/scripts/modules/module_block.py
> index db4fb54..7efec00 100644
> --- a/scripts/modules/module_block.py
> +++ b/scripts/modules/module_block.py
> @@ -56,11 +56,6 @@ def process_file(fheader, filename):
>  format_name = ""
>  protocol_name = ""
>  
> -if not found_something:
> -print("No BlockDriver struct found in " + filename + ". \
> -Is this really a module?", file=sys.stderr)
> -sys.exit(1)

found_something is now unused. Do you want to remove it?

Max

> -
>  def print_top(fheader):
>  fheader.write('''/* AUTOMATICALLY GENERATED, DO NOT MODIFY */
>  /*
> 






Re: [Qemu-block] [PATCH v2] iotest 055: refactor and speed up

2016-08-29 Thread Max Reitz
On 09.08.2016 15:15, Vladimir Sementsov-Ogievskiy wrote:
> The source disk is created and filled with test data before each test case.
> Instead, initialize it once for the whole unit.
> 
> Test disk filling patterns are merged into one pattern.
> 
> Also, TestSetSpeed used different image_len values for source and target (by
> mistake); this is automatically fixed here.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
> 
> v2: rebased on block-next; as the compression test pattern differs there, the test patterns are merged.
> 
> I need a review from Pavel: is the new merged disk filling pattern OK for you?
> 
> Also, I removed the performance measurements from the commit message. On
> block-next this test is too long for other reasons, so the speedup is not
> that significant: for qcow2 I get 7min:33s -> 6min:45s.
> 
> 
>  tests/qemu-iotests/055 | 52 +-
>  1 file changed, 18 insertions(+), 34 deletions(-)

Thanks, Vladimir, I've applied the patch to my block-next branch:

https://github.com/XanClic/qemu/commits/block-next

Max


