[Qemu-block] [PATCH V4 01/10] virtio: convert to use DMA api

2016-12-30 Thread Jason Wang
Currently, all virtio devices bypass the IOMMU completely. This is because
address_space_memory is assumed and used during DMA emulation. This
patch converts the virtio core API to use the DMA API. The idea is:

- introduce a new transport-specific helper to query the DMA address
  space (only the PCI version is implemented);
- query and use this address space during virtio device accesses to guest
  memory when the IOMMU platform feature (VIRTIO_F_IOMMU_PLATFORM) is
  enabled for the device.
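
To illustrate the effect (a hedged sketch, not a hunk from this patch; it
assumes QEMU's dma_memory_read() helper and the virtio_tswap*s() accessors):
once vdev->dma_as is set by the transport, guest memory is accessed through
the DMA API against that address space instead of address_space_memory:

/* Sketch only: error handling omitted; VRingDesc as in hw/virtio/virtio.c. */
static void vring_desc_read_sketch(VirtIODevice *vdev, hwaddr desc_pa,
                                   VRingDesc *desc)
{
    /* Goes through the device's DMA address space, so an IOMMU is honoured
     * when the guest negotiated VIRTIO_F_IOMMU_PLATFORM. */
    dma_memory_read(vdev->dma_as, desc_pa, desc, sizeof(*desc));
    virtio_tswap64s(vdev, &desc->addr);
    virtio_tswap32s(vdev, &desc->len);
    virtio_tswap16s(vdev, &desc->flags);
    virtio_tswap16s(vdev, &desc->next);
}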

Cc: Michael S. Tsirkin 
Cc: Stefan Hajnoczi 
Cc: Kevin Wolf 
Cc: Amit Shah 
Cc: Paolo Bonzini 
Cc: qemu-block@nongnu.org
Signed-off-by: Jason Wang 
---
 hw/block/virtio-blk.c |  2 +-
 hw/char/virtio-serial-bus.c   |  3 ++-
 hw/scsi/virtio-scsi.c |  4 ++-
 hw/virtio/virtio-bus.c|  8 ++
 hw/virtio/virtio-pci.c| 14 ++
 hw/virtio/virtio.c| 57 +--
 include/hw/virtio/virtio-access.h | 31 ++---
 include/hw/virtio/virtio-bus.h|  1 +
 include/hw/virtio/virtio.h|  9 ---
 9 files changed, 93 insertions(+), 36 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 0c5fd27..12657ff 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -857,7 +857,7 @@ static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f,
 }
 }
 
-req = qemu_get_virtqueue_element(f, sizeof(VirtIOBlockReq));
+req = qemu_get_virtqueue_element(vdev, f, sizeof(VirtIOBlockReq));
 virtio_blk_init_request(s, virtio_get_queue(vdev, vq_idx), req);
 req->next = s->rq;
 s->rq = req;
diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
index 7975c2c..d544cd9 100644
--- a/hw/char/virtio-serial-bus.c
+++ b/hw/char/virtio-serial-bus.c
@@ -732,6 +732,7 @@ static void virtio_serial_post_load_timer_cb(void *opaque)
 static int fetch_active_ports_list(QEMUFile *f,
VirtIOSerial *s, uint32_t nr_active_ports)
 {
+VirtIODevice *vdev = VIRTIO_DEVICE(s);
 uint32_t i;
 
 s->post_load = g_malloc0(sizeof(*s->post_load));
@@ -765,7 +766,7 @@ static int fetch_active_ports_list(QEMUFile *f,
 qemu_get_be64s(f, &port->iov_offset);
 
 port->elem =
-qemu_get_virtqueue_element(f, sizeof(VirtQueueElement));
+qemu_get_virtqueue_element(vdev, f, sizeof(VirtQueueElement));
 
 /*
  *  Port was throttled on source machine.  Let's
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 34bba35..3da9d62 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -198,12 +198,14 @@ static void *virtio_scsi_load_request(QEMUFile *f, SCSIRequest *sreq)
 SCSIBus *bus = sreq->bus;
 VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus);
 VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+VirtIODevice *vdev = VIRTIO_DEVICE(s);
 VirtIOSCSIReq *req;
 uint32_t n;
 
 qemu_get_be32s(f, &n);
 assert(n < vs->conf.num_queues);
-req = qemu_get_virtqueue_element(f, sizeof(VirtIOSCSIReq) + vs->cdb_size);
+req = qemu_get_virtqueue_element(vdev, f,
+ sizeof(VirtIOSCSIReq) + vs->cdb_size);
 virtio_scsi_init_req(s, vs->cmd_vqs[n], req);
 
 if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICmdReq) + vs->cdb_size,
diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c
index d6c0c72..d31cc00 100644
--- a/hw/virtio/virtio-bus.c
+++ b/hw/virtio/virtio-bus.c
@@ -28,6 +28,7 @@
 #include "hw/qdev.h"
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio.h"
+#include "exec/address-spaces.h"
 
 /* #define DEBUG_VIRTIO_BUS */
 
@@ -61,6 +62,13 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp)
 if (klass->device_plugged != NULL) {
 klass->device_plugged(qbus->parent, errp);
 }
+
+if (klass->get_dma_as != NULL &&
+virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM)) {
+vdev->dma_as = klass->get_dma_as(qbus->parent);
+} else {
+vdev->dma_as = &address_space_memory;
+}
 }
 
 /* Reset the virtio_bus */
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 21c2b9d..213d57e 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1144,6 +1144,14 @@ static int virtio_pci_query_nvectors(DeviceState *d)
 return proxy->nvectors;
 }
 
+static AddressSpace *virtio_pci_get_dma_as(DeviceState *d)
+{
+VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
+PCIDevice *dev = &proxy->pci_dev;
+
+return pci_get_address_space(dev);
+}
+
 static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy,
struct virtio_pci_cap *cap)
 {
@@ -1601,6 +1609,11 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
 }
 
 if (legacy) {
+if (virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM)) {
+error_setg(errp, "VIRTIO_

[Qemu-block] [PATCH v2 11/18] block/pcache: add reading data from the cache

2016-12-30 Thread Pavel Butsykin
Add read_cache_data_direct(), which reads data from a node into a qiov, and
its simplest use case: reading data from a node, provided that the node is
not in flight and fully covers the read request.
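
For intuition, the overlap arithmetic at the heart of read_cache_data_direct()
can be exercised standalone (illustrative offsets, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Same arithmetic as ranges_overlap_size() below. */
static uint64_t ranges_overlap_size(uint64_t offset1, uint64_t size1,
                                    uint64_t offset2, uint64_t size2)
{
    return MIN(offset1 + size1, offset2 + size2) - MAX(offset1, offset2);
}

int main(void)
{
    /* The node caches [4096, +8192); the request reads [6144, +2048).
     * The request lies fully inside the node, so node_offs = 2048 and
     * the copy size equals the request size. */
    uint64_t size = ranges_overlap_size(6144, 2048, 4096, 8192);

    printf("overlap = %llu bytes\n", (unsigned long long)size);   /* 2048 */
    return 0;
}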

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 56 
 1 file changed, 56 insertions(+)

diff --git a/block/pcache.c b/block/pcache.c
index 4007241d37..e3749679ca 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -92,6 +92,30 @@ static void pcache_node_unref(PCacheNode *node)
 }
 }
 
+static uint64_t ranges_overlap_size(uint64_t offset1, uint64_t size1,
+uint64_t offset2, uint64_t size2)
+{
+return MIN(offset1 + size1, offset2 + size2) - MAX(offset1, offset2);
+}
+
+static void read_cache_data_direct(PCacheNode *node, uint64_t offset,
+   uint64_t bytes, QEMUIOVector *qiov)
+{
+uint64_t qiov_offs = 0, node_offs = 0;
+uint64_t size;
+uint64_t copy;
+
+if (offset < node->common.offset) {
+qiov_offs = node->common.offset - offset;
+} else {
+node_offs = offset - node->common.offset;
+}
+size = ranges_overlap_size(offset, bytes, node->common.offset,
+   node->common.bytes);
+copy = qemu_iovec_from_buf(qiov, qiov_offs, node->data + node_offs, size);
+assert(copy == size);
+}
+
 static void update_req_stats(RBCache *rbcache, uint64_t offset, uint64_t bytes)
 {
 do {
@@ -288,6 +312,32 @@ static void pcache_readahead_entry(void *opaque)
 readahead_co->bytes);
 }
 
+enum {
+CACHE_MISS,
+CACHE_HIT,
+};
+
+static int pcache_lookup_data(BlockDriverState *bs, uint64_t offset,
+  uint64_t bytes, QEMUIOVector *qiov)
+{
+BDRVPCacheState *s = bs->opaque;
+
+PCacheNode *node = rbcache_search(s->cache, offset, bytes);
+if (node == NULL || node->status & NODE_STATUS_INFLIGHT) {
+return CACHE_MISS;
+}
+
+/* Node covers the whole request */
+if (node->common.offset <= offset &&
+node->common.offset + node->common.bytes >= offset + bytes)
+{
+read_cache_data_direct(node, offset, bytes, qiov);
+return CACHE_HIT;
+}
+
+return CACHE_MISS;
+}
+
 static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset,
  uint64_t bytes, QEMUIOVector *qiov,
  int flags)
@@ -295,6 +345,7 @@ static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset,
 BDRVPCacheState *s = bs->opaque;
 PCacheReadaheadCo readahead_co;
 Coroutine *co;
+int status;
 
 if (bytes > s->max_aio_size) {
 goto skip_large_request;
@@ -310,6 +361,11 @@ static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset,
 co = qemu_coroutine_create(pcache_readahead_entry, &readahead_co);
 qemu_coroutine_enter(co);
 
+status = pcache_lookup_data(bs, offset, bytes, qiov);
+if (status == CACHE_HIT) {
+return 0;
+}
+
 skip_large_request:
 return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 }
-- 
2.11.0




[Qemu-block] [PATCH v2 15/18] block/pcache: pick up parts of the cache

2016-12-30 Thread Pavel Butsykin
Provided that the request size is less than the readahead size, a partial
cache hit can occur in the following three cases:
1. The request covers the bottom part of the node
2. The request covers the upper part of the node
3. The request is between two nodes and partially covers both of them

The function pickup_parts_of_cache() covers all three cases, and for the
uncached parts of the request it creates new requests.
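
A standalone model of the split (illustrative offsets; the real code walks
the rb-tree and issues coroutine requests for the missing parts):

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
    /* One node caches [8192, +4096); the request reads [4096, +12288), so
     * it splits into an uncached head, a cached middle and an uncached
     * tail, i.e. the three fragments pickup_parts_of_cache() deals with. */
    uint64_t node_off = 8192, node_bytes = 4096;
    uint64_t off = 4096, bytes = 12288;
    uint64_t head, mid;

    if (off < node_off) {                            /* uncached head */
        head = node_off - off;
        printf("miss: [%llu, +%llu)\n",
               (unsigned long long)off, (unsigned long long)head);
        off += head;
        bytes -= head;
    }

    mid = MIN(node_off + node_bytes - off, bytes);   /* cached middle */
    printf("hit:  [%llu, +%llu)\n",
           (unsigned long long)off, (unsigned long long)mid);
    off += mid;
    bytes -= mid;

    if (bytes) {                                     /* uncached tail */
        printf("miss: [%llu, +%llu)\n",
               (unsigned long long)off, (unsigned long long)bytes);
    }
    return 0;
}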

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 174 ++---
 1 file changed, 166 insertions(+), 8 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index 86b28de44b..3f89e40b05 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -10,6 +10,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/error-report.h"
 #include "block/block_int.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qstring.h"
@@ -355,6 +356,153 @@ static void pcache_readahead_entry(void *opaque)
 readahead_co->bytes);
 }
 
+/*
+ * Provided that the request size is less than or equal to the readahead size,
+ * a partial cache hit can occur in the following three cases:
+ * 1. The request covers the bottom part of the node.
+ * 2. The request covers the upper part of the node.
+ * 3. The request is between two nodes and partially covers both of them.
+ *
+ * Therefore, the request can be divided into no more than 3 parts.
+ */
+#define PCACHE_MAX_FRAGMENT_NUM 3
+
+typedef struct PartReqEntry {
+ReqLinkEntry rlink;
+PCacheNode *node;
+} PartReqEntry;
+
+typedef struct PartReqDesc {
+Coroutine *co;
+PartReqEntry parts[PCACHE_MAX_FRAGMENT_NUM];
+uint32_t cnt;
+uint32_t completed;
+} PartReqDesc;
+
+typedef struct PCachePartReqCo {
+BlockDriverState *bs;
+uint64_t offset;
+uint64_t bytes;
+PartReqDesc *desc;
+} PCachePartReqCo;
+
+static void coroutine_fn pcache_co_part_req(BlockDriverState *bs,
+uint64_t offset, uint64_t bytes,
+PartReqDesc *req)
+{
+BDRVPCacheState *s = bs->opaque;
+QEMUIOVector qiov;
+PartReqEntry *part = &req->parts[req->cnt];
+PCacheNode *node = container_of(rbcache_node_alloc(s->cache, offset, bytes),
+PCacheNode, common);
+node->status = NODE_STATUS_INFLIGHT;
+qemu_iovec_init(&qiov, 1);
+qemu_iovec_add(&qiov, node->data, node->common.bytes);
+
+part->node = node;
+assert(++req->cnt <= PCACHE_MAX_FRAGMENT_NUM);
+
+part->rlink.ret = bdrv_co_preadv(bs->file, offset, bytes, &qiov, 0);
+
+node->status = NODE_STATUS_COMPLETED | NODE_STATUS_REMOVE;
+QLIST_INSERT_HEAD(&s->remove_node_list, node, entry);
+
+if (!qemu_coroutine_entered(req->co)) {
+qemu_coroutine_enter(req->co);
+} else {
+req->completed++;
+}
+}
+
+static void pcache_part_req_entry(void *opaque)
+{
+PCachePartReqCo *req_co = opaque;
+
+pcache_co_part_req(req_co->bs, req_co->offset, req_co->bytes, req_co->desc);
+}
+
+static int pickup_parts_of_cache(BlockDriverState *bs, PCacheNode *node,
+ uint64_t offset, uint64_t bytes,
+ QEMUIOVector *qiov)
+{
+BDRVPCacheState *s = bs->opaque;
+PartReqDesc req = {
+.co = qemu_coroutine_self(),
+};
+PCachePartReqCo req_co = {
+.bs = bs,
+.desc = &req
+};
+uint64_t chunk_offset = offset, chunk_bytes = bytes;
+uint64_t up_size;
+int ret = 0;
+
+do {
+pcache_node_ref(node);
+
+if (chunk_offset < node->common.offset) {
+Coroutine *co;
+
+req_co.offset = chunk_offset;
+req_co.bytes = up_size = node->common.offset - chunk_offset;
+
+co = qemu_coroutine_create(pcache_part_req_entry, &req_co);
+qemu_coroutine_enter(co);
+
+chunk_offset += up_size;
+chunk_bytes -= up_size;
+}
+
+req.parts[req.cnt].node = node;
+if (node->status & NODE_STATUS_INFLIGHT) {
+req.parts[req.cnt].rlink.co = qemu_coroutine_self();
+QTAILQ_INSERT_HEAD(&node->wait_list,
+   &req.parts[req.cnt].rlink, entry);
+} else {
+req.completed++;
+}
+assert(++req.cnt <= PCACHE_MAX_FRAGMENT_NUM);
+
+up_size = MIN(node->common.offset + node->common.bytes - chunk_offset,
+  chunk_bytes);
+chunk_bytes -= up_size;
+chunk_offset += up_size;
+
+if (chunk_bytes != 0) {
+node = rbcache_search(s->cache, chunk_offset, chunk_bytes);
+if (node == NULL) {
+Coroutine *co;
+
+req_co.offset = chunk_offset;
+req_co.bytes = chunk_bytes;
+
+co = qemu_coroutine_create(pcache_part_req_entry, &req_co);
+qemu_coroutine_enter(co);
+chunk_bytes = 0;
+  

[Qemu-block] [PATCH v2 08/18] block/pcache: add AIO readahead

2016-12-30 Thread Pavel Butsykin
This patch adds readahead of data into the cache. Here the readahead is a
separate asynchronous request which doesn't depend on completion of the
filtered read requests. The readahead is done only on the condition that a
sufficient amount of sequentially read data precedes the current request.
This information is provided by the request statistics; of course this
method of detection is not entirely reliable, but in most cases it works.
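
The detection condition can be modelled standalone (a sorted array stands in
for the statistics rb-tree; check_request_sequence() in the diff below does
the same walk):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t offset, bytes; } Range;

/* True if [offset - readahead_size, offset) is fully covered by previously
 * recorded read requests, i.e. the current request ends a sequence. */
static bool preceded_by_sequence(const Range *stats, int n, uint64_t offset,
                                 uint64_t readahead_size)
{
    uint64_t check = offset - readahead_size;

    while (check < offset) {
        int i;

        for (i = 0; i < n; i++) {       /* find a range containing 'check' */
            if (stats[i].offset <= check &&
                stats[i].offset + stats[i].bytes > check) {
                break;
            }
        }
        if (i == n) {
            return false;               /* hole in the statistics */
        }
        check = stats[i].offset + stats[i].bytes;
    }
    return true;
}

int main(void)
{
    Range stats[] = { { 0, 4096 }, { 4096, 4096 } };

    /* 8K of recorded reads precede offset 8192: readahead may fire. */
    printf("%d\n", preceded_by_sequence(stats, 2, 8192, 8192));   /* 1 */
    /* Nothing is recorded just before 16384: no readahead. */
    printf("%d\n", preceded_by_sequence(stats, 2, 16384, 8192));  /* 0 */
    return 0;
}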

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 206 -
 1 file changed, 204 insertions(+), 2 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index deac57c58d..57eebd434a 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -17,6 +17,8 @@
 
 #define PCACHE_OPT_STATS_SIZE "pcache-stats-size"
 #define PCACHE_OPT_MAX_AIO_SIZE "pcache-max-aio-size"
+#define PCACHE_OPT_CACHE_SIZE "pcache-full-size"
+#define PCACHE_OPT_READAHEAD_SIZE "pcache-readahead-size"
 
 static QemuOptsList runtime_opts = {
 .name = "pcache",
@@ -32,6 +34,16 @@ static QemuOptsList runtime_opts = {
 .type = QEMU_OPT_SIZE,
 .help = "Maximum size of aio which is handled by pcache",
 },
+{
+.name = PCACHE_OPT_CACHE_SIZE,
+.type = QEMU_OPT_SIZE,
+.help = "Total cache size",
+},
+{
+.name = PCACHE_OPT_READAHEAD_SIZE,
+.type = QEMU_OPT_SIZE,
+.help = "Prefetch cache readahead size",
+},
 { /* end of list */ }
 },
 };
@@ -40,12 +52,46 @@ static QemuOptsList runtime_opts = {
 #define MB_BITS 20
 #define PCACHE_DEFAULT_STATS_SIZE (3 << MB_BITS)
 #define PCACHE_DEFAULT_MAX_AIO_SIZE (64 << KB_BITS)
+#define PCACHE_DEFAULT_CACHE_SIZE (4 << MB_BITS)
+#define PCACHE_DEFAULT_READAHEAD_SIZE (128 << KB_BITS)
 
 typedef struct BDRVPCacheState {
 RBCache *req_stats;
+RBCache *cache;
 uint64_t max_aio_size;
+uint64_t readahead_size;
 } BDRVPCacheState;
 
+typedef struct PCacheNode {
+RBCacheNode common;
+uint8_t *data;
+enum {
+NODE_STATUS_NEW   = 0x01,
+NODE_STATUS_INFLIGHT  = 0x02,
+NODE_STATUS_COMPLETED = 0x04,
+NODE_STATUS_REMOVE= 0x08,
+NODE_STATUS_DELETED   = 0x10, /* only for debugging */
+} status;
+int ref;
+} PCacheNode;
+
+static inline void pcache_node_ref(PCacheNode *node)
+{
+node->ref++;
+}
+
+static void pcache_node_unref(PCacheNode *node)
+{
+assert(node->ref > 0);
+if (--node->ref == 0) {
+assert(node->status & NODE_STATUS_REMOVE);
+node->status |= NODE_STATUS_DELETED;
+
+g_free(node->data);
+g_free(node);
+}
+}
+
 static void update_req_stats(RBCache *rbcache, uint64_t offset, uint64_t bytes)
 {
 do {
@@ -80,16 +126,165 @@ static void update_req_stats(RBCache *rbcache, uint64_t offset, uint64_t bytes)
 } while (true);
 }
 
+static bool check_request_sequence(BDRVPCacheState *s, uint64_t offset)
+{
+uint64_t cache_line_size = s->readahead_size;
+uint64_t check_offset;
+
+if (offset <= cache_line_size) {
+return false;
+}
+check_offset = offset - cache_line_size;
+
+do {
+RBCacheNode *node = rbcache_search(s->req_stats, check_offset,
+   offset - check_offset);
+if (node == NULL) {
+return false;
+}
+if (node->offset > check_offset) {
+return false;
+}
+check_offset = node->offset + node->bytes;
+} while (check_offset < offset);
+
+return true;
+}
+
+static void pcache_node_free(RBCacheNode *rbnode, void *opaque)
+{
+PCacheNode *node = container_of(rbnode, PCacheNode, common);
+
+assert(node->status == NODE_STATUS_INFLIGHT ||
+   node->status == NODE_STATUS_COMPLETED);
+
+node->status |= NODE_STATUS_REMOVE;
+pcache_node_unref(node);
+}
+
+static RBCacheNode *pcache_node_alloc(uint64_t offset, uint64_t bytes,
+  void *opaque)
+{
+PCacheNode *node = g_malloc(sizeof(*node));
+
+node->data = g_malloc(bytes);
+node->status = NODE_STATUS_NEW;
+node->ref = 1;
+
+return &node->common;
+}
+
+#define PCACHE_STEPS_FORWARD 2
+
+static PCacheNode *get_readahead_node(BlockDriverState *bs, RBCache *rbcache,
+  uint64_t offset, uint64_t bytes)
+{
+uint32_t count = PCACHE_STEPS_FORWARD;
+
+int64_t total_bytes = bdrv_getlength(bs);
+if (total_bytes < 0) {
+return NULL;
+}
+
+while (count--) {
+PCacheNode *node;
+
+if (total_bytes <= offset + bytes) {
+break;
+}
+
+node = rbcache_search_and_insert(rbcache, offset, bytes);
+if (node->status == NODE_STATUS_NEW) {
+return node;
+}
+ /* The range less than the readahead size is not cached to reduce
+  * fragmentation of the cache. If the data is already cached, then we
+ 

[Qemu-block] [PATCH v2 16/18] block/pcache: drop used pcache nodes

2016-12-30 Thread Pavel Butsykin
The pcache is aimed at a particular situation: sequential reads. This lets
us drop parts of the cache that have already been used, which reduces the
cache size and the number of evicted nodes.
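
The accounting added here reduces to a byte counter per node (a toy model
with made-up sizes):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t node_bytes = 8192, rdcnt = 0;
    uint64_t reads[] = { 4096, 2048, 2048 };    /* sequential read chunks */
    int i;

    for (i = 0; i < 3; i++) {
        rdcnt += reads[i];
        if (rdcnt >= node_bytes) {
            printf("read #%d: node fully consumed, drop it from the cache\n",
                   i + 1);
        } else {
            printf("read #%d: %llu/%llu bytes read, keep it\n", i + 1,
                   (unsigned long long)rdcnt,
                   (unsigned long long)node_bytes);
        }
    }
    return 0;
}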

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 26 +++---
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index 3f89e40b05..6d2b54cf78 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -81,6 +81,7 @@ typedef struct PCacheNode {
 NODE_STATUS_REMOVE= 0x08,
 NODE_STATUS_DELETED   = 0x10, /* only for debugging */
 } status;
+uint64_t rdcnt;
 QLIST_ENTRY(PCacheNode) entry;
 int ref;
 } PCacheNode;
@@ -109,13 +110,17 @@ static uint64_t ranges_overlap_size(uint64_t offset1, uint64_t size1,
 return MIN(offset1 + size1, offset2 + size2) - MAX(offset1, offset2);
 }
 
-static void read_cache_data_direct(PCacheNode *node, uint64_t offset,
-   uint64_t bytes, QEMUIOVector *qiov)
+static void read_cache_data_direct(BlockDriverState *bs, PCacheNode *node,
+   uint64_t offset, uint64_t bytes,
+   QEMUIOVector *qiov)
 {
+BDRVPCacheState *s = bs->opaque;
 uint64_t qiov_offs = 0, node_offs = 0;
 uint64_t size;
 uint64_t copy;
 
+assert(node->status & NODE_STATUS_COMPLETED);
+
 if (offset < node->common.offset) {
 qiov_offs = node->common.offset - offset;
 } else {
@@ -124,11 +129,17 @@ static void read_cache_data_direct(PCacheNode *node, uint64_t offset,
 size = ranges_overlap_size(offset, bytes, node->common.offset,
node->common.bytes);
 copy = qemu_iovec_from_buf(qiov, qiov_offs, node->data + node_offs, size);
+node->rdcnt += size;
+if (node->rdcnt >= node->common.bytes &&
+!(node->status & NODE_STATUS_REMOVE))
+{
+rbcache_remove(s->cache, &node->common);
+}
 assert(copy == size);
 }
 
-static int read_cache_data(PCacheNode *node, uint64_t offset, uint64_t bytes,
-   QEMUIOVector *qiov)
+static int read_cache_data(BlockDriverState *bs, PCacheNode *node,
+   uint64_t offset, uint64_t bytes, QEMUIOVector *qiov)
 {
 if (node->status & NODE_STATUS_INFLIGHT) {
 ReqLinkEntry rlink = {
@@ -143,7 +154,7 @@ static int read_cache_data(PCacheNode *node, uint64_t offset, uint64_t bytes,
 return rlink.ret;
 }
 }
-read_cache_data_direct(node, offset, bytes, qiov);
+read_cache_data_direct(bs, node, offset, bytes, qiov);
 
 return 0;
 }
@@ -228,6 +239,7 @@ static RBCacheNode *pcache_node_alloc(uint64_t offset, uint64_t bytes,
 
 node->data = g_malloc(bytes);
 node->status = NODE_STATUS_NEW;
+node->rdcnt = 0;
 node->ref = 1;
 QTAILQ_INIT(&node->wait_list);
 
@@ -492,7 +504,7 @@ static int pickup_parts_of_cache(BlockDriverState *bs, PCacheNode *node,
 PartReqEntry *part = &req.parts[req.cnt];
 if (ret == 0) {
 if (part->rlink.ret == 0) {
-read_cache_data_direct(part->node, offset, bytes, qiov);
+read_cache_data_direct(bs, part->node, offset, bytes, qiov);
 } else {
 ret = part->rlink.ret;
 }
@@ -523,7 +535,7 @@ static int pcache_lookup_data(BlockDriverState *bs, uint64_t offset,
 if (node->common.offset <= offset &&
 node->common.offset + node->common.bytes >= offset + bytes)
 {
-ret = read_cache_data(node, offset, bytes, qiov);
+ret = read_cache_data(bs, node, offset, bytes, qiov);
 
 } else {
 ret = pickup_parts_of_cache(bs, node, offset, bytes, qiov);
-- 
2.11.0




[Qemu-block] [PATCH v2 02/18] util/rbtree: add rbtree from linux kernel

2016-12-30 Thread Pavel Butsykin
Why don't we use the rbtree from glib? Because we need a pointer to the
parent node. To store cached chunks in the rbtree optimally, we need to get
the next and previous nodes, and the parent pointer is very useful for an
efficient implementation of these functions. In this implementation of the
rbtree (unlike glib's) the node contains a pointer to its parent node.
Moreover, this rbtree allows more flexibility when working with an
algorithm, because to use it you have to implement your own insert and
search cores. This avoids callbacks and the dramatic performance drop they
would bring.
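
As an example of what such insert and search cores look like (a sketch
against the qemu/rbtree.h interface added below; MyNode is a made-up user
structure keyed by a uint64_t):

#include "qemu/rbtree.h"

typedef struct MyNode {
    struct RbNode rb;
    uint64_t key;
} MyNode;

static MyNode *my_search(struct RbRoot *root, uint64_t key)
{
    struct RbNode *n = root->rb_node;

    while (n) {
        MyNode *e = RB_ENTRY(n, MyNode, rb);

        if (key < e->key) {
            n = n->rb_left;
        } else if (key > e->key) {
            n = n->rb_right;
        } else {
            return e;
        }
    }
    return NULL;
}

static void my_insert(struct RbRoot *root, MyNode *new)
{
    struct RbNode **p = &root->rb_node, *parent = NULL;

    while (*p) {
        parent = *p;
        if (new->key < RB_ENTRY(*p, MyNode, rb)->key) {
            p = &(*p)->rb_left;
        } else {
            p = &(*p)->rb_right;
        }
    }
    rb_link_node(&new->rb, parent, p);  /* link as a leaf node... */
    rb_insert_color(&new->rb, root);    /* ...then rebalance */
}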

Signed-off-by: Pavel Butsykin 
---
 MAINTAINERS |   7 +
 include/qemu/rbtree.h   | 107 
 include/qemu/rbtree_augmented.h | 235 +
 util/Makefile.objs  |   1 +
 util/rbtree.c   | 570 
 5 files changed, 920 insertions(+)
 create mode 100644 include/qemu/rbtree.h
 create mode 100644 include/qemu/rbtree_augmented.h
 create mode 100644 util/rbtree.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 585cd5abd7..228278c1ca 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1465,6 +1465,13 @@ F: include/qemu/throttle.h
 F: util/throttle.c
 L: qemu-block@nongnu.org
 
+Red Black Trees
+M: Denis V. Lunev 
+S: Supported
+F: include/qemu/rbtree.h
+F: include/qemu/rbtree_augmented.h
+F: util/rbtree.c
+
 UUID
 M: Fam Zheng 
 S: Supported
diff --git a/include/qemu/rbtree.h b/include/qemu/rbtree.h
new file mode 100644
index 00..d2e3fdd149
--- /dev/null
+++ b/include/qemu/rbtree.h
@@ -0,0 +1,107 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli 
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  include/qemu/rbtree.h
+
+  To use rbtrees you'll have to implement your own insert and search cores.
+  This will avoid us to use callbacks and to drop drammatically performances.
+  I know it's not the cleaner way,  but in C (not in C++) to get
+  performances and genericity...
+*/
+
+#ifndef QEMU_RBTREE_H
+#define QEMU_RBTREE_H
+
+#include 
+#include 
+#include 
+
+struct RbNode {
+uintptr_t __rb_parent_color;
+struct RbNode *rb_right;
+struct RbNode *rb_left;
} __attribute__((aligned(sizeof(uintptr_t))));
+/* The alignment might seem pointless, but allegedly CRIS needs it */
+
+struct RbRoot {
+struct RbNode *rb_node;
+};
+
+
+#define RB_PARENT(r) ((struct RbNode *)((r)->__rb_parent_color & ~3))
+
+#define RB_ROOT (struct RbRoot) { NULL, }
+#define RB_ENTRY(ptr, type, member) container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
+
+/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
+#define RB_EMPTY_NODE(node)  \
+((node)->__rb_parent_color == (uintptr_t)(node))
+#define RB_CLEAR_NODE(node)  \
+((node)->__rb_parent_color = (uintptr_t)(node))
+
+
+extern void rb_insert_color(struct RbNode *, struct RbRoot *);
+extern void rb_erase(struct RbNode *, struct RbRoot *);
+
+
+/* Find logical next and previous nodes in a tree */
+extern struct RbNode *rb_next(const struct RbNode *);
+extern struct RbNode *rb_prev(const struct RbNode *);
+extern struct RbNode *rb_first(const struct RbRoot *);
+extern struct RbNode *rb_last(const struct RbRoot *);
+
+/* Postorder iteration - always visit the parent after its children */
+extern struct RbNode *rb_first_postorder(const struct RbRoot *);
+extern struct RbNode *rb_next_postorder(const struct RbNode *);
+
+/* Fast replacement of a single node without remove/rebalance/add/rebalance */
+extern void rb_replace_node(struct RbNode *victim, struct RbNode *new,
+struct RbRoot *root);
+
+static inline void rb_link_node(struct RbNode *node, struct RbNode *parent,
+struct RbNode **rb_link)
+{
+node->__rb_parent_color = (uintptr_t)parent;
+node->rb_left = node->rb_right = NULL;
+
+*rb_link = node;
+}
+
+#define RB_ENTRY_SAFE(ptr, type, member) \
+({ typeof(ptr) _ptr = (ptr); \
+   _ptr ? RB_ENTRY(_ptr, type, member) : NULL; \
+})
+
+/**
+ * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of
+ * given type safe against removal of rb_node entry
+ *
+ * @pos:   the 'type *' to use as a loop cursor.
+ * @n

[Qemu-block] [PATCH v2 18/18] block/pcache: add tracepoints

2016-12-30 Thread Pavel Butsykin
Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 20 
 block/trace-events | 10 ++
 2 files changed, 30 insertions(+)

diff --git a/block/pcache.c b/block/pcache.c
index 6d2b54cf78..2bce3efc59 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -15,6 +15,7 @@
 #include "qapi/error.h"
 #include "qapi/qmp/qstring.h"
 #include "qemu/rbcache.h"
+#include "trace.h"
 
 #define PCACHE_OPT_STATS_SIZE "pcache-stats-size"
 #define PCACHE_OPT_MAX_AIO_SIZE "pcache-max-aio-size"
@@ -348,12 +349,18 @@ static void coroutine_fn pcache_co_readahead(BlockDriverState *bs,
 node->status |= NODE_STATUS_COMPLETED;
 
 if (ret < 0) {
+trace_pcache_readahead_fail(ret, node->common.offset, node->common.bytes);
 rbcache_remove(s->cache, &node->common);
 }
 
 QTAILQ_FOREACH_SAFE(rlink, &node->wait_list, entry, next) {
 QTAILQ_REMOVE(&node->wait_list, rlink, entry);
 rlink->ret = ret;
+
+trace_pcache_readahead_pending_node_complete(ret, node->common.offset, node->common.bytes, offset, bytes);
+
 qemu_coroutine_enter(rlink->co);
 }
 
@@ -509,6 +516,10 @@ static int pickup_parts_of_cache(BlockDriverState *bs, PCacheNode *node,
 ret = part->rlink.ret;
 }
 }
+if (part->rlink.ret < 0) {
+trace_pcache_read_part_fail(ret, part->node->common.offset, part->node->common.bytes);
+}
 pcache_node_unref(part->node);
 }
 
@@ -618,6 +629,8 @@ static void pcache_write_through(BlockDriverState *bs, uint64_t offset,
 
 if (node->status & NODE_STATUS_COMPLETED) {
 write_cache_data(node, offset, bytes, qiov);
+trace_pcache_write_through(offset, bytes, node->common.offset,
+   node->common.bytes);
 }
 } while (end_offs > chunk_offset);
 
@@ -631,6 +644,8 @@ static void pcache_write_through(BlockDriverState *bs, uint64_t offset,
 continue;
 }
 write_cache_data(node, offset, bytes, qiov);
+trace_pcache_write_through_removed_node(offset, bytes, node->common.offset, node->common.bytes);
 }
 }
 
@@ -663,6 +678,9 @@ static int pcache_state_init(QemuOpts *opts, BDRVPCacheState *s)
   PCACHE_DEFAULT_READAHEAD_SIZE);
 QLIST_INIT(&s->remove_node_list);
 
+trace_pcache_state_init(stats_size, s->max_aio_size, cache_size,
+s->readahead_size);
+
 if (s->readahead_size < s->max_aio_size) {
 error_report("Readahead size can't be less than maximum request size"
  "that can be handled by pcache");
@@ -704,6 +722,8 @@ static void pcache_close(BlockDriverState *bs)
 {
 BDRVPCacheState *s = bs->opaque;
 
+trace_pcache_close(s->req_stats, s->cache);
+
 rbcache_destroy(s->req_stats);
 rbcache_destroy(s->cache);
 
diff --git a/block/trace-events b/block/trace-events
index cfc05f2478..d9cef3bcec 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -112,3 +112,13 @@ qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s
 qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
 qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
 qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
+
+# block/pcache.c
+pcache_readahead_fail(int ret, uint64_t offset, uint64_t bytes) "ret: %d offset: %"PRIu64" bytes: %"PRIu64
+pcache_readahead_pending_node_complete(int ret, uint64_t node_offset, uint64_t node_bytes, uint64_t read_offset, uint64_t read_bytes) "ret: %d node: %"PRIu64" %"PRIu64" pending read: %"PRIu64" %"PRIu64
+pcache_aio_read_cb_fail(int ret, uint64_t offset, uint64_t bytes) "ret: %d offset: %"PRIu64" bytes: %"PRIu64
+pcache_read_part_fail(int ret, uint64_t offset, uint64_t bytes) "ret: %d offset: %"PRIu64" bytes: %"PRIu64
+pcache_write_through(uint64_t req_offset, uint64_t req_bytes, uint64_t node_offset, uint64_t node_bytes) "request: %"PRIu64" %"PRIu64" node: %"PRIu64" %"PRIu64
+pcache_write_through_removed_node(uint64_t req_offset, uint64_t req_bytes, uint64_t node_offset, uint64_t node_bytes) "request: %"PRIu64" %"PRIu64" node: %"PRIu64" %"PRIu64
+pcache_state_init(uint64_t stats_size, uint64_t max_aio_size, uint64_t cache_size, uint64_t readahead_size) "pool statistics size: %"PRIu64" max aio size: %"PRIu64" cache size: %"PRIu64" readahead size: %"PRIu64
+pcache_close(void *req_stats, void *cache) "pool statistics: %p cache: %p"
-- 
2.11.0




[Qemu-block] [PATCH v2 12/18] block/pcache: inflight readahead request waiting for read

2016-12-30 Thread Pavel Butsykin
If there is a cache hit but the status of the node is NODE_STATUS_INFLIGHT,
it means that a readahead request for this node is in flight. In this case
we can wait for the completion of the readahead request and then copy the
data. This allows us to optimize aio read requests even further.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 48 
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index e3749679ca..c1cbfa7040 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -62,9 +62,16 @@ typedef struct BDRVPCacheState {
 uint64_t readahead_size;
 } BDRVPCacheState;
 
+typedef struct ReqLinkEntry {
+QTAILQ_ENTRY(ReqLinkEntry) entry;
+Coroutine *co;
+int ret;
+} ReqLinkEntry;
+
 typedef struct PCacheNode {
 RBCacheNode common;
 uint8_t *data;
+QTAILQ_HEAD(, ReqLinkEntry) wait_list;
 enum {
 NODE_STATUS_NEW   = 0x01,
 NODE_STATUS_INFLIGHT  = 0x02,
@@ -116,6 +123,27 @@ static void read_cache_data_direct(PCacheNode *node, uint64_t offset,
 assert(copy == size);
 }
 
+static int read_cache_data(PCacheNode *node, uint64_t offset, uint64_t bytes,
+   QEMUIOVector *qiov)
+{
+if (node->status & NODE_STATUS_INFLIGHT) {
+ReqLinkEntry rlink = {
+.co = qemu_coroutine_self(),
+};
+
+QTAILQ_INSERT_HEAD(&node->wait_list, &rlink, entry);
+
+qemu_coroutine_yield();
+
+if (rlink.ret < 0) {
+return rlink.ret;
+}
+}
+read_cache_data_direct(node, offset, bytes, qiov);
+
+return 0;
+}
+
 static void update_req_stats(RBCache *rbcache, uint64_t offset, uint64_t bytes)
 {
 do {
@@ -194,6 +222,7 @@ static RBCacheNode *pcache_node_alloc(uint64_t offset, uint64_t bytes,
 node->data = g_malloc(bytes);
 node->status = NODE_STATUS_NEW;
 node->ref = 1;
+QTAILQ_INIT(&node->wait_list);
 
 return &node->common;
 }
@@ -272,6 +301,7 @@ static void coroutine_fn pcache_co_readahead(BlockDriverState *bs,
 BDRVPCacheState *s = bs->opaque;
 QEMUIOVector qiov;
 PCacheNode *node;
+ReqLinkEntry *rlink, *next;
 uint64_t readahead_offset;
 uint64_t readahead_bytes;
 int ret;
@@ -301,6 +331,13 @@ static void coroutine_fn pcache_co_readahead(BlockDriverState *bs,
 if (ret < 0) {
 rbcache_remove(s->cache, &node->common);
 }
+
+QTAILQ_FOREACH_SAFE(rlink, &node->wait_list, entry, next) {
+QTAILQ_REMOVE(&node->wait_list, rlink, entry);
+rlink->ret = ret;
+qemu_coroutine_enter(rlink->co);
+}
+
 pcache_node_unref(node);
 }
 
@@ -323,7 +360,7 @@ static int pcache_lookup_data(BlockDriverState *bs, uint64_t offset,
 BDRVPCacheState *s = bs->opaque;
 
 PCacheNode *node = rbcache_search(s->cache, offset, bytes);
-if (node == NULL || node->status & NODE_STATUS_INFLIGHT) {
+if (node == NULL) {
 return CACHE_MISS;
 }
 
@@ -331,7 +368,10 @@ static int pcache_lookup_data(BlockDriverState *bs, uint64_t offset,
 if (node->common.offset <= offset &&
 node->common.offset + node->common.bytes >= offset + bytes)
 {
-read_cache_data_direct(node, offset, bytes, qiov);
+int ret = read_cache_data(node, offset, bytes, qiov);
+if (ret < 0) {
+return ret;
+}
 return CACHE_HIT;
 }
 
@@ -362,8 +402,8 @@ static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset,
 qemu_coroutine_enter(co);
 
 status = pcache_lookup_data(bs, offset, bytes, qiov);
-if (status == CACHE_HIT) {
-return 0;
+if (status != CACHE_MISS) {
+return status < 0 ? status : 0;
 }
 
 skip_large_request:
-- 
2.11.0




[Qemu-block] [PATCH v2 14/18] block/pcache: up-to-date cache for removed nodes

2016-12-30 Thread Pavel Butsykin
We must keep the data of removed nodes up to date. This is necessary when
the reading of a node has to be deferred, because an overlapping write may
arrive in the interval between node completion and the reading of the node.

For this we move removed nodes to remove_node_list and update node->data
on each overlapping write.

This is a preparatory patch for the resolution of partial cache hits.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 24 +++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/block/pcache.c b/block/pcache.c
index 140f90d6d7..86b28de44b 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -60,6 +60,7 @@ typedef struct BDRVPCacheState {
 RBCache *cache;
 uint64_t max_aio_size;
 uint64_t readahead_size;
+QLIST_HEAD(, PCacheNode) remove_node_list;
 } BDRVPCacheState;
 
 typedef struct ReqLinkEntry {
@@ -79,6 +80,7 @@ typedef struct PCacheNode {
 NODE_STATUS_REMOVE= 0x08,
 NODE_STATUS_DELETED   = 0x10, /* only for debugging */
 } status;
+QLIST_ENTRY(PCacheNode) entry;
 int ref;
 } PCacheNode;
 
@@ -94,6 +96,7 @@ static void pcache_node_unref(PCacheNode *node)
 assert(node->status & NODE_STATUS_REMOVE);
 node->status |= NODE_STATUS_DELETED;
 
+QLIST_REMOVE(node, entry);
 g_free(node->data);
 g_free(node);
 }
@@ -205,11 +208,14 @@ static bool check_request_sequence(BDRVPCacheState *s, uint64_t offset)
 
 static void pcache_node_free(RBCacheNode *rbnode, void *opaque)
 {
+BDRVPCacheState *s = opaque;
 PCacheNode *node = container_of(rbnode, PCacheNode, common);
 
 assert(node->status == NODE_STATUS_INFLIGHT ||
node->status == NODE_STATUS_COMPLETED);
 
+QLIST_INSERT_HEAD(&s->remove_node_list, node, entry);
+
 node->status |= NODE_STATUS_REMOVE;
 pcache_node_unref(node);
 }
@@ -432,11 +438,12 @@ static void pcache_write_through(BlockDriverState *bs, uint64_t offset,
  uint64_t bytes, QEMUIOVector *qiov)
 {
 BDRVPCacheState *s = bs->opaque;
+PCacheNode *node, *next;
 uint64_t chunk_offset = offset, chunk_bytes = bytes;
 uint64_t end_offs = offset + bytes;
 
 do {
-PCacheNode *node = rbcache_search(s->cache, chunk_offset, chunk_bytes);
+node = rbcache_search(s->cache, chunk_offset, chunk_bytes);
 if (node == NULL) {
 break;
 }
@@ -450,6 +457,18 @@ static void pcache_write_through(BlockDriverState *bs, uint64_t offset,
 write_cache_data(node, offset, bytes, qiov);
 }
 } while (end_offs > chunk_offset);
+
+QLIST_FOREACH_SAFE(node, &s->remove_node_list, entry, next) {
+if (node->status & NODE_STATUS_INFLIGHT) {
+continue;
+}
+if (offset >= node->common.offset + node->common.bytes ||
+offset + bytes <= node->common.offset)
+{
+continue;
+}
+write_cache_data(node, offset, bytes, qiov);
+}
 }
 
static coroutine_fn int pcache_co_pwritev(BlockDriverState *bs, uint64_t offset,
@@ -479,6 +498,7 @@ static void pcache_state_init(QemuOpts *opts, BDRVPCacheState *s)
   RBCACHE_LRU, s);
 s->readahead_size = qemu_opt_get_size(opts, PCACHE_OPT_READAHEAD_SIZE,
   PCACHE_DEFAULT_READAHEAD_SIZE);
+QLIST_INIT(&s->remove_node_list);
 }
 
 static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags,
@@ -516,6 +536,8 @@ static void pcache_close(BlockDriverState *bs)
 
 rbcache_destroy(s->req_stats);
 rbcache_destroy(s->cache);
+
+assert(QLIST_EMPTY(&s->remove_node_list));
 }
 
 static int64_t pcache_getlength(BlockDriverState *bs)
-- 
2.11.0




[Qemu-block] [PATCH v2 03/18] util/rbcache: range-based cache core

2016-12-30 Thread Pavel Butsykin
RBCache provides functionality to cache data from block devices (basically).
A range is used as the main key for searching and storing data. The cache is
based on red-black trees, so the basic operations search, insert and delete
are performed in O(log n).

It is important to note that QEMU usually does not require a data cache, but
in practice there are already cases where caching small amounts of data can
increase performance. Red-black trees were selected as the data structure
because they are fairly simple and show high efficiency on a small number of
elements. Therefore, with a minimum range of 512 bytes, the recommended size
of the cache memory is no more than 8-16 MB. Also note that this cache
implementation allows ranges of different lengths to be stored without
alignment.

The generic cache core can easily be used to implement different caching
policies at the block level, such as read-ahead. It can also be used in some
special cases, for example for caching data in qcow2 during sequential
allocating writes to an image with a backing file.
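
A usage sketch of the API (hedged: the create/destroy calls below match
their use later in this series; the asserts show the intended results):

#include "qemu/osdep.h"
#include "qemu/rbcache.h"

static void rbcache_example(void)
{
    /* 8 MB FIFO cache with default node allocation (no callbacks). */
    RBCache *cache = rbcache_create(NULL, NULL, 8 << 20, RBCACHE_FIFO, NULL);
    RBCacheNode *node;

    node = rbcache_search_and_insert(cache, 0, 4096);
    assert(node->offset == 0 && node->bytes == 4096);

    /* Inserting the same range again returns the existing node. */
    assert(rbcache_search_and_insert(cache, 0, 4096) == node);

    /* Any range inside [0, 4096) finds the node. */
    assert(rbcache_search(cache, 512, 1024) == node);

    rbcache_remove(cache, node);        /* also frees the node */
    assert(rbcache_search(cache, 0, 4096) == NULL);

    rbcache_destroy(cache);
}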

Signed-off-by: Pavel Butsykin 
---
 MAINTAINERS|   6 ++
 include/qemu/rbcache.h | 128 +
 util/Makefile.objs |   1 +
 util/rbcache.c | 253 +
 4 files changed, 388 insertions(+)
 create mode 100644 include/qemu/rbcache.h
 create mode 100644 util/rbcache.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 228278c1ca..01f4afa1e4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1472,6 +1472,12 @@ F: include/qemu/rbtree.h
 F: include/qemu/rbtree_augmented.h
 F: util/rbtree.c
 
+Range-Based Cache
+M: Denis V. Lunev 
+S: Supported
+F: include/qemu/rbcache.h
+F: util/rbcache.c
+
 UUID
 M: Fam Zheng 
 S: Supported
diff --git a/include/qemu/rbcache.h b/include/qemu/rbcache.h
new file mode 100644
index 00..24f7c1cb80
--- /dev/null
+++ b/include/qemu/rbcache.h
@@ -0,0 +1,128 @@
+/*
+ * QEMU Range-Based Cache core
+ *
+ * Copyright (C) 2015-2016 Parallels IP Holdings GmbH.
+ *
+ * Author: Pavel Butsykin 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef RBCACHE_H
+#define RBCACHE_H
+
+#include "qemu/rbtree.h"
+#include "qemu/queue.h"
+
+typedef struct RBCacheNode {
+struct RbNode rb_node;
+uint64_t offset;
+uint64_t bytes;
+QTAILQ_ENTRY(RBCacheNode) entry;
+} RBCacheNode;
+
+typedef struct RBCache RBCache;
+
+/* These callbacks are used to extend the common structure RBCacheNode. The
+ * alloc callback should initialize only the fields of the expanded node; the
+ * common part is initialized in RBCache (see rbcache_node_alloc()).
+ */
+typedef RBCacheNode *RBNodeAlloc(uint64_t offset, uint64_t bytes, void *opaque);
+typedef void RBNodeFree(RBCacheNode *node, void *opaque);
+
+
+enum eviction_type {
+RBCACHE_FIFO,
+RBCACHE_LRU,
+};
+
+/**
+ * rbcache_search:
+ * @rbcache: the cache object.
+ * @offset: the start of the range.
+ * @bytes: the size of the range.
+ *
+ * Returns the node corresponding to the range(offset, bytes), or NULL if
+ * the node was not found. In the case when the range covers multiple nodes,
+ * it returns the node with the lowest offset.
+ */
+void *rbcache_search(RBCache *rbcache, uint64_t offset, uint64_t bytes);
+
+/**
+ * rbcache_insert:
+ * @rbcache: the cache object.
+ * @node: a new node for the cache.
+ *
+ * Returns the new node, or old node if a node describing the same range
+ * already exists. In case of partial overlaps, the existing overlapping node
+ * with the lowest offset is returned.
+ */
+void *rbcache_insert(RBCache *rbcache, RBCacheNode *node);
+
+/**
+ * rbcache_search_and_insert:
+ * @rbcache: the cache object.
+ * @offset: the start of the range.
+ * @bytes: the size of the range.
+ *
+ * rbcache_search_and_insert() is like rbcache_insert(), except that a new node
+ * is allocated inside the function. Returns the new node, or the old node if
+ * a node describing the same range already exists. In case of partial
+ * overlaps, the existing
+ * overlapping node with the lowest offset is returned.
+ */
+void *rbcache_search_and_insert(RBCache *rbcache, uint64_t offset,
+uint64_t bytes);
+
+/**
+ * rbcache_remove:
+ * @rbcache: the cache object.
+ * @node: a node to remove.
+ *
+ * Removes the cached range owned by the node, it also frees the node.
+ */
+void rbcache_remove(RBCache *rbcache, RBCacheNode *node);
+
+/**
+ * rbcache_node_alloc:
+ * @rbcache: the cache object.
+ * @offset: the start of the range.
+ * @bytes: the size of the range.
+ *
+ * Returns an allocated and initialized node.
+ */
+RBCacheNode *rbcache_node_alloc(RBCache *rbcache, uint64_t offset,
+uint64_t bytes);
+
+/**
+ * rbcache_node_free:
+ * @rbcache: the cache object.
+ * @node: a node to free.
+ *
+ * Frees the node.
+ */
+void rbcache_node_free(RBCache *rbcache, RBCacheNode *node);
+
+/**
+ * rbc

[Qemu-block] [PATCH v2 07/18] block/pcache: updating statistics for overlapping requests

2016-12-30 Thread Pavel Butsykin
When updating the statistics, I/O requests can sometimes overlap each other;
in that case the requests are not stored in the statistics. This is not
ideal, especially when the requests have only a small range of intersection.

We can cut the requests at the intersection and add the remaining pieces of
the requests to the statistics. Maybe not significantly, but this makes the
statistics more accurate. Here, update_req_stats() adds the ability to save
overlapping requests.
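
A worked example of the tail case (made-up offsets): the statistics already
hold [0, 4096) and a new request [2048, +8192) arrives; the overlap is cut
off and only the remainder is recorded:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Existing statistics node and a new, overlapping request. */
    uint64_t node_off = 0, node_bytes = 4096;
    uint64_t offset = 2048, bytes = 8192;

    /* Tail case of update_req_stats(): drop the part already covered by
     * the node and keep accounting only for the rest. */
    bytes = (offset + bytes) - (node_off + node_bytes);
    offset = node_off + node_bytes;

    printf("recorded piece: [%llu, +%llu)\n",
           (unsigned long long)offset, (unsigned long long)bytes);
    /* -> recorded piece: [4096, +6144) */
    return 0;
}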

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 36 +++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/block/pcache.c b/block/pcache.c
index 1f3200af63..deac57c58d 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -46,6 +46,40 @@ typedef struct BDRVPCacheState {
 uint64_t max_aio_size;
 } BDRVPCacheState;
 
+static void update_req_stats(RBCache *rbcache, uint64_t offset, uint64_t bytes)
+{
+do {
+RBCacheNode *node = rbcache_search_and_insert(rbcache, offset, bytes);
+/* The node was successfully added or already exists */
+if (node->offset <= offset &&
+node->offset + node->bytes >= offset + bytes)
+{
+break;
+}
+
+/* Request covers the whole node */
+if (offset <= node->offset &&
+offset + bytes >= node->offset + node->bytes)
+{
+rbcache_remove(rbcache, node);
+continue;
+}
+
+if (offset < node->offset) {
+RBCacheNode *new_node =
+rbcache_node_alloc(rbcache, offset, node->offset - offset);
+if (new_node != rbcache_insert(rbcache, new_node)) {
+/* The presence of the node in this range is impossible */
+abort();
+}
+break;
+}
+
+bytes = (offset + bytes) - (node->offset + node->bytes);
+offset = node->offset + node->bytes;
+} while (true);
+}
+
 static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset,
  uint64_t bytes, QEMUIOVector *qiov,
  int flags)
@@ -53,7 +87,7 @@ static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset,
 BDRVPCacheState *s = bs->opaque;
 
 if (s->max_aio_size >= bytes) {
-rbcache_search_and_insert(s->req_stats, offset, bytes);
+update_req_stats(s->req_stats, offset, bytes);
 }
 
 return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
-- 
2.11.0




[Qemu-block] [PATCH v2 17/18] qapi: allow blockdev-add for pcache

2016-12-30 Thread Pavel Butsykin
Signed-off-by: Pavel Butsykin 
---
 qapi/block-core.json | 30 --
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 6b42216960..00a6e15db3 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1719,6 +1719,7 @@
 # @nfs: Since 2.8
 # @replication: Since 2.8
 # @ssh: Since 2.8
+# @pcache: Since 2.9
 #
 # Since: 2.0
 ##
@@ -1726,8 +1727,8 @@
   'data': [ 'archipelago', 'blkdebug', 'blkverify', 'bochs', 'cloop',
 'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom',
 'host_device', 'http', 'https', 'luks', 'nbd', 'nfs', 'null-aio',
-'null-co', 'parallels', 'qcow', 'qcow2', 'qed', 'quorum', 'raw',
-'replication', 'ssh', 'vdi', 'vhdx', 'vmdk', 'vpc',
+'null-co', 'parallels', 'pcache', 'qcow', 'qcow2', 'qed', 'quorum',
+'raw', 'replication', 'ssh', 'vdi', 'vhdx', 'vmdk', 'vpc',
 'vvfat' ] }
 
 ##
@@ -1808,6 +1809,30 @@
   'base': 'BlockdevOptionsGenericFormat',
   'data': { '*key-secret': 'str' } }
 
+##
+# @BlockdevOptionsPCache:
+#
+# Driver specific block device options for pcache.
+#
+# @image:                 Reference to a block device image for caching.
+#
+# @pcache-stats-size:     #optional Total volume of requests for statistics.
+#
+# @pcache-max-aio-size:   #optional Maximum size of read request which is
+#                         handled by pcache.
+#
+# @pcache-full-size:      #optional Total cache size.
+#
+# @pcache-readahead-size: #optional Prefetch cache readahead size.
+#
+# Since: 2.9
+##
+{ 'struct': 'BlockdevOptionsPCache',
+  'data': { 'image': 'BlockdevRef',
+'*pcache-stats-size': 'int',
+'*pcache-max-aio-size': 'int',
+'*pcache-full-size': 'int',
+'*pcache-readahead-size': 'int' } }
 
 ##
 # @BlockdevOptionsGenericCOWFormat:
@@ -2402,6 +2427,7 @@
   'null-aio':   'BlockdevOptionsNull',
   'null-co':'BlockdevOptionsNull',
   'parallels':  'BlockdevOptionsGenericFormat',
+  'pcache': 'BlockdevOptionsPCache',
   'qcow2':  'BlockdevOptionsQcow2',
   'qcow':   'BlockdevOptionsGenericCOWFormat',
   'qed':'BlockdevOptionsGenericCOWFormat',
-- 
2.11.0




[Qemu-block] [PATCH v2 05/18] block/pcache: statistics collection read requests

2016-12-30 Thread Pavel Butsykin
Here the rbcache is used as a repository for request statistics; the request
data itself is not cached, so we are able to store a large number of
requests. We need the request statistics to detect sequential requests.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 32 +++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/block/pcache.c b/block/pcache.c
index 7a67618408..296ae382b0 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -13,20 +13,38 @@
 #include "block/block_int.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qstring.h"
+#include "qemu/rbcache.h"
 
+#define PCACHE_OPT_STATS_SIZE "pcache-stats-size"
 
 static QemuOptsList runtime_opts = {
 .name = "pcache",
 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
 .desc = {
+{
+.name = PCACHE_OPT_STATS_SIZE,
+.type = QEMU_OPT_SIZE,
+.help = "Total volume of requests for statistics",
+},
 { /* end of list */ }
 },
 };
 
+#define MB_BITS 20
+#define PCACHE_DEFAULT_STATS_SIZE (3 << MB_BITS)
+
+typedef struct BDRVPCacheState {
+RBCache *req_stats;
+} BDRVPCacheState;
+
 static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset,
  uint64_t bytes, QEMUIOVector *qiov,
  int flags)
 {
+BDRVPCacheState *s = bs->opaque;
+
+rbcache_search_and_insert(s->req_stats, offset, bytes);
+
 return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 }
 
@@ -37,6 +55,13 @@ static coroutine_fn int pcache_co_pwritev(BlockDriverState *bs, uint64_t offset,
 return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
 }
 
+static void pcache_state_init(QemuOpts *opts, BDRVPCacheState *s)
+{
+uint64_t stats_size = qemu_opt_get_size(opts, PCACHE_OPT_STATS_SIZE,
+PCACHE_DEFAULT_STATS_SIZE);
+s->req_stats = rbcache_create(NULL, NULL, stats_size, RBCACHE_FIFO, s);
+}
+
 static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags,
 Error **errp)
 {
@@ -58,7 +83,9 @@ static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags,
 if (local_err) {
 ret = -EINVAL;
 error_propagate(errp, local_err);
+goto fail;
 }
+pcache_state_init(opts, bs->opaque);
 fail:
 qemu_opts_del(opts);
 return ret;
@@ -66,6 +93,9 @@ fail:
 
 static void pcache_close(BlockDriverState *bs)
 {
+BDRVPCacheState *s = bs->opaque;
+
+rbcache_destroy(s->req_stats);
 }
 
 static int64_t pcache_getlength(BlockDriverState *bs)
@@ -81,7 +111,7 @@ static bool pcache_recurse_is_first_non_filter(BlockDriverState *bs,
 
 static BlockDriver bdrv_pcache = {
 .format_name= "pcache",
-.instance_size  = 0,
+.instance_size  = sizeof(BDRVPCacheState),
 
 .bdrv_file_open = pcache_file_open,
 .bdrv_close = pcache_close,
-- 
2.11.0




[Qemu-block] [PATCH v2 10/18] block/pcache: cache invalidation on write requests

2016-12-30 Thread Pavel Butsykin
On AIO write request completion we just drop all the intersecting nodes in
the cache; it's a simple way to keep the cache up to date.
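
A toy model of the invalidation walk (a flat array stands in for the
rb-tree; offsets are made up):

#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t offset, bytes; int completed; } Node;

int main(void)
{
    Node cache[] = { { 0, 4096, 1 }, { 4096, 4096, 0 }, { 8192, 4096, 1 } };
    uint64_t write_off = 2048, write_bytes = 8192;  /* write [2048, 10240) */
    int i;

    for (i = 0; i < 3; i++) {
        Node *n = &cache[i];

        if (write_off >= n->offset + n->bytes ||
            write_off + write_bytes <= n->offset) {
            continue;                       /* no intersection */
        }
        /* Completed nodes are dropped; in-flight ones must stay. */
        printf("node [%llu, +%llu): %s\n",
               (unsigned long long)n->offset, (unsigned long long)n->bytes,
               n->completed ? "dropped" : "in flight, kept");
    }
    return 0;
}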

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 32 +++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/block/pcache.c b/block/pcache.c
index f30e9e7bfe..4007241d37 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -314,11 +314,41 @@ skip_large_request:
 return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 }
 
+static void pcache_invalidation(BlockDriverState *bs, uint64_t offset,
+uint64_t bytes)
+{
+BDRVPCacheState *s = bs->opaque;
+uint64_t chunk_offset = offset, chunk_bytes = bytes;
+uint64_t end_offs = offset + bytes;
+
+do {
+PCacheNode *node = rbcache_search(s->cache, chunk_offset, chunk_bytes);
+if (node == NULL) {
+break;
+}
+assert(node->status == NODE_STATUS_COMPLETED ||
+   node->status == NODE_STATUS_INFLIGHT);
+
+chunk_offset = node->common.offset + node->common.bytes;
+chunk_bytes = end_offs - chunk_offset;
+
+if (node->status & NODE_STATUS_COMPLETED) {
+rbcache_remove(s->cache, &node->common);
+}
+} while (end_offs > chunk_offset);
+}
+
static coroutine_fn int pcache_co_pwritev(BlockDriverState *bs, uint64_t offset,
   uint64_t bytes, QEMUIOVector *qiov,
   int flags)
 {
-return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
+int ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
+if (ret < 0) {
+return ret;
+}
+pcache_invalidation(bs, offset, bytes);
+
+return ret;
 }
 
 static void pcache_state_init(QemuOpts *opts, BDRVPCacheState *s)
-- 
2.11.0




[Qemu-block] [PATCH v2 06/18] block/pcache: skip large aio read

2016-12-30 Thread Pavel Butsykin
This change allows more efficient use of cache memory and filters out a case
for which pcache isn't efficient. We skip requests that don't qualify for
the optimization, thereby reducing the number of unnecessary readaheads.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/block/pcache.c b/block/pcache.c
index 296ae382b0..1f3200af63 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -16,6 +16,7 @@
 #include "qemu/rbcache.h"
 
 #define PCACHE_OPT_STATS_SIZE "pcache-stats-size"
+#define PCACHE_OPT_MAX_AIO_SIZE "pcache-max-aio-size"
 
 static QemuOptsList runtime_opts = {
 .name = "pcache",
@@ -26,15 +27,23 @@ static QemuOptsList runtime_opts = {
 .type = QEMU_OPT_SIZE,
 .help = "Total volume of requests for statistics",
 },
+{
+.name = PCACHE_OPT_MAX_AIO_SIZE,
+.type = QEMU_OPT_SIZE,
+.help = "Maximum size of aio which is handled by pcache",
+},
 { /* end of list */ }
 },
 };
 
+#define KB_BITS 10
 #define MB_BITS 20
 #define PCACHE_DEFAULT_STATS_SIZE (3 << MB_BITS)
+#define PCACHE_DEFAULT_MAX_AIO_SIZE (64 << KB_BITS)
 
 typedef struct BDRVPCacheState {
 RBCache *req_stats;
+uint64_t max_aio_size;
 } BDRVPCacheState;
 
 static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset,
@@ -43,7 +52,9 @@ static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset,
 {
 BDRVPCacheState *s = bs->opaque;
 
-rbcache_search_and_insert(s->req_stats, offset, bytes);
+if (s->max_aio_size >= bytes) {
+rbcache_search_and_insert(s->req_stats, offset, bytes);
+}
 
 return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 }
@@ -60,6 +71,9 @@ static void pcache_state_init(QemuOpts *opts, BDRVPCacheState *s)
 uint64_t stats_size = qemu_opt_get_size(opts, PCACHE_OPT_STATS_SIZE,
 PCACHE_DEFAULT_STATS_SIZE);
 s->req_stats = rbcache_create(NULL, NULL, stats_size, RBCACHE_FIFO, s);
+
+s->max_aio_size = qemu_opt_get_size(opts, PCACHE_OPT_MAX_AIO_SIZE,
+PCACHE_DEFAULT_MAX_AIO_SIZE);
 }
 
 static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags,
-- 
2.11.0




[Qemu-block] [PATCH v2 04/18] tests/test-rbcache: add test cases

2016-12-30 Thread Pavel Butsykin
Signed-off-by: Pavel Butsykin 
---
 tests/Makefile.include |   3 +
 tests/test-rbcache.c   | 431 +
 2 files changed, 434 insertions(+)
 create mode 100644 tests/test-rbcache.c

diff --git a/tests/Makefile.include b/tests/Makefile.include
index 4841d582a1..74bdcd2aac 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -55,6 +55,8 @@ check-unit-y += tests/test-hbitmap$(EXESUF)
 gcov-files-test-hbitmap-y = blockjob.c
 check-unit-y += tests/test-blockjob$(EXESUF)
 check-unit-y += tests/test-blockjob-txn$(EXESUF)
+gcov-files-test-rbcache-y = util/rbcache.c
+check-unit-y += tests/test-rbcache$(EXESUF)
 check-unit-y += tests/test-x86-cpuid$(EXESUF)
 # all code tested by test-x86-cpuid is inside topology.h
 gcov-files-test-x86-cpuid-y =
@@ -498,6 +500,7 @@ tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y)
 tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
 tests/test-iov$(EXESUF): tests/test-iov.o $(test-util-obj-y)
 tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o $(test-util-obj-y)
+tests/test-rbcache$(EXESUF): tests/test-rbcache.o $(test-util-obj-y)
 tests/test-x86-cpuid$(EXESUF): tests/test-x86-cpuid.o
tests/test-xbzrle$(EXESUF): tests/test-xbzrle.o migration/xbzrle.o page_cache.o $(test-util-obj-y)
 tests/test-cutils$(EXESUF): tests/test-cutils.o util/cutils.o
diff --git a/tests/test-rbcache.c b/tests/test-rbcache.c
new file mode 100644
index 00..40ccd6b225
--- /dev/null
+++ b/tests/test-rbcache.c
@@ -0,0 +1,431 @@
+/*
+ * QEMU Range-Based Cache core
+ *
+ * Copyright (C) 2015-2016 Parallels IP Holdings GmbH.
+ *
+ * Author: Pavel Butsykin 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/rbcache.h"
+
+typedef struct TestRBCacheData {
+RBCache *cache;
+} TestRBCacheData;
+
+typedef struct TestRBCacheConfig {
+uint64_t limit_size;
+int eviction_type;
+RBNodeAlloc *alloc;
+RBNodeFree  *free;
+void *opaque;
+} TestRBCacheConfig;
+
+#define KB(_n) ((_n) << 10)
+#define MB(_n) ((_n) << 20)
+
+#define OFFSET1 0
+#define SIZE1   KB(1)
+
+#define OFFSET2 KB(1)
+#define SIZE2   KB(2)
+
+#define OFFSET3 KB(18)
+#define SIZE3   KB(1)
+
+#define OFFSET4 KB(7)
+#define SIZE4   KB(7)
+
+#define OFFSET5 KB(1)
+#define SIZE5   KB(4)
+
+#define OFFSET6 KB(5)
+#define SIZE6   KB(5)
+
+#define OFFSET7 KB(15)
+#define SIZE7   KB(20)
+
+#define OFFSET8 KB(2)
+#define SIZE8   KB(20)
+
+
+static void test_rbcache_init(TestRBCacheData *data, const void *ctx)
+{
+g_assert_nonnull(data->cache);
+}
+
+static void test_rbcache_insert(TestRBCacheData *data, const void *ctx)
+{
+RBCacheNode *node1 = rbcache_node_alloc(data->cache, OFFSET1, SIZE1);
+RBCacheNode *node2 = rbcache_node_alloc(data->cache, OFFSET2, SIZE2);
+RBCacheNode *node3 = rbcache_node_alloc(data->cache, OFFSET3, SIZE3);
+RBCacheNode *node4 = rbcache_node_alloc(data->cache, OFFSET4, SIZE4);
+RBCacheNode *node5 = rbcache_node_alloc(data->cache, OFFSET5, SIZE5);
+RBCacheNode *node6 = rbcache_node_alloc(data->cache, OFFSET6, SIZE6);
+RBCacheNode *node7 = rbcache_node_alloc(data->cache, OFFSET7, SIZE7);
+RBCacheNode *node8 = rbcache_node_alloc(data->cache, OFFSET8, SIZE8);
+RBCacheNode *node;
+
+node = rbcache_insert(data->cache, node2);
+g_assert_true(node == node2);
+
+node = rbcache_insert(data->cache, node1);
+g_assert_true(node == node1);
+
+node = rbcache_insert(data->cache, node3);
+g_assert_true(node == node3);
+
+node = rbcache_insert(data->cache, node4);
+g_assert_true(node == node4);
+
+node = rbcache_insert(data->cache, node5);
+g_assert_true(node == node2);
+rbcache_node_free(data->cache, node5);
+
+node = rbcache_insert(data->cache, node6);
+g_assert_true(node == node4);
+rbcache_node_free(data->cache, node6);
+
+node = rbcache_insert(data->cache, node7);
+g_assert_true(node == node3);
+rbcache_node_free(data->cache, node7);
+
+node = rbcache_insert(data->cache, node8);
+g_assert_true(node == node2);
+rbcache_node_free(data->cache, node8);
+}
+
+static void test_rbcache_search(TestRBCacheData *data, const void *ctx)
+{
+RBCacheNode *node;
+
+test_rbcache_insert(data, ctx);
+
+node = rbcache_search(data->cache, OFFSET1, SIZE1);
+g_assert_nonnull(node);
+g_assert_cmpuint(node->offset, ==, OFFSET1);
+g_assert_cmpuint(node->bytes, ==, SIZE1);
+
+node = rbcache_search(data->cache, OFFSET2 + KB(1), SIZE2);
+g_assert_nonnull(node);
+g_assert_cmpuint(node->offset, ==, OFFSET2);
+g_assert_cmpuint(node->bytes, ==, SIZE2);
+
+node = rbcache_search(data->cache, OFFSET8, SIZE8);
+g_assert_nonnull(node);
+g_assert_cmpuint(node->offset, ==, OFFSET2);
+g_assert_cmpuint(node->bytes, ==, SIZE2);
+
+node = rbcache_sea

[Qemu-block] [PATCH v2 00/18] I/O prefetch cache

2016-12-30 Thread Pavel Butsykin
The prefetch cache aims to improve the performance of sequential data reads.
Of most interest here are small sequential read requests; such requests can
be optimized by extending them and moving the extra data into the prefetch
cache. However, there are 2 issues:
 - In aggregate only a small portion of requests is sequential, so delays
   caused by the need to read larger volumes of data would lead to an
   overall decrease in performance.
 - With a large number of random requests, redundant data would accumulate
   in the cache memory.
This pcache implementation solves the above and other problems of data
prefetching. The pcache algorithm can be summarised by the following main
steps.

1. Monitor I/O requests to identify typical sequences.
This implementation of the prefetch cache works at the storage system level and
has information only about the physical block addresses of I/O requests.
Statistics are collected only from read requests up to a maximum size of 64KB
(by default); each request that matches the criteria falls into a pool of
requests. An rb-tree is used to store the request statistics: a simple but,
for this purpose, quite efficient data structure.
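
For illustration only, the admission of a request into the statistics pool
could look like the minimal sketch below; the names s->req_stats and
s->max_aio_size are assumptions, not necessarily the identifiers used in
this series:

static void pcache_update_statistics(BDRVPCacheState *s, uint64_t offset,
                                     uint64_t bytes)
{
    /* Only small reads are interesting as readahead candidates
     * (s->max_aio_size is assumed to default to 64KB). */
    if (bytes > s->max_aio_size) {
        return;
    }
    /* Record the request in the statistics rb-tree; a request that
     * overlaps an existing node simply lands on that node. */
    rbcache_search_and_insert(s->req_stats, offset, bytes);
}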

2. Identify sequential I/O streams.
For each read request an attempt is made to lift, from the statistics tree, the
chain of sequential requests of which this request would be an element. The
search key for consecutive requests is the area of bytes preceding the current
request. The size of this area should not be too small, to avoid false
readahead. Sequential streams of data requests can be identified even among a
large number of random requests. For example, if the blocks
100, 1157, 27520, 4, 101, 312, 1337, 102 are accessed, then while processing
request 102 the chain of sequential requests 100, 101, 102 will be identified,
and a decision to do readahead can then be made. A situation may also arise
where multiple applications A, B, C simultaneously perform sequential reads.
Each application on its own reads sequentially, A(100, 101, 102),
B(300, 301, 302), C(700, 701, 702), but to the block device this may look like
random reading: 100, 300, 700, 101, 301, 701, 102, 302, 702. In this case the
sequential streams will still be recognised, because the location of the
requests in the rb-tree makes it possible to separate the sequential I/O
streams.
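
A sketch of the detection step, assuming a helper built on rbcache_search();
the lookup_size parameter (the size of the preceding area) is an illustrative
name:

static bool pcache_detect_sequence(BDRVPCacheState *s, uint64_t offset,
                                   uint64_t lookup_size)
{
    if (offset < lookup_size) {
        return false;
    }
    /* If the area immediately preceding this request is present in the
     * statistics tree, the request continues a sequential stream. */
    return rbcache_search(s->req_stats, offset - lookup_size,
                          lookup_size) != NULL;
}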

3. Do readahead into the cache for recognized sequential data streams.
Once detection is taken care of, larger requests are needed to bring data into
the cache. In this implementation pcache uses readahead instead of extending
the request, so the original request goes through as is. There is no reason to
put data into the cache that will never be picked up, yet that would always
happen with extended requests. An rb-tree is also used to store the areas of
cached blocks; again, a simple but, for this purpose, quite efficient data
structure.
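
The readahead step might look roughly like the sketch below (error handling
omitted; s->cache, s->readahead_size and the node layout follow the patterns
visible in the patches, but the helper itself is illustrative):

static void coroutine_fn pcache_readahead(BlockDriverState *bs,
                                          uint64_t offset, uint64_t bytes)
{
    BDRVPCacheState *s = bs->opaque;
    uint64_t ra_offset = offset + bytes;   /* read ahead past the request */
    QEMUIOVector qiov;
    PCacheNode *node;

    /* Reserve the readahead area in the cache tree; an overlapping node
     * means the area is already cached or being filled. */
    node = rbcache_search_and_insert(s->cache, ra_offset, s->readahead_size);
    if (node->status != NODE_STATUS_NEW) {
        return;
    }

    /* node->data is assumed to be allocated by pcache_node_alloc(). */
    qemu_iovec_init(&qiov, 1);
    qemu_iovec_add(&qiov, node->data, node->common.bytes);
    bdrv_co_preadv(bs->file, node->common.offset, node->common.bytes,
                   &qiov, 0);
    node->status |= NODE_STATUS_COMPLETED;
    qemu_iovec_destroy(&qiov);
}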

4. Control the size of the prefetch cache pool and the request statistics pool.
To bound the request statistics pool, request data is placed and replaced
according to the FIFO principle; everything is simple there. To bound the
memory cache, an LRU list is used; it limits the maximum amount of memory that
can be allocated for pcache. But the LRU is there mainly to prevent eviction of
cache blocks that have only been partially read. The main path is that memory
is pushed out immediately after use: as soon as a chunk of cache memory has
been completely read it is dropped, since the probability of the same request
being repeated is very low. Cases where one and the same portion of cache
memory is read several times are not optimized; they are not among the cases
pcache can optimize. Thus, using a small amount of cache memory and by
optimizing the readahead and memory-release operations, entire volumes of data
can be read with a 100% cache hit rate, without decreasing the effectiveness of
random read requests.
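
The "drop immediately after full read" policy could be expressed as in the
following sketch; the rdata_ref counter is a hypothetical name for "cached
bytes not yet consumed by readers":

static void pcache_node_read_done(BDRVPCacheState *s, PCacheNode *node,
                                  uint64_t bytes_read)
{
    /* rdata_ref (hypothetical) tracks how much of the node is still
     * unread; once it hits zero the data is unlikely to be needed again. */
    assert(node->rdata_ref >= bytes_read);
    node->rdata_ref -= bytes_read;

    if (node->rdata_ref == 0 && (node->status & NODE_STATUS_COMPLETED)) {
        /* Fully read: evict now rather than waiting for the LRU. */
        rbcache_remove(s->cache, &node->common);
    }
}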

PCache is implemented as a qemu block filter driver and has several
configurable parameters: total cache size, statistics size, readahead size,
and the maximum size of a block that can be processed.

Several test cases with different sequential and random read patterns on an
SSD disk were used for performance evaluation. The test results and qemu
parameters follow:

qemu parameters: 
-machine pc,accel=kvm,usb=off,vmport=off -m 1024 -smp 8
-drive file=/img/harddisk.hdd,if=none,cache=none,id=drive-scsi0-0-0-0,aio=native
-drive driver=pcache,image=drive-scsi0-0-0-0,if=virtio

*********************************************************************
* Testcase                          *        Results in iops        *
*                                   ********************************
*                                   *  clean qemu  *     pcache     *
*********************************************************************
* Create/open 16 file(s) of total *

[Qemu-block] [PATCH v2 01/18] block/pcache: empty pcache driver filter

2016-12-30 Thread Pavel Butsykin
A basic version of the pcache driver, to ease preparation of the patch set.

Signed-off-by: Pavel Butsykin 
---
 block/Makefile.objs |   1 +
 block/pcache.c  | 102 
 2 files changed, 103 insertions(+)
 create mode 100644 block/pcache.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 67a036a1df..0a024f4e66 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -4,6 +4,7 @@ block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
 block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o
 block-obj-y += quorum.o
+block-obj-y += pcache.o
 block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o
 block-obj-y += block-backend.o snapshot.o qapi.o
 block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
diff --git a/block/pcache.c b/block/pcache.c
new file mode 100644
index 00..7a67618408
--- /dev/null
+++ b/block/pcache.c
@@ -0,0 +1,102 @@
+/*
+ * Prefetch cache driver filter
+ *
+ * Copyright (C) 2015-2016 Parallels IP Holdings GmbH.
+ *
+ * Author: Pavel Butsykin 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block_int.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qstring.h"
+
+
+static QemuOptsList runtime_opts = {
+.name = "pcache",
+.head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+.desc = {
+{ /* end of list */ }
+},
+};
+
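+/* For now the filter simply forwards reads and writes to bs->file. */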
+static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, QEMUIOVector *qiov,
+ int flags)
+{
+return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
+}
+
+static coroutine_fn int pcache_co_pwritev(BlockDriverState *bs, uint64_t offset,
+  uint64_t bytes, QEMUIOVector *qiov,
+  int flags)
+{
+return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
+}
+
+static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags,
+Error **errp)
+{
+QemuOpts *opts;
+Error *local_err = NULL;
+int ret = 0;
+
+opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+qemu_opts_absorb_qdict(opts, options, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+ret = -EINVAL;
+goto fail;
+}
+
+assert(bs->file == NULL);
+bs->file = bdrv_open_child(NULL, options, "image", bs, &child_format,
+   false, &local_err);
+if (local_err) {
+ret = -EINVAL;
+error_propagate(errp, local_err);
+}
+fail:
+qemu_opts_del(opts);
+return ret;
+}
+
+static void pcache_close(BlockDriverState *bs)
+{
+}
+
+static int64_t pcache_getlength(BlockDriverState *bs)
+{
+return bdrv_getlength(bs->file->bs);
+}
+
+static bool pcache_recurse_is_first_non_filter(BlockDriverState *bs,
+   BlockDriverState *candidate)
+{
+return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
+}
+
+static BlockDriver bdrv_pcache = {
+.format_name= "pcache",
+.instance_size  = 0,
+
+.bdrv_file_open = pcache_file_open,
+.bdrv_close = pcache_close,
+.bdrv_getlength = pcache_getlength,
+
+.bdrv_co_preadv = pcache_co_preadv,
+.bdrv_co_pwritev= pcache_co_pwritev,
+
+.is_filter  = true,
+.bdrv_recurse_is_first_non_filter   = pcache_recurse_is_first_non_filter,
+};
+
+static void bdrv_cache_init(void)
+{
+bdrv_register(&bdrv_pcache);
+}
+
+block_init(bdrv_cache_init);
-- 
2.11.0




[Qemu-block] [PATCH v2 09/18] block/pcache: skip readahead for unallocated clusters

2016-12-30 Thread Pavel Butsykin
Typically, data for unallocated clusters is filled with zeros, so it makes no
sense to store it in the cache.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/block/pcache.c b/block/pcache.c
index 57eebd434a..f30e9e7bfe 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -174,6 +174,28 @@ static RBCacheNode *pcache_node_alloc(uint64_t offset, uint64_t bytes,
 return &node->common;
 }
 
+static bool check_allocated_clusters(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes)
+{
+int64_t sector_num = offset >> BDRV_SECTOR_BITS;
+int32_t nb_sectors = bytes >> BDRV_SECTOR_BITS;
+
+assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
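+/* Readahead only pays off if the whole range is allocated; bail out on
+ * the first unallocated chunk or on error. */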
+do {
+int num, ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &num);
+if (ret <= 0) {
+return false;
+}
+sector_num += num;
+nb_sectors -= num;
+
+} while (nb_sectors);
+
+return true;
+}
+
 #define PCACHE_STEPS_FORWARD 2
 
 static PCacheNode *get_readahead_node(BlockDriverState *bs, RBCache *rbcache,
@@ -193,6 +215,10 @@ static PCacheNode *get_readahead_node(BlockDriverState *bs, RBCache *rbcache,
 break;
 }
 
+if (!check_allocated_clusters(bs, offset, bytes)) {
+break;
+}
+
 node = rbcache_search_and_insert(rbcache, offset, bytes);
 if (node->status == NODE_STATUS_NEW) {
 return node;
-- 
2.11.0




[Qemu-block] [PATCH v2 13/18] block/pcache: write through

2016-12-30 Thread Pavel Butsykin
Write-through is another way to keep the cache up to date. Even if this
case is rare, writing to the cache is better than dropping the cache entry.

Signed-off-by: Pavel Butsykin 
---
 block/pcache.c | 26 ++
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/block/pcache.c b/block/pcache.c
index c1cbfa7040..140f90d6d7 100644
--- a/block/pcache.c
+++ b/block/pcache.c
@@ -410,8 +410,26 @@ skip_large_request:
 return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 }
 
-static void pcache_invalidation(BlockDriverState *bs, uint64_t offset,
-uint64_t bytes)
+static void write_cache_data(PCacheNode *node, uint64_t offset, uint64_t bytes,
+ QEMUIOVector *qiov)
+{
+uint64_t qiov_offs = 0, node_offs = 0;
+uint64_t size;
+uint64_t copy;
+
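+/* Align the copy: if the write starts before the cached node, skip the
+ * leading part of the qiov; otherwise start inside the node's buffer. */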
+if (offset < node->common.offset) {
+qiov_offs = node->common.offset - offset;
+} else {
+node_offs = offset - node->common.offset;
+}
+size = ranges_overlap_size(offset, bytes, node->common.offset,
+   node->common.bytes);
+copy = qemu_iovec_to_buf(qiov, qiov_offs, node->data + node_offs, size);
+assert(copy == size);
+}
+
+static void pcache_write_through(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, QEMUIOVector *qiov)
 {
 BDRVPCacheState *s = bs->opaque;
 uint64_t chunk_offset = offset, chunk_bytes = bytes;
@@ -429,7 +447,7 @@ static void pcache_invalidation(BlockDriverState *bs, uint64_t offset,
 chunk_bytes = end_offs - chunk_offset;
 
 if (node->status & NODE_STATUS_COMPLETED) {
-rbcache_remove(s->cache, &node->common);
+write_cache_data(node, offset, bytes, qiov);
 }
 } while (end_offs > chunk_offset);
 }
@@ -442,7 +460,7 @@ static coroutine_fn int pcache_co_pwritev(BlockDriverState *bs, uint64_t offset,
 if (ret < 0) {
 return ret;
 }
-pcache_invalidation(bs, offset, bytes);
+pcache_write_through(bs, offset, bytes, qiov);
 
 return ret;
 }
-- 
2.11.0