[Qemu-block] [PATCH V4 01/10] virtio: convert to use DMA api
Currently, all virtio devices bypass IOMMU completely. This is because address_space_memory is assumed and used during DMA emulation. This patch converts the virtio core API to use DMA API. This idea is - introducing a new transport specific helper to query the dma address space. (only pci version is implemented). - query and use this address space during virtio device guest memory accessing when iommu platform (VIRTIO_F_IOMMU_PLATFORM) was enabled for this device. Cc: Michael S. Tsirkin Cc: Stefan Hajnoczi Cc: Kevin Wolf Cc: Amit Shah Cc: Paolo Bonzini Cc: qemu-block@nongnu.org Signed-off-by: Jason Wang --- hw/block/virtio-blk.c | 2 +- hw/char/virtio-serial-bus.c | 3 ++- hw/scsi/virtio-scsi.c | 4 ++- hw/virtio/virtio-bus.c| 8 ++ hw/virtio/virtio-pci.c| 14 ++ hw/virtio/virtio.c| 57 +-- include/hw/virtio/virtio-access.h | 31 ++--- include/hw/virtio/virtio-bus.h| 1 + include/hw/virtio/virtio.h| 9 --- 9 files changed, 93 insertions(+), 36 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 0c5fd27..12657ff 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -857,7 +857,7 @@ static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f, } } -req = qemu_get_virtqueue_element(f, sizeof(VirtIOBlockReq)); +req = qemu_get_virtqueue_element(vdev, f, sizeof(VirtIOBlockReq)); virtio_blk_init_request(s, virtio_get_queue(vdev, vq_idx), req); req->next = s->rq; s->rq = req; diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index 7975c2c..d544cd9 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -732,6 +732,7 @@ static void virtio_serial_post_load_timer_cb(void *opaque) static int fetch_active_ports_list(QEMUFile *f, VirtIOSerial *s, uint32_t nr_active_ports) { +VirtIODevice *vdev = VIRTIO_DEVICE(s); uint32_t i; s->post_load = g_malloc0(sizeof(*s->post_load)); @@ -765,7 +766,7 @@ static int fetch_active_ports_list(QEMUFile *f, qemu_get_be64s(f, &port->iov_offset); port->elem = -qemu_get_virtqueue_element(f, sizeof(VirtQueueElement)); +qemu_get_virtqueue_element(vdev, f, sizeof(VirtQueueElement)); /* * Port was throttled on source machine. Let's diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 34bba35..3da9d62 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -198,12 +198,14 @@ static void *virtio_scsi_load_request(QEMUFile *f, SCSIRequest *sreq) SCSIBus *bus = sreq->bus; VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus); VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s); +VirtIODevice *vdev = VIRTIO_DEVICE(s); VirtIOSCSIReq *req; uint32_t n; qemu_get_be32s(f, &n); assert(n < vs->conf.num_queues); -req = qemu_get_virtqueue_element(f, sizeof(VirtIOSCSIReq) + vs->cdb_size); +req = qemu_get_virtqueue_element(vdev, f, + sizeof(VirtIOSCSIReq) + vs->cdb_size); virtio_scsi_init_req(s, vs->cmd_vqs[n], req); if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICmdReq) + vs->cdb_size, diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c index d6c0c72..d31cc00 100644 --- a/hw/virtio/virtio-bus.c +++ b/hw/virtio/virtio-bus.c @@ -28,6 +28,7 @@ #include "hw/qdev.h" #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio.h" +#include "exec/address-spaces.h" /* #define DEBUG_VIRTIO_BUS */ @@ -61,6 +62,13 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) if (klass->device_plugged != NULL) { klass->device_plugged(qbus->parent, errp); } + +if (klass->get_dma_as != NULL && +virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM)) { +vdev->dma_as = klass->get_dma_as(qbus->parent); +} else { +vdev->dma_as = &address_space_memory; +} } /* Reset the virtio_bus */ diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 21c2b9d..213d57e 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1144,6 +1144,14 @@ static int virtio_pci_query_nvectors(DeviceState *d) return proxy->nvectors; } +static AddressSpace *virtio_pci_get_dma_as(DeviceState *d) +{ +VirtIOPCIProxy *proxy = VIRTIO_PCI(d); +PCIDevice *dev = &proxy->pci_dev; + +return pci_get_address_space(dev); +} + static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy, struct virtio_pci_cap *cap) { @@ -1601,6 +1609,11 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) } if (legacy) { +if (virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM)) { +error_setg(errp, "VIRTIO_
[Qemu-block] [PATCH v2 11/18] block/pcache: add reading data from the cache
Added read_cache_data_direct() allowing to read data from node to qiov. And the simplest use case - read data from node provided that the node is not in flight and fully covers read request. Signed-off-by: Pavel Butsykin --- block/pcache.c | 56 1 file changed, 56 insertions(+) diff --git a/block/pcache.c b/block/pcache.c index 4007241d37..e3749679ca 100644 --- a/block/pcache.c +++ b/block/pcache.c @@ -92,6 +92,30 @@ static void pcache_node_unref(PCacheNode *node) } } +static uint64_t ranges_overlap_size(uint64_t offset1, uint64_t size1, +uint64_t offset2, uint32_t size2) +{ +return MIN(offset1 + size1, offset2 + size2) - MAX(offset1, offset2); +} + +static void read_cache_data_direct(PCacheNode *node, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov) +{ +uint64_t qiov_offs = 0, node_offs = 0; +uint64_t size; +uint64_t copy; + +if (offset < node->common.offset) { +qiov_offs = node->common.offset - offset; +} else { +node_offs = offset - node->common.offset; +} +size = ranges_overlap_size(offset, bytes, node->common.offset, + node->common.bytes); +copy = qemu_iovec_from_buf(qiov, qiov_offs, node->data + node_offs, size); +assert(copy == size); +} + static void update_req_stats(RBCache *rbcache, uint64_t offset, uint64_t bytes) { do { @@ -288,6 +312,32 @@ static void pcache_readahead_entry(void *opaque) readahead_co->bytes); } +enum { +CACHE_MISS, +CACHE_HIT, +}; + +static int pcache_lookup_data(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov) +{ +BDRVPCacheState *s = bs->opaque; + +PCacheNode *node = rbcache_search(s->cache, offset, bytes); +if (node == NULL || node->status & NODE_STATUS_INFLIGHT) { +return CACHE_MISS; +} + +/* Node covers the whole request */ +if (node->common.offset <= offset && +node->common.offset + node->common.bytes >= offset + bytes) +{ +read_cache_data_direct(node, offset, bytes, qiov); +return CACHE_HIT; +} + +return CACHE_MISS; +} + static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) @@ -295,6 +345,7 @@ static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset, BDRVPCacheState *s = bs->opaque; PCacheReadaheadCo readahead_co; Coroutine *co; +int status; if (bytes > s->max_aio_size) { goto skip_large_request; @@ -310,6 +361,11 @@ static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset, co = qemu_coroutine_create(pcache_readahead_entry, &readahead_co); qemu_coroutine_enter(co); +status = pcache_lookup_data(bs, offset, bytes, qiov); +if (status == CACHE_HIT) { +return 0; +} + skip_large_request: return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } -- 2.11.0
[Qemu-block] [PATCH v2 15/18] block/pcache: pick up parts of the cache
Provided that the request size is less than the readahead size, a partial cache hit can occur in the following three cases: 1. The request covers the bottom part of the node 2. The request covers the upper part of the node 3. The request is between two nodes and partially covers both of them The function pickup_parts_of_cache() covers all three cases and for the uncached part of the request it creates a new request. Signed-off-by: Pavel Butsykin --- block/pcache.c | 174 ++--- 1 file changed, 166 insertions(+), 8 deletions(-) diff --git a/block/pcache.c b/block/pcache.c index 86b28de44b..3f89e40b05 100644 --- a/block/pcache.c +++ b/block/pcache.c @@ -10,6 +10,7 @@ */ #include "qemu/osdep.h" +#include "qemu/error-report.h" #include "block/block_int.h" #include "qapi/error.h" #include "qapi/qmp/qstring.h" @@ -355,6 +356,153 @@ static void pcache_readahead_entry(void *opaque) readahead_co->bytes); } +/* + * Provided that the request size is less or equal than the readahead size, + * a partial cache hit can occur in the following three cases: + * 1. The request covers the bottom part of the node. + * 2. The request covers the upper part of the node. + * 3. The request is between two nodes and partially covers both of them. + * + * Therefore, the request can be divided into no more than 3 parts. + */ +#define PCACHE_MAX_FRAGMENT_NUM 3 + +typedef struct PartReqEntry { +ReqLinkEntry rlink; +PCacheNode *node; +} PartReqEntry; + +typedef struct PartReqDesc { +Coroutine *co; +PartReqEntry parts[PCACHE_MAX_FRAGMENT_NUM]; +uint32_t cnt; +uint32_t completed; +} PartReqDesc; + +typedef struct PCachePartReqCo { +BlockDriverState *bs; +uint64_t offset; +uint64_t bytes; +PartReqDesc *desc; +} PCachePartReqCo; + +static void coroutine_fn pcache_co_part_req(BlockDriverState *bs, +uint64_t offset, uint64_t bytes, +PartReqDesc *req) +{ +BDRVPCacheState *s = bs->opaque; +QEMUIOVector qiov; +PartReqEntry *part = &req->parts[req->cnt]; +PCacheNode *node = container_of(rbcache_node_alloc(s->cache, offset, bytes), +PCacheNode, common); +node->status = NODE_STATUS_INFLIGHT; +qemu_iovec_init(&qiov, 1); +qemu_iovec_add(&qiov, node->data, node->common.bytes); + +part->node = node; +assert(++req->cnt <= PCACHE_MAX_FRAGMENT_NUM); + +part->rlink.ret = bdrv_co_preadv(bs->file, offset, bytes, &qiov, 0); + +node->status = NODE_STATUS_COMPLETED | NODE_STATUS_REMOVE; +QLIST_INSERT_HEAD(&s->remove_node_list, node, entry); + +if (!qemu_coroutine_entered(req->co)) { +qemu_coroutine_enter(req->co); +} else { +req->completed++; +} +} + +static void pcache_part_req_entry(void *opaque) +{ +PCachePartReqCo *req_co = opaque; + +pcache_co_part_req(req_co->bs, req_co->offset, req_co->bytes, req_co->desc); +} + +static int pickup_parts_of_cache(BlockDriverState *bs, PCacheNode *node, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov) +{ +BDRVPCacheState *s = bs->opaque; +PartReqDesc req = { +.co = qemu_coroutine_self(), +}; +PCachePartReqCo req_co = { +.bs = bs, +.desc = &req +}; +uint64_t chunk_offset = offset, chunk_bytes = bytes; +uint64_t up_size; +int ret = 0; + +do { +pcache_node_ref(node); + +if (chunk_offset < node->common.offset) { +Coroutine *co; + +req_co.offset = chunk_offset; +req_co.bytes = up_size = node->common.offset - chunk_offset; + +co = qemu_coroutine_create(pcache_part_req_entry, &req_co); +qemu_coroutine_enter(co); + +chunk_offset += up_size; +chunk_bytes -= up_size; +} + +req.parts[req.cnt].node = node; +if (node->status & NODE_STATUS_INFLIGHT) { +req.parts[req.cnt].rlink.co = qemu_coroutine_self(); +QTAILQ_INSERT_HEAD(&node->wait_list, + &req.parts[req.cnt].rlink, entry); +} else { +req.completed++; +} +assert(++req.cnt <= PCACHE_MAX_FRAGMENT_NUM); + +up_size = MIN(node->common.offset + node->common.bytes - chunk_offset, + chunk_bytes); +chunk_bytes -= up_size; +chunk_offset += up_size; + +if (chunk_bytes != 0) { +node = rbcache_search(s->cache, chunk_offset, chunk_bytes); +if (node == NULL) { +Coroutine *co; + +req_co.offset = chunk_offset; +req_co.bytes = chunk_bytes; + +co = qemu_coroutine_create(pcache_part_req_entry, &req_co); +qemu_coroutine_enter(co); +chunk_bytes = 0; +
[Qemu-block] [PATCH v2 08/18] block/pcache: add AIO readahead
This patch adds readahead data to the cache. Here the readahead is a separate asynchronous request, which doesn't depend on completion of filtered read requests. The readahead is done only by the condition, if before the current request there's sequential read data enough size. This information can give the request statistics, of course this method of detection is not very reliable, but in most cases it'll work. Signed-off-by: Pavel Butsykin --- block/pcache.c | 206 - 1 file changed, 204 insertions(+), 2 deletions(-) diff --git a/block/pcache.c b/block/pcache.c index deac57c58d..57eebd434a 100644 --- a/block/pcache.c +++ b/block/pcache.c @@ -17,6 +17,8 @@ #define PCACHE_OPT_STATS_SIZE "pcache-stats-size" #define PCACHE_OPT_MAX_AIO_SIZE "pcache-max-aio-size" +#define PCACHE_OPT_CACHE_SIZE "pcache-full-size" +#define PCACHE_OPT_READAHEAD_SIZE "pcache-readahead-size" static QemuOptsList runtime_opts = { .name = "pcache", @@ -32,6 +34,16 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_SIZE, .help = "Maximum size of aio which is handled by pcache", }, +{ +.name = PCACHE_OPT_CACHE_SIZE, +.type = QEMU_OPT_SIZE, +.help = "Total cache size", +}, +{ +.name = PCACHE_OPT_READAHEAD_SIZE, +.type = QEMU_OPT_SIZE, +.help = "Prefetch cache readahead size", +}, { /* end of list */ } }, }; @@ -40,12 +52,46 @@ static QemuOptsList runtime_opts = { #define MB_BITS 20 #define PCACHE_DEFAULT_STATS_SIZE (3 << MB_BITS) #define PCACHE_DEFAULT_MAX_AIO_SIZE (64 << KB_BITS) +#define PCACHE_DEFAULT_CACHE_SIZE (4 << MB_BITS) +#define PCACHE_DEFAULT_READAHEAD_SIZE (128 << KB_BITS) typedef struct BDRVPCacheState { RBCache *req_stats; +RBCache *cache; uint64_t max_aio_size; +uint64_t readahead_size; } BDRVPCacheState; +typedef struct PCacheNode { +RBCacheNode common; +uint8_t *data; +enum { +NODE_STATUS_NEW = 0x01, +NODE_STATUS_INFLIGHT = 0x02, +NODE_STATUS_COMPLETED = 0x04, +NODE_STATUS_REMOVE= 0x08, +NODE_STATUS_DELETED = 0x10, /* only for debugging */ +} status; +int ref; +} PCacheNode; + +static inline void pcache_node_ref(PCacheNode *node) +{ +node->ref++; +} + +static void pcache_node_unref(PCacheNode *node) +{ +assert(node->ref > 0); +if (--node->ref == 0) { +assert(node->status & NODE_STATUS_REMOVE); +node->status |= NODE_STATUS_DELETED; + +g_free(node->data); +g_free(node); +} +} + static void update_req_stats(RBCache *rbcache, uint64_t offset, uint64_t bytes) { do { @@ -80,16 +126,165 @@ static void update_req_stats(RBCache *rbcache, uint64_t offset, uint64_t bytes) } while (true); } +static bool check_request_sequence(BDRVPCacheState *s, uint64_t offset) +{ +uint64_t cache_line_size = s->readahead_size; +uint64_t check_offset; + +if (offset <= cache_line_size) { +return false; +} +check_offset = offset - cache_line_size; + +do { +RBCacheNode *node = rbcache_search(s->req_stats, check_offset, + offset - check_offset); +if (node == NULL) { +return false; +} +if (node->offset > check_offset) { +return false; +} +check_offset = node->offset + node->bytes; +} while (check_offset < offset); + +return true; +} + +static void pcache_node_free(RBCacheNode *rbnode, void *opaque) +{ +PCacheNode *node = container_of(rbnode, PCacheNode, common); + +assert(node->status == NODE_STATUS_INFLIGHT || + node->status == NODE_STATUS_COMPLETED); + +node->status |= NODE_STATUS_REMOVE; +pcache_node_unref(node); +} + +static RBCacheNode *pcache_node_alloc(uint64_t offset, uint64_t bytes, + void *opaque) +{ +PCacheNode *node = g_malloc(sizeof(*node)); + +node->data = g_malloc(bytes); +node->status = NODE_STATUS_NEW; +node->ref = 1; + +return &node->common; +} + +#define PCACHE_STEPS_FORWARD 2 + +static PCacheNode *get_readahead_node(BlockDriverState *bs, RBCache *rbcache, + uint64_t offset, uint64_t bytes) +{ +uint32_t count = PCACHE_STEPS_FORWARD; + +int64_t total_bytes = bdrv_getlength(bs); +if (total_bytes < 0) { +return NULL; +} + +while (count--) { +PCacheNode *node; + +if (total_bytes <= offset + bytes) { +break; +} + +node = rbcache_search_and_insert(rbcache, offset, bytes); +if (node->status == NODE_STATUS_NEW) { +return node; +} + /* The range less than the readahead size is not cached to reduce + * fragmentation of the cache. If the data is already cached, then we +
[Qemu-block] [PATCH v2 16/18] block/pcache: drop used pcache nodes
The pcache is directed to certain situations to sequential reads. This concept allows to drop parts of the cache that were already used, which will reduce the size of cache and the number of displaced nodes. Signed-off-by: Pavel Butsykin --- block/pcache.c | 26 +++--- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/block/pcache.c b/block/pcache.c index 3f89e40b05..6d2b54cf78 100644 --- a/block/pcache.c +++ b/block/pcache.c @@ -81,6 +81,7 @@ typedef struct PCacheNode { NODE_STATUS_REMOVE= 0x08, NODE_STATUS_DELETED = 0x10, /* only for debugging */ } status; +uint64_t rdcnt; QLIST_ENTRY(PCacheNode) entry; int ref; } PCacheNode; @@ -109,13 +110,17 @@ static uint64_t ranges_overlap_size(uint64_t offset1, uint64_t size1, return MIN(offset1 + size1, offset2 + size2) - MAX(offset1, offset2); } -static void read_cache_data_direct(PCacheNode *node, uint64_t offset, - uint64_t bytes, QEMUIOVector *qiov) +static void read_cache_data_direct(BlockDriverState *bs, PCacheNode *node, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov) { +BDRVPCacheState *s = bs->opaque; uint64_t qiov_offs = 0, node_offs = 0; uint64_t size; uint64_t copy; +assert(node->status & NODE_STATUS_COMPLETED); + if (offset < node->common.offset) { qiov_offs = node->common.offset - offset; } else { @@ -124,11 +129,17 @@ static void read_cache_data_direct(PCacheNode *node, uint64_t offset, size = ranges_overlap_size(offset, bytes, node->common.offset, node->common.bytes); copy = qemu_iovec_from_buf(qiov, qiov_offs, node->data + node_offs, size); +node->rdcnt += size; +if (node->rdcnt >= node->common.bytes && +!(node->status & NODE_STATUS_REMOVE)) +{ +rbcache_remove(s->cache, &node->common); +} assert(copy == size); } -static int read_cache_data(PCacheNode *node, uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov) +static int read_cache_data(BlockDriverState *bs, PCacheNode *node, + uint64_t offset, uint64_t bytes, QEMUIOVector *qiov) { if (node->status & NODE_STATUS_INFLIGHT) { ReqLinkEntry rlink = { @@ -143,7 +154,7 @@ static int read_cache_data(PCacheNode *node, uint64_t offset, uint64_t bytes, return rlink.ret; } } -read_cache_data_direct(node, offset, bytes, qiov); +read_cache_data_direct(bs, node, offset, bytes, qiov); return 0; } @@ -228,6 +239,7 @@ static RBCacheNode *pcache_node_alloc(uint64_t offset, uint64_t bytes, node->data = g_malloc(bytes); node->status = NODE_STATUS_NEW; +node->rdcnt = 0; node->ref = 1; QTAILQ_INIT(&node->wait_list); @@ -492,7 +504,7 @@ static int pickup_parts_of_cache(BlockDriverState *bs, PCacheNode *node, PartReqEntry *part = &req.parts[req.cnt]; if (ret == 0) { if (part->rlink.ret == 0) { -read_cache_data_direct(part->node, offset, bytes, qiov); +read_cache_data_direct(bs, part->node, offset, bytes, qiov); } else { ret = part->rlink.ret; } @@ -523,7 +535,7 @@ static int pcache_lookup_data(BlockDriverState *bs, uint64_t offset, if (node->common.offset <= offset && node->common.offset + node->common.bytes >= offset + bytes) { -ret = read_cache_data(node, offset, bytes, qiov); +ret = read_cache_data(bs, node, offset, bytes, qiov); } else { ret = pickup_parts_of_cache(bs, node, offset, bytes, qiov); -- 2.11.0
[Qemu-block] [PATCH v2 02/18] util/rbtree: add rbtree from linux kernel
Why don't we use rbtree from glib? We need pointer to the parent node. For optimal implementation storing of cached chunks in the rbtree need to get next and previous nodes and content of parent node is very useful for effective implementation of these functions. In this implementation of rbtree (unlike rbtree of glib) the node contains a pointer to parent node. Moreover, this rbtree allows more flexibility to work with an algorithm because to use rbtrees you'll have to implement your own insert and search cores. This will avoid us to use callbacks and to drop drammatically performances. Signed-off-by: Pavel Butsykin --- MAINTAINERS | 7 + include/qemu/rbtree.h | 107 include/qemu/rbtree_augmented.h | 235 + util/Makefile.objs | 1 + util/rbtree.c | 570 5 files changed, 920 insertions(+) create mode 100644 include/qemu/rbtree.h create mode 100644 include/qemu/rbtree_augmented.h create mode 100644 util/rbtree.c diff --git a/MAINTAINERS b/MAINTAINERS index 585cd5abd7..228278c1ca 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1465,6 +1465,13 @@ F: include/qemu/throttle.h F: util/throttle.c L: qemu-block@nongnu.org +Red Black Trees +M: Denis V. Lunev +S: Supported +F: include/qemu/rbtree.h +F: include/qemu/rbtree_augmented.h +F: util/rbtree.c + UUID M: Fam Zheng S: Supported diff --git a/include/qemu/rbtree.h b/include/qemu/rbtree.h new file mode 100644 index 00..d2e3fdd149 --- /dev/null +++ b/include/qemu/rbtree.h @@ -0,0 +1,107 @@ +/* + Red Black Trees + (C) 1999 Andrea Arcangeli + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + include/qemu/rbtree.h + + To use rbtrees you'll have to implement your own insert and search cores. + This will avoid us to use callbacks and to drop drammatically performances. + I know it's not the cleaner way, but in C (not in C++) to get + performances and genericity... +*/ + +#ifndef QEMU_RBTREE_H +#define QEMU_RBTREE_H + +#include +#include +#include + +struct RbNode { +uintptr_t __rb_parent_color; +struct RbNode *rb_right; +struct RbNode *rb_left; +} __attribute__((aligned(sizeof(uintptr_t; +/* The alignment might seem pointless, but allegedly CRIS needs it */ + +struct RbRoot { +struct RbNode *rb_node; +}; + + +#define RB_PARENT(r) ((struct RbNode *)((r)->__rb_parent_color & ~3)) + +#define RB_ROOT (struct RbRoot) { NULL, } +#define RB_ENTRY(ptr, type, member) container_of(ptr, type, member) + +#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) + +/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ +#define RB_EMPTY_NODE(node) \ +((node)->__rb_parent_color == (uintptr_t)(node)) +#define RB_CLEAR_NODE(node) \ +((node)->__rb_parent_color = (uintptr_t)(node)) + + +extern void rb_insert_color(struct RbNode *, struct RbRoot *); +extern void rb_erase(struct RbNode *, struct RbRoot *); + + +/* Find logical next and previous nodes in a tree */ +extern struct RbNode *rb_next(const struct RbNode *); +extern struct RbNode *rb_prev(const struct RbNode *); +extern struct RbNode *rb_first(const struct RbRoot *); +extern struct RbNode *rb_last(const struct RbRoot *); + +/* Postorder iteration - always visit the parent after its children */ +extern struct RbNode *rb_first_postorder(const struct RbRoot *); +extern struct RbNode *rb_next_postorder(const struct RbNode *); + +/* Fast replacement of a single node without remove/rebalance/add/rebalance */ +extern void rb_replace_node(struct RbNode *victim, struct RbNode *new, +struct RbRoot *root); + +static inline void rb_link_node(struct RbNode *node, struct RbNode *parent, +struct RbNode **rb_link) +{ +node->__rb_parent_color = (uintptr_t)parent; +node->rb_left = node->rb_right = NULL; + +*rb_link = node; +} + +#define RB_ENTRY_SAFE(ptr, type, member) \ +({ typeof(ptr) ptr = (ptr); \ + ptr ? rb_entry(ptr, type, member) : NULL; \ +}) + +/** + * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of + * given type safe against removal of rb_node entry + * + * @pos: the 'type *' to use as a loop cursor. + * @n
[Qemu-block] [PATCH v2 18/18] block/pcache: add tracepoints
Signed-off-by: Pavel Butsykin --- block/pcache.c | 20 block/trace-events | 10 ++ 2 files changed, 30 insertions(+) diff --git a/block/pcache.c b/block/pcache.c index 6d2b54cf78..2bce3efc59 100644 --- a/block/pcache.c +++ b/block/pcache.c @@ -15,6 +15,7 @@ #include "qapi/error.h" #include "qapi/qmp/qstring.h" #include "qemu/rbcache.h" +#include "trace.h" #define PCACHE_OPT_STATS_SIZE "pcache-stats-size" #define PCACHE_OPT_MAX_AIO_SIZE "pcache-max-aio-size" @@ -348,12 +349,18 @@ static void coroutine_fn pcache_co_readahead(BlockDriverState *bs, node->status |= NODE_STATUS_COMPLETED; if (ret < 0) { +trace_pcache_readahead_fail(ret, node->common.offset, +node->common.bytes); rbcache_remove(s->cache, &node->common); } QTAILQ_FOREACH_SAFE(rlink, &node->wait_list, entry, next) { QTAILQ_REMOVE(&node->wait_list, rlink, entry); rlink->ret = ret; + +trace_pcache_readahead_pending_node_complete(ret, node->common.offset, +node->common.bytes, offset, bytes); + qemu_coroutine_enter(rlink->co); } @@ -509,6 +516,10 @@ static int pickup_parts_of_cache(BlockDriverState *bs, PCacheNode *node, ret = part->rlink.ret; } } +if (part->rlink.ret < 0) { +trace_pcache_read_part_fail(ret, part->node->common.offset, +part->node->common.bytes); +} pcache_node_unref(part->node); } @@ -618,6 +629,8 @@ static void pcache_write_through(BlockDriverState *bs, uint64_t offset, if (node->status & NODE_STATUS_COMPLETED) { write_cache_data(node, offset, bytes, qiov); +trace_pcache_write_through(offset, bytes, node->common.offset, + node->common.bytes); } } while (end_offs > chunk_offset); @@ -631,6 +644,8 @@ static void pcache_write_through(BlockDriverState *bs, uint64_t offset, continue; } write_cache_data(node, offset, bytes, qiov); +trace_pcache_write_through_removed_node(offset, bytes, +node->common.offset, node->common.bytes); } } @@ -663,6 +678,9 @@ static int pcache_state_init(QemuOpts *opts, BDRVPCacheState *s) PCACHE_DEFAULT_READAHEAD_SIZE); QLIST_INIT(&s->remove_node_list); +trace_pcache_state_init(stats_size, s->max_aio_size, cache_size, +s->readahead_size); + if (s->readahead_size < s->max_aio_size) { error_report("Readahead size can't be less than maximum request size" "that can be handled by pcache"); @@ -704,6 +722,8 @@ static void pcache_close(BlockDriverState *bs) { BDRVPCacheState *s = bs->opaque; +trace_pcache_close(s->req_stats, s->cache); + rbcache_destroy(s->req_stats); rbcache_destroy(s->cache); diff --git a/block/trace-events b/block/trace-events index cfc05f2478..d9cef3bcec 100644 --- a/block/trace-events +++ b/block/trace-events @@ -112,3 +112,13 @@ qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" + +# block/pcache.c +pcache_readahead_fail(int ret, uint64_t offset, uint64_t bytes) "ret: %d offset: %"PRIu64" bytes: %"PRIu64 +pcache_readahead_pending_node_complete(int ret, uint64_t node_offset, uint64_t node_bytes, uint64_t read_offset, uint64_t read_bytes) "ret: %d node: %"PRIu64" %"PRIu64" pending read: %"PRIu64" %"PRIu64 +pcache_aio_read_cb_fail(int ret, uint64_t offset, uint64_t bytes) "ret: %d offset: %"PRIu64" bytes: %"PRIu64 +pcache_read_part_fail(int ret, uint64_t offset, uint64_t bytes) "ret: %d offset: %"PRIu64" bytes: %"PRIu64 +pcache_write_through(uint64_t req_offset, uint64_t req_bytes, uint64_t node_offset, uint64_t node_bytes) "request: %"PRIu64" %"PRIu64" node: %"PRIu64" %"PRIu64 +pcache_write_through_removed_node(uint64_t req_offset, uint64_t req_bytes, uint64_t node_offset, uint64_t node_bytes) "request: %"PRIu64" %"PRIu64" node: %"PRIu64" %"PRIu64 +pcache_state_init(uint64_t stats_size, uint64_t max_aio_size, uint64_t cache_size, uint64_t readahead_size) "pool statistics size: %"PRIu64" max aio size: %"PRIu64" cache size: %"PRIu64" readahead size: %"PRIu64 +pcache_close(void *req_stats, void *cache) "pool statistics: %p cache: %p" -- 2.11.0
[Qemu-block] [PATCH v2 12/18] block/pcache: inflight readahead request waiting for read
If there was a cache hit, but the status of the node is NODE_STATUS_INFLIGHT, then this means that a readahead request for this node is in flight. In this case, we can wait for the completion of the readahead request, and then copy the data. It allows us even more to optimize aio read requests. Signed-off-by: Pavel Butsykin --- block/pcache.c | 48 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/block/pcache.c b/block/pcache.c index e3749679ca..c1cbfa7040 100644 --- a/block/pcache.c +++ b/block/pcache.c @@ -62,9 +62,16 @@ typedef struct BDRVPCacheState { uint64_t readahead_size; } BDRVPCacheState; +typedef struct ReqLinkEntry { +QTAILQ_ENTRY(ReqLinkEntry) entry; +Coroutine *co; +int ret; +} ReqLinkEntry; + typedef struct PCacheNode { RBCacheNode common; uint8_t *data; +QTAILQ_HEAD(, ReqLinkEntry) wait_list; enum { NODE_STATUS_NEW = 0x01, NODE_STATUS_INFLIGHT = 0x02, @@ -116,6 +123,27 @@ static void read_cache_data_direct(PCacheNode *node, uint64_t offset, assert(copy == size); } +static int read_cache_data(PCacheNode *node, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov) +{ +if (node->status & NODE_STATUS_INFLIGHT) { +ReqLinkEntry rlink = { +.co = qemu_coroutine_self(), +}; + +QTAILQ_INSERT_HEAD(&node->wait_list, &rlink, entry); + +qemu_coroutine_yield(); + +if (rlink.ret < 0) { +return rlink.ret; +} +} +read_cache_data_direct(node, offset, bytes, qiov); + +return 0; +} + static void update_req_stats(RBCache *rbcache, uint64_t offset, uint64_t bytes) { do { @@ -194,6 +222,7 @@ static RBCacheNode *pcache_node_alloc(uint64_t offset, uint64_t bytes, node->data = g_malloc(bytes); node->status = NODE_STATUS_NEW; node->ref = 1; +QTAILQ_INIT(&node->wait_list); return &node->common; } @@ -272,6 +301,7 @@ static void coroutine_fn pcache_co_readahead(BlockDriverState *bs, BDRVPCacheState *s = bs->opaque; QEMUIOVector qiov; PCacheNode *node; +ReqLinkEntry *rlink, *next; uint64_t readahead_offset; uint64_t readahead_bytes; int ret; @@ -301,6 +331,13 @@ static void coroutine_fn pcache_co_readahead(BlockDriverState *bs, if (ret < 0) { rbcache_remove(s->cache, &node->common); } + +QTAILQ_FOREACH_SAFE(rlink, &node->wait_list, entry, next) { +QTAILQ_REMOVE(&node->wait_list, rlink, entry); +rlink->ret = ret; +qemu_coroutine_enter(rlink->co); +} + pcache_node_unref(node); } @@ -323,7 +360,7 @@ static int pcache_lookup_data(BlockDriverState *bs, uint64_t offset, BDRVPCacheState *s = bs->opaque; PCacheNode *node = rbcache_search(s->cache, offset, bytes); -if (node == NULL || node->status & NODE_STATUS_INFLIGHT) { +if (node == NULL) { return CACHE_MISS; } @@ -331,7 +368,10 @@ static int pcache_lookup_data(BlockDriverState *bs, uint64_t offset, if (node->common.offset <= offset && node->common.offset + node->common.bytes >= offset + bytes) { -read_cache_data_direct(node, offset, bytes, qiov); +int ret = read_cache_data(node, offset, bytes, qiov); +if (ret < 0) { +return ret; +} return CACHE_HIT; } @@ -362,8 +402,8 @@ static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset, qemu_coroutine_enter(co); status = pcache_lookup_data(bs, offset, bytes, qiov); -if (status == CACHE_HIT) { -return 0; +if (status != CACHE_MISS) { +return status < 0 ? status : 0; } skip_large_request: -- 2.11.0
[Qemu-block] [PATCH v2 14/18] block/pcache: up-to-date cache for removed nodes
We must be able to keep the actual data even for the removed nodes. This is necessary when we need to defer the reading of node. Because there may come an overlapping write to removed node in the interval between node completion and reading of the node. For this we move the removed nodes to remove_node_list and update node->data on each overlapping write. This is preparatory patch for the resolution of partial cache-hit. Signed-off-by: Pavel Butsykin --- block/pcache.c | 24 +++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/block/pcache.c b/block/pcache.c index 140f90d6d7..86b28de44b 100644 --- a/block/pcache.c +++ b/block/pcache.c @@ -60,6 +60,7 @@ typedef struct BDRVPCacheState { RBCache *cache; uint64_t max_aio_size; uint64_t readahead_size; +QLIST_HEAD(, PCacheNode) remove_node_list; } BDRVPCacheState; typedef struct ReqLinkEntry { @@ -79,6 +80,7 @@ typedef struct PCacheNode { NODE_STATUS_REMOVE= 0x08, NODE_STATUS_DELETED = 0x10, /* only for debugging */ } status; +QLIST_ENTRY(PCacheNode) entry; int ref; } PCacheNode; @@ -94,6 +96,7 @@ static void pcache_node_unref(PCacheNode *node) assert(node->status & NODE_STATUS_REMOVE); node->status |= NODE_STATUS_DELETED; +QLIST_REMOVE(node, entry); g_free(node->data); g_free(node); } @@ -205,11 +208,14 @@ static bool check_request_sequence(BDRVPCacheState *s, uint64_t offset) static void pcache_node_free(RBCacheNode *rbnode, void *opaque) { +BDRVPCacheState *s = opaque; PCacheNode *node = container_of(rbnode, PCacheNode, common); assert(node->status == NODE_STATUS_INFLIGHT || node->status == NODE_STATUS_COMPLETED); +QLIST_INSERT_HEAD(&s->remove_node_list, node, entry); + node->status |= NODE_STATUS_REMOVE; pcache_node_unref(node); } @@ -432,11 +438,12 @@ static void pcache_write_through(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov) { BDRVPCacheState *s = bs->opaque; +PCacheNode *node, *next; uint64_t chunk_offset = offset, chunk_bytes = bytes; uint64_t end_offs = offset + bytes; do { -PCacheNode *node = rbcache_search(s->cache, chunk_offset, chunk_bytes); +node = rbcache_search(s->cache, chunk_offset, chunk_bytes); if (node == NULL) { break; } @@ -450,6 +457,18 @@ static void pcache_write_through(BlockDriverState *bs, uint64_t offset, write_cache_data(node, offset, bytes, qiov); } } while (end_offs > chunk_offset); + +QLIST_FOREACH_SAFE(node, &s->remove_node_list, entry, next) { +if (node->status & NODE_STATUS_INFLIGHT) { +continue; +} +if (offset >= node->common.offset + node->common.bytes || +offset + bytes <= node->common.offset) +{ +continue; +} +write_cache_data(node, offset, bytes, qiov); +} } static coroutine_fn int pcache_co_pwritev(BlockDriverState *bs, uint64_t offset, @@ -479,6 +498,7 @@ static void pcache_state_init(QemuOpts *opts, BDRVPCacheState *s) RBCACHE_LRU, s); s->readahead_size = qemu_opt_get_size(opts, PCACHE_OPT_READAHEAD_SIZE, PCACHE_DEFAULT_READAHEAD_SIZE); +QLIST_INIT(&s->remove_node_list); } static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags, @@ -516,6 +536,8 @@ static void pcache_close(BlockDriverState *bs) rbcache_destroy(s->req_stats); rbcache_destroy(s->cache); + +assert(QLIST_EMPTY(&s->remove_node_list)); } static int64_t pcache_getlength(BlockDriverState *bs) -- 2.11.0
[Qemu-block] [PATCH v2 03/18] util/rbcache: range-based cache core
RBCache provides functionality to cache the data from block devices (basically). The range here is used as the main key for searching and storing data. The cache is based on red-black trees, so basic operations search, insert, delete are performed for O(log n). It is important to note that QEMU usually does not require a data cache, but in reality, there are already some cases where a cache of small amounts can increase performance, so as the data structure was selected red-black trees, this is a fairly simple data structure and show high efficiency on a small number of elements. Therefore, when the minimum range is 512 bytes, the recommended size of the cache memory no more than 8-16mb. Also note that this cache implementation allows to store ranges of different lengths without alignment. Generic cache core can easily be used to implement different caching policies at the block level, such as read-ahed. Also it can be used in some special cases, for example for caching data in qcow2 when sequential allocating writes to image with backing file. Signed-off-by: Pavel Butsykin --- MAINTAINERS| 6 ++ include/qemu/rbcache.h | 128 + util/Makefile.objs | 1 + util/rbcache.c | 253 + 4 files changed, 388 insertions(+) create mode 100644 include/qemu/rbcache.h create mode 100644 util/rbcache.c diff --git a/MAINTAINERS b/MAINTAINERS index 228278c1ca..01f4afa1e4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1472,6 +1472,12 @@ F: include/qemu/rbtree.h F: include/qemu/rbtree_augmented.h F: util/rbtree.c +Range-Based Cache +M: Denis V. Lunev +S: Supported +F: include/qemu/rbcache.h +F: util/rbcache.c + UUID M: Fam Zheng S: Supported diff --git a/include/qemu/rbcache.h b/include/qemu/rbcache.h new file mode 100644 index 00..24f7c1cb80 --- /dev/null +++ b/include/qemu/rbcache.h @@ -0,0 +1,128 @@ +/* + * QEMU Range-Based Cache core + * + * Copyright (C) 2015-2016 Parallels IP Holdings GmbH. + * + * Author: Pavel Butsykin + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#ifndef RBCACHE_H +#define RBCACHE_H + +#include "qemu/rbtree.h" +#include "qemu/queue.h" + +typedef struct RBCacheNode { +struct RbNode rb_node; +uint64_t offset; +uint64_t bytes; +QTAILQ_ENTRY(RBCacheNode) entry; +} RBCacheNode; + +typedef struct RBCache RBCache; + +/* These callbacks are used to extend the common structure RBCacheNode. The + * alloc callback should initialize only fields of the expanded node. Node + * common part is initialized in RBCache( see rbcache_node_alloc() ). + */ +typedef RBCacheNode *RBNodeAlloc(uint64_t offset, uint64_t bytes, void *opaque); +typedef void RBNodeFree(RBCacheNode *node, void *opaque); + + +enum eviction_type { +RBCACHE_FIFO, +RBCACHE_LRU, +}; + +/** + * rbcache_search: + * @rbcache: the cache object. + * @offset: the start of the range. + * @bytes: the size of the range. + * + * Returns the node corresponding to the range(offset, bytes), or NULL if + * the node was not found. In the case when the range covers multiple nodes, + * it returns the node with the lowest offset. + */ +void *rbcache_search(RBCache *rbcache, uint64_t offset, uint64_t bytes); + +/** + * rbcache_insert: + * @rbcache: the cache object. + * @node: a new node for the cache. + * + * Returns the new node, or old node if a node describing the same range + * already exists. In case of partial overlaps, the existing overlapping node + * with the lowest offset is returned. + */ +void *rbcache_insert(RBCache *rbcache, RBCacheNode *node); + +/** + * rbcache_search_and_insert: + * @rbcache: the cache object. + * @offset: the start of the range. + * @bytes: the size of the range. + * + * rbcache_search_and_insert() is like rbcache_insert(), except that a new node + * is allocated inside the function. Returns the new node, or old node if a node + * describing the same range. In case of partial overlaps, the existing + * overlapping node with the lowest offset is returned. + */ +void *rbcache_search_and_insert(RBCache *rbcache, uint64_t offset, +uint64_t byte); + +/** + * rbcache_remove: + * @rbcache: the cache object. + * @node: a node to remove. + * + * Removes the cached range owned by the node, it also frees the node. + */ +void rbcache_remove(RBCache *rbcache, RBCacheNode *node); + +/** + * rbcache_node_alloc: + * @rbcache: the cache object. + * @offset: the start of the range. + * @bytes: the size of the range. + * + * Returns an allocated and initialized node. + */ +RBCacheNode *rbcache_node_alloc(RBCache *rbcache, uint64_t offset, +uint64_t bytes); + +/** + * rbcache_node_free: + * @rbcache: the cache object. + * @node: a node to free. + * + * Frees the node. + */ +void rbcache_node_free(RBCache *rbcache, RBCacheNode *node); + +/** + * rbc
[Qemu-block] [PATCH v2 07/18] block/pcache: updating statistics for overlapping requests
When updating the statistics sometimes i/O requests can overlap each other, in this case the requests are not stored in the statistics. It's not very good, especially when the requests have a small range of intersection. We can cut the requests in the intersection and add the pieces of requests in the statistics. Maybe not significantly, but it can make the statistics more accurate. Here, update_req_stats() adds the ability to save overlapping requests. Signed-off-by: Pavel Butsykin --- block/pcache.c | 36 +++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/block/pcache.c b/block/pcache.c index 1f3200af63..deac57c58d 100644 --- a/block/pcache.c +++ b/block/pcache.c @@ -46,6 +46,40 @@ typedef struct BDRVPCacheState { uint64_t max_aio_size; } BDRVPCacheState; +static void update_req_stats(RBCache *rbcache, uint64_t offset, uint64_t bytes) +{ +do { +RBCacheNode *node = rbcache_search_and_insert(rbcache, offset, bytes); +/* The node was successfully added or already exists */ +if (node->offset <= offset && +node->offset + node->bytes >= offset + bytes) +{ +break; +} + +/* Request covers the whole node */ +if (offset <= node->offset && +offset + bytes >= node->offset + node->bytes) +{ +rbcache_remove(rbcache, node); +continue; +} + +if (offset < node->offset) { +RBCacheNode *new_node = +rbcache_node_alloc(rbcache, offset, node->offset - offset); +if (new_node != rbcache_insert(rbcache, new_node)) { +/* The presence of the node in this range is impossible */ +abort(); +} +break; +} + +bytes = (offset + bytes) - (node->offset + node->bytes); +offset = node->offset + node->bytes; +} while (true); +} + static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) @@ -53,7 +87,7 @@ static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset, BDRVPCacheState *s = bs->opaque; if (s->max_aio_size >= bytes) { -rbcache_search_and_insert(s->req_stats, offset, bytes); +update_req_stats(s->req_stats, offset, bytes); } return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); -- 2.11.0
[Qemu-block] [PATCH v2 17/18] qapi: allow blockdev-add for pcache
Signed-off-by: Pavel Butsykin --- qapi/block-core.json | 30 -- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 6b42216960..00a6e15db3 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1719,6 +1719,7 @@ # @nfs: Since 2.8 # @replication: Since 2.8 # @ssh: Since 2.8 +# @pcache: since 2.9 # # Since: 2.0 ## @@ -1726,8 +1727,8 @@ 'data': [ 'archipelago', 'blkdebug', 'blkverify', 'bochs', 'cloop', 'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom', 'host_device', 'http', 'https', 'luks', 'nbd', 'nfs', 'null-aio', -'null-co', 'parallels', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', -'replication', 'ssh', 'vdi', 'vhdx', 'vmdk', 'vpc', +'null-co', 'parallels', 'pcache', 'qcow', 'qcow2', 'qed', 'quorum', +'raw', 'replication', 'ssh', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] } ## @@ -1808,6 +1809,30 @@ 'base': 'BlockdevOptionsGenericFormat', 'data': { '*key-secret': 'str' } } +## +# @BlockdevOptionsPCache +# +# Driver specific block device options for pcache. +# +# @image:Reference to a block device image for caching. +# +# @pcache-stats-size:#optional Total volume of requests for statistics. +# +# @pcache-max-aio-size: #optional Maximum size of read request which is handled +#by pcache. +# +# @pcache-full-size: #optional Total cache size. +# +# @pcache-readahead-size #optional Prefetch cache readahead size. +# +# Since: 2.9 +## +{ 'struct': 'BlockdevOptionsPCache', + 'data': { 'image': 'BlockdevRef', +'*pcache-stats-size': 'int', +'*pcache-max-aio-size': 'int', +'*pcache-full-size': 'int', +'*pcache-readahead-size': 'int' } } ## # @BlockdevOptionsGenericCOWFormat: @@ -2402,6 +2427,7 @@ 'null-aio': 'BlockdevOptionsNull', 'null-co':'BlockdevOptionsNull', 'parallels': 'BlockdevOptionsGenericFormat', + 'pcache': 'BlockdevOptionsPCache', 'qcow2': 'BlockdevOptionsQcow2', 'qcow': 'BlockdevOptionsGenericCOWFormat', 'qed':'BlockdevOptionsGenericCOWFormat', -- 2.11.0
[Qemu-block] [PATCH v2 05/18] block/pcache: statistics collection read requests
Here the rbcache is used as a repository statistics requests, in fact, data requests are not cached, so we have the ability to store a large number of requests. We need statistics requests to determine the sequential requests. Signed-off-by: Pavel Butsykin --- block/pcache.c | 32 +++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/block/pcache.c b/block/pcache.c index 7a67618408..296ae382b0 100644 --- a/block/pcache.c +++ b/block/pcache.c @@ -13,20 +13,38 @@ #include "block/block_int.h" #include "qapi/error.h" #include "qapi/qmp/qstring.h" +#include "qemu/rbcache.h" +#define PCACHE_OPT_STATS_SIZE "pcache-stats-size" static QemuOptsList runtime_opts = { .name = "pcache", .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), .desc = { +{ +.name = PCACHE_OPT_STATS_SIZE, +.type = QEMU_OPT_SIZE, +.help = "Total volume of requests for statistics", +}, { /* end of list */ } }, }; +#define MB_BITS 20 +#define PCACHE_DEFAULT_STATS_SIZE (3 << MB_BITS) + +typedef struct BDRVPCacheState { +RBCache *req_stats; +} BDRVPCacheState; + static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { +BDRVPCacheState *s = bs->opaque; + +rbcache_search_and_insert(s->req_stats, offset, bytes); + return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } @@ -37,6 +55,13 @@ static coroutine_fn int pcache_co_pwritev(BlockDriverState *bs, uint64_t offset, return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); } +static void pcache_state_init(QemuOpts *opts, BDRVPCacheState *s) +{ +uint64_t stats_size = qemu_opt_get_size(opts, PCACHE_OPT_STATS_SIZE, +PCACHE_DEFAULT_STATS_SIZE); +s->req_stats = rbcache_create(NULL, NULL, stats_size, RBCACHE_FIFO, s); +} + static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -58,7 +83,9 @@ static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags, if (local_err) { ret = -EINVAL; error_propagate(errp, local_err); +goto fail; } +pcache_state_init(opts, bs->opaque); fail: qemu_opts_del(opts); return ret; @@ -66,6 +93,9 @@ fail: static void pcache_close(BlockDriverState *bs) { +BDRVPCacheState *s = bs->opaque; + +rbcache_destroy(s->req_stats); } static int64_t pcache_getlength(BlockDriverState *bs) @@ -81,7 +111,7 @@ static bool pcache_recurse_is_first_non_filter(BlockDriverState *bs, static BlockDriver bdrv_pcache = { .format_name= "pcache", -.instance_size = 0, +.instance_size = sizeof(BDRVPCacheState), .bdrv_file_open = pcache_file_open, .bdrv_close = pcache_close, -- 2.11.0
[Qemu-block] [PATCH v2 10/18] block/pcache: cache invalidation on write requests
In AIO write request completion we just drop all the intersecting nodes in the cache, it's a simple way to keep the cache up-to-date. Signed-off-by: Pavel Butsykin --- block/pcache.c | 32 +++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/block/pcache.c b/block/pcache.c index f30e9e7bfe..4007241d37 100644 --- a/block/pcache.c +++ b/block/pcache.c @@ -314,11 +314,41 @@ skip_large_request: return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } +static void pcache_invalidation(BlockDriverState *bs, uint64_t offset, +uint64_t bytes) +{ +BDRVPCacheState *s = bs->opaque; +uint64_t chunk_offset = offset, chunk_bytes = bytes; +uint64_t end_offs = offset + bytes; + +do { +PCacheNode *node = rbcache_search(s->cache, chunk_offset, chunk_bytes); +if (node == NULL) { +break; +} +assert(node->status == NODE_STATUS_COMPLETED || + node->status == NODE_STATUS_INFLIGHT); + +chunk_offset = node->common.offset + node->common.bytes; +chunk_bytes = end_offs - chunk_offset; + +if (node->status & NODE_STATUS_COMPLETED) { +rbcache_remove(s->cache, &node->common); +} +} while (end_offs > chunk_offset); +} + static coroutine_fn int pcache_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { -return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); +int ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); +if (ret < 0) { +return ret; +} +pcache_invalidation(bs, offset, bytes); + +return ret; } static void pcache_state_init(QemuOpts *opts, BDRVPCacheState *s) -- 2.11.0
[Qemu-block] [PATCH v2 06/18] block/pcache: skip large aio read
This change will allow more efficient use of cache memory and filter the case for which the pcache isn't efficient. We skip requests that are not required in the optimization and thereby reducing the number of unnecessary readaheads. Signed-off-by: Pavel Butsykin --- block/pcache.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/block/pcache.c b/block/pcache.c index 296ae382b0..1f3200af63 100644 --- a/block/pcache.c +++ b/block/pcache.c @@ -16,6 +16,7 @@ #include "qemu/rbcache.h" #define PCACHE_OPT_STATS_SIZE "pcache-stats-size" +#define PCACHE_OPT_MAX_AIO_SIZE "pcache-max-aio-size" static QemuOptsList runtime_opts = { .name = "pcache", @@ -26,15 +27,23 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_SIZE, .help = "Total volume of requests for statistics", }, +{ +.name = PCACHE_OPT_MAX_AIO_SIZE, +.type = QEMU_OPT_SIZE, +.help = "Maximum size of aio which is handled by pcache", +}, { /* end of list */ } }, }; +#define KB_BITS 10 #define MB_BITS 20 #define PCACHE_DEFAULT_STATS_SIZE (3 << MB_BITS) +#define PCACHE_DEFAULT_MAX_AIO_SIZE (64 << KB_BITS) typedef struct BDRVPCacheState { RBCache *req_stats; +uint64_t max_aio_size; } BDRVPCacheState; static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset, @@ -43,7 +52,9 @@ static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset, { BDRVPCacheState *s = bs->opaque; -rbcache_search_and_insert(s->req_stats, offset, bytes); +if (s->max_aio_size >= bytes) { +rbcache_search_and_insert(s->req_stats, offset, bytes); +} return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } @@ -60,6 +71,9 @@ static void pcache_state_init(QemuOpts *opts, BDRVPCacheState *s) uint64_t stats_size = qemu_opt_get_size(opts, PCACHE_OPT_STATS_SIZE, PCACHE_DEFAULT_STATS_SIZE); s->req_stats = rbcache_create(NULL, NULL, stats_size, RBCACHE_FIFO, s); + +s->max_aio_size = qemu_opt_get_size(opts, PCACHE_OPT_MAX_AIO_SIZE, +PCACHE_DEFAULT_MAX_AIO_SIZE); } static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags, -- 2.11.0
[Qemu-block] [PATCH v2 04/18] tests/test-rbcache: add test cases
Signed-off-by: Pavel Butsykin --- tests/Makefile.include | 3 + tests/test-rbcache.c | 431 + 2 files changed, 434 insertions(+) create mode 100644 tests/test-rbcache.c diff --git a/tests/Makefile.include b/tests/Makefile.include index 4841d582a1..74bdcd2aac 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -55,6 +55,8 @@ check-unit-y += tests/test-hbitmap$(EXESUF) gcov-files-test-hbitmap-y = blockjob.c check-unit-y += tests/test-blockjob$(EXESUF) check-unit-y += tests/test-blockjob-txn$(EXESUF) +gcov-files-test-rbcache-y = util/rbcache.c +check-unit-y += tests/test-rbcache$(EXESUF) check-unit-y += tests/test-x86-cpuid$(EXESUF) # all code tested by test-x86-cpuid is inside topology.h gcov-files-test-x86-cpuid-y = @@ -498,6 +500,7 @@ tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y) tests/test-iov$(EXESUF): tests/test-iov.o $(test-util-obj-y) tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o $(test-util-obj-y) +tests/test-rbcache$(EXESUF): tests/test-rbcache.o $(test-util-obj-y) tests/test-x86-cpuid$(EXESUF): tests/test-x86-cpuid.o tests/test-xbzrle$(EXESUF): tests/test-xbzrle.o migration/xbzrle.o page_cache.o $(test-util-obj-y) tests/test-cutils$(EXESUF): tests/test-cutils.o util/cutils.o diff --git a/tests/test-rbcache.c b/tests/test-rbcache.c new file mode 100644 index 00..40ccd6b225 --- /dev/null +++ b/tests/test-rbcache.c @@ -0,0 +1,431 @@ +/* + * QEMU Range-Based Cache core + * + * Copyright (C) 2015-2016 Parallels IP Holdings GmbH. + * + * Author: Pavel Butsykin + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/rbcache.h" + +typedef struct TestRBCacheData { +RBCache *cache; +} TestRBCacheData; + +typedef struct TestRBCacheConfig { +uint64_t limit_size; +int eviction_type; +RBNodeAlloc *alloc; +RBNodeFree *free; +void *opaque; +} TestRBCacheConfig; + +#define KB(_n) ((_n) << 10) +#define MB(_n) ((_n) << 20) + +#define OFFSET1 0 +#define SIZE1 KB(1) + +#define OFFSET2 KB(1) +#define SIZE2 KB(2) + +#define OFFSET3 KB(18) +#define SIZE3 KB(1) + +#define OFFSET4 KB(7) +#define SIZE4 KB(7) + +#define OFFSET5 KB(1) +#define SIZE5 KB(4) + +#define OFFSET6 KB(5) +#define SIZE6 KB(5) + +#define OFFSET7 KB(15) +#define SIZE7 KB(20) + +#define OFFSET8 KB(2) +#define SIZE8 KB(20) + + +static void test_rbcache_init(TestRBCacheData *data, const void *ctx) +{ +g_assert_nonnull(data->cache); +} + +static void test_rbcache_insert(TestRBCacheData *data, const void *ctx) +{ +RBCacheNode *node1 = rbcache_node_alloc(data->cache, OFFSET1, SIZE1); +RBCacheNode *node2 = rbcache_node_alloc(data->cache, OFFSET2, SIZE2); +RBCacheNode *node3 = rbcache_node_alloc(data->cache, OFFSET3, SIZE3); +RBCacheNode *node4 = rbcache_node_alloc(data->cache, OFFSET4, SIZE4); +RBCacheNode *node5 = rbcache_node_alloc(data->cache, OFFSET5, SIZE5); +RBCacheNode *node6 = rbcache_node_alloc(data->cache, OFFSET6, SIZE6); +RBCacheNode *node7 = rbcache_node_alloc(data->cache, OFFSET7, SIZE7); +RBCacheNode *node8 = rbcache_node_alloc(data->cache, OFFSET8, SIZE8); +RBCacheNode *node; + +node = rbcache_insert(data->cache, node2); +g_assert_true(node == node2); + +node = rbcache_insert(data->cache, node1); +g_assert_true(node == node1); + +node = rbcache_insert(data->cache, node3); +g_assert_true(node == node3); + +node = rbcache_insert(data->cache, node4); +g_assert_true(node == node4); + +node = rbcache_insert(data->cache, node5); +g_assert_true(node == node2); +rbcache_node_free(data->cache, node5); + +node = rbcache_insert(data->cache, node6); +g_assert_true(node == node4); +rbcache_node_free(data->cache, node6); + +node = rbcache_insert(data->cache, node7); +g_assert_true(node == node3); +rbcache_node_free(data->cache, node7); + +node = rbcache_insert(data->cache, node8); +g_assert_true(node == node2); +rbcache_node_free(data->cache, node8); +} + +static void test_rbcache_search(TestRBCacheData *data, const void *ctx) +{ +RBCacheNode *node; + +test_rbcache_insert(data, ctx); + +node = rbcache_search(data->cache, OFFSET1, SIZE1); +g_assert_nonnull(node); +g_assert_cmpuint(node->offset, ==, OFFSET1); +g_assert_cmpuint(node->bytes, ==, SIZE1); + +node = rbcache_search(data->cache, OFFSET2 + KB(1), SIZE2); +g_assert_nonnull(node); +g_assert_cmpuint(node->offset, ==, OFFSET2); +g_assert_cmpuint(node->bytes, ==, SIZE2); + +node = rbcache_search(data->cache, OFFSET8, SIZE8); +g_assert_nonnull(node); +g_assert_cmpuint(node->offset, ==, OFFSET2); +g_assert_cmpuint(node->bytes, ==, SIZE2); + +node = rbcache_sea
[Qemu-block] [PATCH v2 00/18] I/O prefetch cache
The prefetch cache aims to improve the performance of sequential read data. Of most interest here are the requests of a small size of data for sequential read, such requests can be optimized by extending them and moving into the prefetch cache. However, there are 2 issues: - In aggregate only a small portion of requests is sequential, so delays caused by the need to read more volumes of data will lead to an overall decrease in performance. - The presence of redundant data in the cache memory with a large number of random requests. This pcache implementation solves the above and other problems prefetching data. The pcache algorithm can be summarised by the following main steps. 1. Monitor I/O requests to identify typical sequences. This implementation of prefetch cache works at the storage system level and has information only about the physical block addresses of I/O requests. Statistics are collected only from read requests to a maximum size of 64kb(by default), each request that matches the criteria falls into a pool of requests. In order to store request statistics used by the rb-tree, it's simple but for this issue a quite efficient data structure. 2. Identifying sequential I/O streams. For each read request to be carried out attempting to lift the chain sequence from the tree statistics, where this request will be element of a sequential chain of requests. The key to search for consecutive requests is the area of bytes preceding the current request. The size of this area should not be too small to avoid false readahead. The sequential stream data requests can be identified even when a large number of random requests. For example, if there is access to the blocks 100, 1157, 27520, 4, 101, 312, 1337, 102, in the context of request processing 102 will be identified the chain of sequential requests 100, 101. 102 and then should a decision be made to do readahead. Also a situation may arise when multiple applications A, B, C simultaneously perform sequential read of data. For each separate application that will be sequential read data A(100, 101, 102), B(300, 301, 302), C(700, 701, 702), but for block devices it may look like a random data reading: 100,300,700,101,301,701,102,302,702. In this case, the sequential streams will also be recognised because location requests in the rb-tree will allow to separate the sequential I/O streams. 3. Do the readahead into the cache for recognized sequential data streams. After the issue of the detection of pcache case was resolved, need using larger requests to bring data into the cache. In this implementation the pcache used readahead instead of the extension request, therefore the request goes as is. There is not any reason to put data in the cache that will never be picked up, but this will always happen in the case of extension requests. In order to store areas of cached blocks is also used the rb-tree, it's simple but for this issue a quite efficient data structure. 4. Control size of the prefetch cache pool and the requests statistic pool For control the border of the pool statistic of requests, the data of requests are placed and replaced according to the FIFO principle, everything is simple. For control the boundaries of the memory cache used LRU list, it allows to limit the max amount memory that we can allocate for pcache. But the LRU is there mainly to prevent displacement of the cache blocks that was read partially. The main way the memory is pushed out immediately after use, as soon as a chunk of memory from the cache has been completely read, since the probability of repetition of the request is very low. Cases when one and the same portion of the cache memory has been read several times are not optimized and do not apply to the cases that can optimize the pcache. Thus, using a cache memory of small volume, by the optimization of the operations read-ahead and clear memory, we can read entire volumes of data, providing a 100% cache hit. Also does not decrease the effectiveness of random read requests. PCache is implemented as a qemu block filter driver, has some configurable parameters, such as: total cache size, statistics size, readahead size, maximum size of block that can be processed. For performance evaluation has been used several test cases with different sequential and random read data on SSD disk. Here are the results of tests and qemu parameters: qemu parameters: -machine pc,accel=kvm,usb=off,vmport=off -m 1024 -smp 8 -drive file=/img/harddisk.hdd,if=none,cache=none,id=drive-scsi0-0-0-0,aio=native -drive driver=pcache,image=drive-scsi0-0-0-0,if=virtio * * Testcase* Results in iops * * *** * * clean qemu *pcache* * * Create/open 16 file(s) of total *
[Qemu-block] [PATCH v2 01/18] block/pcache: empty pcache driver filter
The basic version of pcache driver for easy preparation of a patch set. Signed-off-by: Pavel Butsykin --- block/Makefile.objs | 1 + block/pcache.c | 102 2 files changed, 103 insertions(+) create mode 100644 block/pcache.c diff --git a/block/Makefile.objs b/block/Makefile.objs index 67a036a1df..0a024f4e66 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -4,6 +4,7 @@ block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o block-obj-y += quorum.o +block-obj-y += pcache.o block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o diff --git a/block/pcache.c b/block/pcache.c new file mode 100644 index 00..7a67618408 --- /dev/null +++ b/block/pcache.c @@ -0,0 +1,102 @@ +/* + * Prefetch cache driver filter + * + * Copyright (C) 2015-2016 Parallels IP Holdings GmbH. + * + * Author: Pavel Butsykin + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "block/block_int.h" +#include "qapi/error.h" +#include "qapi/qmp/qstring.h" + + +static QemuOptsList runtime_opts = { +.name = "pcache", +.head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), +.desc = { +{ /* end of list */ } +}, +}; + +static coroutine_fn int pcache_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) +{ +return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); +} + +static coroutine_fn int pcache_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) +{ +return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); +} + +static int pcache_file_open(BlockDriverState *bs, QDict *options, int flags, +Error **errp) +{ +QemuOpts *opts; +Error *local_err = NULL; +int ret = 0; + +opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); +qemu_opts_absorb_qdict(opts, options, &local_err); +if (local_err) { +error_propagate(errp, local_err); +ret = -EINVAL; +goto fail; +} + +assert(bs->file == NULL); +bs->file = bdrv_open_child(NULL, options, "image", bs, &child_format, + false, &local_err); +if (local_err) { +ret = -EINVAL; +error_propagate(errp, local_err); +} +fail: +qemu_opts_del(opts); +return ret; +} + +static void pcache_close(BlockDriverState *bs) +{ +} + +static int64_t pcache_getlength(BlockDriverState *bs) +{ +return bdrv_getlength(bs->file->bs); +} + +static bool pcache_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ +return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate); +} + +static BlockDriver bdrv_pcache = { +.format_name= "pcache", +.instance_size = 0, + +.bdrv_file_open = pcache_file_open, +.bdrv_close = pcache_close, +.bdrv_getlength = pcache_getlength, + +.bdrv_co_preadv = pcache_co_preadv, +.bdrv_co_pwritev= pcache_co_pwritev, + +.is_filter = true, +.bdrv_recurse_is_first_non_filter = pcache_recurse_is_first_non_filter, +}; + +static void bdrv_cache_init(void) +{ +bdrv_register(&bdrv_pcache); +} + +block_init(bdrv_cache_init); -- 2.11.0
[Qemu-block] [PATCH v2 09/18] block/pcache: skip readahead for unallocated clusters
Typically, data for unallocated clusters is filled with zeros, so it makes no sense to store it in the cache. Signed-off-by: Pavel Butsykin --- block/pcache.c | 26 ++ 1 file changed, 26 insertions(+) diff --git a/block/pcache.c b/block/pcache.c index 57eebd434a..f30e9e7bfe 100644 --- a/block/pcache.c +++ b/block/pcache.c @@ -174,6 +174,28 @@ static RBCacheNode *pcache_node_alloc(uint64_t offset, uint64_t bytes, return &node->common; } +static bool check_allocated_clusters(BlockDriverState *bs, uint64_t offset, + uint64_t bytes) +{ +int64_t sector_num = offset >> BDRV_SECTOR_BITS; +int32_t nb_sectors = bytes >> BDRV_SECTOR_BITS; + +assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); +assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + +do { +int num, ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &num); +if (ret <= 0) { +return false; +} +sector_num += num; +nb_sectors -= num; + +} while (nb_sectors); + +return true; +} + #define PCACHE_STEPS_FORWARD 2 static PCacheNode *get_readahead_node(BlockDriverState *bs, RBCache *rbcache, @@ -193,6 +215,10 @@ static PCacheNode *get_readahead_node(BlockDriverState *bs, RBCache *rbcache, break; } +if (!check_allocated_clusters(bs, offset, bytes)) { +break; +} + node = rbcache_search_and_insert(rbcache, offset, bytes); if (node->status == NODE_STATUS_NEW) { return node; -- 2.11.0
[Qemu-block] [PATCH v2 13/18] block/pcache: write through
Write-through is another way to keep the cache up-to-date. Even if this case will be rare, write to the cache is more optimal than drop-cache. Signed-off-by: Pavel Butsykin --- block/pcache.c | 26 ++ 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/block/pcache.c b/block/pcache.c index c1cbfa7040..140f90d6d7 100644 --- a/block/pcache.c +++ b/block/pcache.c @@ -410,8 +410,26 @@ skip_large_request: return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } -static void pcache_invalidation(BlockDriverState *bs, uint64_t offset, -uint64_t bytes) +static void write_cache_data(PCacheNode *node, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov) +{ +uint64_t qiov_offs = 0, node_offs = 0; +uint64_t size; +uint64_t copy; + +if (offset < node->common.offset) { +qiov_offs = node->common.offset - offset; +} else { +node_offs = offset - node->common.offset; +} +size = ranges_overlap_size(offset, bytes, node->common.offset, + node->common.bytes); +copy = qemu_iovec_to_buf(qiov, qiov_offs, node->data + node_offs, size); +assert(copy == size); +} + +static void pcache_write_through(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov) { BDRVPCacheState *s = bs->opaque; uint64_t chunk_offset = offset, chunk_bytes = bytes; @@ -429,7 +447,7 @@ static void pcache_invalidation(BlockDriverState *bs, uint64_t offset, chunk_bytes = end_offs - chunk_offset; if (node->status & NODE_STATUS_COMPLETED) { -rbcache_remove(s->cache, &node->common); +write_cache_data(node, offset, bytes, qiov); } } while (end_offs > chunk_offset); } @@ -442,7 +460,7 @@ static coroutine_fn int pcache_co_pwritev(BlockDriverState *bs, uint64_t offset, if (ret < 0) { return ret; } -pcache_invalidation(bs, offset, bytes); +pcache_write_through(bs, offset, bytes, qiov); return ret; } -- 2.11.0