The Thursday 17 Jul 2014 à 13:18:56 (+0800), Liu Yuan wrote : > This patch adds single read pattern to quorum driver and quorum vote is > default > pattern. > > For now we do a quorum vote on all the reads, it is designed for unreliable > underlying storage such as non-redundant NFS to make sure data integrity at > the > cost of the read performance. > > For some use cases as following: > > VM > -------------- > | | > v v > A B > > Both A and B has hardware raid storage to justify the data integrity on its > own. > So it would help performance if we do a single read instead of on all the > nodes. > Further, if we run VM on either of the storage node, we can make a local read > request for better performance. > > This patch generalize the above 2 nodes case in the N nodes. That is, > > vm -> write to all the N nodes, read just one of them. If single read fails, > we > try to read next node in FIFO order specified by the startup command. > > The 2 nodes case is very similar to DRBD[1] though lack of auto-sync > functionality in the single device/node failure for now. But compared with > DRBD > we still have some advantages over it: > > - Suppose we have 20 VMs running on one(assume A) of 2 nodes' DRBD backed > storage. And if A crashes, we need to restart all the VMs on node B. But for > practice case, we can't because B might not have enough resources to setup 20 > VMs > at once. So if we run our 20 VMs with quorum driver, and scatter the > replicated > images over the data center, we can very likely restart 20 VMs without any > resource problem. > > After all, I think we can build a more powerful replicated image functionality > on quorum and block jobs(block mirror) to meet various High Availibility > needs. > > E.g, Enable single read pattern on 2 children, > > -drive driver=quorum,children.0.file.filename=0.qcow2,\ > children.1.file.filename=1.qcow2,read-pattern=fifo,vote-threshold=1 > > [1] http://en.wikipedia.org/wiki/Distributed_Replicated_Block_Device > > Cc: Benoit Canet <ben...@irqsave.net> > Cc: Eric Blake <ebl...@redhat.com> > Cc: Kevin Wolf <kw...@redhat.com> > Cc: Stefan Hajnoczi <stefa...@redhat.com> > Signed-off-by: Liu Yuan <namei.u...@gmail.com> > --- > block/quorum.c | 179 > +++++++++++++++++++++++++++++++++++++++++---------------- > 1 file changed, 131 insertions(+), 48 deletions(-) > > diff --git a/block/quorum.c b/block/quorum.c > index d5ee9c0..ebf5c71 100644 > --- a/block/quorum.c > +++ b/block/quorum.c > @@ -24,6 +24,7 @@ > #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" > #define QUORUM_OPT_BLKVERIFY "blkverify" > #define QUORUM_OPT_REWRITE "rewrite-corrupted" > +#define QUORUM_OPT_READ_PATTERN "read-pattern" > > /* This union holds a vote hash value */ > typedef union QuorumVoteValue { > @@ -74,6 +75,8 @@ typedef struct BDRVQuorumState { > bool rewrite_corrupted;/* true if the driver must rewrite-on-read > corrupted > * block if Quorum is reached. > */ > + > + QuorumReadPattern read_pattern; > } BDRVQuorumState; > > typedef struct QuorumAIOCB QuorumAIOCB; > @@ -117,6 +120,7 @@ struct QuorumAIOCB { > > bool is_read; > int vote_ret; > + int child_iter; /* which child to read in fifo pattern */ > }; > > static bool quorum_vote(QuorumAIOCB *acb); > @@ -154,8 +158,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) > > if (acb->is_read) { > for (i = 0; i < s->num_children; i++) { > - qemu_vfree(acb->qcrs[i].buf); > - qemu_iovec_destroy(&acb->qcrs[i].qiov); > + if (i <= acb->child_iter) { > + qemu_vfree(acb->qcrs[i].buf); > + qemu_iovec_destroy(&acb->qcrs[i].qiov); > + } > } > } > > @@ -256,6 +262,21 @@ static void quorum_rewrite_aio_cb(void *opaque, int ret) > quorum_aio_finalize(acb); > } > > +static BlockDriverAIOCB *read_fifo_child(QuorumAIOCB *acb); > + > +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) > +{ > + int i; > + assert(dest->niov == source->niov); > + assert(dest->size == source->size); > + for (i = 0; i < source->niov; i++) { > + assert(dest->iov[i].iov_len == source->iov[i].iov_len); > + memcpy(dest->iov[i].iov_base, > + source->iov[i].iov_base, > + source->iov[i].iov_len); > + } > +} > + > static void quorum_aio_cb(void *opaque, int ret) > { > QuorumChildRequest *sacb = opaque; > @@ -263,6 +284,21 @@ static void quorum_aio_cb(void *opaque, int ret) > BDRVQuorumState *s = acb->common.bs->opaque; > bool rewrite = false; > > + if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) { > + /* We try to read next child in FIFO order if we fail to read */ > + if (ret < 0 && ++acb->child_iter < s->num_children) { > + read_fifo_child(acb); > + return; > + } > + > + if (ret == 0) { > + quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qiov); > + } > + acb->vote_ret = ret; > + quorum_aio_finalize(acb); > + return; > + } > + > sacb->ret = ret; > acb->count++; > if (ret == 0) { > @@ -343,19 +379,6 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState > *s, QuorumAIOCB *acb, > return count; > } > > -static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) > -{ > - int i; > - assert(dest->niov == source->niov); > - assert(dest->size == source->size); > - for (i = 0; i < source->niov; i++) { > - assert(dest->iov[i].iov_len == source->iov[i].iov_len); > - memcpy(dest->iov[i].iov_base, > - source->iov[i].iov_base, > - source->iov[i].iov_len); > - } > -} > - > static void quorum_count_vote(QuorumVotes *votes, > QuorumVoteValue *value, > int index) > @@ -615,34 +638,62 @@ free_exit: > return rewrite; > } > > -static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, > - int64_t sector_num, > - QEMUIOVector *qiov, > - int nb_sectors, > - BlockDriverCompletionFunc *cb, > - void *opaque) > +static BlockDriverAIOCB *read_quorum_children(QuorumAIOCB *acb) > { > - BDRVQuorumState *s = bs->opaque; > - QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, > - nb_sectors, cb, opaque); > + BDRVQuorumState *s = acb->common.bs->opaque; > int i; > > - acb->is_read = true; > - > for (i = 0; i < s->num_children; i++) { > - acb->qcrs[i].buf = qemu_blockalign(s->bs[i], qiov->size); > - qemu_iovec_init(&acb->qcrs[i].qiov, qiov->niov); > - qemu_iovec_clone(&acb->qcrs[i].qiov, qiov, acb->qcrs[i].buf); > + acb->qcrs[i].buf = qemu_blockalign(s->bs[i], acb->qiov->size); > + qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov); > + qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf); > } > > for (i = 0; i < s->num_children; i++) { > - bdrv_aio_readv(s->bs[i], sector_num, &acb->qcrs[i].qiov, nb_sectors, > - quorum_aio_cb, &acb->qcrs[i]); > + bdrv_aio_readv(s->bs[i], acb->sector_num, &acb->qcrs[i].qiov, > + acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]); > } > > return &acb->common; > } > > +static BlockDriverAIOCB *read_fifo_child(QuorumAIOCB *acb) > +{ > + BDRVQuorumState *s = acb->common.bs->opaque; > + > + acb->qcrs[acb->child_iter].buf = qemu_blockalign(s->bs[acb->child_iter], > + acb->qiov->size); > + qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov); > + qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov, > + acb->qcrs[acb->child_iter].buf); > + bdrv_aio_readv(s->bs[acb->child_iter], acb->sector_num, > + &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, > + quorum_aio_cb, &acb->qcrs[acb->child_iter]); > + > + return &acb->common; > +} > + > +static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, > + int64_t sector_num, > + QEMUIOVector *qiov, > + int nb_sectors, > + BlockDriverCompletionFunc *cb, > + void *opaque) > +{ > + BDRVQuorumState *s = bs->opaque; > + QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, > + nb_sectors, cb, opaque); > + acb->is_read = true; > + > + if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) { > + acb->child_iter = s->num_children - 1; > + return read_quorum_children(acb); > + } > + > + acb->child_iter = 0; > + return read_fifo_child(acb); > +} > + > static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs, > int64_t sector_num, > QEMUIOVector *qiov, > @@ -782,10 +833,33 @@ static QemuOptsList quorum_runtime_opts = { > .type = QEMU_OPT_BOOL, > .help = "Rewrite corrupted block on read quorum", > }, > + { > + .name = QUORUM_OPT_READ_PATTERN, > + .type = QEMU_OPT_STRING, > + .help = "Allowed pattern: quorum, fifo. Quorum is default", > + }, > { /* end of list */ } > }, > }; > > +static int parse_read_pattern(const char *opt) > +{ > + int i; > + > + if (!opt) { > + /* Set quorum as default */ > + return QUORUM_READ_PATTERN_QUORUM; > + } > + > + for (i = 0; i < QUORUM_READ_PATTERN_MAX; i++) { > + if (!strcmp(opt, QuorumReadPattern_lookup[i])) { > + return i; > + } > + } > + > + return -EINVAL; > +} > + > static int quorum_open(BlockDriverState *bs, QDict *options, int flags, > Error **errp) > { > @@ -827,28 +901,37 @@ static int quorum_open(BlockDriverState *bs, QDict > *options, int flags, > } > > s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0); > - > - /* and validate it against s->num_children */ > - ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err); > + ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN)); > if (ret < 0) { > + error_setg(&local_err, "Please set read-pattern as fifo or > quorum\n"); > goto exit; > } > + s->read_pattern = ret; > > - /* is the driver in blkverify mode */ > - if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) && > - s->num_children == 2 && s->threshold == 2) { > - s->is_blkverify = true; > - } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) { > - fprintf(stderr, "blkverify mode is set by setting blkverify=on " > - "and using two files with vote_threshold=2\n"); > - } > + if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) { > + /* and validate it against s->num_children */ > + ret = quorum_valid_threshold(s->threshold, s->num_children, > &local_err); > + if (ret < 0) { > + goto exit; > + } > > - s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, > false); > - if (s->rewrite_corrupted && s->is_blkverify) { > - error_setg(&local_err, > - "rewrite-corrupted=on cannot be used with blkverify=on"); > - ret = -EINVAL; > - goto exit; > + /* is the driver in blkverify mode */ > + if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) && > + s->num_children == 2 && s->threshold == 2) { > + s->is_blkverify = true; > + } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) { > + fprintf(stderr, "blkverify mode is set by setting blkverify=on " > + "and using two files with vote_threshold=2\n"); > + } > + > + s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, > + false); > + if (s->rewrite_corrupted && s->is_blkverify) { > + error_setg(&local_err, > + "rewrite-corrupted=on cannot be used with > blkverify=on"); > + ret = -EINVAL; > + goto exit; > + } > } > > /* allocate the children BlockDriverState array */ > -- > 1.9.1 >
I think I responded you with comments signaling a potential memory leak but you missed it. Best regards Benoît