Am 28.10.2013 14:04, schrieb Benoît Canet: > Le Friday 04 Oct 2013 à 16:48:12 (+0200), Max Reitz a écrit : >> On 2013-10-02 14:39, Benoît Canet wrote: >>> Use gnutls's SHA-256 to compare versions. >> Wouldn't CRC32 suffice? (I don't really oppose using SHA, but taking >> in gnutls as a dependency just for comparing several memory areas >> seems a bit much to me) >> >>> Signed-off-by: Benoit Canet <ben...@irqsave.net> >>> --- >>> block/Makefile.objs | 2 +- >>> block/quorum.c | 321 >>> +++++++++++++++++++++++++++++++++++++++++++++- >>> configure | 36 ++++++ >>> include/monitor/monitor.h | 2 + >>> monitor.c | 2 + >>> 5 files changed, 361 insertions(+), 2 deletions(-) >>> >>> diff --git a/block/Makefile.objs b/block/Makefile.objs >>> index 05a65c2..adcdc21 100644 >>> --- a/block/Makefile.objs >>> +++ b/block/Makefile.objs >>> @@ -3,7 +3,7 @@ block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o >>> qcow2-snapshot.o qcow2-c >>> block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o >>> block-obj-y += qed-check.o >>> block-obj-y += vhdx.o >>> -block-obj-y += quorum.o >>> +block-obj-$(CONFIG_QUORUM) += quorum.o >>> block-obj-y += parallels.o blkdebug.o blkverify.o >>> block-obj-y += snapshot.o qapi.o >>> block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o >>> diff --git a/block/quorum.c b/block/quorum.c >>> index f0fc0e9..e235ac1 100644 >>> --- a/block/quorum.c >>> +++ b/block/quorum.c >>> @@ -13,7 +13,43 @@ >>> * See the COPYING file in the top-level directory. >>> */ >>> +#include <gnutls/gnutls.h> >>> +#include <gnutls/crypto.h> >>> #include "block/block_int.h" >>> +#include "qapi/qmp/qjson.h" >>> + >>> +#define HASH_LENGTH 32 >>> + >>> +/* This union hold a vote hash value */ >> *holds >> >>> +typedef union QuorumVoteValue { >>> + char h[HASH_LENGTH]; /* SHA-256 hash */ >>> + int64_t l; /* simpler 64 bits hash */ >>> +} QuorumVoteValue; >>> + >>> +/* A vote item */ >>> +typedef struct QuorumVoteItem { >>> + int index; >>> + QLIST_ENTRY(QuorumVoteItem) next; >>> +} QuorumVoteItem; >>> + >>> +/* this structure is a vote version. A version is the set of vote sharing >>> the >> *set of votes >> >>> + * same vote value. >>> + * The set of vote will be tracked with the items field and it's count is >> *set of votes or *vote set; also s/it's count/its cardinality/ or >> something like that >> >>> + * vote_count. >>> + */ >>> +typedef struct QuorumVoteVersion { >>> + QuorumVoteValue value; >>> + int index; >>> + int vote_count; >>> + QLIST_HEAD(, QuorumVoteItem) items; >>> + QLIST_ENTRY(QuorumVoteVersion) next; >>> +} QuorumVoteVersion; >>> + >>> +/* this structure hold a group of vote versions together */ >> *holds >> >>> +typedef struct QuorumVotes { >>> + QLIST_HEAD(, QuorumVoteVersion) vote_list; >>> + int (*compare)(QuorumVoteValue *a, QuorumVoteValue *b); >>> +} QuorumVotes; >>> /* the following structure hold the state of one quorum instance */ >>> typedef struct { >>> @@ -60,10 +96,14 @@ struct QuorumAIOCB { >>> int success_count; /* number of successfully completed AIOCB >>> */ >>> bool *finished; /* completion signal for cancel */ >>> + QuorumVotes votes; >>> + >>> bool is_read; >>> int vote_ret; >>> }; >>> +static void quorum_vote(QuorumAIOCB *acb); >>> + >>> static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) >>> { >>> QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common); >>> @@ -111,6 +151,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) >>> acb->aios[i].ret = 0; >>> } >>> + if (acb->vote_ret) { >>> + ret = acb->vote_ret; >>> + } >>> + >>> acb->common.cb(acb->common.opaque, ret); >>> if (acb->finished) { >>> *acb->finished = true; >>> @@ -122,6 +166,11 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) >>> qemu_aio_release(acb); >>> } >>> +static int quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b) >>> +{ >>> + return memcmp(a->h, b->h, HASH_LENGTH); >>> +} >>> + >>> static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, >>> BlockDriverState *bs, >>> QEMUIOVector *qiov, >>> @@ -141,6 +190,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, >>> acb->count = 0; >>> acb->success_count = 0; >>> acb->finished = NULL; >>> + acb->votes.compare = quorum_sha256_compare; >>> acb->is_read = false; >>> acb->vote_ret = 0; >>> @@ -170,9 +220,278 @@ static void quorum_aio_cb(void *opaque, int ret) >>> return; >>> } >>> + /* Do the vote on read */ >>> + if (acb->is_read) { >>> + quorum_vote(acb); >>> + } >>> + >>> quorum_aio_finalize(acb); >>> } >>> +static void quorum_report_bad(QuorumAIOCB *acb, int index) >>> +{ >>> + QObject *data; >>> + data = qobject_from_jsonf("{ 'children-index': %i" >> I'd prefer child-index. Generally, remember that the singular of >> "children" is "child". >> >>> + ", 'sector-num': %" PRId64 >>> + ", 'sectors-count': %i }", >>> + index, >>> + acb->sector_num, >>> + acb->nb_sectors); >>> + monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data); >> How about decrementing the refcount of data? >> >>> +} >>> + >>> +static void quorum_report_failure(QuorumAIOCB *acb) >>> +{ >>> + QObject *data; >>> + data = qobject_from_jsonf("{ 'sector-num': %" PRId64 >>> + ", 'sectors-count': %i }", >>> + acb->sector_num, >>> + acb->nb_sectors); >>> + monitor_protocol_event(QEVENT_QUORUM_FAILURE, data); >> Same here. >> >>> +} >>> + >>> +static void quorum_report_bad_versions(QuorumAIOCB *acb, >>> + QuorumVoteValue *value) >>> +{ >>> + QuorumVoteVersion *version; >>> + QuorumVoteItem *item; >>> + >>> + QLIST_FOREACH(version, &acb->votes.vote_list, next) { >>> + if (!acb->votes.compare(&version->value, value)) { >>> + continue; >>> + } >>> + QLIST_FOREACH(item, &version->items, next) { >>> + quorum_report_bad(acb, item->index); >>> + } >>> + } >>> +} >>> + >>> +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) >>> +{ >>> + int i; >>> + assert(dest->niov == source->niov); >>> + assert(dest->size == source->size); >>> + for (i = 0; i < source->niov; i++) { >>> + assert(dest->iov[i].iov_len == source->iov[i].iov_len); >>> + memcpy(dest->iov[i].iov_base, >>> + source->iov[i].iov_base, >>> + source->iov[i].iov_len); >>> + } >>> +} >>> + >>> +static void quorum_count_vote(QuorumVotes *votes, >>> + QuorumVoteValue *value, >>> + int index) >>> +{ >>> + QuorumVoteVersion *v = NULL, *version = NULL; >>> + QuorumVoteItem *item; >>> + >>> + /* look if we have something with this hash */ >>> + QLIST_FOREACH(v, &votes->vote_list, next) { >>> + if (!votes->compare(&v->value, value)) { >>> + version = v; >>> + break; >>> + } >>> + } >>> + >>> + /* It's a version not yet in the list add it */ >>> + if (!version) { >>> + version = g_new0(QuorumVoteVersion, 1); >>> + QLIST_INIT(&version->items); >>> + memcpy(&version->value, value, sizeof(version->value)); >>> + version->index = index; >>> + version->vote_count = 0; >>> + QLIST_INSERT_HEAD(&votes->vote_list, version, next); >>> + } >>> + >>> + version->vote_count++; >>> + >>> + item = g_new0(QuorumVoteItem, 1); >>> + item->index = index; >>> + QLIST_INSERT_HEAD(&version->items, item, next); >>> +} >>> + >>> +static void quorum_free_vote_list(QuorumVotes *votes) >>> +{ >>> + QuorumVoteVersion *version, *next_version; >>> + QuorumVoteItem *item, *next_item; >>> + >>> + QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) { >>> + QLIST_REMOVE(version, next); >>> + QLIST_FOREACH_SAFE(item, &version->items, next, next_item) { >>> + QLIST_REMOVE(item, next); >>> + g_free(item); >>> + } >>> + g_free(version); >>> + } >>> +} >>> + >>> +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue >>> *hash) >>> +{ >>> + int j, ret; >>> + gnutls_hash_hd_t dig; >>> + QEMUIOVector *qiov = &acb->aios[i].qiov; >>> + >>> + ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256); >>> + >>> + if (ret < 0) { >>> + return ret; >>> + } >>> + >>> + for (j = 0; j < qiov->niov; j++) { >>> + ret = gnutls_hash(dig, qiov->iov[j].iov_base, >>> qiov->iov[j].iov_len); >>> + if (ret < 0) { >>> + return ret; >> gnutls_hash_deinit? >> >>> + } >>> + } >>> + >>> + gnutls_hash_deinit(dig, (void *) hash); >>> + >>> + return 0; >>> +} >>> + >>> +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) >>> +{ >>> + int i = 0; >>> + QuorumVoteVersion *candidate, *winner = NULL; >>> + >>> + QLIST_FOREACH(candidate, &votes->vote_list, next) { >>> + if (candidate->vote_count > i) { >>> + i = candidate->vote_count; >>> + winner = candidate; >>> + } >>> + } >>> + >>> + return winner; >>> +} >>> + >>> +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) >>> +{ >>> + int i; >>> + int result; >>> + >>> + assert(a->niov == b->niov); >>> + for (i = 0; i < a->niov; i++) { >>> + assert(a->iov[i].iov_len == b->iov[i].iov_len); >>> + result = memcmp(a->iov[i].iov_base, >>> + b->iov[i].iov_base, >>> + a->iov[i].iov_len); >>> + if (result) { >>> + return false; >>> + } >>> + } >>> + >>> + return true; >>> +} >>> + >>> +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, >>> + const char *fmt, ...) >>> +{ >>> + va_list ap; >>> + >>> + va_start(ap, fmt); >>> + fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ", >>> + acb->sector_num, acb->nb_sectors); >>> + vfprintf(stderr, fmt, ap); >>> + fprintf(stderr, "\n"); >>> + va_end(ap); >>> + exit(1); >>> +} >>> + >>> +static bool quorum_compare(QuorumAIOCB *acb, >>> + QEMUIOVector *a, >>> + QEMUIOVector *b) >>> +{ >>> + BDRVQuorumState *s = acb->bqs; >>> + ssize_t offset; >>> + >>> + /* This driver will replace blkverify in this particular case */ >>> + if (s->is_blkverify) { >>> + offset = qemu_iovec_compare(a, b); >>> + if (offset != -1) { >>> + quorum_err(acb, "contents mismatch in sector %" PRId64, >>> + acb->sector_num + >>> + (uint64_t)(offset / BDRV_SECTOR_SIZE)); >>> + } >>> + return true; >>> + } >>> + >>> + return quorum_iovec_compare(a, b); >>> +} >>> + >>> +static void quorum_vote(QuorumAIOCB *acb) >>> +{ >>> + bool quorum = true; >>> + int i, j, ret; >>> + QuorumVoteValue hash; >>> + BDRVQuorumState *s = acb->bqs; >>> + QuorumVoteVersion *winner; >>> + >>> + /* get the index of the first successful read */ >>> + for (i = 0; i < s->total; i++) { >>> + if (!acb->aios[i].ret) { >>> + break; >>> + } >>> + } >>> + >>> + /* compare this read with all other successful read looking for quorum >>> */ >>> + for (j = i + 1; j < s->total; j++) { >>> + if (acb->aios[j].ret) { >>> + continue; >>> + } >>> + quorum = quorum_compare(acb, &acb->aios[i].qiov, >>> &acb->aios[j].qiov); >>> + if (!quorum) { >>> + break; >>> + } >>> + } >>> + >>> + /* Every successful read agrees -> Quorum */ >>> + if (quorum) { >>> + quorum_copy_qiov(acb->qiov, &acb->aios[i].qiov); >> If no reads were successful at all, quorum will be true and i will >> be s->total. Therefore, this will result in an array access with an >> index out of bounds here. > Thanks a lot for spotting this. > >> Generally, why is this function executed at all if any read failed? >> If a read fails, the quorum read function will return an error, thus >> the caller will ignore the result anyway. > I should if there is a quorum between the successfull reads; if all the > success > ful reads agrees and their count is greater than threshold.
Ah, I see it now; quorum_aio_readv() returns "success" if more reads than the threshould succeeded (and return the same result). I thought it would fail as soon as just a single read operation failed – sorry, my fault. Max