The current object repair strategy of "dog vdi check" doesn't work well when objects are corrupted. This patch adds a majority-voting mechanism to the command.
Signed-off-by: Hitoshi Mitake <[email protected]> --- v3: use _random() for test 077 v2: determine majority based on a number of live copies dog/vdi.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 72 insertions(+), 12 deletions(-) diff --git a/dog/vdi.c b/dog/vdi.c index d9a9a0f..124fef4 100644 --- a/dog/vdi.c +++ b/dog/vdi.c @@ -1434,6 +1434,12 @@ struct vdi_check_work { struct work work; }; +enum vdi_check_result { + VDI_CHECK_NO_OBJ_FOUND, + VDI_CHECK_NO_MAJORITY_FOUND, + VDI_CHECK_SUCCESS, +}; + struct vdi_check_info { uint64_t oid; uint8_t nr_copies; @@ -1442,7 +1448,8 @@ struct vdi_check_info { uint64_t *done; int refcnt; struct work_queue *wq; - struct vdi_check_work *base; + enum vdi_check_result result; + struct vdi_check_work *majority; struct vdi_check_work vcw[0]; }; @@ -1462,7 +1469,7 @@ static void vdi_repair_work(struct work *work) struct vdi_check_info *info = vcw->info; void *buf; - buf = read_object_from(info->base->vnode, info->oid); + buf = read_object_from(info->majority->vnode, info->oid); write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0); free(buf); } @@ -1511,10 +1518,8 @@ static void vdi_check_object_work(struct work *work) switch (rsp->result) { case SD_RES_SUCCESS: vcw->object_found = true; - if (!is_erasure_oid(info->oid, info->copy_policy)) { + if (!is_erasure_oid(info->oid, info->copy_policy)) memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash)); - uatomic_set(&info->base, vcw); - } break; case SD_RES_NO_OBJ: vcw->object_found = false; @@ -1530,18 +1535,31 @@ static void vdi_check_object_work(struct work *work) static void check_replicatoin_object(struct vdi_check_info *info) { - if (info->base == NULL) { - sd_err("no node has %" PRIx64, info->oid); - exit(EXIT_FAILURE); + if (info->majority == NULL) { + switch (info->result) { + case VDI_CHECK_NO_OBJ_FOUND: + sd_err("no node has %" PRIx64, info->oid); + break; + case VDI_CHECK_NO_MAJORITY_FOUND: + sd_err("no majority of %" 
PRIx64, info->oid); + break; + default: + sd_err("unknown result of vdi check: %d", info->result); + exit(EXIT_FAILURE); + break; + } + + /* do nothing */ + return; } for (int i = 0; i < info->nr_copies; i++) { - if (&info->vcw[i] == info->base) + if (&info->vcw[i] == info->majority) continue; /* need repair when object not found or consistency broken */ if (!info->vcw[i].object_found || - memcmp(info->base->hash, info->vcw[i].hash, - sizeof(info->base->hash)) != 0) { + memcmp(info->majority->hash, info->vcw[i].hash, + sizeof(info->majority->hash)) != 0) { info->vcw[i].work.fn = vdi_repair_work; info->vcw[i].work.done = vdi_repair_main; info->refcnt++; @@ -1615,6 +1633,46 @@ out: ec_destroy(ctx); } +static void vote_majority_object(struct vdi_check_info *info) +{ + /* + * Voting majority object from existing ones. + * + * The linear majority vote algorithm by Boyer and Moore is used: + * http://www.cs.utexas.edu/~moore/best-ideas/mjrty/ + */ + + int count = 0, nr_live_copies = 0; + struct vdi_check_work *majority = NULL; + + for (int i = 0; i < info->nr_copies; i++) { + struct vdi_check_work *vcw = &info->vcw[i]; + + if (!vcw->object_found) + continue; + nr_live_copies++; + + if (!count) + majority = vcw; + + if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash))) + count++; + else + count--; + } + + if (!majority) + info->result = VDI_CHECK_NO_OBJ_FOUND; + else if (count < nr_live_copies / 2) { + /* no majority found */ + majority = NULL; + info->result = VDI_CHECK_NO_MAJORITY_FOUND; + } else + info->result = VDI_CHECK_SUCCESS; + + info->majority = majority; +} + static void vdi_check_object_main(struct work *work) { struct vdi_check_work *vcw = container_of(work, struct vdi_check_work, @@ -1627,8 +1685,10 @@ static void vdi_check_object_main(struct work *work) if (is_erasure_oid(info->oid, info->copy_policy)) check_erasure_object(info); - else + else { + vote_majority_object(info); check_replicatoin_object(info); + } if (info->refcnt == 0) 
free_vdi_check_info(info); -- 1.7.10.4 -- sheepdog mailing list [email protected] http://lists.wpkg.org/mailman/listinfo/sheepdog
