The current object repair strategy of "dog vdi check" doesn't work well
if objects are corrupted. This patch adds a majority voting mechanism
to the command.

Signed-off-by: Hitoshi Mitake <[email protected]>
---
v3: use _random() for test 077

v2: determine majority based on the number of live copies

 dog/vdi.c |   84 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 72 insertions(+), 12 deletions(-)

diff --git a/dog/vdi.c b/dog/vdi.c
index d9a9a0f..124fef4 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -1434,6 +1434,12 @@ struct vdi_check_work {
        struct work work;
 };
 
+enum vdi_check_result {
+       VDI_CHECK_NO_OBJ_FOUND,         /* no copy had object_found set */
+       VDI_CHECK_NO_MAJORITY_FOUND,    /* live copies found, vote failed */
+       VDI_CHECK_SUCCESS,              /* a majority copy was selected */
+};
+
 struct vdi_check_info {
        uint64_t oid;
        uint8_t nr_copies;
@@ -1442,7 +1448,8 @@ struct vdi_check_info {
        uint64_t *done;
        int refcnt;
        struct work_queue *wq;
-       struct vdi_check_work *base;
+       enum vdi_check_result result;
+       struct vdi_check_work *majority;
        struct vdi_check_work vcw[0];
 };
 
@@ -1462,7 +1469,7 @@ static void vdi_repair_work(struct work *work)
        struct vdi_check_info *info = vcw->info;
        void *buf;
 
-       buf = read_object_from(info->base->vnode, info->oid);
+       buf = read_object_from(info->majority->vnode, info->oid);
        write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found, 0);
        free(buf);
 }
@@ -1511,10 +1518,8 @@ static void vdi_check_object_work(struct work *work)
        switch (rsp->result) {
        case SD_RES_SUCCESS:
                vcw->object_found = true;
-               if (!is_erasure_oid(info->oid, info->copy_policy)) {
+               if (!is_erasure_oid(info->oid, info->copy_policy))
                        memcpy(vcw->hash, rsp->hash.digest, sizeof(vcw->hash));
-                       uatomic_set(&info->base, vcw);
-               }
                break;
        case SD_RES_NO_OBJ:
                vcw->object_found = false;
@@ -1530,18 +1535,31 @@ static void vdi_check_object_work(struct work *work)
 
 static void check_replicatoin_object(struct vdi_check_info *info)
 {
-       if (info->base == NULL) {
-               sd_err("no node has %" PRIx64, info->oid);
-               exit(EXIT_FAILURE);
+       if (info->majority == NULL) {
+               switch (info->result) {
+               case VDI_CHECK_NO_OBJ_FOUND:
+                       sd_err("no node has %" PRIx64, info->oid);
+                       break;
+               case VDI_CHECK_NO_MAJORITY_FOUND:
+                       sd_err("no majority of %" PRIx64, info->oid);
+                       break;
+               default:
+                       sd_err("unknown result of vdi check: %d", info->result);
+                       exit(EXIT_FAILURE);
+                       break;
+               }
+
+               /* do nothing */
+               return;
        }
 
        for (int i = 0; i < info->nr_copies; i++) {
-               if (&info->vcw[i] == info->base)
+               if (&info->vcw[i] == info->majority)
                        continue;
                /* need repair when object not found or consistency broken */
                if (!info->vcw[i].object_found ||
-                   memcmp(info->base->hash, info->vcw[i].hash,
-                          sizeof(info->base->hash)) != 0) {
+                   memcmp(info->majority->hash, info->vcw[i].hash,
+                          sizeof(info->majority->hash)) != 0) {
                        info->vcw[i].work.fn = vdi_repair_work;
                        info->vcw[i].work.done = vdi_repair_main;
                        info->refcnt++;
@@ -1615,6 +1633,46 @@ out:
        ec_destroy(ctx);
 }
 
+static void vote_majority_object(struct vdi_check_info *info)
+{
+       /*
+        * Boyer-Moore vote (http://www.cs.utexas.edu/~moore/best-ideas/mjrty/)
+        * only nominates a candidate; its counter is not an occurrence count,
+        * so recount it and require a strict majority of the live copies.
+        */
+       int count = 0, nr_live_copies = 0, nr_votes = 0;
+       struct vdi_check_work *majority = NULL;
+
+       for (int i = 0; i < info->nr_copies; i++) {
+               struct vdi_check_work *vcw = &info->vcw[i];
+
+               if (!vcw->object_found)
+                       continue;
+               nr_live_copies++;
+               if (!count)
+                       majority = vcw;
+               if (!memcmp(majority->hash, vcw->hash, sizeof(vcw->hash)))
+                       count++;
+               else
+                       count--;
+       }
+
+       for (int i = 0; majority && i < info->nr_copies; i++)
+               if (info->vcw[i].object_found && !memcmp(majority->hash,
+                               info->vcw[i].hash, sizeof(majority->hash)))
+                       nr_votes++;
+
+       if (!majority)
+               info->result = VDI_CHECK_NO_OBJ_FOUND;
+       else if (nr_votes * 2 <= nr_live_copies) {
+               majority = NULL;        /* candidate lacks a strict majority */
+               info->result = VDI_CHECK_NO_MAJORITY_FOUND;
+       } else
+               info->result = VDI_CHECK_SUCCESS;
+
+       info->majority = majority;
+}
+
 static void vdi_check_object_main(struct work *work)
 {
        struct vdi_check_work *vcw = container_of(work, struct vdi_check_work,
@@ -1627,8 +1685,10 @@ static void vdi_check_object_main(struct work *work)
 
        if (is_erasure_oid(info->oid, info->copy_policy))
                check_erasure_object(info);
-       else
+       else {
+               vote_majority_object(info);
                check_replicatoin_object(info);
+       }
 
        if (info->refcnt == 0)
                free_vdi_check_info(info);
-- 
1.7.10.4

-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to