On Thu, Dec 12, 2013 at 02:11:43PM +0900, Hitoshi Mitake wrote: > At Tue, 10 Dec 2013 15:10:36 +0800, > Liu Yuan wrote: > > > > We make sure we write the exact number of copies to honor the promise of the > > redundancy for "strict mode". This means that after writing of targeted > > data, > > they are redundant as promised and can withstand the random node failures. > > > > For example, with a 4:2 policy, we need at least write to 6 nodes with data > > strip and parity strips. For non-strict mode, we allow to write successfully > > only if the data are written fully with 4 nodes alive. > > > > Signed-off-by: Liu Yuan <namei.u...@gmail.com> > > --- > > dog/cluster.c | 10 +++++++++- > > include/internal_proto.h | 2 ++ > > include/sheep.h | 3 ++- > > include/sheepdog_proto.h | 2 +- > > sheep/ops.c | 2 +- > > sheep/request.c | 34 +++++++++++++++++++++++++++++++--- > > 6 files changed, 46 insertions(+), 7 deletions(-) > > > > diff --git a/dog/cluster.c b/dog/cluster.c > > index 611c91d..43df232 100644 > > --- a/dog/cluster.c > > +++ b/dog/cluster.c > > @@ -21,6 +21,8 @@ static struct sd_option cluster_options[] = { > > {'b', "store", true, "specify backend store"}, > > {'c', "copies", true, "specify the default data redundancy (number of > > copies)"}, > > {'f', "force", false, "do not prompt for confirmation"}, > > + {'t', "strict", false, > > + "do not serve write request if number of nodes is not sufficient"}, > > {'s', "backend", false, "show backend store information"}, > > { 0, NULL, false, NULL }, > > }; > > @@ -30,6 +32,7 @@ static struct cluster_cmd_data { > > uint8_t copy_policy; > > bool force; > > bool show_store; > > + bool strict; > > char name[STORE_LEN]; > > } cluster_cmd_data; > > > > @@ -117,6 +120,8 @@ static int cluster_format(int argc, char **argv) > > pstrcpy(store_name, STORE_LEN, DEFAULT_STORE); > > hdr.data_length = strlen(store_name) + 1; > > hdr.flags |= SD_FLAG_CMD_WRITE; > > + if (cluster_cmd_data.strict) > > + hdr.cluster.flags |= 
SD_CLUSTER_FLAG_STRICT; > > > > printf("using backend %s store\n", store_name); > > ret = dog_exec_req(&sd_nid, &hdr, store_name); > > @@ -552,7 +557,7 @@ static int cluster_check(int argc, char **argv) > > static struct subcommand cluster_cmd[] = { > > {"info", NULL, "aprhs", "show cluster information", > > NULL, CMD_NEED_NODELIST, cluster_info, cluster_options}, > > - {"format", NULL, "bcaph", "create a Sheepdog store", > > + {"format", NULL, "bctaph", "create a Sheepdog store", > > NULL, CMD_NEED_NODELIST, cluster_format, cluster_options}, > > {"shutdown", NULL, "aph", "stop Sheepdog", > > NULL, 0, cluster_shutdown, cluster_options}, > > @@ -597,6 +602,9 @@ static int cluster_parser(int ch, const char *opt) > > case 's': > > cluster_cmd_data.show_store = true; > > break; > > + case 't': > > + cluster_cmd_data.strict = true; > > + break; > > } > > > > return 0; > > diff --git a/include/internal_proto.h b/include/internal_proto.h > > index b224c49..ac4e3f8 100644 > > --- a/include/internal_proto.h > > +++ b/include/internal_proto.h > > @@ -126,6 +126,8 @@ > > #define SD_RES_CLUSTER_ERROR 0x91 /* Cluster driver error */ > > #define SD_RES_OBJ_TAKEN 0x92 /* Object ID is taken up */ > > > > +#define SD_CLUSTER_FLAG_STRICT 0x0001 /* Strict mode for write */ > > + > > enum sd_status { > > SD_STATUS_OK = 1, > > SD_STATUS_WAIT, > > diff --git a/include/sheep.h b/include/sheep.h > > index 293e057..d460d54 100644 > > --- a/include/sheep.h > > +++ b/include/sheep.h > > @@ -160,7 +160,8 @@ static inline const char *sd_strerror(int err) > > [SD_RES_WAIT_FOR_FORMAT] = "Waiting for cluster to be > > formatted", > > [SD_RES_WAIT_FOR_JOIN] = "Waiting for other nodes to join > > cluster", > > [SD_RES_JOIN_FAILED] = "Node has failed to join cluster", > > - [SD_RES_HALT] = "IO has halted as there are no living nodes", > > + [SD_RES_HALT] = > > + "IO has halted as there are not enough living nodes", > > [SD_RES_READONLY] = "Object is read-only", > > > > /* from internal_proto.h */ > 
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h > > index cb47e3f..366499e 100644 > > --- a/include/sheepdog_proto.h > > +++ b/include/sheepdog_proto.h > > @@ -156,7 +156,7 @@ struct sd_req { > > uint64_t ctime; > > uint8_t copies; > > uint8_t copy_policy; > > - uint8_t reserved[2]; > > + uint16_t flags; > > uint32_t tag; > > } cluster; > > struct { > > diff --git a/sheep/ops.c b/sheep/ops.c > > index 75a2565..1e9bc1e 100644 > > --- a/sheep/ops.c > > +++ b/sheep/ops.c > > @@ -271,7 +271,7 @@ static int cluster_make_fs(const struct sd_req *req, > > struct sd_rsp *rsp, > > > > sys->cinfo.nr_copies = req->cluster.copies; > > sys->cinfo.copy_policy = req->cluster.copy_policy; > > - sys->cinfo.flags = req->flags; > > + sys->cinfo.flags = req->cluster.flags; > > if (!sys->cinfo.nr_copies) > > sys->cinfo.nr_copies = SD_DEFAULT_COPIES; > > sys->cinfo.ctime = req->cluster.ctime; > > diff --git a/sheep/request.c b/sheep/request.c > > index 5113fca..fd54253 100644 > > --- a/sheep/request.c > > +++ b/sheep/request.c > > @@ -284,6 +284,22 @@ static void queue_peer_request(struct request *req) > > queue_work(sys->io_wqueue, &req->work); > > } > > > > +/* > > + * We make sure we write the exact number of copies to honor the promise > > of the > > + * redundancy for strict mode. This means that after writing of targeted > > data, > > + * they are redundant as promised and can withstand the random node > > failures. > > + * > > + * For example, with a 4:2 policy, we need at least write to 6 nodes with > > data > > + * strip and parity strips. For non-strict mode, we allow to write > > successfully > > + * only if the data are written fully with 4 nodes alive. 
> > + */ > > +static bool has_enough_zones(struct request *req) > > +{ > > + uint64_t oid = req->rq.obj.oid; > > + > > + return req->vinfo->nr_zones >= get_vdi_copy_number(oid_to_vid(oid)); > > +} > > + > > static void queue_gateway_request(struct request *req) > > { > > struct sd_req *hdr = &req->rq; > > @@ -310,13 +326,25 @@ static void queue_gateway_request(struct request *req) > > queue_work: > > if (RB_EMPTY_ROOT(&req->vinfo->vroot)) { > > sd_err("there is no living nodes"); > > - req->rp.result = SD_RES_HALT; > > - put_request(req); > > - return; > > + goto end_request; > > + } > > + if (sys->cinfo.flags & SD_CLUSTER_FLAG_STRICT && > > + hdr->flags & SD_FLAG_CMD_WRITE && > > + !(hdr->flags & SD_FLAG_CMD_RECOVERY) && > > + !has_enough_zones(req)) { > > I think the above condition is not correct. > > 1. hdr->flags & SD_FLAG_CMD_WRITE > The flag SD_FLAG_CMD_WRITE is used to indicate that a request has its > own data. This shouldn't be treated as the condition of strict > mode. e.g. SD_OP_TRACE_ENABLE has this flag. > > This condition should be replaced with the below one: > (hdr->opcode == SD_OP_CREATE_AND_WRITE_OBJ || hdr->opcode == SD_OP_WRITE_OBJ)
Ah, yes. I think SD_FLAG_CMD_WRITE is kind of misleading (a couple of times I forgot to assign it while writing something and spent time debugging it) > 2. !(hdr->flags & SD_FLAG_CMD_RECOVERY) > > SD_FLAG_CMD_RECOVERY is used for indicating a request PEER_READ is for > recovery. The flag is not related to the strict mode. I think it can > be removed simply. Oops, you are right. Good catch. Thanks Yuan -- sheepdog mailing list sheepdog@lists.wpkg.org http://lists.wpkg.org/mailman/listinfo/sheepdog