This patch changes the VID space size from a compile-time constant to a runtime variable. The VID space size is assigned to variables (members of cluster_info and system_info) only in cluster_make_fs or when the config file is reloaded. From then on, the variables, not the constants, are always referred to.
Signed-off-by: Takafumi Fujieda <fujieda.takaf...@lab.ntt.co.jp> --- dog/cluster.c | 38 +++++++++++++++++++++++++++++++------- dog/common.c | 32 +++++++++++++++++++++++++------- dog/dog.h | 3 ++- dog/farm/farm.c | 16 +++++++++++++++- dog/node.c | 3 ++- dog/vdi.c | 22 +++++++++++++++------- include/sheepdog_proto.h | 8 ++++---- sheep/config.c | 16 +++++++++++++++- sheep/gateway.c | 23 +++++++++++++---------- sheep/group.c | 10 ++++++++-- sheep/journal.c | 3 ++- sheep/nfs/fs.c | 4 ++-- sheep/nfs/nfs.c | 4 ++-- sheep/object_cache.c | 13 +++++++------ sheep/object_list_cache.c | 2 +- sheep/ops.c | 35 ++++++++++++++++++++++++++++------- sheep/plain_store.c | 25 +++++++++++++++---------- sheep/recovery.c | 10 ++++++---- sheep/request.c | 3 ++- sheep/sheep_priv.h | 4 ++-- sheep/vdi.c | 43 ++++++++++++++++++++++++++----------------- sheepfs/volume.c | 2 +- 22 files changed, 224 insertions(+), 95 deletions(-) diff --git a/dog/cluster.c b/dog/cluster.c index 2b6864a..d4a45ec 100644 --- a/dog/cluster.c +++ b/dog/cluster.c @@ -71,9 +71,9 @@ static int list_store(void) return EXIT_SYSFAIL; } -static bool no_vdi(const unsigned long *vdis) +static bool no_vdi(const unsigned long *vdis, uint32_t nr_vdis) { - return find_next_bit(vdis, SD_NR_VDIS, 0) == SD_NR_VDIS; + return find_next_bit(vdis, nr_vdis, 0) == nr_vdis; } #define FORMAT_PRINT \ @@ -90,8 +90,12 @@ static int cluster_format(int argc, char **argv) struct sd_rsp *rsp = (struct sd_rsp *)&hdr; struct timeval tv; char store_name[STORE_LEN]; - static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS); struct sd_node *n; + uint8_t new_space = 0; + uint8_t old_space = 0; + uint32_t old_nr_vdis; + unsigned long *vdi_inuse = NULL; + size_t bmp_size; rb_for_each_entry(n, &sd_nroot, rb) { struct sd_req info_req; @@ -110,6 +114,17 @@ static int cluster_format(int argc, char **argv) return EXIT_FAILURE; } + if (!old_space) + if (!cinfo.vid_space) + old_space = SD_VID_SPACE; + else + old_space = cinfo.vid_space; + else + if (cinfo.vid_space && 
old_space != cinfo.vid_space) { + sd_err("there are nodes have different VID space"); + return EXIT_FAILURE; + } + if (n->nr_vnodes != 0) { if ((cinfo.flags & SD_CLUSTER_FLAG_AUTO_VNODES) && cluster_cmd_data.fixed_vnodes) { @@ -136,19 +151,25 @@ static int cluster_format(int argc, char **argv) confirm(info); } + old_nr_vdis = (1U << old_space); + bmp_size = sizeof(unsigned long) * BITS_TO_LONGS(old_nr_vdis); + vdi_inuse = (unsigned long *)malloc(bmp_size); + sd_init_req(&hdr, SD_OP_READ_VDIS); - hdr.data_length = sizeof(vdi_inuse); + hdr.data_length = bmp_size; ret = dog_exec_req(&sd_nid, &hdr, vdi_inuse); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("%s", sd_strerror(rsp->result)); + free(vdi_inuse); return EXIT_FAILURE; } - if (!no_vdi(vdi_inuse)) + if (!no_vdi(vdi_inuse, old_nr_vdis)) confirm(FORMAT_PRINT); + free(vdi_inuse); gettimeofday(&tv, NULL); @@ -401,7 +422,8 @@ static void fill_cb(struct sd_index *idx, void *arg, int ignore) static void fill_object_tree(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, - const struct sd_inode *i, void *data) + const struct sd_inode *i, void *data, + uint32_t nr_vdis) { uint64_t vdi_oid = vid_to_vdi_oid(vid), vmstate_oid; uint32_t vdi_id; @@ -548,6 +570,7 @@ static int load_snapshot(int argc, char **argv) cluster_cmd_data.copies = hdr.copy_number; cluster_cmd_data.copy_policy = hdr.copy_policy; cluster_cmd_data.block_size_shift = hdr.block_size_shift; + cluster_cmd_data.vid_space = hdr.vid_space; if (cluster_format(0, NULL) != SD_RES_SUCCESS) goto out; @@ -714,7 +737,8 @@ static int cluster_reweight(int argc, char **argv) static void cluster_check_cb(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, - const struct sd_inode *inode, void *data) + const struct sd_inode *inode, void *data, + uint32_t nr_vdis) { if (vdi_is_snapshot(inode)) printf("fix snapshot %s (id: %d, tag: \"%s\")\n", name, diff --git a/dog/common.c 
b/dog/common.c index 6ff1e19..dbcc67b 100644 --- a/dog/common.c +++ b/dog/common.c @@ -126,7 +126,7 @@ int dog_write_object(uint64_t oid, uint64_t cow_oid, void *data, return SD_RES_SUCCESS; } -#define FOR_EACH_VDI(nr, vdis) FOR_EACH_BIT(nr, vdis, SD_NR_VDIS) +#define FOR_EACH_VDI(nr, vdis, nr_vdis) FOR_EACH_BIT(nr, vdis, nr_vdis) int parse_vdi(vdi_parser_func_t func, size_t size, void *data, bool no_deleted) @@ -136,12 +136,28 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data, struct sd_inode *i = xmalloc(sizeof(*i)); struct sd_req req; struct sd_rsp *rsp = (struct sd_rsp *)&req; - static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS); - static DECLARE_BITMAP(vdi_deleted, SD_NR_VDIS); uint32_t rlen; + struct cluster_info cinfo; + uint32_t nr_vdis; + size_t bmp_size; + unsigned long *vdi_inuse = NULL; + unsigned long *vdi_deleted = NULL; + + sd_init_req(&req, SD_OP_CLUSTER_INFO); + req.data_length = sizeof(cinfo); + ret = dog_exec_req(&sd_nid, &req, &cinfo); + if (ret < 0) { + sd_err("Fail to execute request: SD_OP_CLUSTER_INFO"); + ret = EXIT_FAILURE; + goto out; + } + nr_vdis = (1U << cinfo.vid_space); + bmp_size = sizeof(unsigned long) * BITS_TO_LONGS(nr_vdis); + vdi_inuse = (unsigned long *)malloc(bmp_size); + vdi_deleted = (unsigned long *)malloc(bmp_size); sd_init_req(&req, SD_OP_READ_VDIS); - req.data_length = sizeof(vdi_inuse); + req.data_length = bmp_size; ret = dog_exec_req(&sd_nid, &req, vdi_inuse); if (ret < 0) @@ -152,7 +168,7 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data, } sd_init_req(&req, SD_OP_READ_DEL_VDIS); - req.data_length = sizeof(vdi_deleted); + req.data_length = bmp_size; ret = dog_exec_req(&sd_nid, &req, vdi_deleted); if (ret < 0) @@ -162,7 +178,7 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data, goto out; } - FOR_EACH_VDI(nr, vdi_inuse) { + FOR_EACH_VDI(nr, vdi_inuse, nr_vdis) { uint64_t oid; uint32_t snapid; @@ -196,11 +212,13 @@ int parse_vdi(vdi_parser_func_t func, size_t size, void *data, } 
snapid = vdi_is_snapshot(i) ? i->snap_id : 0; - func(i->vdi_id, i->name, i->tag, snapid, 0, i, data); + func(i->vdi_id, i->name, i->tag, snapid, 0, i, data, nr_vdis); } out: free(i); + free(vdi_inuse); + free(vdi_deleted); return ret; } diff --git a/dog/dog.h b/dog/dog.h index 37355e5..1506a88 100644 --- a/dog/dog.h +++ b/dog/dog.h @@ -71,7 +71,8 @@ char *strnumber_raw(uint64_t _size, bool raw); typedef void (*vdi_parser_func_t)(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, - const struct sd_inode *i, void *data); + const struct sd_inode *i, void *data, + uint32_t nr_vdis); int parse_vdi(vdi_parser_func_t func, size_t size, void *data, bool no_deleted); int dog_read_object(uint64_t oid, void *data, unsigned int datalen, diff --git a/dog/farm/farm.c b/dog/farm/farm.c index e2b07ad..a9b6fd6 100644 --- a/dog/farm/farm.c +++ b/dog/farm/farm.c @@ -360,6 +360,7 @@ int farm_save_snapshot(const char *tag, bool multithread) log_hdr.copy_number = cinfo.nr_copies; log_hdr.copy_policy = cinfo.copy_policy; log_hdr.block_size_shift = cinfo.block_size_shift; + log_hdr.vid_space = cinfo.vid_space; snap_log_write_hdr(&log_hdr); } @@ -404,6 +405,10 @@ static void do_load_object(struct work *work) struct snapshot_work *sw; static unsigned long loaded; uint32_t vid; + struct sd_req req; + struct cluster_info cinfo; + uint64_t vdi_mask; + int ret = SD_RES_SUCCESS; if (uatomic_is_true(&work_error)) return; @@ -415,7 +420,16 @@ static void do_load_object(struct work *work) if (!buffer) goto error; - vid = oid_to_vid(sw->entry.oid); + sd_init_req(&req, SD_OP_CLUSTER_INFO); + req.data_length = sizeof(cinfo); + ret = dog_exec_req(&sd_nid, &req, &cinfo); + if (ret < 0) { + sd_err("Fail to execute request: SD_OP_CLUSTER_INFO"); + goto error; + } + vdi_mask = ((1LU << cinfo.vid_space) << VDI_SPACE_SHIFT) + - (1LU << VDI_SPACE_SHIFT); + vid = oid_to_vid(sw->entry.oid, vdi_mask); if (register_vdi(vid)) { if (notify_vdi_add(vid, sw->entry.nr_copies, 
sw->entry.copy_policy, diff --git a/dog/node.c b/dog/node.c index 36141ad..6af54f4 100644 --- a/dog/node.c +++ b/dog/node.c @@ -21,7 +21,8 @@ static struct node_cmd_data { static void cal_total_vdi_size(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, - const struct sd_inode *i, void *data) + const struct sd_inode *i, void *data, + uint32_t nr_vdis) { uint64_t *size = data; diff --git a/dog/vdi.c b/dog/vdi.c index 67e2f0b..d2ba096 100644 --- a/dog/vdi.c +++ b/dog/vdi.c @@ -117,7 +117,8 @@ static char *redundancy_scheme(uint8_t copy_nr, uint8_t policy) static void print_vdi_list(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, - const struct sd_inode *i, void *data) + const struct sd_inode *i, void *data, + uint32_t nr_vdis) { bool is_clone = false; uint64_t my_objs = 0, cow_objs = 0; @@ -174,7 +175,8 @@ static void print_vdi_list(uint32_t vid, const char *name, const char *tag, static void print_vdi_tree(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, - const struct sd_inode *i, void *data) + const struct sd_inode *i, void *data, + uint32_t nr_vdis) { time_t ti; struct tm tm; @@ -195,7 +197,8 @@ static void print_vdi_tree(uint32_t vid, const char *name, const char *tag, static void print_vdi_graph(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, - const struct sd_inode *i, void *data) + const struct sd_inode *i, void *data, + uint32_t nr_vdis) { time_t ti; struct tm tm; @@ -272,17 +275,21 @@ static void for_each_node_print(uint64_t oid) static void print_obj_ref(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, - const struct sd_inode *i, void *data) + const struct sd_inode *i, void *data, + uint32_t nr_vdis) { uint64_t oid = *(uint64_t *)data; uint64_t idx = data_oid_to_idx(oid); struct get_vdi_info info; + uint64_t vdi_mask = ((uint64_t)nr_vdis << VDI_SPACE_SHIFT) + - (1LU << VDI_SPACE_SHIFT); if 
(i->data_vdi_id[idx] != 0 && - i->data_vdi_id[idx] == oid_to_vid(oid)) { + i->data_vdi_id[idx] == oid_to_vid(oid, vdi_mask)) { memset(&info, 0, sizeof(info)); info.name = name; - print_vdi_list(vid, name, tag, snapid, flags, i, &info); + print_vdi_list(vid, name, tag, snapid, flags, i, &info, + nr_vdis); } } @@ -2959,7 +2966,8 @@ static int vdi_cache(int argc, char **argv) static void construct_vdi_tree(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, - const struct sd_inode *i, void *data) + const struct sd_inode *i, void *data, + uint32_t nr_vdis) { add_vdi_tree(name, tag, vid, i->parent_vdi_id, false); } diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h index 5db2394..7a2e2ad 100644 --- a/include/sheepdog_proto.h +++ b/include/sheepdog_proto.h @@ -451,11 +451,11 @@ static inline uint64_t sd_hash_oid(uint64_t oid) * Create a hash value from a vdi name. We cannot use sd_hash_buf for this * purpose because of backward compatibility. */ -static inline uint32_t sd_hash_vdi(const char *name) +static inline uint32_t sd_hash_vdi(const char *name, uint32_t nr_vdis) { uint64_t hval = fnv_64a_buf(name, strlen(name), FNV1A_64_INIT); - return (uint32_t)(hval & (SD_NR_VDIS - 1)); + return (uint32_t)(hval & (nr_vdis - 1)); } #ifndef __KERNEL__ @@ -535,9 +535,9 @@ static inline uint64_t vid_to_data_oid(uint32_t vid, uint64_t idx) return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; } -static inline uint32_t oid_to_vid(uint64_t oid) +static inline uint32_t oid_to_vid(uint64_t oid, uint64_t mask) { - return (oid & SD_VDI_MASK) >> VDI_SPACE_SHIFT; + return (oid & mask) >> VDI_SPACE_SHIFT; } static inline uint64_t vid_to_attr_oid(uint32_t vid, uint32_t attrid) diff --git a/sheep/config.c b/sheep/config.c index 548a1e8..7ab3600 100644 --- a/sheep/config.c +++ b/sheep/config.c @@ -71,6 +71,7 @@ static int get_cluster_config(struct cluster_info *cinfo) (cinfo->flags & SD_CLUSTER_FLAG_AUTO_VNODES); cinfo->copy_policy = config.copy_policy; 
cinfo->block_size_shift = config.block_size_shift; + cinfo->vid_space = config.vid_space; memcpy(cinfo->store, config.store, sizeof(config.store)); return SD_RES_SUCCESS; @@ -79,6 +80,7 @@ static int get_cluster_config(struct cluster_info *cinfo) int init_config_file(void) { int fd, ret = 0; + size_t bmp_size; check_tmp_config(); @@ -135,7 +137,6 @@ reload: sd_err("Designation of before a restart and a vnodes option is different."); return -1; } - ret = 0; get_cluster_config(&sys->cinfo); if ((config.flags & SD_CLUSTER_FLAG_DISKMODE) != @@ -144,6 +145,18 @@ reload: "exists data format mismatch"); return -1; } + if (!sys->cinfo.vid_space) { + sys->cinfo.vid_space = SD_VID_SPACE; + sys->nr_vdis = SD_NR_VDIS; + sys->vdi_mask = SD_VDI_MASK; + } else { + sys->nr_vdis = (1U << sys->cinfo.vid_space); + sys->vdi_mask = ((uint64_t)sys->nr_vdis << VDI_SPACE_SHIFT) + - (1LU << VDI_SPACE_SHIFT); + } + bmp_size = sizeof(unsigned long) * BITS_TO_LONGS(sys->nr_vdis); + sys->vdi_inuse = (unsigned long *)malloc(bmp_size); + sys->vdi_deleted = (unsigned long *)malloc(bmp_size); create: config.version = SD_FORMAT_VERSION; @@ -171,6 +184,7 @@ int set_cluster_config(const struct cluster_info *cinfo) config.copy_policy = cinfo->copy_policy; config.flags = cinfo->flags; config.block_size_shift = cinfo->block_size_shift; + config.vid_space = cinfo->vid_space; memset(config.store, 0, sizeof(config.store)); pstrcpy((char *)config.store, sizeof(config.store), (char *)cinfo->store); diff --git a/sheep/gateway.c b/sheep/gateway.c index 89db9bf..b071f7a 100644 --- a/sheep/gateway.c +++ b/sheep/gateway.c @@ -117,7 +117,7 @@ static struct req_iter *prepare_erasure_requests(struct request *req, int *nr) struct req_iter *reqs; char *p, *buf = NULL; uint8_t policy = req->rq.obj.copy_policy ?: - get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid)); + get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid, sys->vdi_mask)); int ed = 0, ep = 0, edp; edp = ec_policy_to_dp(policy, &ed, &ep); @@ -183,7 +183,7 @@ bool 
is_erasure_oid(uint64_t oid) { return !is_vdi_obj(oid) && !is_vdi_btree_obj(oid) && !is_ledger_object(oid) && - get_vdi_copy_policy(oid_to_vid(oid)) > 0; + get_vdi_copy_policy(oid_to_vid(oid, sys->vdi_mask)) > 0; } /* Prepare request iterator and buffer for each replica */ @@ -216,7 +216,8 @@ static void finish_requests(struct request *req, struct req_iter *reqs, if (opcode == SD_OP_READ_OBJ) { char *p, *buf = xmalloc(SD_EC_DATA_STRIPE_SIZE * nr_stripe); uint8_t policy = req->rq.obj.copy_policy ?: - get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid)); + get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid, + sys->vdi_mask)); int ed = 0, strip_size; ec_policy_to_dp(policy, &ed, NULL); @@ -496,7 +497,8 @@ static int gateway_forward_request(struct request *req) nr_reqs = nr_to_send; if (nr_to_send > nr_copies) { uint8_t policy = req->rq.obj.copy_policy ?: - get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid)); + get_vdi_copy_policy(oid_to_vid(req->rq.obj.oid, + sys->vdi_mask)); int ds; /* Only for erasure code, nr_to_send might > nr_copies */ ec_policy_to_dp(policy, &ds, NULL); @@ -638,7 +640,7 @@ int gateway_read_obj(struct request *req) if ((req->rq.flags & SD_FLAG_CMD_TGT) && !is_inode_refresh_req(req) && - is_refresh_required(oid_to_vid(oid))) { + is_refresh_required(oid_to_vid(oid, sys->vdi_mask))) { sd_debug("refresh is required: %"PRIx64, oid); return SD_RES_INODE_INVALIDATED; } @@ -656,7 +658,7 @@ int gateway_read_obj(struct request *req) return ret; if (is_inode_refresh_req(req)) - validate_myself(oid_to_vid(oid)); + validate_myself(oid_to_vid(oid, sys->vdi_mask)); return ret; } @@ -670,7 +672,7 @@ int gateway_write_obj(struct request *req) struct generation_reference *refs = NULL; if ((req->rq.flags & SD_FLAG_CMD_TGT) && - is_refresh_required(oid_to_vid(oid))) { + is_refresh_required(oid_to_vid(oid, sys->vdi_mask))) { sd_debug("refresh is required: %"PRIx64, oid); return SD_RES_INODE_INVALIDATED; } @@ -685,7 +687,7 @@ int gateway_write_obj(struct request *req) if 
(is_data_vid_update(hdr)) { size_t nr_vids = hdr->data_length / sizeof(*vids); - invalidate_other_nodes(oid_to_vid(oid)); + invalidate_other_nodes(oid_to_vid(oid, sys->vdi_mask)); /* read the previous vids to discard their references later */ vids = xzalloc(sizeof(*vids) * nr_vids); @@ -713,7 +715,8 @@ out: static int gateway_handle_cow(struct request *req) { uint64_t oid = req->rq.obj.oid; - size_t len = get_objsize(oid, get_vdi_object_size(oid_to_vid(oid))); + size_t len = get_objsize(oid, get_vdi_object_size(oid_to_vid(oid, + sys->vdi_mask))); struct sd_req hdr, *req_hdr = &req->rq; char *buf = xvalloc(len); int ret; @@ -746,7 +749,7 @@ int gateway_create_and_write_obj(struct request *req) uint64_t oid = req->rq.obj.oid; if ((req->rq.flags & SD_FLAG_CMD_TGT) && - is_refresh_required(oid_to_vid(oid))) { + is_refresh_required(oid_to_vid(oid, sys->vdi_mask))) { sd_debug("refresh is required: %"PRIx64, oid); return SD_RES_INODE_INVALIDATED; } diff --git a/sheep/group.c b/sheep/group.c index 85b9249..d03d667 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -18,8 +18,8 @@ struct node { struct get_vdis_work { struct work work; - DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS); - DECLARE_BITMAP(vdi_deleted, SD_NR_VDIS); + unsigned long *vdi_inuse; + unsigned long *vdi_deleted; struct sd_node joined; struct rb_root nroot; }; @@ -587,6 +587,8 @@ static void get_vdis_done(struct work *work) sd_mutex_unlock(&wait_vdis_lock); rb_destroy(&w->nroot, struct sd_node, rb); + free(w->vdi_inuse); + free(w->vdi_deleted); free(w); if (refcount_read(&nr_get_vdis_works) == 0) @@ -673,9 +675,13 @@ static void setup_backend_store(const struct cluster_info *cinfo) static void get_vdis(const struct rb_root *nroot, const struct sd_node *joined) { struct get_vdis_work *w; + size_t bmp_size; + bmp_size = sizeof(unsigned long) * BITS_TO_LONGS(sys->nr_vdis); w = xmalloc(sizeof(*w)); w->joined = *joined; + w->vdi_inuse = (unsigned long *)malloc(bmp_size); + w->vdi_deleted = (unsigned long 
*)malloc(bmp_size); INIT_RB_ROOT(&w->nroot); rb_copy(nroot, struct sd_node, rb, &w->nroot, node_cmp); refcount_inc(&nr_get_vdis_works); diff --git a/sheep/journal.c b/sheep/journal.c index 4df9a74..3802c74 100644 --- a/sheep/journal.c +++ b/sheep/journal.c @@ -170,7 +170,8 @@ static int replay_journal_entry(struct journal_descriptor *jd) return -1; } if (jd->create) { - object_size = get_vdi_object_size(oid_to_vid(jd->oid)); + object_size = get_vdi_object_size(oid_to_vid(jd->oid, + sys->vdi_mask)); ret = prealloc(fd, object_size); if (ret < 0) goto out; diff --git a/sheep/nfs/fs.c b/sheep/nfs/fs.c index ec92f12..eb070c1 100644 --- a/sheep/nfs/fs.c +++ b/sheep/nfs/fs.c @@ -151,7 +151,7 @@ static void dentry_add(struct inode *parent, struct dentry *dentry) int fs_create_dir(struct inode *inode, const char *name, struct inode *parent) { uint64_t myino, pino = parent->ino; - uint32_t vid = oid_to_vid(pino); + uint32_t vid = oid_to_vid(pino, sys->vdi_mask); struct inode_data *id = prepare_inode_data(inode, vid, name); struct dentry *entry; int ret; @@ -313,7 +313,7 @@ struct dentry *fs_lookup_dir(struct inode *inode, const char *name) int fs_create_file(uint64_t pino, struct inode *new, const char *name) { - uint32_t vid = oid_to_vid(pino); + uint32_t vid = oid_to_vid(pino, sys->vdi_mask); struct inode *inode; struct dentry *dentry; int ret; diff --git a/sheep/nfs/nfs.c b/sheep/nfs/nfs.c index 036f995..e01ac02 100644 --- a/sheep/nfs/nfs.c +++ b/sheep/nfs/nfs.c @@ -66,7 +66,7 @@ static void update_post_attr(struct inode *inode, fattr3 *post) post->gid = inode->gid; post->size = inode->size; post->used = inode->used; - post->fsid = oid_to_vid(inode->ino); + post->fsid = oid_to_vid(inode->ino, sys->vdi_mask); post->fileid = inode->ino; post->atime.seconds = inode->atime; post->mtime.seconds = inode->mtime; @@ -626,7 +626,7 @@ void *nfs3_fsstat(struct svc_req *req, struct nfs_arg *argp) static FSSTAT3res result; struct svc_fh *fh = get_svc_fh(argp); struct sd_inode 
*sd_inode = xmalloc(sizeof(*sd_inode)); - uint32_t vid = oid_to_vid(fh->ino); + uint32_t vid = oid_to_vid(fh->ino, sys->vdi_mask); uint64_t my = 0 , cow = 0; int ret; diff --git a/sheep/object_cache.c b/sheep/object_cache.c index 3794c19..c5293f8 100644 --- a/sheep/object_cache.c +++ b/sheep/object_cache.c @@ -126,7 +126,8 @@ static inline bool idx_has_vdi_bit(uint64_t idx) static inline size_t get_cache_block_size(uint64_t oid) { - uint32_t object_size = get_vdi_object_size(oid_to_vid(oid)); + uint32_t object_size = get_vdi_object_size(oid_to_vid(oid, + sys->vdi_mask)); size_t bsize = DIV_ROUND_UP(get_objsize(oid, object_size), sizeof(uint64_t) * BITS_PER_BYTE); @@ -927,7 +928,7 @@ static int object_cache_push(struct object_cache *oc) bool object_is_cached(uint64_t oid) { - uint32_t vid = oid_to_vid(oid); + uint32_t vid = oid_to_vid(oid, sys->vdi_mask); uint64_t idx = object_cache_oid_to_idx(oid); struct object_cache *cache; @@ -992,7 +993,7 @@ get_cache_entry_from(struct object_cache *cache, uint64_t idx) /* This helper increases the refcount */ static struct object_cache_entry *oid_to_entry(uint64_t oid) { - uint32_t vid = oid_to_vid(oid); + uint32_t vid = oid_to_vid(oid, sys->vdi_mask); uint64_t idx = object_cache_oid_to_idx(oid); struct object_cache *cache; struct object_cache_entry *entry; @@ -1065,7 +1066,7 @@ bool bypass_object_cache(const struct request *req) return true; if (req->rq.flags & SD_FLAG_CMD_DIRECT) { - uint32_t vid = oid_to_vid(oid); + uint32_t vid = oid_to_vid(oid, sys->vdi_mask); struct object_cache *cache; cache = find_object_cache(vid, false); @@ -1092,7 +1093,7 @@ int object_cache_handle_request(struct request *req) { struct sd_req *hdr = &req->rq; uint64_t oid = req->rq.obj.oid; - uint32_t vid = oid_to_vid(oid); + uint32_t vid = oid_to_vid(oid, sys->vdi_mask); uint64_t idx = object_cache_oid_to_idx(oid); struct object_cache *cache; struct object_cache_entry *entry; @@ -1208,7 +1209,7 @@ int object_cache_flush_vdi(uint32_t vid) int 
object_cache_flush_and_del(const struct request *req) { - uint32_t vid = oid_to_vid(req->rq.obj.oid); + uint32_t vid = oid_to_vid(req->rq.obj.oid, sys->vdi_mask); struct object_cache *cache; cache = find_object_cache(vid, false); diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c index b9acaa0..ad40ed3 100644 --- a/sheep/object_list_cache.c +++ b/sheep/object_list_cache.c @@ -154,7 +154,7 @@ static void objlist_deletion_work(struct work *work) sd_write_lock(&obj_list_cache.lock); rb_for_each_entry(entry, &obj_list_cache.root, node) { - entry_vid = oid_to_vid(entry->oid); + entry_vid = oid_to_vid(entry->oid, sys->vdi_mask); if (entry_vid != vid) continue; diff --git a/sheep/ops.c b/sheep/ops.c index bc2848b..6768904 100644 --- a/sheep/ops.c +++ b/sheep/ops.c @@ -298,6 +298,7 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp, char *store_name = data; int32_t nr_vnodes; struct vnode_info *vinfo = get_vnode_info(); + size_t bmp_size; driver = find_store_driver(data); if (!driver) { @@ -305,6 +306,24 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp, goto out; } + if (!req->cluster.vid_space) { + sys->cinfo.vid_space = SD_VID_SPACE; + sys->nr_vdis = SD_NR_VDIS; + sys->vdi_mask = SD_VDI_MASK; + } else { + sys->cinfo.vid_space = req->cluster.vid_space; + sys->nr_vdis = (1U << req->cluster.vid_space); + sys->vdi_mask = ((uint64_t)sys->nr_vdis << VDI_SPACE_SHIFT) + - (1LU << VDI_SPACE_SHIFT); + } + if (is_cluster_formatted()){ + free(sys->vdi_inuse); + free(sys->vdi_deleted); + } + bmp_size = sizeof(unsigned long) * BITS_TO_LONGS(sys->nr_vdis); + sys->vdi_inuse = (unsigned long *)malloc(bmp_size); + sys->vdi_deleted = (unsigned long *)malloc(bmp_size); + pstrcpy((char *)sys->cinfo.store, sizeof(sys->cinfo.store), store_name); sd_store = driver; @@ -338,8 +357,8 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp, for (i = 1; i <= latest_epoch; i++) remove_epoch(i); - 
memset(sys->vdi_inuse, 0, sizeof(sys->vdi_inuse)); - memset(sys->vdi_deleted, 0, sizeof(sys->vdi_deleted)); + memset(sys->vdi_inuse, 0, bmp_size); + memset(sys->vdi_deleted, 0, bmp_size); clean_vdi_state(); sys->cinfo.epoch = 0; @@ -411,7 +430,7 @@ static int cluster_get_vdi_attr(struct request *req) * the current VDI id can change if we take a snapshot, * so we use the hash value of the VDI name as the VDI id */ - vid = sd_hash_vdi(vattr->name); + vid = sd_hash_vdi(vattr->name, sys->nr_vdis); ret = get_vdi_attr(req->data, hdr->data_length, vid, &attrid, info.create_time, !!(hdr->flags & SD_FLAG_CMD_CREAT), @@ -528,6 +547,7 @@ static int local_stat_cluster(struct request *req) elog->disable_recovery = sys->cinfo.disable_recovery; elog->nr_copies = sys->cinfo.nr_copies; elog->copy_policy = sys->cinfo.copy_policy; + elog->vid_space = sys->cinfo.vid_space; elog->flags = sys->cinfo.flags; strncpy(elog->drv_name, (char *)sys->cinfo.store, STORE_LEN); @@ -727,7 +747,7 @@ static int cluster_notify_vdi_del(const struct sd_req *req, struct sd_rsp *rsp, static int cluster_delete_cache(const struct sd_req *req, struct sd_rsp *rsp, void *data, const struct sd_node *sender) { - uint32_t vid = oid_to_vid(req->obj.oid); + uint32_t vid = oid_to_vid(req->obj.oid, sys->vdi_mask); if (sys->enable_object_cache) object_cache_delete(vid); @@ -911,7 +931,7 @@ static int local_get_cache_info(struct request *request) static int local_cache_purge(struct request *req) { const struct sd_req *hdr = &req->rq; - uint32_t vid = oid_to_vid(req->rq.obj.oid); + uint32_t vid = oid_to_vid(req->rq.obj.oid, sys->vdi_mask); if (hdr->flags == SD_FLAG_CMD_WRITE) { object_cache_delete(vid); @@ -936,7 +956,7 @@ static int local_flush_vdi(struct request *req) int ret = SD_RES_INVALID_PARMS; if (sys->enable_object_cache) { - uint32_t vid = oid_to_vid(req->rq.obj.oid); + uint32_t vid = oid_to_vid(req->rq.obj.oid, sys->vdi_mask); ret = object_cache_flush_vdi(vid); } @@ -946,7 +966,7 @@ static int 
local_flush_vdi(struct request *req) static int local_discard_obj(struct request *req) { uint64_t oid = req->rq.obj.oid; - uint32_t vid = oid_to_vid(oid), tmp_vid; + uint32_t vid = oid_to_vid(oid, sys->vdi_mask), tmp_vid; int ret, idx = data_oid_to_idx(oid); struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); @@ -1477,6 +1497,7 @@ static int local_get_cluster_default(const struct sd_req *req, rsp->cluster_default.nr_copies = sys->cinfo.nr_copies; rsp->cluster_default.copy_policy = sys->cinfo.copy_policy; rsp->cluster_default.block_size_shift = sys->cinfo.block_size_shift; + rsp->cluster_default.vid_space = sys->cinfo.vid_space; return SD_RES_SUCCESS; } diff --git a/sheep/plain_store.c b/sheep/plain_store.c index 92f9a14..390a964 100644 --- a/sheep/plain_store.c +++ b/sheep/plain_store.c @@ -152,7 +152,8 @@ static int default_trim(int fd, uint64_t oid, const struct siocb *iocb, if (*poffset + *plen < iocb->offset + iocb->length) { uint64_t end = iocb->offset + iocb->length; - uint32_t object_size = get_vdi_object_size(oid_to_vid(oid)); + uint32_t object_size = get_vdi_object_size(oid_to_vid(oid, + sys->vdi_mask)); if (end == get_objsize(oid, object_size)) /* This is necessary to punch the last block */ end = round_up(end, BLOCK_SIZE); @@ -281,14 +282,16 @@ static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch) "wat %s", oid, epoch, wd); goto out; } - add_vdi_state_unordered(oid_to_vid(oid), inode->nr_copies, - vdi_is_snapshot(inode), inode->copy_policy, - inode->block_size_shift, inode->parent_vdi_id); + add_vdi_state_unordered(oid_to_vid(oid, sys->vdi_mask), + inode->nr_copies, vdi_is_snapshot(inode), + inode->copy_policy, inode->block_size_shift, + inode->parent_vdi_id); if (inode->name[0] == '\0') - atomic_set_bit(oid_to_vid(oid), sys->vdi_deleted); + atomic_set_bit(oid_to_vid(oid, sys->vdi_mask), + sys->vdi_deleted); - atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse); + atomic_set_bit(oid_to_vid(oid, sys->vdi_mask), sys->vdi_inuse); ret = 
SD_RES_SUCCESS; out: @@ -400,12 +403,14 @@ int prealloc(int fd, uint32_t size) size_t get_store_objsize(uint64_t oid) { if (is_erasure_oid(oid)) { - uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid)); + uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid, + sys->vdi_mask)); int d; ec_policy_to_dp(policy, &d, NULL); - return get_vdi_object_size(oid_to_vid(oid)) / d; + return get_vdi_object_size(oid_to_vid(oid, sys->vdi_mask)) / d; } - return get_objsize(oid, get_vdi_object_size(oid_to_vid(oid))); + return get_objsize(oid, get_vdi_object_size(oid_to_vid(oid, + sys->vdi_mask))); } int default_create_and_write(uint64_t oid, const struct siocb *iocb) @@ -454,7 +459,7 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb) trim_zero_blocks(iocb->buf, &offset, &len); - object_size = get_vdi_object_size(oid_to_vid(oid)); + object_size = get_vdi_object_size(oid_to_vid(oid, sys->vdi_mask)); if (offset != 0 || len != get_objsize(oid, object_size)) { if (is_sparse_object(oid)) diff --git a/sheep/recovery.c b/sheep/recovery.c index dbd5146..b3e7a22 100644 --- a/sheep/recovery.c +++ b/sheep/recovery.c @@ -194,7 +194,7 @@ static void *read_erasure_object(uint64_t oid, uint8_t idx, struct vnode_info *old = grab_vnode_info(rw->old_vinfo), *new_old; uint32_t epoch = rw->epoch, tgt_epoch = rw->tgt_epoch; const struct sd_node *node; - uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid)); + uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid, sys->vdi_mask)); int edp = ec_policy_to_dp(policy, NULL, NULL); int ret; struct sd_node *excluded = row->base.rinfo->excluded; @@ -451,8 +451,9 @@ static void *rebuild_erasure_object(uint64_t oid, uint8_t idx, int len = get_store_objsize(oid); char *lost = xvalloc(len); int i, j; - uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid)); - uint32_t object_size = get_vdi_object_size(oid_to_vid(oid)); + uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid, sys->vdi_mask)); + uint32_t object_size = get_vdi_object_size(oid_to_vid(oid, 
+ sys->vdi_mask)); int ed = 0, edp; edp = ec_policy_to_dp(policy, &ed, NULL); struct fec *ctx = ec_init(ed, edp); @@ -492,7 +493,8 @@ out: uint8_t local_ec_index(struct vnode_info *vinfo, uint64_t oid) { - int idx, m = min(get_vdi_copy_number(oid_to_vid(oid)), vinfo->nr_zones); + int idx, m = min(get_vdi_copy_number(oid_to_vid(oid, sys->vdi_mask)), + vinfo->nr_zones); if (!is_erasure_oid(oid)) return SD_MAX_COPIES; diff --git a/sheep/request.c b/sheep/request.c index 2f86c67..79204ba 100644 --- a/sheep/request.c +++ b/sheep/request.c @@ -318,7 +318,8 @@ static bool has_enough_zones(struct request *req) { uint64_t oid = req->rq.obj.oid; - return req->vinfo->nr_zones >= get_vdi_copy_number(oid_to_vid(oid)); + return req->vinfo->nr_zones >= get_vdi_copy_number(oid_to_vid(oid, + sys->vdi_mask)); } static void queue_gateway_request(struct request *req) diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index 5608cbc..3b0d46d 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -132,8 +132,8 @@ struct system_info { uint32_t nr_vdis; uint64_t vdi_mask; - DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS); - DECLARE_BITMAP(vdi_deleted, SD_NR_VDIS); + unsigned long *vdi_inuse; + unsigned long *vdi_deleted; int local_req_efd; diff --git a/sheep/vdi.c b/sheep/vdi.c index 2889df6..b41f4b7 100644 --- a/sheep/vdi.c +++ b/sheep/vdi.c @@ -237,7 +237,7 @@ bool oid_is_readonly(uint64_t oid) if (!is_data_obj(oid)) return false; - return vid_is_snapshot(oid_to_vid(oid)); + return vid_is_snapshot(oid_to_vid(oid, sys->vdi_mask)); } int get_vdi_copy_number(uint32_t vid) @@ -314,7 +314,8 @@ uint8_t get_vdi_block_size_shift(uint32_t vid) int get_obj_copy_number(uint64_t oid, int nr_zones) { - return min(get_vdi_copy_number(oid_to_vid(oid)), nr_zones); + return min(get_vdi_copy_number(oid_to_vid(oid, sys->vdi_mask)), + nr_zones); } int get_req_copy_number(struct request *req) @@ -1346,8 +1347,8 @@ out: * Return SUCCESS (range of bits set): * Iff we get a bitmap range [left, right) that VDI 
might be set between. if * right < start, this means a wrap around case where we should examine the - * two split ranges, [left, SD_NR_VDIS - 1] and [0, right). 'Right' is the free - * bit that might be used by newly created VDI. + * two split ranges, [left, sys->nr_vdis - 1] and [0, right). + * 'Right' is the free bit that might be used by newly created VDI. * * Otherwise: * Return NO_VDI (bit not set) or FULL_VDI (bitmap fully set) @@ -1355,15 +1356,15 @@ out: static int get_vdi_bitmap_range(const char *name, unsigned long *left, unsigned long *right) { - *left = sd_hash_vdi(name); - *right = find_next_zero_bit(sys->vdi_inuse, SD_NR_VDIS, *left); + *left = sd_hash_vdi(name, sys->nr_vdis); + *right = find_next_zero_bit(sys->vdi_inuse, sys->nr_vdis, *left); if (*left == *right) return SD_RES_NO_VDI; - if (*right == SD_NR_VDIS) { + if (*right == sys->nr_vdis) { /* Wrap around */ - *right = find_next_zero_bit(sys->vdi_inuse, SD_NR_VDIS, 0); - if (*right == SD_NR_VDIS) + *right = find_next_zero_bit(sys->vdi_inuse, sys->nr_vdis, 0); + if (*right == sys->nr_vdis) return SD_RES_FULL_VDI; } return SD_RES_SUCCESS; @@ -1458,7 +1459,7 @@ static int fill_vdi_info(unsigned long left, unsigned long right, switch (ret) { case SD_RES_NO_VDI: case SD_RES_NO_TAG: - ret = fill_vdi_info_range(left, SD_NR_VDIS - 1, iocb, info); + ret = fill_vdi_info_range(left, sys->nr_vdis - 1, iocb, info); break; default: break; @@ -1495,7 +1496,7 @@ int vdi_lookup(const struct vdi_iocb *iocb, struct vdi_info *info) * TODO: for checking before creation, the below fill_vdi_info() * isn't required. It must be eliminated. 
*/ - return fill_vdi_info(0, SD_NR_VDIS, iocb, info); + return fill_vdi_info(0, sys->nr_vdis, iocb, info); case SD_RES_FULL_VDI: return ret; case SD_RES_SUCCESS: @@ -1629,22 +1630,30 @@ int vdi_snapshot(const struct vdi_iocb *iocb, uint32_t *new_vid) int read_vdis(char *data, int len, unsigned int *rsp_len) { - if (len != sizeof(sys->vdi_inuse)) + if (!is_cluster_formatted()) { + *rsp_len = 0; + return SD_RES_SUCCESS; + } + if (len != (sizeof(unsigned long) * BITS_TO_LONGS(sys->nr_vdis))) return SD_RES_INVALID_PARMS; - memcpy(data, sys->vdi_inuse, sizeof(sys->vdi_inuse)); - *rsp_len = sizeof(sys->vdi_inuse); + memcpy(data, sys->vdi_inuse, len); + *rsp_len = len; return SD_RES_SUCCESS; } int read_del_vdis(char *data, int len, unsigned int *rsp_len) { - if (len != sizeof(sys->vdi_deleted)) + if (!is_cluster_formatted()) { + *rsp_len = 0; + return SD_RES_SUCCESS; + } + if (len != sizeof(unsigned long) * BITS_TO_LONGS(sys->nr_vdis)) return SD_RES_INVALID_PARMS; - memcpy(data, sys->vdi_deleted, sizeof(sys->vdi_deleted)); - *rsp_len = sizeof(sys->vdi_deleted); + memcpy(data, sys->vdi_deleted, len); + *rsp_len = len; return SD_RES_SUCCESS; } diff --git a/sheepfs/volume.c b/sheepfs/volume.c index d43304c..921d670 100644 --- a/sheepfs/volume.c +++ b/sheepfs/volume.c @@ -123,7 +123,7 @@ static int volume_rw_object(char *buf, uint64_t oid, size_t size, struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret, fd, sock_idx; bool create = false; - uint32_t vid = oid_to_vid(oid), vdi_id; + uint32_t vid = oid_to_vid(oid, sys->vdi_mask), vdi_id; struct vdi_inode *vdi; unsigned long idx = 0; uint64_t cow_oid = 0; -- 1.7.1 -- sheepdog mailing list sheepdog@lists.wpkg.org https://lists.wpkg.org/mailman/listinfo/sheepdog