Reviewed-by: Robin Dong <san...@taobao.com>
2013/11/27 Liu Yuan <namei.u...@gmail.com> > To support objects larger than 4M, we have to introduce an extent-like > structure > to map the data indexes to data holders (sheepdog objects). So basically, we > use > one object index node (onode) to hold the metadata and extents. > > user object -> onode[metadata, extent1, ............., extentN] > | | > V | > +-----------------------------------+ | > | obj1 | obj2 | .............| objN | | > +-----------------------------------+ | > V > +-----------------------------------+ > | obj1` | obj2` |...........| objN` | > +-----------------------------------+ > > For a user object smaller than 4M, we just inline it into the onode > > user object -> onode[metadata, user-data]. > > kv_create_extented_object() is left for a later patch set. > > For object name to onode mapping, we make use of the old hash approach, that is > > - hash(object_name) --> vdi[hash_value] -> onode > > Signed-off-by: Liu Yuan <namei.u...@gmail.com> > --- > include/internal_proto.h | 1 + > sheep/http/kv.c | 183 > ++++++++++++++++++++++++++++------------------ > 2 files changed, 113 insertions(+), 71 deletions(-) > > diff --git a/include/internal_proto.h b/include/internal_proto.h > index e5e0f05..70a7b5d 100644 > --- a/include/internal_proto.h > +++ b/include/internal_proto.h > @@ -124,6 +124,7 @@ > #define SD_RES_AGAIN 0x8F /* Ask to try again */ > #define SD_RES_STALE_OBJ 0x90 /* Object may be stale */ > #define SD_RES_CLUSTER_ERROR 0x91 /* Cluster driver error */ > +#define SD_RES_OBJ_TAKEN 0x92 /* Object ID is taken up */ > > enum sd_status { > SD_STATUS_OK = 1, > diff --git a/sheep/http/kv.c b/sheep/http/kv.c > index d30a6a1..c04e629 100644 > --- a/sheep/http/kv.c > +++ b/sheep/http/kv.c > @@ -165,43 +165,100 @@ int kv_list_buckets(struct http_request *req, > > /* Object operations */ > > -/* 4 KB header of kv object */ > -struct kv_object_hdr { > +/* 4 KB header of kv object index node */ > +struct kv_onode_hdr { > union { > struct { > char
name[SD_MAX_OBJECT_NAME]; > + /* a hash value for etag */ > + uint8_t sha1[round_up(SHA1_DIGEST_SIZE, 8)]; > uint64_t size; > uint64_t ctime; > uint64_t mtime; > - > - /* the index of the multi parted object */ > - uint64_t segment; > - > - /* a hash value for etag */ > - uint8_t sha1[round_up(SHA1_DIGEST_SIZE, 8)]; > + uint32_t data_vid; > + uint32_t nr_extent; > + uint8_t inlined; > + uint8_t pad[5]; > }; > > uint8_t __pad[BLOCK_SIZE]; > }; > }; > > -struct kv_object { > - struct kv_object_hdr hdr; > - uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct kv_object_hdr)]; > +struct onode_extent { > + uint32_t vdi; > + uint32_t pad; > + uint64_t start; > + uint64_t count; > }; > > +struct kv_onode { > + struct kv_onode_hdr hdr; > + union { > + uint8_t data[SD_DATA_OBJ_SIZE - sizeof(struct > kv_onode_hdr)]; > + struct onode_extent *o_extent; > + }; > +}; > + > +#define KV_ONODE_INLINE_SIZE (SD_DATA_OBJ_SIZE - sizeof(struct > kv_onode_hdr)) > + > +static int kv_create_inlined_object(struct sd_inode *inode, > + struct kv_onode *onode, > + uint32_t vid, uint32_t idx, > + bool overwrite) > +{ > + uint64_t oid = vid_to_data_oid(vid, idx); > + int ret; > + > + if (overwrite) { > + sd_info("overwrite object %s", onode->hdr.name); > + ret = write_object(oid, (char *)onode, > + sizeof(onode->hdr) + onode->hdr.size, > + 0, false); > + if (ret != SD_RES_SUCCESS) { > + sd_err("failed to write object, %" PRIx64, oid); > + goto out; > + } > + } else { > + ret = write_object(oid, (char *)onode, > + sizeof(onode->hdr) + onode->hdr.size, > + 0, true); > + if (ret != SD_RES_SUCCESS) { > + sd_err("failed to create object, %" PRIx64, oid); > + goto out; > + } > + INODE_SET_VID(inode, idx, vid); > + ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx, > + vid, vid, 0, false, false); > + if (ret != SD_RES_SUCCESS) { > + sd_err("failed to update inode, %" PRIx64, > + vid_to_vdi_oid(vid)); > + goto out; > + } > + } > +out: > + return ret; > +} > + > +static int 
kv_create_extented_object(struct sd_inode *inode, > + struct kv_onode *onode, > + uint32_t vid, uint32_t idx) > +{ > + return SD_RES_SUCCESS; > +} > + > /* > * Create the object if the index isn't taken. Overwrite the object if it > exists > - * Return 0 if the index is taken by other object. > + * Return SD_RES_OBJ_TAKEN if the index is taken by other object. > */ > -static int do_kv_create_object(struct http_request *req, const char > *obj_name, > - struct kv_object *obj, uint32_t vid, > - uint32_t idx) > +static int do_kv_create_object(struct http_request *req, > + struct kv_onode *onode, > + uint32_t vid, uint32_t idx) > { > + struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); > uint64_t oid = vid_to_data_oid(vid, idx); > + struct kv_onode_hdr hdr; > uint32_t tmp_vid; > - struct kv_object_hdr hdr; > - struct sd_inode *inode = xmalloc(sizeof(struct sd_inode)); > int ret; > > ret = read_object(vid_to_vdi_oid(vid), (char *)inode, > @@ -209,60 +266,37 @@ static int do_kv_create_object(struct http_request > *req, const char *obj_name, > if (ret != SD_RES_SUCCESS) { > sd_err("failed to read inode, %" PRIx64, > vid_to_vdi_oid(vid)); > - goto err; > + goto out; > } > tmp_vid = INODE_GET_VID(inode, idx); > if (tmp_vid) { > ret = read_object(oid, (char *)&hdr, sizeof(hdr), 0); > if (ret != SD_RES_SUCCESS) { > sd_err("failed to read object, %" PRIx64, oid); > - goto err; > + goto out; > } > > if (hdr.name[0] != '\0' && > - strcmp(hdr.name, obj->hdr.name) != 0){ > + strcmp(hdr.name, onode->hdr.name) != 0) { > sd_debug("index %d is already used", idx); > + ret = SD_RES_OBJ_TAKEN; > goto out; > } > - sd_info("overwrite object %s", obj_name); > - ret = write_object(oid, (char *)obj, > - sizeof(obj->hdr) + obj->hdr.size, > - 0, false); > - if (ret != SD_RES_SUCCESS) { > - sd_err("failed to write object, %" PRIx64, oid); > - goto err; > - } > - } else { > - ret = write_object(oid, (char *)obj, > - sizeof(obj->hdr) + obj->hdr.size, > - 0, true); > - if (ret != 
SD_RES_SUCCESS) { > - sd_err("failed to create object, %" PRIx64, oid); > - goto err; > - } > - INODE_SET_VID(inode, idx, vid); > - ret = sd_inode_write_vid(sheep_bnode_writer, inode, idx, > - vid, vid, 0, false, false); > - if (ret != SD_RES_SUCCESS) { > - sd_err("failed to update inode, %" PRIx64, > - vid_to_vdi_oid(vid)); > - goto err; > - } > } > - http_response_header(req, CREATED); > + if (onode->hdr.inlined) > + ret = kv_create_inlined_object(inode, onode, vid, idx, > + !!tmp_vid); > + else > + ret = kv_create_extented_object(inode, onode, vid, idx); > out: > free(inode); > - return 0; > -err: > - http_response_header(req, INTERNAL_SERVER_ERROR); > - free(inode); > - return -1; > + return ret; > } > > int kv_create_object(struct http_request *req, const char *bucket, > - const char *object) > + const char *name) > { > - struct kv_object *obj; > + struct kv_onode *onode; > ssize_t size; > int ret; > uint64_t hval; > @@ -273,44 +307,51 @@ int kv_create_object(struct http_request *req, const > char *bucket, > if (ret < 0) > return ret; > > - obj = xzalloc(sizeof(*obj)); > + onode = xzalloc(sizeof(*onode)); > > gettimeofday(&tv, NULL); > - pstrcpy(obj->hdr.name, sizeof(obj->hdr.name), object); > - obj->hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000; > - obj->hdr.mtime = obj->hdr.ctime; > + pstrcpy(onode->hdr.name, sizeof(onode->hdr.name), name); > + onode->hdr.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000; > + onode->hdr.mtime = onode->hdr.ctime; > > - /* TODO: support multi parted object for large object */ > - size = http_request_read(req, obj->data, sizeof(obj->data)); > + size = http_request_read(req, onode->data, sizeof(onode->data)); > if (size < 0) { > sd_err("%s: bucket %s, object %s", sd_strerror(ret), > - bucket, object); > + bucket, name); > http_response_header(req, INTERNAL_SERVER_ERROR); > return -1; > } > > - obj->hdr.size = size; > - > - hval = sd_hash(object, strlen(object)); > + onode->hdr.size = size; > + if (size <= 
KV_ONODE_INLINE_SIZE) > + onode->hdr.inlined = 1; > + hval = sd_hash(name, strlen(name)); > for (int i = 0; i < MAX_DATA_OBJS; i++) { > uint32_t idx = (hval + i) % MAX_DATA_OBJS; > > - do_kv_create_object(req, object, obj, vid, idx); > - if (req->status != UNKNOWN) { > - free(obj); > + ret = do_kv_create_object(req, onode, vid, idx); > + switch (ret) { > + case SD_RES_SUCCESS: > + http_response_header(req, CREATED); > + free(onode); > return 0; > + case SD_RES_OBJ_TAKEN: > + break; > + default: > + http_response_header(req, INTERNAL_SERVER_ERROR); > + free(onode); > + return -1; > } > } > > - free(obj); > - > /* no free space to create a object */ > http_response_header(req, SERVICE_UNAVAILABLE); > + free(onode); > return -1; > } > > static int do_kv_read_object(struct http_request *req, const char > *obj_name, > - struct kv_object *obj, uint32_t vid, uint32_t > idx) > + struct kv_onode *obj, uint32_t vid, uint32_t > idx) > { > uint64_t oid = vid_to_data_oid(vid, idx); > int ret; > @@ -342,7 +383,7 @@ static int do_kv_read_object(struct http_request *req, > const char *obj_name, > int kv_read_object(struct http_request *req, const char *bucket, > const char *object) > { > - struct kv_object *obj; > + struct kv_onode *obj; > int ret; > uint64_t hval; > uint32_t vid; > @@ -371,7 +412,7 @@ int kv_read_object(struct http_request *req, const > char *bucket, > } > > static int do_kv_update_object(struct http_request *req, const char > *obj_name, > - struct kv_object *obj, uint32_t vid, > + struct kv_onode *obj, uint32_t vid, > uint32_t idx, size_t size) > { > uint64_t oid = vid_to_data_oid(vid, idx); > @@ -415,7 +456,7 @@ static int do_kv_update_object(struct http_request > *req, const char *obj_name, > int kv_update_object(struct http_request *req, const char *bucket, > const char *object) > { > - struct kv_object *obj; > + struct kv_onode *obj; > int ret; > uint64_t hval; > uint32_t vid; > -- > 1.7.9.5 > > -- > sheepdog mailing list > sheepdog@lists.wpkg.org > 
http://lists.wpkg.org/mailman/listinfo/sheepdog > -- -- Best Regards, Robin Dong
-- sheepdog mailing list sheepdog@lists.wpkg.org http://lists.wpkg.org/mailman/listinfo/sheepdog