On Thu, Oct 24, 2013 at 05:46:20PM +0800, Robin Dong wrote: > 1. add sd_extent_header to manage meta-data in data_vdi_id[] or middle-node > 2. add new type of object: B-tree object as middle-node > > Signed-off-by: Robin Dong <san...@taobao.com> > --- > dog/vdi.c | 2 +- > include/sheepdog_proto.h | 43 ++++- > lib/sd_inode.c | 519 > +++++++++++++++++++++++++++++++++++++++++++++- > sheep/vdi.c | 1 + > sheepfs/volume.c | 2 +- > 5 files changed, 558 insertions(+), 9 deletions(-) > > diff --git a/dog/vdi.c b/dog/vdi.c > index faf06f0..960e2a0 100644 > --- a/dog/vdi.c > +++ b/dog/vdi.c > @@ -558,7 +558,7 @@ static int vdi_create(int argc, char **argv) > goto out; > } > > - sd_inode_set_vdi(inode, idx, vid); > + INODE_SET_VDI(inode, idx, vid); > ret = sd_write_object(vid_to_vdi_oid(vid), 0, &vid, sizeof(vid), > SD_INODE_HEADER_SIZE + sizeof(vid) * idx, > 0, inode->nr_copies, inode->copy_policy, > diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h > index 30ff397..c338efa 100644 > --- a/include/sheepdog_proto.h > +++ b/include/sheepdog_proto.h > @@ -74,6 +74,9 @@ > #define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog > */ > #define SD_RES_HALT 0x19 /* Sheepdog is stopped doing IO */ > #define SD_RES_READONLY 0x1A /* Object is read-only */ > +#define SD_RES_BTREE_NOT_FOUND 0x1B /* Cannot found node in btree */ > +#define SD_RES_BTREE_FOUND 0x1C /* Found node in btree */ > +#define SD_RES_BTREE_REPEAT 0x1D /* Should repeat op in btree */ > > /* errors above 0x80 are sheepdog-internal */ > > @@ -92,8 +95,9 @@ > #define VDI_BIT (UINT64_C(1) << 63) > #define VMSTATE_BIT (UINT64_C(1) << 62) > #define VDI_ATTR_BIT (UINT64_C(1) << 61) > +#define VDI_BTREE_BIT (UINT64_C(1) << 60) > #define MAX_DATA_OBJS (1ULL << 20) > -#define MAX_CHILDREN 1024U > +#define MAX_CHILDREN (1024U - 1) /* we use the last uint32_t as > btree_counter */ > #define SD_MAX_VDI_LEN 256U > #define SD_MAX_VDI_TAG_LEN 256U > #define SD_MAX_VDI_ATTR_KEY_LEN 256U > @@ -104,8 +108,8 @@ > #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) > > #define SD_INODE_SIZE (sizeof(struct sd_inode)) > -#define SD_INODE_HEADER_SIZE (sizeof(struct sd_inode) - \ > - sizeof(uint32_t) * MAX_DATA_OBJS) > +#define SD_INODE_INDEX_SIZE (sizeof(uint32_t) * MAX_DATA_OBJS) > +#define SD_INODE_HEADER_SIZE (sizeof(struct sd_inode) - SD_INODE_INDEX_SIZE) > #define SD_ATTR_OBJ_SIZE (sizeof(struct sheepdog_vdi_attr)) > #define CURRENT_VDI_ID 0 > > @@ -215,16 +219,35 @@ struct sd_inode { > uint64_t vdi_size; > uint64_t vm_state_size; > uint8_t copy_policy; > - uint8_t reserved; > + uint8_t store_policy; > uint8_t nr_copies; > uint8_t block_size_shift; > uint32_t snap_id; > uint32_t vdi_id; > uint32_t parent_vdi_id; > uint32_t child_vdi_id[MAX_CHILDREN]; > + uint32_t btree_counter; > uint32_t data_vdi_id[MAX_DATA_OBJS]; > }; > > +struct sd_extent { > + int idx; > + uint32_t vdi_id; > +}; > + > +struct sd_extent_idx { > + int idx; > + uint64_t oid; > +}; > + > +#define INODE_BTREE_MAGIC 0x6274 > + > +struct sd_extent_header { > + uint16_t magic; > + uint16_t depth; /* 1 -- ext node; 2 -- idx node */ > + uint32_t entries; > +}; > + > typedef int (*write_node_fn)(uint64_t id, void *mem, unsigned int len, > int copies, int copy_policy, int create); > typedef int (*read_node_fn)(uint64_t id, void **mem, unsigned int len); > @@ -347,10 +370,15 @@ static inline bool is_vdi_attr_obj(uint64_t oid) > return !!(oid & VDI_ATTR_BIT); > } > > +static inline bool is_vdi_btree_obj(uint64_t oid) > +{ > + return !!(oid & VDI_BTREE_BIT); > +} > + > static inline bool is_data_obj(uint64_t oid) > { > return !is_vdi_obj(oid) && !is_vmstate_obj(oid) && > - !is_vdi_attr_obj(oid); > + !is_vdi_attr_obj(oid) && !is_vdi_btree_obj(oid); > } > > static inline size_t count_data_objs(const struct sd_inode *inode) > @@ -394,6 +422,11 @@ static inline uint64_t vid_to_attr_oid(uint32_t vid, > uint32_t attrid) > return ((uint64_t)vid << VDI_SPACE_SHIFT) | VDI_ATTR_BIT | attrid; > } > > +static inline uint64_t vid_to_btree_oid(uint32_t vid, uint32_t btreeid) > +{ > + return ((uint64_t)vid << VDI_SPACE_SHIFT) | VDI_BTREE_BIT | btreeid; > +} > + > static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx) > { > return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; > diff --git a/lib/sd_inode.c b/lib/sd_inode.c > index c03ca03..426e00c 100644 > --- a/lib/sd_inode.c > +++ b/lib/sd_inode.c > @@ -1,15 +1,530 @@ > +/* > + * B-tree is a tree data structure that keeps data sorted and allows > searches, > + * sequential access, insertions, and deletions in logarithmic time. > + * The B-tree is a generalization of a binary search tree in that a node can > + * have more than two children. (Comer 1979, p. 123) Unlike self-balancing > + * binary search trees, the B-tree is optimized for systems that read and > + * write large blocks of data. (ref: http://en.wikipedia.org/wiki/B-tree) > + * > + * In sheepdog, we use space in inode->data_vdi_id[] to store leaf-node at > + * beginning and store root-node of B-tree when it reach depths of two. > + * > + * At beginning, the inode->data_vdi_id[] is storing leaf-node which point > + * to data-obj directly: > + * > + * +------------------+-----------+-----------+--------+ > + * | sd_extent_header | sd_extent | sd_extent | ...... | > + * +------------------+-----------+-----------+--------+ > + * | | > + * / \ > + * / \ > + * / \ > + * +------------+ <------ ----> +------------+ > + * | data-obj 1 | | data-obj 2 | > + * +------------+ +------------+ > + * > + * After adding more oid into it, the leaf-node will be full of struct > sd_extent > + * and should be splited to two leaf-nodes, after it, the > inode->data_vdi_id[] > + * should become root-node which store sd_extent_idx and point to the two > + * leaf-nodes: > + * > + * +------------------+-----------------+-----------------+ > + * | sd_extent_header | sd_extent_idx | sd_extent_idx | > + * +------------------+-----------------+-----------------+ > + * | | > + * / \ > + * / ------------- > + * / \ > + * / \ > + * / \ > + * +------------------+-----------+-----------+--------+ > +------------------+-----------+-----------+--------+ > + * | sd_extent_header | sd_extent | sd_extent | ...... | | > sd_extent_header | sd_extent | sd_extent | ...... | > + * +------------------+-----------+-----------+--------+ > +------------------+-----------+-----------+--------+ > + * / \ > / \ > + * +------------+ <------ ---> +------------+ > +--------------+ <-- --> +--------------+ > + * | data-obj 1 | | data-obj 2 | | > data-obj 511 | | data-obj 512 | > + * +------------+ +------------+ > +--------------+ +--------------+ > + * > + * When a leaf-node is full, we could add a new leaf-node and add a > + * new sd_extent_idx in root-node to point to it: > + * > + * > +------------------+-----------------+-----------------+---------------+ > + * | sd_extent_header | sd_extent_idx | sd_extent_idx | > sd_extent_idx | > + * > +------------------+-----------------+-----------------+---------------+ > + * | | \ > + * / \ \ > (new leaf-node) > + * / --------- > ------ +------------------+-----------+--------+ > + * / \ > | sd_extent_header | sd_extent | ...... | > + * / \ > +------------------+-----------+--------+ > + * / \ > + * +------------------+-----------+--------+ > +------------------+-----------+--------+ > + * | sd_extent_header | sd_extent | ...... | | sd_extent_header | > sd_extent | ...... | > + * +------------------+-----------+--------+ > +------------------+-----------+--------+ > + * > + * > + * As above, the root-node point to leaf-node which point to data-obj > + * (the implemention of B-tree in sd_inode only support two depth), so it > could > + * store: > + * > + * (number of sd_extent_idx in root-node) * (number of sd_extent in > leaf-node) > + * > + * which is 349524 * 524287 = 183250889388 data-objects (about 680 PB with > 4MB data-objs). > + * > + */ > #include <string.h> > > +#include "util.h" > #include "sheepdog_proto.h" > > +#define EXT_MAX_SPACE (SD_INODE_INDEX_SIZE - sizeof(struct sd_extent_header)) > +#define EXT_MAX_ENTRIES (EXT_MAX_SPACE / sizeof(struct sd_extent)) > +#define EXT_IDX_MAX_ENTRIES (EXT_MAX_SPACE / sizeof(struct sd_extent_idx)) > + > +#define EXT_HEADER(data) ((struct sd_extent_header *)(data)) > +#define FIRST_EXT(data) ((struct sd_extent *)((char *)(data) + \ > + sizeof(struct sd_extent_header))) > +#define LAST_EXT(data) (FIRST_EXT(data) + EXT_HEADER(data)->entries) > +#define OFFSET_EXT(data, n) ((char *)(data) + sizeof(struct > sd_extent_header) \ > + + n * sizeof(struct sd_extent)) > + > +#define EXT_MAX_IDXS (EXT_MAX_SPACE / sizeof(struct sd_extent_idx)) > +#define FIRST_IDX(data) ((struct sd_extent_idx *)((char *)(data) + \ > + sizeof(struct sd_extent_header))) > +#define LAST_IDX(data) (FIRST_IDX(data) + EXT_HEADER(data)->entries) > +#define OFFSET_IDX(data, n) ((char *)(data) + sizeof(struct > sd_extent_header) \ > + + n * sizeof(struct sd_extent_idx)) > + > +struct find_path { > + struct sd_extent_idx *p_idx; > + struct sd_extent *p_ext; > + struct sd_extent_header *p_ext_header; > + int depth; > +}; > + > +typedef int (*comp)(void *a, void *b); > + > +static int extent_comp(void *a, void *b) > +{ > + struct sd_extent *ea = (struct sd_extent *)a; > + struct sd_extent *eb = (struct sd_extent *)b; > + > + return ea->idx - eb->idx; > +} > + > +static int index_comp(void *a, void *b) > +{ > + struct sd_extent_idx *ia = (struct sd_extent_idx *)a; > + struct sd_extent_idx *ib = (struct sd_extent_idx *)b; > + > + return ia->idx - ib->idx; > +} > + > +static void dump_btree(read_node_fn reader, struct sd_inode *inode) > +{ > +#ifdef DEBUG > + struct sd_extent_header *header = EXT_HEADER(inode->data_vdi_id); > + struct sd_extent_header *leaf_node = NULL; > + struct sd_extent *last, *itor; > + struct sd_extent_idx *last_idx, *itor_idx;
itor = iterater? If yes, it should be iter. Thanks Yuan -- sheepdog mailing list sheepdog@lists.wpkg.org http://lists.wpkg.org/mailman/listinfo/sheepdog