On Thu, Oct 24, 2013 at 05:46:20PM +0800, Robin Dong wrote:
> 1. add sd_extent_header to manage meta-data in data_vdi_id[] or middle-node
> 2. add new type of object: B-tree object as middle-node
> 
> Signed-off-by: Robin Dong <san...@taobao.com>
> ---
>  dog/vdi.c                |    2 +-
>  include/sheepdog_proto.h |   43 ++++-
>  lib/sd_inode.c           |  519 
> +++++++++++++++++++++++++++++++++++++++++++++-
>  sheep/vdi.c              |    1 +
>  sheepfs/volume.c         |    2 +-
>  5 files changed, 558 insertions(+), 9 deletions(-)
> 
> diff --git a/dog/vdi.c b/dog/vdi.c
> index faf06f0..960e2a0 100644
> --- a/dog/vdi.c
> +++ b/dog/vdi.c
> @@ -558,7 +558,7 @@ static int vdi_create(int argc, char **argv)
>                       goto out;
>               }
>  
> -             sd_inode_set_vdi(inode, idx, vid);
> +             INODE_SET_VDI(inode, idx, vid);
>               ret = sd_write_object(vid_to_vdi_oid(vid), 0, &vid, sizeof(vid),
>                                     SD_INODE_HEADER_SIZE + sizeof(vid) * idx,
>                                     0, inode->nr_copies, inode->copy_policy,
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 30ff397..c338efa 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -74,6 +74,9 @@
>  #define SD_RES_JOIN_FAILED   0x18 /* Target node had failed to join sheepdog 
> */
>  #define SD_RES_HALT          0x19 /* Sheepdog is stopped doing IO */
>  #define SD_RES_READONLY      0x1A /* Object is read-only */
> +#define SD_RES_BTREE_NOT_FOUND       0x1B /* Cannot found node in btree */
> +#define SD_RES_BTREE_FOUND   0x1C /* Found node in btree */
> +#define SD_RES_BTREE_REPEAT  0x1D /* Should repeat op in btree */
>  
>  /* errors above 0x80 are sheepdog-internal */
>  
> @@ -92,8 +95,9 @@
>  #define VDI_BIT (UINT64_C(1) << 63)
>  #define VMSTATE_BIT (UINT64_C(1) << 62)
>  #define VDI_ATTR_BIT (UINT64_C(1) << 61)
> +#define VDI_BTREE_BIT (UINT64_C(1) << 60)
>  #define MAX_DATA_OBJS (1ULL << 20)
> -#define MAX_CHILDREN 1024U
> +#define MAX_CHILDREN (1024U - 1) /* we use the last uint32_t as 
> btree_counter */
>  #define SD_MAX_VDI_LEN 256U
>  #define SD_MAX_VDI_TAG_LEN 256U
>  #define SD_MAX_VDI_ATTR_KEY_LEN 256U
> @@ -104,8 +108,8 @@
>  #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
>  
>  #define SD_INODE_SIZE (sizeof(struct sd_inode))
> -#define SD_INODE_HEADER_SIZE (sizeof(struct sd_inode) - \
> -                           sizeof(uint32_t) * MAX_DATA_OBJS)
> +#define SD_INODE_INDEX_SIZE (sizeof(uint32_t) * MAX_DATA_OBJS)
> +#define SD_INODE_HEADER_SIZE (sizeof(struct sd_inode) - SD_INODE_INDEX_SIZE)
>  #define SD_ATTR_OBJ_SIZE (sizeof(struct sheepdog_vdi_attr))
>  #define CURRENT_VDI_ID 0
>  
> @@ -215,16 +219,35 @@ struct sd_inode {
>       uint64_t vdi_size;
>       uint64_t vm_state_size;
>       uint8_t  copy_policy;
> -     uint8_t  reserved;
> +     uint8_t  store_policy;
>       uint8_t  nr_copies;
>       uint8_t  block_size_shift;
>       uint32_t snap_id;
>       uint32_t vdi_id;
>       uint32_t parent_vdi_id;
>       uint32_t child_vdi_id[MAX_CHILDREN];
> +     uint32_t btree_counter;
>       uint32_t data_vdi_id[MAX_DATA_OBJS];
>  };
>  
> +struct sd_extent {
> +     int idx;
> +     uint32_t vdi_id;
> +};
> +
> +struct sd_extent_idx {
> +     int idx;
> +     uint64_t oid;
> +};
> +
> +#define INODE_BTREE_MAGIC    0x6274
> +
> +struct sd_extent_header {
> +     uint16_t magic;
> +     uint16_t depth; /* 1 -- ext node; 2 -- idx node */
> +     uint32_t entries;
> +};
> +
>  typedef int (*write_node_fn)(uint64_t id, void *mem, unsigned int len,
>                               int copies, int copy_policy, int create);
>  typedef int (*read_node_fn)(uint64_t id, void **mem, unsigned int len);
> @@ -347,10 +370,15 @@ static inline bool is_vdi_attr_obj(uint64_t oid)
>       return !!(oid & VDI_ATTR_BIT);
>  }
>  
> +static inline bool is_vdi_btree_obj(uint64_t oid)
> +{
> +     return !!(oid & VDI_BTREE_BIT);
> +}
> +
>  static inline bool is_data_obj(uint64_t oid)
>  {
>       return !is_vdi_obj(oid) && !is_vmstate_obj(oid) &&
> -             !is_vdi_attr_obj(oid);
> +             !is_vdi_attr_obj(oid) && !is_vdi_btree_obj(oid);
>  }
>  
>  static inline size_t count_data_objs(const struct sd_inode *inode)
> @@ -394,6 +422,11 @@ static inline uint64_t vid_to_attr_oid(uint32_t vid, 
> uint32_t attrid)
>       return ((uint64_t)vid << VDI_SPACE_SHIFT) | VDI_ATTR_BIT | attrid;
>  }
>  
> +static inline uint64_t vid_to_btree_oid(uint32_t vid, uint32_t btreeid)
> +{
> +     return ((uint64_t)vid << VDI_SPACE_SHIFT) | VDI_BTREE_BIT | btreeid;
> +}
> +
>  static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx)
>  {
>       return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx;
> diff --git a/lib/sd_inode.c b/lib/sd_inode.c
> index c03ca03..426e00c 100644
> --- a/lib/sd_inode.c
> +++ b/lib/sd_inode.c
> @@ -1,15 +1,530 @@
> +/*
> + * B-tree is a tree data structure that keeps data sorted and allows 
> searches,
> + * sequential access, insertions, and deletions in logarithmic time.
> + * The B-tree is a generalization of a binary search tree in that a node can
> + * have more than two children. (Comer 1979, p. 123) Unlike self-balancing
> + * binary search trees, the B-tree is optimized for systems that read and
> + * write large blocks of data. (ref: http://en.wikipedia.org/wiki/B-tree)
> + *
> + * In sheepdog, we use space in inode->data_vdi_id[] to store leaf-node at
> + * beginning and store root-node of B-tree when it reach depths of two.
> + *
> + * At beginning, the inode->data_vdi_id[] is storing leaf-node which point
> + * to data-obj directly:
> + *
> + *     +------------------+-----------+-----------+--------+
> + *     | sd_extent_header | sd_extent | sd_extent | ...... |
> + *     +------------------+-----------+-----------+--------+
> + *                              |          |
> + *                             /            \
> + *                            /              \
> + *                           /                \
> + *     +------------+ <------                  ----> +------------+
> + *     | data-obj 1 |                                | data-obj 2 |
> + *     +------------+                                +------------+
> + *
> + * After adding more oid into it, the leaf-node will be full of struct 
> sd_extent
> + * and should be splited to two leaf-nodes, after it, the 
> inode->data_vdi_id[]
> + * should become root-node which store sd_extent_idx and point to the two
> + * leaf-nodes:
> + *
> + *     +------------------+-----------------+-----------------+
> + *     | sd_extent_header |  sd_extent_idx  |  sd_extent_idx  |
> + *     +------------------+-----------------+-----------------+
> + *                              |                   |
> + *                             /                    \
> + *                            /                      -------------
> + *                           /                                    \
> + *                          /                                      \
> + *                         /                                        \
> + *     +------------------+-----------+-----------+--------+      
> +------------------+-----------+-----------+--------+
> + *     | sd_extent_header | sd_extent | sd_extent | ...... |      | 
> sd_extent_header | sd_extent | sd_extent | ...... |
> + *     +------------------+-----------+-----------+--------+      
> +------------------+-----------+-----------+--------+
> + *                           /                \                              
>                /           \
> + *     +------------+ <------                  ---> +------------+      
> +--------------+ <--             --> +--------------+
> + *     | data-obj 1 |                               | data-obj 2 |      | 
> data-obj 511 |                     | data-obj 512 |
> + *     +------------+                               +------------+      
> +--------------+                     +--------------+
> + *
> + * When a leaf-node is full, we could add a new leaf-node and add a
> + * new sd_extent_idx in root-node to point to it:
> + *
> + *     
> +------------------+-----------------+-----------------+---------------+
> + *     | sd_extent_header |  sd_extent_idx  |  sd_extent_idx  | 
> sd_extent_idx |
> + *     
> +------------------+-----------------+-----------------+---------------+
> + *                              |                   |                 \
> + *                             /                    \                  \     
> (new leaf-node)
> + *                            /                      ---------          
> ------ +------------------+-----------+--------+
> + *                           /                                \              
>   | sd_extent_header | sd_extent | ...... |
> + *                          /                                  \             
>   +------------------+-----------+--------+
> + *                         /                                    \
> + *     +------------------+-----------+--------+      
> +------------------+-----------+--------+
> + *     | sd_extent_header | sd_extent | ...... |      | sd_extent_header | 
> sd_extent | ...... |
> + *     +------------------+-----------+--------+      
> +------------------+-----------+--------+
> + *
> + *
> + * As above, the root-node point to leaf-node which point to data-obj
> + * (the implemention of B-tree in sd_inode only support two depth), so it 
> could
> + * store:
> + *
> + *   (number of sd_extent_idx in root-node) * (number of sd_extent in 
> leaf-node)
> + *
> + * which is 349524 * 524287 = 183250889388 data-objects (about 680 PB with 
> 4MB data-objs).
> + *
> + */
>  #include <string.h>
>  
> +#include "util.h"
>  #include "sheepdog_proto.h"
>  
> +#define EXT_MAX_SPACE (SD_INODE_INDEX_SIZE - sizeof(struct sd_extent_header))
> +#define EXT_MAX_ENTRIES (EXT_MAX_SPACE / sizeof(struct sd_extent))
> +#define EXT_IDX_MAX_ENTRIES (EXT_MAX_SPACE / sizeof(struct sd_extent_idx))
> +
> +#define EXT_HEADER(data) ((struct sd_extent_header *)(data))
> +#define FIRST_EXT(data)  ((struct sd_extent *)((char *)(data) + \
> +                     sizeof(struct sd_extent_header)))
> +#define LAST_EXT(data)   (FIRST_EXT(data) + EXT_HEADER(data)->entries)
> +#define OFFSET_EXT(data, n) ((char *)(data) + sizeof(struct 
> sd_extent_header) \
> +                     + n * sizeof(struct sd_extent))
> +
> +#define EXT_MAX_IDXS (EXT_MAX_SPACE / sizeof(struct sd_extent_idx))
> +#define FIRST_IDX(data)  ((struct sd_extent_idx *)((char *)(data) + \
> +                     sizeof(struct sd_extent_header)))
> +#define LAST_IDX(data)   (FIRST_IDX(data) + EXT_HEADER(data)->entries)
> +#define OFFSET_IDX(data, n) ((char *)(data) + sizeof(struct 
> sd_extent_header) \
> +                     + n * sizeof(struct sd_extent_idx))
> +
> +struct find_path {
> +     struct sd_extent_idx *p_idx;
> +     struct sd_extent *p_ext;
> +     struct sd_extent_header *p_ext_header;
> +     int depth;
> +};
> +
> +typedef int (*comp)(void *a, void *b);
> +
> +static int extent_comp(void *a, void *b)
> +{
> +     struct sd_extent *ea = (struct sd_extent *)a;
> +     struct sd_extent *eb = (struct sd_extent *)b;
> +
> +     return ea->idx - eb->idx;
> +}
> +
> +static int index_comp(void *a, void *b)
> +{
> +     struct sd_extent_idx *ia = (struct sd_extent_idx *)a;
> +     struct sd_extent_idx *ib = (struct sd_extent_idx *)b;
> +
> +     return ia->idx - ib->idx;
> +}
> +
> +static void dump_btree(read_node_fn reader, struct sd_inode *inode)
> +{
> +#ifdef DEBUG
> +     struct sd_extent_header *header = EXT_HEADER(inode->data_vdi_id);
> +     struct sd_extent_header *leaf_node = NULL;
> +     struct sd_extent *last, *itor;
> +     struct sd_extent_idx *last_idx, *itor_idx;

itor = iterater? If yes, it should be iter.

Thanks
Yuan
-- 
sheepdog mailing list
sheepdog@lists.wpkg.org
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to