Re: [PATCH v2 1/8] Btrfs: introduce a tree for items that map UUIDs to something

2013-05-15 Thread Liu Bo
On Tue, May 14, 2013 at 11:36:53AM +0200, Stefan Behrens wrote:
> Mapping UUIDs to subvolume IDs is an operation with a high effort
> today. Today, the algorithm even has quadratic effort (based on the
> number of existing subvolumes), which means, that it takes minutes
> to send/receive a single subvolume if 10,000 subvolumes exist. But
> even linear effort would be too much since it is a waste. And these
> data structures to allow mapping UUIDs to subvolume IDs are created
> every time a btrfs send/receive instance is started.
> 
> It is much more efficient to maintain a searchable persistent data
> structure in the filesystem, one that is updated whenever a
> subvolume/snapshot is created and deleted, and when the received
> subvolume UUID is set by the btrfs-receive tool.
> 
> Therefore kernel code is added with this commit that is able to
> maintain data structures in the filesystem that allow to quickly
> search for a given UUID and to retrieve data that is assigned to
> this UUID, like which subvolume ID is related to this UUID.
> 
> This commit adds a new tree to hold UUID-to-data mapping items. The
> key of the items is the full UUID plus the key type BTRFS_UUID_KEY.
> Multiple data blocks can be stored for a given UUID, a type/length/
> value scheme is used.
> 
> Now follows the lengthy justification, why a new tree was added
> instead of using the existing root tree:
> 
> The first approach was to not create another tree that holds UUID
> items. Instead, the items should just go into the top root tree.
> Unfortunately this confused the algorithm to assign the objectid
> of subvolumes and snapshots. The reason is that
> btrfs_find_free_objectid() calls btrfs_find_highest_objectid() for
> the first created subvol or snapshot after mounting a filesystem,
> and this function simply searches for the largest used objectid in
> the root tree keys to pick the next objectid to assign. Of course,
> the UUID keys have always been the ones with the highest offset
> value, and the next assigned subvol ID was wastefully huge.
> 
> To use any other existing tree did not look proper. To apply a
> workaround such as setting the objectid to zero in the UUID item
> key and to implement collision handling would either add
> limitations (in case of a btrfs_extend_item() approach to handle
> the collisions) or a lot of complexity and source code (in case a
> key would be looked up that is free of collisions). Adding new code
> that introduces limitations is not good, and adding code that is
> complex and lengthy for no good reason is also not good. That's the
> justification why a completely new tree was introduced.
> 
> Signed-off-by: Stefan Behrens 
> ---
>  fs/btrfs/Makefile|   3 +-
>  fs/btrfs/ctree.h |  50 ++
>  fs/btrfs/uuid-tree.c | 481 
> +++
>  3 files changed, 533 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
> index 3932224..a550dfc 100644
> --- a/fs/btrfs/Makefile
> +++ b/fs/btrfs/Makefile
> @@ -8,7 +8,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o 
> root-tree.o dir-item.o \
>  extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
>  export.o tree-log.o free-space-cache.o zlib.o lzo.o \
>  compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
> -reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
> +reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
> +uuid-tree.o
>  
>  btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
>  btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index f415377..ca9149a 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -91,6 +91,9 @@ struct btrfs_ordered_sum;
>  /* holds quota configuration and tracking */
>  #define BTRFS_QUOTA_TREE_OBJECTID 8ULL
>  
> +/* for storing items that use the BTRFS_UUID_KEY */
> +#define BTRFS_UUID_TREE_OBJECTID 9ULL
> +
>  /* for storing balance parameters in the root tree */
>  #define BTRFS_BALANCE_OBJECTID -4ULL
>  
> @@ -953,6 +956,18 @@ struct btrfs_dev_replace_item {
>   __le64 num_uncorrectable_read_errors;
>  } __attribute__ ((__packed__));
>  
> +/* for items that use the BTRFS_UUID_KEY */
> +#define BTRFS_UUID_ITEM_TYPE_SUBVOL  0 /* for UUIDs assigned to subvols */
> +#define BTRFS_UUID_ITEM_TYPE_RECEIVED_SUBVOL 1 /* for UUIDs assigned to
> +* received subvols */
> +
> +/* a sequence of such items is stored under the BTRFS_UUID_KEY */
> +struct btrfs_uuid_item {
> + __le64 type;/* refer to BTRFS_UUID_ITEM_TYPE* defines above */
> + __le64 len; /* number of following 64bit values */

Can we put it smaller, such as u8 or __le16?

Will type and len be that long as 2^64?

thanks,
liubo

> + __le64 subid[0];/* sequence of subids */
> +} __attribute__ ((__packed__));
> +
>  /* different ty

[PATCH v2 1/8] Btrfs: introduce a tree for items that map UUIDs to something

2013-05-14 Thread Stefan Behrens
Mapping UUIDs to subvolume IDs is an operation with a high effort
today. Today, the algorithm even has quadratic effort (based on the
number of existing subvolumes), which means, that it takes minutes
to send/receive a single subvolume if 10,000 subvolumes exist. But
even linear effort would be too much since it is a waste. And these
data structures to allow mapping UUIDs to subvolume IDs are created
every time a btrfs send/receive instance is started.

It is much more efficient to maintain a searchable persistent data
structure in the filesystem, one that is updated whenever a
subvolume/snapshot is created and deleted, and when the received
subvolume UUID is set by the btrfs-receive tool.

Therefore kernel code is added with this commit that is able to
maintain data structures in the filesystem that allow to quickly
search for a given UUID and to retrieve data that is assigned to
this UUID, like which subvolume ID is related to this UUID.

This commit adds a new tree to hold UUID-to-data mapping items. The
key of the items is the full UUID plus the key type BTRFS_UUID_KEY.
Multiple data blocks can be stored for a given UUID, a type/length/
value scheme is used.

Now follows the lengthy justification, why a new tree was added
instead of using the existing root tree:

The first approach was to not create another tree that holds UUID
items. Instead, the items should just go into the top root tree.
Unfortunately this confused the algorithm to assign the objectid
of subvolumes and snapshots. The reason is that
btrfs_find_free_objectid() calls btrfs_find_highest_objectid() for
the first created subvol or snapshot after mounting a filesystem,
and this function simply searches for the largest used objectid in
the root tree keys to pick the next objectid to assign. Of course,
the UUID keys have always been the ones with the highest offset
value, and the next assigned subvol ID was wastefully huge.

To use any other existing tree did not look proper. To apply a
workaround such as setting the objectid to zero in the UUID item
key and to implement collision handling would either add
limitations (in case of a btrfs_extend_item() approach to handle
the collisions) or a lot of complexity and source code (in case a
key would be looked up that is free of collisions). Adding new code
that introduces limitations is not good, and adding code that is
complex and lengthy for no good reason is also not good. That's the
justification why a completely new tree was introduced.

Signed-off-by: Stefan Behrens 
---
 fs/btrfs/Makefile|   3 +-
 fs/btrfs/ctree.h |  50 ++
 fs/btrfs/uuid-tree.c | 481 +++
 3 files changed, 533 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 3932224..a550dfc 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o 
root-tree.o dir-item.o \
   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-  reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
+  reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+  uuid-tree.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f415377..ca9149a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -91,6 +91,9 @@ struct btrfs_ordered_sum;
 /* holds quota configuration and tracking */
 #define BTRFS_QUOTA_TREE_OBJECTID 8ULL
 
+/* for storing items that use the BTRFS_UUID_KEY */
+#define BTRFS_UUID_TREE_OBJECTID 9ULL
+
 /* for storing balance parameters in the root tree */
 #define BTRFS_BALANCE_OBJECTID -4ULL
 
@@ -953,6 +956,18 @@ struct btrfs_dev_replace_item {
__le64 num_uncorrectable_read_errors;
 } __attribute__ ((__packed__));
 
+/* for items that use the BTRFS_UUID_KEY */
+#define BTRFS_UUID_ITEM_TYPE_SUBVOL0 /* for UUIDs assigned to subvols */
+#define BTRFS_UUID_ITEM_TYPE_RECEIVED_SUBVOL   1 /* for UUIDs assigned to
+  * received subvols */
+
+/* a sequence of such items is stored under the BTRFS_UUID_KEY */
+struct btrfs_uuid_item {
+   __le64 type;/* refer to BTRFS_UUID_ITEM_TYPE* defines above */
+   __le64 len; /* number of following 64bit values */
+   __le64 subid[0];/* sequence of subids */
+} __attribute__ ((__packed__));
+
 /* different types of block groups (and chunks) */
 #define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
 #define BTRFS_BLOCK_GROUP_SYSTEM   (1ULL << 1)
@@ -1901,6 +1916,17 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_DEV_REPLACE_KEY  250
 
 /*
+ * Stores items that allow to quickly map UUIDs to something else.
+ * These items are part of the