Re: [PATCH] initial version of reference cache

2008-07-28 Thread Chris Mason
Yan and I are hammering this out a little, I've attached my current
patches.

I was seeing cache misses after long stress runs, which I think is
coming from references on the higher levels of the tree making us skip
some leaves while dropping the transaction that added them.

My new version uses a single cache per root, and should avoid these
misses.


diff -r c038dde2ad20 extent-tree.c
--- a/extent-tree.c	Fri Jul 25 15:58:39 2008 -0400
+++ b/extent-tree.c	Sun Jul 27 06:39:00 2008 -0400
@@ -994,7 +994,7 @@ int btrfs_inc_ref(struct btrfs_trans_han
 		}
 	}
 	/* cache original leaf block's references */
-	if (cache_ref && nr_file_extents > 0) {
+	if (level == 0 && cache_ref && buf != root->commit_root) {
 		struct btrfs_leaf_ref *ref;
 		struct btrfs_extent_info *info;
 
@@ -1012,7 +1012,7 @@ int btrfs_inc_ref(struct btrfs_trans_han
 		ref->nritems = nr_file_extents;
 		info = ref->extents;
 		
-		for (i = 0; i < nritems; i++) {
+		for (i = 0; nr_file_extents > 0 && i < nritems; i++) {
 			u64 disk_bytenr;
 			btrfs_item_key_to_cpu(buf, &key, i);
 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
@@ -2490,7 +2490,6 @@ static int noinline walk_down_tree(struc
 
 			if (path->slots[*level] == 0)
 				reada_walk_down(root, cur, path->slots[*level]);
-
 			next = read_tree_block(root, bytenr, blocksize,
 	   ptr_gen);
 			cond_resched();
diff -r c038dde2ad20 ref-cache.c
--- a/ref-cache.c	Fri Jul 25 15:58:39 2008 -0400
+++ b/ref-cache.c	Sun Jul 27 06:39:00 2008 -0400
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/sched.h>
 #include "ctree.h"
 #include "ref-cache.h"
 #include "transaction.h"
@@ -110,6 +111,34 @@ static struct rb_node *tree_search(struc
 	return NULL;
 }
 
+int btrfs_remove_leaf_refs(struct btrfs_root *root)
+{
+	struct rb_node *rb;
+	struct btrfs_leaf_ref *ref = NULL;
+	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+
+	if (!tree)
+		return 0;
+
+	spin_lock(&tree->lock);
+	while(!btrfs_leaf_ref_tree_empty(tree)) {
+		tree->last = NULL;
+		rb = rb_first(&tree->root);
+		ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
+		rb_erase(&ref->rb_node, &tree->root);
+		ref->in_tree = 0;
+
+		spin_unlock(&tree->lock);
+
+		btrfs_free_leaf_ref(ref);
+
+		cond_resched();
+		spin_lock(&tree->lock);
+	}
+	spin_unlock(&tree->lock);
+	return 0;
+}
+
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
 	 struct btrfs_key *key)
 {
@@ -170,8 +199,6 @@ int btrfs_remove_leaf_ref(struct btrfs_r
 
 	BUG_ON(!ref->in_tree);
 	spin_lock(&tree->lock);
-	rb_erase(&ref->rb_node, &tree->root);
-	ref->in_tree = 0;
 	
 	spin_lock(&root->fs_info->ref_cache_lock);
 	root->fs_info->total_ref_cache_size -= size;
@@ -187,6 +214,10 @@ int btrfs_remove_leaf_ref(struct btrfs_r
 		} else
 			tree->last = NULL;
 	}
+
+	rb_erase(&ref->rb_node, &tree->root);
+	ref->in_tree = 0;
+
 	spin_unlock(&tree->lock);
 
 	btrfs_free_leaf_ref(ref);
diff -r c038dde2ad20 ref-cache.h
--- a/ref-cache.h	Fri Jul 25 15:58:39 2008 -0400
+++ b/ref-cache.h	Sun Jul 27 06:39:00 2008 -0400
@@ -68,4 +68,5 @@ struct btrfs_leaf_ref *btrfs_lookup_leaf
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
 	 struct btrfs_key *key);
 int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+int btrfs_remove_leaf_refs(struct btrfs_root *root);
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
diff -r c038dde2ad20 transaction.c
--- a/transaction.c	Fri Jul 25 15:58:39 2008 -0400
+++ b/transaction.c	Sun Jul 27 06:39:00 2008 -0400
@@ -539,9 +539,8 @@ static noinline int drop_dirty_roots(str
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
-		if (dirty->root->ref_tree)
-			WARN_ON(!btrfs_leaf_ref_tree_empty(dirty->root->ref_tree));
-	
+		btrfs_remove_leaf_refs(dirty->root);
+
 		free_extent_buffer(dirty-root-node);
 		kfree(dirty-root);
 		kfree(dirty);
diff -r cf052b443059 ctree.h
--- a/ctree.h	Sun Jul 27 06:39:00 2008 -0400
+++ b/ctree.h	Mon Jul 28 11:03:41 2008 -0400
@@ -594,7 +594,6 @@ struct btrfs_fs_info {
 
 	spinlock_t ref_cache_lock;
 	u64 total_ref_cache_size;
-	u64 running_ref_cache_size;
 
 	u64 avail_data_alloc_bits;
 	u64 avail_metadata_alloc_bits;
@@ -606,10 +605,17 @@ struct btrfs_fs_info {
 	void *bdev_holder;
 };
 
+struct btrfs_leaf_ref_tree {
+	struct rb_root root;
+	struct btrfs_leaf_ref *last;
+	spinlock_t lock;
+};
+
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
  */
+struct dirty_root;
 struct btrfs_root {
 	struct extent_buffer *node;
 
@@ -618,6 +624,8 @@ struct btrfs_root {
 
 	struct extent_buffer *commit_root;
 	struct btrfs_leaf_ref_tree *ref_tree;
+	struct btrfs_leaf_ref_tree ref_tree_struct;
+	struct dirty_root *dirty_root;
 
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
diff -r cf052b443059 disk-io.c
--- a/disk-io.c	Sun Jul 27 06:39:00 2008 -0400
+++ b/disk-io.c	Mon Jul 28 11:03:41 2008 -0400
@@ -40,6 +40,7 @@
 #include print-tree.h
 #include 

[PATCH] initial version of reference cache

2008-07-28 Thread Miguel Sousa Filipe
Hi all,

On Mon, Jul 28, 2008 at 4:09 PM, Chris Mason [EMAIL PROTECTED] wrote:
 Yan and I are hammering this out a little, I've attached my current
 patches.

 I was seeing cache misses after long stress runs, which I think is
 coming from references on the higher levels of the tree making us skip
 some leaves while dropping the transaction that added them.

 My new version uses a single cache per root, and should avoid these
 misses.

Just curious, what is the cache size and or eviction policy?
Is it LFU or LRU, fifo, something custom? Would ARC be a good policy
for this purpose?

Sorry if this is totally off-topic or clueless, just curious.. hell,
I'm not even sure what data/info/object is the cache for...

Kind regards,


--
Miguel Sousa Filipe
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] initial version of reference cache

2008-07-26 Thread Yan Zheng
2008/7/26 Chris Mason [EMAIL PROTECTED]:
 I have modified this locally to always cache leaves, even when they
 don't have file extents in them.  That way, walk_down_tree will find the
 cache and won't have to read the leaf (that doesn't have any extents).

 So far, it is working very well.  I did a run with fs_mark to create 58
 million files and had very steady numbers.  The unmount took 4 seconds.
 It used to take over an hour.

 One question, why not use the block number (byte number) as the key to
 the rbtree instead of the key?


When dropping old snapshots, tree leaves are processed in ascending
order of btrfs_key. After a given tree leaf is processed, we remove
the corresponding cache entry and update tree-last to point to next
entry in the tree. Therefore btrfs_lookup_leaf_ref can find the wanted
entry in tree-last in most cases.

Regards
YZ
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] initial version of reference cache

2008-07-25 Thread Yan Zheng
I miss two new created files in previous patch, please use this one. Thanks

---
diff -r eb4767aa190e Makefile
--- a/Makefile  Thu Jul 24 12:25:50 2008 -0400
+++ b/Makefile  Sat Jul 26 03:47:26 2008 +0800
@@ -6,7 +6,8 @@ btrfs-y := super.o ctree.o extent-tree.o
   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-  extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o
+  extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+  ref-cache.o

 btrfs-$(CONFIG_FS_POSIX_ACL)   += acl.o
 else
diff -r eb4767aa190e ctree.c
--- a/ctree.c   Thu Jul 24 12:25:50 2008 -0400
+++ b/ctree.c   Sat Jul 26 03:47:26 2008 +0800
@@ -165,7 +165,7 @@ int btrfs_copy_root(struct btrfs_trans_h
btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);

 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
-	ret = btrfs_inc_ref(trans, new_root, buf);
+	ret = btrfs_inc_ref(trans, new_root, buf, 0);
 	kfree(new_root);
kfree(new_root);

if (ret)
@@ -232,7 +232,7 @@ int __btrfs_cow_block(struct btrfs_trans
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (btrfs_header_generation(buf) != trans->transid) {
 		different_trans = 1;
-		ret = btrfs_inc_ref(trans, root, buf);
+		ret = btrfs_inc_ref(trans, root, buf, 1);
if (ret)
return ret;
} else {
diff -r eb4767aa190e ctree.h
--- a/ctree.h   Thu Jul 24 12:25:50 2008 -0400
+++ b/ctree.h   Sat Jul 26 03:47:26 2008 +0800
@@ -592,6 +592,10 @@ struct btrfs_fs_info {
u64 last_alloc;
u64 last_data_alloc;

+   spinlock_t ref_cache_lock;
+   u64 total_ref_cache_size;
+   u64 running_ref_cache_size;
+
u64 avail_data_alloc_bits;
u64 avail_metadata_alloc_bits;
u64 avail_system_alloc_bits;
@@ -613,6 +617,8 @@ struct btrfs_root {
spinlock_t node_lock;

struct extent_buffer *commit_root;
+   struct btrfs_leaf_ref_tree *ref_tree;
+
struct btrfs_root_item root_item;
struct btrfs_key root_key;
struct btrfs_fs_info *fs_info;
@@ -1430,7 +1436,7 @@ int btrfs_reserve_extent(struct btrfs_tr
  u64 search_end, struct btrfs_key *ins,
  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf);
+ struct extent_buffer *buf, int cache_ref);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
  *root, u64 bytenr, u64 num_bytes,
  u64 root_objectid, u64 ref_generation,
diff -r eb4767aa190e disk-io.c
--- a/disk-io.c Thu Jul 24 12:25:50 2008 -0400
+++ b/disk-io.c Sat Jul 26 03:47:26 2008 +0800
@@ -716,6 +716,7 @@ static int __setup_root(u32 nodesize, u3
 	root->node = NULL;
 	root->inode = NULL;
 	root->commit_root = NULL;
+	root->ref_tree = NULL;
 	root->sectorsize = sectorsize;
 	root->nodesize = nodesize;
 	root->leafsize = leafsize;
@@ -1165,12 +1166,19 @@ static int transaction_kthread(void *arg
 	vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 	mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
+	printk("btrfs: total reference cache size %Lu\n",
+		root->fs_info->total_ref_cache_size);
+
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur = root->fs_info->running_transaction;
 	if (!cur) {
 		mutex_unlock(&root->fs_info->trans_mutex);
 		goto sleep;
 	}
+
+	printk("btrfs: running reference cache size %Lu\n",
+		root->fs_info->running_ref_cache_size);
+
 	now = get_seconds();
 	if (now < cur->start_time || now - cur->start_time > 30) {
 		mutex_unlock(&root->fs_info->trans_mutex);
@@ -1233,6 +1241,7 @@ struct btrfs_root *open_ctree(struct sup
 	spin_lock_init(&fs_info->hash_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
+	spin_lock_init(&fs_info->ref_cache_lock);
 
 	init_completion(&fs_info->kobj_unregister);
 	fs_info->tree_root = tree_root;
@@ -1699,6 +1708,11 @@ int close_ctree(struct btrfs_root *root)
 		printk("btrfs: at unmount delalloc count %Lu\n",
 		       fs_info->delalloc_bytes);
 	}
+	if (fs_info->total_ref_cache_size) {
+		printk("btrfs: at umount reference cache size %Lu\n",
+			fs_info->total_ref_cache_size);
+	}
+
 	if (fs_info->extent_root->node)
 		free_extent_buffer(fs_info->extent_root->node);

diff -r eb4767aa190e extent-tree.c
--- 

Re: [PATCH] initial version of reference cache

2008-07-25 Thread Chris Mason
On Fri, 2008-07-25 at 14:29 -0500, Yan Zheng wrote:
 Hello,
 
 This is the initial version of leaf reference cache. The cache stores leaf 
 node's extent references in memory, this can improve the performance of 
 snapshot dropping. Outlines of this patch are (1) allocate struct dirty_root 
 when starting transaction (2) put reference cache in struct dirty_root (3) 
 cache extent references when tree leaves are cow'ed (4) when dropping 
 snapshot, use cached references directly to avoid reading tree leaf. 
 
 I only can access a notebook currenly, so benchmarking isn't enough. I 
 appreciate any help and comment.
 

I have modified this locally to always cache leaves, even when they
don't have file extents in them.  That way, walk_down_tree will find the
cache and won't have to read the leaf (that doesn't have any extents).

So far, it is working very well.  I did a run with fs_mark to create 58
million files and had very steady numbers.  The unmount took 4 seconds.
It used to take over an hour.

One question, why not use the block number (byte number) as the key to
the rbtree instead of the key?

-chris


--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html