Re: [PATCH] initial version of reference cache
Yan and I are hammering this out a little, I've attached my current patches. I was seeing cache misses after long stress runs, which I think is coming from references on the higher levels of the tree making us skip some leaves while dropping the transaction that added them. My new version using a single cache per root, and should avoid these misses. diff -r c038dde2ad20 extent-tree.c --- a/extent-tree.c Fri Jul 25 15:58:39 2008 -0400 +++ b/extent-tree.c Sun Jul 27 06:39:00 2008 -0400 @@ -994,7 +994,7 @@ int btrfs_inc_ref(struct btrfs_trans_han } } /* cache orignal leaf block's references */ - if (cache_ref nr_file_extents 0) { + if (level == 0 cache_ref buf != root-commit_root) { struct btrfs_leaf_ref *ref; struct btrfs_extent_info *info; @@ -1012,7 +1012,7 @@ int btrfs_inc_ref(struct btrfs_trans_han ref-nritems = nr_file_extents; info = ref-extents; - for (i = 0; i nritems; i++) { + for (i = 0; nr_file_extents 0 i nritems; i++) { u64 disk_bytenr; btrfs_item_key_to_cpu(buf, key, i); if (btrfs_key_type(key) != BTRFS_EXTENT_DATA_KEY) @@ -2490,7 +2490,6 @@ static int noinline walk_down_tree(struc if (path-slots[*level] == 0) reada_walk_down(root, cur, path-slots[*level]); - next = read_tree_block(root, bytenr, blocksize, ptr_gen); cond_resched(); diff -r c038dde2ad20 ref-cache.c --- a/ref-cache.c Fri Jul 25 15:58:39 2008 -0400 +++ b/ref-cache.c Sun Jul 27 06:39:00 2008 -0400 @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. 
*/ +#include linux/sched.h #include ctree.h #include ref-cache.h #include transaction.h @@ -110,6 +111,34 @@ static struct rb_node *tree_search(struc return NULL; } +int btrfs_remove_leaf_refs(struct btrfs_root *root) +{ + struct rb_node *rb; + struct btrfs_leaf_ref *ref = NULL; + struct btrfs_leaf_ref_tree *tree = root-ref_tree; + + if (!tree) + return 0; + + spin_lock(tree-lock); + while(!btrfs_leaf_ref_tree_empty(tree)) { + tree-last = NULL; + rb = rb_first(tree-root); + ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node); + rb_erase(ref-rb_node, tree-root); + ref-in_tree = 0; + + spin_unlock(tree-lock); + + btrfs_free_leaf_ref(ref); + + cond_resched(); + spin_lock(tree-lock); + } + spin_unlock(tree-lock); + return 0; +} + struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, struct btrfs_key *key) { @@ -170,8 +199,6 @@ int btrfs_remove_leaf_ref(struct btrfs_r BUG_ON(!ref-in_tree); spin_lock(tree-lock); - rb_erase(ref-rb_node, tree-root); - ref-in_tree = 0; spin_lock(root-fs_info-ref_cache_lock); root-fs_info-total_ref_cache_size -= size; @@ -187,6 +214,10 @@ int btrfs_remove_leaf_ref(struct btrfs_r } else tree-last = NULL; } + + rb_erase(ref-rb_node, tree-root); + ref-in_tree = 0; + spin_unlock(tree-lock); btrfs_free_leaf_ref(ref); diff -r c038dde2ad20 ref-cache.h --- a/ref-cache.h Fri Jul 25 15:58:39 2008 -0400 +++ b/ref-cache.h Sun Jul 27 06:39:00 2008 -0400 @@ -68,4 +68,5 @@ struct btrfs_leaf_ref *btrfs_lookup_leaf struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, struct btrfs_key *key); int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); +int btrfs_remove_leaf_refs(struct btrfs_root *root); int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); diff -r c038dde2ad20 transaction.c --- a/transaction.c Fri Jul 25 15:58:39 2008 -0400 +++ b/transaction.c Sun Jul 27 06:39:00 2008 -0400 @@ -539,9 +539,8 @@ static noinline int drop_dirty_roots(str ret = btrfs_end_transaction(trans, 
tree_root); BUG_ON(ret); - if (dirty-root-ref_tree) - WARN_ON(!btrfs_leaf_ref_tree_empty(dirty-root-ref_tree)); - + btrfs_remove_leaf_refs(dirty-root); + free_extent_buffer(dirty-root-node); kfree(dirty-root); kfree(dirty); diff -r cf052b443059 ctree.h --- a/ctree.h Sun Jul 27 06:39:00 2008 -0400 +++ b/ctree.h Mon Jul 28 11:03:41 2008 -0400 @@ -594,7 +594,6 @@ struct btrfs_fs_info { spinlock_t ref_cache_lock; u64 total_ref_cache_size; - u64 running_ref_cache_size; u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; @@ -606,10 +605,17 @@ struct btrfs_fs_info { void *bdev_holder; }; +struct btrfs_leaf_ref_tree { + struct rb_root root; + struct btrfs_leaf_ref *last; + spinlock_t lock; +}; + /* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. */ +struct dirty_root; struct btrfs_root { struct extent_buffer *node; @@ -618,6 +624,8 @@ struct btrfs_root { struct extent_buffer *commit_root; struct btrfs_leaf_ref_tree *ref_tree; + struct btrfs_leaf_ref_tree ref_tree_struct; + struct dirty_root *dirty_root; struct btrfs_root_item root_item; struct btrfs_key root_key; diff -r cf052b443059 disk-io.c --- a/disk-io.c Sun Jul 27 06:39:00 2008 -0400 +++ b/disk-io.c Mon Jul 28 11:03:41 2008 -0400 @@ -40,6 +40,7 @@ #include print-tree.h #include
[PATCH] initial version of reference cache
Hi all, On Mon, Jul 28, 2008 at 4:09 PM, Chris Mason [EMAIL PROTECTED] wrote: Yan and I are hammering this out a little, I've attached my current patches. I was seeing cache misses after long stress runs, which I think is coming from references on the higher levels of the tree making us skip some leaves while dropping the transaction that added them. My new version uses a single cache per root, and should avoid these misses. Just curious, what is the cache size and/or eviction policy? Is it LFU or LRU, fifo, something custom? Would ARC be a good policy for this purpose? Sorry if this is totally off-topic or clueless, just curious.. hell, I'm not even sure what data/info/object is the cache for... Kind regards, -- Miguel Sousa Filipe -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] initial version of reference cache
2008/7/26 Chris Mason [EMAIL PROTECTED]: I have modified this locally to always cache leaves, even when they don't have file extents in them. That way, walk_down_tree will find the cache and won't have to read the leaf (that doesn't have any extents). So far, it is working very well. I did a run with fs_mark to create 58 million files and had very steady numbers. The unmount took 4 seconds. It used to take over an hour. One question, why not use the block number (byte number) as the key to the rbtree instead of the key? When dropping old snapshots, tree leaves are processed in ascending order of btrfs_key. After a given tree leaf is processed, we remove the corresponding cache entry and update tree-last to point to next entry in the tree. Therefore btrfs_lookup_leaf_ref can find the wanted entry in tree-last in most cases. Regards YZ -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] initial version of reference cache
I miss two new created files in previous patch, please use this one. Thanks --- diff -r eb4767aa190e Makefile --- a/Makefile Thu Jul 24 12:25:50 2008 -0400 +++ b/Makefile Sat Jul 26 03:47:26 2008 +0800 @@ -6,7 +6,8 @@ btrfs-y := super.o ctree.o extent-tree.o hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ - extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o + extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ + ref-cache.o btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o else diff -r eb4767aa190e ctree.c --- a/ctree.c Thu Jul 24 12:25:50 2008 -0400 +++ b/ctree.c Sat Jul 26 03:47:26 2008 +0800 @@ -165,7 +165,7 @@ int btrfs_copy_root(struct btrfs_trans_h btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); WARN_ON(btrfs_header_generation(buf) trans-transid); - ret = btrfs_inc_ref(trans, new_root, buf); + ret = btrfs_inc_ref(trans, new_root, buf, 0); kfree(new_root); if (ret) @@ -232,7 +232,7 @@ int __btrfs_cow_block(struct btrfs_trans WARN_ON(btrfs_header_generation(buf) trans-transid); if (btrfs_header_generation(buf) != trans-transid) { different_trans = 1; - ret = btrfs_inc_ref(trans, root, buf); + ret = btrfs_inc_ref(trans, root, buf, 1); if (ret) return ret; } else { diff -r eb4767aa190e ctree.h --- a/ctree.h Thu Jul 24 12:25:50 2008 -0400 +++ b/ctree.h Sat Jul 26 03:47:26 2008 +0800 @@ -592,6 +592,10 @@ struct btrfs_fs_info { u64 last_alloc; u64 last_data_alloc; + spinlock_t ref_cache_lock; + u64 total_ref_cache_size; + u64 running_ref_cache_size; + u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; @@ -613,6 +617,8 @@ struct btrfs_root { spinlock_t node_lock; struct extent_buffer *commit_root; + struct btrfs_leaf_ref_tree *ref_tree; + struct btrfs_root_item root_item; struct btrfs_key root_key; struct btrfs_fs_info *fs_info; @@ -1430,7 +1436,7 @@ int btrfs_reserve_extent(struct 
btrfs_tr u64 search_end, struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf); + struct extent_buffer *buf, int cache_ref); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 root_objectid, u64 ref_generation, diff -r eb4767aa190e disk-io.c --- a/disk-io.c Thu Jul 24 12:25:50 2008 -0400 +++ b/disk-io.c Sat Jul 26 03:47:26 2008 +0800 @@ -716,6 +716,7 @@ static int __setup_root(u32 nodesize, u3 root-node = NULL; root-inode = NULL; root-commit_root = NULL; + root-ref_tree = NULL; root-sectorsize = sectorsize; root-nodesize = nodesize; root-leafsize = leafsize; @@ -1165,12 +1166,19 @@ static int transaction_kthread(void *arg vfs_check_frozen(root-fs_info-sb, SB_FREEZE_WRITE); mutex_lock(root-fs_info-transaction_kthread_mutex); + printk(btrfs: total reference cache size %Lu\n, + root-fs_info-total_ref_cache_size); + mutex_lock(root-fs_info-trans_mutex); cur = root-fs_info-running_transaction; if (!cur) { mutex_unlock(root-fs_info-trans_mutex); goto sleep; } + + printk(btrfs: running reference cache size %Lu\n, + root-fs_info-running_ref_cache_size); + now = get_seconds(); if (now cur-start_time || now - cur-start_time 30) { mutex_unlock(root-fs_info-trans_mutex); @@ -1233,6 +1241,7 @@ struct btrfs_root *open_ctree(struct sup spin_lock_init(fs_info-hash_lock); spin_lock_init(fs_info-delalloc_lock); spin_lock_init(fs_info-new_trans_lock); + spin_lock_init(fs_info-ref_cache_lock); init_completion(fs_info-kobj_unregister); fs_info-tree_root = tree_root; @@ -1699,6 +1708,11 @@ int close_ctree(struct btrfs_root *root) printk(btrfs: at unmount delalloc count %Lu\n, fs_info-delalloc_bytes); } + if (fs_info-total_ref_cache_size) { + printk(btrfs: at umount reference cache size %Lu\n, + fs_info-total_ref_cache_size); + } + if (fs_info-extent_root-node) free_extent_buffer(fs_info-extent_root-node); diff -r eb4767aa190e extent-tree.c ---
Re: [PATCH] initial version of reference cache
On Fri, 2008-07-25 at 14:29 -0500, Yan Zheng wrote: Hello, This is the initial version of leaf reference cache. The cache stores leaf node's extent references in memory, this can improve the performance of snapshot dropping. Outlines of this patch are (1) allocate struct dirty_root when starting transaction (2) put reference cache in struct dirty_root (3) cache extent references when tree leaves are cow'ed (4) when dropping snapshot, use cached references directly to avoid reading tree leaf. I can only access a notebook currently, so benchmarking isn't enough. I appreciate any help and comment. I have modified this locally to always cache leaves, even when they don't have file extents in them. That way, walk_down_tree will find the cache and won't have to read the leaf (that doesn't have any extents). So far, it is working very well. I did a run with fs_mark to create 58 million files and had very steady numbers. The unmount took 4 seconds. It used to take over an hour. One question, why not use the block number (byte number) as the key to the rbtree instead of the key? -chris -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html