When doing DIO tracing I noticed we were doing a ton of allocations, a lot of the time for extent_states. Some of the time we don't even use the prealloc'ed extent_state, it just get's free'd up. So instead create a per-cpu cache like the radix tree stuff. So we will check to see if our per-cpu cache has a prealloc'ed extent_state in it and if so we just continue, else we alloc a new one and fill the cache. Then if we need to use a prealloc'ed extent_state we can just take it out of our per-cpu cache. We will also refill the cache on free to try and limit the number of times we have to ask the allocator for caches. With this patch dbench 50 goes from ~210 mb/s to ~260 mb/s. Thanks,
Signed-off-by: Josef Bacik <jo...@redhat.com> --- V1->V2: Fix a uneven preempt_disable() when we use a prealloc but still have to search again. fs/btrfs/extent_io.c | 167 +++++++++++++++++++++++++++++++++++++------------ 1 files changed, 126 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7055d11..9adc614 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -12,6 +12,7 @@ #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/cleancache.h> +#include <linux/cpu.h> #include "extent_io.h" #include "extent_map.h" #include "compat.h" @@ -31,6 +32,8 @@ static DEFINE_SPINLOCK(leak_lock); #define BUFFER_LRU_MAX 64 +static DEFINE_PER_CPU(struct extent_state *, extent_state_preloads) = NULL; + struct tree_entry { u64 start; u64 end; @@ -71,10 +74,36 @@ free_state_cache: return -ENOMEM; } +static void __free_extent_state(struct extent_state *state) +{ + if (!state) + return; + if (atomic_dec_and_test(&state->refs)) { +#if LEAK_DEBUG + unsigned long flags; +#endif + WARN_ON(state->tree); +#if LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_del(&state->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_state_cache, state); + } +} void extent_io_exit(void) { struct extent_state *state; struct extent_buffer *eb; + int cpu; + + for_each_possible_cpu(cpu) { + state = per_cpu(extent_state_preloads, cpu); + if (!state) + continue; + per_cpu(extent_state_preloads, cpu) = NULL; + __free_extent_state(state); + } while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); @@ -114,16 +143,11 @@ void extent_io_tree_init(struct extent_io_tree *tree, tree->mapping = mapping; } -static struct extent_state *alloc_extent_state(gfp_t mask) +static void init_extent_state(struct extent_state *state) { - struct extent_state *state; #if LEAK_DEBUG unsigned long flags; #endif - - state = kmem_cache_alloc(extent_state_cache, mask); - if (!state) - return state; state->state = 0; state->private = 0; state->tree = NULL; @@ -134,6 +158,16 @@ static struct extent_state *alloc_extent_state(gfp_t mask) #endif atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); +} + +static struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; + + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state) + return state; + init_extent_state(state); return state; } @@ -142,6 +176,7 @@ void free_extent_state(struct extent_state *state) if (!state) return; if (atomic_dec_and_test(&state->refs)) { + struct extent_state *tmp; #if LEAK_DEBUG unsigned long flags; #endif @@ -151,10 +186,53 @@ void free_extent_state(struct extent_state *state) list_del(&state->leak_list); spin_unlock_irqrestore(&leak_lock, flags); #endif - kmem_cache_free(extent_state_cache, state); + preempt_disable(); + tmp = __get_cpu_var(extent_state_preloads); + if (!tmp) { + init_extent_state(state); + __get_cpu_var(extent_state_preloads) = state; + } else { + kmem_cache_free(extent_state_cache, state); + } + preempt_enable(); } } +/* + * Pre-load an extent state. + */ +static int extent_state_preload(gfp_t mask) +{ + struct extent_state *state; + + preempt_disable(); + state = __get_cpu_var(extent_state_preloads); + if (!state) { + struct extent_state *tmp; + preempt_enable(); + tmp = alloc_extent_state(mask); + if (!tmp) + return -ENOMEM; + preempt_disable(); + state = __get_cpu_var(extent_state_preloads); + if (state) + free_extent_state(tmp); + else + __get_cpu_var(extent_state_preloads) = tmp; + } + + return 0; +} + +/* + * Just re-enable preemption, this is here to make patch reviewing easier, to + * make sure we match up extent_state_preloads with the end() properly. + */ +static void extent_state_preload_end(void) +{ + preempt_enable(); +} + static struct rb_node *tree_insert(struct rb_root *root, u64 offset, struct rb_node *node) { @@ -441,13 +519,24 @@ static int clear_state_bit(struct extent_io_tree *tree, return ret; } -static struct extent_state * -alloc_extent_state_atomic(struct extent_state *prealloc) +static struct extent_state *get_prealloc_state(void) { - if (!prealloc) - prealloc = alloc_extent_state(GFP_ATOMIC); + struct extent_state *state; - return prealloc; + state = __get_cpu_var(extent_state_preloads); + __get_cpu_var(extent_state_preloads) = NULL; + return state; +} + +static struct extent_state *alloc_extent_state_atomic(void) +{ + struct extent_state *state; + + state = get_prealloc_state(); + if (!state) + state = alloc_extent_state(GFP_ATOMIC); + + return state; } /* @@ -477,6 +566,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int err; int set = 0; int clear = 0; + bool preload = !(mask & __GFP_WAIT); if (delete) bits |= ~EXTENT_CTLBITS; @@ -485,9 +575,8 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) clear = 1; again: - if (!prealloc && (mask & __GFP_WAIT)) { - prealloc = alloc_extent_state(mask); - if (!prealloc) + if (preload) { + if (extent_state_preload(mask)) return -ENOMEM; } @@ -540,11 +629,10 @@ hit_next: */ if (state->start < start) { - prealloc = alloc_extent_state_atomic(prealloc); + prealloc = alloc_extent_state_atomic(); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); BUG_ON(err == -EEXIST); - prealloc = NULL; if (err) goto out; if (state->end <= end) { @@ -562,7 +650,7 @@ hit_next: * on the first half */ if (state->start <= end && state->end > end) { - prealloc = alloc_extent_state_atomic(prealloc); + prealloc = alloc_extent_state_atomic(); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); @@ -571,11 +659,10 @@ hit_next: set |= clear_state_bit(tree, prealloc, &bits, wake); - prealloc = NULL; goto out; } - if (state->end < end && prealloc && !need_resched()) + if (state->end < end && !need_resched()) next_node = rb_next(&state->rb_node); else next_node = NULL; @@ -594,8 +681,8 @@ hit_next: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + if (preload) + extent_state_preload_end(); return set; @@ -603,8 +690,10 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); - if (mask & __GFP_WAIT) + if (preload) { + extent_state_preload_end(); cond_resched(); + } goto again; } @@ -731,12 +820,13 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int err = 0; u64 last_start; u64 last_end; + bool preload = !(mask & __GFP_WAIT); bits |= EXTENT_FIRST_DELALLOC; again: - if (!prealloc && (mask & __GFP_WAIT)) { - prealloc = alloc_extent_state(mask); - BUG_ON(!prealloc); + if (preload) { + if (extent_state_preload(mask)) + return -ENOMEM; } spin_lock(&tree->lock); @@ -753,10 +843,9 @@ again: */ node = tree_search(tree, start); if (!node) { - prealloc = alloc_extent_state_atomic(prealloc); + prealloc = alloc_extent_state_atomic(); BUG_ON(!prealloc); err = insert_state(tree, prealloc, start, end, &bits); - prealloc = NULL; BUG_ON(err == -EEXIST); goto out; } @@ -790,7 +879,7 @@ hit_next: goto out; start = last_end + 1; - if (next_node && start < end && prealloc && !need_resched()) { + if (next_node && start < end && !need_resched()) { state = rb_entry(next_node, struct extent_state, rb_node); if (state->start == start) @@ -822,11 +911,10 @@ hit_next: goto out; } - prealloc = alloc_extent_state_atomic(prealloc); + prealloc = alloc_extent_state_atomic(); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, start); BUG_ON(err == -EEXIST); - prealloc = NULL; if (err) goto out; if (state->end <= end) { @@ -855,7 +943,7 @@ hit_next: else this_end = last_start - 1; - prealloc = alloc_extent_state_atomic(prealloc); + prealloc = alloc_extent_state_atomic(); BUG_ON(!prealloc); /* @@ -868,12 +956,10 @@ hit_next: BUG_ON(err == -EEXIST); if (err) { free_extent_state(prealloc); - prealloc = NULL; goto out; } cache_state(prealloc, cached_state); free_extent_state(prealloc); - prealloc = NULL; start = this_end + 1; goto search_again; } @@ -890,19 +976,16 @@ hit_next: goto out; } - prealloc = alloc_extent_state_atomic(prealloc); + prealloc = alloc_extent_state_atomic(); BUG_ON(!prealloc); err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); err = set_state_bits(tree, prealloc, &bits); - if (err) { - prealloc = NULL; + if (err) goto out; - } cache_state(prealloc, cached_state); merge_state(tree, prealloc); - prealloc = NULL; goto out; } @@ -910,8 +993,8 @@ hit_next: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + if (preload) + extent_state_preload_end(); return err; @@ -919,6 +1002,8 @@ search_again: if (start > end) goto out; spin_unlock(&tree->lock); + if (preload) + extent_state_preload_end(); if (mask & __GFP_WAIT) cond_resched(); goto again; -- 1.7.5.2 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html