The sort in the btree write buffer flush path is a very hot path, and
it's particularly performance sensitive since it's single threaded and
can block every other thread on a multithreaded write workload.

It's well worth doing a sort with inlined cmp and swap functions.

Signed-off-by: Kent Overstreet <[email protected]>
---
 fs/bcachefs/btree_write_buffer.c | 57 +++++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 9 deletions(-)

diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index e961cf33db1e..e482bbb767e6 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -10,23 +10,64 @@
 #include "journal_io.h"
 #include "journal_reclaim.h"
 
-#include <linux/sort.h>
-
 static int bch2_btree_write_buffer_journal_flush(struct journal *,
                                struct journal_entry_pin *, u64);
 
 static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *);
 
-static inline int wb_key_cmp(const void *_l, const void *_r)
+static inline int wb_key_cmp(struct btree_write_buffered_key_ref *l,
+                            struct btree_write_buffered_key_ref *r)
 {
-       const struct btree_write_buffered_key_ref *l = _l;
-       const struct btree_write_buffered_key_ref *r = _r;
-
        return  cmp_int(l->btree, r->btree) ?:
                bpos_cmp(l->pos, r->pos) ?:
                cmp_int(l->idx, r->idx);
 }
 
+static void wb_sort(struct btree_write_buffered_key_ref *base, size_t num)
+{
+       size_t n = num, a = num / 2;
+
+       if (!a)         /* num < 2 || size == 0 */
+               return;
+
+       for (;;) {
+               size_t b, c, d;
+
+               if (a)                  /* Building heap: sift down --a */
+                       --a;
+               else if (--n)           /* Sorting: Extract root to --n */
+                       swap(base[0], base[n]);
+               else                    /* Sort complete */
+                       break;
+
+               /*
+                * Sift element at "a" down into heap.  This is the
+                * "bottom-up" variant, which significantly reduces
+                * calls to cmp_func(): we find the sift-down path all
+                * the way to the leaves (one compare per level), then
+                * backtrack to find where to insert the target element.
+                *
+                * Because elements tend to sift down close to the leaves,
+                * this uses fewer compares than doing two per level
+                * on the way down.  (A bit more than half as many on
+                * average, 3/4 worst-case.)
+                */
+               for (b = a; c = 2*b + 1, (d = c + 1) < n;)
+                       b = wb_key_cmp(base + c, base + d) >= 0 ? c : d;
+               if (d == n)             /* Special case last leaf with no sibling */
+                       b = c;
+
+               /* Now backtrack from "b" to the correct location for "a" */
+               while (b != a && wb_key_cmp(base + a, base + b) >= 0)
+                       b = (b - 1) / 2;
+               c = b;                  /* Where "a" belongs */
+               while (b != a) {        /* Shift it into place */
+                       b = (b - 1) / 2;
+                       swap(base[b], base[c]);
+               }
+       }
+}
+
 static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
                                          struct btree_iter *iter,
                                          struct btree_write_buffered_key *wb)
@@ -202,9 +243,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
         * If that happens, simply skip the key so we can optimistically insert
         * as many keys as possible in the fast path.
         */
-       sort(wb->sorted.data, wb->sorted.nr,
-            sizeof(wb->sorted.data[0]),
-            wb_key_cmp, NULL);
+       wb_sort(wb->sorted.data, wb->sorted.nr);
 
        darray_for_each(wb->sorted, i) {
                struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
-- 
2.42.0


Reply via email to