Hi, (Background: 697492434 added 3 new sort functions to remove the indirect function calls for the comparator function. This sped up sorting for various of our built-in data types.)
There was a bit of unfinished discussion around exactly how far to take these specialisations for PG15. We could certainly add more. There are various other things we could do to further speed up sorting for these datatypes. One example is, we could add 3 more variations of these functions that can be called when there are no NULL datums to sort. That effectively multiplies the number of specialisations by 2, or adds another dimension. I have the following dimensions in mind for consideration: 1. Specialisations to handle sorting of non-null datums (eliminates checking for nulls in the comparison function) 2. Specialisations to handle single column sorts (eliminates tiebreaker function call or any checks for existence of tiebreaker) 3. ASC sort (No need for if (ssup->ssup_reverse) INVERT_COMPARE_RESULT(compare)) If we did all of the above then we'd end up with 3 * 2 * 2 * 2 = 24 specialization functions. That seems a bit excessive. So here I'd like to discuss which ones we should add, if any. I've attached a very basic implementation of #1 which adds 3 new functions for sorting non-null datums. This could be made a bit more advanced. For now, I just added a bool flag to track if we have any NULL datum1s in memtuples[]. For bounded sorts, we may remove NULLs from that array, and may end up with no nulls after having seen null. So maybe a count would be better than a flag. A quick performance test with 1 million random INTs shows ~6% performance improvement when there are no nulls. Master $ pgbench -n -f bench.sql -T 60 postgres latency average = 159.837 ms latency average = 161.193 ms latency average = 159.512 ms master + not_null_sort_specializations.patch $ pgbench -n -f bench.sql -T 60 postgres latency average = 150.791 ms latency average = 149.843 ms latency average = 150.319 ms I didn't test for any regression when there are NULLs and we're unable to use the new specializations. I'm hoping the null tracking will be almost free, but I will need to check. It's all quite subjective to know which specializations should be added. I think #1 is likely to have the biggest wins when it can be used as it removes the most branching in the comparator function, however the biggest gains are not the only thing to consider. We also need to consider how commonly these functions will be used. I don't have any information about that. David
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index d90a1aebdf..52927f93b8 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -218,6 +218,8 @@ struct Tuplesortstate int memtupsize; /* allocated length of memtuples array */ bool growmemtuples; /* memtuples' growth still underway? */ + bool memtuples_hasnull; /* does any SortTuple in memtuples have + * isnull1 set to true? */ /* * Memory for tuples is sometimes allocated using a simple slab allocator, * rather than with palloc(). Currently, we switch to slab allocation @@ -496,7 +498,10 @@ static void tuplesort_updatemax(Tuplesortstate *state); * abbreviations of text or multi-key sorts. There could be! Is it worth it? */ -/* Used if first key's comparator is ssup_datum_unsigned_compare */ +/* + * Used if first key's comparator is ssup_datum_unsigned_compare and + * state->memtuples_hasnull is true + */ static pg_attribute_always_inline int qsort_tuple_unsigned_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) { @@ -518,8 +523,39 @@ qsort_tuple_unsigned_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) return state->base.comparetup(a, b, state); } +/* + * Used if first key's comparator is ssup_datum_unsigned_compare and + * state->memtuples_hasnull is false + */ +static pg_attribute_always_inline int +qsort_tuple_unsigned_compare_notnull(SortTuple *a, SortTuple *b, + Tuplesortstate *state) +{ + int compare; + + /* make sure the null tracking code didn't mess up */ + Assert(!a->isnull1 && !b->isnull1); + + compare = ApplyUnsignedSortComparatorNotNull(a->datum1, b->datum1, + &state->base.sortKeys[0]); + if (compare != 0) + return compare; + + /* + * No need to waste effort calling the tiebreak function when there are no + * other keys to sort on. + */ + if (state->base.onlyKey != NULL) + return 0; + + return state->base.comparetup(a, b, state); +} + #if SIZEOF_DATUM >= 8 -/* Used if first key's comparator is ssup_datum_signed_compare */ +/* + * Used if first key's comparator is ssup_datum_signed_compare and + * state->memtuples_hasnull is true + */ static pg_attribute_always_inline int qsort_tuple_signed_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) { @@ -541,9 +577,42 @@ qsort_tuple_signed_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) return state->base.comparetup(a, b, state); } + +/* + * Used if first key's comparator is ssup_datum_signed_compare and + * state->memtuples_hasnull is false + */ +static pg_attribute_always_inline int +qsort_tuple_signed_compare_notnull(SortTuple *a, SortTuple *b, + Tuplesortstate *state) +{ + int compare; + + /* make sure the null tracking code didn't mess up */ + Assert(!a->isnull1 && !b->isnull1); + + compare = ApplySignedSortComparatorNotNull(a->datum1, b->datum1, + &state->base.sortKeys[0]); + + if (compare != 0) + return compare; + + /* + * No need to waste effort calling the tiebreak function when there are no + * other keys to sort on. + */ + if (state->base.onlyKey != NULL) + return 0; + + return state->base.comparetup(a, b, state); +} + #endif -/* Used if first key's comparator is ssup_datum_int32_compare */ +/* + * Used if first key's comparator is ssup_datum_int32_compare and + * state->memtuples_hasnull is true + */ static pg_attribute_always_inline int qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) { @@ -566,6 +635,35 @@ qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) return state->base.comparetup(a, b, state); } +/* + * Used if first key's comparator is ssup_datum_int32_compare and + * state->memtuples_hasnull is false + */ +static pg_attribute_always_inline int +qsort_tuple_int32_compare_notnull(SortTuple *a, SortTuple *b, + Tuplesortstate *state) +{ + int compare; + + /* make sure the null tracking code didn't mess up */ + Assert(!a->isnull1 && !b->isnull1); + + compare = ApplyInt32SortComparatorNotNull(a->datum1, b->datum1, + &state->base.sortKeys[0]); + + if (compare != 0) + return compare; + + /* + * No need to waste effort calling the tiebreak function when there are no + * other keys to sort on. + */ + if (state->base.onlyKey != NULL) + return 0; + + return state->base.comparetup(a, b, state); +} + /* * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts * any variant of SortTuples, using the appropriate comparetup function. @@ -584,6 +682,15 @@ qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) #define ST_DEFINE #include "lib/sort_template.h" +#define ST_SORT qsort_tuple_unsigned_notnull +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE(a, b, state) qsort_tuple_unsigned_compare_notnull(a, b, state) +#define ST_COMPARE_ARG_TYPE Tuplesortstate +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" + #if SIZEOF_DATUM >= 8 #define ST_SORT qsort_tuple_signed #define ST_ELEMENT_TYPE SortTuple @@ -593,6 +700,15 @@ qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) #define ST_SCOPE static #define ST_DEFINE #include "lib/sort_template.h" + +#define ST_SORT qsort_tuple_signed_notnull +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE(a, b, state) qsort_tuple_signed_compare_notnull(a, b, state) +#define ST_COMPARE_ARG_TYPE Tuplesortstate +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" #endif #define ST_SORT qsort_tuple_int32 @@ -604,6 +720,15 @@ qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state) #define ST_DEFINE #include "lib/sort_template.h" +#define ST_SORT qsort_tuple_int32_notnull +#define ST_ELEMENT_TYPE SortTuple +#define ST_COMPARE(a, b, state) qsort_tuple_int32_compare_notnull(a, b, state) +#define ST_COMPARE_ARG_TYPE Tuplesortstate +#define ST_CHECK_FOR_INTERRUPTS +#define ST_SCOPE static +#define ST_DEFINE +#include "lib/sort_template.h" + #define ST_SORT qsort_tuple #define ST_ELEMENT_TYPE SortTuple #define ST_COMPARE_RUNTIME_POINTER @@ -801,6 +926,9 @@ tuplesort_begin_batch(Tuplesortstate *state) * see comments in grow_memtuples(). */ state->growmemtuples = true; + + state->memtuples_hasnull = false; + state->slabAllocatorUsed = false; if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE) { @@ -1247,6 +1375,7 @@ tuplesort_puttuple_common(Tuplesortstate *state, SortTuple *tuple, bool useAbbre Assert(state->memtupcount < state->memtupsize); } state->memtuples[state->memtupcount++] = *tuple; + state->memtuples_hasnull |= tuple->isnull1; /* * Check if it's time to switch over to a bounded heapsort. We do @@ -1325,6 +1454,7 @@ tuplesort_puttuple_common(Tuplesortstate *state, SortTuple *tuple, bool useAbbre * Save the tuple into the unsorted array (there must be space) */ state->memtuples[state->memtupcount++] = *tuple; + state->memtuples_hasnull |= tuple->isnull1; /* * If we are over the memory limit, dump all tuples. @@ -2421,6 +2551,7 @@ dumptuples(Tuplesortstate *state, bool alltuples) WRITETUP(state, state->destTape, &state->memtuples[i]); state->memtupcount--; } + state->memtuples_hasnull = false; /* * Reset tuple memory. We've freed all of the tuples that we previously @@ -2644,6 +2775,7 @@ make_bounded_heap(Tuplesortstate *state) reversedirection(state); state->memtupcount = 0; /* make the heap empty */ + state->memtuples_hasnull = false; for (i = 0; i < tupcount; i++) { if (state->memtupcount < state->bound) @@ -2733,25 +2865,40 @@ tuplesort_sort_memtuples(Tuplesortstate *state) { if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp) { - qsort_tuple_unsigned(state->memtuples, - state->memtupcount, - state); + if (state->memtuples_hasnull) + qsort_tuple_unsigned(state->memtuples, + state->memtupcount, + state); + else + qsort_tuple_unsigned_notnull(state->memtuples, + state->memtupcount, + state); return; } #if SIZEOF_DATUM >= 8 else if (state->base.sortKeys[0].comparator == ssup_datum_signed_cmp) { - qsort_tuple_signed(state->memtuples, - state->memtupcount, - state); + if (state->memtuples_hasnull) + qsort_tuple_signed(state->memtuples, + state->memtupcount, + state); + else + qsort_tuple_signed_notnull(state->memtuples, + state->memtupcount, + state); return; } #endif else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp) { - qsort_tuple_int32(state->memtuples, - state->memtupcount, - state); + if (state->memtuples_hasnull) + qsort_tuple_int32(state->memtuples, + state->memtupcount, + state); + else + qsort_tuple_int32_notnull(state->memtuples, + state->memtupcount, + state); return; } } @@ -2788,6 +2935,7 @@ tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple) int j; memtuples = state->memtuples; + state->memtuples_hasnull |= tuple->isnull1; Assert(state->memtupcount < state->memtupsize); CHECK_FOR_INTERRUPTS(); diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h index 8c36cf8d82..1ed8ebe645 100644 --- a/src/include/utils/sortsupport.h +++ b/src/include/utils/sortsupport.h @@ -262,6 +262,19 @@ ApplyUnsignedSortComparator(Datum datum1, bool isNull1, return compare; } +static inline int +ApplyUnsignedSortComparatorNotNull(Datum datum1, Datum datum2, + SortSupport ssup) +{ + int compare; + + compare = datum1 < datum2 ? -1 : datum1 > datum2 ? 1 : 0; + if (ssup->ssup_reverse) + INVERT_COMPARE_RESULT(compare); + + return compare; +} + #if SIZEOF_DATUM >= 8 static inline int ApplySignedSortComparator(Datum datum1, bool isNull1, @@ -296,6 +309,21 @@ ApplySignedSortComparator(Datum datum1, bool isNull1, return compare; } + +static inline int +ApplySignedSortComparatorNotNull(Datum datum1, Datum datum2, + SortSupport ssup) +{ + int compare; + + compare = DatumGetInt64(datum1) < DatumGetInt64(datum2) ? -1 : + DatumGetInt64(datum1) > DatumGetInt64(datum2) ? 1 : 0; + if (ssup->ssup_reverse) + INVERT_COMPARE_RESULT(compare); + + return compare; +} + #endif static inline int @@ -332,6 +360,19 @@ ApplyInt32SortComparator(Datum datum1, bool isNull1, return compare; } +static inline int +ApplyInt32SortComparatorNotNull(Datum datum1, Datum datum2, SortSupport ssup) +{ + int compare; + + compare = DatumGetInt32(datum1) < DatumGetInt32(datum2) ? -1 : + DatumGetInt32(datum1) > DatumGetInt32(datum2) ? 1 : 0; + if (ssup->ssup_reverse) + INVERT_COMPARE_RESULT(compare); + + return compare; +} + /* * Apply a sort comparator function and return a 3-way comparison using full, * authoritative comparator. This takes care of handling reverse-sort and