Re: [PATCH] speed up end_fde_sort using radix sort
On 11/22/22 03:00, Thomas Neumann wrote: When registering a dynamic unwinding frame the fde list is sorted. Previously, we split the list into a sorted and an unsorted part, sorted the later using heap sort, and merged both. That can be quite slow due to the large number of (expensive) comparisons. This patch replaces that logic with a radix sort instead. The radix sort uses the same amount of memory as the old logic, using the second list as auxiliary space, and it includes two techniques to speed up sorting: First, it computes the pointer addresses for blocks of values, reducing the decoding overhead. And it recognizes when the data has reached a sorted state, allowing for early termination. When running out of memory we fall back to pure heap sort, as before. For this test program #include int main(int argc, char** argv) { return 0; } compiled with g++ -O -o hello -static hello.c we get with perf stat -r 200 on a 5950X the following performance numbers: old logic: 0,20 msec task-clock 930.834 cycles 3.079.765 instructions 0,00030478 +- 0,0237 seconds time elapsed new logic: 0,10 msec task-clock 473.269 cycles 1.239.077 instructions 0,00021119 +- 0,0168 seconds time elapsed libgcc/ChangeLog: * unwind-dw2-fde.c: Use radix sort instead of split+sort+merge. --- libgcc/unwind-dw2-fde.c | 234 +++- 1 file changed, 134 insertions(+), 100 deletions(-) diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c index 3c0cc654ec0..b81540c41a4 100644 --- a/libgcc/unwind-dw2-fde.c +++ b/libgcc/unwind-dw2-fde.c @@ -456,22 +456,52 @@ fde_mixed_encoding_compare (struct object *ob, const fde *x, const fde *y) typedef int (*fde_compare_t) (struct object *, const fde *, const fde *); +// The extractor functions compute the pointer values for a block of +// fdes. The block processing hides the call overhead. -/* This is a special mix of insertion sort and heap sort, optimized for - the data sets that actually occur. They look like - 101 102 103 127 128 105 108 110 190 111 115 119 125 160 126 129 130. - I.e. a linearly increasing sequence (coming from functions in the text - section), with additionally a few unordered elements (coming from functions - in gnu_linkonce sections) whose values are higher than the values in the - surrounding linear sequence (but not necessarily higher than the values - at the end of the linear sequence!). - The worst-case total run time is O(N) + O(n log (n)), where N is the - total number of FDEs and n is the number of erratic ones. */ +static void +fde_unencoded_extract (struct object *ob __attribute__ ((unused)), + _Unwind_Ptr *target, const fde **x, int count) +{ + for (int index = 0; index < count; ++index) + memcpy (target + index, x[index]->pc_begin, sizeof (_Unwind_Ptr)); +} + +static void +fde_single_encoding_extract (struct object *ob, _Unwind_Ptr *target, + const fde **x, int count) +{ + _Unwind_Ptr base; + + base = base_from_object (ob->s.b.encoding, ob); + for (int index = 0; index < count; ++index) + read_encoded_value_with_base (ob->s.b.encoding, base, x[index]->pc_begin, + target + index); +} + +static void +fde_mixed_encoding_extract (struct object *ob, _Unwind_Ptr *target, + const fde **x, int count) +{ + for (int index = 0; index < count; ++index) + { + int encoding = get_fde_encoding (x[index]); + read_encoded_value_with_base (encoding, base_from_object (encoding, ob), + x[index]->pc_begin, target + index); + } +} + +typedef void (*fde_extractor_t) (struct object *, _Unwind_Ptr *, const fde **, + int); + +// Data is is sorted using radix sort if possible, using an temporary +// auxiliary data structure of the same size as the input. When running +// out of memory to in-place heap sort. s/to/do/ ? OK with that change or other fix to that sentence. struct fde_accumulator { struct fde_vector *linear; - struct fde_vector *erratic; + struct fde_vector *aux; }; static inline int @@ -485,8 +515,8 @@ start_fde_sort (struct fde_accumulator *accu, size_t count) if ((accu->linear = malloc (size))) { accu->linear->count = 0; - if ((accu->erratic = malloc (size))) - accu->erratic->count = 0; + if ((accu->aux = malloc (size))) + accu->aux->count = 0; return 1; } else @@ -500,59 +530,6 @@ fde_insert (struct fde_accumulator *accu, const fde *this_fde) accu->linear->array[accu->linear->count++] = this_fde; } -/* Split LINEAR into a linear sequence with low values and an erratic - sequence with high values, put the linear one (of longest possible - length) into LINEAR and the erratic one into ERRATIC. This is O(N). - - Because the longest linear sequence we
[PATCH] speed up end_fde_sort using radix sort
When registering a dynamic unwinding frame the fde list is sorted. Previously, we split the list into a sorted and an unsorted part, sorted the later using heap sort, and merged both. That can be quite slow due to the large number of (expensive) comparisons. This patch replaces that logic with a radix sort instead. The radix sort uses the same amount of memory as the old logic, using the second list as auxiliary space, and it includes two techniques to speed up sorting: First, it computes the pointer addresses for blocks of values, reducing the decoding overhead. And it recognizes when the data has reached a sorted state, allowing for early termination. When running out of memory we fall back to pure heap sort, as before. For this test program #include int main(int argc, char** argv) { return 0; } compiled with g++ -O -o hello -static hello.c we get with perf stat -r 200 on a 5950X the following performance numbers: old logic: 0,20 msec task-clock 930.834 cycles 3.079.765 instructions 0,00030478 +- 0,0237 seconds time elapsed new logic: 0,10 msec task-clock 473.269 cycles 1.239.077 instructions 0,00021119 +- 0,0168 seconds time elapsed libgcc/ChangeLog: * unwind-dw2-fde.c: Use radix sort instead of split+sort+merge. --- libgcc/unwind-dw2-fde.c | 234 +++- 1 file changed, 134 insertions(+), 100 deletions(-) diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c index 3c0cc654ec0..b81540c41a4 100644 --- a/libgcc/unwind-dw2-fde.c +++ b/libgcc/unwind-dw2-fde.c @@ -456,22 +456,52 @@ fde_mixed_encoding_compare (struct object *ob, const fde *x, const fde *y) typedef int (*fde_compare_t) (struct object *, const fde *, const fde *); +// The extractor functions compute the pointer values for a block of +// fdes. The block processing hides the call overhead. -/* This is a special mix of insertion sort and heap sort, optimized for - the data sets that actually occur. They look like - 101 102 103 127 128 105 108 110 190 111 115 119 125 160 126 129 130. - I.e. a linearly increasing sequence (coming from functions in the text - section), with additionally a few unordered elements (coming from functions - in gnu_linkonce sections) whose values are higher than the values in the - surrounding linear sequence (but not necessarily higher than the values - at the end of the linear sequence!). - The worst-case total run time is O(N) + O(n log (n)), where N is the - total number of FDEs and n is the number of erratic ones. */ +static void +fde_unencoded_extract (struct object *ob __attribute__ ((unused)), + _Unwind_Ptr *target, const fde **x, int count) +{ + for (int index = 0; index < count; ++index) +memcpy (target + index, x[index]->pc_begin, sizeof (_Unwind_Ptr)); +} + +static void +fde_single_encoding_extract (struct object *ob, _Unwind_Ptr *target, +const fde **x, int count) +{ + _Unwind_Ptr base; + + base = base_from_object (ob->s.b.encoding, ob); + for (int index = 0; index < count; ++index) +read_encoded_value_with_base (ob->s.b.encoding, base, x[index]->pc_begin, + target + index); +} + +static void +fde_mixed_encoding_extract (struct object *ob, _Unwind_Ptr *target, + const fde **x, int count) +{ + for (int index = 0; index < count; ++index) +{ + int encoding = get_fde_encoding (x[index]); + read_encoded_value_with_base (encoding, base_from_object (encoding, ob), + x[index]->pc_begin, target + index); +} +} + +typedef void (*fde_extractor_t) (struct object *, _Unwind_Ptr *, const fde **, +int); + +// Data is is sorted using radix sort if possible, using an temporary +// auxiliary data structure of the same size as the input. When running +// out of memory to in-place heap sort. struct fde_accumulator { struct fde_vector *linear; - struct fde_vector *erratic; + struct fde_vector *aux; }; static inline int @@ -485,8 +515,8 @@ start_fde_sort (struct fde_accumulator *accu, size_t count) if ((accu->linear = malloc (size))) { accu->linear->count = 0; - if ((accu->erratic = malloc (size))) - accu->erratic->count = 0; + if ((accu->aux = malloc (size))) + accu->aux->count = 0; return 1; } else @@ -500,59 +530,6 @@ fde_insert (struct fde_accumulator *accu, const fde *this_fde) accu->linear->array[accu->linear->count++] = this_fde; } -/* Split LINEAR into a linear sequence with low values and an erratic - sequence with high values, put the linear one (of longest possible - length) into LINEAR and the erratic one into ERRATIC. This is O(N). - - Because the longest linear sequence we are trying to locate within the - incoming LINEAR array