Re: [PATCH] speed up end_fde_sort using radix sort

2022-12-16 Thread Jason Merrill via Gcc-patches

On 11/22/22 03:00, Thomas Neumann wrote:

When registering a dynamic unwinding frame the fde list is sorted.
Previously, we split the list into a sorted and an unsorted part,
sorted the later using heap sort, and merged both. That can be
quite slow due to the large number of (expensive) comparisons.

This patch replaces that logic with a radix sort instead. The
radix sort uses the same amount of memory as the old logic,
using the second list as auxiliary space, and it includes two
techniques to speed up sorting: First, it computes the pointer
addresses for blocks of values, reducing the decoding overhead.
And it recognizes when the data has reached a sorted state,
allowing for early termination. When running out of memory
we fall back to pure heap sort, as before.

For this test program

#include 
int main(int argc, char** argv) {
  return 0;
}

compiled with g++ -O -o hello -static hello.c we get with
perf stat -r 200 on a 5950X the following performance numbers:

old logic:

   0,20 msec task-clock
    930.834  cycles
  3.079.765  instructions
     0,00030478 +- 0,0237 seconds time elapsed

new logic:

   0,10 msec task-clock
    473.269  cycles
  1.239.077  instructions
     0,00021119 +- 0,0168 seconds time elapsed

libgcc/ChangeLog:
     * unwind-dw2-fde.c: Use radix sort instead of split+sort+merge.
---
  libgcc/unwind-dw2-fde.c | 234 +++-
  1 file changed, 134 insertions(+), 100 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 3c0cc654ec0..b81540c41a4 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -456,22 +456,52 @@ fde_mixed_encoding_compare (struct object *ob, 
const fde *x, const fde *y)


  typedef int (*fde_compare_t) (struct object *, const fde *, const fde *);

+// The extractor functions compute the pointer values for a block of
+// fdes. The block processing hides the call overhead.

-/* This is a special mix of insertion sort and heap sort, optimized for
-   the data sets that actually occur. They look like
-   101 102 103 127 128 105 108 110 190 111 115 119 125 160 126 129 130.
-   I.e. a linearly increasing sequence (coming from functions in the text
-   section), with additionally a few unordered elements (coming from 
functions
-   in gnu_linkonce sections) whose values are higher than the values in 
the

-   surrounding linear sequence (but not necessarily higher than the values
-   at the end of the linear sequence!).
-   The worst-case total run time is O(N) + O(n log (n)), where N is the
-   total number of FDEs and n is the number of erratic ones.  */
+static void
+fde_unencoded_extract (struct object *ob __attribute__ ((unused)),
+   _Unwind_Ptr *target, const fde **x, int count)
+{
+  for (int index = 0; index < count; ++index)
+    memcpy (target + index, x[index]->pc_begin, sizeof (_Unwind_Ptr));
+}
+
+static void
+fde_single_encoding_extract (struct object *ob, _Unwind_Ptr *target,
+ const fde **x, int count)
+{
+  _Unwind_Ptr base;
+
+  base = base_from_object (ob->s.b.encoding, ob);
+  for (int index = 0; index < count; ++index)
+    read_encoded_value_with_base (ob->s.b.encoding, base, 
x[index]->pc_begin,

+  target + index);
+}
+
+static void
+fde_mixed_encoding_extract (struct object *ob, _Unwind_Ptr *target,
+    const fde **x, int count)
+{
+  for (int index = 0; index < count; ++index)
+    {
+  int encoding = get_fde_encoding (x[index]);
+  read_encoded_value_with_base (encoding, base_from_object 
(encoding, ob),

+    x[index]->pc_begin, target + index);
+    }
+}
+
+typedef void (*fde_extractor_t) (struct object *, _Unwind_Ptr *, const 
fde **,

+ int);
+
+// Data is is sorted using radix sort if possible, using an temporary
+// auxiliary data structure of the same size as the input. When running
+// out of memory to in-place heap sort.


s/to/do/ ?

OK with that change or other fix to that sentence.


  struct fde_accumulator
  {
    struct fde_vector *linear;
-  struct fde_vector *erratic;
+  struct fde_vector *aux;
  };

  static inline int
@@ -485,8 +515,8 @@ start_fde_sort (struct fde_accumulator *accu, size_t 
count)

    if ((accu->linear = malloc (size)))
  {
    accu->linear->count = 0;
-  if ((accu->erratic = malloc (size)))
-    accu->erratic->count = 0;
+  if ((accu->aux = malloc (size)))
+    accu->aux->count = 0;
    return 1;
  }
    else
@@ -500,59 +530,6 @@ fde_insert (struct fde_accumulator *accu, const fde 
*this_fde)

  accu->linear->array[accu->linear->count++] = this_fde;
  }

-/* Split LINEAR into a linear sequence with low values and an erratic
-   sequence with high values, put the linear one (of longest possible
-   length) into LINEAR and the erratic one into ERRATIC. This is O(N).
-
-   Because the longest linear sequence we 

[PATCH] speed up end_fde_sort using radix sort

2022-11-22 Thread Thomas Neumann via Gcc-patches

When registering a dynamic unwinding frame the fde list is sorted.
Previously, we split the list into a sorted and an unsorted part,
sorted the later using heap sort, and merged both. That can be
quite slow due to the large number of (expensive) comparisons.

This patch replaces that logic with a radix sort instead. The
radix sort uses the same amount of memory as the old logic,
using the second list as auxiliary space, and it includes two
techniques to speed up sorting: First, it computes the pointer
addresses for blocks of values, reducing the decoding overhead.
And it recognizes when the data has reached a sorted state,
allowing for early termination. When running out of memory
we fall back to pure heap sort, as before.

For this test program

#include 
int main(int argc, char** argv) {
 return 0;
}

compiled with g++ -O -o hello -static hello.c we get with
perf stat -r 200 on a 5950X the following performance numbers:

old logic:

  0,20 msec task-clock
   930.834  cycles
 3.079.765  instructions
0,00030478 +- 0,0237 seconds time elapsed

new logic:

  0,10 msec task-clock
   473.269  cycles
 1.239.077  instructions
0,00021119 +- 0,0168 seconds time elapsed

libgcc/ChangeLog:
* unwind-dw2-fde.c: Use radix sort instead of split+sort+merge.
---
 libgcc/unwind-dw2-fde.c | 234 +++-
 1 file changed, 134 insertions(+), 100 deletions(-)

diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 3c0cc654ec0..b81540c41a4 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -456,22 +456,52 @@ fde_mixed_encoding_compare (struct object *ob, const fde 
*x, const fde *y)
 
 typedef int (*fde_compare_t) (struct object *, const fde *, const fde *);
 
+// The extractor functions compute the pointer values for a block of

+// fdes. The block processing hides the call overhead.
 
-/* This is a special mix of insertion sort and heap sort, optimized for

-   the data sets that actually occur. They look like
-   101 102 103 127 128 105 108 110 190 111 115 119 125 160 126 129 130.
-   I.e. a linearly increasing sequence (coming from functions in the text
-   section), with additionally a few unordered elements (coming from functions
-   in gnu_linkonce sections) whose values are higher than the values in the
-   surrounding linear sequence (but not necessarily higher than the values
-   at the end of the linear sequence!).
-   The worst-case total run time is O(N) + O(n log (n)), where N is the
-   total number of FDEs and n is the number of erratic ones.  */
+static void
+fde_unencoded_extract (struct object *ob __attribute__ ((unused)),
+  _Unwind_Ptr *target, const fde **x, int count)
+{
+  for (int index = 0; index < count; ++index)
+memcpy (target + index, x[index]->pc_begin, sizeof (_Unwind_Ptr));
+}
+
+static void
+fde_single_encoding_extract (struct object *ob, _Unwind_Ptr *target,
+const fde **x, int count)
+{
+  _Unwind_Ptr base;
+
+  base = base_from_object (ob->s.b.encoding, ob);
+  for (int index = 0; index < count; ++index)
+read_encoded_value_with_base (ob->s.b.encoding, base, x[index]->pc_begin,
+ target + index);
+}
+
+static void
+fde_mixed_encoding_extract (struct object *ob, _Unwind_Ptr *target,
+   const fde **x, int count)
+{
+  for (int index = 0; index < count; ++index)
+{
+  int encoding = get_fde_encoding (x[index]);
+  read_encoded_value_with_base (encoding, base_from_object (encoding, ob),
+   x[index]->pc_begin, target + index);
+}
+}
+
+typedef void (*fde_extractor_t) (struct object *, _Unwind_Ptr *, const fde **,
+int);
+
+// Data is is sorted using radix sort if possible, using an temporary
+// auxiliary data structure of the same size as the input. When running
+// out of memory to in-place heap sort.
 
 struct fde_accumulator

 {
   struct fde_vector *linear;
-  struct fde_vector *erratic;
+  struct fde_vector *aux;
 };
 
 static inline int

@@ -485,8 +515,8 @@ start_fde_sort (struct fde_accumulator *accu, size_t count)
   if ((accu->linear = malloc (size)))
 {
   accu->linear->count = 0;
-  if ((accu->erratic = malloc (size)))
-   accu->erratic->count = 0;
+  if ((accu->aux = malloc (size)))
+   accu->aux->count = 0;
   return 1;
 }
   else
@@ -500,59 +530,6 @@ fde_insert (struct fde_accumulator *accu, const fde 
*this_fde)
 accu->linear->array[accu->linear->count++] = this_fde;
 }
 
-/* Split LINEAR into a linear sequence with low values and an erratic

-   sequence with high values, put the linear one (of longest possible
-   length) into LINEAR and the erratic one into ERRATIC. This is O(N).
-
-   Because the longest linear sequence we are trying to locate within the
-   incoming LINEAR array