On Mon, Oct 28, 2024 at 9:58 PM Andi Kleen <[email protected]> wrote:
>
> From: Andi Kleen <[email protected]>
>
> The current switch bit test clustering enumerates all possible case
> clusters combinations to find ones that fit the bit test constrains
> best. This causes performance problems with very large switches.
>
> For bit test clustering which happens naturally in word sized chunks
> I don't think such an expensive algorithm is really needed.
>
> This patch implements a simple greedy algorithm that walks
> the sorted list and examines word sized windows and tries
> to cluster them.
>
> Surprisingly the new algorithm gives consistly better clusters
> for the examples I tried.
>
> For example from the gcc bootstrap:
>
> old: 0-15 16-31 96-175
> new: 0-31 96-175
>
> I'm not fully sure why that is, probably some bug in the old
> algorithm? This shows even up in the test suite where if-to-switch-6
> now can generate a switch, as well as a case in switch-1.c
>
> I don't have a proof that the new algorithm is always as good or better,
> but so far at least I don't see any counter examples.
>
> It also fixes the excessive compile time in PR117091,
> however this was already fixed by an earlier patch
> that doesn't run clustering when no targets have multiple
> values.
OK if you add a comment (as part of the function comment for example)
explaining the idea of the algorithm.
Thanks,
Richard.
> gcc/ChangeLog:
>
> PR middle-end/117091
> * tree-switch-conversion.cc (bit_test_cluster::find_bit_tests):
> Change clustering algorithm to simple greedy.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/if-to-switch-6.c: Allow condition chain.
> * gcc.dg/tree-ssa/switch-1.c: Allow more bit tests.
> ---
> .../gcc.dg/tree-ssa/if-to-switch-6.c | 2 +-
> gcc/testsuite/gcc.dg/tree-ssa/switch-1.c | 2 +-
> gcc/tree-switch-conversion.cc | 76 ++++++++++---------
> 3 files changed, 42 insertions(+), 38 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/if-to-switch-6.c
> b/gcc/testsuite/gcc.dg/tree-ssa/if-to-switch-6.c
> index b1640673eae1..657af770e438 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/if-to-switch-6.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/if-to-switch-6.c
> @@ -39,4 +39,4 @@ int main(int argc, char **argv)
> return 0;
> }
>
> -/* { dg-final { scan-tree-dump-not "Condition chain" "iftoswitch" } } */
> +/* { dg-final { scan-tree-dump "Condition chain" "iftoswitch" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/switch-1.c
> b/gcc/testsuite/gcc.dg/tree-ssa/switch-1.c
> index 6f70c9de0c19..f1654aba6d99 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/switch-1.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/switch-1.c
> @@ -107,4 +107,4 @@ int foo5 (int x)
> }
> }
>
> -/* { dg-final { scan-tree-dump ";; GIMPLE switch case clusters: BT:10-62
> 600-700 JT:1000-1021 111111" "switchlower1" } } */
> +/* { dg-final { scan-tree-dump ";; GIMPLE switch case clusters: BT:10-62
> 600-700 BT:1000-1021 111111" "switchlower1" } } */
> diff --git a/gcc/tree-switch-conversion.cc b/gcc/tree-switch-conversion.cc
> index 3436c2a8b98c..b7736a9853d9 100644
> --- a/gcc/tree-switch-conversion.cc
> +++ b/gcc/tree-switch-conversion.cc
> @@ -1782,55 +1782,59 @@ bit_test_cluster::find_bit_tests (vec<cluster *>
> &clusters, int max_c)
> return clusters.copy ();
>
> unsigned l = clusters.length ();
> - auto_vec<min_cluster_item> min;
> - min.reserve (l + 1);
> + vec<cluster *> output;
>
> - min.quick_push (min_cluster_item (0, 0, 0));
> + output.create (l);
>
> - for (unsigned i = 1; i <= l; i++)
> + unsigned end;
> + for (unsigned i = 0; i < l; i += end)
> {
> - /* Set minimal # of clusters with i-th item to infinite. */
> - min.quick_push (min_cluster_item (INT_MAX, INT_MAX, INT_MAX));
> + HOST_WIDE_INT values = 0;
> + hash_set<basic_block> targets;
> + cluster *start_cluster = clusters[i];
>
> - for (unsigned j = 0; j < i; j++)
> + end = 0;
> + while (i + end < l)
> {
> - if (min[j].m_count + 1 < min[i].m_count
> - && can_be_handled (clusters, j, i - 1))
> - min[i] = min_cluster_item (min[j].m_count + 1, j, INT_MAX);
> + cluster *end_cluster = clusters[i + end];
> +
> + /* Does value range fit into the BITS_PER_WORD window? */
> + HOST_WIDE_INT w = cluster::get_range (start_cluster->get_low (),
> + end_cluster->get_high ());
> + if (w == 0 || w > BITS_PER_WORD)
> + break;
> +
> + /* Compute # of values tested for new case. */
> + HOST_WIDE_INT r = 1;
> + if (!end_cluster->is_single_value_p ())
> + r = cluster::get_range (end_cluster->get_low (),
> + end_cluster->get_high ());
> + if (r == 0)
> + break;
> +
> + /* Check for max # of targets. */
> + if (targets.elements() == m_max_case_bit_tests
> + && !targets.contains (end_cluster->m_case_bb))
> + break;
> +
> + targets.add (end_cluster->m_case_bb);
> + values += r;
> + end++;
> }
>
> - gcc_checking_assert (min[i].m_count != INT_MAX);
> - }
> -
> - /* No result. */
> - if (min[l].m_count == l)
> - return clusters.copy ();
> -
> - vec<cluster *> output;
> - output.create (4);
> -
> - /* Find and build the clusters. */
> - for (unsigned end = l;;)
> - {
> - int start = min[end].m_start;
> -
> - if (is_beneficial (clusters, start, end - 1))
> + if (is_beneficial (values, targets.elements ()))
> {
> - bool entire = start == 0 && end == clusters.length ();
> - output.safe_push (new bit_test_cluster (clusters, start, end - 1,
> - entire));
> + output.safe_push (new bit_test_cluster (clusters, i, i + end - 1,
> + i == 0 && end == l));
> }
> else
> - for (int i = end - 1; i >= start; i--)
> + {
> output.safe_push (clusters[i]);
> -
> - end = start;
> -
> - if (start <= 0)
> - break;
> + /* ??? Might be able to skip more. */
> + end = 1;
> + }
> }
>
> - output.reverse ();
> return output;
> }
>
> --
> 2.46.2
>