Hi Richard,

>> A smart reassociation pass could form more FMAs while also increasing
>> parallelism, but the way it currently works always results in fewer FMAs.
>
> Yeah, as Richard said, that seems the right long-term fix.
> It would also avoid the hack of treating PLUS_EXPR as a signal
> of an FMA, which has the drawback of assuming (for 2-FMA cores)
> that plain addition never benefits from reassociation in its own right.

True but it's hard to separate them. You will have a mix of FADD and FMAs
to reassociate (since FMA still counts as an add), and the ratio between
them as well as the number of operations may affect the best reassociation
width.

> Still, I guess the hackiness is pre-existing and the patch is removing
> the hackiness for some cores, so from that point of view it's a strict
> improvement over the status quo.  And it's too late in the GCC 13
> cycle to do FMA reassociation properly.  So I'm OK with the patch
> in principle, but could you post an update with more commentary?

Sure, here is an update with longer comment in aarch64_reassociation_width:


Add a reassocation width for FMAs in per-CPU tuning structures. Keep the
existing setting for cores with 2 FMA pipes, and use 4 for cores with 4
FMA pipes.  This improves SPECFP2017 on Neoverse V1 by ~1.5%.

Passes regress/bootstrap, OK for commit?

gcc/ChangeLog/
        PR 107413
        * config/aarch64/aarch64.cc (struct tune_params): Add
        fma_reassoc_width to all CPU tuning structures.
        (aarch64_reassociation_width): Use fma_reassoc_width.
        * config/aarch64/aarch64-protos.h (struct tune_params): Add
        fma_reassoc_width.

---
diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
238820581c5ee7617f8eed1df2cf5418b1127e19..4be93c93c26e091f878bc8e4cf06e90888405fb2
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -540,6 +540,7 @@ struct tune_params
   const char *loop_align;
   int int_reassoc_width;
   int fp_reassoc_width;
+  int fma_reassoc_width;
   int vec_reassoc_width;
   int min_div_recip_mul_sf;
   int min_div_recip_mul_df;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
c91df6f5006c257690aafb75398933d628a970e1..15d478c77ceb2d6c52a70b6ffd8fdadcfa8deba0
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1346,6 +1346,7 @@ static const struct tune_params generic_tunings =
   "8", /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1382,6 +1383,7 @@ static const struct tune_params cortexa35_tunings =
   "8", /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1415,6 +1417,7 @@ static const struct tune_params cortexa53_tunings =
   "8", /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1448,6 +1451,7 @@ static const struct tune_params cortexa57_tunings =
   "8", /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1481,6 +1485,7 @@ static const struct tune_params cortexa72_tunings =
   "8", /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1514,6 +1519,7 @@ static const struct tune_params cortexa73_tunings =
   "8", /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1548,6 +1554,7 @@ static const struct tune_params exynosm1_tunings =
   "4", /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1580,6 +1587,7 @@ static const struct tune_params thunderxt88_tunings =
   "8", /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1612,6 +1620,7 @@ static const struct tune_params thunderx_tunings =
   "8", /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1646,6 +1655,7 @@ static const struct tune_params tsv110_tunings =
   "8",  /* loop_align.  */
   2,    /* int_reassoc_width.  */
   4,    /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,    /* vec_reassoc_width.  */
   2,    /* min_div_recip_mul_sf.  */
   2,    /* min_div_recip_mul_df.  */
@@ -1678,6 +1688,7 @@ static const struct tune_params xgene1_tunings =
   "16",        /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1710,6 +1721,7 @@ static const struct tune_params emag_tunings =
   "16",        /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1743,6 +1755,7 @@ static const struct tune_params qdf24xx_tunings =
   "16",        /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1778,6 +1791,7 @@ static const struct tune_params saphira_tunings =
   "16",        /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   1,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1811,6 +1825,7 @@ static const struct tune_params thunderx2t99_tunings =
   "16",        /* loop_align.  */
   3,   /* int_reassoc_width.  */
   2,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   2,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1844,6 +1859,7 @@ static const struct tune_params thunderx3t110_tunings =
   "16",        /* loop_align.  */
   3,   /* int_reassoc_width.  */
   2,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   2,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1876,6 +1892,7 @@ static const struct tune_params neoversen1_tunings =
   "32:16",     /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   2,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1912,6 +1929,7 @@ static const struct tune_params ampere1_tunings =
   "32:16",     /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   2,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -1949,6 +1967,7 @@ static const struct tune_params ampere1a_tunings =
   "32:16",     /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   2,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -2126,6 +2145,7 @@ static const struct tune_params neoversev1_tunings =
   "32:16",     /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  4,   /* fma_reassoc_width.  */
   2,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -2263,6 +2283,7 @@ static const struct tune_params neoverse512tvb_tunings =
   "32:16",     /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  4,   /* fma_reassoc_width.  */
   2,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -2451,6 +2472,7 @@ static const struct tune_params neoversen2_tunings =
   "32:16",     /* loop_align.  */
   2,   /* int_reassoc_width.  */
   4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   2,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -2640,6 +2662,7 @@ static const struct tune_params neoversev2_tunings =
   "32:16",     /* loop_align.  */
   3,   /* int_reassoc_width.  */
   6,   /* fp_reassoc_width.  */
+  4,   /* fma_reassoc_width.  */
   3,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -2675,6 +2698,7 @@ static const struct tune_params a64fx_tunings =
   "32",        /* loop_align.  */
   4,   /* int_reassoc_width.  */
   2,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
   2,   /* vec_reassoc_width.  */
   2,   /* min_div_recip_mul_sf.  */
   2,   /* min_div_recip_mul_df.  */
@@ -3387,9 +3411,15 @@ aarch64_reassociation_width (unsigned opc, machine_mode 
mode)
     return aarch64_tune_params.vec_reassoc_width;
   if (INTEGRAL_MODE_P (mode))
     return aarch64_tune_params.int_reassoc_width;
-  /* Avoid reassociating floating point addition so we emit more FMAs.  */
-  if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
-    return aarch64_tune_params.fp_reassoc_width;
+  /* Reassociation reduces the number of FMAs which may result in worse
+     performance.  Use a per-CPU setting for FMA reassociation which allows
+     narrow CPUs with few FP pipes to switch it off (value of 1), and wider
+     CPUs with many FP pipes to enable reassociation.
+     Since the reassociation pass doesn't understand FMA at all, assume
+     that any FP addition might turn into FMA.  */
+  if (FLOAT_MODE_P (mode))
+    return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
+                           : aarch64_tune_params.fp_reassoc_width;
   return 1;
 }
 


Reply via email to