Re: [PATCH] AArch64: Add fma_reassoc_width [PR107413]

Richard Sandiford via Gcc-patches Mon, 21 Nov 2022 23:58:34 -0800

Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> Add a reassocation width for FMAs in per-CPU tuning structures. Keep the
> existing setting for cores with 2 FMA pipes, and use 4 for cores with 4
> FMA pipes.  This improves SPECFP2017 on Neoverse V1 by ~1.5%.
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/
>         PR 107413
>         * config/aarch64/aarch64.cc (struct tune_params): Add
>         fma_reassoc_width to all CPU tuning structures.
>         * config/aarch64/aarch64-protos.h (struct tune_params): Add
>         fma_reassoc_width.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h 
> b/gcc/config/aarch64/aarch64-protos.h
> index 
> a73bfa20acb9b92ae0475794c3f11c67d22feb97..71365a446007c26b906b61ca8b2a68ee06c83037
>  100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -540,6 +540,7 @@ struct tune_params
>    const char *loop_align;
>    int int_reassoc_width;
>    int fp_reassoc_width;
> +  int fma_reassoc_width;
>    int vec_reassoc_width;
>    int min_div_recip_mul_sf;
>    int min_div_recip_mul_df;
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 
> 798363bcc449c414de5bbb4f26b8e1c64a0cf71a..643162cdecd6a8fe5587164cb2d0d62b709a491d
>  100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -1346,6 +1346,7 @@ static const struct tune_params generic_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1382,6 +1383,7 @@ static const struct tune_params cortexa35_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1415,6 +1417,7 @@ static const struct tune_params cortexa53_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1448,6 +1451,7 @@ static const struct tune_params cortexa57_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1481,6 +1485,7 @@ static const struct tune_params cortexa72_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1514,6 +1519,7 @@ static const struct tune_params cortexa73_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1548,6 +1554,7 @@ static const struct tune_params exynosm1_tunings =
>    "4", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1580,6 +1587,7 @@ static const struct tune_params thunderxt88_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1612,6 +1620,7 @@ static const struct tune_params thunderx_tunings =
>    "8", /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1646,6 +1655,7 @@ static const struct tune_params tsv110_tunings =
>    "8",  /* loop_align.  */
>    2,    /* int_reassoc_width.  */
>    4,    /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,    /* vec_reassoc_width.  */
>    2,    /* min_div_recip_mul_sf.  */
>    2,    /* min_div_recip_mul_df.  */
> @@ -1678,6 +1688,7 @@ static const struct tune_params xgene1_tunings =
>    "16",        /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1710,6 +1721,7 @@ static const struct tune_params emag_tunings =
>    "16",        /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1743,6 +1755,7 @@ static const struct tune_params qdf24xx_tunings =
>    "16",        /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1778,6 +1791,7 @@ static const struct tune_params saphira_tunings =
>    "16",        /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    1,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1811,6 +1825,7 @@ static const struct tune_params thunderx2t99_tunings =
>    "16",        /* loop_align.  */
>    3,   /* int_reassoc_width.  */
>    2,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1844,6 +1859,7 @@ static const struct tune_params thunderx3t110_tunings =
>    "16",        /* loop_align.  */
>    3,   /* int_reassoc_width.  */
>    2,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1876,6 +1892,7 @@ static const struct tune_params neoversen1_tunings =
>    "32:16",     /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -1912,6 +1929,7 @@ static const struct tune_params ampere1_tunings =
>    "32:16",     /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -2089,6 +2107,7 @@ static const struct tune_params neoversev1_tunings =
>    "32:16",     /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  4,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -2226,6 +2245,7 @@ static const struct tune_params neoverse512tvb_tunings =
>    "32:16",     /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  4,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -2414,6 +2434,7 @@ static const struct tune_params neoversen2_tunings =
>    "32:16",     /* loop_align.  */
>    2,   /* int_reassoc_width.  */
>    4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -2603,6 +2624,7 @@ static const struct tune_params neoversev2_tunings =
>    "32:16",     /* loop_align.  */
>    3,   /* int_reassoc_width.  */
>    6,   /* fp_reassoc_width.  */
> +  4,   /* fma_reassoc_width.  */
>    3,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -2638,6 +2660,7 @@ static const struct tune_params a64fx_tunings =
>    "32",        /* loop_align.  */
>    4,   /* int_reassoc_width.  */
>    2,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
>    2,   /* vec_reassoc_width.  */
>    2,   /* min_div_recip_mul_sf.  */
>    2,   /* min_div_recip_mul_df.  */
> @@ -3350,9 +3373,10 @@ aarch64_reassociation_width (unsigned opc, 
> machine_mode mode)
>      return aarch64_tune_params.vec_reassoc_width;
>    if (INTEGRAL_MODE_P (mode))
>      return aarch64_tune_params.int_reassoc_width;
> -  /* Avoid reassociating floating point addition so we emit more FMAs.  */
> -  if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
> -    return aarch64_tune_params.fp_reassoc_width;
> +  /* FMA's can have a different reassociation width.  */
> +  if (FLOAT_MODE_P (mode))
> +    return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
> +                           : aarch64_tune_params.fp_reassoc_width;
>    return 1;
>  }


I guess an obvious question is: if 1 (rather than 2) was the right value
for cores with 2 FMA pipes, why is 4 the right value for cores with 4 FMA
pipes?  It would be good to clarify how, conceptually, the core property
should map to the fma_reassoc_width value.

It sounds from the existing comment like the main motivation for returning 1
was to encourage more FMAs to be formed, rather than to prevent FMAs from
being reassociated.  Is that no longer an issue?  Or is the point that,
with more FMA pipes, lower FMA formation is a price worth paying for
the better parallelism we get when FMAs can be formed?

Does this code ever see opc == FMA?

Thanks,
Richard

Re: [PATCH] AArch64: Add fma_reassoc_width [PR107413]

Reply via email to