On Tue, Jan 13, 2026 at 1:34 PM Wilco Dijkstra <[email protected]> wrote:
>
>
> Use anchors for FP constants - instead of using mergeable sections, which
> block anchors, load FP constants from the constdata section.  To avoid the
> anchor loads being deoptimized later, ensure the cost of a CONST_DOUBLE is
> larger than the cost of a MEM that loads it from constdata.  Codesize is
> slightly smaller, performance on SPECFP2017 is ~0.30% better.
>
> Passes regress, OK for commit?

Ok.

>
> gcc:
>         PR target/121240
>         * config/aarch64/aarch64.md (mov<mode>): Expand FP immediates early.
>         * config/aarch64/aarch64.cc (aarch64_select_rtx_section): Force
>         immediates <= 8 bytes to constdata.
>         (aarch64_rtx_costs): Increase cost of CONST_DOUBLE loaded from memory.
>
> gcc/testsuite:
>         PR target/121240
>         * gcc.target/aarch64/dbl_mov_immediate_1.c: Adjust test.
>         * gcc.target/aarch64/pr63304_1.c: Likewise.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 4784d3dadc5c8811e84d042bcb24cf2928520219..3a453ad4918d5e3d9f37e1937725e6cdd74f3af6 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -14315,6 +14315,10 @@ aarch64_select_rtx_section (machine_mode mode,
>    if (aarch64_can_use_per_function_literal_pools_p ())
>      return function_section (current_function_decl);
>
> +  /* When using anchors for constants use the readonly section.  */
> +  if (known_le (GET_MODE_SIZE (mode), 8))
> +    return readonly_data_section;
> +
>    return default_elf_select_rtx_section (mode, x, align);
>  }
>
> @@ -15269,11 +15273,13 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
>             *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
>           else if (!aarch64_float_const_zero_rtx_p (x))
>             {
> -             /* This will be a load from memory.  */
> +             /* Load from constdata - the cost of CONST_DOUBLE should be
> +                higher than the cost of a MEM so that later optimizations
> +                won't deoptimize an anchor load into a non-anchor load.  */
>               if (mode == DFmode || mode == DDmode)
> -               *cost += extra_cost->ldst.loadd;
> +               *cost += extra_cost->ldst.loadd + 1;
>               else
> -               *cost += extra_cost->ldst.loadf;
> +               *cost += extra_cost->ldst.loadf + 1;
>             }
>           else
>             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 71458bf78f5cc4d926d7c5e0467daec9a5d75a03..4445208bf92ce0e08b72fde3de0f6dbc238cac3b 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1960,6 +1960,18 @@ (define_expand "mov<mode>"
>         emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
>         DONE;
>        }
> +
> +    /* Expand into a literal load using anchors.  */
> +    if (GET_CODE (operands[1]) == CONST_DOUBLE
> +       && !aarch64_can_const_movi_rtx_p (operands[1], <MODE>mode)
> +       && !aarch64_float_const_representable_p (operands[1])
> +       && !aarch64_float_const_zero_rtx_p (operands[1])
> +       && !aarch64_float_const_rtx_p (operands[1]))
> +      {
> +       operands[1] = force_const_mem (<MODE>mode, operands[1]);
> +       emit_move_insn (operands[0], operands[1]);
> +       DONE;
> +      }
>    }
>  )
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/dbl_mov_immediate_1.c b/gcc/testsuite/gcc.target/aarch64/dbl_mov_immediate_1.c
> index 8332035d80b91c497fc032cff1043922c328a701..fac32df8c24603d73840343e8069bf659f1e69fc 100644
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/dbl_mov_immediate_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/dbl_mov_immediate_1.c
> @@ -41,8 +41,8 @@ double d4(void)
>
>  /* { dg-final { scan-assembler-times "movi\td\[0-9\]+, #?0"                 1 } } */
>
> -/* { dg-final { scan-assembler-times "adrp\tx\[0-9\]+, \.LC\[0-9\]"         2 } } */
> -/* { dg-final { scan-assembler-times "ldr\td\[0-9\]+, \\\[x\[0-9\], #:lo12:\.LC\[0-9\]\\\]" 2 } } */
> +/* { dg-final { scan-assembler-times "adrp\tx\[0-9\]+, "         2 } } */
> +/* { dg-final { scan-assembler-times "ldr\td\[0-9\]+, \\\[x\[0-9\], #:lo12:" 2 } } */
>
>  /* { dg-final { scan-assembler-times "fmov\td\[0-9\]+, 1\\\.5e\\\+0"        1 } } */
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr63304_1.c b/gcc/testsuite/gcc.target/aarch64/pr63304_1.c
> index 5d519d817ccdfe85849496dec654b1e8ac7a2888..134fd469b87c265a189507c82b49f8ad08248e14 100644
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/pr63304_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/pr63304_1.c
> @@ -45,4 +45,4 @@ cal3 (double a)
>      return 1;
>  }
>
> -/* { dg-final { scan-assembler-times "adrp" 6 } } */
> +/* { dg-final { scan-assembler-times "adrp" 4 } } */
>

Reply via email to