Hi Tamar,

> How can it both pass regress and have a failing test :)
>
> Isn't the problem that now we force vectors to memory so early that CSE
> no longer sees the CONST_VECTOR, which is the thing it was taught to track?

CSE is not very good at dealing with CONST_VECTOR - it seems multiple fixes are
required. It's not clear whether it hashes both the load and the CONST_VECTOR
correctly so that later lookups will find either. The code that looks for a DUP
of an existing immediate ignores the REG_EQUAL note...

> I might be misunderstanding but the goal is to have these lowered before
> reload right? which is what's currently forcing them, and so we lose the
> combination of anchors?

Constant loads need to be lowered before CSE so that they can use anchors.
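
To make the anchor benefit concrete, here is a minimal sketch (not part of the
patch; the code and names are illustrative, in the same spirit as the new
pr121240.c test below) of two vector constants in one function sharing a
single ADRP:

  #include <arm_neon.h>

  /* Two different 128-bit vector constants used in one function.  */
  float64x2_t
  f (float64x2_t c1, float64x2_t c2)
  {
    float64x2_t a1 = (float64x2_t) { 0.25, 0.5 };
    float64x2_t a2 = (float64x2_t) { 0.75, 1.0 };
    return a1 * c1 + a2 * c2;
  }

  /* Forcing the constants to memory at expand time places them in the
     read-only section under an anchor, so CSE can merge the address
     computations: one adrp of the anchor plus two ldr q-register loads at
     different offsets, rather than two independent adrp/ldr pairs.  */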

> 1. Can we then extend CSE to track the vectors behind a MEM, the REG_EQUAL
>     notes still seems to be there and could be used perhaps?

Possibly - see above, it's not clear how easy this is. With my new expansion,
the existing optimizations are no longer used.

> 2. Could we instead of forcing them at expand, force them after CSE, for
>     instance split 1 that runs quite early too.

Then we wouldn't get CSE of the ADRPs, which is the goal of using anchors. Split1
runs just before regalloc, so you'd need to add another (G)CSE pass after.

> 3. By forcing them to memory so early, does RTL's LIM still hoist them? Or
>     does it think there's a side-effect now?

Both MEM_READONLY_P and MEM_NOTRAP_P are true for the constant loads
(as you'd expect), so it should work.

The underlying issue is that CONST_VECTOR is not optimized at all - the only
expansion is a literal load (besides MOVI). aarch64_expand_vector_init crashes
if called with a CONST_VECTOR that is really a constant...

So for v2 I've just added the most basic expansion using DUP for cases where
const_vec_duplicate_p is true. This doesn't consider the cost of generating the
immediate, nor does it try to use LD1R when the value is loaded anyway.
However, it catches ~17000 cases across SPEC2017, reduces codesize and improves
performance.
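
As a concrete example of the new DUP path, a splat constant like the one below
is now expanded as a scalar mov plus DUP instead of a literal load (the C
source here is illustrative; the before/after assembly matches the updated f6
check in const_create_using_fmov.c further down):

  #include <arm_neon.h>

  /* All four lanes hold the same value, so const_vec_duplicate_p is true
     and the mov<mode> expander emits a DUP of a GP register.  */
  uint32x4_t
  splat (void)
  {
    return (uint32x4_t) { 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000 };
  }

  /* Before:                         After:
       adrp  x0, .LC0                  mov  w0, 1333788672
       ldr   q0, [x0, #:lo12:.LC0]     dup  v0.4s, w0
       ret                             ret                                */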

Cheers,
Wilco


v2: Add support for DUP immediate expansion for CONST_VECTOR

Enable anchors for vector constants - like FP constants, expand vector constants early
and place them in the constdata section.  Avoid unnecessary loads by expanding
simple cases using DUP.  Performance on SPECFP2017 is ~0.3% better, codesize
increases by 0.05% due to extra const data.

Passes regress, OK for commit?

gcc:
        PR target/121240
        * config/aarch64/aarch64-simd.md (mov<mode>): Expand vector constants
        early.
        * config/aarch64/aarch64.cc (aarch64_select_rtx_section): Force 
        vector immediates <= 16 bytes to constdata.
 
gcc/testsuite:
        PR target/121240
        * gcc.target/aarch64/const_create_using_fmov.c: Fix test.
        * gcc.target/aarch64/pr121240.c: Add new test.
        * gcc.target/aarch64/vec-init-single-const.c: Fix test.
        * gcc.target/aarch64/vect-cse-codegen.c: Fix test.

---

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 505e8b1126fa021a22f1ee37cecae1fc8098ae78..c314e85927d3824ad8efc61dadd9b6eedf113433 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -70,11 +70,32 @@ (define_expand "mov<mode>"
      contains CONST_POLY_INTs), build it up from individual elements instead.
      We should only need to do this before RA; aarch64_legitimate_constant_p
      should ensure that we don't try to rematerialize the constant later.  */
-  if (GET_CODE (operands[1]) == CONST_VECTOR
-      && targetm.cannot_force_const_mem (<MODE>mode, operands[1]))
+  if (GET_CODE (operands[1]) == CONST_VECTOR)
     {
-      aarch64_expand_vector_init (operands[0], operands[1]);
-      DONE;
+      if (targetm.cannot_force_const_mem (<MODE>mode, operands[1]))
+       {
+         aarch64_expand_vector_init (operands[0], operands[1]);
+         DONE;
+       }
+      else if (!aarch64_simd_imm_zero (operands[1], <MODE>mode)
+              && !aarch64_simd_special_constant_p (operands[1], <MODE>mode)
+              && !aarch64_simd_valid_mov_imm (operands[1]))
+       {
+         rtx x;
+         /* Expand into VDUP.  */
+         if (TARGET_SIMD && const_vec_duplicate_p (operands[1], &x))
+           {
+             x = force_reg (GET_MODE_INNER (<MODE>mode), x);
+             operands[1] = gen_vec_duplicate (<MODE>mode, x);
+             emit_move_insn (operands[0], operands[1]);
+             DONE;
+           }
+
+         /* Expand into a literal load using anchors.  */
+         operands[1] = force_const_mem (<MODE>mode, operands[1]);
+         emit_move_insn (operands[0], operands[1]);
+         DONE;
+       }
     }
   "
 )
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 29d2400a6e9d6277ee5ed283ff8e11041d920435..f5285ed45962f3fe795348cc77534b3dfa010b52 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -14311,8 +14311,8 @@ aarch64_select_rtx_section (machine_mode mode,
     return function_section (current_function_decl);
 
   /* When using anchors for constants use the readonly section.  */
-  if ((CONST_INT_P (x) || CONST_DOUBLE_P (x))
-      && known_le (GET_MODE_SIZE (mode), 8))
+  if ((CONST_INT_P (x) || CONST_DOUBLE_P (x) || CONST_VECTOR_P (x))
+      && known_le (GET_MODE_SIZE (mode), 16))
     return readonly_data_section;
 
   return default_elf_select_rtx_section (mode, x, align);
diff --git a/gcc/testsuite/gcc.target/aarch64/const_create_using_fmov.c b/gcc/testsuite/gcc.target/aarch64/const_create_using_fmov.c
index e080afed8aa3578660027979335bfc859ca6bc91..21ea91f5073d936910988e848b0c54e42399ef08 100644
--- a/gcc/testsuite/gcc.target/aarch64/const_create_using_fmov.c
+++ b/gcc/testsuite/gcc.target/aarch64/const_create_using_fmov.c
@@ -78,8 +78,8 @@ uint16x8_t f5() {
 
 /*
 ** f6:
-**     adrp    x0, \.LC0
-**     ldr     q0, \[x0, #:lo12:\.LC0\]
+**     mov     w0, 1333788672
+**     dup     v0.4s, w0
 **     ret
 */
 uint32x4_t f6() {
diff --git a/gcc/testsuite/gcc.target/aarch64/pr121240.c b/gcc/testsuite/gcc.target/aarch64/pr121240.c
new file mode 100644
index 0000000000000000000000000000000000000000..f04574d411c3fd4d6ac2313fc27f2274e8926080
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr121240.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcmodel=small" } */
+
+const double b[4] = {0.2435334343f, 0.2233535343f, 0.4232433f, 0.34343434f};
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+
+v2df f (v2df c1, v2df c2)
+{
+   v2df a1 = *(v2df *)&b[0];
+   v2df a2 = *(v2df *)&b[2];
+   return (a1 * c1) + (a2 * c2);
+}
+
+/* { dg-final { scan-assembler-times "adrp" 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
index 274b0b39ac434d887b43a07c30fb41218bdf8768..587b7ec0e3b2541d3dd80cd5a66f83230c61489b 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-single-const.c
@@ -46,9 +46,9 @@ int32x4_t f_s32(int32_t x)
 
 /*
 ** f_s64:
-**     adrp    x[0-9]+, .LC[0-9]+
-**     ldr     q0, \[x[0-9]+, #:lo12:.LC[0-9]+\]
-**     ins     v0\.d\[0\], x0
+**     fmov    d0, x0
+**     mov     (x[0-9]+), 1
+**     ins     v0\.d\[1\], \1
 **     ret
 */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
index 2b8e64313bb47f995f071c728b1d84473807bc64..1aa0d3edd323c4783d14e93ed40d74b585d567ce 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -6,12 +6,14 @@
 
 /*
 **test1:
-**     adrp    x[0-9]+, .LC[0-9]+
-**     ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**     mov     x[0-9]+, 16502
+**     movk    x[0-9]+, 0x1023, lsl 16
+**     movk    x[0-9]+, 0x4308, lsl 32
+**     movk    x[0-9]+, 0x942, lsl 48
+**     dup     v[0-9]+.2d, x[0-9]+
 **     add     v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
-**     str     q[0-9]+, \[x[0-9]+\]
-**     fmov    x[0-9]+, d[0-9]+
 **     orr     x[0-9]+, x[0-9]+, x[0-9]+
+**     str     q[0-9]+, \[x[0-9]+\]
 **     ret
 */
 
@@ -27,12 +29,14 @@ test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
 
 /*
 **test2:
-**     adrp    x[0-9]+, .LC[0-1]+
-**     ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**     mov     x[0-9]+, 16502
+**     movk    x[0-9]+, 0x4223, lsl 16
+**     movk    x[0-9]+, 0x3032, lsl 32
+**     movk    x[0-9]+, 0x424, lsl 48
+**     dup     v[0-9]+.2d, x[0-9]+
 **     add     v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
-**     str     q[0-9]+, \[x[0-9]+\]
-**     fmov    x[0-9]+, d[0-9]+
 **     orr     x[0-9]+, x[0-9]+, x[0-9]+
+**     str     q[0-9]+, \[x[0-9]+\]
 **     ret
 */
 
@@ -48,12 +52,12 @@ test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
 
 /*
 **test3:
-**     adrp    x[0-9]+, .LC[0-9]+
-**     ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**     mov     w[0-9]+, 16963
+**     movk    w[0-9]+, 0x9, lsl 16
+**     dup     v[0-9]+.4s, w[0-9]+
 **     add     v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
-**     str     q[0-9]+, \[x1\]
-**     fmov    w[0-9]+, s[0-9]+
 **     orr     w[0-9]+, w[0-9]+, w[0-9]+
+**     str     q[0-9]+, \[x1\]
 **     ret
 */
 


